# LOCKED POSITION RANDOMIZED DATA

FILENAME: budget_randomized.ipynb
    
PROJECT: Randomized Data Model

DATE CREATED: 27-MAY-20

DATE UPDATED: 27-MAY-20

## PHASE 1: PROJECT SETUP

Import the libraries needed for the ETL, data-engineering, and export steps.

In [30]:
import pandas as pd
import csv
import random
import sqlite3
import itertools
import numpy as np
import datetime
from itertools import repeat
import time as t
import getpass as gp

# plotting modules
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as po

In [9]:
def init_array(df_length, num_of_projects=20, num_of_subprojects=40,
               num_of_programs=80, num_of_rca=120):
    '''
    DESCRIPTION: Build and return a pandas DataFrame of randomized budget records.

    Each row gets one randomly chosen label per categorical column
    ('project-<i>', 'subproject-<i>', 'program-<i>', 'rca-<i>', numbered from 1)
    and six budget figures that are derived from one another as a chain:

        yr+3_forecast : random integer in [100000, 30000000)
        yr+2_forecast : yr+3_forecast * uniform[0.5, 1.3), rounded to 2 decimals
        yr+1_forecast : yr+2_forecast * uniform[0.8, 1.2), rounded to 2 decimals
        yr0_plan      : yr+1_forecast * uniform[0.6, 1.3), rounded to 2 decimals
        yr-1_approp   : yr0_plan      * uniform[0.6, 1.2), rounded to 2 decimals
        yr-2_oblig    : yr-1_approp   * uniform[0.8, 1.0), rounded to 2 decimals

    PARAMETERS:
        df_length          - number of rows to generate
        num_of_projects    - how many distinct project labels to draw from
        num_of_subprojects - how many distinct subproject labels to draw from
        num_of_programs    - how many distinct program labels to draw from
        num_of_rca         - how many distinct RCA labels to draw from

    RETURNS: DataFrame with columns project, subproject, program_title, rca,
             yr+3_forecast, yr+2_forecast, yr+1_forecast, yr0_plan,
             yr-1_approp, yr-2_oblig (in that order) and df_length rows.

    RAISES: ValueError if any of the label counts is smaller than 1.

    NOTE: Labels are drawn with the stdlib `random` module and the budget
          figures with NumPy's global `np.random` state; seed both before
          calling if reproducible output is required.
    '''

    label_counts = {
        'num_of_projects': num_of_projects,
        'num_of_subprojects': num_of_subprojects,
        'num_of_programs': num_of_programs,
        'num_of_rca': num_of_rca,
    }
    for arg_name, arg_value in label_counts.items():
        # random.choice cannot draw from an empty list, so reject this up front
        if arg_value < 1:
            raise ValueError(arg_name + ' must be at least 1, got ' + str(arg_value))

    # create a sequential list of column values for projects, subprojects, programs, and RCAs
    proj_list = ['project-' + str(i+1) for i in range(num_of_projects)]
    subproject_list = ['subproject-' + str(i+1) for i in range(num_of_subprojects)]
    program_list = ['program-' + str(i+1) for i in range(num_of_programs)]
    rca_list = ['rca-' + str(i+1) for i in range(num_of_rca)]

    project_values = []
    subproject_values = []
    program_values = []
    rca_values = []

    # Draw row by row (project, subproject, program, rca) so the sequence of
    # calls into `random` stays the same as before for any given seed.
    for _ in range(df_length):
        project_values.append(random.choice(proj_list))
        subproject_values.append(random.choice(subproject_list))
        program_values.append(random.choice(program_list))
        rca_values.append(random.choice(rca_list))

    # create randomized budget data; every figure is a random multiple of the previous one
    yr3_forecast = np.random.randint(low=100000, high=30000000, size=df_length)

    yr2_random = np.random.uniform(low=0.5, high=1.3, size=df_length)
    yr2_forecast = np.round(yr3_forecast * yr2_random, 2)

    yr1_random = np.random.uniform(low=0.8, high=1.2, size=df_length)
    yr1_forecast = np.round(yr2_forecast * yr1_random, 2)

    plan_random = np.random.uniform(low=0.6, high=1.3, size=df_length)
    plan_val = np.round(yr1_forecast * plan_random, 2)

    approp_random = np.random.uniform(low=0.6, high=1.2, size=df_length)
    approp_val = np.round(plan_val * approp_random, 2)

    oblig_random = np.random.uniform(low=0.8, high=1.0, size=df_length)
    oblig_val = np.round(approp_val * oblig_random, 2)

    # Assemble the frame in one step; dict order fixes the column order.
    raw_df = pd.DataFrame({
        'project': project_values,
        'subproject': subproject_values,
        'program_title': program_values,
        'rca': rca_values,
        'yr+3_forecast': yr3_forecast,
        'yr+2_forecast': yr2_forecast,
        'yr+1_forecast': yr1_forecast,
        'yr0_plan': plan_val,
        'yr-1_approp': approp_val,
        'yr-2_oblig': oblig_val,
    })

    return raw_df

# PHASE 2: FUNCTION TEST

In [36]:
# init_array draws from both the stdlib `random` module and NumPy's global
# random state, so seed both: a Restart & Run All then regenerates the same
# data set instead of a new one on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

train_df = init_array(10000)
train_df.tail(10)

Unnamed: 0,project,subproject,program_title,rca,yr+3_forecast,yr+2_forecast,yr+1_forecast,yr0_plan,yr-1_approp,yr-2_oblig
9990,project-17,subproject-29,program-38,rca-49,380154,386219.74,367743.84,441433.34,326416.52,316365.13
9991,project-20,subproject-33,program-71,rca-97,14053907,9614434.68,10909420.13,13031054.15,14164783.16,13553076.92
9992,project-17,subproject-4,program-61,rca-12,5122201,5293380.03,6021288.46,6824253.44,5695993.7,5561404.36
9993,project-12,subproject-34,program-40,rca-87,2273834,1615131.55,1508285.09,1355572.32,905931.08,735457.77
9994,project-12,subproject-6,program-16,rca-19,20736342,10729197.66,11324846.96,14142262.21,13371296.55,12090286.28
9995,project-17,subproject-18,program-35,rca-107,2046400,2204623.21,2545027.73,2738369.79,2881836.58,2716093.29
9996,project-12,subproject-22,program-3,rca-18,12275170,10816669.97,11195424.17,6796332.1,5652112.33,4908539.96
9997,project-4,subproject-13,program-58,rca-117,4404451,5445979.35,5656234.29,6216160.78,4110618.21,3344989.39
9998,project-13,subproject-39,program-31,rca-82,27828285,25635013.67,30523050.62,24023769.94,25706216.55,22117139.05
9999,project-10,subproject-4,program-2,rca-56,24849738,17355028.65,16806662.95,18464708.53,21934189.05,18268227.11


In [37]:
# Composite row identifier: the four categorical labels joined with '-'.
train_df['key'] = train_df['project'].str.cat(
    [train_df['subproject'], train_df['program_title'], train_df['rca']],
    sep='-',
)
train_df['key']


0        project-19-subproject-24-program-35-rca-78
1        project-12-subproject-26-program-58-rca-19
2         project-4-subproject-11-program-15-rca-23
3       project-15-subproject-28-program-11-rca-111
4         project-7-subproject-29-program-66-rca-20
                           ...                     
9995    project-17-subproject-18-program-35-rca-107
9996      project-12-subproject-22-program-3-rca-18
9997     project-4-subproject-13-program-58-rca-117
9998     project-13-subproject-39-program-31-rca-82
9999       project-10-subproject-4-program-2-rca-56
Name: key, Length: 10000, dtype: object

In [38]:
# Work on the key column alone and pick out every row whose key has already
# appeared earlier in the frame (the first occurrence is not flagged).
key_df = train_df.loc[:, ['key']]
is_repeat = key_df.duplicated()
duplicateRowsDF = key_df.loc[is_repeat]

print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateRowsDF)

print("Total count of duplicated rows :")
print(duplicateRowsDF.shape[0])

Duplicate Rows except first occurrence based on all columns are :
                                             key
1277   project-15-subproject-8-program-29-rca-16
2320    project-6-subproject-7-program-57-rca-50
4614  project-19-subproject-12-program-79-rca-32
8644  project-19-subproject-39-program-31-rca-70
Total count of duplicated rows :
4


In [39]:
# Drop ALL rows whose key occurs more than once: keep=False removes every
# occurrence, including the first one. The result is rebound to train_df
# rather than using inplace=True, which keeps the step an ordinary
# expression and yields the same rows and index labels.
train_df = train_df.drop_duplicates(subset="key", keep=False)


In [40]:
# Re-run the duplicate check on the key column to confirm none remain.
key_df = train_df.loc[:, ['key']]
duplicateRowsDF = key_df.loc[key_df.duplicated()]

print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateRowsDF)

print("\nTotal count of duplicated rows :", duplicateRowsDF.shape[0])


Duplicate Rows except first occurrence based on all columns are :
Empty DataFrame
Columns: [key]
Index: []

Total count of duplicated rows : 0


### Store unique values in data structures

In [41]:
# Capture the frame's current column labels as a plain Python list.
budget_col_names = train_df.columns.tolist()
budget_col_names

['project',
 'subproject',
 'program_title',
 'rca',
 'yr+3_forecast',
 'yr+2_forecast',
 'yr+1_forecast',
 'yr0_plan',
 'yr-1_approp',
 'yr-2_oblig',
 'key']

In [42]:
# Take the first four column names from budget_col_names.
budget_ordinal_names = list(itertools.islice(budget_col_names, 4))
budget_ordinal_names

['project', 'subproject', 'program_title', 'rca']

In [43]:
# Distinct project labels present in the frame, printed as a set.
proj_uniq_values = pd.unique(train_df['project'])
print(set(proj_uniq_values))

{'project-15', 'project-16', 'project-3', 'project-13', 'project-8', 'project-10', 'project-9', 'project-4', 'project-6', 'project-5', 'project-7', 'project-17', 'project-19', 'project-14', 'project-20', 'project-18', 'project-2', 'project-1', 'project-12', 'project-11'}


In [44]:
# Distinct subproject labels present in the frame, printed as a set.
subproj_uniq_values = pd.unique(train_df['subproject'])
print(set(subproj_uniq_values))

{'subproject-24', 'subproject-40', 'subproject-17', 'subproject-20', 'subproject-23', 'subproject-38', 'subproject-28', 'subproject-14', 'subproject-37', 'subproject-15', 'subproject-12', 'subproject-39', 'subproject-25', 'subproject-36', 'subproject-8', 'subproject-34', 'subproject-11', 'subproject-19', 'subproject-3', 'subproject-18', 'subproject-31', 'subproject-29', 'subproject-30', 'subproject-16', 'subproject-6', 'subproject-9', 'subproject-32', 'subproject-2', 'subproject-33', 'subproject-21', 'subproject-27', 'subproject-22', 'subproject-10', 'subproject-13', 'subproject-26', 'subproject-4', 'subproject-7', 'subproject-35', 'subproject-1', 'subproject-5'}


In [45]:
# Distinct program labels present in the frame, printed as a set.
program_uniq_values = pd.unique(train_df['program_title'])
print(set(program_uniq_values))

{'program-67', 'program-36', 'program-33', 'program-24', 'program-79', 'program-38', 'program-42', 'program-66', 'program-65', 'program-50', 'program-57', 'program-41', 'program-10', 'program-43', 'program-52', 'program-71', 'program-74', 'program-37', 'program-31', 'program-2', 'program-77', 'program-11', 'program-54', 'program-44', 'program-30', 'program-22', 'program-61', 'program-13', 'program-80', 'program-68', 'program-39', 'program-21', 'program-8', 'program-40', 'program-49', 'program-32', 'program-48', 'program-14', 'program-5', 'program-58', 'program-17', 'program-12', 'program-63', 'program-28', 'program-69', 'program-3', 'program-27', 'program-20', 'program-9', 'program-1', 'program-59', 'program-23', 'program-62', 'program-25', 'program-16', 'program-55', 'program-4', 'program-46', 'program-15', 'program-76', 'program-26', 'program-75', 'program-60', 'program-6', 'program-35', 'program-34', 'program-7', 'program-64', 'program-18', 'program-73', 'program-53', 'program-47', 

In [46]:
# Distinct RCA labels present in the frame, printed as a set.
rca_uniq_values = pd.unique(train_df['rca'])
print(set(rca_uniq_values))

{'rca-41', 'rca-80', 'rca-81', 'rca-22', 'rca-111', 'rca-93', 'rca-97', 'rca-76', 'rca-20', 'rca-77', 'rca-26', 'rca-102', 'rca-42', 'rca-30', 'rca-44', 'rca-115', 'rca-73', 'rca-36', 'rca-54', 'rca-83', 'rca-45', 'rca-98', 'rca-19', 'rca-67', 'rca-49', 'rca-9', 'rca-16', 'rca-33', 'rca-48', 'rca-61', 'rca-29', 'rca-70', 'rca-34', 'rca-43', 'rca-103', 'rca-94', 'rca-6', 'rca-57', 'rca-50', 'rca-72', 'rca-32', 'rca-69', 'rca-4', 'rca-39', 'rca-55', 'rca-60', 'rca-117', 'rca-56', 'rca-99', 'rca-86', 'rca-71', 'rca-90', 'rca-14', 'rca-92', 'rca-25', 'rca-112', 'rca-53', 'rca-11', 'rca-8', 'rca-18', 'rca-35', 'rca-113', 'rca-79', 'rca-95', 'rca-40', 'rca-46', 'rca-3', 'rca-24', 'rca-65', 'rca-100', 'rca-38', 'rca-63', 'rca-21', 'rca-23', 'rca-110', 'rca-17', 'rca-84', 'rca-82', 'rca-104', 'rca-31', 'rca-116', 'rca-52', 'rca-119', 'rca-107', 'rca-12', 'rca-58', 'rca-1', 'rca-118', 'rca-2', 'rca-89', 'rca-51', 'rca-5', 'rca-66', 'rca-75', 'rca-105', 'rca-64', 'rca-37', 'rca-109', 'rca-15', '

### Visualize the generated data set

Histogram (with a marginal box plot) for each column of the generated data set

In [47]:
# Render one histogram with a marginal box plot per column in budget_col_names.
# NOTE(review): budget_col_names was captured after the 'key' column was added,
# so 'key' is plotted too. That plot is presumably not useful since duplicate
# keys were dropped. Confirm whether it should be excluded.
for col_name in budget_col_names:
    chart_title = col_name + ' Histogram'

    # plot the histogram
    fig = px.histogram(
        train_df[[col_name]],
        x=col_name,
        marginal="box",  # can be `box`, `violin`
        nbins=12,
    )
    fig.update_layout(
        title=chart_title,
        autosize=False,  # use the fixed width/height below
        width=1500,
        height=700,
        margin=dict(l=50, r=50, b=100, t=100, pad=4),
        paper_bgcolor="white",
    )
    fig.show()

# PHASE 3: RANDOMIZE EXECUTION DATA 

Section to build out the test data and functions

# END OF PROGRAM

"Si vis pacem, para bellum"

# TEST SECTION

In [1]:
# Sizes used by the following cells to build the execution-data label lists.
num_exe_proj, num_exe_subproj = 15, 30
num_exe_rca, num_exe_program = 50, 75

# NOTE(review): the three settings below are not read anywhere in the visible
# notebook; their intended meaning should be confirmed.
num_of_kc = 10
is_base = is_nip = 1

In [2]:
# Zero-based project labels: 'proj-0' .. 'proj-<num_exe_proj - 1>'.
exe_proj_list = [f'proj-{idx}' for idx in range(num_exe_proj)]
exe_proj_list[:5]

['proj-0', 'proj-1', 'proj-2', 'proj-3', 'proj-4']

In [3]:
# Zero-based subproject labels: 'subproj-0' .. 'subproj-<num_exe_subproj - 1>'.
exe_subproj_list = [f'subproj-{idx}' for idx in range(num_exe_subproj)]
exe_subproj_list[:5]

['subproj-0', 'subproj-1', 'subproj-2', 'subproj-3', 'subproj-4']

In [4]:
# Zero-based RCA labels: 'rca-0' .. 'rca-<num_exe_rca - 1>'.
exe_rca_list = [f'rca-{idx}' for idx in range(num_exe_rca)]
exe_rca_list[:5]

['rca-0', 'rca-1', 'rca-2', 'rca-3', 'rca-4']

In [5]:
# Zero-based program labels: 'program-0' .. 'program-<num_exe_program - 1>'.
exe_prog_list = [f'program-{idx}' for idx in range(num_exe_program)]
exe_prog_list[:5]

['program-0', 'program-1', 'program-2', 'program-3', 'program-4']

# PHASE 4: EXPORT DATA

In [5]:
# Write train_df to disk in CSV format without the row index.
# NOTE(review): the target file name has no '.csv' extension; confirm that
# downstream readers expect this exact name before changing it.
train_df.to_csv(path_or_buf='locked_random_data_v1', index=False)