# LOCKED POSITION RANDOMIZED DATA

FILENAME: budget_randomized.ipynb
    
PROJECT: Randomized Data Model

DATE CREATED: 27-MAY-20

DATE UPDATED: 27-MAY-20

## PHASE 1: PROJECT SETUP

Import the necessary libraries needed for ETL, engineering, and export efforts

In [1]:
import pandas as pd
import csv
import random
import sqlite3
import itertools
import numpy as np
import datetime
from itertools import repeat
import time as t
import getpass as gp

In [2]:
def init_array(df_length):
    '''
    DESCRIPTION: A function to create and return a two_dimensional array with randomized float values
    '''
    
    length = df_length
    
    num_of_projects = 20 # can change the amount of projects
    num_of_subprojects = 40 # can change the amount of subprojects
    num_of_programs = 80
    num_of_rca = 120 # can change the amount of RCAs
    
    # create a sequential list of column values for projects, subprojects, and RCAs
    proj_list = ['project-' + str(i+1) for i in range(num_of_projects)] # create a list of project names
    subproject_list = ['subproject-' + str(i+1) for i in range(num_of_subprojects)]
    program_list = ['program-' + str(i+1) for i in range(num_of_programs)]
    rca_list = ['rca-' + str(i+1) for i in range(num_of_rca)]
    
    project_values = []
    subproject_values = []
    program_values = []
    rca_values = []

    for index in range(length):
        
        # select a random value from each list
        proj_val = random.choice(proj_list)
        sb_val = (random.choice(subproject_list))
        program_val = random.choice(program_list)
        rca_val = random.choice(rca_list)
        
        # append the randomized value to the default list
        project_values.append(proj_val)
        subproject_values.append(sb_val)
        program_values.append(program_val)
        rca_values.append(rca_val)
    
    # create randomized budget data
    yr3_forecast= np.random.randint(low = 100000, high = 30000000, size = df_length) 
    yr2_random = np.random.uniform(low=0.5, high=1.3, size=df_length)
    yr2_forecast = np.round(yr3_forecast * yr2_random,2)
    
    yr1_random = np.random.uniform(low=0.8, high=1.2, size=df_length)
    yr1_forecast = np.round(yr2_forecast * yr1_random,2)
    
    plan_random = np.random.uniform(low=0.6, high=1.3, size=df_length)
    plan_val = np.round(yr1_forecast * plan_random,2)
    
    approp_random = np.random.uniform(low=0.6, high=1.2, size=df_length)
    approp_val = np.round(plan_val * approp_random,2)
    
    oblig_random = np.random.uniform(low=0.8, high=1.0, size=df_length)
    oblig_val = np.round(approp_val * oblig_random,2)
    
    raw_df = pd.DataFrame(columns=['project', 'subproject', 'program_title', 'rca','yr+3_forecast','yr+2_forecast','yr+1_forecast','yr0_plan','yr-1_approp','yr-2_oblig'])

    raw_df['project'] = project_values
    raw_df['subproject'] = subproject_values
    raw_df['program_title'] = rca_values
    raw_df['rca'] = rca_values
    raw_df['yr+3_forecast'] = yr3_forecast
    raw_df['yr+2_forecast'] = yr2_forecast
    raw_df['yr+1_forecast'] = yr1_forecast
    raw_df['yr0_plan'] = plan_val
    raw_df['yr-1_approp'] = approp_val
    raw_df['yr-2_oblig'] = oblig_val
    
    return raw_df

# PHASE 2: Function Test

In [3]:
train_df = init_array(10000)
train_df.tail(10)

Unnamed: 0,project,subproject,program_title,rca,yr+3_forecast,yr+2_forecast,yr+1_forecast,yr0_plan,yr-1_approp,yr-2_oblig
9990,project-11,subproject-28,rca-70,rca-70,17914034,14517587.21,15148778.39,15750656.56,18202116.81,15322015.69
9991,project-2,subproject-13,rca-120,rca-120,6493174,7634813.34,8845080.32,8539298.24,6048655.03,5252986.17
9992,project-1,subproject-24,rca-64,rca-64,28458127,20924125.41,24898060.95,24941542.66,21241616.37,17877144.7
9993,project-17,subproject-25,rca-95,rca-95,25910351,21288988.51,24751602.0,31561458.04,26732193.41,24894476.94
9994,project-16,subproject-5,rca-19,rca-19,5875978,6369027.36,6864420.2,4402688.33,3759224.53,3723314.3
9995,project-13,subproject-3,rca-18,rca-18,13321490,16809784.54,17514734.74,12298731.16,8278178.04,7470235.24
9996,project-15,subproject-16,rca-76,rca-76,4153007,2577479.49,3047741.65,3317289.7,3173565.36,3118173.62
9997,project-20,subproject-18,rca-65,rca-65,18306949,17205109.95,19747399.69,21238338.06,13636510.52,11160160.29
9998,project-4,subproject-22,rca-47,rca-47,12051943,11376899.72,10244178.02,11253886.01,10986209.42,10325302.8
9999,project-14,subproject-39,rca-10,rca-10,15807160,14478359.77,14062761.67,16676729.34,14659152.8,14199573.94


In [9]:
train_df['key'] = train_df['project'] + '-' + train_df['subproject'] + '-' + train_df['program_title'] + '-' + train_df['rca']
train_df['key']


0          project-16-subproject-36-rca-2-rca-2
1        project-20-subproject-20-rca-45-rca-45
2       project-6-subproject-30-rca-110-rca-110
3        project-18-subproject-15-rca-57-rca-57
4        project-16-subproject-20-rca-46-rca-46
                         ...                   
9995      project-13-subproject-3-rca-18-rca-18
9996     project-15-subproject-16-rca-76-rca-76
9997     project-20-subproject-18-rca-65-rca-65
9998      project-4-subproject-22-rca-47-rca-47
9999     project-14-subproject-39-rca-10-rca-10
Name: key, Length: 10000, dtype: object

In [11]:
key_df = train_df[['key']]
duplicateRowsDF = key_df[key_df.duplicated()]
 
print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateRowsDF)

Duplicate Rows except first occurrence based on all columns are :
                                           key
297       project-3-subproject-3-rca-40-rca-40
856      project-6-subproject-37-rca-39-rca-39
951   project-14-subproject-33-rca-102-rca-102
1000   project-6-subproject-30-rca-110-rca-110
1098   project-9-subproject-12-rca-106-rca-106
...                                        ...
9955    project-20-subproject-34-rca-46-rca-46
9965    project-15-subproject-32-rca-82-rca-82
9972     project-8-subproject-28-rca-27-rca-27
9975    project-12-subproject-30-rca-64-rca-64
9986  project-11-subproject-31-rca-120-rca-120

[480 rows x 1 columns]


In [12]:
# dropping ALL duplicte values 
train_df.drop_duplicates(subset ="key", 
                     keep = False, inplace = True) 


In [13]:
key_df = train_df[['key']]
duplicateRowsDF = key_df[key_df.duplicated()]
 
print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateRowsDF)

Duplicate Rows except first occurrence based on all columns are :
Empty DataFrame
Columns: [key]
Index: []


# PHASE 3: EXPORT DATA

In [5]:
train_df.to_csv(r'locked_random_data_v1', index = False)

# TEST

In [6]:
length = 10000
num_of_projects = 20
num_of_subprojects = 40

In [7]:
proj_list = ['project' + str(i+1) for i in range(num_of_projects)]
proj_list[-5:]

['project16', 'project17', 'project18', 'project19', 'project20']

In [8]:
proj_values = [x for item in proj_list for x in repeat(item, 20)]
proj_repeat

NameError: name 'proj_repeat' is not defined

In [None]:
subproject_list = ['subproject' + str(i+1) for i in range(num_of_subprojects)]
subproject_list

In [None]:
subproject_values = []

for index in range(length):
    val = (random.choice(subproject_list))
    subproject_values.append(val)

In [None]:
subproject_values