# Synthetic Data Generation

In [1]:
import numpy as np
import pandas as pd
import string
import random

### Survey Data

In [2]:
# Random seeds
np.random.seed(0)
random.seed(0)
survey_n_obs = 3000

# Phone numbers
phone_n_digits = 10
primary_nums = [''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(phone_n_digits)) \
          for _ in range(survey_n_obs)]
secondary_nums = [''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(phone_n_digits)) \
          for _ in range(survey_n_obs)]
secondary_nums = [num if np.random.rand() > 0.9 else np.nan for num in secondary_nums]

# Main poverty outcomes
ultra_poor = (np.random.rand(survey_n_obs) > .75).astype('int')
log_expend = np.random.normal(5, 1, survey_n_obs)

# Other outcomes
num_phones = np.random.randint(0, 4, survey_n_obs)
fsec_index = np.random.normal(0, 1, survey_n_obs)
fin_index = np.random.normal(0, 1, survey_n_obs)
psych_index = np.random.normal(0, 1, survey_n_obs)
cwr_group = np.random.randint(1, 6, survey_n_obs)

# Weights
weight = [np.random.rand()*6 if up == 1 else np.random.rand() for up in ultra_poor]

# Combine survey data together 
survey = pd.DataFrame([primary_nums, secondary_nums, ultra_poor, log_expend, num_phones, fsec_index, fin_index, 
                       psych_index, cwr_group, weight]).T
survey.columns = ['phone_number', 'secondary_phone_number', 'ultra_poor', 'log_expend', 'num_phones', 
                  'fsec_index', 'fin_index', 'psych_index', 'cwr_group', 'weight']

# Asset index components
for asset in ['radio_cd_player', 'tv', 'tv_dish', 'vcr_dvd_player', 'refridgerator', 'generator', 'matress',
             'mobile_phone', 'non_mobile_phone', 'iron', 'bed_frame', 'jewelry', 'mosquito_net', 
             'mosquito_repellent_candle', 'fan', 'camera']:
    max_val = np.random.randint(2, 8)
    survey['asset_' + asset] = np.random.randint(0, max_val, survey_n_obs)

# Add household ID
survey['hhid'] = ['HH' + str(i) for i in range(len(survey))]

# Clean and write to file
for col in survey.columns:
    if 'phone_number' not in col and col != 'hhid':
        survey[col] = survey[col].astype('float')
survey.to_csv('raw_data/survey.csv', index=False)
survey.head()

Unnamed: 0,phone_number,secondary_phone_number,ultra_poor,log_expend,num_phones,fsec_index,fin_index,psych_index,cwr_group,weight,...,asset_mobile_phone,asset_non_mobile_phone,asset_iron,asset_bed_frame,asset_jewelry,asset_mosquito_net,asset_mosquito_repellent_candle,asset_fan,asset_camera,hhid
0,Y0CQ65ZT4W,,0.0,3.895909,3.0,0.104988,-0.703058,0.145612,3.0,0.098406,...,0.0,0.0,5.0,0.0,2.0,6.0,0.0,2.0,1.0,HH0
1,N6ISIGQ8JT,,0.0,3.061541,0.0,-0.084172,0.733819,0.859695,3.0,0.089966,...,0.0,0.0,0.0,0.0,3.0,3.0,2.0,0.0,2.0,HH1
2,GEV49GW1UN,,1.0,5.901897,1.0,-0.506115,-0.729145,0.587032,3.0,4.794578,...,1.0,1.0,1.0,1.0,3.0,5.0,0.0,2.0,2.0,HH2
3,9427QD9AFZ,,1.0,6.635481,1.0,-1.237672,-0.378207,0.322221,3.0,0.124083,...,1.0,1.0,2.0,0.0,0.0,2.0,3.0,2.0,0.0,HH3
4,A5VPUEMOPJ,,1.0,5.528926,3.0,-1.230211,-2.815455,-1.105884,4.0,4.372538,...,1.0,1.0,4.0,1.0,3.0,0.0,2.0,0.0,3.0,HH4


### Phone Data

In [3]:
# Random seeds
random.seed(0)
np.random.seed(0)
n_phone_obs = 500
n_individual_obs = 500

# Phone numbers (obtained from survey)
phone_numbers = list(survey.sample(n=n_phone_obs, random_state=12)['phone_number'])
phone_numbers += list(survey.sample(n=n_individual_obs, random_state=13)['secondary_phone_number'].dropna())

# Features
phone_columns = list(pd.read_csv('phone_column_names.csv')['column_name'])[1:]
phone_features = pd.DataFrame(phone_numbers)
phone_features.columns = ['phone_number']
for col in phone_columns:
    nan_thresh = np.random.rand()
    phone_features[col] = [np.nan if np.random.rand() > nan_thresh else val for val in 
                           np.random.normal(0, 1, len(phone_features))]
    
phone_features.to_csv('raw_data/phone_features.csv', index=False)
phone_features.head()

  phone_features[col] = [np.nan if np.random.rand() > nan_thresh else val for val in


Unnamed: 0,phone_number,reporting__number_of_records,active_days__allweek__allday__callandtext,active_days__allweek__day__callandtext,active_days__allweek__night__callandtext,active_days__weekday__allday__callandtext,active_days__weekday__day__callandtext,active_days__weekday__night__callandtext,active_days__weekend__allday__callandtext,active_days__weekend__day__callandtext,...,number_of_recharges__allweek__allday,number_of_recharges__allweek__day,number_of_recharges__allweek__night,number_of_recharges__weekday__allday,number_of_recharges__weekday__day,number_of_recharges__weekday__night,number_of_recharges__weekend__allday,number_of_recharges__weekend__day,number_of_recharges__weekend__night,average_balance_recharges
0,PNQP8X3PVJ,0.741592,,,1.998486,,,,1.342904,0.082433,...,,,0.434496,,0.558207,-0.537678,,-0.242691,-1.924758,-0.208682
1,IGZ2DGL243,1.552914,-0.126128,1.366759,-0.54618,,,,-0.040766,-0.914141,...,,,0.830496,-0.26656,0.746483,,-0.555859,0.557882,,
2,B0X3L0SM9R,-2.268328,-0.248587,,0.703177,0.712944,,,0.557105,0.948765,...,,-0.826722,-1.093254,0.589975,-1.313714,-0.542383,-0.755803,-0.294403,,-0.697436
3,H4R7QW1UJ0,1.333545,,0.361343,-1.154514,-0.347047,,,,,...,,,,0.562262,,1.245296,-2.136482,0.668161,,
4,9RYBACLOMD,-0.842724,,-0.249852,-0.128004,-0.36142,,,,1.93808,...,,,0.10097,0.506896,,-2.297741,,,,1.196654
