In [1]:
import numpy as np
import pandas as pd
import string
import random

In [2]:
######################
#### Survey Data #####
######################

# Reproducibility: seed both the numpy and the stdlib generators
survey_n_obs = 3000
np.random.seed(0)
random.seed(0)


def _random_ids(count, length):
    """Return `count` random strings of `length` uppercase letters / digits.

    Characters are drawn one at a time with `random.choice`, so the result
    depends on the current state of the stdlib `random` generator.
    """
    alphabet = string.ascii_uppercase + string.digits
    ids = []
    for _ in range(count):
        ids.append(''.join(random.choice(alphabet) for _ in range(length)))
    return ids


# Phone numbers (random alphanumeric identifiers, one primary per respondent)
phone_n_digits = 10
primary_nums = _random_ids(survey_n_obs, phone_n_digits)

# A secondary number is kept only when its uniform draw exceeds 0.9 (about 10%
# of respondents); everyone else gets NaN.
secondary_candidates = _random_ids(survey_n_obs, phone_n_digits)
has_secondary = np.random.rand(survey_n_obs) > 0.9
secondary_nums = [num if keep else np.nan
                  for num, keep in zip(secondary_candidates, has_secondary)]

# Main poverty outcomes
ultra_poor = (np.random.rand(survey_n_obs) > .75).astype('int')  # 1 for about a quarter of rows
log_expend = np.random.normal(5, 1, survey_n_obs)
asset_index = np.random.normal(0, 1, survey_n_obs)

# Other outcomes
num_phones = np.random.randint(0, 4, survey_n_obs)   # integers 0..3
fsec_index = np.random.normal(0, 1, survey_n_obs)
fin_index = np.random.normal(0, 1, survey_n_obs)
psych_index = np.random.normal(0, 1, survey_n_obs)
cwr_group = np.random.randint(1, 6, survey_n_obs)    # integers 1..5

# Weights: uniform on [0, 6) for ultra-poor rows, uniform on [0, 1) otherwise
weight_draws = np.random.rand(survey_n_obs).tolist()
weight = [draw * 6 if up == 1 else draw for up, draw in zip(ultra_poor, weight_draws)]

# Combine survey data together
survey_data = {
    'phone_number': primary_nums,
    'secondary_phone_number': secondary_nums,
    'ultra_poor': ultra_poor,
    'log_expend': log_expend,
    'asset_index': asset_index,
    'num_phones': num_phones,
    'fsec_index': fsec_index,
    'fin_index': fin_index,
    'psych_index': psych_index,
    'cwr_group': cwr_group,
    'weight': weight,
}
# Start from object columns, then cast everything except the two phone-number
# columns to float (so the integer-valued outcomes are stored as floats too).
survey = pd.DataFrame(survey_data, dtype=object)
numeric_cols = [name for name in survey.columns if 'phone_number' not in name]
survey = survey.astype({name: 'float' for name in numeric_cols})
survey.head()

Unnamed: 0,phone_number,secondary_phone_number,ultra_poor,log_expend,asset_index,num_phones,fsec_index,fin_index,psych_index,cwr_group,weight
0,Y0CQ65ZT4W,,0.0,3.895909,-0.512181,3.0,2.088159,-0.492326,-0.869939,2.0,0.973121
1,N6ISIGQ8JT,,0.0,3.061541,0.278683,2.0,-0.31637,-0.528452,-0.023731,4.0,0.422905
2,GEV49GW1UN,,1.0,5.901897,2.327715,3.0,-0.099634,0.732697,-1.123836,2.0,3.165034
3,9427QD9AFZ,,1.0,6.635481,1.732464,0.0,-0.041353,-0.632044,0.137734,4.0,5.755008
4,A5VPUEMOPJ,,1.0,5.528926,-0.135506,3.0,-1.417507,-0.136175,0.244179,2.0,0.465863


In [3]:
#####################
#### Phone Data #####
#####################

# Random seed
# Re-seed both generators so the draws in this cell are reproducible on their own.
random.seed(0)
np.random.seed(0)
n_phone_obs = 500
n_individual_obs = 500

# Phone numbers (obtained from survey)
# Primary numbers from one random draw of survey rows, plus the non-missing
# secondary numbers from a second draw that uses a different sampling seed.
phone_numbers = list(survey.sample(n=n_phone_obs, random_state=12)['phone_number'])
phone_numbers += list(survey.sample(n=n_individual_obs, random_state=13)['secondary_phone_number'].dropna())

# Features
# The first listed column name is skipped -- presumably it is the identifier
# column; TODO confirm against phone_column_names.csv.
phone_columns = list(pd.read_csv('phone_column_names.csv')['column_name'])[1:]

# Collect every column first and build the frame in a single call. Inserting
# columns one at a time into an existing DataFrame fragments its internal
# storage and makes pandas emit a PerformanceWarning on wide frames.
n_numbers = len(phone_numbers)
feature_data = {'phone_number': phone_numbers}
for col in phone_columns:
    # Draw order is kept as: threshold, then the normal values, then one
    # uniform draw per row deciding which values are blanked out.
    nan_thresh = np.random.rand()
    values = np.random.normal(0, 1, n_numbers)
    is_missing = np.random.rand(n_numbers) > nan_thresh
    feature_data[col] = np.where(is_missing, np.nan, values)
phone_features = pd.DataFrame(feature_data)
phone_features.head()

Unnamed: 0,phone_number,reporting__number_of_records,active_days__allweek__allday__callandtext,active_days__allweek__day__callandtext,active_days__allweek__night__callandtext,active_days__weekday__allday__callandtext,active_days__weekday__day__callandtext,active_days__weekday__night__callandtext,active_days__weekend__allday__callandtext,active_days__weekend__day__callandtext,...,number_of_recharges__allweek__allday,number_of_recharges__allweek__day,number_of_recharges__allweek__night,number_of_recharges__weekday__allday,number_of_recharges__weekday__day,number_of_recharges__weekday__night,number_of_recharges__weekend__allday,number_of_recharges__weekend__day,number_of_recharges__weekend__night,average_balance_recharges
0,PNQP8X3PVJ,0.741592,,,1.998486,,,,1.342904,0.082433,...,,,0.434496,,0.558207,-0.537678,,-0.242691,-1.924758,-0.208682
1,IGZ2DGL243,1.552914,-0.126128,1.366759,-0.54618,,,,-0.040766,-0.914141,...,,,0.830496,-0.26656,0.746483,,-0.555859,0.557882,,
2,B0X3L0SM9R,-2.268328,-0.248587,,0.703177,0.712944,,,0.557105,0.948765,...,,-0.826722,-1.093254,0.589975,-1.313714,-0.542383,-0.755803,-0.294403,,-0.697436
3,H4R7QW1UJ0,1.333545,,0.361343,-1.154514,-0.347047,,,,,...,,,,0.562262,,1.245296,-2.136482,0.668161,,
4,9RYBACLOMD,-0.842724,,-0.249852,-0.128004,-0.36142,,,,1.93808,...,,,0.10097,0.506896,,-2.297741,,,,1.196654
