In [45]:
import numpy as np
import pandas as pd
import string
import random
import geopandas as gpd

In [46]:
# Mobile phone features for 2018 survey
# Seed BOTH generators: the feature values are drawn with numpy, but the phone
# numbers are drawn with the stdlib `random` module, which np.random.seed()
# does not affect.
np.random.seed(100)
random.seed(100)
n = 15  # characters per phone number (also read by the 2018 survey cell)
num_subscribers = 100
num_features = 100
alphabet = string.ascii_uppercase + string.digits
names = [''.join(random.choice(alphabet) for _ in range(n)) for _ in range(num_subscribers)]

# Collect every column first and build the frame once, rather than inserting
# 100 columns one at a time into an existing DataFrame.
columns = {'phone_number': names}
for feature_idx in range(num_features):
    # Uniform noise in [0, 1) scaled by a random integer drawn from [0, 1000).
    columns['feature' + str(feature_idx)] = np.random.rand(num_subscribers)*np.random.randint(0, 1000)
df = pd.DataFrame(columns)
df.to_csv('features2018.csv', index=False)

In [48]:
# 2018 Survey Data
# The extra phone numbers below are drawn with the stdlib `random` module, so
# it is seeded alongside numpy to keep the generated file reproducible.
np.random.seed(11)
random.seed(11)
features = pd.read_csv('features2018.csv')
regions = gpd.read_file('shapefiles/regions.geojson')
prefectures = gpd.read_file('shapefiles/prefectures.geojson')
cantons = gpd.read_file('shapefiles/cantons.geojson')

# Basic survey information
labels = features.copy()
labels['consumption'] = 3*labels['feature1'] + 5*labels['feature6'] + \
    np.random.rand(len(labels))*1000

# 40 extra survey rows that are not taken from the features file: 20 with a
# missing phone number and 20 with freshly generated phone numbers.
# `n` is the phone-number length assigned in the 2018 features cell.
alphabet = string.ascii_uppercase + string.digits
extra_phone_numbers = [np.nan for _ in range(20)] + \
    [''.join(random.choice(alphabet) for _ in range(n)) for _ in range(20)]
extra_consumption = np.random.rand(len(extra_phone_numbers))*labels['consumption'].max()
# Build the frame column-wise so 'consumption' stays numeric; stacking the two
# lists as rows and transposing would turn every column into object dtype.
extra = pd.DataFrame({'phone_number': extra_phone_numbers,
                      'consumption': extra_consumption})

# ignore_index gives the combined frame a clean 0..N-1 index instead of
# repeating the labels 0..39 from both inputs.
labels = pd.concat([labels[['phone_number', 'consumption']], extra], ignore_index=True)
labels['uid'] = range(len(labels))
labels['weight'] = np.random.rand(len(labels))*5 + 1

# Asset and demographic questions
n_rows = len(labels)

# Binary: 20 indicator columns, each 1 where a uniform draw exceeds 0.5
for idx in range(20):
    coin_flips = np.random.rand(n_rows) > .5
    labels['bin{}'.format(idx)] = coin_flips.astype('int')
# Categorical: 10 columns, each with its own randomly chosen number of groups
for idx in range(10):
    num_categories = np.random.randint(2, 10)
    category_names = ['group{}'.format(k) for k in range(num_categories)]
    labels['cat{}'.format(idx)] = np.random.choice(category_names, n_rows)
# Continuous: 20 columns of uniform noise scaled by a random integer
for idx in range(20):
    max_val = np.random.randint(1, 10)
    labels['cont{}'.format(idx)] = np.random.rand(n_rows)*max_val

# PMT, asset index, and PPI
# Each proxy is consumption plus uniform noise scaled by max consumption
# divided by a per-proxy constant.
max_consumption = labels['consumption'].max()
noise_divisors = [('pmt', 4), ('rural_pmt', 3.8), ('ppi', 3), ('assetindex', 2)]
for proxy_name, divisor in noise_divisors:
    noise = np.random.rand(n_rows)*max_consumption/divisor
    labels[proxy_name] = labels['consumption'] + noise

# Location
is_rural = np.random.rand(n_rows) > .5
labels['milieu'] = ['rural' if flag else 'urban' for flag in is_rural]
# (target column, source GeoDataFrame, source column, sampling seed)
sampled_locations = [
    ('region', regions, 'region', 1),
    ('prefecture', prefectures, 'prefecture', 1),
    ('canton', cantons, 'canton', 1),
    ('voter_region', regions, 'region', 86),
    ('voter_prefecture', prefectures, 'prefecture', 235),
    ('voter_canton', cantons, 'canton', 81),
]
for target, shapes, source, seed in sampled_locations:
    drawn = shapes.sample(n_rows, replace=True, random_state=seed)
    labels[target] = drawn[source].values

# Occupation
formal_flags = np.random.rand(n_rows) > .6
labels['formal_occupation'] = formal_flags.astype('int')
# Integer score in 1..10
poverty_scores = (np.random.rand(n_rows)*10).astype('int')
labels['occupation_poverty'] = poverty_scores + 1

# Demographic characteristics
n_rows = len(labels)
labels['age'] = np.random.randint(18, 100, size=n_rows)
# NOTE(review): age_group is drawn independently of 'age', so the two columns
# can disagree for the same respondent - confirm this is intended.
labels['age_group'] = np.random.choice(['<30', '30-40', '40-50', '50-60', '60+'], n_rows)
labels['gender'] = np.random.choice(['M', 'F'], n_rows)
labels['ethnicity'] = np.random.choice(['Ewé', 'Kabyè', 'Moba', 'Kotokoli', 'Other'], n_rows)
labels['religion'] = np.random.choice(['Christian', 'Muslim', 'Animist', 'Other/No Religion'], n_rows)
labels['children'] = np.random.choice(['0', '1-2', '3-4', '5+'], n_rows)
# The size argument is required here: without it np.random.choice returns a
# single value that pandas broadcasts to every row.
labels['disability'] = np.random.choice(['Yes', 'No'], n_rows)
labels['marital_status'] = np.random.choice(['Single', 'Marriage (Monogomous)', 'Marriage (Polygomous)',
                                            'Widow/Widower', 'Divorced/Separated'], n_rows)
# Gender is coded 'M'/'F' above, so the female clause must compare against 'F'.
labels['any_vulnerability'] = ((labels['gender'] == 'F') |
                               (labels['marital_status'].isin(['Widow/Widower', 'Single'])) |
                               (labels['children'].isin(['5+'])) |
                               (labels['age_group'] == '60+')).astype('int')
labels['any_vulnerability'] = labels['any_vulnerability'].apply(lambda x: 'Yes' if x == 1 else 'No')

# Food security
labels['fsec'] = np.random.choice([0, 1, 2, 3, 4, 5, 6, 7], n_rows)

# Phone ownership
def phone_ownership_draw(age):
    """Return 1 or 0 from one uniform draw, with an ownership rate set by age band."""
    if age < 23:
        rate = 0.3
    elif age < 50:
        rate = 0.7
    else:
        rate = 0.4
    return int(np.random.rand() < rate)

labels['own_phone'] = labels['age'].apply(phone_ownership_draw)

# Write file
labels.to_csv('survey2018.csv', index=False)

In [49]:
# 2018 Individual survey data
np.random.seed(11)
n_obs = 10000

# One prefecture per individual, sampled with replacement from the shapefile
prefectures = gpd.read_file('shapefiles/prefectures.geojson')
sampled_prefectures = prefectures.sample(n_obs, replace=True, random_state=1)
prefectures_list = sampled_prefectures['prefecture'].values
survey_indiv = pd.DataFrame(prefectures_list, columns=['prefecture'])

num_people = len(survey_indiv)
survey_indiv['gender'] = np.random.choice(['M', 'F'], num_people)
survey_indiv['age'] = np.random.randint(18, 100, size=num_people)


def owns_phone(gender):
    """Return a boolean from one uniform draw: rate 0.8 for 'M', 0.5 otherwise."""
    threshold = 0.8 if gender == 'M' else 0.5
    return np.random.rand() < threshold


survey_indiv['own_phone'] = survey_indiv['gender'].apply(owns_phone)

# Write file
survey_indiv.to_csv('survey_indiv2018.csv', index=False)

In [51]:
# Single mobile phone feature
# Seed locally (arbitrary fixed value) so the output does not depend on how
# many random draws earlier cells happened to consume.
np.random.seed(12)
singlefeature = pd.read_csv('survey2018.csv')
# Size and scale the noise from the frame loaded here rather than from the
# `labels` variable of another cell, which the 2020 survey cell rebinds to a
# frame with a different length and no 'consumption' column.
noise = np.random.rand(len(singlefeature))*singlefeature['consumption'].max()
singlefeature['single_feature'] = singlefeature['consumption'] + noise
singlefeature = singlefeature[['phone_number', 'single_feature']]
singlefeature.to_csv('single_feature2018.csv', index=False)

In [52]:
# Inferred home locations
np.random.seed(13)
regions = gpd.read_file('shapefiles/regions.geojson')
prefectures = gpd.read_file('shapefiles/prefectures.geojson')
cantons = gpd.read_file('shapefiles/cantons.geojson')
homes = pd.read_csv('features2018.csv')[['phone_number']]

# One administrative unit per phone number at each level, sampled with
# replacement using a fixed seed per level.
num_homes = len(homes)
for level, shapes, seed in [('region', regions, 5),
                            ('prefecture', prefectures, 9),
                            ('canton', cantons, 14)]:
    drawn = shapes.sample(num_homes, replace=True, random_state=seed)
    homes[level] = drawn[level].values
homes.to_csv('inferred_home_locations2018.csv', index=False)

In [53]:
# Mobile phone features 2020
# Seed BOTH generators: the feature values are drawn with numpy, but the phone
# numbers are drawn with the stdlib `random` module, which np.random.seed()
# does not affect.
np.random.seed(500)
random.seed(500)
n = 15  # characters per phone number
num_subscribers = 300
num_features = 100
alphabet = string.ascii_uppercase + string.digits
names = [''.join(random.choice(alphabet) for _ in range(n)) for _ in range(num_subscribers)]

# Collect every column first and build the frame once, rather than inserting
# 100 columns one at a time into an existing DataFrame.
columns = {'phone_number': names}
for feature_idx in range(num_features):
    # Uniform noise in [0, 1) scaled by a random integer drawn from [0, 1000).
    columns['feature' + str(feature_idx)] = np.random.rand(num_subscribers)*np.random.randint(0, 1000)
df = pd.DataFrame(columns)
df.to_csv('features2020.csv', index=False)

In [55]:
# 2020 Survey Data
np.random.seed(11)
regions = gpd.read_file('shapefiles/regions.geojson')
prefectures = gpd.read_file('shapefiles/prefectures.geojson')
cantons = gpd.read_file('shapefiles/cantons.geojson')
features = pd.read_csv('features2020.csv')

# Basic survey information
# pmt is a weighted sum of three phone features plus uniform noise in [0, 1000)
num_respondents = len(features)
feature_signal = 2*features['feature1'] + 6*features['feature9'] + 3*features['feature14']
labels = features[['phone_number']].copy()
labels['pmt'] = feature_signal + np.random.rand(num_respondents)*1000
labels['uid'] = range(num_respondents)
# Survey weight: uniform in [1, 6)
labels['weight'] = np.random.rand(num_respondents)*5 + 1

# Asset and demographic questions
n_rows = len(labels)

# Binary: 20 indicator columns, each 1 where a uniform draw exceeds 0.5
for idx in range(20):
    coin_flips = np.random.rand(n_rows) > .5
    labels['bin{}'.format(idx)] = coin_flips.astype('int')
# Categorical: 10 columns, each with its own randomly chosen number of groups
for idx in range(10):
    num_categories = np.random.randint(2, 10)
    category_names = ['group{}'.format(k) for k in range(num_categories)]
    labels['cat{}'.format(idx)] = np.random.choice(category_names, n_rows)
# Continuous: 20 columns of uniform noise scaled by a random integer
for idx in range(20):
    max_val = np.random.randint(1, 10)
    labels['cont{}'.format(idx)] = np.random.rand(n_rows)*max_val

# Location
is_rural = np.random.rand(n_rows) > .5
labels['milieu'] = ['rural' if flag else 'urban' for flag in is_rural]
# (target column, source GeoDataFrame, source column, sampling seed)
sampled_locations = [
    ('region', regions, 'region', 1),
    ('prefecture', prefectures, 'prefecture', 1),
    ('canton', cantons, 'canton', 1),
    ('voter_region', regions, 'region', 86),
    ('voter_prefecture', prefectures, 'prefecture', 235),
    ('voter_canton', cantons, 'canton', 81),
]
for target, shapes, source, seed in sampled_locations:
    drawn = shapes.sample(n_rows, replace=True, random_state=seed)
    labels[target] = drawn[source].values

# Occupation
formal_flags = np.random.rand(n_rows) > .6
labels['formal_occupation'] = formal_flags.astype('int')
# Integer score in 1..10
poverty_scores = (np.random.rand(n_rows)*10).astype('int')
labels['occupation_poverty'] = poverty_scores + 1

# Add non-response observations
n_rows = len(labels)
labels['responded'] = (np.random.rand(n_rows) < 0.4).astype('int')
labels['draw_probability'] = np.random.rand(n_rows)
labels['response_probability'] = np.random.rand(n_rows)

# Blank out every survey answer for non-respondents. The identifier and
# sampling columns listed here are kept for all rows.
always_observed = ['phone_number', 'uid', 'responded', 'draw_probability', 'response_probability']
has_response = labels['responded'] != 0
for col in labels.columns:
    if col not in always_observed:
        # Vectorised: keep the value where the row responded, NaN elsewhere
        # (integer columns become float to hold the NaN).
        labels[col] = labels[col].where(has_response)

# Write to file
labels.to_csv('survey2020.csv', index=False)