# Notebook 1: Survey data manipulation
Replication code for:
- Figure S1
- Figure S3
- Figure S6
- Figure S7
- Figure S14
- Figure S15
- Table S7
- Table S8
- Table S9
- Table S10
- Table S11
- Table S16

In [None]:
import numpy as np
import sys
import os
import time
import json
import shutil
import random
from joblib import dump, load
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from scipy.stats import percentileofscore
import matplotlib.ticker as mtick
from multiprocessing import Pool
from sklearn.model_selection import train_test_split, cross_validate, KFold
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV, ElasticNet, Ridge
from sklearn.linear_model import lars_path
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model.base import _rescale_data
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, clear_output

from helpers import *

import warnings
warnings.filterwarnings('ignore')

### Figure S7

In [None]:
def pmt_feature_selection(df, outname, max_features=30):
    """Greedy forward selection of PMT predictors; writes one CSV per model.

    Continuous (``cont*``) columns are min-max scaled and categorical
    (``cat*``) columns are expanded into dummies. On a 75% training split,
    features are then added one at a time: at every step each remaining
    candidate is scored with the ``cv`` helper and the candidate with the
    highest second returned score is kept. This is done once with a linear
    regression ('LR') and once with a grid-searched random forest ('RF').

    Depends on names defined outside this function: the module-level list
    ``components`` (candidate variable names) and the helpers ``cv`` and
    ``inmultiple`` from ``helpers``.
    NOTE(review): ``cv`` is assumed to return (train score, test score) and
    ``inmultiple`` the list of columns belonging to the given variables --
    confirm against helpers.py.

    Parameters
    ----------
    df : pandas.DataFrame
        Holds every column in ``components`` plus 'consumption' and 'weight'.
        Continuous columns are rescaled in place, so pass a copy.
    outname : str
        Output stem; results go to 'outputs/survey/<outname>_<LR|RF>.csv'
        with one row per step (feature added, train score, test score).
    max_features : int, default 30
        Maximum number of selection steps; capped at ``len(components)``.
    """
    # Normalize continuous variables
    continuous_columns = [col for col in components if col[:4] == 'cont']
    for col in continuous_columns:
        lowest, highest = min(df[col]), max(df[col])
        df[col] = (df[col] - lowest)/(highest - lowest)

    # Convert categorical variables to dummies
    categorical_columns = [col for col in components if col[:3] == 'cat']
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True, prefix_sep='__')

    # Train/test split; only the training part is used for the selection below
    train, _ = train_test_split(df, test_size=.25, random_state=6, shuffle=True)

    predictors = train.drop(['consumption', 'weight'], axis=1)
    x = pd.DataFrame(predictors.values, columns=predictors.columns)
    y = train['consumption'].values
    weights = train['weight'].values

    # Define models
    rf = GridSearchCV(RandomForestRegressor(random_state=9),
                      param_grid={'max_depth': range(1, 8, 2), 'n_estimators': [50, 100]},
                      cv=3, n_jobs=5)
    lr = LinearRegression()

    # Never ask for more steps than there are candidates (argmax of an empty list raises)
    n_steps = min(max_features, len(components))

    # Stepwise forward selection of predictors
    # (the 'RF' run previously reused `lr`, so the forest was never evaluated)
    for (model, name) in [(lr, 'LR'), (rf, 'RF')]:
        used_features = []
        unused_features = list(components)
        train_scores, test_scores = [], []
        for i in range(n_steps):
            potential_model_test_scores, potential_model_train_scores = [], []
            for feature in unused_features:
                clear_output(wait=True)
                print('%i %s' % (i, feature))
                x_selected = x[inmultiple(used_features, x.columns) + inmultiple([feature], x.columns)]
                results = cv(model, x_selected, y, weights)
                potential_model_train_scores.append(results[0])
                potential_model_test_scores.append(results[1])
            best_idx = np.argmax(potential_model_test_scores)
            best_feature = unused_features[best_idx]
            used_features.append(best_feature)
            train_scores.append(potential_model_train_scores[best_idx])
            test_scores.append(potential_model_test_scores[best_idx])
            print(train_scores[-1], test_scores[-1])
            # Keep the candidate order stable so ties are broken the same way on every run
            unused_features = [f for f in unused_features if f != best_feature]
        pd.DataFrame([used_features, train_scores, test_scores]).T\
            .to_csv('outputs/survey/' + outname + '_' + name + '.csv', index=False)

In [None]:
# Full PMT: candidate predictors are all survey columns named cat*, bin* or cont*
survey = pd.read_csv('data/survey2018.csv')
components = [name for name in survey.columns if name.startswith(('cat', 'bin', 'cont'))]
model_columns = components + ['consumption', 'weight']
df = survey[model_columns].copy()
pmt_feature_selection(df, 'pmt_feature_selection')

In [None]:
# Rural-only PMT: same selection, restricted to households whose milieu is 'rural'
survey = pd.read_csv('data/survey2018.csv')
is_rural = survey['milieu'] == 'rural'
survey = survey[is_rural]
components = [name for name in survey.columns if name.startswith(('cat', 'bin', 'cont'))]
model_columns = components + ['consumption', 'weight']
df = survey[model_columns].copy()
pmt_feature_selection(df, 'pmt_feature_selection_rural')

In [None]:
sns.set(style='white', font_scale=2.3)

fig, ax = plt.subplots(1, 2, figsize=(20, 7))

# One panel per selection run: (results file, panel title, y-axis limits)
panels = [
    ('outputs/survey/pmt_feature_selection_LR.csv', 'Ridge Regression', (.2, .8)),
    ('outputs/survey/pmt_feature_selection_RF.csv', 'Random Forest', (.2, .9)),
]
# (score column, colour, legend label)
curves = [('train', 'mediumseagreen', 'Train'), ('test', 'indianred', 'Test')]

for axis, (path, title, y_limits) in zip(ax, panels):
    features = pd.read_csv(path)
    features.columns = ['feature', 'train', 'test']
    # x is the zero-based selection step, so the dashed line at 11 sits on the 12th feature added
    steps = range(len(features))
    for column, colour, label in curves:
        axis.plot(steps, features[column], color=colour, label=label)
    for column, colour, _ in curves:
        axis.scatter(steps, features[column], color=colour)
    axis.axvline(11, color='grey', dashes=[2, 2])
    axis.set_ylabel('r2 Score')
    axis.set_xlabel('Number of Features')
    simpleaxis(axis)
    axis.set_title(title)
    axis.set_xlim(0, 30)
    axis.set_ylim(*y_limits)

ax[1].legend(loc='lower right')
plt.tight_layout()
plt.show()

### Table S8

In [None]:
# Load the survey and keep the candidate predictors (cat*, bin*, cont*) plus outcome and weight
survey = pd.read_csv('data/survey2018.csv')
components = [name for name in survey.columns if name.startswith(('cat', 'bin', 'cont'))]
df = survey[components + ['consumption', 'weight']].copy()

# Min-max scale every continuous variable
continuous_columns = [name for name in components if name.startswith('cont')]
for name in continuous_columns:
    lowest, highest = min(df[name]), max(df[name])
    df[name] = (df[name] - lowest)/(highest - lowest)

# Expand categorical variables into dummies, dropping the first level of each
categorical_columns = [name for name in components if name.startswith('cat')]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True, prefix_sep='__')

In [None]:
# Take the first `number_vars` variables from the LR selection path; outcome is log consumption
number_vars = 12
selection_path = pd.read_csv('outputs/survey/pmt_feature_selection_LR.csv')
selection_path.columns = ['feature', 'train', 'test']
selected_vars = selection_path['feature'][:number_vars]
x = df[inmultiple(selected_vars, df.columns)]
y = np.log(df['consumption'])

# Ridge regression with built-in selection of the penalty
model = RidgeCV()

# Same seeded 10-fold split for the score and for the out-of-sample predictions
folds = KFold(n_splits=10, shuffle=True, random_state=12)
weighting = {'sample_weight': df['weight']}
fold_scores = cross_val_score(model, x, y, cv=folds, fit_params=weighting)
r2 = np.mean(fold_scores)
print('R2 score: %.2f' % r2)
df['pmt'] = cross_val_predict(model, x, y, cv=folds, fit_params=weighting)

# Refit on the full sample and save the coefficients, largest first, as 2-decimal strings
model.fit(x, y, sample_weight=df['weight'])
coefficient_rows = [list(x.columns), model.coef_]
coefficients = pd.DataFrame(coefficient_rows).T
coefficients.columns = ['Feature', 'Coefficient']
coefficients = coefficients.sort_values('Coefficient', ascending=False)
coefficients['Coefficient'] = coefficients['Coefficient'].apply(lambda value: '%.2f' % value)
coefficients.to_csv('outputs/survey/pmt_coefficients.csv', index=False)

### Table S9

In [None]:
# Load the survey, keep rural households, and select candidate predictors plus outcome and weight
survey = pd.read_csv('data/survey2018.csv')
is_rural = survey['milieu'] == 'rural'
survey = survey[is_rural]
components = [name for name in survey.columns if name.startswith(('cat', 'bin', 'cont'))]
df = survey[components + ['consumption', 'weight']].copy()

# Min-max scale every continuous variable
continuous_columns = [name for name in components if name.startswith('cont')]
for name in continuous_columns:
    lowest, highest = min(df[name]), max(df[name])
    df[name] = (df[name] - lowest)/(highest - lowest)

# Expand categorical variables into dummies, dropping the first level of each
categorical_columns = [name for name in components if name.startswith('cat')]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True, prefix_sep='__')

In [None]:
# Take the first `number_vars` variables from the rural LR selection path; outcome is log consumption
number_vars = 12
selection_path = pd.read_csv('outputs/survey/pmt_feature_selection_rural_LR.csv')
selection_path.columns = ['feature', 'train', 'test']
selected_vars = selection_path['feature'][:number_vars]
x = df[inmultiple(selected_vars, df.columns)]
y = np.log(df['consumption'])

# Ridge regression with built-in selection of the penalty
model = RidgeCV()

# Same seeded 10-fold split for the score and for the out-of-sample predictions
folds = KFold(n_splits=10, shuffle=True, random_state=12)
weighting = {'sample_weight': df['weight']}
fold_scores = cross_val_score(model, x, y, cv=folds, fit_params=weighting)
r2 = np.mean(fold_scores)
print('R2 score: %.2f' % r2)
df['pmt'] = cross_val_predict(model, x, y, cv=folds, fit_params=weighting)

# Refit on the full rural sample and save the coefficients, largest first, as 2-decimal strings
model.fit(x, y, sample_weight=df['weight'])
coefficient_rows = [list(x.columns), model.coef_]
coefficients = pd.DataFrame(coefficient_rows).T
coefficients.columns = ['Feature', 'Coefficient']
coefficients = coefficients.sort_values('Coefficient', ascending=False)
coefficients['Coefficient'] = coefficients['Coefficient'].apply(lambda value: '%.2f' % value)
coefficients.to_csv('outputs/survey/pmt_coefficients_rural.csv', index=False)

### Table S7

In [None]:
# Asset index: first principal component of the min-max-scaled bin* columns
survey = pd.read_csv('data/survey2018.csv')
components = [col for col in survey.columns if col[:3] == 'bin']
df = survey[components].copy()

pca = PCA(n_components=1)
scaler = MinMaxScaler()
assets = scaler.fit_transform(df)
# fit_transform returns an (n_rows, 1) array; take its single column
df['assetindex'] = pca.fit_transform(assets)[:, 0]
print('PCA variance explained: %.2f' % (100*pca.explained_variance_ratio_[0]) + '%')

# Pair each loading with the column it was fitted on. `df.columns` cannot be used here:
# it now also contains 'assetindex', which is one name more than there are loadings.
basis_vector = pd.DataFrame([components, pca.components_[0]]).T
basis_vector.columns = ['Asset', 'Magnitude']
basis_vector = basis_vector.sort_values('Magnitude', ascending=False)
basis_vector.to_csv('outputs/survey/asset_index_basis_vector.csv', index=False)

### Table S10

In [None]:
# Per occupation category: mean consumption, number of non-null consumption values,
# and the category's share of total survey weight (in percent)
survey = pd.read_csv('data/survey2018.csv')
by_occupation = survey.groupby('occupation_poverty', as_index=False)
# Aggregate only the columns that are needed: averaging or summing the whole frame also
# touches string columns (e.g. 'milieu', 'gender'), which raises on pandas >= 2.0
means = by_occupation[['consumption']].mean()
counts = by_occupation[['consumption']].count()\
    .rename({'consumption':'count'}, axis=1)
sums = by_occupation[['weight']].sum()\
    .rename({'weight':'total_weight'}, axis=1)
occupations = means[['occupation_poverty', 'consumption']]\
    .merge(counts[['occupation_poverty', 'count']])\
    .merge(sums[['occupation_poverty', 'total_weight']])
occupations = occupations.rename({'occupation_poverty':'occupation_category'}, axis=1)
occupations['Proportion'] = 100*occupations['total_weight']/occupations['total_weight'].sum()
occupations = occupations.drop('total_weight', axis=1).round(2)
occupations

### Figure S3

In [None]:
# Consumption densities, split by the formal_occupation flag (0 vs 1)
survey = pd.read_csv('data/survey2018.csv')
fig, ax = plt.subplots(1, figsize=(12, 7))

informal_rows = survey['formal_occupation'] == 0
formal_rows = survey['formal_occupation'] == 1
informal_consumption = survey[informal_rows]['consumption']
formal_consumption = survey[formal_rows]['consumption']

sns.kdeplot(informal_consumption, shade=True, label='Informal Occupation')
sns.kdeplot(formal_consumption, shade=False, color='indianred',
            dashes=[2, 2], label='Formal Occupation')

ax.set_xlabel('Consumption')
ax.set_ylabel('Density')
simpleaxis(ax)
plt.show()

### Table S16

In [None]:
# Compare locations between sources; location columns are prefixed by their source
survey = pd.read_csv('data/survey2018.csv')\
    .rename({'prefecture':'survey_prefecture', 'canton':'survey_canton'}, axis=1)
home_locations = pd.read_csv('data/inferred_home_locations2018.csv')\
    .rename({'region':'phone_region', 'prefecture':'phone_prefecture', 'canton':'phone_canton'}, axis=1)
df = survey.merge(home_locations, on='phone_number', how='inner')
# Repeat each row `weight` times so plain row shares become weighted shares
df = pd.DataFrame(np.repeat(df.values, df['weight'], axis=0), columns=df.columns)

for spatial in ['prefecture', 'canton']:
    for combo in [('survey', 'phone'), ('survey', 'voter'), ('phone', 'voter')]:
        matches = df[combo[0] + '_' + spatial] == df[combo[1] + '_' + spatial]
        # Scale the share of matching rows to a percentage: it is printed with a '%' sign
        percent_matching = 100*matches.mean()
        print(spatial, combo[0] + '<-->' + combo[1], ('%.2f' % percent_matching) + '%')

### Table S11

In [None]:
# Check phone matching
survey = pd.read_csv('data/survey2018.csv')
# phone: 1 when the survey row has a phone number recorded, else 0
survey['phone'] = survey['phone_number'].notnull().astype(int)
phone = pd.read_csv('outputs/ml/consumption/LGBM/oos_predictions.csv')
# Build the lookup set once instead of rebuilding it for every survey row
matched_numbers = set(phone['phone_number'])
# matched: 1 when the survey phone number appears in the prediction file, else 0
survey['matched'] = survey['phone_number'].apply(lambda x: 1 if x in matched_numbers else 0)

# Transform variables for aggregation
survey['rural'] = (survey['milieu'] == 'rural').astype(int)
survey['female'] = (survey['gender'] == 'F').astype(int)

In [None]:
# Unweighted means of each characteristic: overall, by phone flag, and (among rows with
# a phone number) by whether the number was matched
cols = ['consumption', 'pmt', 'formal_occupation', 'rural', 'female', 'age']

overall = pd.DataFrame(survey[cols].agg('mean'))
overall.columns = ['Overall']

# groupby orders the flag columns 0, 1, so label them by value rather than by position
# (positional labels put 'Phone Number' on the phone == 0 column)
by_phone = survey.groupby('phone')[cols].agg('mean').T.round(2)
by_phone = by_phone.rename(columns={1: 'Phone Number', 0: 'No Phone Number'})

have_phone = survey[survey['phone'] == 1].copy()
by_match = have_phone.groupby('matched')[cols].agg('mean').T.round(2)
by_match = by_match.rename(columns={1: 'Matching', 0: 'Not Matching'})

table = pd.DataFrame()
table['Overall'] = overall['Overall']
table['Phone Number'] = by_phone['Phone Number']
table['No Phone Number'] = by_phone['No Phone Number']
table['Matching'] = by_match['Matching']
table['Not Matching'] = by_match['Not Matching']
table

### Figure S1

In [None]:
# Weighted distribution of `fsec`: each survey row is repeated `weight` times, then the
# share of rows (non-null uid) at each fsec value is expressed in percent
survey = pd.read_csv('data/survey2018.csv')
replicated_rows = np.repeat(survey.values, survey['weight'], axis=0)
df = pd.DataFrame(replicated_rows, columns=survey.columns)
group_counts = df.groupby('fsec', as_index=False).agg('count')
fsec = group_counts[['fsec', 'uid']].rename({'uid':'count'}, axis=1)
fsec['count'] = 100*fsec['count']/fsec['count'].sum()

sns.set(font_scale=1.5, style='white')
fig, ax = plt.subplots(1, figsize=(10, 5))
barlist = ax.bar(fsec['fsec'], fsec['count'])

# One colour per bar, in bar order
colors = ['darkgreen', 'forestgreen', 'limegreen', 'lightgreen', 'darkkhaki', 'wheat', 'navajowhite',
          'burlywood']
for position, bar in enumerate(barlist):
    bar.set_color(colors[position])

# Write each percentage just above its bar
for value, share in zip(fsec['fsec'].values, fsec['count'].values):
    ax.annotate(share.round(2), (value-0.5, share + 1))

ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_ylim(0, 20)
ax.set_xlabel('Days')

simpleaxis(ax)
plt.show()

### Figure S6

In [None]:
# Panel A: individual phone ownership by age and gender
survey_indiv = pd.read_csv('data/survey_indiv2018.csv')

sns.set(style='white', font_scale=1.5)
fig, ax = plt.subplots(1, figsize=(10, 5))

ages = sorted(survey_indiv['age'].unique())

# Background band: number of individuals at each age, scaled so the largest count is 1
counts = []
for age in ages:
    counts.append(len(survey_indiv[survey_indiv['age'] == age]))
ax.fill_between(ages, 0, np.array(counts)/max(counts), color='wheat')

for gender, color in [('M', 'blue'), ('F', 'indianred')]:
    subset = survey_indiv[survey_indiv['gender'] == gender].copy()
    means, stds = [], []
    for age in ages:
        own_phone = subset[subset['age'] == age]['own_phone']
        means.append(own_phone.mean())
        # Spread of ownership at this age (this list was previously filled with the mean)
        stds.append(own_phone.std())
    ax.scatter(ages, means, color=color)
    # Draw mean +/- std bars in data units with vlines; axvline reads its limits as
    # fractions of the axes height. Ages without a defined mean/std get no bar.
    centres, spreads = np.array(means, dtype=float), np.array(stds, dtype=float)
    drawable = np.isfinite(centres) & np.isfinite(spreads)
    ax.vlines(np.array(ages)[drawable], (centres - spreads)[drawable],
              (centres + spreads)[drawable], colors=color)

ax.set_title('Individual Phone Ownership')
ax.set_xlabel('Age')
ax.set_ylabel('Proportion of Individuals with 1+ Phones')
simpleaxis(ax)
plt.show()

In [None]:
# Panel B: phone ownership by age in the household-level survey file
survey = pd.read_csv('data/survey2018.csv')

means, stds, counts = [], [], []
ages = sorted(survey['age'].unique())
for age in ages:
    at_age = survey[survey['age'] == age]
    means.append(at_age['own_phone'].mean())
    # Spread of ownership at this age (this list was previously filled with the mean)
    stds.append(at_age['own_phone'].std())
    counts.append(len(at_age))

sns.set(style='white', font_scale=1.5)
fig, ax = plt.subplots(1, figsize=(10, 5))
# Background band: number of rows at each age, scaled so the largest count is 1
ax.fill_between(ages, 0, np.array(counts)/max(counts), color='wheat')
ax.scatter(ages, means, color='black')
# Draw mean +/- std bars in data units with vlines; axvline reads its limits as
# fractions of the axes height. Ages without a defined mean/std get no bar.
centres, spreads = np.array(means, dtype=float), np.array(stds, dtype=float)
drawable = np.isfinite(centres) & np.isfinite(spreads)
ax.vlines(np.array(ages)[drawable], (centres - spreads)[drawable],
          (centres + spreads)[drawable], colors='black')

ax.set_title('Household Head Phone Ownership')
ax.set_xlabel('Age')
ax.set_ylabel('Proportion of HH with 1+ Phones')
simpleaxis(ax)
plt.show()

In [None]:
# Panel C: map of mean individual phone ownership by prefecture
prefectures = gpd.read_file('data/shapefiles/prefectures.geojson')
survey_indiv = pd.read_csv('data/survey_indiv2018.csv')
# Average only own_phone: averaging the whole frame also touches string columns
# (e.g. 'gender'), which raises on pandas >= 2.0
ownership = survey_indiv.groupby('prefecture', as_index=False)[['own_phone']].mean()
ownership = prefectures.merge(ownership, on='prefecture', how='inner')
# Express as a percentage and bucket into 10 bins for a discrete legend
ownership['own_phone'] = ownership['own_phone']*100
ownership['own_phone'] = pd.cut(ownership['own_phone'], 10)

sns.reset_orig()
fig, ax = plt.subplots(1, figsize=(10, 12))
ownership.plot(ax=ax, column='own_phone', cmap='Reds', legend=True, edgecolor='lightgrey', linewidth=1)
ax.axis('off')
ax.set_title('Individual Phone Ownership by Prefecture', fontsize='xx-large')
plt.show()

### Figure S14

In [None]:
# Densities of draw probability, response probability and weight among rows with responded == 1
survey = pd.read_csv('data/survey2020.csv')
responded_rows = survey['responded'] == 1
survey = survey[responded_rows]

sns.set(style='white', font_scale=1.5)
fig, ax = plt.subplots(1, 3, figsize=(20, 5))

# (column, x-axis label, panel title, fill colour) for the three panels, left to right
panels = [
    ('draw_probability', 'Draw Probability', 'Draw Probabilities', 'indianred'),
    ('response_probability', 'Response Probability', 'Response Probabilities', 'orange'),
    ('weight', 'Survey Weight', 'Survey Weights', 'mediumseagreen'),
]
for axis, (column, x_label, title, colour) in zip(ax, panels):
    sns.kdeplot(survey[column], ax=axis, shade=True, label='', color=colour)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Density')
    axis.set_title(title)
    simpleaxis(axis)

plt.show()

### Figure S15

In [None]:
# Calibration check: within 20 quantile bins of response_probability, compare the
# bin's mean response_probability (x) with its mean of the responded flag (y)
fig, ax = plt.subplots(1, figsize=(10, 6))

survey = pd.read_csv('data/survey2020.csv')
survey['response_bin'] = pd.qcut(survey['response_probability'], 20)
survey = survey.groupby('response_bin', as_index=False).agg('mean')

ax.scatter(survey['response_probability'], survey['responded'], s=100)
# 45-degree reference line
grid = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
ax.plot(grid, grid, color='lightgrey', dashes=[2, 2])

# Labels follow the plotted data: x is the response_probability column, y is the share
# of rows with responded set (the two labels were previously the other way round).
# NOTE(review): assumes response_probability is the modelled probability -- confirm.
ax.set_xlabel('Predicted Response Probability (20 Bins)')
ax.set_ylabel('True Response Probability (20 Bins)')
simpleaxis(ax)
plt.tight_layout()
plt.show()