# Notebook 4: Targeting Simulations
Replication code for:
- Figure 1 and Table 1
- Figure 2
- Figure S2
- Figure S12
- Figure S13
- Table S1
- Table S2
- Table S3
- Table S5
- Table S13
- Table S14
- Table S15

In [None]:
import numpy as np
import sys
import os
import time
import json
import shutil
import random
from joblib import dump, load
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from scipy.stats import percentileofscore
import matplotlib.ticker as mtick

from helpers import *

import warnings
# Suppress every warning for the rest of the notebook. This keeps cell output
# compact but also hides deprecation and runtime warnings from all libraries.
warnings.filterwarnings('ignore')

# Number of bootstrap samples; passed to get_bootstraps() in the cells below.
n_bootstraps = 10 # Change to 1000 for real implementation, keep at 10 for speed

In [None]:
# Build the analysis table: start from the 2018 survey and left-join every
# targeting signal onto it, so households without a given signal keep NaN.
survey = pd.read_csv('data/survey2018.csv')

# Poverty maps, joined on the household's prefecture and canton
prefectures = gpd.read_file('data/shapefiles/prefectures.geojson')
prefectures = prefectures.rename(columns={'poverty': 'prefecture_poverty'})
prefectures = prefectures[['prefecture', 'prefecture_poverty']]
survey = survey.merge(prefectures, on='prefecture', how='left')

cantons = gpd.read_file('data/shapefiles/cantons.geojson')
cantons = cantons.rename(columns={'poverty': 'canton_poverty'})
cantons = cantons[['canton', 'canton_poverty']]
survey = survey.merge(cantons, on='canton', how='left')

# Phone-based poverty predictions, one survey column per prediction file:
# (csv path, column holding the prediction, name of the new survey column).
# The first two entries read the same file, so 'phone_poverty' and
# 'phone_poverty_old_model_old_data' hold identical values.
prediction_sources = [
    ('outputs/ml/consumption/LGBM/oos_predictions.csv',
     'predicted', 'phone_poverty'),
    ('outputs/ml/consumption/LGBM/oos_predictions.csv',
     'predicted', 'phone_poverty_old_model_old_data'),
    ('outputs/ml/consumption/temporality/old_model_new_data.csv',
     'prediction', 'phone_poverty_old_model_new_data'),
]
for path, source_col, target_col in prediction_sources:
    cdr = pd.read_csv(path)[['phone_number', source_col]]
    cdr = cdr.rename(columns={source_col: target_col})
    survey = survey.merge(cdr, on='phone_number', how='left')

# Phone-based single feature
single_feature = pd.read_csv('data/single_feature2018.csv')
survey = survey.merge(single_feature, on='phone_number', how='left')

# Phone-inferred home location, with the poverty-map values of that location.
# Columns are prefixed with 'phone_' so they do not clash with the
# survey-reported prefecture/canton columns merged above.
homes = pd.read_csv('data/inferred_home_locations2018.csv').drop(columns='region')
homes = homes.merge(prefectures, on='prefecture', how='left')
homes = homes.merge(cantons, on='canton', how='left')
homes = homes.rename(columns={
    'prefecture': 'phone_prefecture',
    'canton': 'phone_canton',
    'prefecture_poverty': 'phone_prefecture_poverty',
    'canton_poverty': 'phone_canton_poverty',
})
survey = survey.merge(homes, on='phone_number', how='left')

# Uniform random score, seeded so the 'random' targeting baseline is reproducible
np.random.seed(0)
survey['random'] = np.random.rand(len(survey))

### Figure S2 panel b

In [None]:
# Figure S2b: (left) ground-truth vs. phone-predicted consumption, split into
# inclusion/exclusion quadrants at the 29th percentile of each axis;
# (right) density of predicted consumption in eligible cantons vs. everywhere.
sns.set(font_scale=1.5, style='white')

percent_targeted = 29
df = survey[['phone_poverty', 'consumption', 'canton', 'weight']].dropna()
# Repeat each row according to its survey weight so the percentiles are weighted
df_repeat = pd.DataFrame(np.repeat(df.values, df['weight'], axis=0), columns=df.columns)

cons_threshold = np.percentile(df_repeat['consumption'], percent_targeted)
phone_threshold = np.percentile(df_repeat['phone_poverty'], percent_targeted)

fig, ax = plt.subplots(1, 2, figsize=(20, 6))
scatter_ax, density_ax = ax

# Quadrants: (c)orrect / (i)ncorrect x (i)nclusion / (e)xclusion.
# Rows with missing values were dropped above, so negating a mask is the same
# as the strict '>' comparison.
below_cons = df['consumption'] <= cons_threshold
below_phone = df['phone_poverty'] <= phone_threshold
ci = df[below_cons & below_phone]
ce = df[~below_cons & ~below_phone]
ii = df[~below_cons & below_phone]
ie = df[below_cons & ~below_phone]

# Green = below the consumption threshold, red = above it;
# faded points are those above the phone-score threshold.
quadrants = [
    (ci, 'mediumseagreen', None),
    (ie, 'mediumseagreen', 0.3),
    (ii, 'indianred', None),
    (ce, 'indianred', 0.3),
]
for subset, color, alpha in quadrants:
    scatter_ax.scatter(subset['consumption'], subset['phone_poverty'], color=color, alpha=alpha, s=100)
scatter_ax.axhline(phone_threshold, color='black', dashes=[7, 7])
scatter_ax.axvline(cons_threshold, color='black', dashes=[2, 2])
scatter_ax.set_xlabel('Ground Truth Consumption')
scatter_ax.set_ylabel('Predicted Consumption')
simpleaxis(scatter_ax)

eligible_cantons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 19]
in_eligible_canton = df['canton'].isin(eligible_cantons)
sns.kdeplot(df[in_eligible_canton]['phone_poverty'], ax=density_ax, shade=True,
            label='Eligible Cantons')
sns.kdeplot(df['phone_poverty'], ax=density_ax, shade=True, label='All Togo')
density_ax.set_xlabel('Predicted Consumption')
density_ax.set_ylabel('Density')
density_ax.axvline(phone_threshold, dashes=[7, 7], color='black')
simpleaxis(density_ax)

### Table 1, Figure 1, Table S1, Table S2

In [None]:
# Targeting methods compared in Table 1, evaluated against consumption
outcome = 'consumption'
targeting_methods = [
    'prefecture_poverty', 'canton_poverty', 'single_feature', 'phone_poverty', 'assetindex',
    'ppi', 'pmt', 'random', 'formal_occupation', 'occupation_poverty',
]

# Keep only households observed for every targeting method and the outcome
complete = survey.dropna(subset=targeting_methods + [outcome]).copy()
# Rescale so the smallest weight equals 1
complete['weight'] = complete['weight'] / complete['weight'].min()

# Bootstraps are drawn from the un-expanded sample; df is the sample with each
# row repeated according to its weight.
bootstraps = get_bootstraps(complete, n_bootstraps)
df = pd.DataFrame(np.repeat(complete.values, complete['weight'], axis=0), columns=complete.columns)

In [None]:
# Table 1: target the poorest 29%, with the same 29% cutoff on the outcome
percent_targeted = 29
fname = 'outputs/targeting/table1'

# Arguments shared by the point estimate and the bootstrap run
scenario = (outcome, targeting_methods, targeting_methods, percent_targeted, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(f'{fname}.csv', index=False)
t_std.to_csv(f'{fname}_bootstrap.csv')
t

In [None]:
# Targeting those below the poverty line (poverty_line = 1800)
fname = 'outputs/targeting/tables2'
percent_targeted = 29
poverty_line = 1800
# Percentile rank of the poverty line within the weight-expanded outcome column
poverty_threshold = percentileofscore(df[outcome], poverty_line)

# Fix: the bootstrap run previously received percent_targeted in the slot where
# table() receives poverty_threshold, so the bootstrap spread described a
# different scenario than the point estimates. Both now share one argument tuple.
scenario = (outcome, targeting_methods, targeting_methods, poverty_threshold, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(fname + '.csv', index=False)
t_std.to_csv(fname + '_bootstrap.csv')
t

In [None]:
# Targeting those below the lower poverty line (poverty_line = 1000)
# NOTE(review): the rural-only "Table S3" cell further down also writes to
# 'outputs/targeting/tables3' and overwrites these files when the notebook is
# run top to bottom -- confirm the intended file name for this table.
fname = 'outputs/targeting/tables3'
percent_targeted = 29
poverty_line = 1000
# Percentile rank of the poverty line within the weight-expanded outcome column
poverty_threshold = percentileofscore(df[outcome], poverty_line)

# Fix: the bootstrap run previously received percent_targeted in the slot where
# table() receives poverty_threshold, so the bootstrap spread described a
# different scenario than the point estimates. Both now share one argument tuple.
scenario = (outcome, targeting_methods, targeting_methods, poverty_threshold, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(fname + '.csv', index=False)
t_std.to_csv(fname + '_bootstrap.csv')
t

### Table S13

In [None]:
# Table S13 setup: the PMT score is the ground truth, so 'pmt' is not a targeting method here
outcome = 'pmt'
targeting_methods = [
    'prefecture_poverty', 'canton_poverty', 'single_feature', 'phone_poverty', 'assetindex',
    'ppi', 'random', 'formal_occupation', 'occupation_poverty',
]

# Keep only households observed for every targeting method and the outcome
complete = survey.dropna(subset=targeting_methods + [outcome]).copy()
# Rescale so the smallest weight equals 1
complete['weight'] = complete['weight'] / complete['weight'].min()

# Bootstraps come from the un-expanded sample; df repeats each row by its weight
bootstraps = get_bootstraps(complete, n_bootstraps)
df = pd.DataFrame(np.repeat(complete.values, complete['weight'], axis=0), columns=complete.columns)

In [None]:
# Table S13: target the poorest 29%, with the same 29% cutoff on the outcome
# NOTE(review): the other supplementary tables are saved as 'tablesNN'; this
# one is saved as 'table13' -- confirm that downstream code expects this name.
percent_targeted = 29
fname = 'outputs/targeting/table13'

# Arguments shared by the point estimate and the bootstrap run
scenario = (outcome, targeting_methods, targeting_methods, percent_targeted, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(f'{fname}.csv', index=False)
t_std.to_csv(f'{fname}_bootstrap.csv')
t

### Table S3

In [None]:
# In rural areas only, with consumption as ground truth
targeting_methods = ['prefecture_poverty', 'canton_poverty', 'single_feature', 'phone_poverty', 'assetindex',
                    'ppi', 'pmt', 'random', 'formal_occupation', 'occupation_poverty']
outcome = 'consumption'

# Fix: the rural filter used to be applied to the previous cell's df and was
# then immediately overwritten by a fresh survey.dropna(...), so this "rural
# only" table was computed on the full sample. Filter the survey itself first.
rural = survey[survey['milieu'] == 'rural']

# Drop observations missing the value of any targeting method or the outcome
df = rural.dropna(subset=targeting_methods + [outcome]).copy()
# Rescale so the smallest weight equals 1
df['weight'] = df['weight']/df['weight'].min()

# Bootstraps come from the un-expanded sample; df then repeats each row by its weight
bootstraps = get_bootstraps(df, n_bootstraps)
df = pd.DataFrame(np.repeat(df.values, df['weight'], axis=0), columns=df.columns)

In [None]:
# Table S3: target the poorest 29%, with the same 29% cutoff on the outcome
percent_targeted = 29
fname = 'outputs/targeting/tables3'

# Arguments shared by the point estimate and the bootstrap run
scenario = (outcome, targeting_methods, targeting_methods, percent_targeted, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(f'{fname}.csv', index=False)
t_std.to_csv(f'{fname}_bootstrap.csv')
t

### Table S14 

In [None]:
# In rural areas only, with rural-specific PMT as ground truth
targeting_methods = ['prefecture_poverty', 'canton_poverty', 'single_feature', 'phone_poverty', 'assetindex',
                    'ppi', 'random', 'formal_occupation', 'occupation_poverty']
outcome = 'rural_pmt'

# Fix: the rural filter used to be applied to the previous cell's df and was
# then immediately overwritten by a fresh survey.dropna(...), so the filter
# had no effect. Filter the survey itself before dropping incomplete rows.
rural = survey[survey['milieu'] == 'rural']

# Drop observations missing the value of any targeting method or the outcome
df = rural.dropna(subset=targeting_methods + [outcome]).copy()
# Rescale so the smallest weight equals 1
df['weight'] = df['weight']/df['weight'].min()

# Bootstraps come from the un-expanded sample; df then repeats each row by its weight
bootstraps = get_bootstraps(df, n_bootstraps)
df = pd.DataFrame(np.repeat(df.values, df['weight'], axis=0), columns=df.columns)

In [None]:
# Table S14: target the poorest 29%, with the same 29% cutoff on the outcome
percent_targeted = 29
fname = 'outputs/targeting/tables14'

# Arguments shared by the point estimate and the bootstrap run
scenario = (outcome, targeting_methods, targeting_methods, percent_targeted, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(f'{fname}.csv', index=False)
t_std.to_csv(f'{fname}_bootstrap.csv')
t

### Table S15

In [None]:
# Table S15 setup: geographic targeting using survey-reported location vs.
# phone-inferred location, alongside the phone-based poverty prediction
outcome = 'consumption'
targeting_methods = [
    'prefecture_poverty', 'canton_poverty', 'phone_prefecture_poverty', 'phone_canton_poverty',
    'phone_poverty',
]

# Keep only households observed for every targeting method and the outcome
complete = survey.dropna(subset=targeting_methods + [outcome]).copy()
# Rescale so the smallest weight equals 1
complete['weight'] = complete['weight'] / complete['weight'].min()

# Bootstraps come from the un-expanded sample; df repeats each row by its weight
bootstraps = get_bootstraps(complete, n_bootstraps)
df = pd.DataFrame(np.repeat(complete.values, complete['weight'], axis=0), columns=complete.columns)

In [None]:
# Table S15: target the poorest 29%, with the same 29% cutoff on the outcome
percent_targeted = 29
fname = 'outputs/targeting/tables15'

# Arguments shared by the point estimate and the bootstrap run
scenario = (outcome, targeting_methods, targeting_methods, percent_targeted, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(f'{fname}.csv', index=False)
t_std.to_csv(f'{fname}_bootstrap.csv')
t

### Table S5

In [None]:
# Compare the current phone-based predictions with the "old model" predictions
# on old and new CDR, and with the two geographic methods
outcome = 'consumption'
targeting_methods = [
    'phone_poverty', 'phone_poverty_old_model_new_data', 'phone_poverty_old_model_old_data',
    'prefecture_poverty', 'canton_poverty',
]

# Keep only households observed for every targeting method and the outcome
complete = survey.dropna(subset=targeting_methods + [outcome]).copy()
# Rescale so the smallest weight equals 1
complete['weight'] = complete['weight'] / complete['weight'].min()

# Bootstraps come from the un-expanded sample; df repeats each row by its weight
bootstraps = get_bootstraps(complete, n_bootstraps)
df = pd.DataFrame(np.repeat(complete.values, complete['weight'], axis=0), columns=complete.columns)

In [None]:
# Panel a: target the poorest 29%, with the same 29% cutoff on the outcome
percent_targeted = 29
fname = 'outputs/targeting/tables5panela'

# Arguments shared by the point estimate and the bootstrap run
scenario = (outcome, targeting_methods, targeting_methods, percent_targeted, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(f'{fname}.csv', index=False)
t_std.to_csv(f'{fname}_bootstrap.csv')
t

In [None]:
# Panel b: targeting those below the poverty line (poverty_line = 1800)
fname = 'outputs/targeting/tables5panelb'
percent_targeted = 29
poverty_line = 1800
# Percentile rank of the poverty line within the weight-expanded outcome column
poverty_threshold = percentileofscore(df[outcome], poverty_line)

# Fix: the bootstrap run previously received percent_targeted in the slot where
# table() receives poverty_threshold, so the bootstrap spread described a
# different scenario than the point estimates. Both now share one argument tuple.
scenario = (outcome, targeting_methods, targeting_methods, poverty_threshold, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(fname + '.csv', index=False)
t_std.to_csv(fname + '_bootstrap.csv')
t

In [None]:
# Panel c: targeting those below the lower poverty line (poverty_line = 1000)
fname = 'outputs/targeting/tables5panelc'
percent_targeted = 29
poverty_line = 1000
# Percentile rank of the poverty line within the weight-expanded outcome column
poverty_threshold = percentileofscore(df[outcome], poverty_line)

# Fix: the bootstrap run previously received percent_targeted in the slot where
# table() receives poverty_threshold, so the bootstrap spread described a
# different scenario than the point estimates. Both now share one argument tuple.
scenario = (outcome, targeting_methods, targeting_methods, poverty_threshold, percent_targeted, True)
t = table((df,) + scenario)
t_std = std_table((bootstraps,) + scenario)

t.to_csv(fname + '.csv', index=False)
t_std.to_csv(fname + '_bootstrap.csv')
t

### Figure S12

In [None]:
# Plot configuration for Figure S12. The four lists are parallel: entry i of
# each list describes the same proxy (column, legend name, colour, dashed line).
outcome = 'consumption'
proxies = ['prefecture_poverty', 'canton_poverty', 'phone_poverty', 'assetindex', 'pmt']
proxynames = ['Prefecture', 'Canton', 'Phone+ML', 'Asset Index', 'PMT']
colors = ['indianred', 'darkorange', 'forestgreen', 'dodgerblue', 'darkblue']
dashes = [False, False, False, True, True]

# Keep only households observed for every proxy and the outcome
complete = survey.dropna(subset=proxies + [outcome]).copy()

# Account for survey weights by repeating each row according to its weight
# (weights are first rescaled so the smallest equals 1)
complete['weight'] = complete['weight'] / complete['weight'].min()
df = pd.DataFrame(np.repeat(complete.values, complete['weight'], axis=0), columns=complete.columns)

In [None]:
# ROC curve for each proxy against the outcome, with the AUC in the legend
sns.set(font_scale=2, style='white')
fig, ax = plt.subplots(1, figsize=(10, 8))

for proxy, name, color, dashed in zip(proxies, proxynames, colors, dashes):
    # The weight-expanded frame is built from .values, hence the explicit float casts
    fprs, tprs, auc_score = auc_overall(df[outcome].astype('float'), df[proxy].astype('float'))
    line_style = {'color': color, 'linewidth': 3}
    if dashed:
        line_style['dashes'] = [3, 3]
    # Rates are rescaled to percentages to match the axis formatters below
    ax.plot([100*fpr for fpr in fprs], [100*tpr for tpr in tprs],
            label=name + (' (AUC=%.2f)' % auc_score), **line_style)

# Diagonal reference line
ax.plot([0, 100], [0, 100], label='Random', dashes=[1, 2], color='grey')
ax.set_title('ROC Curves')
ax.legend(loc='best')
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
for axis in (ax.xaxis, ax.yaxis):
    axis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (Recall)')
simpleaxis(ax)
plt.tight_layout()
plt.show()

In [None]:
# Recall of each proxy as a function of the percentage of the population
# targeted; the same percentage is used for both threshold arguments of metrics().
sns.set(font_scale=2, style='white')
fig, ax = plt.subplots(1, figsize=(10, 8))

# Percentages at which each proxy is evaluated. This is the same for every
# proxy, so it is built once instead of once per loop iteration.
grid = np.linspace(1, 100, 99)[:-1]

for proxy, name, color, dashed in zip(proxies, proxynames, colors, dashes):
    # 'pct' replaces an inner variable named 'p' that shadowed the outer loop
    # index; that only worked because comprehensions have their own scope.
    metrics_grid = [metrics(df[outcome].astype('float'), df[proxy].astype('float'), pct, pct)
                    for pct in grid]
    # Element 2 of each metrics() result is the value plotted as recall
    recalls = [m[2]*100 for m in metrics_grid]
    line_style = {'color': color, 'linewidth': 3, 'label': name}
    if dashed:
        line_style['dashes'] = [3, 3]
    ax.plot(grid, recalls, **line_style)

# Diagonal reference line
ax.plot([0, 100], [0, 100], label='Random', dashes=[1, 2], color='grey')
ax.legend(loc='best')
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
ax.set_xlabel('Percentage of Population Targeted')
ax.set_ylabel('Precision and Recall')
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
simpleaxis(ax)
plt.tight_layout()
plt.show()

### Figure 2

In [None]:
# Plot configuration for Figure 2. The four lists are parallel: entry i of
# each list describes the same proxy (column, legend name, colour, dashed line).
outcome = 'consumption'
proxies = ['prefecture_poverty', 'canton_poverty', 'phone_poverty', 'assetindex', 'pmt']
proxynames = ['Prefecture', 'Canton', 'Phone+ML', 'Asset Index', 'PMT']
colors = ['indianred', 'darkorange', 'forestgreen', 'dodgerblue', 'darkblue']
dashes = [False, False, False, True, True]

# Keep only households observed for every proxy and the outcome
complete = survey.dropna(subset=proxies + [outcome]).copy()

# Account for survey weights by repeating each row according to its weight
# (weights are first rescaled so the smallest equals 1)
complete['weight'] = complete['weight'] / complete['weight'].min()
df = pd.DataFrame(np.repeat(complete.values, complete['weight'], axis=0), columns=complete.columns)

In [None]:
# Figure 2: utility as a function of the share of the population targeted,
# one curve per proxy, with a vertical line at each curve's utility maximum.
sns.set(font_scale=2, style='white')
fig, ax = plt.subplots(1, figsize=(20, 10))

budget = 100000
cashout_fee = 20

# get_crra returns a mapping: label -> (percent targeted, transfer sizes, utilities)
curves = get_crra(df, proxies, proxynames, 'consumption', budget, cashout_fee)

# Colours and dash styles are matched to curves by position, so the mapping is
# assumed to iterate in the same order as `proxies`.
for a, (label, curve) in enumerate(curves.items()):
    shares, transfersizes, utilities = curve
    line_style = {'label': label, 'color': colors[a], 'linewidth': 3}
    if dashes[a]:
        line_style['dashes'] = [3, 3]
    ax.plot(shares, utilities, **line_style)

    # Find the targeting share (and transfer size) with the highest utility
    results = pd.DataFrame([shares, transfersizes, utilities]).T
    results.columns = ['percent_targeted', 'transfer_size', 'utility']
    results = results.sort_values('utility', ascending=False)
    best_transfer_size = results['transfer_size'].iloc[0]
    best_percent_targeted = results['percent_targeted'].iloc[0]
    ax.axvline(best_percent_targeted, dashes=[1, 1], color=colors[a])

# NOTE(review): `results` here is the table of the LAST curve only, so the
# y-limits and the UBI reference line are taken from that curve alone; curves
# outside that range are clipped. Confirm this is intended.
ax.set_ylim(results['utility'].min(), results['utility'].max())
# UBI reference: utility at the largest targeted share of the last curve
ubi_utility = results.sort_values('percent_targeted')['utility'].iloc[-1]
ax.axhline(ubi_utility, color='grey', dashes=[1, 2], label='UBI')
ax.set_xlabel('Fraction of Population Targeted')
ax.set_ylabel('Utility')
ax.legend(loc='best')
simpleaxis(ax)
plt.suptitle('Utility Curves', fontsize='large')
plt.tight_layout(rect=[0, 0, 1, .95])
plt.show()

### Figure S13

In [None]:
# Plot configuration for Figure S13. The lists are parallel: entry i of each
# list describes the same proxy (column, panel title, colour).
outcome = 'consumption'
proxies = ['random', 'prefecture_poverty', 'canton_poverty', 'phone_poverty', 'pmt']
proxynames = ['Random', 'Prefecture', 'Canton', 'CDR', 'PMT']
colors = ['indianred', 'darkorange', 'mediumseagreen', 'dodgerblue', 'slateblue']

# Keep only households observed for every proxy and the outcome
complete = survey.dropna(subset=proxies + [outcome]).copy()

# Account for survey weights by repeating each row according to its weight
# (weights are first rescaled so the smallest equals 1)
complete['weight'] = complete['weight'] / complete['weight'].min()
df = pd.DataFrame(np.repeat(complete.values, complete['weight'], axis=0), columns=complete.columns)

In [None]:
# Figure S13: for each proxy, target the 29% of (weight-expanded) rows with the
# lowest proxy value and map the share of each canton's rows that is targeted.
percent_targeted = 29
num_targeted = int((percent_targeted/100)*len(df))

# NOTE(review): absolute, machine-specific path; the rest of the notebook reads
# 'data/shapefiles/cantons.geojson'. Confirm whether that file can be used here
# (this one is expected to carry the canton id in an 'Id' column).
canton_shapefile_path = '/home/em/covid/spatial/cantons.geojson'
# Read the shapefile once; it was previously re-read twice per proxy.
canton_outlines = gpd.read_file(canton_shapefile_path)
canton_shapes = canton_outlines.rename({'Id':'canton'}, axis=1)

# Rows per canton. This does not depend on the proxy, so compute it once.
total = df.groupby(['canton', 'prefecture'], as_index=False).agg('count')\
    [['canton', 'prefecture', 'uid']]\
    .rename({'uid':'total_count'}, axis=1)


def _percent_bucket(share):
    """Map a targeted share in [0, 1] to its 20-point-wide legend bucket."""
    if share < .2:
        return '0-20%'
    if share < .4:
        return '20-40%'
    if share < .6:
        return '40-60%'
    if share < .8:
        return '60-80%'
    return '80-100%'


sns.reset_orig()
# One panel per proxy; squeeze=False keeps `ax` indexable even for a single proxy
fig, axes = plt.subplots(1, len(proxies), figsize=(15, 8), squeeze=False)
ax = axes[0]

for p, proxy in enumerate(proxies):

    # Flag the num_targeted rows with the lowest proxy value as beneficiaries
    df = df.sort_values(proxy, ascending=True)
    df['targeted'] = np.concatenate([np.ones(num_targeted), np.zeros(len(df) - num_targeted)])
    beneficiaries = df[df['targeted'] == 1]

    # Beneficiaries per canton; the right-merge keeps cantons with none (count 0)
    beneficiaries = beneficiaries.groupby(['canton', 'prefecture'], as_index=False).agg('count')\
        [['canton', 'prefecture', 'uid']]\
        .rename({'uid':'count'}, axis=1)
    beneficiaries = beneficiaries.merge(total, on=['canton', 'prefecture'], how='right')
    beneficiaries['count'] = beneficiaries['count'].fillna(0)
    beneficiaries['percent_targeted'] = beneficiaries['count']/beneficiaries['total_count']

    shapefile = canton_shapes.merge(beneficiaries, on='canton', how='inner')
    shapefile['percent_bucket'] = shapefile['percent_targeted'].apply(_percent_bucket)

    # Grey base layer of all cantons, then the surveyed cantons coloured by bucket;
    # the legend is drawn only on the PMT panel.
    legend = proxy == 'pmt'
    canton_outlines.plot(color='lightgrey', ax=ax[p])
    shapefile.plot(ax=ax[p], column='percent_bucket', linewidth=1, edgecolor='white', cmap='inferno',
                   legend=legend, legend_kwds={'title':'Percent Targeted', 'fontsize':'x-large'})

    ax[p].axis('off')
    ax[p].set_title(proxynames[p], fontsize='x-large')

plt.suptitle('Location of Beneficiaries Under Counterfactual Targeting Approaches', fontsize='xx-large')
plt.tight_layout(rect=[0, 0, 1, .91])
plt.show()