# Notebook 5: Fairness checks
Replication code for:
- Figure 3
- Figure S4
- Figure S5

In [None]:
import numpy as np
import sys
import os
import time
import json
import shutil
import random
from joblib import dump, load
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from scipy.stats import percentileofscore
import matplotlib.ticker as mtick
from matplotlib.collections import PatchCollection

from helpers import *

import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation warnings raised by pandas / seaborn in the cells below).
warnings.filterwarnings('ignore')

# NOTE(review): presumably local currency units per US dollar -- confirm.
# Not referenced by any of the cells visible in this notebook section.
EXCHANGE_RATE = 572.269

In [None]:
# Read the survey data
survey = pd.read_csv('data/survey.csv')

# Attach the prefecture-level poverty-map value to every survey row
prefectures = gpd.read_file('data/shapefiles/prefectures.geojson')
prefectures = prefectures.rename(columns={'poverty': 'prefecture_poverty'})
prefectures = prefectures[['prefecture', 'prefecture_poverty']]
survey = pd.merge(survey, prefectures, on='prefecture', how='left')

# Attach the canton-level poverty-map value to every survey row
cantons = gpd.read_file('data/shapefiles/cantons.geojson')
cantons = cantons.rename(columns={'poverty': 'canton_poverty'})
cantons = cantons[['canton', 'canton_poverty']]
survey = pd.merge(survey, cantons, on='canton', how='left')

# Attach the out-of-sample phone-based predictions, keyed by phone number
cdr = pd.read_csv('outputs/ml/consumption/LGBM/oos_predictions.csv')
cdr = cdr[['phone_number', 'predicted']]
cdr = cdr.rename(columns={'predicted': 'phone_poverty'})
survey = pd.merge(survey, cdr, on='phone_number', how='left')

# Attach the phone-based single feature, keyed by phone number
single_feature = pd.read_csv('data/single_feature.csv')
survey = pd.merge(survey, single_feature, on='phone_number', how='left')

# Reproducible uniform random column; the cells below use it as the
# secondary sort key so that ties in a ranking are broken at random.
np.random.seed(0)
survey['random'] = np.random.rand(len(survey))

### Figure S4, Figure 3 Panels a and b

In [None]:
# Targeting scores compared in Figure S4 / Figure 3a-b, the ground-truth
# outcome they are ranked against, and the demographic columns used to
# split the ranking errors.
targeting_methods = ['canton_poverty', 'phone_poverty', 'assetindex', 'pmt']
outcome = 'consumption'
sensitive_vars = ['gender', 'ethnicity', 'religion', 'age_group', 'disability', 'children', 'marital_status', 
                  'any_vulnerability']

# Drop observations missing a value for any targeting method, any sensitive
# variable, or the outcome
df = survey.dropna(subset=targeting_methods + sensitive_vars + [outcome]).copy()

# Generate repeated dataframe to account for weighting: normalise the weights
# so the smallest is 1, then repeat each row that many times.
df['weight'] = df['weight']/df['weight'].min()
# np.repeat needs integer counts, but the normalised weights are floats, so
# cast explicitly (truncating toward zero, i.e. what a plain float->int cast
# does). NOTE(review): switch to rounding if nearest-integer replication is
# the intended behaviour.
repeat_counts = df['weight'].to_numpy(dtype=float).astype(int)
# Repeat row positions rather than df.values so every column keeps its dtype
# (going through .values turns all columns into `object`).
row_positions = np.repeat(np.arange(len(df)), repeat_counts)
df = df.iloc[row_positions].reset_index(drop=True)

In [None]:
# Share of the (weight-expanded) population that is targeted, and the 0/1
# vector that flags the first `num_targeted` rows of any sorted ordering.
percent_targeted = 29
num_targeted = int(len(df)*(percent_targeted/100))
targeting_vector = np.concatenate([np.ones(num_targeted), np.zeros(len(df) - num_targeted)])

sns.set(font_scale=2, style='white')

# One figure per sensitive variable, one panel per targeting method.
for demo in sensitive_vars:

    # Panel count follows targeting_methods instead of being fixed at 4
    fig, ax = plt.subplots(1, len(targeting_methods), figsize=(15, 6), sharey=True)
    ax = np.atleast_1d(ax)  # keep ax[a] valid even with a single method
    
    for a, proxy in enumerate(targeting_methods):
        # Rank by the true outcome (ascending; 'random' breaks ties)
        df = df.sort_values([outcome, 'random'], ascending=True)
        df['true_rank'] = range(len(df))
        df['targeted_true'] = targeting_vector
        # Rank by the targeting method's score, same convention
        df = df.sort_values([proxy, 'random'], ascending=True)
        df['proxy_rank'] = range(len(df))
        df['targeted_proxy'] = targeting_vector
        # Signed rank error scaled by the largest true rank, so it lies in [-1, 1]
        df['resid'] = (df['proxy_rank'] - df['true_rank'])/(df['true_rank'].max())
        
        # Sort by the demographic column so groups appear in a stable order
        sns.boxplot(data=df.sort_values(demo, ascending=True), x='resid', y=demo, orient='h', ax=ax[a], 
                    showfliers=False)
        ax[a].set_ylabel('')
        ax[a].set_xlabel('Error')
        ax[a].set_xlim(-1, 1)
        ax[a].set_title(proxy, fontsize='large')
        ax[a].axvline(0, color='grey', dashes=[3, 1])  # zero-error reference line
        simpleaxis(ax[a])
    
    plt.suptitle('Targeting errors, by ' + demo, fontsize='x-large')
    # Leave the top 7% of the figure free for the suptitle
    plt.tight_layout(rect=[0, 0, 1, .93])
    plt.savefig('/data/togo_anon/paper/figures/fairness_boxplots/' + demo + '_cdr.png', dpi=300)
    plt.show()
    

### Figure S5, Figure 3 Panels c and d

In [None]:
# Targeting scores compared in Figure S5 / Figure 3c-d, the ground-truth
# outcome they are ranked against, and the demographic columns used to
# split the population.
targeting_methods = ['prefecture_poverty', 'canton_poverty', 'phone_poverty', 'assetindex', 'pmt']
outcome = 'consumption'
sensitive_vars = ['gender', 'ethnicity', 'religion', 'age_group', 'disability', 'children', 'marital_status', 
                  'any_vulnerability']

# Drop observations missing a value for any targeting method, any sensitive
# variable, or the outcome
df = survey.dropna(subset=targeting_methods + sensitive_vars + [outcome]).copy()

# Generate repeated dataframe to account for weighting: normalise the weights
# so the smallest is 1, then repeat each row that many times.
df['weight'] = df['weight']/df['weight'].min()
# np.repeat needs integer counts, but the normalised weights are floats, so
# cast explicitly (truncating toward zero, i.e. what a plain float->int cast
# does). NOTE(review): switch to rounding if nearest-integer replication is
# the intended behaviour.
repeat_counts = df['weight'].to_numpy(dtype=float).astype(int)
# Repeat row positions rather than df.values so every column keeps its dtype
# (going through .values turns all columns into `object`).
row_positions = np.repeat(np.arange(len(df)), repeat_counts)
df = df.iloc[row_positions].reset_index(drop=True)

In [None]:
# Share of the (weight-expanded) population that is targeted, and the 0/1
# vector that flags the first `num_targeted` rows of any sorted ordering.
percent_targeted = 29
num_targeted = int(len(df)*(percent_targeted/100))
targeting_vector = np.concatenate([np.ones(num_targeted), np.zeros(len(df) - num_targeted)])

# Figure height (inches) for each entry of sensitive_vars, in order
figsizes = [4, 10, 8, 10, 4, 8, 10, 4]
sns.set(font_scale=2, style='white')

# Flag the rows with the lowest true outcome. This ordering does not depend
# on the targeting method or the demographic split, so it is done once; the
# flag stays attached to its row through the later re-sorts.
df = df.sort_values([outcome, 'random'], ascending=True)
df['targeted_true'] = targeting_vector

# One bubble chart per sensitive variable: rows are demographic groups,
# columns are targeting methods, and each bubble is the percentage-point gap
# between the share of the group targeted by the method and the share of the
# group targeted under the true outcome.
for d, demo in enumerate(sensitive_vars):

    # Group-level statistics that are the same for every targeting method.
    # Columns are selected before aggregating so unrelated non-numeric
    # columns never reach mean()/count().
    population_share = df.groupby(demo)['uid'].count()/len(df)
    deserving_share = df.groupby(demo)['targeted_true'].mean()
    table = pd.DataFrame({
        'percent_population': (100*population_share).astype('int'),
        'percent_poor': (100*deserving_share).astype('int'),
    })

    # One column per targeting method, aligned on the group label, so any
    # number of methods is supported.
    for proxy in targeting_methods:
        df = df.sort_values([proxy, 'random'], ascending=True)
        df['targeted_proxy'] = targeting_vector
        rates = df.groupby(demo)[['targeted_true', 'targeted_proxy']].mean()
        table[proxy] = 100*(rates['targeted_proxy'] - rates['targeted_true'])

    table = table.rename_axis('group').reset_index()

    # Rows are reversed so the first group is drawn at the top of the chart
    # (y = 0 is the bottom row).
    gaps = table[targeting_methods].to_numpy(dtype=float)[::-1]
    N, M = gaps.shape

    ylabels = ['{}\n{}% of Population\n{}% Among Poorest'.format(group, share, poor)
               for group, share, poor
               in zip(table['group'], table['percent_population'], table['percent_poor'])][::-1]

    x, y = np.meshgrid(np.arange(M), np.arange(N))

    # Bubble radius grows with log(|gap|). log is -inf at a gap of exactly 0
    # and negative for gaps under one percentage point (a negative radius is
    # drawn as if it were positive), so clamp at 0: such gaps get no bubble
    # and gaps of 1pp or more keep the radius log(|gap|)/10.
    with np.errstate(divide='ignore'):
        radii = np.log(np.abs(gaps))/10
    radii = np.clip(radii, 0, None)

    fig = plt.figure(figsize=(10.8,figsizes[d])) 
    ax = fig.add_axes([0, 0, 1, 1])

    ax.set_title('Demographic Parity: ' + demo,pad=85, fontsize='large')

    circles = [plt.Circle((j,i), radius=r) for r, j, i in zip(radii.flatten(), x.flatten(), y.flatten())]
    # Colour encodes the signed gap on a fixed -20..20 scale
    col = PatchCollection(circles, array=gaps.flatten(), cmap="RdBu_r", edgecolor='grey', linewidth=2)
    col.set_clim(vmin=-20, vmax=20)

    ax.add_collection(col)
    ax.set(xticks=np.arange(M), yticks=np.arange(N),
           xticklabels=targeting_methods, yticklabels=ylabels)
    ax.set_xticks(np.arange(M+1)-0.5, minor=True)
    ax.set_yticks(np.arange(N+1)-0.5, minor=True)
    ax.xaxis.tick_top()

    cbar = fig.colorbar(col, fraction=0.03, pad=0.05,)
    cbar.outline.set_edgecolor('white')
    cbar.ax.set_ylabel('Percentage Point Difference', labelpad=20)

    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    ax.tick_params(axis='both', which='both', length=0)
    
    plt.tight_layout()
    plt.show()
    