In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Make sure the output folder for the extracted splits exists. exist_ok=True
# already turns this into a no-op when the folder is present, so a separate
# os.path.exists() check is unnecessary (and would silently skip the case where
# 'extracted' exists but is a regular file).
os.makedirs('extracted', exist_ok=True)

# NOTE: despite the *_DIR suffix, these constants hold CSV file paths.
RAW_DHS_DATA_DIR = 'raw/dhs_wealth_index.csv'
TRAIN_DHS_DATA_DIR = 'extracted/dhs_wealth_index_train.csv'
VAL_DHS_DATA_DIR = 'extracted/dhs_wealth_index_val.csv'
TEST_DHS_DATA_DIR = 'extracted/dhs_wealth_index_test.csv'

# Load the raw DHS table and report its size and available columns.
df_dhs = pd.read_csv(RAW_DHS_DATA_DIR)
print('total data:',len(df_dhs))
print(df_dhs.columns)

total data: 35235
Index(['cluster', 'svyid', 'wealthpooled', 'wealthpooled5country', 'wealth',
       'iso3', 'hv000', 'year', 'cname', 'country', 'region', 'iso3n',
       'households', 'LATNUM', 'LONGNUM', 'URBAN_RURA'],
      dtype='object')


In [3]:
"""
"wealthpooled": the "asset wealth index" (AWI) of each household, standardized across all surveys, 
  then averaged to the cluster level.
"wealth": AWI standardized within each country at the household level, aggregated to the cluster level.
"""
# Columns carried forward from the raw DHS table.
COLUMN_OF_INTEREST = [
    'country', 'year', 'wealth', 'wealthpooled', 'households', 'LATNUM', 'LONGNUM',
]

# Restrict to those columns, then discard every row with a missing value in any of them.
df_dhs_relevant = df_dhs[COLUMN_OF_INTEREST].dropna()

print(df_dhs_relevant.head())
print('available data:', df_dhs_relevant.shape[0])

  country  year    wealth  wealthpooled  households     LATNUM    LONGNUM
0  Angola  2011  1.713497      2.595618          36 -12.350257  13.534922
1  Angola  2011  1.545335      2.209620          32 -12.360865  13.551494
2  Angola  2011  0.631730      0.906469          36 -12.613421  13.413085
3  Angola  2011  0.826273      1.105359          35 -12.581454  13.397711
4  Angola  2011  1.293282      1.879344          37 -12.578135  13.418748
available data: 27077


In [4]:
def insert_id_column(df):
    """Return a copy of ``df`` with an ``'id'`` column prepended.

    Each id has the form ``'<index label>_<country>_<year>'``, built from the
    row's index label and its ``'country'`` and ``'year'`` values.

    The input frame is not modified; callers must use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'country'`` and ``'year'`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose first column is ``'id'``.

    Raises
    ------
    KeyError
        If ``'country'`` or ``'year'`` is missing.
    ValueError
        If ``df`` already has an ``'id'`` column.
    """
    # Build all ids in one pass over the columns instead of a per-row
    # df.loc[i] lookup, which is slow and returns a DataFrame (not a row)
    # when index labels are duplicated.
    idcol = [
        '%s_%s_%s' % (label, country, year)
        for label, country, year in zip(df.index, df['country'], df['year'])
    ]
    # Work on a copy so the caller's frame is left untouched.
    df_with_id = df.copy()
    df_with_id.insert(0, 'id', idcol)
    return df_with_id
# Prepend the '<index>_<country>_<year>' identifier column and preview the first rows.
df_dhs_relevant = insert_id_column(df=df_dhs_relevant)
print(df_dhs_relevant.head(n=5))

              id country  year    wealth  wealthpooled  households     LATNUM  \
0  0_Angola_2011  Angola  2011  1.713497      2.595618          36 -12.350257   
1  1_Angola_2011  Angola  2011  1.545335      2.209620          32 -12.360865   
2  2_Angola_2011  Angola  2011  0.631730      0.906469          36 -12.613421   
3  3_Angola_2011  Angola  2011  0.826273      1.105359          35 -12.581454   
4  4_Angola_2011  Angola  2011  1.293282      1.879344          37 -12.578135   

     LONGNUM  
0  13.534922  
1  13.551494  
2  13.413085  
3  13.397711  
4  13.418748  


In [5]:
# Partition by survey year (bounds inclusive): 2013-2014 rows go to the
# train/validation pool, 2015-2016 rows form the test set.
df_dhs_train_val = df_dhs_relevant[df_dhs_relevant['year'].between(2013, 2014)]
df_dhs_test = df_dhs_relevant[df_dhs_relevant['year'].between(2015, 2016)]

print('leng df_dhs_train_val:', df_dhs_train_val.shape[0])
print('leng df_dhs_test:', df_dhs_test.shape[0])

leng df_dhs_train_val: 6252
leng df_dhs_test: 4424


In [6]:
def split_train_val(df, train_probability=0.7, seed=None, columns=None):
    """Randomly partition the rows of ``df`` into a training and a validation frame.

    Every row is independently labelled 'train' with probability
    ``train_probability`` and 'val' otherwise, so the realised split sizes
    fluctuate around the requested fraction. ``df`` itself is not modified,
    which keeps the calling cell safe to re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split; must contain every column named in ``columns``.
    train_probability : float, default 0.7
        Probability that a row is assigned to the training frame.
    seed : int or None, default None
        When given, labels are drawn from a private
        ``np.random.RandomState(seed)`` so the split is repeatable. When None,
        the global NumPy random state is used.
    columns : sequence of str or None, default None
        Columns kept in both outputs. Defaults to ``['id'] + COLUMN_OF_INTEREST``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val)``.
    """
    if columns is None:
        columns = ['id'] + COLUMN_OF_INTEREST
    columns = list(columns)

    rng = np.random if seed is None else np.random.RandomState(seed)
    assignment = rng.choice(
        ['train', 'val'], len(df), p=[train_probability, 1 - train_probability]
    )

    # Select with a boolean mask rather than inserting a temporary
    # 'assignment' column into the caller's frame.
    is_train = assignment == 'train'
    df_train = df.loc[is_train, columns]
    df_val = df.loc[~is_train, columns]

    return df_train, df_val
# Randomly route the 2013-2014 rows to train / validation with a 0.7 training probability.
df_dhs_train, df_dhs_val = split_train_val(df=df_dhs_train_val, train_probability=0.7)

# Preview each partition together with its row count.
print('train:\n', df_dhs_train.head(), '\n leng:', df_dhs_train.shape[0])
print('val:\n', df_dhs_val.head(), '\n leng:', df_dhs_val.shape[0])

# Realised share of rows that ended up in the training partition.
print('train fraction:', df_dhs_train.shape[0] / (df_dhs_train.shape[0] + df_dhs_val.shape[0]))

train:
                           id       country  year    wealth  wealthpooled  \
1646  1646_Burkina Faso_2014  Burkina Faso  2014 -0.639860     -0.589851   
1647  1647_Burkina Faso_2014  Burkina Faso  2014 -0.650348     -0.636919   
1649  1649_Burkina Faso_2014  Burkina Faso  2014 -0.400257     -0.332651   
1650  1650_Burkina Faso_2014  Burkina Faso  2014  1.005956      0.874612   
1651  1651_Burkina Faso_2014  Burkina Faso  2014 -0.328516     -0.324034   

      households     LATNUM   LONGNUM  
1646          25  13.068499 -3.655191  
1647          25  13.239115 -4.100196  
1649          26  12.608467 -3.921584  
1650          25  12.725513 -3.872126  
1651          23  12.494392 -3.437510   
 leng: 4396
val:
                           id       country  year    wealth  wealthpooled  \
1648  1648_Burkina Faso_2014  Burkina Faso  2014 -0.441337     -0.515898   
1652  1652_Burkina Faso_2014  Burkina Faso  2014 -0.490227     -0.419723   
1653  1653_Burkina Faso_2014  Burkina Faso  2014

In [7]:
# Write each DHS partition to its CSV destination.
for frame, destination in (
    (df_dhs_train, TRAIN_DHS_DATA_DIR),
    (df_dhs_val, VAL_DHS_DATA_DIR),
    (df_dhs_test, TEST_DHS_DATA_DIR),
):
    frame.to_csv(destination)

## Part 2. LSMS Wealth Index

In [8]:
# Input and output locations for the LSMS wealth index.
# The *_DIR constants hold CSV file paths.
RAW_LSMS_DATA_DIR = 'raw/lsms_wealth_index.csv'
TRAIN_LSMS_DATA_DIR = 'extracted/lsms_wealth_index_train.csv'
VAL_LSMS_DATA_DIR = 'extracted/lsms_wealth_index_val.csv'
TEST_LSMS_DATA_DIR = 'extracted/lsms_wealth_index_test.csv'

"""
index_all comes from lsms_labels_agg.csv
  "index" column computed as PCA over all LSMS data over the 5 countries (Ethiopia, Malawi, Nigeria, Tanzania, Uganda);
  index computed over households in all 3020 LSMS villages
index_rep comes from lsms_labels_index_agg_geolocated.csv
  updated index, with more migrant households removed
"""

# Read the raw LSMS table, then report its row count and column names.
df_lsms = pd.read_csv(RAW_LSMS_DATA_DIR)
print('total data:', df_lsms.shape[0])
print(df_lsms.columns)

total data: 3020
Index(['lat', 'lon', 'year', 'country', 'index_all', 'ea_id', 'index_rep',
       'et_index', 'n', 'geolev1', 'geolev2', 'rooms', 'electric', 'phone',
       'radio', 'tv', 'auto', 'floor_qual', 'toilet_qual', 'watsup_qual'],
      dtype='object')


In [9]:
# Columns carried forward from the raw LSMS table.
LSMS_COLUMNS = [
    'lat', 'lon', 'year', 'country', 'index_all', 'index_rep',
]

# Restrict to those columns, then discard every row with a missing value in any of them.
df_lsms_rel = df_lsms[LSMS_COLUMNS].dropna()

print(df_lsms_rel.head())
print('available data:', df_lsms_rel.shape[0])

        lat        lon  year country  index_all  index_rep
0 -0.292248  31.478722  2005  uganda   0.045405   0.042136
1 -0.292248  31.478722  2009  uganda  -0.333650  -0.326794
2 -0.292248  31.478722  2013  uganda  -0.386531  -0.373920
3 -0.439120  31.711950  2005  uganda   0.310049   0.301829
4 -0.439120  31.711950  2009  uganda   0.085589   0.090705
available data: 2915


In [10]:
# Distinct countries and survey years present after dropping incomplete rows.
print(sorted(set(df_lsms_rel['country'])))
print('list of year:', sorted(set(df_lsms_rel['year'].to_numpy())))
print()

# Row counts for the two year windows: 2013-2015 (train/val pool) and 2016 onwards (test).
print('n data train and val:', df_lsms_rel['year'].between(2013, 2015).sum())
print('n data test:', (df_lsms_rel['year'] >= 2016).sum())

['ethiopia', 'malawi', 'nigeria', 'tanzania', 'uganda']
list of year: [2005, 2008, 2009, 2010, 2011, 2012, 2013, 2015, 2016]

n data train and val: 973
n data test: 102


In [11]:
# Prepend the '<index>_<country>_<year>' identifier column and preview the first rows.
df_lsms_rel = insert_id_column(df=df_lsms_rel)
print(df_lsms_rel.head(n=5))

              id       lat        lon  year country  index_all  index_rep
0  0_uganda_2005 -0.292248  31.478722  2005  uganda   0.045405   0.042136
1  1_uganda_2009 -0.292248  31.478722  2009  uganda  -0.333650  -0.326794
2  2_uganda_2013 -0.292248  31.478722  2013  uganda  -0.386531  -0.373920
3  3_uganda_2005 -0.439120  31.711950  2005  uganda   0.310049   0.301829
4  4_uganda_2009 -0.439120  31.711950  2009  uganda   0.085589   0.090705


In [12]:
def split_train_val_test(df, train_probability=0.7, seed=None,
                         train_val_years=(2013, 2015), test_from_year=2016,
                         columns=None):
    """Split ``df`` by survey year into train, validation and test frames.

    Rows whose ``'year'`` lies inside ``train_val_years`` (both bounds
    inclusive) form the train/validation pool. Each pooled row is
    independently labelled 'train' with probability ``train_probability`` and
    'val' otherwise. Rows with ``year >= test_from_year`` form the test frame.
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'year'`` column and every column named in ``columns``.
    train_probability : float, default 0.7
        Probability that a pooled row is assigned to the training frame.
    seed : int or None, default None
        When given, labels are drawn from a private
        ``np.random.RandomState(seed)`` so the split is repeatable. When None,
        the global NumPy random state is used.
    train_val_years : tuple of (int, int), default (2013, 2015)
        Inclusive first and last year of the train/validation pool.
    test_from_year : int, default 2016
        First year (inclusive) of the test frame.
    columns : sequence of str or None, default None
        Columns kept in the train and validation frames. Defaults to
        ``['id'] + LSMS_COLUMNS``. The test frame keeps all columns of ``df``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``.
    """
    if columns is None:
        columns = ['id'] + LSMS_COLUMNS
    columns = list(columns)

    first_year, last_year = train_val_years
    year = df['year']
    df_train_val = df[(year >= first_year) & (year <= last_year)]

    rng = np.random if seed is None else np.random.RandomState(seed)
    assignment = rng.choice(
        ['train', 'val'], len(df_train_val),
        p=[train_probability, 1 - train_probability],
    )

    # Select with a boolean mask rather than inserting a helper column into
    # the filtered slice.
    is_train = assignment == 'train'
    df_train = df_train_val.loc[is_train, columns]
    df_val = df_train_val.loc[~is_train, columns]

    df_test = df[year >= test_from_year]
    return df_train, df_val, df_test

# Split the LSMS rows into train / validation / test with the function's default settings.
df_train, df_val, df_test = split_train_val_test(df=df_lsms_rel)

# Partition sizes and the realised training share of the train+val pool.
print('n train/val/test', df_train.shape[0], df_val.shape[0], df_test.shape[0])
print('train fraction:', df_train.shape[0] / (df_train.shape[0] + df_val.shape[0]))

# Preview the first rows of each partition.
print(df_train.head())
print(df_val.head())
print(df_test.head())

n train/val/test 670 303 102
train fraction: 0.6885919835560124
                id       lat        lon  year country  index_all  index_rep
2    2_uganda_2013 -0.292248  31.478722  2013  uganda  -0.386531  -0.373920
5    5_uganda_2013 -0.439120  31.711950  2013  uganda   0.317944   0.340910
11  11_uganda_2013 -0.508831  31.333286  2013  uganda  -0.447883  -0.437503
14  14_uganda_2013 -0.518612  31.194437  2013  uganda  -0.078402  -0.065410
17  17_uganda_2013 -0.530628  32.327492  2013  uganda  -0.253745  -0.258809
                id       lat        lon  year country  index_all  index_rep
8    8_uganda_2013 -0.503361  31.616070  2013  uganda   0.291547   0.305004
20  20_uganda_2013 -0.549797  30.067520  2013  uganda   0.055590   0.065336
23  23_uganda_2013 -0.553773  30.303692  2013  uganda   0.637925   0.634928
26  26_uganda_2013 -0.560572  30.678801  2013  uganda   0.838874   0.840463
35  35_uganda_2013 -0.639231  31.529987  2013  uganda   0.645061   0.651606
                  id    

In [13]:
# Write each LSMS partition to its CSV destination.
for frame, destination in (
    (df_train, TRAIN_LSMS_DATA_DIR),
    (df_val, VAL_LSMS_DATA_DIR),
    (df_test, TEST_LSMS_DATA_DIR),
):
    frame.to_csv(destination)