# 1c. Cleaning DHS surveys
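DHS survey data is obtained from https://dhsprogram.com/data/dataset_admin/

For each country, this notebook extracts the household-level `wealth` and `weight` columns and the cluster coordinates, aggregates wealth to the cluster level, and writes one buffered polygon per cluster.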

In [4]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

In [11]:
# Input files per country, keyed by the country's directory name:
#   [0] survey file (Stata .DTA), read from <country>/raw/survey/
#   [1] cluster shapefile (.shp), read from <country>/raw/shapefile/
surveys = {
    'colombia':    ['COHR61FL.DTA', 'COGE61FL.shp'],
    'honduras':    ['HNHR62FL.DTA', 'HNGE61FL.shp'],
    'indonesia':   ['IDHR42FL.DTA', 'IDGE43FL.shp'],
    'kenya':       ['KEHR72FL.DTA', 'KEGE71FL.shp'],
    'nigeria':     ['NGHR7BFL.DTA', 'NGGE7BFL.shp'],
    'peru':        ['PEHR5IFL.DTA', 'PEGE5IFL.shp'],
    'philippines': ['PHHR71FL.DTA', 'PHGE71FL.shp'],
}

In [8]:
# For every country: reduce the raw survey and shapefile to the few columns
# used downstream and write them out as CSVs.
for country, fnames in surveys.items():

    survey_fname, shapefile_fname = fnames[0], fnames[1]
    print(country)
    print('--------')

    # Clean survey data: keep four columns and give them readable names
    survey_path = '/data/mosaiks/dhs/' + country + '/raw/survey/' + survey_fname
    df = pd.read_stata(survey_path, convert_categoricals=False)
    df = df[['hhid', 'hv001', 'hv005', 'hv271']].rename(
        columns={'hv001': 'cluster', 'hv005': 'weight', 'hv271': 'wealth'})
    print('Number of observations: %i' % len(df))
    print('Number of clusters: %i' % len(df['cluster'].unique()))
    # NOTE: no separator between country and suffix, e.g. '.../dhs/kenyasurvey.csv'
    df.to_csv('/data/mosaiks/replication/dhs/' + country + 'survey.csv', index=False)

    # Clean spatial data: one row per cluster with its urban/rural flag and coordinates
    shapefile_path = '/data/mosaiks/dhs/' + country + '/raw/shapefile/' + shapefile_fname
    spatial = gpd.read_file(shapefile_path)
    original_length = len(spatial)
    print('Number of clusters in shapefile: %i' % original_length)
    spatial = spatial[['DHSCLUST', 'URBAN_RURA', 'LATNUM', 'LONGNUM']].rename(
        columns={'DHSCLUST': 'cluster', 'URBAN_RURA': 'urban', 'LATNUM': 'lat', 'LONGNUM': 'lon'})
    # Rows at exactly (0, 0) are treated as having no location and are dropped
    has_location = (spatial['lat'] != 0) | (spatial['lon'] != 0)
    spatial = spatial[has_location]
    end_length = len(spatial)
    print('Number of clusters without spatial information: %i' % (original_length - end_length))
    spatial.to_csv('/data/mosaiks/replication/dhs/' + country + 'clusters.csv', index=False)
    

kenya
--------
Number of observations: 36430
Number of clusters: 1594
Number of clusters in shapefile: 1594
Number of clusters without spatial information: 9
nigeria
--------
Number of observations: 40427
Number of clusters: 1389
Number of clusters in shapefile: 1389
Number of clusters without spatial information: 7
peru
--------
Number of observations: 26834
Number of clusters: 1132
Number of clusters in shapefile: 1132
Number of clusters without spatial information: 1
philippines
--------
Number of observations: 27496
Number of clusters: 1249
Number of clusters in shapefile: 1250
Number of clusters without spatial information: 36


In [9]:
def grouped_weighted_mean(df, agg_cols, feature_cols, weight_col):
    """Weighted mean of `feature_cols` within each group of `agg_cols`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; not modified.
    agg_cols : list of str
        Columns to group by.
    feature_cols : list of str
        Columns to average.
    weight_col : str
        Column holding the row weights.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns `agg_cols + feature_cols + [weight_col]`.
        Each feature column holds sum(feature * weight) / sum(weight) for the
        group; `weight_col` holds the group's summed weight.

    Notes
    -----
    Only the key, feature and weight columns are carried into the groupby, so
    unrelated columns in `df` (e.g. string identifiers) are never summed.
    A missing feature value is skipped in the numerator by pandas' `sum`, but
    its weight still counts in the denominator.
    """
    keep_cols = agg_cols + feature_cols + [weight_col]
    # Subset first: summing every column of `df` is wasted work and can fail
    # on non-numeric columns.
    data = df[keep_cols].copy()
    for feature_col in feature_cols:
        data[feature_col] = data[feature_col] * data[weight_col]
    data = data.groupby(agg_cols, as_index=False).sum()
    for feature_col in feature_cols:
        data[feature_col] = data[feature_col] / data[weight_col]
    return data[keep_cols]

In [15]:
# For every country: aggregate wealth to the cluster level, attach the rural
# flag to both the cluster and household tables, and write one buffered
# polygon per cluster.
for country in surveys:

    # Cluster level: weighted mean of wealth per cluster.
    # Input file names have no separator between country and suffix
    # (e.g. 'kenyasurvey.csv'); the paths below must match the files on disk.
    survey = pd.read_csv('/data/mosaiks/replication/dhs/' + country + 'survey.csv')
    grouped = grouped_weighted_mean(survey, ['cluster'], ['wealth'], 'weight')
    gps = pd.read_csv('/data/mosaiks/replication/dhs/' + country + 'clusters.csv')\
        .rename({'urban':'rural'}, axis=1)
    # Recode the flag to 0/1: 1 where the raw value is 'R', 0 otherwise
    gps['rural'] = (gps['rural'] == 'R').astype('int')
    # Inner join: clusters missing from either table are dropped
    grouped = grouped.merge(gps, on='cluster', how='inner')
    grouped[['cluster', 'wealth', 'weight', 'rural']]\
        .to_csv('/data/mosaiks/replication/surveys/dhs/' + country + '_grouped.csv', index=False)
    # Share of clusters flagged rural
    print(country, round(grouped['rural'].mean(), 2))

    # Merge cluster-level info to survey
    survey = survey.merge(gps[['cluster', 'rural']], on='cluster', how='inner')
    survey.to_csv('/data/mosaiks/replication/surveys/dhs/' + country + '_hh.csv', index=False)

    # Get geometries.
    # Cluster coordinates are WGS 84 lon/lat, i.e. EPSG:4326. The previous
    # 'epsg:4236' was a digit transposition naming a different datum
    # (Hu Tzu Shan 1950), which mislabelled the points and the output file.
    geo = gpd.GeoDataFrame(gps, geometry=gpd.points_from_xy(gps['lon'], gps['lat']), crs='EPSG:4326')
    # Project to a metre-based CRS so the buffer radius is expressed in metres.
    # NOTE(review): EPSG:32662 (Plate Carree) stretches east-west distances away
    # from the equator, so buffers are only approximately circular there -
    # confirm this is acceptable for the study countries.
    geo = geo.to_crs('EPSG:32662')
    # 5 km radius for rural clusters, 2 km for urban ones, buffered in a single
    # vectorised call instead of buffering every point twice.
    buffer_radius_m = geo['rural'].map({1: 5000, 0: 2000})
    geo['geometry'] = geo.geometry.buffer(buffer_radius_m)
    geo = geo.to_crs('EPSG:4326')
    geo = geo[['cluster', 'geometry']]
    geo.to_file('/data/mosaiks/replication/surveys/dhs/' + country + '_polygons.geojson', driver='GeoJSON')

colombia 0.3
honduras 0.56
indonesia 0.58
kenya 0.61
nigeria 0.59
peru 0.39
philippines 0.64
