# 1c. Cleaning DHS surveys
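DHS survey data is obtained from https://dhsprogram.com/data/dataset_admin/. For each country, this notebook reads the household survey file (household ID, cluster, weight, wealth) and the cluster shapefile (cluster, urban/rural flag, latitude, longitude), and reports how many clusters lack spatial information.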

In [3]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

In [4]:
# Input files per country, in the order [survey file (.DTA), shapefile (.shp)].
# The cleaning loop reads element 0 as the survey and element 1 as the shapefile.
surveys = {
    'colombia': ['COHR61FL.DTA', 'COGE61FL.shp'],
    'honduras': ['HNHR62FL.DTA', 'HNGE61FL.shp'],
    'indonesia': ['IDHR42FL.DTA', 'IDGE43FL.shp'],
    'india': ['IAHR7CFL.DTA', 'IAGE7AFL.shp'],
    'kenya': ['KEHR72FL.DTA', 'KEGE71FL.shp'],
    'nigeria': ['NGHR7BFL.DTA', 'NGGE7BFL.shp'],
    'peru': ['PEHR5IFL.DTA', 'PEGE5IFL.shp'],
    'philippines': ['PHHR71FL.DTA', 'PHGE71FL.shp'],
}

In [5]:
# For each country: load the household survey and the cluster shapefile, keep
# and rename the columns of interest, and print summary counts.
for country, (survey_fname, shapefile_fname) in surveys.items():

    print(country)
    print('--------')

    # --- Survey data ---------------------------------------------------------
    # convert_categoricals=False keeps the raw coded values instead of labels.
    df = pd.read_stata(
        f'/data/mosaiks/dhs/{country}/raw/survey/{survey_fname}',
        convert_categoricals=False,
    )
    # Keep four columns and give three of them readable names.
    df = df[['hhid', 'hv001', 'hv005', 'hv271']].rename(
        columns={'hv001': 'cluster', 'hv005': 'weight', 'hv271': 'wealth'}
    )
    print('Number of observations: %i' % len(df))
    # dropna=False counts a missing cluster id as its own value, like unique().
    print('Number of clusters: %i' % df['cluster'].nunique(dropna=False))
    # Export currently disabled:
    #df.to_csv('/data/mosaiks/replication/dhs/' + country + '/clean/survey.csv', index=False)

    # --- Spatial data --------------------------------------------------------
    spatial = gpd.read_file(f'/data/mosaiks/dhs/{country}/raw/shapefile/{shapefile_fname}')
    original_length = len(spatial)
    print('Number of clusters in shapefile: %i' % original_length)
    spatial = spatial[['DHSCLUST', 'URBAN_RURA', 'LATNUM', 'LONGNUM']].rename(
        columns={'DHSCLUST': 'cluster', 'URBAN_RURA': 'urban', 'LATNUM': 'lat', 'LONGNUM': 'lon'}
    )
    # Rows whose latitude AND longitude are both exactly 0 are treated as
    # having no spatial information; keep every row where either is non-zero.
    has_location = (spatial['lat'] != 0) | (spatial['lon'] != 0)
    spatial = spatial[has_location]
    end_length = len(spatial)
    print('Number of clusters without spatial information: %i' % (original_length - end_length))
    # Export currently disabled:
    #spatial.to_csv('/data/mosaiks/replication/dhs/' + country + '/clean/clusters.csv', index=False)
    

colombia
--------
Number of observations: 51447
Number of clusters: 4987
Number of clusters in shapefile: 4987
Number of clusters without spatial information: 119
honduras
--------
Number of observations: 21362
Number of clusters: 1148
Number of clusters in shapefile: 1148
Number of clusters without spatial information: 20
indonesia
--------
Number of observations: 33088
Number of clusters: 1392
Number of clusters in shapefile: 1392
Number of clusters without spatial information: 73
india
--------
Number of observations: 636699
Number of clusters: 30170
Number of clusters in shapefile: 30197
Number of clusters without spatial information: 118
kenya
--------
Number of observations: 36430
Number of clusters: 1594
Number of clusters in shapefile: 1594
Number of clusters without spatial information: 9
nigeria
--------
Number of observations: 40427
Number of clusters: 1389
Number of clusters in shapefile: 1389
Number of clusters without spatial information: 7
peru
--------
Number of observa