# Replication Project: Nigerian Demographic and Health Surveys (DHS)

**Replication Game – Berlin, 30 October 2025**  

---

**Institute for Replication & Freie Universität Berlin**  

**Author:** [Dominik Bursy](mailto:dominik.bursy@icloud.com)  

**Last Updated:** October 2025

---

**Reference:**  
Berazneva, Julia, and Tanya S. Byker. 2017. *Does Forest Loss Increase Human Disease? Evidence from Nigeria.* American Economic Review, 107(5), 516–521. https://doi.org/10.1257/aer.p20171132

---

**Resources:**  
- [DHS Recode Manual (English)](https://dhsprogram.com/publications/publication-dhsg4-dhs-questionnaires-and-manuals.cfm)
- [Guide on Merging DHS](https://dhsprogram.com/data/Merging-Datasets.cfm)
- [Guidelines on the Use of DHS GPS Data (English)](https://dhsprogram.com/publications/publication-SAR8-Spatial-Analysis-Reports.cfm)  


## Import Packages <a class="anchor" id="packages"></a>

In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString
from datetime import timedelta

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
## Set Root Directory
# The project root is the parent of the kernel's current working directory;
# it is kept as a plain string because the paths below are built with f-strings.
ROOT_FOLDER = str(Path.cwd().parent)
print(ROOT_FOLDER)

/Users/dominikbursy/Documents/8_PhD_New/replication_game


---

## Data Import

In [21]:
# Locations of the DHS children file (Stata .DTA) and the geographic cluster
# shapefile for each of the three survey waves, all below ROOT_FOLDER.
dhs_2008_children_path = f"{ROOT_FOLDER}/datasets/dhs_survey/2008/NGKR53DT/NGKR53FL.DTA"
dhs_2013_children_path = f"{ROOT_FOLDER}/datasets/dhs_survey/2013/NGKR6ADT/NGKR6AFL.DTA"
dhs_2018_children_path = f"{ROOT_FOLDER}/datasets/dhs_survey/2018/NGKR7BDT/NGKR7BFL.DTA"

dhs_2008_geographic_path = f"{ROOT_FOLDER}/datasets/dhs_survey/2008/NGGE52FL/NGGE52FL.shp"
dhs_2013_geographic_path = f"{ROOT_FOLDER}/datasets/dhs_survey/2013/NGGE6AFL/NGGE6AFL.shp"
dhs_2018_geographic_path = f"{ROOT_FOLDER}/datasets/dhs_survey/2018/NGGE7BFL/NGGE7BFL.shp"

# Children files: convert_categoricals=False keeps the raw codes stored in the
# Stata files instead of turning labelled variables into pandas categoricals.
df_dhs_2008_children, df_dhs_2013_children, df_dhs_2018_children = (
    pd.read_stata(children_path, convert_categoricals=False)
    for children_path in (
        dhs_2008_children_path,
        dhs_2013_children_path,
        dhs_2018_children_path,
    )
)

# Geographic files: one GeoDataFrame per survey wave.
gdf_dhs_2008_geographic, gdf_dhs_2013_geographic, gdf_dhs_2018_geographic = (
    gpd.read_file(geographic_path)
    for geographic_path in (
        dhs_2008_geographic_path,
        dhs_2013_geographic_path,
        dhs_2018_geographic_path,
    )
)

## Join DataFrames into a GeoDataFrame

In [22]:
def _attach_cluster_geography(df_children, gdf_geographic):
    """Join the geographic cluster attributes onto the children records.

    Rows are matched on the cluster number: ``v001`` in the children file and
    ``DHSCLUST`` in the geographic file. ``DataFrame.join`` defaults to a left
    join, so every children record is kept. The cluster number is returned as
    a regular ``v001`` column.
    """
    children_by_cluster = df_children.set_index(["v001"])
    geography_by_cluster = gdf_geographic.set_index(["DHSCLUST"])
    return children_by_cluster.join(geography_by_cluster).reset_index(names='v001')


df_merged_2008 = _attach_cluster_geography(df_dhs_2008_children, gdf_dhs_2008_geographic)
df_merged_2013 = _attach_cluster_geography(df_dhs_2013_children, gdf_dhs_2013_geographic)
df_merged_2018 = _attach_cluster_geography(df_dhs_2018_children, gdf_dhs_2018_geographic)

# Stack the three waves and declare the geometry column and its CRS.
df_dhs_stacked = pd.concat(
    [df_merged_2008, df_merged_2013, df_merged_2018],
    ignore_index=True,
)
gdf_dhs = gpd.GeoDataFrame(
    df_dhs_stacked,
    geometry=df_dhs_stacked["geometry"],
    crs="EPSG:4326",
)

# DHSCLUST was used as the join index of the geographic file and is therefore
# no longer a column; restore it as a copy of v001.
gdf_dhs["DHSCLUST"] = gdf_dhs["v001"].copy()

## Select and Prepare Features

In [23]:
# Source columns to keep (keys) and the feature each one feeds (values).
# All v*/b*/h* variables come from the DHS children files.
#
# Every key appears exactly once. The original literal repeated "v130", "v131"
# and "v501"; Python keeps only the last value of a repeated key, so the values
# below are the ones that were effective before. The comments list every
# indicator that is derived from such a column.
dict_dhs_features = {
    "caseid": "caseid",
    "v000": "v000",
    "v001": "v001",
    "v002": "v002",
    "v003": "v003",
    "v005": "v005",
    "v021": "v021",
    "v022": "v022",
    "v136": "no_HH_members",
    "v137": "no_kids_under_5",
    "v115": "time_to_water",
    "v025": "rural",
    "v152": "head_HH_age",  # Not perfect, but likely picked by the authors
    "v715": "HH_head_edu_years",
    "v459": "own_bednet",
    "v151": "femhh",  # Likely female household head / sex of household head
    "v116": "toilet",  # 31 corresponds to no toilet
    "v460": "kidnet",
    "v190": "poorest",  # Wealth index quintile (alternatively hv270 in the household file)
    "v161": "firewood",  # Type of cooking fuel
    "v127": "floor",  # Main floor material; higher quality: 20, 21, 22, 30, 31, 32, 33, 34, 35
    "v012": "age_resp",
    "v133": "edu_years",
    "v130": "muslim",  # Religion: source of the christian and muslim indicators
    "v131": "hausa",  # Ethnicity: source of the yoruba, igbo and hausa indicators
    "v201": "no_child_total",
    "v218": "no_child_living",
    "v461": "resp_slept_net",  # TODO: choice of variable still to be confirmed
    "v714": "resp_works",
    "v213": "pregnant",
    "v501": "livewith",  # Marital status: source of the married and livewith indicators
    "v006": "month",
    "v024": "region",
    "b8": "age",
    "h22": "fever",
    "h11": "diarrhea",
    "h31": "cough",

    # DHS Geographic Data
    "ALT_DEM": "altitude",
    "DHSCLUST": "DHSCLUST",  # Duplicate of v001
    "DHSYEAR": "DHSYEAR",
    "geometry": "geometry",
}

In [24]:
# Keep only the columns listed in dict_dhs_features. The keys are passed as an
# explicit list, and .copy() makes the result an independent frame so that the
# column assignments in the cleaning step do not write into a slice of the
# merged data (which can trigger pandas' SettingWithCopyWarning).
gdf_dhs = gdf_dhs[list(dict_dhs_features)].copy()

In [25]:
## Clean and Label variables


def _code_indicator(codes, yes_codes):
    """Return 1.0 where ``codes`` is one of ``yes_codes`` and 0.0 elsewhere.

    Missing values are never in ``yes_codes`` and therefore become 0.0.
    """
    return codes.isin(yes_codes).astype(float)


def _yes_no_indicator(codes, yes_codes, no_codes=(0,)):
    """Return 1.0 for ``yes_codes``, 0.0 for ``no_codes`` and NaN otherwise.

    Any other code (e.g. 8 / 9 in the symptom variables) and missing values
    end up as NaN.
    """
    values = np.select(
        [codes.isin(yes_codes), codes.isin(no_codes)],
        [1.0, 0.0],
        default=np.nan,
    )
    return pd.Series(values, index=codes.index)


# Shift the source codes down by one so they can be used as 0/1 dummies after
# renaming (v025 -> rural, v151 -> femhh).
# NOTE: these two lines modify the source columns in place, so re-running this
# cell without re-running the cells above shifts the values a second time.
gdf_dhs["v025"] = gdf_dhs["v025"] - 1
gdf_dhs["v151"] = gdf_dhs["v151"] - 1

# Housing and household characteristics.
# floor: main floor material of higher quality. The earlier `v127 == 9`
# assignment was removed: it was overwritten by this definition anyway.
gdf_dhs["floor"] = _code_indicator(gdf_dhs["v127"], [20, 21, 22, 30, 31, 32, 33, 34, 35])
gdf_dhs["toilet"] = _code_indicator(gdf_dhs["v116"], [31])  # 31 = no toilet
gdf_dhs["kidnet"] = _code_indicator(gdf_dhs["v460"], [1, 2])

# poorest: lowest wealth index quintile. Fixed to read v190 (wealth index);
# it was previously computed from v127 (floor material).
gdf_dhs["poorest"] = _code_indicator(gdf_dhs["v190"], [1])

# firewood: cooking fuel code 8. Fixed to read v161 (type of cooking fuel);
# it was previously computed from v127 (floor material).
gdf_dhs["firewood"] = _code_indicator(gdf_dhs["v161"], [8])

# Religion.
gdf_dhs["christian"] = _code_indicator(gdf_dhs["v130"], [1, 2])
gdf_dhs["muslim"] = _code_indicator(gdf_dhs["v130"], [3])

# Ethnicity.
# NOTE(review): the children files are read with convert_categoricals=False,
# so v131 presumably holds numeric codes and these label comparisons never
# match (all three indicators would be 0). TODO: replace the labels with the
# v131 codes from each survey's recode documentation; the codes may differ
# between the 2008, 2013 and 2018 waves.
gdf_dhs["yoruba"] = (gdf_dhs["v131"] == "Yoruba").astype(float)
gdf_dhs["igbo"] = (gdf_dhs["v131"] == "Igbo").astype(float)
gdf_dhs["hausa"] = (gdf_dhs["v131"] == "Hausa").astype(float)

# Marital status.
gdf_dhs["married"] = _code_indicator(gdf_dhs["v501"], [1])
gdf_dhs["livewith"] = _code_indicator(gdf_dhs["v501"], [2])

# Child health outcomes: 0 = no, 8 / 9 (and anything else) = missing.
gdf_dhs["fever"] = _yes_no_indicator(gdf_dhs["h22"], yes_codes=[1])
# h11 / h31 distinguish "yes, last 24 hours" (1) from "yes, last two weeks" (2)
# in the DHS recode; both count as a case. Previously only code 2 was mapped
# to 1, which left code 1 as missing.
gdf_dhs["diarrhea"] = _yes_no_indicator(gdf_dhs["h11"], yes_codes=[1, 2])
gdf_dhs["cough"] = _yes_no_indicator(gdf_dhs["h31"], yes_codes=[1, 2])

In [26]:
## Map source column names to readable names

# Columns from the DHS children files. Identifier and survey-design columns
# map to themselves, i.e. they keep their name.
survey_column_names = {
    "caseid": "caseid",
    "v000": "v000",
    "v001": "v001",
    "v002": "v002",
    "v003": "v003",
    "v005": "v005",
    "v021": "v021",
    "v022": "v022",
    "v136": "no_HH_members",
    "v137": "no_kids_under_5",
    "v115": "time_to_water",
    "v025": "rural",
    "v152": "head_HH_age",  # Not perfect, but likely picked by the authors
    "v715": "HH_head_edu_years",
    "v459": "own_bednet",
    "v151": "femhh",  # Likely female household head / sex of household head
    "v012": "age_resp",
    "v133": "edu_years",
    "v201": "no_child_total",
    "v218": "no_child_living",
    "v461": "resp_slept_net",  # TODO: choice of variable still to be confirmed
    "v714": "resp_works",
    "v213": "pregnant",
    "v006": "month",
    "v024": "region",
    "b8": "age",
}

# Columns from the DHS geographic files.
geographic_column_names = {
    "ALT_DEM": "altitude",
    "DHSCLUST": "DHSCLUST",  # Duplicate of v001
    "DHSYEAR": "DHSYEAR",
    "geometry": "geometry",
}

# Survey columns first, geographic columns last.
dict_dhs_features_rename = {**survey_column_names, **geographic_column_names}

In [27]:
# Apply the readable column names. Columns that are not keys of the mapping
# keep their current name.
gdf_dhs = gdf_dhs.rename(columns=dict_dhs_features_rename)

## Export Dataframe

In [28]:
# Write the prepared data as GeoJSON. The output folder is created first so
# the export does not fail when the folder does not exist yet.
output_path = f"{ROOT_FOLDER}/output/gdf_dhs.geojson"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
gdf_dhs.to_file(output_path, driver='GeoJSON')

---