# Housing, Health, and Happiness – Milestone P4

This milestone aims to provide an extension to the paper _Housing, Health, and Happiness_. Spcifically, we would like to make an additional matching on the household level in order to further confirm the conclusions of the authors, or to nuance them.

## Imports

In [1]:
import networkx as nx
import numbers as nb
import numpy as np
import operator
import pandas as pd
import statsmodels.api as sm

from copy import deepcopy
from math import inf

DATA_PATH = "data/"

## Modularised figure reproduction

Here, we define modular functions for replicating the original tables from the paper, and to compare results to the expected values.
These functions will be reused for generating new tables from the matched datasets.

### 0. Defining common values

Since the treatment variable and the clustering variable are the same across all datasets, we define them here.

In [2]:
vars_ = {
    "treatment_var": ['dpisofirme'],
    "clustering_var": ['idcluster'],
    "demographic_control_vars": [],
    "health_control_vars": ['S_hasanimals', 'S_animalsinside', 'S_waterland', 'S_waterhouse',
                            'S_electricity', 'S_washhands', 'S_garbage'],
    "model3_control_vars": ['S_cashtransfers', 'S_milkprogram', 'S_foodprogram', 'S_seguropopular'],
    "dependant_vars": []
}

In [3]:
def generate_models(vars_):
    models = {
        'model_1': [],
        'model_2': vars_["demographic_control_vars"] + vars_["health_control_vars"],
    }
    models['model_3'] = models['model_2'] + vars_["model3_control_vars"]
    return models

### 2. Generating missing values

Missing values in columns containing the independent variables are replaced by 0 and a dummy variable indicating whether the value was missing is added (missing=1, present=0) for each variable containing missing values (others would only contain 0). An updated model taking the dummy variables into account is returned.

In [4]:
#Parameter inplace is used to show explicitly that this function has side effects on df
def generate_missing_values(df, models_, inplace=True):
    models = deepcopy(models_)
    columns = models["model_3"]
    if not inplace:
        raise ValueError("Parameter inplace has to be true")
    for col_name in columns:
        if df[col_name].isnull().values.any():
            new_col = 'dmiss_' + col_name
            df[new_col] = df[col_name].apply(pd.isna).apply(int)
            if col_name in models['model_2']:
                models['model_2'].append(new_col)
            models['model_3'].append(new_col)
    zeros = dict(zip(columns, [0] * len(columns)))
    df.fillna(zeros, inplace=True)
    return models

### 3. Regression

There are 2 steps to this part.

The first is to compute the mean and standard deviation for the control group for each dependant variable. This is done using `mean()` and `std()` on the control DataFrame (i.e. `vars_['treatment_var']` = False).

The second part is to do a linear regression for each dependent variable once for each model. This is his done using 2 nested for loops (over models, then over dependent variables) and using `statsmodels`'s `OLS` with a cluster covariance estimator (`vars_['clustering_var']`).

In [5]:
### Helper function to convert p-value to stars like in the paper
def to_stars(p):
    if p < 0.01:
        return "***"
    elif p < 0.05:
        return "** "
    elif p < 0.1:
        return "*  "
    return "   "

In [6]:
def compute_results(df, models, vars_):
    treatment_var = vars_['treatment_var'][0]
    # Part 1: control
    dependant_vars = vars_['dependant_vars']
    control = df[df[treatment_var].apply(lambda x: not bool(x))][dependant_vars]
    res = pd.DataFrame({
        'control means': control.mean(),
        'control std': control.std()
    }, index=dependant_vars)
    
    # Part 2: linear regression
    Y = df[dependant_vars]
    
    for k, v in models.items():
        X = df[vars_['treatment_var'] + v]
        X = sm.add_constant(X)
        column = []
        for label, y in Y.items():
            if (k == "model_1"):
                nb_control = y[X[treatment_var] == 0].count()
                nb_treated = y[X[treatment_var] == 1].count()
                print(f"Sample sizes for {label}:")
                print(f"Observations treatment: {nb_treated}")
                print(f"Observations control:   {nb_control}\n")
            regression = sm.OLS(y, X, missing='drop').fit(cov_type='cluster',
                                          cov_kwds={'groups': df.dropna(subset=[label])[vars_['clustering_var']]})
            coeff = regression.params[treatment_var]
            significance = to_stars(regression.pvalues[treatment_var])
            column.append((coeff, regression.bse[treatment_var], significance, 
                           100 * coeff / res.loc[label]['control means']))
        res[k] = column
    
    return pd.DataFrame(res, index=dependant_vars)

### 4. Showing and discussing the results

We display the DataFrame containing the results rounded to 3 decimals (as in the originial paper), then compute the difference (after rounding) with the results from the paper.

In [7]:
def round3(val):
    if isinstance(val, nb.Number):
        return round(val, 3)
    elif isinstance(val, str):
        return val
    else:
        tpe = type(val)
        return tpe(map(round3, val))

def round_res(df, index):
    res = df.apply(round3)
    res.index = index
    return res

In [8]:
# x, y chars
def char_compare(expected_star, real_star):
    star_values = {'none':' ', 'star': '*'}
    if(expected_star not in star_values.values() or real_star not in star_values.values()):
        return '/'
    is_expected_star = expected_star == star_values['star']
    is_real_star = real_star == star_values['star']
    res = [[' ', '+'], ['-', '*']]
    return res[is_expected_star][is_real_star]

def star_compare(expected_stars, real_stars):
    return "".join(map(char_compare, list(expected_stars), list(real_stars)))

def compare_results(expected_val, real_val):
    if isinstance(expected_val, tuple):
        return tuple(map(compare_results, expected_val, real_val))
    elif not (isinstance(expected_val, str)):
        return "{:.2e}".format(operator.sub(real_val, expected_val))
    else:
        return star_compare(expected_val, real_val)
    
def diff(expected_df, results_df):
    comp = expected_df.copy()
    for col in comp.columns:
        comp[col] = list(map(compare_results, expected_df[col], results_df[col]))
    return comp

In [9]:
def compute_and_output_results(df, models, vars_, expected_res, print_expected=True):
    res = compute_results(df, models, vars_)

    rounded_res = round_res(res, vars_['dependant_vars'])
    display(rounded_res)
    
    if print_expected:
        print("\nPaper's results")
        display(expected_res)
    print("\nDifference with expected results")
    comp = diff(expected_res, rounded_res)
    display(comp)

## Matchings

In [10]:
def distance_Lp(p, control_row, treated_row):
    if p == inf:
        return max(abs(control_row-treated_row))
    else:
        return pow(sum(pow(abs(control_row-treated_row), p)), 1/p)

def compute_max_dist(p, dim):
    return distance_Lp(p, np.zeros(dim), np.ones(dim))
    
def normalize_df(df):
    return (df - df.min()) / (df.max() - df.min())

from tqdm import tqdm
# Takes a dataframe, the name of the column co
def match(base_df, match_features, control_col=vars_["treatment_var"][0], p_distances=[1, 2, inf], epsilon=None):
    
    graphs = []
    max_dists = []
    n = len(match_features)
    for p in p_distances:
        graphs.append(nx.Graph())
        max_dists.append(compute_max_dist(p, n))
        
    filter_ = base_df[control_col].astype(bool).values
    normalized_df = normalize_df(base_df)
    treated_set = normalized_df[filter_][match_features]
    control_set = normalized_df.drop(treated_set.index)[match_features]
    
    if epsilon:
        adjusted_eps = [epsilon if (p == inf) else epsilon * max_dists[i] for i, p in enumerate(p_distances)]
    for control_id, control_row in tqdm(control_set.iterrows()):
        for treat_id, treat_row in treated_set.iterrows():
            for i, p in enumerate(p_distances):
                d = distance_Lp(p, control_row, treat_row)
                if not epsilon or d < adjusted_eps[i]:
                    graphs[i].add_weighted_edges_from([(control_id, treat_id, 1-d/max_dists[i])])
    
    # max_weight_matching returns 1-to-1 matching
    matches = []
    for G in graphs:
        matches.append(nx.max_weight_matching(G))
    
    matched_dfs = dict()
    for i, match in enumerate(matches):
        matched_df = pd.DataFrame(columns = base_df.columns)
        for (l, r) in match:
            matched_df = matched_df.append(base_df.loc[l])
            matched_df = matched_df.append(base_df.loc[r])
        matched_dfs[p_distances[i]] = matched_df
    return matched_dfs

def match_and_output_result(df, match_features, models, vars_, expected_res, control_col=vars_["treatment_var"][0],
                            p_distances=[1, 2, inf], epsilon=None):
    matched_dfs = match(df, match_features, control_col, p_distances, epsilon)
    print("Paper's results")
    display(expected_res)
    features_str = match_features[0]
    for i, feature in enumerate(match_features[1:]):
        if (i == len(match_features) - 2):
            features_str += f", and {feature}"
        else:
            features_str += f", {feature}"
    for p, matched_df in matched_dfs.items():
        eps_str = f" < {epsilon * compute_max_dist(p, len(match_features)):.3f}" if epsilon else ""
        print(f"\nResults for matching on {features_str} with L_{p} distance" + eps_str)
        print(f"Found {len(matched_df)//2} matches")
        compute_and_output_results(matched_df, models, vars_, expected_res, print_expected=False)
        print("\n")

## Figure reproduction: Table 4

The goal of this part is to reproduce Table 4 of the paper _Housing, Health, and Happiness_.

### 1. Loading the data and understanding what we'll need

We start by identifying columns which contain our dependent variables, treatment variable, independent variables for each of the three models and clustering variable.

The data related to the dependent variables can be fount in the following columns:
  + Share of rooms with cement floors (`S_shcementfloor`)
  + Cement floor in kitchen (`S_cementfloorkit`)
  + Cement floor in dining room (`S_cementfloordin`)
  + Cement floor in bathroom (`S_cementfloorbat`)
  + Cement floor in bedroom (`S_cementfloorbed`)

Control and treatment groups are identified by `dpisofirme` (control = 0, treatment = 1).

Model 1 has no control variables.

Model 2 has (25 - 1) control variables:
  + demographic:
    + Number of household members (`S_HHpeople`)
    + (Number of rooms (`S_rooms`) -> This one is mentioned in the paper, but after looking at the STATA file, I noticed it was not used for the regression and decided to drop it)
    + Head of household's years of schooling (`S_headeduc`)
    + Spouse's years of schooling (`S_spouseeduc`)
    + Head of household's age (`S_headage`)
    + Spouse's age (`S_spouseage`)
    + Proportion of Males 0-5yrs in household (`S_dem1`)
    + Proportion of Males 6-17yrs in household (`S_dem2`)
    + Proportion of Males 18-49yrs in household (`S_dem3`)
    + Proportion of Males 50+yrs in household (`S_dem4`)
    + Proportion of Females 0-5yrs in household (`S_dem5`)
    + Proportion of Females 6-17yrs in household (`S_dem6`)
    + Proportion of Females 18-49yrs in household (`S_dem7`)
    + Proportion of Females 50+yrs in household (`S_dem8`)
  + health:
    + Household has animals on land (`S_hasanimals`)
    + Animals allowed to enter the house (`S_animalsinside`)
    + Water connection outside (`S_waterland`)
    + Water connection inside the house (`S_waterhouse`)
    + Electricity (`S_electricity`)
    + Number of times respondent washed hands the day before (`S_washhands`)
    + Uses garbage collection service (`S_garbage`)
    
Model 3 adds 4 control variables:
  + Transfers per capita from government programs (`S_cashtransfers`)
  + Household beneficiary of government milk supplement program (`S_milkprogram`)
  + Household beneficiary of government food program (`S_foodprogram`)
  + Household beneficiary of seguro popular (`S_seguropopular`)
  
All models use `idcluster` for clustering.

In [11]:
df_t4_full = pd.read_stata(DATA_PATH + "PisoFirme_AEJPol-20070024_household.dta")

vars_t4 = deepcopy(vars_)
vars_t4["demographic_control_vars"] = ['S_HHpeople', 'S_headeduc', 'S_spouseeduc', 'S_headage',
                                       'S_spouseage', 'S_dem1', 'S_dem2', 'S_dem3', 'S_dem4',
                                       'S_dem5', 'S_dem6', 'S_dem7', 'S_dem8']
vars_t4["dependant_vars"] = ['S_shcementfloor', 'S_cementfloorkit', 'S_cementfloordin',
                             'S_cementfloorbat', 'S_cementfloorbed']
models_t4 = generate_models(vars_t4)

### 2. Expected output

In [12]:
expected_res_t4 = pd.DataFrame({'control means': [0.728, 0.671, 0.709, 0.803, 0.668],
                             'control std': [0.363, 0.470, 0.455, 0.398, 0.471],
                             'model_1': [(0.202, 0.021, '***', 27.746),
                                         (0.255, 0.025, '***', 37.936),
                                         (0.210, 0.026, '***', 29.633),
                                         (0.105, 0.022, '***', 13.071),
                                         (0.238, 0.020, '***', 35.598)],
                             'model_2': [(0.208, 0.019, '***', 28.512),
                                         (0.260, 0.023, '***', 38.708),
                                         (0.217, 0.025, '***', 30.588),
                                         (0.113, 0.018, '***', 14.043),
                                         (0.245, 0.021, '***', 36.735)],
                             'model_3':[(0.210, 0.019, '***', 28.876),
                                        (0.265, 0.023, '***', 39.440),
                                        (0.221, 0.025, '***', 31.189),
                                        (0.117, 0.018, '***', 14.536),
                                        (0.245, 0.020, '***', 36.695)]},
                            index = vars_t4["dependant_vars"])

### 3. Data cleaning and generating missing values

The next step is to clean the data, only keeping the necessary rows and generating missing values. 

Rows were identified as _unecessary_ if they were dropped in the original paper, which mentioned dropping samples for which geographical data was unavailable. Initially, we used that as a criteria for filtering the dataset. However, doing this gave worse results than not filtering at all. Thanks to a helpful comment on Zulip, we noticed a discreptancy between the number of samples we was supposed to have according to Table 1 (1362) and the number we had (1187). After looking around, we noticed geographical data was missing for the following 203 lines: (1788:1916), (2505:2527), (2576:2592), (2656:2661), (2755:2782) (see [Annex B1](#annexB1)). The last 28 lines seem to correspond to the 28 that were dropped in the paper, but the others seem to be due to some corruption of the dataset for some reason. Therefore, we decided to only drop the final 28 lines in order to use the same dataset as the paper, and we get much better results that way

In [13]:
df_t4 = df_t4_full[:2755].copy()
#df_t4 = df_t4[vars_t4["treatment_var"] + vars_t4["clustering_var"] + models_t4['model_3'] + vars_t4["dependant_vars"]]
new_models_t4 = generate_missing_values(df_t4, models_t4)

### 4. Computing and displaying results

In [14]:
compute_and_output_results(df_t4, new_models_t4, vars_t4, expected_res_t4)

Sample sizes for S_shcementfloor:
Observations treatment: 1362
Observations control:   1393

Sample sizes for S_cementfloorkit:
Observations treatment: 1362
Observations control:   1393

Sample sizes for S_cementfloordin:
Observations treatment: 1362
Observations control:   1393

Sample sizes for S_cementfloorbat:
Observations treatment: 1362
Observations control:   1393

Sample sizes for S_cementfloorbed:
Observations treatment: 1362
Observations control:   1393



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.728,0.363,"(0.202, 0.021, ***, 27.746)","(0.207, 0.02, ***, 28.389)","(0.209, 0.02, ***, 28.755)"
S_cementfloorkit,0.671,0.47,"(0.255, 0.025, ***, 37.936)","(0.259, 0.023, ***, 38.516)","(0.263, 0.023, ***, 39.255)"
S_cementfloordin,0.709,0.455,"(0.21, 0.026, ***, 29.633)","(0.216, 0.025, ***, 30.478)","(0.22, 0.025, ***, 31.081)"
S_cementfloorbat,0.803,0.398,"(0.105, 0.022, ***, 13.071)","(0.112, 0.018, ***, 13.975)","(0.116, 0.018, ***, 14.463)"
S_cementfloorbed,0.668,0.471,"(0.238, 0.02, ***, 35.598)","(0.244, 0.02, ***, 36.488)","(0.243, 0.02, ***, 36.458)"



Paper's results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.728,0.363,"(0.202, 0.021, ***, 27.746)","(0.208, 0.019, ***, 28.512)","(0.21, 0.019, ***, 28.876)"
S_cementfloorkit,0.671,0.47,"(0.255, 0.025, ***, 37.936)","(0.26, 0.023, ***, 38.708)","(0.265, 0.023, ***, 39.44)"
S_cementfloordin,0.709,0.455,"(0.21, 0.026, ***, 29.633)","(0.217, 0.025, ***, 30.588)","(0.221, 0.025, ***, 31.189)"
S_cementfloorbat,0.803,0.398,"(0.105, 0.022, ***, 13.071)","(0.113, 0.018, ***, 14.043)","(0.117, 0.018, ***, 14.536)"
S_cementfloorbed,0.668,0.471,"(0.238, 0.02, ***, 35.598)","(0.245, 0.021, ***, 36.735)","(0.245, 0.02, ***, 36.695)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.0,0.0,"(0.00e+00, 0.00e+00, ***, 0.00e+00)","(-1.00e-03, 1.00e-03, ***, -1.23e-01)","(-1.00e-03, 1.00e-03, ***, -1.21e-01)"
S_cementfloorkit,0.0,0.0,"(0.00e+00, 0.00e+00, ***, 0.00e+00)","(-1.00e-03, 0.00e+00, ***, -1.92e-01)","(-2.00e-03, 0.00e+00, ***, -1.85e-01)"
S_cementfloordin,0.0,0.0,"(0.00e+00, 0.00e+00, ***, 0.00e+00)","(-1.00e-03, 0.00e+00, ***, -1.10e-01)","(-1.00e-03, 0.00e+00, ***, -1.08e-01)"
S_cementfloorbat,0.0,0.0,"(0.00e+00, 0.00e+00, ***, 0.00e+00)","(-1.00e-03, 0.00e+00, ***, -6.80e-02)","(-1.00e-03, 0.00e+00, ***, -7.30e-02)"
S_cementfloorbed,0.0,0.0,"(0.00e+00, 0.00e+00, ***, 0.00e+00)","(-1.00e-03, -1.00e-03, ***, -2.47e-01)","(-2.00e-03, 0.00e+00, ***, -2.37e-01)"


## Figure reproduction: Table 5

The goal of this milestone is to reproduce Table 5 of the paper _Housing, Health, and Happiness_.

### 1. Loading the data and understanding what we'll need

We started this replication exercise by adapting our replication of Table 4. The main differences are:
1. we use the `individual.dta` rather than the `household.dta` file;
1. we use different dependant variables.

Note that Table 5 only focuses on children under the age of 6, so we drop every row pertaining to a person older than 6 years old (see [Annex A](#annexA) for further justification).

Using the explanation in section V of the paper as well as the STATA code, we identified the columns which contain our dependent variables, treatment variable, independent variables for each of the three models and clustering variable.

The data related to the dependant variables can be found in the following columns:
  + Parasite count (`S_parcount`)
  + Diarrhea (`S_diarrhea`)
  + Anemia (`S_anemia`)
  + McArthur Communication Development Test score (`S_mccdts`)
  + Picture Peabody Vocabulary Test percentile score (`S_pbdypct`)
  + Height-for-age z-score (`S_haz`)
  + Weight-for-height z-score (`S_whz`)

Control and treatment groups are identified by `dpisofirme` (control = 0, treatment = 1).

Model 1 has no control variables.

Model 2 has 58 control variables:
  + demographic:
    + Number of household members (`S_HHpeople`)
    + Number of rooms (`S_rooms`)
    + Age (`S_age`)
    + Male (`S_gender`) -> the README specifies that Male = 1, but the loaded dataframe contained the values `0.0` and `hombre`, so this was corrected to be `0` and `1`
    + Mother of at least one child in household present (`S_childma`)
    + Mother's age (if present) (`S_childmaage`)
    + Mother's years of schooling (if present) (`S_childmaeduc`)
    + Father of at least one child in household present (`S_childpa`)
    + Father's age (if present) (`S_childpaage`)
    + Father's years of schooling (if present) (`S_childpaeduc`)
    + (Trimester * Gender) Dummy for children 0-5yrs (`dtriage*`) [48]
  + health:
    + Household has animals on land (`S_hasanimals`)
    + Animals allowed to enter the house (`S_animalsinside`)
    + Water connection outside (`S_waterland`)
    + Water connection inside the house (`S_waterhouse`)
    + Electricity (`S_electricity`)
    + Number of times respondent washed hands the day before (`S_washhands`)
    + Uses garbage collection service (`S_garbage`)
    
Model 3 adds 4 control variables:
  + Transfers per capita from government programs (`S_cashtransfers`)
  + Household beneficiary of government milk supplement program (`S_milkprogram`)
  + Household beneficiary of government food program (`S_foodprogram`)
  + Household beneficiary of seguro popular (`S_seguropopular`)
  
All models use `idcluster` for clustering.

In [15]:
original_df_t5 = pd.read_stata(DATA_PATH + "PisoFirme_AEJPol-20070024_individual.dta")

vars_t5 = deepcopy(vars_)
demographic_control_vars_1 = ['S_HHpeople', 'S_rooms', 'S_age', 'S_gender', 'S_childma', 'S_childmaage',
                              'S_childmaeduc', 'S_childpa', 'S_childpaage', 'S_childpaeduc']
demographic_control_vars_2 = [x for x in original_df_t5.columns if 'dtriage' in x]
vars_t5["demographic_control_vars"] = demographic_control_vars_1 + demographic_control_vars_2
vars_t5["dependant_vars"] = ['S_parcount', 'S_diarrhea', 'S_anemia', 'S_mccdts', 'S_pbdypct', 'S_haz', 'S_whz']

models_t5 = generate_models(vars_t5)

### 2. Expected output

In [16]:
expected_res_t5 = pd.DataFrame({'control means': [0.333, 0.142, 0.426, 13.354, 30.656, -0.605, 0.125],
                             'control std': [0.673, 0.349, 0.495, 18.952, 24.864, 1.104, 1.133],
                             'model_1': [(-0.065, 0.032, '** ', -19.545),
                                         (-0.018, 0.009, '*  ', -12.819),
                                         (-0.085, 0.028, '***', -20.059), 
                                         (4.031, 1.650, '** ', 30.182), 
                                         (2.668, 1.689,'*  ' , 8.702), 
                                         (0.007, 0.043, '   ', -1.161), 
                                         (0.002, 0.034, '   ', 1.790)],
                             'model_2': [(-0.064, 0.031, '** ', -19.345),
                                         (-0.020, 0.009, '** ', -13.834),
                                         (-0.081, 0.027, '***', -18.908),
                                         (5.652, 1.642, '***', 42.325),
                                         (3.206, 1.430, '** ', 10.460),
                                         (0.002, 0.038, '   ',0.279),
                                         (-0.005, 0.036, '   ', -4.119)],
                             'model_3':[(-0.064, 0.032, '** ', -19.198),
                                         (-0.018, 0.009, '*  ', -12.803),
                                         (-0.083, 0.027, '***', -19.388),
                                         (5.557, 1.641, '***', 41.609),
                                         (3.083, 1.410, '** ', 10.058),
                                         (-0.002, 0.039, '   ', -0.323),
                                         (-0.011, 0.037, '   ', -8.727)]},
                            index = vars_t5['dependant_vars'])

### 3. Data cleaning and generating missing values

The next step is to clean the data, only keeping the necessary rows and generating missing values.

Once again, we identify clusters of missing geographical data in [Annex B2](#annexB2). However, the paper never mentions how many individuals' data they used, so we used the data provided in Table 1 for each dependant variable as a point of comparison. The check of the number of datapoints after dropping the final cluster is done in [Annex C](#annexC).

Note: in the STATA file the paper's authors do not check for missing values in `dtriage` colums. Here, we confirm that there is no missing values in these columns, so we do not need to manually exclude them when generating missing values.

In [17]:
children_df_t5 = original_df_t5[original_df_t5.S_age < 6].reset_index(drop=True) # Magic number! See Annex B!
children_df_t5['S_gender'] = children_df_t5['S_gender'].apply(lambda x: x == 'hombre').astype(int)
df_t5 = children_df_t5[:4052].copy()

#df_t5 = df_t5[vars_t5["treatment_var"] + vars_t5["clustering_var"] + vars_t5["dependant_vars"] + models_t5['model_3'] + ['coord_x']]
new_models_t5 = generate_missing_values(df_t5, models_t5)

### 4. Computing and displaying results

In [18]:
compute_and_output_results(df_t5, new_models_t5, vars_t5, expected_res_t5)

Sample sizes for S_parcount:
Observations treatment: 1528
Observations control:   1566

Sample sizes for S_diarrhea:
Observations treatment: 1930
Observations control:   2105

Sample sizes for S_anemia:
Observations treatment: 1768
Observations control:   1951

Sample sizes for S_mccdts:
Observations treatment: 291
Observations control:   302

Sample sizes for S_pbdypct:
Observations treatment: 757
Observations control:   817

Sample sizes for S_haz:
Observations treatment: 1865
Observations control:   2053

Sample sizes for S_whz:
Observations treatment: 1881
Observations control:   2058



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_parcount,0.333,0.673,"(-0.065, 0.032, ** , -19.545)","(-0.064, 0.031, ** , -19.345)","(-0.064, 0.032, ** , -19.198)"
S_diarrhea,0.142,0.349,"(-0.018, 0.009, * , -12.819)","(-0.02, 0.009, ** , -13.834)","(-0.018, 0.009, * , -12.803)"
S_anemia,0.426,0.495,"(-0.085, 0.028, ***, -20.059)","(-0.081, 0.027, ***, -18.908)","(-0.083, 0.027, ***, -19.388)"
S_mccdts,13.354,18.952,"(4.031, 1.65, ** , 30.182)","(5.652, 1.709, ***, 42.325)","(5.557, 1.71, ***, 41.609)"
S_pbdypct,30.656,24.864,"(2.668, 1.689, , 8.702)","(3.206, 1.443, ** , 10.46)","(3.083, 1.423, ** , 10.058)"
S_haz,-0.605,1.104,"(0.007, 0.043, , -1.161)","(-0.002, 0.038, , 0.279)","(0.002, 0.039, , -0.323)"
S_whz,0.125,1.133,"(0.002, 0.034, , 1.79)","(-0.005, 0.036, , -4.119)","(-0.011, 0.037, , -8.727)"



Paper's results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_parcount,0.333,0.673,"(-0.065, 0.032, ** , -19.545)","(-0.064, 0.031, ** , -19.345)","(-0.064, 0.032, ** , -19.198)"
S_diarrhea,0.142,0.349,"(-0.018, 0.009, * , -12.819)","(-0.02, 0.009, ** , -13.834)","(-0.018, 0.009, * , -12.803)"
S_anemia,0.426,0.495,"(-0.085, 0.028, ***, -20.059)","(-0.081, 0.027, ***, -18.908)","(-0.083, 0.027, ***, -19.388)"
S_mccdts,13.354,18.952,"(4.031, 1.65, ** , 30.182)","(5.652, 1.642, ***, 42.325)","(5.557, 1.641, ***, 41.609)"
S_pbdypct,30.656,24.864,"(2.668, 1.689, * , 8.702)","(3.206, 1.43, ** , 10.46)","(3.083, 1.41, ** , 10.058)"
S_haz,-0.605,1.104,"(0.007, 0.043, , -1.161)","(0.002, 0.038, , 0.279)","(-0.002, 0.039, , -0.323)"
S_whz,0.125,1.133,"(0.002, 0.034, , 1.79)","(-0.005, 0.036, , -4.119)","(-0.011, 0.037, , -8.727)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_parcount,0.0,0.0,"(0.00e+00, 0.00e+00, ** , 0.00e+00)","(0.00e+00, 0.00e+00, ** , 0.00e+00)","(0.00e+00, 0.00e+00, ** , 0.00e+00)"
S_diarrhea,0.0,0.0,"(0.00e+00, 0.00e+00, * , 0.00e+00)","(0.00e+00, 0.00e+00, ** , 0.00e+00)","(0.00e+00, 0.00e+00, * , 0.00e+00)"
S_anemia,0.0,0.0,"(0.00e+00, 0.00e+00, ***, 0.00e+00)","(0.00e+00, 0.00e+00, ***, 0.00e+00)","(0.00e+00, 0.00e+00, ***, 0.00e+00)"
S_mccdts,0.0,0.0,"(0.00e+00, 0.00e+00, ** , 0.00e+00)","(0.00e+00, 6.70e-02, ***, 0.00e+00)","(0.00e+00, 6.90e-02, ***, 0.00e+00)"
S_pbdypct,0.0,0.0,"(0.00e+00, 0.00e+00, - , 0.00e+00)","(0.00e+00, 1.30e-02, ** , 0.00e+00)","(0.00e+00, 1.30e-02, ** , 0.00e+00)"
S_haz,0.0,0.0,"(0.00e+00, 0.00e+00, , 0.00e+00)","(-4.00e-03, 0.00e+00, , 0.00e+00)","(4.00e-03, 0.00e+00, , 0.00e+00)"
S_whz,0.0,0.0,"(0.00e+00, 0.00e+00, , 0.00e+00)","(0.00e+00, 0.00e+00, , 0.00e+00)","(0.00e+00, 0.00e+00, , 0.00e+00)"


<span id="annexA"></span>
### Annex A

In [19]:
list_vars = []
olderthan6_df = original_df_t5.drop(children_df_t5.index)[vars_t5["dependant_vars"]]
print('Checking if all dependant variables are `NaN` for individuals older than 6...')
if (olderthan6_df.apply(pd.isna).values.all()):
    s = ""
else:
    s = "not"
print(f'We can{s} drop all aforementioned rows')

Checking if all dependant variables are `NaN` for individuals older than 6...
We cannot drop all aforementioned rows


<span id="annexB1"></span>
### Annex B1

In [20]:
nans = df_t4_full.loc[pd.isna(df_t4_full['coord_x'])].index
ranges = []
low, up = nans[0], nans[1]
for i in range(len(nans) - 1):
    up = nans[i]
    if (up + 1 != nans[i+1] or i+1 == len(nans) - 1):
        if (i+1 == len(nans) - 1):
            up = nans[i+1]
        ranges.append((low, up))
        low = nans[i+1]
ranges_str = ""
for (l, r) in ranges:
    ranges_str += '[' + str(l) + ', ' +  str(r) + ']\n'
print(f'Intervals of rows with missing geographical data:\n{ranges_str}')

Intervals of rows with missing geographical data:
[1788, 1916]
[2505, 2527]
[2576, 2592]
[2656, 2661]
[2755, 2782]



<span id="annexB2"></span>
### Annex B2

In [21]:
nans = children_df_t5.loc[pd.isna(children_df_t5['coord_x'])].index
ranges = []
low, up = nans[0], nans[1]
for i in range(len(nans) - 1):
    up = nans[i]
    if (up + 1 != nans[i+1] or i+1 == len(nans) - 1):
        if (i+1 == len(nans) - 1):
            up = nans[i+1]
        ranges.append((low, up))
        low = nans[i+1]
ranges_str = ""
for (l, r) in ranges:
    ranges_str += '[' + str(l) + ', ' +  str(r) + ']\n'
print(f'Intervals of rows with missing geographical data:\n{ranges_str}')

Intervals of rows with missing geographical data:
[2670, 2856]
[3707, 3734]
[3795, 3816]
[3914, 3921]
[4052, 4091]



<span id="annexC"></span>
### Annex C

In [22]:
expected_amount = pd.DataFrame({'treat_expected': [1528, 1930, 1768, 291, 757, 1865, 1881],
                                'cont_expected' : [1566, 2105, 1951, 302, 817, 2053, 2058]},
                              index=vars_t5["dependant_vars"])
treat, cont, drop_t, drop_c = [], [], [], []
treat_df = df_t5[df_t5.dpisofirme == 1]
cont_df = df_t5[df_t5.dpisofirme == 0]

for col in vars_t5["dependant_vars"]:
    dt = len(treat_df.loc[pd.isna(treat_df[col])])
    treat.append(len(treat_df) - dt)
    drop_t.append(dt)
    dc = len(cont_df.loc[pd.isna(cont_df[col])])
    cont.append(len(cont_df) - dc)
    drop_c.append(dc)
expected_amount['treat'] = treat
expected_amount['cont'] = cont
expected_amount['dropped_treat'] = drop_t
expected_amount['dropped_cont'] = drop_c
expected_amount['dropped_tot'] = expected_amount['dropped_treat'] + expected_amount['dropped_cont']
expected_amount['delta_t'] = expected_amount.treat_expected - expected_amount.treat
expected_amount['delta_c'] = expected_amount.cont_expected - expected_amount.cont
expected_amount['tot'] = expected_amount.treat + expected_amount.cont
expected_amount

Unnamed: 0,treat_expected,cont_expected,treat,cont,dropped_treat,dropped_cont,dropped_tot,delta_t,delta_c,tot
S_parcount,1528,1566,1528,1566,412,546,958,0,0,3094
S_diarrhea,1930,2105,1930,2105,10,7,17,0,0,4035
S_anemia,1768,1951,1768,1951,172,161,333,0,0,3719
S_mccdts,291,302,291,302,1649,1810,3459,0,0,593
S_pbdypct,757,817,757,817,1183,1295,2478,0,0,1574
S_haz,1865,2053,1865,2053,75,59,134,0,0,3918
S_whz,1881,2058,1881,2058,59,54,113,0,0,3939


In [23]:
match_features_t4_eco = ['S_incomepc', 'S_assetspc', 'S_shpeoplework', 'S_microenter', 'S_hrsworkedpc',
                   'S_consumptionpc', 'S_logrent', 'S_logsell']
match_and_output_result(df_t4, match_features_t4_eco, new_models_t4, vars_t4, expected_res_t4, epsilon=0.01)

1393it [30:31,  1.31s/it]


Paper's results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.728,0.363,"(0.202, 0.021, ***, 27.746)","(0.208, 0.019, ***, 28.512)","(0.21, 0.019, ***, 28.876)"
S_cementfloorkit,0.671,0.47,"(0.255, 0.025, ***, 37.936)","(0.26, 0.023, ***, 38.708)","(0.265, 0.023, ***, 39.44)"
S_cementfloordin,0.709,0.455,"(0.21, 0.026, ***, 29.633)","(0.217, 0.025, ***, 30.588)","(0.221, 0.025, ***, 31.189)"
S_cementfloorbat,0.803,0.398,"(0.105, 0.022, ***, 13.071)","(0.113, 0.018, ***, 14.043)","(0.117, 0.018, ***, 14.536)"
S_cementfloorbed,0.668,0.471,"(0.238, 0.02, ***, 35.598)","(0.245, 0.021, ***, 36.735)","(0.245, 0.02, ***, 36.695)"



Results for matching on S_incomepc, S_assetspc, S_shpeoplework, S_microenter, S_hrsworkedpc, S_consumptionpc, S_logrent, and S_logsell with L_1 distance < 0.080
Found 654 matches
Sample sizes for S_shcementfloor:
Observations treatment: 654
Observations control:   654

Sample sizes for S_cementfloorkit:
Observations treatment: 654
Observations control:   654

Sample sizes for S_cementfloordin:
Observations treatment: 654
Observations control:   654

Sample sizes for S_cementfloorbat:
Observations treatment: 654
Observations control:   654

Sample sizes for S_cementfloorbed:
Observations treatment: 654
Observations control:   654



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.748,0.345,"(0.173, 0.026, ***, 23.119)","(0.178, 0.024, ***, 23.837)","(0.181, 0.024, ***, 24.221)"
S_cementfloorkit,0.69,0.463,"(0.228, 0.032, ***, 33.038)","(0.234, 0.03, ***, 33.869)","(0.237, 0.029, ***, 34.346)"
S_cementfloordin,0.72,0.449,"(0.191, 0.034, ***, 26.539)","(0.194, 0.034, ***, 26.936)","(0.197, 0.033, ***, 27.367)"
S_cementfloorbat,0.821,0.384,"(0.067, 0.029, ** , 8.194)","(0.075, 0.025, ***, 9.143)","(0.081, 0.024, ***, 9.888)"
S_cementfloorbed,0.717,0.451,"(0.196, 0.025, ***, 27.292)","(0.204, 0.023, ***, 28.4)","(0.204, 0.022, ***, 28.489)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.02,-0.018,"(-2.90e-02, 5.00e-03, ***, -4.63e+00)","(-3.00e-02, 5.00e-03, ***, -4.68e+00)","(-2.90e-02, 5.00e-03, ***, -4.66e+00)"
S_cementfloorkit,0.019,-0.007,"(-2.70e-02, 7.00e-03, ***, -4.90e+00)","(-2.60e-02, 7.00e-03, ***, -4.84e+00)","(-2.80e-02, 6.00e-03, ***, -5.09e+00)"
S_cementfloordin,0.011,-0.006,"(-1.90e-02, 8.00e-03, ***, -3.09e+00)","(-2.30e-02, 9.00e-03, ***, -3.65e+00)","(-2.40e-02, 8.00e-03, ***, -3.82e+00)"
S_cementfloorbat,0.018,-0.014,"(-3.80e-02, 7.00e-03, **-, -4.88e+00)","(-3.80e-02, 7.00e-03, ***, -4.90e+00)","(-3.60e-02, 6.00e-03, ***, -4.65e+00)"
S_cementfloorbed,0.049,-0.02,"(-4.20e-02, 5.00e-03, ***, -8.31e+00)","(-4.10e-02, 2.00e-03, ***, -8.34e+00)","(-4.10e-02, 2.00e-03, ***, -8.21e+00)"





Results for matching on S_incomepc, S_assetspc, S_shpeoplework, S_microenter, S_hrsworkedpc, S_consumptionpc, S_logrent, and S_logsell with L_2 distance < 0.028
Found 354 matches
Sample sizes for S_shcementfloor:
Observations treatment: 354
Observations control:   354

Sample sizes for S_cementfloorkit:
Observations treatment: 354
Observations control:   354

Sample sizes for S_cementfloordin:
Observations treatment: 354
Observations control:   354

Sample sizes for S_cementfloorbat:
Observations treatment: 354
Observations control:   354

Sample sizes for S_cementfloorbed:
Observations treatment: 354
Observations control:   354



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.763,0.333,"(0.16, 0.024, ***, 20.951)","(0.163, 0.023, ***, 21.398)","(0.164, 0.023, ***, 21.547)"
S_cementfloorkit,0.715,0.452,"(0.209, 0.026, ***, 29.249)","(0.211, 0.026, ***, 29.521)","(0.213, 0.026, ***, 29.757)"
S_cementfloordin,0.74,0.439,"(0.169, 0.035, ***, 22.901)","(0.168, 0.037, ***, 22.679)","(0.168, 0.037, ***, 22.636)"
S_cementfloorbat,0.816,0.388,"(0.065, 0.029, ** , 7.958)","(0.072, 0.027, ***, 8.851)","(0.082, 0.027, ***, 10.063)"
S_cementfloorbed,0.732,0.444,"(0.195, 0.03, ***, 26.641)","(0.204, 0.029, ***, 27.899)","(0.201, 0.029, ***, 27.481)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.035,-0.03,"(-4.20e-02, 3.00e-03, ***, -6.79e+00)","(-4.50e-02, 4.00e-03, ***, -7.11e+00)","(-4.60e-02, 4.00e-03, ***, -7.33e+00)"
S_cementfloorkit,0.044,-0.018,"(-4.60e-02, 1.00e-03, ***, -8.69e+00)","(-4.90e-02, 3.00e-03, ***, -9.19e+00)","(-5.20e-02, 3.00e-03, ***, -9.68e+00)"
S_cementfloordin,0.031,-0.016,"(-4.10e-02, 9.00e-03, ***, -6.73e+00)","(-4.90e-02, 1.20e-02, ***, -7.91e+00)","(-5.30e-02, 1.20e-02, ***, -8.55e+00)"
S_cementfloorbat,0.013,-0.01,"(-4.00e-02, 7.00e-03, **-, -5.11e+00)","(-4.10e-02, 9.00e-03, ***, -5.19e+00)","(-3.50e-02, 9.00e-03, ***, -4.47e+00)"
S_cementfloorbed,0.064,-0.027,"(-4.30e-02, 1.00e-02, ***, -8.96e+00)","(-4.10e-02, 8.00e-03, ***, -8.84e+00)","(-4.40e-02, 9.00e-03, ***, -9.21e+00)"





Results for matching on S_incomepc, S_assetspc, S_shpeoplework, S_microenter, S_hrsworkedpc, S_consumptionpc, S_logrent, and S_logsell with L_inf distance < 0.010
Found 263 matches
Sample sizes for S_shcementfloor:
Observations treatment: 263
Observations control:   263

Sample sizes for S_cementfloorkit:
Observations treatment: 263
Observations control:   263

Sample sizes for S_cementfloordin:
Observations treatment: 263
Observations control:   263

Sample sizes for S_cementfloorbat:
Observations treatment: 263
Observations control:   263

Sample sizes for S_cementfloorbed:
Observations treatment: 263
Observations control:   263



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.715,0.383,"(0.207, 0.03, ***, 28.968)","(0.222, 0.032, ***, 31.082)","(0.227, 0.033, ***, 31.753)"
S_cementfloorkit,0.673,0.47,"(0.24, 0.033, ***, 35.593)","(0.249, 0.035, ***, 37.072)","(0.258, 0.037, ***, 38.321)"
S_cementfloordin,0.7,0.459,"(0.224, 0.035, ***, 32.065)","(0.236, 0.038, ***, 33.765)","(0.243, 0.039, ***, 34.763)"
S_cementfloorbat,0.76,0.428,"(0.129, 0.04, ***, 17.0)","(0.156, 0.04, ***, 20.502)","(0.16, 0.041, ***, 21.004)"
S_cementfloorbed,0.692,0.463,"(0.228, 0.034, ***, 32.967)","(0.234, 0.039, ***, 33.87)","(0.233, 0.04, ***, 33.706)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,-0.013,0.02,"(5.00e-03, 9.00e-03, ***, 1.22e+00)","(1.40e-02, 1.30e-02, ***, 2.57e+00)","(1.70e-02, 1.40e-02, ***, 2.88e+00)"
S_cementfloorkit,0.002,0.0,"(-1.50e-02, 8.00e-03, ***, -2.34e+00)","(-1.10e-02, 1.20e-02, ***, -1.64e+00)","(-7.00e-03, 1.40e-02, ***, -1.12e+00)"
S_cementfloordin,-0.009,0.004,"(1.40e-02, 9.00e-03, ***, 2.43e+00)","(1.90e-02, 1.30e-02, ***, 3.18e+00)","(2.20e-02, 1.40e-02, ***, 3.57e+00)"
S_cementfloorbat,-0.043,0.03,"(2.40e-02, 1.80e-02, ***, 3.93e+00)","(4.30e-02, 2.20e-02, ***, 6.46e+00)","(4.30e-02, 2.30e-02, ***, 6.47e+00)"
S_cementfloorbed,0.024,-0.008,"(-1.00e-02, 1.40e-02, ***, -2.63e+00)","(-1.10e-02, 1.80e-02, ***, -2.87e+00)","(-1.20e-02, 2.00e-02, ***, -2.99e+00)"






In [24]:
match_features_t4_houseimprov = ['S_instsanita', 'S_restsanita', 'S_constceili', 'S_restowalls', 'S_improveany']
match_and_output_result(df_t4, match_features_t4_houseimprov, new_models_t4, vars_t4, expected_res_t4, epsilon=0.01)

1393it [31:33,  1.36s/it]


Paper's results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.728,0.363,"(0.202, 0.021, ***, 27.746)","(0.208, 0.019, ***, 28.512)","(0.21, 0.019, ***, 28.876)"
S_cementfloorkit,0.671,0.47,"(0.255, 0.025, ***, 37.936)","(0.26, 0.023, ***, 38.708)","(0.265, 0.023, ***, 39.44)"
S_cementfloordin,0.709,0.455,"(0.21, 0.026, ***, 29.633)","(0.217, 0.025, ***, 30.588)","(0.221, 0.025, ***, 31.189)"
S_cementfloorbat,0.803,0.398,"(0.105, 0.022, ***, 13.071)","(0.113, 0.018, ***, 14.043)","(0.117, 0.018, ***, 14.536)"
S_cementfloorbed,0.668,0.471,"(0.238, 0.02, ***, 35.598)","(0.245, 0.021, ***, 36.735)","(0.245, 0.02, ***, 36.695)"



Results for matching on S_instsanita, S_restsanita, S_constceili, S_restowalls, and S_improveany with L_1 distance < 0.050
Found 1278 matches
Sample sizes for S_shcementfloor:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloorkit:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloordin:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloorbat:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloorbed:
Observations treatment: 1278
Observations control:   1278



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.719,0.368,"(0.209, 0.021, ***, 29.132)","(0.215, 0.02, ***, 29.902)","(0.218, 0.02, ***, 30.302)"
S_cementfloorkit,0.662,0.473,"(0.263, 0.024, ***, 39.716)","(0.268, 0.023, ***, 40.445)","(0.272, 0.023, ***, 41.132)"
S_cementfloordin,0.697,0.46,"(0.218, 0.025, ***, 31.201)","(0.224, 0.025, ***, 32.191)","(0.229, 0.025, ***, 32.84)"
S_cementfloorbat,0.796,0.403,"(0.11, 0.022, ***, 13.864)","(0.118, 0.019, ***, 14.804)","(0.122, 0.019, ***, 15.381)"
S_cementfloorbed,0.656,0.475,"(0.25, 0.02, ***, 38.141)","(0.258, 0.021, ***, 39.303)","(0.258, 0.02, ***, 39.307)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,-0.009,0.005,"(7.00e-03, 0.00e+00, ***, 1.39e+00)","(7.00e-03, 1.00e-03, ***, 1.39e+00)","(8.00e-03, 1.00e-03, ***, 1.43e+00)"
S_cementfloorkit,-0.009,0.003,"(8.00e-03, -1.00e-03, ***, 1.78e+00)","(8.00e-03, 0.00e+00, ***, 1.74e+00)","(7.00e-03, 0.00e+00, ***, 1.69e+00)"
S_cementfloordin,-0.012,0.005,"(8.00e-03, -1.00e-03, ***, 1.57e+00)","(7.00e-03, 0.00e+00, ***, 1.60e+00)","(8.00e-03, 0.00e+00, ***, 1.65e+00)"
S_cementfloorbat,-0.007,0.005,"(5.00e-03, 0.00e+00, ***, 7.93e-01)","(5.00e-03, 1.00e-03, ***, 7.61e-01)","(5.00e-03, 1.00e-03, ***, 8.45e-01)"
S_cementfloorbed,-0.012,0.004,"(1.20e-02, 0.00e+00, ***, 2.54e+00)","(1.30e-02, 0.00e+00, ***, 2.57e+00)","(1.30e-02, 0.00e+00, ***, 2.61e+00)"





Results for matching on S_instsanita, S_restsanita, S_constceili, S_restowalls, and S_improveany with L_2 distance < 0.022
Found 1278 matches
Sample sizes for S_shcementfloor:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloorkit:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloordin:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloorbat:
Observations treatment: 1278
Observations control:   1278

Sample sizes for S_cementfloorbed:
Observations treatment: 1278
Observations control:   1278



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.719,0.368,"(0.209, 0.021, ***, 29.132)","(0.215, 0.02, ***, 29.902)","(0.218, 0.02, ***, 30.302)"
S_cementfloorkit,0.662,0.473,"(0.263, 0.024, ***, 39.716)","(0.268, 0.023, ***, 40.445)","(0.272, 0.023, ***, 41.132)"
S_cementfloordin,0.697,0.46,"(0.218, 0.025, ***, 31.201)","(0.224, 0.025, ***, 32.191)","(0.229, 0.025, ***, 32.84)"
S_cementfloorbat,0.796,0.403,"(0.11, 0.022, ***, 13.864)","(0.118, 0.019, ***, 14.804)","(0.122, 0.019, ***, 15.381)"
S_cementfloorbed,0.656,0.475,"(0.25, 0.02, ***, 38.141)","(0.258, 0.021, ***, 39.303)","(0.258, 0.02, ***, 39.307)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,-0.009,0.005,"(7.00e-03, 0.00e+00, ***, 1.39e+00)","(7.00e-03, 1.00e-03, ***, 1.39e+00)","(8.00e-03, 1.00e-03, ***, 1.43e+00)"
S_cementfloorkit,-0.009,0.003,"(8.00e-03, -1.00e-03, ***, 1.78e+00)","(8.00e-03, 0.00e+00, ***, 1.74e+00)","(7.00e-03, 0.00e+00, ***, 1.69e+00)"
S_cementfloordin,-0.012,0.005,"(8.00e-03, -1.00e-03, ***, 1.57e+00)","(7.00e-03, 0.00e+00, ***, 1.60e+00)","(8.00e-03, 0.00e+00, ***, 1.65e+00)"
S_cementfloorbat,-0.007,0.005,"(5.00e-03, 0.00e+00, ***, 7.93e-01)","(5.00e-03, 1.00e-03, ***, 7.61e-01)","(5.00e-03, 1.00e-03, ***, 8.45e-01)"
S_cementfloorbed,-0.012,0.004,"(1.20e-02, 0.00e+00, ***, 2.54e+00)","(1.30e-02, 0.00e+00, ***, 2.57e+00)","(1.30e-02, 0.00e+00, ***, 2.61e+00)"





Results for matching on S_instsanita, S_restsanita, S_constceili, S_restowalls, and S_improveany with L_inf distance < 0.010
Found 1279 matches
Sample sizes for S_shcementfloor:
Observations treatment: 1279
Observations control:   1279

Sample sizes for S_cementfloorkit:
Observations treatment: 1279
Observations control:   1279

Sample sizes for S_cementfloordin:
Observations treatment: 1279
Observations control:   1279

Sample sizes for S_cementfloorbat:
Observations treatment: 1279
Observations control:   1279

Sample sizes for S_cementfloorbed:
Observations treatment: 1279
Observations control:   1279



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,0.719,0.368,"(0.209, 0.021, ***, 29.1)","(0.215, 0.02, ***, 29.872)","(0.218, 0.02, ***, 30.273)"
S_cementfloorkit,0.662,0.473,"(0.263, 0.024, ***, 39.669)","(0.268, 0.023, ***, 40.402)","(0.272, 0.023, ***, 41.089)"
S_cementfloordin,0.697,0.46,"(0.217, 0.025, ***, 31.166)","(0.224, 0.025, ***, 32.162)","(0.229, 0.025, ***, 32.812)"
S_cementfloorbat,0.796,0.403,"(0.11, 0.022, ***, 13.851)","(0.118, 0.018, ***, 14.788)","(0.122, 0.018, ***, 15.366)"
S_cementfloorbed,0.657,0.475,"(0.25, 0.02, ***, 38.095)","(0.258, 0.021, ***, 39.256)","(0.258, 0.021, ***, 39.261)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_shcementfloor,-0.009,0.005,"(7.00e-03, 0.00e+00, ***, 1.35e+00)","(7.00e-03, 1.00e-03, ***, 1.36e+00)","(8.00e-03, 1.00e-03, ***, 1.40e+00)"
S_cementfloorkit,-0.009,0.003,"(8.00e-03, -1.00e-03, ***, 1.73e+00)","(8.00e-03, 0.00e+00, ***, 1.69e+00)","(7.00e-03, 0.00e+00, ***, 1.65e+00)"
S_cementfloordin,-0.012,0.005,"(7.00e-03, -1.00e-03, ***, 1.53e+00)","(7.00e-03, 0.00e+00, ***, 1.57e+00)","(8.00e-03, 0.00e+00, ***, 1.62e+00)"
S_cementfloorbat,-0.007,0.005,"(5.00e-03, 0.00e+00, ***, 7.80e-01)","(5.00e-03, 0.00e+00, ***, 7.45e-01)","(5.00e-03, 0.00e+00, ***, 8.30e-01)"
S_cementfloorbed,-0.011,0.004,"(1.20e-02, 0.00e+00, ***, 2.50e+00)","(1.30e-02, 0.00e+00, ***, 2.52e+00)","(1.30e-02, 1.00e-03, ***, 2.57e+00)"






In [25]:
match_features_t5 = ['S_assetspc']
match_and_output_result(df_t5, match_features_t5, new_models_t5, vars_t5, expected_res_t5,
                        p_distances=[inf], epsilon=0.01)

2112it [18:11,  1.93it/s]


Paper's results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_parcount,0.333,0.673,"(-0.065, 0.032, ** , -19.545)","(-0.064, 0.031, ** , -19.345)","(-0.064, 0.032, ** , -19.198)"
S_diarrhea,0.142,0.349,"(-0.018, 0.009, * , -12.819)","(-0.02, 0.009, ** , -13.834)","(-0.018, 0.009, * , -12.803)"
S_anemia,0.426,0.495,"(-0.085, 0.028, ***, -20.059)","(-0.081, 0.027, ***, -18.908)","(-0.083, 0.027, ***, -19.388)"
S_mccdts,13.354,18.952,"(4.031, 1.65, ** , 30.182)","(5.652, 1.642, ***, 42.325)","(5.557, 1.641, ***, 41.609)"
S_pbdypct,30.656,24.864,"(2.668, 1.689, * , 8.702)","(3.206, 1.43, ** , 10.46)","(3.083, 1.41, ** , 10.058)"
S_haz,-0.605,1.104,"(0.007, 0.043, , -1.161)","(0.002, 0.038, , 0.279)","(-0.002, 0.039, , -0.323)"
S_whz,0.125,1.133,"(0.002, 0.034, , 1.79)","(-0.005, 0.036, , -4.119)","(-0.011, 0.037, , -8.727)"



Results for matching on S_assetspc with L_inf distance < 0.010
Found 1894 matches
Sample sizes for S_parcount:
Observations treatment: 1489
Observations control:   1419

Sample sizes for S_diarrhea:
Observations treatment: 1884
Observations control:   1887

Sample sizes for S_anemia:
Observations treatment: 1724
Observations control:   1750

Sample sizes for S_mccdts:
Observations treatment: 283
Observations control:   267

Sample sizes for S_pbdypct:
Observations treatment: 742
Observations control:   739

Sample sizes for S_haz:
Observations treatment: 1825
Observations control:   1839

Sample sizes for S_whz:
Observations treatment: 1837
Observations control:   1843



Unnamed: 0,control means,control std,model_1,model_2,model_3
S_parcount,0.328,0.661,"(-0.061, 0.029, ** , -18.637)","(-0.065, 0.029, ** , -19.983)","(-0.064, 0.03, ** , -19.578)"
S_diarrhea,0.15,0.357,"(-0.025, 0.01, ** , -16.829)","(-0.026, 0.01, ***, -17.572)","(-0.024, 0.01, ** , -16.073)"
S_anemia,0.431,0.495,"(-0.088, 0.03, ***, -20.436)","(-0.085, 0.029, ***, -19.625)","(-0.086, 0.028, ***, -19.962)"
S_mccdts,13.352,18.956,"(4.153, 1.643, ** , 31.106)","(5.454, 1.645, ***, 40.851)","(5.279, 1.646, ***, 39.534)"
S_pbdypct,30.729,24.729,"(2.525, 1.65, , 8.218)","(3.146, 1.472, ** , 10.238)","(3.076, 1.467, ** , 10.009)"
S_haz,-0.595,1.11,"(-0.008, 0.045, , 1.312)","(-0.011, 0.041, , 1.773)","(-0.007, 0.041, , 1.228)"
S_whz,0.138,1.134,"(-0.004, 0.036, , -2.64)","(-0.009, 0.039, , -6.774)","(-0.015, 0.039, , -10.892)"



Difference with expected results


Unnamed: 0,control means,control std,model_1,model_2,model_3
S_parcount,-0.005,-0.012,"(4.00e-03, -3.00e-03, ** , 9.08e-01)","(-1.00e-03, -2.00e-03, ** , -6.38e-01)","(0.00e+00, -2.00e-03, ** , -3.80e-01)"
S_diarrhea,0.008,0.008,"(-7.00e-03, 1.00e-03, *+ , -4.01e+00)","(-6.00e-03, 1.00e-03, **+, -3.74e+00)","(-6.00e-03, 1.00e-03, *+ , -3.27e+00)"
S_anemia,0.005,0.0,"(-3.00e-03, 2.00e-03, ***, -3.77e-01)","(-4.00e-03, 2.00e-03, ***, -7.17e-01)","(-3.00e-03, 1.00e-03, ***, -5.74e-01)"
S_mccdts,-0.002,0.004,"(1.22e-01, -7.00e-03, ** , 9.24e-01)","(-1.98e-01, 3.00e-03, ***, -1.47e+00)","(-2.78e-01, 5.00e-03, ***, -2.08e+00)"
S_pbdypct,0.073,-0.135,"(-1.43e-01, -3.90e-02, - , -4.84e-01)","(-6.00e-02, 4.20e-02, ** , -2.22e-01)","(-7.00e-03, 5.70e-02, ** , -4.90e-02)"
S_haz,0.01,0.006,"(-1.50e-02, 2.00e-03, , 2.47e+00)","(-1.30e-02, 3.00e-03, , 1.49e+00)","(-5.00e-03, 2.00e-03, , 1.55e+00)"
S_whz,0.013,0.001,"(-6.00e-03, 2.00e-03, , -4.43e+00)","(-4.00e-03, 3.00e-03, , -2.66e+00)","(-4.00e-03, 2.00e-03, , -2.16e+00)"




