# Housing, Health, and Happiness – Milestone P4

This milestone aims to provide an extension to the paper _Housing, Health, and Happiness_. Specifically, we would like to make an additional matching on the household level in order to further confirm the conclusions of the authors, or to nuance them.

## Imports

In [None]:
import networkx as nx
import numbers as nb
import numpy as np
import operator
import pandas as pd
import statsmodels.api as sm

from copy import deepcopy
from math import inf

DATA_PATH = "data/"

## Modularised figure reproduction

Here, we define modular functions for replicating the original tables from the paper, and to compare results to the expected values.
These functions will be reused for generating new tables from the matched datasets.

### 0. Defining common values

Since the treatment variable and the clustering variable are the same across all datasets, we define them here.

In [None]:
# Variable groups shared by every table. The two empty lists
# ("demographic_control_vars" and "dependant_vars") are table-specific:
# each table works on a deepcopy of this dict and fills them in.
vars_ = {
    "treatment_var": ['dpisofirme'],
    "clustering_var": ['idcluster'],
    "demographic_control_vars": [],
    "health_control_vars": ['S_hasanimals', 'S_animalsinside', 'S_waterland', 'S_waterhouse',
                            'S_electricity', 'S_washhands', 'S_garbage'],
    "model3_control_vars": ['S_cashtransfers', 'S_milkprogram', 'S_foodprogram', 'S_seguropopular'],
    "dependant_vars": []
}

In [None]:
def generate_models(vars_):
    """Return the control-variable list of each of the three regression models.

    vars_: dict with the keys "demographic_control_vars", "health_control_vars"
           and "model3_control_vars", each mapping to a list of column names.
    return: dict with keys 'model_1' (no controls), 'model_2' (demographic +
            health controls) and 'model_3' (model_2 + model-3 controls).
            The returned lists are new objects, independent from vars_.
    """
    base_controls = vars_["demographic_control_vars"] + vars_["health_control_vars"]
    extended_controls = base_controls + vars_["model3_control_vars"]
    return {
        'model_1': [],
        'model_2': base_controls,
        'model_3': extended_controls,
    }

### 2. Generating missing values

Missing values in columns containing the independent variables are replaced by 0 and a dummy variable indicating whether the value was missing is added (missing=1, present=0) for each variable containing missing values (others would only contain 0). An updated model taking the dummy variables into account is returned.

In [None]:
# Parameter inplace is used to show explicitly that this function has side effects on df
def generate_missing_values(df, models_, inplace=True):
    """Zero-fill missing control values and add a missing-value dummy per affected column.

    For every column listed in models_['model_3'] that contains at least one
    missing value, a column 'dmiss_<name>' (1 = value was missing, 0 = present)
    is added to df. Missing values in the model_3 columns are then replaced by 0.

    df: dataframe to update; it is modified in place.
    models_: dict of model name -> list of control columns; it is not modified.
    inplace: must be True; it only exists to make the side effect on df explicit.
    return: a deep copy of models_ in which each new dummy column is appended to
            'model_3', and also to 'model_2' when its source column is in 'model_2'.
    raises: ValueError if inplace is False.
    """
    if not inplace:
        raise ValueError("Parameter inplace has to be true")
    models = deepcopy(models_)
    # Snapshot of the original columns: models['model_3'] is appended to inside
    # the loop, so iterating over it directly would also visit the new dummies.
    columns = list(models["model_3"])
    for col_name in columns:
        missing = df[col_name].isna()
        if missing.any():
            new_col = 'dmiss_' + col_name
            df[new_col] = missing.astype('int64')
            if col_name in models['model_2']:
                models['model_2'].append(new_col)
            models['model_3'].append(new_col)
    df.fillna(dict.fromkeys(columns, 0), inplace=True)
    return models

### 3. Regression

There are 2 steps to this part.

The first is to compute the mean and standard deviation for the control group for each dependant variable. This is done using `mean()` and `std()` on the control DataFrame (i.e. `vars_['treatment_var']` = False).

The second part is to do a linear regression for each dependent variable, once for each model. This is done using 2 nested for loops (over models, then over dependent variables) and using `statsmodels`'s `OLS` with a cluster covariance estimator (`vars_['clustering_var']`).

In [None]:
### Helper function to convert p-value to stars like in the paper
def to_stars(p):
    """Return a 3-character significance marker for the p-value p.

    '***' below 0.01, '** ' below 0.05, '*  ' below 0.1, three spaces otherwise.
    """
    # Thresholds are checked from the strictest to the loosest.
    for threshold, stars in ((0.01, "***"), (0.05, "** "), (0.1, "*  ")):
        if p < threshold:
            return stars
    return "   "

In [None]:
def compute_results(df, models, vars_):
    """Compute control-group statistics and the treatment effect of each model.

    df: dataframe holding the treatment, clustering, control and dependant columns.
    models: dict of model name -> list of control columns added to the regression.
    vars_: dict providing 'treatment_var', 'clustering_var' (one-element lists)
           and 'dependant_vars' (list of column names).
    return: dataframe indexed by the dependant variables with the columns
            'control means', 'control std' and one column per model. Each model
            cell is a tuple (coefficient of the treatment variable, its standard
            error, significance stars, 100 * coefficient / control mean).
    """
    treatment_var = vars_['treatment_var'][0]
    # Part 1: control
    dependant_vars = vars_['dependant_vars']
    # Control rows are those whose treatment value is falsy
    control = df[df[treatment_var].apply(lambda x: not bool(x))][dependant_vars]
    res = pd.DataFrame({
        'control means': control.mean(),
        'control std': control.std()
    }, index=dependant_vars)
    
    # Part 2: linear regression
    Y = df[dependant_vars]
    
    for k, v in models.items():
        # Regressors: constant + treatment variable + the model's controls
        X = df[vars_['treatment_var'] + v]
        X = sm.add_constant(X)
        column = []
        for label, y in Y.items():
            # The cluster groups are taken from the rows where the dependant variable is present.
            # NOTE(review): this only lines up with the rows kept by missing='drop' if X has
            # no missing values for those rows -- confirm for every dataframe passed in.
            regression = sm.OLS(y, X, missing='drop').fit(cov_type='cluster',
                                          cov_kwds={'groups': df.dropna(subset=[label])[vars_['clustering_var']]})
            coeff = regression.params[treatment_var]
            significance = to_stars(regression.pvalues[treatment_var])
            # Last entry: coefficient expressed as a percentage of the control mean
            column.append((coeff, regression.bse[treatment_var], significance, 
                           100 * coeff / res.loc[label]['control means']))
        res[k] = column
    
    return pd.DataFrame(res, index=dependant_vars)

### 4. Showing and discussing the results

We display the DataFrame containing the results rounded to 3 decimals (as in the original paper), then compute the difference (after rounding) with the results from the paper.

In [None]:
def round3(val):
    """Round val to 3 decimals, recursing into containers.

    Strings are returned unchanged, numbers are rounded, and any other value is
    treated as an iterable that is rebuilt with its own type from the rounded items.
    """
    if isinstance(val, str):
        return val
    if isinstance(val, nb.Number):
        return round(val, 3)
    container_type = type(val)
    return container_type(map(round3, val))

def round_res(df, index):
    """Return df with every column passed through round3 and its index set to index."""
    rounded = df.apply(round3)
    # round3 rebuilds each column, so the row labels are assigned again here
    rounded.index = index
    return rounded

In [None]:
# x, y chars
def char_compare(expected_star, real_star):
    """Compare one expected significance character with the obtained one.

    return: '/' if either character is neither ' ' nor '*', otherwise
            ' ' (no star on both sides), '*' (star on both sides),
            '+' (star obtained but not expected) or '-' (star expected but not obtained).
    """
    blank, star = ' ', '*'
    valid = (blank, star)
    if expected_star not in valid or real_star not in valid:
        return '/'
    outcomes = {
        (False, False): ' ',
        (False, True): '+',
        (True, False): '-',
        (True, True): '*',
    }
    return outcomes[(expected_star == star, real_star == star)]

def star_compare(expected_stars, real_stars):
    """Compare two star strings character by character using char_compare.

    The result has one character per compared pair; comparison stops at the
    end of the shorter string.
    """
    pairs = zip(expected_stars, real_stars)
    return "".join(char_compare(expected, real) for expected, real in pairs)

def compare_results(expected_val, real_val):
    """Compare an expected table cell with the obtained one.

    Strings are compared with star_compare, tuples element by element, and any
    other value is treated as a number whose difference (expected - real) is
    returned formatted in scientific notation with 2 decimals.
    """
    if isinstance(expected_val, str):
        return star_compare(expected_val, real_val)
    if isinstance(expected_val, tuple):
        return tuple(compare_results(expected, real)
                     for expected, real in zip(expected_val, real_val))
    return f"{expected_val - real_val:.2e}"
    
def diff(expected_df, results_df):
    """Return a copy of expected_df whose cells hold the comparison with results_df.

    Each column of expected_df is compared cell by cell, through compare_results,
    with the column of the same name in results_df.
    """
    comparison = expected_df.copy()
    for column_name in comparison.columns:
        cell_pairs = zip(expected_df[column_name], results_df[column_name])
        comparison[column_name] = [compare_results(expected, real)
                                   for expected, real in cell_pairs]
    return comparison

In [None]:
def compute_and_output_results(df, models, vars_, expected_res):
    """Compute the regression table for df and display it next to the expected one.

    Three tables are displayed, in this order: the results rounded to 3 decimals,
    the expected results, and their cell-by-cell comparison.
    """
    raw_results = compute_results(df, models, vars_)
    rounded_results = round_res(raw_results, vars_['dependant_vars'])

    display(rounded_results)
    display(expected_res)
    # The comparison is computed only after the first two tables are shown
    display(diff(expected_res, rounded_results))

## Matchings

In [None]:
def distance_Lp(p, control_row, treated_row):
    """Return the Lp distance between two rows of numeric features.

    p: order of the norm; math.inf selects the largest absolute difference.
    control_row, treated_row: element-wise subtractable, iterable rows of equal length.
    """
    gaps = abs(control_row - treated_row)
    if p == inf:
        return max(gaps)
    return sum(gaps ** p) ** (1 / p)

from tqdm import tqdm
# Helper function to compute a 1-to-1 matching between treated and control rows
# base_df: dataframe containing both the treated and the control rows
# control_col: column flagging the treated rows (truthy = treated); a one-element
#              list of column names is accepted as well
# match_features: columns used to compute the distance between two rows
# p_distance: the p used for computing Lp distance (math.inf for the max norm)
# epsilon: if given, only pairs whose distance, normalised by the largest
#          distance, is strictly below epsilon may be matched
# return: the rows of base_df selected by a 1-to-1 matching that maximises the
#         total weight (1 - normalised distance) of the matched pairs
def match(base_df, control_col, match_features, p_distance, epsilon=None):
    G = nx.Graph()
    filter_ = np.asarray(base_df[control_col].astype(bool))
    if filter_.ndim > 1:
        # control_col was given as a list: collapse the (n, k) mask to one flag per row
        filter_ = filter_.any(axis=1)
    treated_set = base_df[filter_][match_features]
    control_set = base_df.drop(treated_set.index)[match_features]

    # Materialise the treated rows once instead of once per control row
    treated_rows = list(treated_set.iterrows())
    pairs = []
    for control_id, control_row in tqdm(control_set.iterrows()):
        for treat_id, treat_row in treated_rows:
            distance = distance_Lp(p_distance, control_row, treat_row)
            # A pair with an undefined distance (missing feature) cannot be matched
            if np.isfinite(distance):
                pairs.append((control_id, treat_id, distance))

    # Normalise by the largest distance; fall back to 1 when there is no pair
    # or when all rows are identical, to avoid dividing by zero
    max_dist = max((distance for _, _, distance in pairs), default=0)
    scale = max_dist if max_dist > 0 else 1

    # Keep each node pair together with its own weight, so that the threshold
    # filters edges without shifting nodes and weights against each other
    weighted_edges = []
    for control_id, treat_id, distance in pairs:
        d = distance / scale
        if epsilon is None or d < epsilon:
            weighted_edges.append((control_id, treat_id, 1 - d))
    G.add_weighted_edges_from(weighted_edges)
    print(len(G.edges))

    # max_weight_matching returns 1-to-1 matching
    matches = nx.max_weight_matching(G)

    # Select all matched rows at once; this keeps the column dtypes of base_df
    matched_ids = [node for pair in matches for node in pair]
    return base_df.loc[matched_ids]

## Figure reproduction: Table 4

The goal of this part is to reproduce Table 4 of the paper _Housing, Health, and Happiness_.

### 1. Loading the data and understanding what we'll need

We start by identifying columns which contain our dependent variables, treatment variable, independent variables for each of the three models and clustering variable.

The data related to the dependent variables can be found in the following columns:
  + Share of rooms with cement floors (`S_shcementfloor`)
  + Cement floor in kitchen (`S_cementfloorkit`)
  + Cement floor in dining room (`S_cementfloordin`)
  + Cement floor in bathroom (`S_cementfloorbat`)
  + Cement floor in bedroom (`S_cementfloorbed`)

Control and treatment groups are identified by `dpisofirme` (control = 0, treatment = 1).

Model 1 has no control variables.

Model 2 has (25 - 1) control variables:
  + demographic:
    + Number of household members (`S_HHpeople`)
    + (Number of rooms (`S_rooms`) -> This one is mentioned in the paper, but after looking at the STATA file, I noticed it was not used for the regression and decided to drop it)
    + Head of household's years of schooling (`S_headeduc`)
    + Spouse's years of schooling (`S_spouseeduc`)
    + Head of household's age (`S_headage`)
    + Spouse's age (`S_spouseage`)
    + Proportion of Males 0-5yrs in household (`S_dem1`)
    + Proportion of Males 6-17yrs in household (`S_dem2`)
    + Proportion of Males 18-49yrs in household (`S_dem3`)
    + Proportion of Males 50+yrs in household (`S_dem4`)
    + Proportion of Females 0-5yrs in household (`S_dem5`)
    + Proportion of Females 6-17yrs in household (`S_dem6`)
    + Proportion of Females 18-49yrs in household (`S_dem7`)
    + Proportion of Females 50+yrs in household (`S_dem8`)
  + health:
    + Household has animals on land (`S_hasanimals`)
    + Animals allowed to enter the house (`S_animalsinside`)
    + Water connection outside (`S_waterland`)
    + Water connection inside the house (`S_waterhouse`)
    + Electricity (`S_electricity`)
    + Number of times respondent washed hands the day before (`S_washhands`)
    + Uses garbage collection service (`S_garbage`)
    
Model 3 adds 4 control variables:
  + Transfers per capita from government programs (`S_cashtransfers`)
  + Household beneficiary of government milk supplement program (`S_milkprogram`)
  + Household beneficiary of government food program (`S_foodprogram`)
  + Household beneficiary of seguro popular (`S_seguropopular`)
  
All models use `idcluster` for clustering.

In [None]:
df_t4_full = pd.read_stata(DATA_PATH + "PisoFirme_AEJPol-20070024_household.dta")

# Table 4 variables: the shared groups from vars_ plus the household-level
# demographic controls and the cement-floor dependant variables.
vars_t4 = deepcopy(vars_)
vars_t4["demographic_control_vars"] = ['S_HHpeople', 'S_headeduc', 'S_spouseeduc', 'S_headage',
                                       'S_spouseage', 'S_dem1', 'S_dem2', 'S_dem3', 'S_dem4',
                                       'S_dem5', 'S_dem6', 'S_dem7', 'S_dem8']
vars_t4["dependant_vars"] = ['S_shcementfloor', 'S_cementfloorkit', 'S_cementfloordin',
                             'S_cementfloorbat', 'S_cementfloorbed']
models_t4 = generate_models(vars_t4)

### 2. Expected output

In [None]:
# Expected values for Table 4, one row per dependant variable.
# Each model cell is a tuple laid out like the output of compute_results:
# (coefficient, standard error, significance stars, 100 * coefficient / control mean).
expected_res_t4 = pd.DataFrame({'control means': [0.728, 0.671, 0.709, 0.803, 0.668],
                             'control std': [0.363, 0.470, 0.455, 0.398, 0.471],
                             'model_1': [(0.202, 0.021, '***', 27.746),
                                         (0.255, 0.025, '***', 37.936),
                                         (0.210, 0.026, '***', 29.633),
                                         (0.105, 0.022, '***', 13.071),
                                         (0.238, 0.020, '***', 35.598)],
                             'model_2': [(0.208, 0.019, '***', 28.512),
                                         (0.260, 0.023, '***', 38.708),
                                         (0.217, 0.025, '***', 30.588),
                                         (0.113, 0.018, '***', 14.043),
                                         (0.245, 0.021, '***', 36.735)],
                             'model_3':[(0.210, 0.019, '***', 28.876),
                                        (0.265, 0.023, '***', 39.440),
                                        (0.221, 0.025, '***', 31.189),
                                        (0.117, 0.018, '***', 14.536),
                                        (0.245, 0.020, '***', 36.695)]},
                            index = vars_t4["dependant_vars"])

### 3. Data cleaning and generating missing values

The next step is to clean the data, only keeping the necessary rows and generating missing values. 

Rows were identified as _unnecessary_ if they were dropped in the original paper, which mentioned dropping samples for which geographical data was unavailable. Initially, we used that as a criterion for filtering the dataset. However, doing this gave worse results than not filtering at all. Thanks to a helpful comment on Zulip, we noticed a discrepancy between the number of samples we were supposed to have according to Table 1 (1362) and the number we had (1187). After looking around, we noticed geographical data was missing for the following 203 lines: (1788:1916), (2505:2527), (2576:2592), (2656:2661), (2755:2782) (see [Annex B1](#annexB1)). The last 28 lines seem to correspond to the 28 that were dropped in the paper, but the others seem to be due to some corruption of the dataset for some reason. Therefore, we decided to only drop the final 28 lines in order to use the same dataset as the paper, and we get much better results that way.

In [None]:
# Keep the first 2755 rows only, i.e. drop the final block of rows (from row 2755 on)
# for which geographical data is missing (see Annex B1).
df_t4 = df_t4_full[:2755].copy()
#df_t4 = df_t4[vars_t4["treatment_var"] + vars_t4["clustering_var"] + models_t4['model_3'] + vars_t4["dependant_vars"]]
# df_t4 is modified in place; new_models_t4 additionally lists the generated dmiss_* dummies
new_models_t4 = generate_missing_values(df_t4, models_t4)

### 4. Computing and displaying results

In [None]:
"""res_t4 = compute_results(df_t4, new_models_t4, vars_t4)

rounded_res_t4 = round_res(res_t4, vars_t4["dependant_vars"])
display(rounded_res_t4)

display(expected_res_t4)
comp_t4 = diff(expected_res_t4, rounded_res_t4)
display(comp_t4)"""
compute_and_output_results(df_t4, new_models_t4, vars_t4, expected_res_t4)

## Figure reproduction: Table 5

The goal of this milestone is to reproduce Table 5 of the paper _Housing, Health, and Happiness_.

### 1. Loading the data and understanding what we'll need

We started this replication exercise by adapting our replication of Table 4. The main differences are:
1. we use the `individual.dta` rather than the `household.dta` file;
1. we use different dependant variables.

Note that Table 5 only focuses on children under the age of 6, so we drop every row pertaining to a person older than 6 years old (see [Annex A](#annexA) for further justification).

Using the explanation in section V of the paper as well as the STATA code, we identified the columns which contain our dependent variables, treatment variable, independent variables for each of the three models and clustering variable.

The data related to the dependant variables can be found in the following columns:
  + Parasite count (`S_parcount`)
  + Diarrhea (`S_diarrhea`)
  + Anemia (`S_anemia`)
  + McArthur Communication Development Test score (`S_mccdts`)
  + Picture Peabody Vocabulary Test percentile score (`S_pbdypct`)
  + Height-for-age z-score (`S_haz`)
  + Weight-for-height z-score (`S_whz`)

Control and treatment groups are identified by `dpisofirme` (control = 0, treatment = 1).

Model 1 has no control variables.

Model 2 has 58 control variables:
  + demographic:
    + Number of household members (`S_HHpeople`)
    + Number of rooms (`S_rooms`)
    + Age (`S_age`)
    + Male (`S_gender`) -> the README specifies that Male = 1, but the loaded dataframe contained the values `0.0` and `hombre`, so this was corrected to be `0` and `1`
    + Mother of at least one child in household present (`S_childma`)
    + Mother's age (if present) (`S_childmaage`)
    + Mother's years of schooling (if present) (`S_childmaeduc`)
    + Father of at least one child in household present (`S_childpa`)
    + Father's age (if present) (`S_childpaage`)
    + Father's years of schooling (if present) (`S_childpaeduc`)
    + (Trimester * Gender) Dummy for children 0-5yrs (`dtriage*`) [48]
  + health:
    + Household has animals on land (`S_hasanimals`)
    + Animals allowed to enter the house (`S_animalsinside`)
    + Water connection outside (`S_waterland`)
    + Water connection inside the house (`S_waterhouse`)
    + Electricity (`S_electricity`)
    + Number of times respondent washed hands the day before (`S_washhands`)
    + Uses garbage collection service (`S_garbage`)
    
Model 3 adds 4 control variables:
  + Transfers per capita from government programs (`S_cashtransfers`)
  + Household beneficiary of government milk supplement program (`S_milkprogram`)
  + Household beneficiary of government food program (`S_foodprogram`)
  + Household beneficiary of seguro popular (`S_seguropopular`)
  
All models use `idcluster` for clustering.

In [None]:
original_df_t5 = pd.read_stata(DATA_PATH + "PisoFirme_AEJPol-20070024_individual.dta")

# Table 5 variables: the shared groups from vars_ plus the individual-level
# demographic controls and the child health / development dependant variables.
vars_t5 = deepcopy(vars_)
demographic_control_vars_1 = ['S_HHpeople', 'S_rooms', 'S_age', 'S_gender', 'S_childma', 'S_childmaage',
                              'S_childmaeduc', 'S_childpa', 'S_childpaage', 'S_childpaeduc']
# Every column whose name contains 'dtriage' is used as a control
demographic_control_vars_2 = [x for x in original_df_t5.columns if 'dtriage' in x]
vars_t5["demographic_control_vars"] = demographic_control_vars_1 + demographic_control_vars_2
vars_t5["dependant_vars"] = ['S_parcount', 'S_diarrhea', 'S_anemia', 'S_mccdts', 'S_pbdypct', 'S_haz', 'S_whz']

models_t5 = generate_models(vars_t5)

### 2. Expected output

In [None]:
# Expected values for Table 5, one row per dependant variable.
# Each model cell is a tuple laid out like the output of compute_results:
# (coefficient, standard error, significance stars, 100 * coefficient / control mean).
expected_res_t5 = pd.DataFrame({'control means': [0.333, 0.142, 0.426, 13.354, 30.656, -0.605, 0.125],
                             'control std': [0.673, 0.349, 0.495, 18.952, 24.864, 1.104, 1.133],
                             'model_1': [(-0.065, 0.032, '** ', -19.545),
                                         (-0.018, 0.009, '*  ', -12.819),
                                         (-0.085, 0.028, '***', -20.059), 
                                         (4.031, 1.650, '** ', 30.182), 
                                         (2.668, 1.689,'*  ' , 8.702), 
                                         (0.007, 0.043, '   ', -1.161), 
                                         (0.002, 0.034, '   ', 1.790)],
                             'model_2': [(-0.064, 0.031, '** ', -19.345),
                                         (-0.020, 0.009, '** ', -13.834),
                                         (-0.081, 0.027, '***', -18.908),
                                         (5.652, 1.642, '***', 42.325),
                                         (3.206, 1.430, '** ', 10.460),
                                         (0.002, 0.038, '   ',0.279),
                                         (-0.005, 0.036, '   ', -4.119)],
                             'model_3':[(-0.064, 0.032, '** ', -19.198),
                                         (-0.018, 0.009, '*  ', -12.803),
                                         (-0.083, 0.027, '***', -19.388),
                                         (5.557, 1.641, '***', 41.609),
                                         (3.083, 1.410, '** ', 10.058),
                                         (-0.002, 0.039, '   ', -0.323),
                                         (-0.011, 0.037, '   ', -8.727)]},
                            index = vars_t5['dependant_vars'])

### 3. Data cleaning and generating missing values

The next step is to clean the data, only keeping the necessary rows and generating missing values.

Once again, we identify clusters of missing geographical data in [Annex B2](#annexB2). However, the paper never mentions how many individuals' data they used, so we used the data provided in Table 1 for each dependant variable as a point of comparison. The check of the number of datapoints after dropping the final cluster is done in [Annex C](#annexC).

Note: in the STATA file the paper's authors do not check for missing values in `dtriage` columns. Here, we confirm that there are no missing values in these columns, so we do not need to manually exclude them when generating missing values.

In [None]:
# Table 5 only looks at children: keep the rows with S_age < 6 and renumber them from 0
children_df_t5 = original_df_t5[original_df_t5.S_age < 6].reset_index(drop=True)
# Recode S_gender to 0/1: 1 where the loaded value is 'hombre', 0 otherwise
children_df_t5['S_gender'] = children_df_t5['S_gender'].apply(lambda x: x == 'hombre').astype(int)
# Magic number! Keeps the first 4052 children rows; see Annex B2 for the clusters of rows
# with missing geographical data and Annex C for the resulting number of datapoints.
df_t5 = children_df_t5[:4052].copy()

#df_t5 = df_t5[vars_t5["treatment_var"] + vars_t5["clustering_var"] + vars_t5["dependant_vars"] + models_t5['model_3'] + ['coord_x']]
# df_t5 is modified in place; new_models_t5 additionally lists the generated dmiss_* dummies
new_models_t5 = generate_missing_values(df_t5, models_t5)

### 4. Computing and displaying results

In [None]:
"""res_t5 = compute_results(df_t5, new_models_t5, vars_t5)

rounded_res_t5 = round_res(res_t5, vars_t5['dependant_vars'])
display(rounded_res_t5)

display(expected_res_t5)
comp_t5 = diff(expected_res_t5, rounded_res_t5)
display(comp_t5)"""
compute_and_output_results(df_t5, new_models_t5, vars_t5, expected_res_t5)

<span id="annexA"></span>
### Annex A

In [None]:
list_vars = []  # not used in this cell; kept in case another cell relies on it
# Rows that were excluded when building children_df_t5, i.e. those failing `S_age < 6`.
# children_df_t5 was renumbered from 0 (reset_index), so its index no longer refers to
# rows of original_df_t5 and cannot be used to drop the children rows from it.
is_child = original_df_t5.S_age < 6
olderthan6_df = original_df_t5[~is_child][vars_t5["dependant_vars"]]
print('Checking if all dependant variables are `NaN` for individuals older than 6...')
# The rows can be dropped only if none of them carries any dependant-variable value
s = "" if olderthan6_df.isna().values.all() else "not"
print(f'We can{s} drop all aforementioned rows')

<span id="annexB1"></span>
### Annex B1

In [None]:
# Row labels of the households without geographical data
nans = df_t4_full.loc[pd.isna(df_t4_full['coord_x'])].index
# Group consecutive row labels into inclusive (first, last) intervals.
# An isolated row gives an interval of length one, and no missing row gives no interval.
ranges = []
for row in nans:
    if ranges and row == ranges[-1][1] + 1:
        # The row directly follows the current interval: extend it
        ranges[-1] = (ranges[-1][0], row)
    else:
        ranges.append((row, row))
ranges_str = "".join(f'[{l}, {r}]\n' for (l, r) in ranges)
print(f'Intervals of rows with missing geographical data:\n{ranges_str}')

<span id="annexB2"></span>
### Annex B2

In [None]:
# Row labels of the children without geographical data
nans = children_df_t5.loc[pd.isna(children_df_t5['coord_x'])].index
# Group consecutive row labels into inclusive (first, last) intervals.
# An isolated row gives an interval of length one, and no missing row gives no interval.
ranges = []
for row in nans:
    if ranges and row == ranges[-1][1] + 1:
        # The row directly follows the current interval: extend it
        ranges[-1] = (ranges[-1][0], row)
    else:
        ranges.append((row, row))
ranges_str = "".join(f'[{l}, {r}]\n' for (l, r) in ranges)
print(f'Intervals of rows with missing geographical data:\n{ranges_str}')

<span id="annexC"></span>
### Annex C

In [None]:
# Number of datapoints expected per dependant variable, for each group
expected_amount = pd.DataFrame(
    {
        'treat_expected': [1528, 1930, 1768, 291, 757, 1865, 1881],
        'cont_expected': [1566, 2105, 1951, 302, 817, 2053, 2058],
    },
    index=vars_t5["dependant_vars"],
)
treat_df = df_t5[df_t5.dpisofirme == 1]
cont_df = df_t5[df_t5.dpisofirme == 0]

# Per dependant variable: rows with a missing value (dropped) and rows with a value (kept)
drop_t = [int(treat_df[name].isna().sum()) for name in vars_t5["dependant_vars"]]
drop_c = [int(cont_df[name].isna().sum()) for name in vars_t5["dependant_vars"]]
treat = [len(treat_df) - missing for missing in drop_t]
cont = [len(cont_df) - missing for missing in drop_c]

expected_amount['treat'] = treat
expected_amount['cont'] = cont
expected_amount['dropped_treat'] = drop_t
expected_amount['dropped_cont'] = drop_c
expected_amount['dropped_tot'] = expected_amount['dropped_treat'] + expected_amount['dropped_cont']
# Difference between the expected number of datapoints and the number we have
expected_amount['delta_t'] = expected_amount['treat_expected'] - expected_amount['treat']
expected_amount['delta_c'] = expected_amount['cont_expected'] - expected_amount['cont']
expected_amount['tot'] = expected_amount['treat'] + expected_amount['cont']
expected_amount

In [None]:
# Match treated and control households of df_t4 on the three listed features, using the
# max-norm distance (p = inf) and epsilon = 0.9. The treatment column is passed as the
# one-element list stored in vars_t4['treatment_var'].
df_t4_matched = match(df_t4, vars_t4['treatment_var'], ["S_incomepc", "S_assetspc", "S_shpeoplework"], inf, epsilon=0.9)

In [None]:
# Same steps as for Table 4, run on the matched households (rows renumbered from 0)
res_t4_matched = compute_results(df_t4_matched.reset_index(drop=True), new_models_t4, vars_t4)

rounded_res_t4_matched = round_res(res_t4_matched, vars_t4["dependant_vars"])
display(rounded_res_t4_matched)

# The matched results are compared against the expected values of the original Table 4
display(expected_res_t4)
comp_t4 = diff(expected_res_t4, rounded_res_t4_matched)
display(comp_t4)