In [5]:
import pandas as pd
import seaborn as sns
from scipy import stats
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from matplotlib.pyplot import subplots
import matplotlib.pyplot as plt  
import sklearn.model_selection as skm
from sklearn.metrics import (accuracy_score,
                             log_loss)

from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR)

In [6]:
#load the datasets into a pandas DataFrame
df = pd.read_stata('school_intervention_panel_final.dta', convert_categoricals=False)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  df = pd.read_stata('school_intervention_panel_final.dta', convert_categoricals=False)


In [7]:
#Rename Baseline column names

df = df.rename(columns={
    'female' : 'Student is female (Baseline)',
    'miss_f_coded' : 'Student gender could not be coded based on name (Baseline)',
    'dumm_rp_08_bl' : 'Education of mother: At least some secondary (Baseline)',
    'dumm_rp_09_bl' : 'Education of father: At least some secondary (Baseline)',
    'dumm_rp_14_bl' : 'Student\'s Family receives Bolsa Familia cash transfer (Baseline)',
    'dumm_rp_23_bl' : 'Student has computer with internet at home (Baseline)',
    'dumm_rp_24_bl' : 'Student has failed at least one school year (Baseline)',
    'dumm_rp_49_bl' : 'Student is not working at the moment (Baseline)',
    'business_bl' : 'Student works in own or family business (Baseline)',
    'employee_bl' : 'Student works as employee or other (Baseline)',
    'dumm_rp_50_bl' : 'Student receives income (Baseline)',
    'dumm_rp_53B_bl' : 'Student part of income saved is non-zero (Baseline)',
    'dumm_rp_55_bl' : 'Student has borrowed money (any source) (Baseline)',
    'dumm_rp_56_bl' : 'Student is behind on payments (unconditional) (Baseline)',
    'dumm_rp_57s_bl' : 'Student is behind on payments to store (Baseline)',
    'dumm_rp_57i_bl' : 'Student is behind on payments to family friends or other people (Baseline)',
    'dumm_rp_57f_bl' : 'Student is behind on payments to bank or FI (Baseline)',
    'dumm_rp_59_bl' : 'Student says they are a saver (Baseline)',
    'dumm_rp_61_bl' : 'Student has formal savings (Baseline)',
    'dumm_rp_64A_bl' : 'Student makes a list of all monthly expenses (Baseline)',
    'dumm_rp_65A_bl': 'Student saves money for future purchases (Baseline)',
    'dumm_rp88__92C_bl': 'Student has bought electronics shoes or clothing with credit card (Baseline)',
    'dumm_rp88__92D_bl': 'Student has bought electronics shoes or clothing on installments (Baseline)',
    'dumm_rp88__92AB_bl': 'Student has bought electronics shoes or clothing with cash/debit card (Baseline)',
    'dumm_negotiates_bl': 'Student negotiates prices or payment methods (Baseline)',
    'dumm_search_bl': 'Student comparison shops before making purchase (Baseline)',
    'dumm_rp_08p_bl': 'Parent has at least some secondary education (Baseline)',
    'p_employee_bl': 'Parent is an employee (Baseline)',
    'p_selfempl_bl': 'Parent is self-employeed (Baseline)',
    'p_other_bl': 'Parent occupation is other (homemaker, retired, unemployed, other) (Baseline)',
    'dumm_rp_14p_bl': 'Parent: makes a list of all monthly expenses (Baseline)',
    'dumm_rp_18p_bl': 'Parent: has checking account (Baseline)',
    'dumm_rp_18p_bl': 'Parent: has savings account (Baseline)',
    'dumm_rp_19p_bl': 'Parent: has debit card (Baseline)',
    'dumm_rp_23p_bl': 'Parent: has checks (Baseline)',
    'dumm_formal_saving_bl':' Parent: has formal savings (Baseline)',
    'dumm_rp_33p_bl': 'Parent: Student talks to you about finances (Baseline)',
    'dumm_rp_34p_bl': 'Parent: Student helps organize HH budget (Baseline)',
    'dumm_rp_36p_bl': 'Parent: Prefers R50K plus 15 percent interest (Baseline)',
    'dumm_rp_37p_bl': 'Parent: Inflation question dummy for correct (Baseline)',
    'vl_proficiencia_bl': 'Student: Finanical Proficieny Score (Baseline)', 
    'dumm_rp_49_fup' : "Student is not working at the moment" ,  
    'business_fup' : 'Student works in own or family busines' ,
    'employee_fup' : 'Student works as employee or other', 
    'dumm_rp_50_fup': 'Student: Receives income' ,
     'dumm_rp_53B_fup': 'Student: Pct of income saved is non-zero', 
    'dumm_rp_55_fup': 'Student: Has borrowed money (any source)',
    'dumm_rp_56_fup' : 'Student: Is behind on payments (unconditional)',
    'dumm_rp_57f_fup' : 'Student: Is behind on payments to bank or FI' ,
    'dumm_rp_57s_fup' : 'Student: Is behind on payments to store',
    'dumm_rp_57i_fup': 'Student: Is behind on payments to family friends or other people', 
    'dumm_rp_59_fup' : 'Student: Says they are a saver' , 
    'dumm_rp_61_fup' : 'Student: Has formal savings', 
    'dumm_rp_64A_fup' : 'I make a list of all monthly expenses', 
    'dumm_rp_65A_fup' : 'Student: Saves money for future purchases',
    'dumm_rp_88C_fup': 'Student: I have bought cell phone with credit card', 
    'dumm_rp_88D_fup': 'Student: I have bought cell phone on installments',
    'dumm_rp_88AB_fup': 'Student: I have bought a cell phone with cash/debit card',
    'dumm_rp_89C_fup': 'Student: I have bought computer with credit card' ,
    'dumm_rp_89D_fup': 'Student: I have bought computer on installments' ,
    'dumm_rp_89AB_fup': 'Student: I have bought a computer with cash/debit card' ,
    'dumm_rp_90C_fup': 'Student: I have bought an electronic device with credit card' ,
    'dumm_rp_90D_fup': 'Student: I have bought an electronic device on installments' ,
    'dumm_rp_90AB_fup': 'Student: I have bought an electronic device with cash/debit card' ,
    'dumm_rp_91C_fup': 'Student: I have bought shoes with credit card' ,
    'dumm_rp_91D_fup': 'Student: I have bought shoes on installments' ,
    'dumm_rp_91AB_fup': 'Student: I have bought shoes with cash/debit card',
    'dumm_rp_92C_fup': 'Student: I have bought clothing with credit card', 
    'dumm_rp_92D_fup': 'Student: I have bought clothing on installments' ,
    'dumm_rp_92AB_fup' : 'Student: I have bought clothing with cash/debit card',
    'dumm_rp88__92C_fup': 'I have bought electronics shoes or clothing with credit card',
    'dumm_rp88__92D_fup': 'I have bought electronics shoes or clothing on installments' ,
    'dumm_rp88__92AB_fup': 'I have bought electronics shoes or clothing with cash/debit card' ,
    'dumm_rp_93_fup': 'I negotiate the price' ,
    'dumm_rp_94_fup': 'Student I search price in different stores',
    'dumm_rp_95_fup': 'Student I negotiate the payment method' ,
    'dumm_rp_96_fup': 'Student: I search similar models/brands' ,
    'dumm_negotiates_fup': 'Student Negotiates prices or payment methods' ,
    'dumm_search_fup': 'Student Comparison shops before making purchase' ,
    'dumm_rp_08p_fup': 'Parent has at least some secondary education' ,
    'p_employee_fup': 'Parent is an employee' ,
    'p_selfempl_fup': 'Student Parent is self-employeed',
    'p_other_fup': 'Student Parent\'s occupation is other (homemaker, retired, unemployed, other)',
    'dumm_rp_14p_fup': 'Parent: makes a list of all monthly expenses',
    'dumm_rp_18p_fup' : 'Parent: has checking account', 
    'dumm_rp_19p_fup': 'Parent: savings account', 
    'dumm_rp_21p_fup': 'Parent has debit card' ,
    'dumm_rp_23p_fup': 'Parent has checks' ,
    'dumm_formal_saving_fup': 'Parent has formal savings',
    'dumm_rp_33p_fup': 'Parent: Student talks to you about finances' ,
    'dumm_rp_34p_fup': 'Parent: student helps organize HH budget',
    'dumm_rp_36p_fup' : 'Parent: prefers R50K plus 15 percent interest (follow up)',
    'dumm_rp_37p_fup' : 'Parent: Inflation question dummy for correct  (follow up)', 
    'dumm_rp_41p_fup' : 'Parent: budget must have income and expenses (follow up)',

})
column_names = df.columns.tolist()
print(column_names)



['id_geral', 'cd_escola', 'nm_uf_bl', 'matriculas', 'docentes', 'abandonona1sriemdio', 'aprovaona1sriemdio', 'treatment', 'pair_all', 'treatment_workshop', 'strata', 'round', 'female_coded', 'rp_01_bl', 'rp_08_bl', 'rp_09_bl', 'rp_14_bl', 'rp_23_bl', 'rp_24_bl', 'Student: Finanical Proficieny Score (Baseline)', 'bl_test', 'rp_49_bl', 'rp_50_bl', 'rp_53_bl', 'rp_55_bl', 'rp_56_bl', 'rp_57_bl', 'rp_59_bl', 'rp_61_bl', 'rp_64_bl', 'rp_65_bl', 'rp_88_bl', 'rp_89_bl', 'rp_90_bl', 'rp_91_bl', 'rp_92_bl', 'rp_93_bl', 'rp_94_bl', 'rp_95_bl', 'rp_96_bl', 'bl_aluno', 'rp_08p_bl', 'rp_09p_bl', 'rp_14p_bl', 'rp_18p_bl', 'rp_19p_bl', 'rp_21p_bl', 'rp_23p_bl', 'rp_33p_bl', 'rp_34p_bl', 'rp_36p_bl', 'rp_37p_bl', 'vl_proficiencia_fup', 'fu1_test', 'rp_55_fup', 'rp_57_fup', 'rp_49_fup', 'rp_50_fup', 'rp_53_fup', 'rp_56_fup', 'rp_59_fup', 'rp_61_fup', 'rp_64_fup', 'rp_65_fup', 'rp_88_fup', 'rp_89_fup', 'rp_90_fup', 'rp_91_fup', 'rp_92_fup', 'rp_93_fup', 'rp_94_fup', 'rp_95_fup', 'rp_96_fup', 'fu1_aluno'

In [8]:
#useful subdata sets 
df_unique_baseline = df.drop_duplicates(subset='id_geral', keep='first')
follow_up_1_df = df[df['round'] == 0]
follow_up_2_df = df[df['round'] == 1]

In [9]:
#When doing baseline summary statistics the paper 
#looks at treatment round 2 of students 
#and excludes students where treatment is na  
#sanity check to ensure it produces the same value   
print(follow_up_2_df['treatment'].isna().sum())
testdf = follow_up_2_df.dropna(subset=['treatment'])
testdf['Student is female (Baseline)'].value_counts()

28


Student is female (Baseline)
1.0    13056
0.0    10427
Name: count, dtype: int64

In [7]:
#difference in means test for those in the treatment vs the control group at baseline
#without clustering the standard errors ...
bl_filter_df = follow_up_2_df.dropna(subset=['bl_aluno'])
bl_filter_df = follow_up_2_df.dropna(subset=['treatment'])
columns = bl_filter_df.columns
baseline_columns = [col for col in columns if 'Baseline' in col]

results_list = []
significant_vars = []

# Define the significance threshold (e.g., 0.05)
significance_level = 0.05
for var in baseline_columns:
    
    control_group = bl_filter_df[bl_filter_df['treatment'] == 0][var].dropna()
    treatment_group = bl_filter_df[bl_filter_df['treatment'] == 1][var].dropna()

    if control_group.var() == 0 or treatment_group.var() == 0:
        print(f"Skipping {var} due to zero variance")
        continue
        
    # Calculate means
    control_mean = control_group.mean()
    treatment_mean = treatment_group.mean()
    
    # Perform t-test
    t_stat, p_value = stats.ttest_ind(control_group, treatment_group, equal_var=False)
    
  # Append the result to the list
    results_list.append({
        'Variable': var,
        'Control Mean': control_mean,
        'Treatment Mean': treatment_mean,
        'P-value': p_value
    })
    

#print results
results = pd.DataFrame(results_list)
print(results)
#significant difference in means results
significant_var = results[results['P-value'] < 0.05]
print(significant_var['Variable'])

                                             Variable  Control Mean  \
0      Student: Finanical Proficieny Score (Baseline)     49.867783   
1   Student gender could not be coded based on nam...      0.002703   
2                        Student is female (Baseline)      0.549272   
3   Education of mother: At least some secondary (...      0.445643   
4   Education of father: At least some secondary (...      0.414164   
5   Student's Family receives Bolsa Familia cash t...      0.315794   
6   Student has computer with internet at home (Ba...      0.515319   
7   Student has failed at least one school year (B...      0.296738   
8     Student is not working at the moment (Baseline)      0.352024   
9   Student works in own or family business (Basel...      0.116219   
10      Student works as employee or other (Baseline)      0.235805   
11                 Student receives income (Baseline)      0.661449   
12  Student part of income saved is non-zero (Base...      0.445340   
13  St

In [10]:
# Difference in means test using clustered standard errors 
# replicate the way it is done in the paper

bl_filter_df = follow_up_2_df.dropna(subset=['bl_aluno'])
bl_filter_df = follow_up_2_df.dropna(subset=['treatment'])
columns = bl_filter_df.columns
#baseline_columns = [col for col in columns if 'Baseline' in col]

baseline_columns = ['Student is female (Baseline)', 
'Student has failed at least one school year (Baseline)',
'Student has computer with internet at home (Baseline)', 
'Student is not working at the moment (Baseline)', 
'Student works in own or family business (Baseline)', 
'Student works as employee or other (Baseline)', 
'Student receives income (Baseline)' ]

results_list = []
significant_vars = []


bl_filter_df = bl_filter_df.dropna(subset=['cd_escola'])
bl_filter_df['cd_escola'] = bl_filter_df['cd_escola'].astype('category')


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    bl_filter_df_temp = bl_filter_df.dropna(subset=[var])
    formula = f'Q("{var}") ~ treatment'

    model = smf.ols(formula, data=bl_filter_df_temp).fit(cov_type='cluster', cov_kwds={'groups': bl_filter_df_temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['treatment']
    
    # Get the means for the treatment and control groups
    treatment_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].mean()
    control_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].std()
    control_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

#print results
results_df = pd.DataFrame(results)
print(results_df)

                                            Variable  Treatment Mean  \
0                       Student is female (Baseline)        0.562928   
1  Student has failed at least one school year (B...        0.315483   
2  Student has computer with internet at home (Ba...        0.528910   
3    Student is not working at the moment (Baseline)        0.351051   
4  Student works in own or family business (Basel...        0.115054   
5      Student works as employee or other (Baseline)        0.235996   
6                 Student receives income (Baseline)        0.672061   

   Control Mean  Treatment SD  Control SD   P-value      N  
0      0.549272      0.496034    0.497591  0.078332  46966  
1      0.296738      0.464717    0.456849  0.106721  46152  
2      0.515319      0.499195    0.499774  0.414508  46292  
3      0.352024      0.477343    0.477644  0.930927  49996  
4      0.116219      0.319094    0.320509  0.806588  49996  
5      0.235805      0.424621    0.424545  0.985443  4999

In [13]:
# Difference in means test using clustered standard errors 
# replicate the way it is done in the paper

bl_filter_df = follow_up_2_df.dropna(subset=['bl_aluno'])
bl_filter_df = follow_up_2_df.dropna(subset=['treatment'])
columns = bl_filter_df.columns
#baseline_columns = [col for col in columns if 'Baseline' in col]

baseline_columns = ['Student part of income saved is non-zero (Baseline)', 
'Student has borrowed money (any source) (Baseline)', 
'Student is behind on payments (unconditional) (Baseline)', 
'Student says they are a saver (Baseline)', 
'Student has formal savings (Baseline)', 
'Student makes a list of all monthly expenses (Baseline)', 
'Student saves money for future purchases (Baseline)',  ]

results_list = []
significant_vars = []


bl_filter_df = bl_filter_df.dropna(subset=['cd_escola'])
bl_filter_df['cd_escola'] = bl_filter_df['cd_escola'].astype('category')


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    bl_filter_df_temp = bl_filter_df.dropna(subset=[var])
    formula = f'Q("{var}") ~ treatment'

    model = smf.ols(formula, data=bl_filter_df_temp).fit(cov_type='cluster', cov_kwds={'groups': bl_filter_df_temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['treatment']
    
    # Get the means for the treatment and control groups
    treatment_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].mean()
    control_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].std()
    control_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

#print results
results_df = pd.DataFrame(results)
print(results_df)
print(results_df['Variable'])

                                            Variable  Treatment Mean  \
0  Student part of income saved is non-zero (Base...        0.457499   
1  Student has borrowed money (any source) (Basel...        0.323513   
2  Student is behind on payments (unconditional) ...        0.093575   
3           Student says they are a saver (Baseline)        0.312958   
4              Student has formal savings (Baseline)        0.191533   
5  Student makes a list of all monthly expenses (...        0.105459   
6  Student saves money for future purchases (Base...        0.160543   

   Control Mean  Treatment SD  Control SD   P-value      N  
0      0.445340      0.498189    0.497000  0.101249  49560  
1      0.325746      0.467830    0.468676  0.754285  49764  
2      0.079892      0.291227    0.271154  0.005639  49514  
3      0.310318      0.463695    0.462626  0.688415  49684  
4      0.192629      0.393543    0.394385  0.886178  50132  
5      0.103263      0.307143    0.304313  0.664310  4953

In [9]:
# Difference in means test using clustered standard errors 
# replicate the way it is done in the paper

bl_filter_df = follow_up_2_df.dropna(subset=['bl_aluno'])
bl_filter_df = follow_up_2_df.dropna(subset=['treatment'])
columns = bl_filter_df.columns
baseline_columns = [col for col in columns if 'Baseline' in col]

results_list = []
significant_vars = []


bl_filter_df = bl_filter_df.dropna(subset=['cd_escola'])
bl_filter_df['cd_escola'] = bl_filter_df['cd_escola'].astype('category')


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    bl_filter_df_temp = bl_filter_df.dropna(subset=[var])
    formula = f'Q("{var}") ~ treatment'

    model = smf.ols(formula, data=bl_filter_df_temp).fit(cov_type='cluster', cov_kwds={'groups': bl_filter_df_temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['treatment']
    
    # Get the means for the treatment and control groups
    treatment_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].mean()
    control_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].std()
    control_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

#print results
results_df = pd.DataFrame(results)
print(results_df)

                                             Variable  Treatment Mean  \
0      Student: Finanical Proficieny Score (Baseline)       50.153118   
1   Student gender could not be coded based on nam...        0.001139   
2                        Student is female (Baseline)        0.562928   
3   Education of mother: At least some secondary (...        0.435330   
4   Education of father: At least some secondary (...        0.404502   
5   Student's Family receives Bolsa Familia cash t...        0.333362   
6   Student has computer with internet at home (Ba...        0.528910   
7   Student has failed at least one school year (B...        0.315483   
8     Student is not working at the moment (Baseline)        0.351051   
9   Student works in own or family business (Basel...        0.115054   
10      Student works as employee or other (Baseline)        0.235996   
11                 Student receives income (Baseline)        0.672061   
12  Student part of income saved is non-zero (Base.

In [None]:
parent_char = follow_up_1_df[['Student has bought electronics shoes or clothing with credit card (Baseline)', 
'Student has bought electronics shoes or clothing on installments (Baseline)', 
'Student has bought electronics shoes or clothing with cash/debit card (Baseline)',
'Student negotiates prices or payment methods (Baseline)',
'Student comparison shops before making purchase (Baseline)']]

In [12]:
# Difference in means test using clustered standard errors 
# replicate the way it is done in the paper

bl_filter_df = follow_up_2_df.dropna(subset=['bl_aluno'])
bl_filter_df = follow_up_2_df.dropna(subset=['treatment'])
columns = bl_filter_df.columns
#baseline_columns = [col for col in columns if 'Baseline' in col]

baseline_columns = ['Student has bought electronics shoes or clothing with credit card (Baseline)', 
'Student has bought electronics shoes or clothing on installments (Baseline)', 
'Student has bought electronics shoes or clothing with cash/debit card (Baseline)',
'Student negotiates prices or payment methods (Baseline)',
'Student comparison shops before making purchase (Baseline)']

results_list = []
significant_vars = []


bl_filter_df = bl_filter_df.dropna(subset=['cd_escola'])
bl_filter_df['cd_escola'] = bl_filter_df['cd_escola'].astype('category')


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    bl_filter_df_temp = bl_filter_df.dropna(subset=[var])
    formula = f'Q("{var}") ~ treatment'

    model = smf.ols(formula, data=bl_filter_df_temp).fit(cov_type='cluster', cov_kwds={'groups': bl_filter_df_temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['treatment']
    
    # Get the means for the treatment and control groups
    treatment_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].mean()
    control_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].std()
    control_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

#print results
results_df = pd.DataFrame(results)
print(results_df)

                                            Variable  Treatment Mean  \
0  Student has bought electronics shoes or clothi...        0.245896   
1  Student has bought electronics shoes or clothi...        0.276187   
2  Student has bought electronics shoes or clothi...        0.840207   
3  Student negotiates prices or payment methods (...        0.757523   
4  Student comparison shops before making purchas...        0.929081   

   Control Mean  Treatment SD  Control SD   P-value      N  
0      0.259067      0.430634    0.438157  0.093350  49092  
1      0.263999      0.447130    0.440812  0.210152  49000  
2      0.838239      0.366411    0.368227  0.732337  49876  
3      0.753134      0.428616    0.431201  0.584115  49110  
4      0.925970      0.256691    0.261846  0.408141  49816  


In [14]:
# Difference in means test using clustered standard errors 
# replicate the way it is done in the paper

bl_filter_df = follow_up_2_df.dropna(subset=['bl_aluno'])
bl_filter_df = follow_up_2_df.dropna(subset=['treatment'])
columns = bl_filter_df.columns
#baseline_columns = [col for col in columns if 'Baseline' in col]

baseline_columns = ['Education of mother: At least some secondary (Baseline)', 
'Education of father: At least some secondary (Baseline)', 
'Parent is an employee (Baseline)', 
'Parent is self-employeed (Baseline)', 
'Parent occupation is other (homemaker, retired, unemployed, other) (Baseline)', 
"Student's Family receives Bolsa Familia cash transfer (Baseline)", ]

results_list = []
significant_vars = []


bl_filter_df = bl_filter_df.dropna(subset=['cd_escola'])
bl_filter_df['cd_escola'] = bl_filter_df['cd_escola'].astype('category')


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    bl_filter_df_temp = bl_filter_df.dropna(subset=[var])
    formula = f'Q("{var}") ~ treatment'

    model = smf.ols(formula, data=bl_filter_df_temp).fit(cov_type='cluster', cov_kwds={'groups': bl_filter_df_temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['treatment']
    
    # Get the means for the treatment and control groups
    treatment_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].mean()
    control_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].std()
    control_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

#print results
results_df = pd.DataFrame(results)
print(results_df)

                                            Variable  Treatment Mean  \
0  Education of mother: At least some secondary (...        0.435330   
1  Education of father: At least some secondary (...        0.404502   
2                   Parent is an employee (Baseline)        0.461286   
3                Parent is self-employeed (Baseline)        0.167037   
4  Parent occupation is other (homemaker, retired...        0.371677   
5  Student's Family receives Bolsa Familia cash t...        0.333362   

   Control Mean  Treatment SD  Control SD   P-value      N  
0      0.445643      0.495799    0.497042  0.406504  46248  
1      0.414164      0.490814    0.492622  0.434469  46006  
2      0.471532      0.498529    0.499218  0.247303  41872  
3      0.171089      0.373043    0.376623  0.472477  41872  
4      0.357379      0.483284    0.479234  0.118989  41872  
5      0.315794      0.471420    0.464829  0.213013  46626  


In [17]:
# Difference in means test using clustered standard errors 
# replicate the way it is done in the paper

bl_filter_df = follow_up_2_df.dropna(subset=['bl_aluno'])
bl_filter_df = follow_up_2_df.dropna(subset=['treatment'])
columns = bl_filter_df.columns
#baseline_columns = [col for col in columns if 'Baseline' in col]

baseline_columns = ['Parent: makes a list of all monthly expenses (Baseline)', 
'Parent: has savings account (Baseline)',
'Parent: has debit card (Baseline)', 
'Parent: has checks (Baseline)', 
' Parent: has formal savings (Baseline)', 
'Parent: Prefers R50K plus 15 percent interest (Baseline)', 
'Parent: Inflation question dummy for correct (Baseline)',  
    'Parent: Student talks to you about finances (Baseline)',
    'Parent: Student helps organize HH budget (Baseline)']

results_list = []
significant_vars = []


bl_filter_df = bl_filter_df.dropna(subset=['cd_escola'])
bl_filter_df['cd_escola'] = bl_filter_df['cd_escola'].astype('category')


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    bl_filter_df_temp = bl_filter_df.dropna(subset=[var])
    formula = f'Q("{var}") ~ treatment'

    model = smf.ols(formula, data=bl_filter_df_temp).fit(cov_type='cluster', cov_kwds={'groups': bl_filter_df_temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['treatment']
    
    # Get the means for the treatment and control groups
    treatment_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].mean()
    control_mean = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 1, var].std()
    control_sd = bl_filter_df_temp.loc[bl_filter_df_temp['treatment'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

#print results
results_df = pd.DataFrame(results)
print(results_df)

                                            Variable  Treatment Mean  \
0  Parent: makes a list of all monthly expenses (...        0.359547   
1             Parent: has savings account (Baseline)        0.555534   
2                  Parent: has debit card (Baseline)        0.436693   
3                      Parent: has checks (Baseline)        0.211955   
4              Parent: has formal savings (Baseline)        0.748052   
5  Parent: Prefers R50K plus 15 percent interest ...        0.444724   
6  Parent: Inflation question dummy for correct (...        0.364768   
7  Parent: Student talks to you about finances (B...        0.628587   
8  Parent: Student helps organize HH budget (Base...        0.441201   

   Control Mean  Treatment SD  Control SD   P-value      N  
0      0.370038      0.479895    0.482855  0.163837  42446  
1      0.567211      0.496911    0.495482  0.375886  41946  
2      0.439962      0.495986    0.496417  0.748841  41260  
3      0.217146      0.408708    0.

In [26]:
#difference in baseline survey responses for those present vs not at follow-up 1 
#paper checks the parent data that is asked across both the baseline and the follow-up
#these differences are bad
filter_df = follow_up_1_df.copy() 
filter_df['fu1_aluno'] = filter_df['fu1_aluno'].fillna(0)
print(filter_df['fu1_aluno'].value_counts())
filter_df = filter_df.dropna(subset=['bl_aluno'])
print(filter_df['fu1_aluno'].value_counts())


columns = filter_df.columns
baseline_columns = [col for col in columns if 'Baseline' in col]


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    temp = filter_df.dropna(subset=[var])

    formula = f'Q("{var}") ~ fu1_aluno'
    model = smf.ols(formula, data=temp).fit(cov_type='cluster', 
                                            cov_kwds={'groups': temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['fu1_aluno']
    # Get the means for the treatment and control groups
    treatment_mean = temp.loc[temp['fu1_aluno'] == 1, var].mean()
    control_mean = temp.loc[temp['fu1_aluno'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = temp.loc[temp['fu1_aluno'] == 1, var].std()
    control_sd = temp.loc[temp['fu1_aluno'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

results = pd.DataFrame(results)
print(results)

#print variables that are significantly different 
significant_var = results[results['P-value'] < 0.05]
print(significant_var['Variable'])

notsignificant_var = results[results['P-value'] > 0.05]
print(notsignificant_var['Variable'])

fu1_aluno
0.0    18454
1.0    16892
Name: count, dtype: int64
fu1_aluno
1.0    15427
0.0     9744
Name: count, dtype: int64
                                             Variable  Treatment Mean  \
0      Student: Finanical Proficieny Score (Baseline)       50.410545   
1   Student gender could not be coded based on nam...        0.000259   
2                        Student is female (Baseline)        0.561485   
3   Education of mother: At least some secondary (...        0.422612   
4   Education of father: At least some secondary (...        0.387201   
5   Student's Family receives Bolsa Familia cash t...        0.346090   
6   Student has computer with internet at home (Ba...        0.554997   
7   Student has failed at least one school year (B...        0.282008   
8     Student is not working at the moment (Baseline)        0.336462   
9   Student works in own or family business (Basel...        0.118629   
10      Student works as employee or other (Baseline)        0.217833   


In [28]:
#difference in baseline survey responses for those present vs not at follow-up 1 
#paper checks the parent data that is asked across both the baseline and the follow-up
#these differences are bad
filter_df = follow_up_1_df.copy() 
filter_df['fu1_aluno'] = filter_df['fu1_aluno'].fillna(0)
print(filter_df['fu1_aluno'].value_counts())
filter_df = filter_df.dropna(subset=['bl_aluno'])
print(filter_df['fu1_aluno'].value_counts())

columns = filter_df.columns
baseline_columns = significant_var['Variable']

results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    temp = filter_df.dropna(subset=[var])

    formula = f'Q("{var}") ~ fu1_aluno'
    model = smf.ols(formula, data=temp).fit(cov_type='cluster', 
                                            cov_kwds={'groups': temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['fu1_aluno']
    # Get the means for the treatment and control groups
    treatment_mean = temp.loc[temp['fu1_aluno'] == 1, var].mean()
    control_mean = temp.loc[temp['fu1_aluno'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = temp.loc[temp['fu1_aluno'] == 1, var].std()
    control_sd = temp.loc[temp['fu1_aluno'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Follow Up 1 Mean': treatment_mean,
        'Baseline Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

results = pd.DataFrame(results)
print(results)

#print variables that are significantly different 
significant_var = results[results['P-value'] < 0.05]
print(significant_var['Variable'])

fu1_aluno
0.0    18454
1.0    16892
Name: count, dtype: int64
fu1_aluno
1.0    15427
0.0     9744
Name: count, dtype: int64
                                             Variable  Follow Up 1 Mean  \
0   Student gender could not be coded based on nam...          0.000259   
1   Education of mother: At least some secondary (...          0.422612   
2   Education of father: At least some secondary (...          0.387201   
3   Student's Family receives Bolsa Familia cash t...          0.346090   
4   Student has computer with internet at home (Ba...          0.554997   
5   Student has failed at least one school year (B...          0.282008   
6     Student is not working at the moment (Baseline)          0.336462   
7       Student works as employee or other (Baseline)          0.217833   
8                  Student receives income (Baseline)          0.651463   
9   Student has borrowed money (any source) (Basel...          0.307193   
10  Student is behind on payments (unconditional) .

In [3]:
1.900548e-02 

0.01900548

In [30]:
#difference in baseline survey responses for those present vs not at follow-up 1 
#paper checks the parent data that is asked across both the baseline and the follow-up
#these differences are bad
filter_df = follow_up_1_df.copy() 
filter_df['fu1_aluno'] = filter_df['fu1_aluno'].fillna(0)
print(filter_df['fu1_aluno'].value_counts())
filter_df = filter_df.dropna(subset=['bl_aluno'])
print(filter_df['fu1_aluno'].value_counts())


columns = filter_df.columns
baseline_columns = notsignificant_var['Variable']


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    temp = filter_df.dropna(subset=[var])

    formula = f'Q("{var}") ~ fu1_aluno'
    model = smf.ols(formula, data=temp).fit(cov_type='cluster', 
                                            cov_kwds={'groups': temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['fu1_aluno']
    # Get the means for the treatment and control groups
    treatment_mean = temp.loc[temp['fu1_aluno'] == 1, var].mean()
    control_mean = temp.loc[temp['fu1_aluno'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = temp.loc[temp['fu1_aluno'] == 1, var].std()
    control_sd = temp.loc[temp['fu1_aluno'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Follow Up 1 Mean': treatment_mean,
        'Baseline Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

results = pd.DataFrame(results)
print(results)

#print variables that are significantly different 
significant_var = results[results['P-value'] < 0.05]
print(significant_var['Variable'])

fu1_aluno
0.0    18454
1.0    16892
Name: count, dtype: int64
fu1_aluno
1.0    15427
0.0     9744
Name: count, dtype: int64
                                            Variable  Follow Up 1 Mean  \
0     Student: Finanical Proficieny Score (Baseline)         50.410545   
1                       Student is female (Baseline)          0.561485   
2  Student works in own or family business (Basel...          0.118629   
3  Student part of income saved is non-zero (Base...          0.453193   
4  Student makes a list of all monthly expenses (...          0.102782   
5  Student negotiates prices or payment methods (...          0.758163   
6                Parent is self-employeed (Baseline)          0.168717   
7  Parent: makes a list of all monthly expenses (...          0.365833   
8  Parent: Student helps organize HH budget (Base...          0.438904   

   Baseline Mean  Treatment SD  Control SD   P-value      N  
0      50.016479     14.900280   15.040449  0.226018  47848  
1       0.5

In [1]:
2.232767e-02 

0.02232767

In [11]:
#difference in baseline survey responses for those present vs not at follow-up 2
filter_df = follow_up_2_df.copy() 
filter_df['fu2_aluno'] = filter_df['fu2_aluno'].fillna(0)
print(filter_df['fu2_aluno'].value_counts())
filter_df = filter_df.dropna(subset=['bl_aluno'])
print(filter_df['fu2_aluno'].value_counts())


columns = filter_df.columns
baseline_columns = [col for col in columns if 'Baseline' in col]


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    temp = filter_df.dropna(subset=[var])

    formula = f'Q("{var}") ~ fu2_aluno'
    model = smf.ols(formula, data=temp).fit(cov_type='cluster', 
                                            cov_kwds={'groups': temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['fu2_aluno']
    # Get the means for the treatment and control groups
    treatment_mean = temp.loc[temp['fu2_aluno'] == 1, var].mean()
    control_mean = temp.loc[temp['fu2_aluno'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = temp.loc[temp['fu2_aluno'] == 1, var].std()
    control_sd = temp.loc[temp['fu2_aluno'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

results = pd.DataFrame(results)
print(results)

#print variables that are significantly different 
significant_var = results[results['P-value'] < 0.05]
print(significant_var['Variable'])

notsignificant_var = results[results['P-value'] > 0.05]
print(notsignificant_var['Variable'])

fu2_aluno
1.0    18230
0.0    17116
Name: count, dtype: int64
fu2_aluno
0.0    13825
1.0    11346
Name: count, dtype: int64
                                             Variable  Treatment Mean  \
0      Student: Finanical Proficieny Score (Baseline)       51.772961   
1   Student gender could not be coded based on nam...        0.000264   
2                        Student is female (Baseline)        0.580116   
3   Education of mother: At least some secondary (...        0.429611   
4   Education of father: At least some secondary (...        0.395753   
5   Student's Family receives Bolsa Familia cash t...        0.336591   
6   Student has computer with internet at home (Ba...        0.539908   
7   Student has failed at least one school year (B...        0.228687   
8     Student is not working at the moment (Baseline)        0.323076   
9   Student works in own or family business (Basel...        0.109613   
10      Student works as employee or other (Baseline)        0.213462   


In [12]:
#difference in baseline survey responses for those present vs not at follow-up 2
filter_df = follow_up_2_df.copy() 
filter_df['fu2_aluno'] = filter_df['fu2_aluno'].fillna(0)
print(filter_df['fu2_aluno'].value_counts())
filter_df = filter_df.dropna(subset=['bl_aluno'])
print(filter_df['fu2_aluno'].value_counts())


columns = filter_df.columns
baseline_columns = significant_var['Variable']


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    temp = filter_df.dropna(subset=[var])

    formula = f'Q("{var}") ~ fu2_aluno'
    model = smf.ols(formula, data=temp).fit(cov_type='cluster', 
                                            cov_kwds={'groups': temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['fu2_aluno']
    # Get the means for the treatment and control groups
    treatment_mean = temp.loc[temp['fu2_aluno'] == 1, var].mean()
    control_mean = temp.loc[temp['fu2_aluno'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = temp.loc[temp['fu2_aluno'] == 1, var].std()
    control_sd = temp.loc[temp['fu2_aluno'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

results = pd.DataFrame(results)
print(results)


fu2_aluno
1.0    18230
0.0    17116
Name: count, dtype: int64
fu2_aluno
0.0    13825
1.0    11346
Name: count, dtype: int64
                                             Variable  Treatment Mean  \
0      Student: Finanical Proficieny Score (Baseline)       51.772961   
1                        Student is female (Baseline)        0.580116   
2   Education of father: At least some secondary (...        0.395753   
3   Student's Family receives Bolsa Familia cash t...        0.336591   
4   Student has computer with internet at home (Ba...        0.539908   
5   Student has failed at least one school year (B...        0.228687   
6     Student is not working at the moment (Baseline)        0.323076   
7   Student works in own or family business (Basel...        0.109613   
8       Student works as employee or other (Baseline)        0.213462   
9                  Student receives income (Baseline)        0.647851   
10  Student has borrowed money (any source) (Basel...        0.295738   


In [13]:
#difference in baseline survey responses for those present vs not at follow-up 2
filter_df = follow_up_2_df.copy() 
filter_df['fu2_aluno'] = filter_df['fu2_aluno'].fillna(0)
print(filter_df['fu2_aluno'].value_counts())
filter_df = filter_df.dropna(subset=['bl_aluno'])
print(filter_df['fu2_aluno'].value_counts())


columns = filter_df.columns
baseline_columns = notsignificant_var['Variable']


results = []
significant_vars = []

for var in baseline_columns:
    # OLS regression with clustered standard errors by `cd_escola`
    temp = filter_df.dropna(subset=[var])

    formula = f'Q("{var}") ~ fu2_aluno'
    model = smf.ols(formula, data=temp).fit(cov_type='cluster', 
                                            cov_kwds={'groups': temp['cd_escola']})

    # Get the p-value for the treatment effect (two-tailed)
    p_value = model.pvalues['fu2_aluno']
    # Get the means for the treatment and control groups
    treatment_mean = temp.loc[temp['fu2_aluno'] == 1, var].mean()
    control_mean = temp.loc[temp['fu2_aluno'] == 0, var].mean()
    
    # Get standard deviations for the treatment and control groups
    treatment_sd = temp.loc[temp['fu2_aluno'] == 1, var].std()
    control_sd = temp.loc[temp['fu2_aluno'] == 0, var].std()
    
    # Number of observations used in the sample
    sample_size = df[var].notna().sum()
    
    results.append({
        'Variable': var,
        'Treatment Mean': treatment_mean,
        'Control Mean': control_mean,
        'Treatment SD': treatment_sd,
        'Control SD': control_sd,
        'P-value': p_value,
        'N': sample_size
    })

results = pd.DataFrame(results)
print(results)

#print variables that are significantly different 
significant_var = results[results['P-value'] < 0.05]
print(significant_var['Variable'])

notsignificant_var = results[results['P-value'] > 0.05]
print(notsignificant_var['Variable'])

fu2_aluno
1.0    18230
0.0    17116
Name: count, dtype: int64
fu2_aluno
0.0    13825
1.0    11346
Name: count, dtype: int64
                                             Variable  Treatment Mean  \
0   Student gender could not be coded based on nam...        0.000264   
1   Education of mother: At least some secondary (...        0.429611   
2   Student part of income saved is non-zero (Base...        0.458128   
3   Student saves money for future purchases (Base...        0.152387   
4   Student negotiates prices or payment methods (...        0.758419   
5   Parent has at least some secondary education (...        0.441752   
6                 Parent is self-employeed (Baseline)        0.168199   
7   Parent: makes a list of all monthly expenses (...        0.366147   
8                   Parent: has debit card (Baseline)        0.431665   
9                       Parent: has checks (Baseline)        0.211706   
10  Parent: Student talks to you about finances (B...        0.642672   
