In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Path to the Stata replication file, relative to the notebook's working directory.
ghana = 'GHA_2008_MGFERE_v01_M_Stata8/ReplicationDataGhanaJDE.dta'
# Read the .dta file into a DataFrame; later cells filter it by `wave`.
data_ghana = pd.read_stata(ghana)
# Bare last expression: the notebook renders the (truncated) frame as the cell output.
data_ghana

Unnamed: 0,gender,male_male,female_female,male_mixed,female_mixed,highcapture,highcapital,groupnum,cashtreat,equiptreat,...,norms_9,norms_12,household_5h,belongf,drink,noimpulse1,noimpulse2,jointown,numsib,sibsAccra
0,female,0,1,0,0,0,1,1108,0,0,...,Agree,Agree,Never,0.0,1.0,1.0,1.0,0.0,0.191877,3.30703
1,female,0,1,0,0,0,1,1108,0,0,...,Agree,Agree,Never,0.0,1.0,1.0,1.0,0.0,0.191877,3.30703
2,female,0,1,0,0,0,1,1108,0,0,...,Agree,Agree,Never,0.0,1.0,1.0,1.0,0.0,0.191877,3.30703
3,female,0,1,0,0,0,1,1108,0,0,...,Agree,Agree,Never,0.0,1.0,1.0,1.0,0.0,0.191877,3.30703
4,female,0,1,0,0,0,1,1108,0,0,...,Agree,Agree,Never,0.0,1.0,1.0,1.0,0.0,0.191877,3.30703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4753,female,0,1,0,0,1,1,906,0,0,...,Agree,Agree,More than once a week,0.0,0.0,0.0,0.0,0.0,-2.808123,-1.69297
4754,female,0,1,0,0,1,1,906,0,0,...,Agree,Agree,More than once a week,0.0,0.0,0.0,0.0,0.0,-2.808123,-1.69297
4755,female,0,1,0,0,1,1,906,0,0,...,Agree,Agree,More than once a week,0.0,0.0,0.0,0.0,0.0,-2.808123,-1.69297
4756,female,0,1,0,0,1,1,906,0,0,...,Agree,Agree,More than once a week,0.0,0.0,0.0,0.0,0.0,-2.808123,-1.69297


In [2]:
# Recover the large strata from the baseline covariates and add a variable
# indicating treatment status {0,1,2}
# (0 = neither flag set, 1 = atreatcash == 1, 2 = atreatequip == 1).
columns_needed = ['realfinalprofit', 'atreatcash', 'atreatequip', 'wave', 'male', 'groupnum',
                  'sheno', 'male_male', 'female_female', 'male_mixed', 'female_mixed',
                  'highcapture','highcapital', 'highgroup', 'mlowgroup']
# Single .loc selection + .copy(): the column assignments below then write to an
# independent frame instead of a slice of data_ghana (no SettingWithCopyWarning).
df_wave6 = data_ghana.loc[data_ghana.wave == 6, columns_needed].copy()

# Encode the six stratification flags as decimal digits of one integer id.
# Cast to int64 before multiplying so a narrow integer dtype (e.g. int8 from a
# Stata byte column -- TODO confirm actual dtypes) cannot overflow on 100000.
strata_digits = {'male_male': 100000, 'female_female': 10000, 'male_mixed': 1000,
                 'female_mixed': 100, 'highcapture': 10, 'highcapital': 1}
df_wave6['strata'] = sum(df_wave6[flag].astype('int64') * place
                         for flag, place in strata_digits.items())

# Treatment code as a float array; the equipment assignment runs last, so it
# wins if both flags were ever set on the same row.
treatment = np.zeros(len(df_wave6))
treatment[(df_wave6['atreatcash'] == 1).to_numpy()] = 1
treatment[(df_wave6['atreatequip'] == 1).to_numpy()] = 2
df_wave6['treatment'] = treatment

# Row order matters downstream: reg_MT reshapes the outcome into rows of 4
# consecutive observations, so rows of one group must be adjacent.
df_wave6 = df_wave6.sort_values(by=['strata','groupnum','treatment'], ascending=True)
df_nan = df_wave6[df_wave6.realfinalprofit.isna()]
groups_nan = set(df_nan.groupnum)

# drop 4 non-quadruplet groups and groups with missing values
bad_groups = {991, 992, 993, 994} | groups_nan
df_wave6_quad = df_wave6[~df_wave6['groupnum'].isin(bad_groups)]

# keep relevant variables and create dummy variables for group fixed effects
# dtype is pinned to uint8 (the historical pandas default): newer pandas returns
# bool dummies, and a mixed bool/int design matrix becomes an object array that
# statsmodels may refuse to fit.
dummies = pd.get_dummies(df_wave6_quad.groupnum, dtype=np.uint8)
df_wave6_quad = pd.concat([df_wave6_quad, dummies], axis=1, join='inner')

df_wave6_quad

Unnamed: 0,realfinalprofit,atreatcash,atreatequip,wave,male,groupnum,sheno,male_male,female_female,male_mixed,...,130004,140001,140003,140006,150002,150003,150005,160001,160002,160004
3644,0.000000,0,0,6,0,1601,300105905,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3690,0.000000,0,0,6,0,1601,300107803,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3546,36.191643,1,0,6,0,1601,300101101,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3436,48.255527,0,1,6,0,1601,160601105,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1154,28.149057,0,0,6,0,1603,111000102,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2738,104.553642,0,1,6,1,10002,150505107,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1274,482.555267,0,0,6,1,10004,111002217,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3236,241.277634,0,0,6,1,10004,160402617,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4009,140.745285,1,0,6,1,10004,300300304,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
from scipy.stats import norm
from empirical import Inference
from scipy.stats import norm


# inference based on matched-tuples
def reg_MT(df, column):
    """Run the matched-tuples `Inference` on one subsample of `df`.

    Parameters
    ----------
    df : DataFrame with `realfinalprofit`, `male`, `highgroup` and `mlowgroup`.
        NOTE(review): the reshape below assumes the selected rows come in
        consecutive blocks of 4 (one block per group) -- confirm the caller
        passes a frame sorted that way.
    column : subsample selector
        1 -> rows with realfinalprofit >= 0 (NaN rows fail the comparison),
        2 -> male == 1, 3 -> male == 0,
        4 -> male == 0 and highgroup == 1,
        5 -> male == 0 and mlowgroup == 1.

    Returns
    -------
    list : [tau1, se_tau1, tau2, se_tau2, pval], where pval is the two-sided
        normal p-value of |tau12| / se_tau12 as reported by `Inference`.

    Raises
    ------
    RuntimeError : if `column` is not one of 1..5.
    """
    outcomes = df.realfinalprofit.to_numpy()

    # Masks are built lazily so only the requested subsample touches its columns.
    selectors = (
        (1, lambda: outcomes >= 0),
        (2, lambda: df.male == 1),
        (3, lambda: df.male == 0),
        (4, lambda: (df.male == 0) & (df.highgroup == 1)),
        (5, lambda: (df.male == 0) & (df.mlowgroup == 1)),
    )
    for key, build_mask in selectors:
        if column == key:
            keep = build_mask()
            break
    else:
        raise RuntimeError("Wrong column")

    # One row per block of 4 consecutive selected observations.
    tuples = outcomes[keep].reshape(-1, 4)
    inf = Inference(tuples, tuples)

    abs_t = np.abs(inf.tau12) / inf.se_tau12
    pval = 2 * (1 - norm.cdf(abs_t))
    return [inf.tau1, inf.se_tau1, inf.tau2, inf.se_tau2, pval]

# replicate regression from Table 5 of GhanaJDE paper
def reg(df, column, fixed_effects=True, clustered_se=False):
    """OLS of `realfinalprofit` on the two treatment indicators for one subsample.

    Parameters
    ----------
    df : DataFrame holding the outcome, `atreatcash`, `atreatequip`, `male`,
        `highgroup`, `mlowgroup`, `groupnum` and (for fixed effects) the dummy
        columns named in the module-level `dummies` frame.
    column : subsample selector
        1 -> rows with realfinalprofit >= 0, 2 -> male == 1, 3 -> male == 0,
        4 -> male == 0 and highgroup == 1, 5 -> male == 0 and mlowgroup == 1.
    fixed_effects : if True, add every column of the global `dummies` except
        the first as a regressor.
    clustered_se : if True, fit with a cluster-robust covariance grouped by
        `groupnum`; otherwise fit with HC0.

    Returns
    -------
    (coefs, ses, pvalue) : the two treatment coefficients, their standard
        errors, and the p-value of the t-test that the atreatcash coefficient
        equals the atreatequip coefficient.

    Raises
    ------
    RuntimeError : if `column` is not one of 1..5.
    """
    outcome = df.realfinalprofit

    regressors = ['atreatcash', 'atreatequip']
    if fixed_effects:
        # Relies on the module-level `dummies`, not on anything passed in.
        regressors = regressors + list(dummies.columns)[1:]
    design = df[regressors]

    # Masks are built lazily so only the requested subsample touches its columns.
    selectors = (
        (1, lambda: outcome >= 0),
        (2, lambda: df.male == 1),
        (3, lambda: df.male == 0),
        (4, lambda: (df.male == 0) & (df.highgroup == 1)),
        (5, lambda: (df.male == 0) & (df.mlowgroup == 1)),
    )
    for key, build_mask in selectors:
        if column == key:
            keep = build_mask()
            break
    else:
        raise RuntimeError("Wrong column")

    outcome = outcome[keep]
    # add_constant prepends the intercept, so positions 1 and 2 of the fitted
    # parameter vector are atreatcash and atreatequip.
    design = sm.add_constant(design[keep])

    model = sm.OLS(outcome, design)
    if clustered_se:
        fit_options = dict(cov_type='cluster', cov_kwds={'groups': df.groupnum[keep]})
    else:
        fit_options = dict(cov_type='HC0')
    fitted = model.fit(**fit_options)

    # Contrast vector (+1 on atreatcash, -1 on atreatequip, 0 elsewhere).
    contrast = np.zeros_like(fitted.params)
    contrast[1:3] = [1, -1]
    equality_test = fitted.t_test(contrast)

    coefs = fitted.params[1:3].values
    if clustered_se:
        std_errs = fitted.bse[1:3].values
    else:
        std_errs = fitted.HC0_se[1:3].values
    return coefs, std_errs, equality_test.pvalue

# Ad-hoc call for column 1 with fixed effects. It is not the last expression of
# the cell, so the returned tuple is discarded and nothing is displayed.
reg(df_wave6_quad, 1, fixed_effects=True)

def print_results(results):
    """Print the body of a LaTeX-style results table, one table column per entry.

    Parameters
    ----------
    results : sequence of entries, each indexable as
        [tau1, se_tau1, tau2, se_tau2, pval].

    Output (to stdout)
    ------------------
    A header line "  (1)   (2) ..." with one label per entry, followed by five
    rows, each terminated by a LaTeX row break (two backslashes):
      row 0 / row 2 : estimate with 2 decimals, plus ^{*}, ^{**} or ^{***} when
                      |estimate| / se exceeds the normal critical value for a
                      two-sided 10% / 5% / 1% test;
      row 1 / row 3 : standard error in parentheses, 2 decimals;
      row 4         : p-value with 3 decimals (printed without stars).

    The number of columns follows len(results); for five entries the output is
    byte-identical to the previous hard-coded five-column layout.
    """
    # Upper-tail normal quantiles for two-sided 10%, 5% and 1% tests.
    critical_values = [norm.ppf(0.95), norm.ppf(0.975), norm.ppf(0.995)]

    header_labels = ("({})".format(k + 1) for k in range(len(results)))
    print("  " + "   ".join(header_labels))

    for r in range(5):
        for entry in results:
            value = entry[r]
            if r == 1 or r == 3:
                cell = " & & ({:.2f})".format(value)
            elif r == 4:
                cell = " & & {:.3f}".format(value)
            else:
                # The standard error of an estimate sits right after it.
                t_stat = np.abs(value) / entry[r + 1]
                n_stars = sum(1 for crit in critical_values if t_stat > crit)
                cell = " & & {:.2f}".format(value)
                if n_stars:
                    cell += "^{" + "*" * n_stars + "}"
            print(cell, end=' ')
        print("\\\\")

In [4]:
# regression with strata fixed effects
print("*************** With fixed effects ***************")
results = []
for i in range(5):
    taus, ses, pval = reg(df_wave6_quad, i+1, fixed_effects=True)
    results.append([taus[0], ses[0], taus[1], ses[1], pval])
    
print_results(results)

# regression without strata fixed effects
print("*************** Without fixed effects ***************")
results = []
for i in range(5):
    taus, ses, pval = reg(df_wave6_quad, i+1, fixed_effects=False)
    results.append([taus[0], ses[0], taus[1], ses[1], pval])
    
print_results(results)

# regression without strata fixed effects under clustered standard error
print("*************** Without fixed effects under Clustered standard error ***************")
results = []
for i in range(5):
    taus, ses, pval = reg(df_wave6_quad, i+1, fixed_effects=False, clustered_se=True)
    results.append([taus[0], ses[0], taus[1], ses[1], pval])
    
print_results(results)

# MT
print("*************** MT ***************")
results = []
for i in range(5):
    results.append(reg_MT(df_wave6_quad, i+1))
    
print_results(results)

*************** With fixed effects ***************
  (1)   (2)   (3)   (4)   (5)
 & & 19.64^{*}  & & 24.84  & & 16.30  & & 33.09  & & 7.01 \\
 & & (10.76)  & & (18.74)  & & (12.90)  & & (31.94)  & & (9.35) \\
 & & 20.26^{*}  & & 4.48  & & 30.42^{*}  & & 65.36  & & 11.10 \\
 & & (11.85)  & & (15.48)  & & (16.66)  & & (41.17)  & & (11.86) \\
 & & 0.965  & & 0.333  & & 0.452  & & 0.487  & & 0.761 \\
*************** Without fixed effects ***************
  (1)   (2)   (3)   (4)   (5)
 & & 19.64  & & 24.84  & & 16.30  & & 33.09  & & 7.01 \\
 & & (15.42)  & & (27.29)  & & (18.13)  & & (42.56)  & & (11.58) \\
 & & 20.26  & & 4.48  & & 30.42  & & 65.36  & & 11.10 \\
 & & (15.67)  & & (18.42)  & & (22.83)  & & (53.28)  & & (15.31) \\
 & & 0.975  & & 0.493  & & 0.600  & & 0.610  & & 0.817 \\
*************** Without fixed effects under Clustered standard error ***************
  (1)   (2)   (3)   (4)   (5)
 & & 19.64  & & 24.84  & & 16.30  & & 33.09  & & 7.01 \\
 & & (12.59)  & & (23.68)  & & (14.2