In [15]:
import pandas as pd
import numpy as np
import pyreadr
import math

In [19]:
# Read the R workspace file and pull out the object stored under the key 'data'.
rdata_path = "../data/wage2015_subsample_inference.Rdata"
data_read = pyreadr.read_r(rdata_path)
data = data_read["data"]

# Cell output: (rows, columns) of the loaded frame.
data.shape

(5150, 20)

In [20]:

# Columns kept for the analysis; the same list is reused for every subset below.
wage_columns = [
    "lwage", "sex", "shs", "hsg", "scl", "clg", "ad",
    "ne", "mw", "so", "we", "exp1",
]
m = data[wage_columns]

# Rows whose `clg` flag equals 1.
college = m[m["clg"] == 1]
m_college = college[wage_columns]

# Rows whose `scl` flag equals 1.
scl = m[m["scl"] == 1]
m_scl = scl[wage_columns]

In [21]:
# Cell output: (rows, columns) of the clg == 1 subsample.
m_college.shape

(1636, 12)

In [22]:
# Cell output: (rows, columns) of the scl == 1 subsample.
m_scl.shape

(1432, 12)

In [9]:
# Stack the two education subsamples row-wise into a single frame.
education_groups = [m_college, m_scl]
total = pd.concat(education_groups)

# Cell output: (rows, columns) of the combined sample.
total.shape

In [23]:
# Renumber the rows 0..n-1 after the concat.
# Rebinding (instead of inplace=True) keeps this cell idempotent: an in-place
# reset_index inserts a new 'index' / 'level_0' column on every re-run and
# eventually raises because the column already exists. The old index is dropped
# rather than kept as a column; the visible downstream cells select columns by
# name and never read it.
total = total.reset_index(drop=True)

In [24]:
# Variables reported in the summary table, and the row label shown for each
# (same order as the columns).
summary_columns = ["lwage", "sex", "scl", "clg", "ne", "mw", "so", "we", "exp1"]
summary_labels = [
    "Log Wage", "Sex", "Some College", "College Graduate",
    "Northeast", "Midwest", "South", "West", "Experience",
]

total2 = total[summary_columns]

# Split the sample on the `sex` flag (1 -> "Women" column, 0 -> "Men" column).
data_female = total2[total2["sex"] == 1]
t_female = data_female[summary_columns]

data_male = total2[total2["sex"] == 0]
t_male = data_male[summary_columns]

# One row per variable, one column per group; the shape follows from the
# inputs instead of being hard-coded.
table = np.column_stack([
    total2.mean().values,
    t_male.mean().values,
    t_female.mean().values,
])
table_pandas = pd.DataFrame(table, columns=["All", "Men", "Women"])
table_pandas.index = summary_labels
table_html = table_pandas.to_html()

# Cell output: the table of sample means.
table_pandas

Unnamed: 0,All,Men,Women
Log Wage,3.000022,3.038412,2.956904
Sex,0.470991,0.0,1.0
Some College,0.466754,0.481824,0.449827
Gollage Graduate,0.533246,0.518176,0.550173
Northeast,0.226532,0.219347,0.234602
Midwest,0.265971,0.261245,0.27128
South,0.285854,0.290819,0.280277
West,0.221643,0.228589,0.213841
Experience,12.700945,12.433148,13.00173


In [25]:
# Raw gap in mean log wage: sex == 1 group minus sex == 0 group.
female_mean_lwage = data_female['lwage'].mean()
male_mean_lwage = data_male['lwage'].mean()
female_mean_lwage - male_mean_lwage

-0.08150855508735866

In [27]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [28]:
# Unconditional gap: regress log wage on the sex dummy alone.
nocontrol_model = smf.ols(formula='lwage ~ sex', data=total2)

# Fit once; the point estimate and the robust covariance both come from the
# same results object.
nocontrol_fit = nocontrol_model.fit()

# Look the coefficient up by name rather than scraping the summary table.
nocontrol_est = nocontrol_fit.params['sex']

# HC0 heteroskedasticity-robust covariance matrix and the matching standard
# error for `sex`, selected by label instead of by position.
HCV_coefs = nocontrol_fit.cov_HC0
nocontrol_se = nocontrol_fit.HC0_se['sex']

In [39]:
# Report the no-controls estimate together with its robust standard error.
gender_gap_message = (
    f'The estimated gender coefficient is {nocontrol_est} '
    f'and the corresponding robust standard error is {nocontrol_se}'
)
print(gender_gap_message)

The estimated gender coefficient is -0.08150855508736156 and the corresponding robust standard error is 0.019579647767772403


# OLS

In [43]:
# Log wage on sex plus controls: experience interacted with the education
# dummies, the region dummies and itself.
# NOTE(review): the sample is built from the scl == 1 and clg == 1 rows, so
# scl + clg (and presumably ne + mw + so + we) sum to one in every row and are
# collinear with the intercept; the individual dummy coefficients would then
# not be separately identified. Confirm before interpreting them.
flex = 'lwage ~ sex + (exp1)*(scl+clg+ne+mw+so+we+exp1)'
control_model = smf.ols(formula=flex, data=total2)

# Fit once and reuse the result for the estimate, the printed table and the
# robust covariance.
control_fit = control_model.fit()
control_est = control_fit.params['sex']

# The printed table comes from a fit with no cov_type argument; the HC0
# standard error is computed separately below.
print(control_fit.summary2().tables[1])
print(f"Coefficient for OLS with controls {control_est}")

# HC0 heteroskedasticity-robust covariance and the standard error for `sex`,
# selected by label instead of by position.
HCV_coefs = control_fit.cov_HC0
control_se = control_fit.HC0_se['sex']

              Coef.  Std.Err.           t          P>|t|    [0.025    0.975]
Intercept  1.667494  0.009998  166.781417   0.000000e+00  1.647890  1.687097
sex       -0.096599  0.018697   -5.166475   2.538621e-07 -0.133260 -0.059939
exp1       0.005150  0.000528    9.758316   3.568426e-22  0.004115  0.006185
scl        0.652284  0.016987   38.398567  1.313132e-263  0.618977  0.685592
clg        1.015209  0.015120   67.144747   0.000000e+00  0.985563  1.044855
ne         0.386534  0.026586   14.539250   2.326198e-46  0.334406  0.438661
mw         0.422902  0.025596   16.522084   8.376085e-59  0.372714  0.473089
so         0.419274  0.024124   17.379712   1.355933e-64  0.371972  0.466575
we         0.438784  0.027512   15.948877   4.547772e-55  0.384841  0.492728
exp1:scl   0.004172  0.000971    4.296438   1.789767e-05  0.002268  0.006076
exp1:clg   0.000978  0.000947    1.033571   3.014187e-01 -0.000878  0.002835
exp1:ne    0.003583  0.001584    2.261804   2.377946e-02  0.000477  0.006689

## Partialling-Out using OLS

In [44]:
# Partialling-out: residualise both the outcome Y (lwage) and the regressor of
# interest D (sex) on the controls W, then regress residual on residual.

# model for Y
flex_y = 'lwage ~  (exp1)*(scl+clg+ne+mw+so+we+exp1)'
# model for D
flex_d = 'sex ~ (exp1)*(scl+clg+ne+mw+so+we+exp1)'

# partialling-out the linear effect of W from Y
t_Y = smf.ols(formula=flex_y, data=total2).fit().resid

# partialling-out the linear effect of W from D
t_D = smf.ols(formula=flex_d, data=total2).fit().resid

# Build the residual frame from raw arrays so it gets a fresh 0..n-1 index.
data_res = pd.DataFrame({'t_Y': t_Y.to_numpy(), 't_D': t_D.to_numpy()})

# regression of Y on D after partialling-out the effect of W
partial_fit = smf.ols(formula='t_Y ~ t_D', data=data_res).fit()
# Look the coefficient up by name rather than scraping the summary table.
partial_est = partial_fit.params['t_D']

print("Coefficient for D via partialling-out", partial_est)

# HC0 heteroskedasticity-robust covariance and the standard error for t_D,
# selected by label instead of by position.
HCV_coefs = partial_fit.cov_HC0
partial_se = partial_fit.HC0_se['t_D']

# 95% confidence interval for the t_D coefficient (cell output).
# NOTE(review): this interval comes from the fit's default covariance, not
# from the HC0 standard error computed just above; confirm that is intended.
partial_fit.conf_int(alpha=0.05).loc['t_D']

Coefficient for D via partialling-out -0.09659930524586716


0   -0.133206
1   -0.059993
Name: t_D, dtype: float64