# Difference in Differences

## Study Case 2 - Tax Credit to single women experiment

We'll compare employment between single women without children (the control group) and single women with children (the treatment group), before and after the intervention. The intervention was implemented in 1994.

In [1]:
#packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import statsmodels.api as smf
%matplotlib inline

In [2]:
# Load the dataset from the Stata file "eitc.dta" (path is relative to the
# notebook's working directory) and preview the first rows.

df = pd.read_stata("eitc.dta")
df.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0


In [3]:
# Inspect the column dtypes of the loaded frame.
df.dtypes

state       float32
year        float32
urate       float32
children       int8
nonwhite       int8
finc        float64
earn        float64
age            int8
ed             int8
work           int8
unearn      float64
dtype: object

### Wrangling and Cleaning

In [4]:
# Period indicator: 1 for observations after 1993 (i.e. 1994 onward), 0 otherwise.
df['post93'] = (df['year'] > 1993).astype(int)

# Preview the frame with the new column.
df.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194,0
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957,0
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0,0


In [5]:
# Group indicator: 1 when the woman has at least one child, 0 otherwise.
df['mom'] = df['children'].gt(0).astype(int)
df.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93,mom
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0,0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0,1
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194,0,1
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957,0,0
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0,0,1


In [6]:
# Interaction term: 1 only for women with children observed after 1993.
df['mom_post93'] = df['post93'].mul(df['mom'])
df.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93,mom,mom_post93
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0,0,0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0,1,0
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194,0,1,0
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957,0,0,0
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0,0,1,0


In [7]:
# Outcome (work) and regressors (period, group, interaction) for the first model.
y = df['work'].values
X = df[['post93', 'mom', 'mom_post93']]

### Model

In [10]:
# Logistic regression of `work` on the period, group and interaction dummies.
# `smf` is this notebook's alias for statsmodels.api.

X = smf.add_constant(X)  # adds the constant column labelled 'intercept' below
model1 = smf.Logit(endog=y, exog=X).fit()
model1.summary(
    title="Impact of tax credit employment - Model 1",
    yname='work',
    xname=['intercept', 'After 1993', 'Mom', "Mom & After 1993"],
)

Optimization terminated successfully.
         Current function value: 0.686491
         Iterations 4


0,1,2,3
Dep. Variable:,work,No. Observations:,13746.0
Model:,Logit,Df Residuals:,13742.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 04 Mar 2023",Pseudo R-squ.:,0.009118
Time:,11:44:44,Log-Likelihood:,-9436.5
converged:,True,LL-Null:,-9523.3
Covariance Type:,nonrobust,LLR p-value:,2.058e-37

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.3042,0.036,8.443,0.000,0.234,0.375
After 1993,-0.0085,0.053,-0.161,0.872,-0.112,0.095
Mom,-0.5212,0.047,-10.985,0.000,-0.614,-0.428
Mom & After 1993,0.1885,0.070,2.708,0.007,0.052,0.325


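Conclusion: the coefficients are on the log-odds scale, so they are not percentage changes. Single women with children have lower odds of working (coefficient -0.52, odds ratio exp(-0.52) ≈ 0.59, i.e. about 41% lower odds). After 1993 the tax credit increased the employment of women with children relative to women without children: the interaction coefficient is 0.19 (odds ratio ≈ 1.21, about 21% higher odds) and is statistically significant (p = 0.007).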

In [12]:
# Second model: same specification as Model 1 plus two controls
# (`nonwhite` and `ed`).

# Outcome and regressors
y = df['work'].values
X = df[['post93', 'mom', 'mom_post93', 'nonwhite', 'ed']]

# Fit. `smf` is this notebook's alias for statsmodels.api.
X = smf.add_constant(X)  # adds the constant column labelled 'intercept' below
model2 = smf.Logit(endog=y, exog=X).fit()

# The xname labels must follow the column order of X (constant first).
# The `nonwhite` label was previously misspelled "Spanic or Black".
model2.summary(
    title="Impact of tax credit employment - Model 2",
    yname='work',
    xname=[
        'intercept',
        'After 1993',
        'Mom',
        "Mom & After 1993",
        "Hispanic or Black",
        "Years of Education",
    ],
)

Optimization terminated successfully.
         Current function value: 0.680664
         Iterations 4


0,1,2,3
Dep. Variable:,work,No. Observations:,13746.0
Model:,Logit,Df Residuals:,13740.0
Method:,MLE,Df Model:,5.0
Date:,"Sat, 04 Mar 2023",Pseudo R-squ.:,0.01753
Time:,11:56:52,Log-Likelihood:,-9356.4
converged:,True,LL-Null:,-9523.3
Covariance Type:,nonrobust,LLR p-value:,5.205e-70

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.1687,0.071,-2.367,0.018,-0.308,-0.029
After 1993,-0.0046,0.053,-0.086,0.932,-0.108,0.099
Mom,-0.5287,0.048,-10.986,0.000,-0.623,-0.434
Mom & After 1993,0.1973,0.070,2.817,0.005,0.060,0.335
Spanic or Black,-0.2199,0.036,-6.129,0.000,-0.290,-0.150
Years of Education,0.0687,0.007,10.270,0.000,0.056,0.082


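The `Mom & After 1993` interaction remains statistically significant after adding the race and education controls (coefficient 0.197 on the log-odds scale, p = 0.005), which corresponds to roughly 22% higher odds of working.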

### The placebo experiment

In [14]:
# Placebo dummies: same construction as post93 / mom_post93, but with the
# cutoff moved one year earlier (after 1992).

df['post92'] = (df['year'] > 1992).astype(int)
df['mom_post92'] = df['post92'].mul(df['mom'])

# Preview the frame with the new columns.
df.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93,mom,mom_post93,post92,mom_post92
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0,0,0,0,0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0,1,0,0,0
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194,0,1,0,0,0
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957,0,0,0,0,0
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0,0,1,0,0,0


In [17]:
# Prepare the placebo dataset: keep only observations from years before 1994.
df_placebo = df.loc[df['year'] < 1994]

In [18]:
# Outcome and regressors for the placebo test (pre-1994 rows only).
y_placebo = df_placebo['work'].values
X_placebo = df_placebo[['post92', 'mom', 'mom_post92']]

# Fit the same logit specification with the placebo dummies.
# `smf` is this notebook's alias for statsmodels.api.
X_placebo = smf.add_constant(X_placebo)
model_placebo = smf.Logit(endog=y_placebo, exog=X_placebo).fit()
model_placebo.summary(
    title="Impact of tax credit employment - placebo Model",
    yname='work',
    xname=['intercept', 'After 1992', 'Is Mom', "Mom & After 1992"],
)

Optimization terminated successfully.
         Current function value: 0.684872
         Iterations 4


0,1,2,3
Dep. Variable:,work,No. Observations:,7401.0
Model:,Logit,Df Residuals:,7397.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 04 Mar 2023",Pseudo R-squ.:,0.01193
Time:,12:17:42,Log-Likelihood:,-5068.7
converged:,True,LL-Null:,-5130.0
Covariance Type:,nonrobust,LLR p-value:,2.29e-26

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.3124,0.044,7.154,0.000,0.227,0.398
After 1992,-0.0259,0.077,-0.335,0.737,-0.177,0.126
Is Mom,-0.5138,0.057,-8.950,0.000,-0.626,-0.401
Mom & After 1992,-0.0239,0.102,-0.234,0.815,-0.224,0.176


`Mom & After 1992` isn't statistically significant (p = 0.815 > .05), so this tells us there is no detectable difference, before the intervention, in the evolution of employment of single women with children versus single women without children.
This validiti ouw