# Propensity Score Matching

In [1]:
#import packages
import pandas as pd
import numpy as np
import seaborn as sns
import scipy
import matplotlib.pyplot as plt
import matplotlib.style as style
import statsmodels.api as sm
import warnings
# Silence only deprecation-style noise from library upgrades. A blanket
# warnings.filterwarnings('ignore') would also hide model-fit warnings
# (e.g. non-convergence) that matter when fitting the propensity model.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Use the ggplot look for every figure drawn later in the notebook.
style.use('ggplot')

In [2]:
# Read lalonde.csv from the notebook's working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='lalonde.csv')
df.head(n=5)

Unnamed: 0,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
0,1,37,11,1,0,1,1,0.0,0.0,9930.046
1,1,22,9,0,1,0,1,0.0,0.0,3595.894
2,1,30,12,1,0,0,0,0.0,0.0,24909.45
3,1,27,11,1,0,0,1,0.0,0.0,7506.146
4,1,33,8,1,0,0,1,0.0,0.0,289.7899


In [3]:
# Mean of every column within each value of `treat`, as a first look at how
# the two groups differ before any adjustment.
df.groupby(by='treat').mean()

Unnamed: 0_level_0,age,educ,black,hispan,married,nodegree,re74,re75,re78
treat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,28.030303,10.235431,0.202797,0.142191,0.512821,0.596737,5619.236506,2466.484443,6984.169742
1,25.816216,10.345946,0.843243,0.059459,0.189189,0.708108,2095.573689,1532.055314,6349.14353


#### T-test

In [4]:
# Continuous covariates that are compared between groups with a t-test.
confounders_list = [
    'age',
    'educ',
    're74',
    're75',
]

In [5]:
# Two-sample t-test of each continuous covariate between the treat == 0 and
# treat == 1 rows, collected into a table with one row per covariate.

# Split once, outside the loop: the group membership does not depend on the covariate.
control_rows = df[df['treat'] == 0]
treated_rows = df[df['treat'] == 1]

# Maps covariate name -> scipy t-test result (statistic, p-value).
t_test_results = {}

for x in confounders_list:
    # Drop missing values of this covariate only. Dropping whole rows first
    # would discard valid observations whenever an unrelated column is missing.
    group1 = control_rows[x].dropna()
    group2 = treated_rows[x].dropna()

    # NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance
    # Student test). Consider equal_var=False (Welch) if the group variances
    # differ; kept at the default here so existing results are unchanged.
    t_test_results[x] = scipy.stats.ttest_ind(group1, group2)

# 'index' is the documented spelling: dictionary keys become row labels.
results = pd.DataFrame.from_dict(t_test_results, orient='index')
results.columns = ['statistic', 'p_value']
    

In [6]:
# Display the t statistic and p-value for each covariate in confounders_list.
results

Unnamed: 0,statistic,p_value
age,2.559013,0.01073633
educ,-0.477747,0.633001
re74,6.381464,3.464585e-10
re75,3.248551,0.001223444


#### Isolating treatment and confounders

In [7]:
# Select columns by name rather than by position, so a reordered CSV or an
# extra leading index column cannot silently change which columns enter the
# propensity model (a positional 1:-1 slice could then include `treat` itself).
covariate_columns = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75']

# Kept as a one-column DataFrame (double brackets), not a Series.
treat = df[['treat']]
# Every column except `treat` and `re78` (presumably the outcome; confirm).
confounders = df[covariate_columns]

#### Logistic regression

In [8]:
# Add an intercept column to the design matrix, then fit a logit model with
# the treatment indicator as the response and the covariates as predictors.
confounders = sm.add_constant(confounders)
propensity_model = sm.Logit(endog=treat, exog=confounders).fit()

Optimization terminated successfully.
         Current function value: 0.397267
         Iterations 7


In [9]:
# Show the fitted logit's coefficient table and overall fit statistics.
propensity_model.summary()

0,1,2,3
Dep. Variable:,treat,No. Observations:,614.0
Model:,Logit,Df Residuals:,605.0
Method:,MLE,Df Model:,8.0
Date:,"Sun, 12 Mar 2023",Pseudo R-squ.:,0.3508
Time:,20:01:58,Log-Likelihood:,-243.92
converged:,True,LL-Null:,-375.75
Covariance Type:,nonrobust,LLR p-value:,2.194e-52

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.7286,1.017,-4.649,0.000,-6.722,-2.735
age,0.0158,0.014,1.162,0.245,-0.011,0.042
educ,0.1613,0.065,2.477,0.013,0.034,0.289
black,3.0654,0.287,10.698,0.000,2.504,3.627
hispan,0.9836,0.426,2.311,0.021,0.149,1.818
married,-0.8321,0.290,-2.866,0.004,-1.401,-0.263
nodegree,0.7073,0.338,2.095,0.036,0.045,1.369
re74,-7.178e-05,2.87e-05,-2.497,0.013,-0.000,-1.54e-05
re75,5.345e-05,4.63e-05,1.153,0.249,-3.74e-05,0.000


#### predicting the propensity to be treated 

In [11]:
# Predicted probability of treatment for every row of the design matrix
# that the model was fitted on.
propensity_score = propensity_model.predict(confounders)
# Misspelled alias kept so that cells still referring to the old name keep working.
propensity_socre = propensity_score
propensity_score

0      0.638770
1      0.224634
2      0.678244
3      0.776324
4      0.701639
         ...   
609    0.123144
610    0.034560
611    0.183351
612    0.383032
613    0.089712
Length: 614, dtype: float64

In [13]:
#create df

propensity_df = treat.copy()
propensity_df['propensity_score'] = propensity_socre
propensity_df

Unnamed: 0,treat,propensity_score
0,1,0.638770
1,1,0.224634
2,1,0.678244
3,1,0.776324
4,1,0.701639
...,...,...
609,0,0.123144
610,0,0.034560
611,0,0.183351
612,0,0.383032
