In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import matplotlib

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

# Question 01: Propensity Score matching

### 01 naive analysis
Let's start by loading in our data

In [2]:
df_lalonde = pd.read_csv(r'lalonde.csv')
df_lalonde.head()

Unnamed: 0,id,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
0,NSW1,1,37,11,1,0,1,1,0.0,0.0,9930.046
1,NSW2,1,22,9,0,1,0,1,0.0,0.0,3595.894
2,NSW3,1,30,12,1,0,0,0,0.0,0.0,24909.45
3,NSW4,1,27,11,1,0,0,1,0.0,0.0,7506.146
4,NSW5,1,33,8,1,0,0,1,0.0,0.0,289.7899


Let's have a quick look at our data with describe(). We can already note a few aspects of interest
* We're looking mostly at young people, an average of 27 years with 1st and 3rd quartiles at 20 and 32 respectively
* About 60% of the subjects don't have a degree
* If we look at the mean revenue in 1974 and 75 we can see a big fall, which isn't too surprising as this was right during the oil crisis and the [73-75 recession](https://en.wikipedia.org/wiki/1973%E2%80%9375_recession). By 1976 this recession was over and GDP reached it's pre-1973 level, explaining the much higher salaries in 78

In [3]:
df_lalonde.describe()

Unnamed: 0,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,0.301303,27.363192,10.26873,0.395765,0.117264,0.415309,0.630293,4557.546569,2184.938207,6792.834483
std,0.459198,9.881187,2.628325,0.489413,0.321997,0.493177,0.483119,6477.964479,3295.679043,7470.730792
min,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,20.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,238.283425
50%,0.0,25.0,11.0,0.0,0.0,0.0,1.0,1042.33,601.5484,4759.0185
75%,1.0,32.0,12.0,1.0,0.0,1.0,1.0,7888.49825,3248.9875,10893.5925
max,1.0,55.0,18.0,1.0,1.0,1.0,1.0,35040.07,25142.24,60307.93


In [4]:
# Adding column with employment status and looking at unemployment
df_lalonde['em74'] = df_lalonde['re74']
df_lalonde.em74 = df_lalonde.em74.apply(lambda x: x if x == 0 else 1)
df_lalonde.em74.plot(kind='hist', bins=2)

<matplotlib.axes._subplots.AxesSubplot at 0x7f94a7397828>

In [5]:
df_treat = df_lalonde[df_lalonde['treat']==1]
df_notreat = df_lalonde[df_lalonde['treat']==0]

df_new = pd.DataFrame(index=df_treat.describe().index, columns=['treat re78','notreat re78'])
# df_new = df_treat.re78.describe()
df_new.loc[:, 'treat re78'] = df_treat.describe()['re78'].values
df_new.loc[:, 'notreat re78'] = df_notreat.describe()['re78'].values
df_new

Unnamed: 0,treat re78,notreat re78
count,185.0,429.0
mean,6349.14353,6984.169742
std,7867.402218,7294.161791
min,0.0,0.0
25%,485.2298,220.1813
50%,4232.309,4975.505
75%,9642.999,11688.82
max,60307.93,25564.67


In [6]:
fig, axes = plt.subplots(1, 2)
df_treat.re78.plot(kind='hist', ax=axes[0], title='Histogram', label='treated', alpha=0.5, figsize=(16,8))
df_notreat.re78.plot(kind='hist', ax=axes[0], label='untreated', alpha=0.5)
axes[0].legend()

df_treat.re78.plot(kind='hist', cumulative=True, ax=axes[1], title='Cumulative histogram', label='treated', alpha=0.5)
df_notreat.re78.plot(kind='hist', cumulative=True, ax=axes[1], label='untreated', alpha=0.5)

NameError: name 'plt' is not defined

In [None]:
fig, axes = plt.subplots(1, 2)
df_treat[df_treat.re78 != 0].re78.plot(kind='hist', ax=axes[0], title='Histogram', label='treated', alpha=0.5, figsize=(16,8))
df_notreat[df_notreat.re78 != 0].re78.plot(kind='hist', ax=axes[0], label='untreated', alpha=0.5)
axes[0].legend()

df_treat[df_treat.re78 != 0].re78.plot(kind='hist', cumulative=True, ax=axes[1], title='Cumulative histogram', label='treated', alpha=0.5)
df_notreat[df_notreat.re78 != 0].re78.plot(kind='hist', cumulative=True, ax=axes[1], label='untreated', alpha=0.5)

Just looking at the histograms, means and medians are not very helpful to draw a clear cut answer of whether the treatment has an effect or not. We have a lot more samples for the non-treated population too, which makes it harder to compare simple histograms

Given the histograms, the revenue is clearly not normally distributed. Furthermore, the samples are of different sizes, so we would have to use the Mann-Whitney U test to try and estimate if the treated and untreated populations have a different revenue. As we can see from the returned p value close to 1, we cannot discard the null hypothesis that these two samples are drawn from the same population. If we follow the naive reasoning we may then conclude that these populations are in fact the same and that the treatment had no effect

In [None]:
stat, pval = scipy.stats.mannwhitneyu(df_treat.re78.values, df_treat.re78.values, alternative='two-sided')
print(stat,'\n',pval)

### 02 A closer look at the data
We're now interested in distributions of variables within our two sets. For a quick look over our set let's plot a heatmap of correlation between all our variables. What we're really interested in is whether a subject is treated or not is correlated to any other of the subject's features

In [None]:
sns.heatmap(df_lalonde.corr().values, center=0, 
            xticklabels=df_lalonde.columns[1:], yticklabels=df_lalonde.columns[1:])

We can see some interesting trends if we start correlating the different characteristics within the groups.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# todo: choose better color scheme

sns.heatmap(df_treat.corr(method='spearman').values, ax=axes[0], center=0, 
            xticklabels=df_treat.columns[1:], yticklabels=df_lalonde.columns[1:])
sns.heatmap(df_notreat.corr(method='spearman').values, ax=axes[1], center=0, 
            xticklabels=df_notreat.columns[1:], yticklabels=df_lalonde.columns[1:])

In [None]:
fig, axes = plt.subplots(1, 3)
df_treat.black.plot(kind='hist', bins=2, ax=axes[0], alpha=0.5, label='treated', figsize=(16, 8))
df_notreat.black.plot(kind='hist', bins=2, ax=axes[0], alpha=0.5, label='non-treated')
axes[0].legend()
df_treat.married.plot(kind='hist', bins=2, ax=axes[1], alpha=0.5, label='treated')
df_notreat.married.plot(kind='hist', bins=2, ax=axes[1], alpha=0.5, label='non-treated')

df_treat.hispan.plot(kind='hist', bins=2, ax=axes[2], alpha=0.5, label='treated')
df_notreat.hispan.plot(kind='hist', bins=2, ax=axes[2], alpha=0.5, label='non-treated')

### 03 A propensity score model
To account for any variations in feature between treated and control sets, we'd now like to estimate the probability of treatment of a subject given their features.

In [None]:
df_lalonde.columns
features = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree']
x = pd.get_dummies(df_lalonde[features])
y = df_lalonde['re78']

logistic = LogisticRegression()
# logistic.fit(x,y)