In [73]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import norm 

In [68]:
FNAME = '../data/bwght.csv'
df = pd.read_csv(FNAME)

#print summary info
print df.mean()
print '\n'
print df.var()

faminc       29.026657
cigtax       19.552954
cigprice    130.559006
bwght       118.699568
fatheduc     13.186242
motheduc     12.935833
parity        1.632565
male          0.520893
white         0.784582
cigs          2.087176
lbwght        4.760031
bwghtlbs      7.418723
packs         0.104359
lfaminc       3.071271
dtype: float64


faminc      351.160789
cigtax       60.771347
cigprice    104.949478
bwght       414.283864
fatheduc      7.540432
motheduc      5.648838
parity        0.799285
male          0.249743
white         0.169135
cigs         35.673001
lbwght        0.036352
bwghtlbs      1.618296
packs         0.089183
lfaminc       0.842842
dtype: float64


In [69]:
#part a
df['const'] = 1
df['smokes'] = (df['cigs'] > 0).astype(int)

#print percentage of mothers who smoked during pregnancy
print df['smokes'].mean()

0.152737752161


In [70]:
#simple OLS
linear_prob = smf.OLS(endog= df['smokes'],
                     exog=df[['const','motheduc','white','lfaminc']],
                     missing='drop')
linear_results = linear_prob.fit()
print linear_results.summary()

                            OLS Regression Results                            
Dep. Variable:                 smokes   R-squared:                       0.063
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     31.05
Date:                Fri, 28 Sep 2018   Prob (F-statistic):           2.02e-19
Time:                        16:55:43   Log-Likelihood:                -505.23
No. Observations:                1387   AIC:                             1018.
Df Residuals:                    1383   BIC:                             1039.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6255      0.054     11.638      0.0

In [71]:
#effect of extra year of educ
print linear_results.params[0]

0.6255032733509698


In [47]:
#simple probit
probit_mod = smf.Probit(endog= df['smokes'],
                     exog=df[['const','motheduc','white','lfaminc']],
                     missing='drop')

probit_results = probit_mod.fit()
print probit_results.summary()

Optimization terminated successfully.
         Current function value: 0.394210
         Iterations 6
                          Probit Regression Results                           
Dep. Variable:                 smokes   No. Observations:                 1387
Model:                         Probit   Df Residuals:                     1383
Method:                           MLE   Df Model:                            3
Date:                Fri, 28 Sep 2018   Pseudo R-squ.:                 0.07812
Time:                        16:36:21   Log-Likelihood:                -546.77
converged:                       True   LL-Null:                       -593.11
                                        LLR p-value:                 5.846e-20
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1263      0.250      4.497      0.000       0.635       1.617
motheduc      -0.1451      0.

In [79]:
beta = probit_results.params
mean_x = np.array(df[['const','motheduc','white','lfaminc']].mean())
x_beta = np.matmul(mean_x.transpose(),beta)

#discrete impact of an extra year
mean_x_plus = mean_x
#increase x by a unit
mean_x_plus[1] = mean_x_plus[1] + 1
x_beta_plus = np.matmul(mean_x_plus.transpose(),beta)

#discrete formula
#G(x'*beta) - G(x*beta)
print norm(0,1).cdf(x_beta_plus) - norm(0,1).cdf(x_beta)


#cts formula
#g(x*beta)*betaj
print norm(0,1).pdf(x_beta) * beta[1] #this is the same as the table

-0.028635040736870176
-0.03111541986179206


In [81]:
probit_margeff = probit_results.get_margeff(at='mean')
print probit_margeff.summary()

#https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.CountResults.get_margeff.html

       Probit Marginal Effects       
Dep. Variable:                 smokes
Method:                          dydx
At:                              mean
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
motheduc      -0.0311      0.004     -7.240      0.000      -0.040      -0.023
white          0.0407      0.024      1.726      0.084      -0.006       0.087
lfaminc       -0.0358      0.011     -3.348      0.001      -0.057      -0.015


In [52]:
#logit
logit_mod = smf.Logit(endog= df['smokes'],
                     exog=df[['const','motheduc','white','lfaminc']],
                     missing='drop')
logit_results = logit_mod.fit()
print logit_results.summary()

Optimization terminated successfully.
         Current function value: 0.395626
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 smokes   No. Observations:                 1387
Model:                          Logit   Df Residuals:                     1383
Method:                           MLE   Df Model:                            3
Date:                Fri, 28 Sep 2018   Pseudo R-squ.:                 0.07481
Time:                        16:37:05   Log-Likelihood:                -548.73
converged:                       True   LL-Null:                       -593.11
                                        LLR p-value:                 4.078e-19
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0125      0.447      4.498      0.000       1.136       2.889
motheduc      -0.2519      0.

In [53]:
logit_margeff = logit_results.get_margeff(at='mean')
print logit_margeff.summary()

        Logit Marginal Effects       
Dep. Variable:                 smokes
Method:                          dydx
At:                              mean
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
motheduc      -0.0288      0.004     -7.178      0.000      -0.037      -0.021
white          0.0393      0.023      1.719      0.086      -0.006       0.084
lfaminc       -0.0339      0.010     -3.441      0.001      -0.053      -0.015
