In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import norm 

In [2]:
# Load the dataset; the path is relative to the directory the notebook runs from.
FNAME = '../data/bwght.csv'
df = pd.read_csv(FNAME)

# Print the column-wise sample means as a quick summary of the data.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(df.mean())

faminc       29.026657
cigtax       19.552954
cigprice    130.559006
bwght       118.699568
fatheduc     13.186242
motheduc     12.935833
parity        1.632565
male          0.520893
white         0.784582
cigs          2.087176
lbwght        4.760031
bwghtlbs      7.418723
packs         0.104359
lfaminc       3.071271
dtype: float64


In [3]:
############# part a) #############
# Constant column; it is passed as the intercept regressor to the models below.
df['const'] = 1
# 0/1 indicator: 1 when cigs > 0, otherwise 0.
df['smokes'] = (df['cigs'] > 0).astype(int)

# Share of mothers who smoked during pregnancy.
# NOTE: the mean of a 0/1 indicator is a fraction in [0, 1]; it is not scaled by 100,
# even though the label says "Percentage".
print('Percentage smokers: %s' % df['smokes'].mean())

Percentage smokers: 0.152737752161


In [4]:
############# part b) i) #############
# Linear probability model: OLS of the 0/1 smokes indicator on a constant,
# motheduc, white and lfaminc. missing='drop' discards observations with NaNs.
linear_prob = sm.OLS(endog=df['smokes'],
                     exog=df[['const', 'motheduc', 'white', 'lfaminc']],
                     missing='drop')
linear_results = linear_prob.fit()

# print(...) with a single argument works on both Python 2 and Python 3.
print(linear_results.summary())

                            OLS Regression Results                            
Dep. Variable:                 smokes   R-squared:                       0.063
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     31.05
Date:                Thu, 04 Oct 2018   Prob (F-statistic):           2.02e-19
Time:                        09:02:16   Log-Likelihood:                -505.23
No. Observations:                1387   AIC:                             1018.
Df Residuals:                    1383   BIC:                             1039.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6255      0.054     11.638      0.0

In [5]:
# Effect of an extra year of education (motheduc) on the fitted probability.
# The coefficient is looked up by its label: params is a Series indexed by the
# regressor names, and integer keys on a label-indexed Series are deprecated in
# pandas. 'motheduc' is the regressor at position 1 of the exog column list.
print('Marginal Year of Education: %s' % linear_results.params['motheduc'])

Marginal Year of Education: -0.02932066021464503


In [6]:
############# part b) ii) #############

# Probit model of the 0/1 smokes indicator on the same regressors as the
# linear probability model. missing='drop' discards observations with NaNs.
probit_mod = sm.Probit(endog=df['smokes'],
                       exog=df[['const', 'motheduc', 'white', 'lfaminc']],
                       missing='drop')

# fit() estimates by maximum likelihood and prints the optimizer status.
probit_results = probit_mod.fit()

# print(...) with a single argument works on both Python 2 and Python 3.
print(probit_results.summary())

Optimization terminated successfully.
         Current function value: 0.394210
         Iterations 6
                          Probit Regression Results                           
Dep. Variable:                 smokes   No. Observations:                 1387
Model:                         Probit   Df Residuals:                     1383
Method:                           MLE   Df Model:                            3
Date:                Thu, 04 Oct 2018   Pseudo R-squ.:                 0.07812
Time:                        09:02:46   Log-Likelihood:                -546.77
converged:                       True   LL-Null:                       -593.11
                                        LLR p-value:                 5.846e-20
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1263      0.250      4.497      0.000       0.635       1.617
motheduc      -0.1451      0.

In [7]:
############# part b) ii) A)-B) #############
# Hand-computed effect of one extra year of education (motheduc) in the probit
# model, evaluated at the sample means of the regressors.

beta = probit_results.params
# Sample means of the regressors, in the same column order used to fit the model.
mean_x = np.array(df[['const', 'motheduc', 'white', 'lfaminc']].mean())
# Linear index x'beta at the means (a 1-D by 1-D product, i.e. a scalar).
x_beta = np.matmul(mean_x, beta)

#discrete impact of an extra year
# Copy before modifying: a plain assignment would alias mean_x, so the
# increment below would silently change mean_x as well.
mean_x_plus = mean_x.copy()
mean_x_plus[1] = mean_x_plus[1] + 1 #increase motheduc (position 1) by a unit
x_beta_plus = np.matmul(mean_x_plus, beta)

#discrete formula
#G(x'*beta) - G(x*beta), with G the standard normal CDF
print('Discrete Formula: %s' % (norm(0, 1).cdf(x_beta_plus) - norm(0, 1).cdf(x_beta)))

#cts formula
#g(x*beta)*beta_j, with g the standard normal PDF
# The coefficient is selected by label; integer keys on a label-indexed Series
# are deprecated in pandas.
print('Continuous Formula: %s' % (norm(0, 1).pdf(x_beta) * beta['motheduc']))

Discrete Formula: -0.028635040736870176
Continuous Formula: -0.03111541986179206


In [8]:
############# part b) ii) C) #############

# Marginal effects evaluated at the sample means of the regressors (at='mean').
# dummy=True asks statsmodels to treat binary regressors as a 0 -> 1 change
# instead of taking a derivative.
probit_margeff = probit_results.get_margeff(at='mean', dummy=True)
# print(...) with a single argument works on both Python 2 and Python 3.
print(probit_margeff.summary())

##the marginal effect of education on the probability of smoking
#is -.0311 according to the output (this matches the continuous formula).
#Since education is discrete, it should be -.0286 (the discrete formula).

#source:
#https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.CountResults.get_margeff.html

       Probit Marginal Effects       
Dep. Variable:                 smokes
Method:                          dydx
At:                              mean
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
motheduc      -0.0311      0.004     -7.240      0.000      -0.040      -0.023
white          0.0383      0.021      1.839      0.066      -0.003       0.079
lfaminc       -0.0358      0.011     -3.348      0.001      -0.057      -0.015


In [9]:
############# part b) iii) #############

# Logit model of the 0/1 smokes indicator on the same regressors as the
# linear probability and probit models. missing='drop' discards observations with NaNs.
logit_mod = sm.Logit(endog=df['smokes'],
                     exog=df[['const', 'motheduc', 'white', 'lfaminc']],
                     missing='drop')
# fit() estimates by maximum likelihood and prints the optimizer status.
logit_results = logit_mod.fit()

# print(...) with a single argument works on both Python 2 and Python 3.
print(logit_results.summary())

Optimization terminated successfully.
         Current function value: 0.395626
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 smokes   No. Observations:                 1387
Model:                          Logit   Df Residuals:                     1383
Method:                           MLE   Df Model:                            3
Date:                Thu, 04 Oct 2018   Pseudo R-squ.:                 0.07481
Time:                        09:02:52   Log-Likelihood:                -548.73
converged:                       True   LL-Null:                       -593.11
                                        LLR p-value:                 4.078e-19
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0125      0.447      4.498      0.000       1.136       2.889
motheduc      -0.2519      0.

In [10]:
# Logit marginal effects evaluated at the sample means of the regressors.
# dummy=True asks statsmodels to treat binary regressors as a 0 -> 1 change.
logit_margeff = logit_results.get_margeff(at='mean', dummy=True)
# print(...) with a single argument works on both Python 2 and Python 3.
print(logit_margeff.summary())

#the marginal effect of an extra year of education
#on the probability of smoking is -.0288

        Logit Marginal Effects       
Dep. Variable:                 smokes
Method:                          dydx
At:                              mean
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
motheduc      -0.0288      0.004     -7.178      0.000      -0.037      -0.021
white          0.0366      0.020      1.849      0.065      -0.002       0.075
lfaminc       -0.0339      0.010     -3.441      0.001      -0.053      -0.015


In [11]:
############# part b) iv) #############

#The marginal effects of education in all 3 models are roughly the same:
#-.0293 in the LPM, -.0311 in the Probit and -.0288 in the Logit.
#They are not exactly the same, but they are close - as expected.