In [1]:
import sys                         # import operating system functions
import numpy as np                  # pandas uses numpy, and sometimes want to use numpy within pandas
import pandas as pd                    # data package, redundant since already did
import matplotlib.pyplot as plt               # graphics package, just the part we mainly use
import seaborn as sns               # makes matplotlib prettier without issuing a single command!
import datetime as dt                  # date and time module, often need to use if date is a field in your data

# check versions (overkill, but why not?)
print('Python version:', sys.version)
print('Pandas version: ', pd.__version__)
# print ('Matplotlib version:',matplotlib.__version__) #command not in the pyplot piece of matplotlib, would need to import entire package
print('Today: ', dt.date.today())
print(plt.style.available)
plt.style.use('seaborn-whitegrid')
import os
os.getcwd()

Python version: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
Pandas version:  1.0.1
Today:  2020-04-15
['bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark-palette', 'seaborn-dark', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'seaborn', 'Solarize_Light2', 'tableau-colorblind10', '_classic_test']


'C:\\Users\\tenis\\OneDrive\\Desktop\\Python_projects\\Econometrics'

### Linear Probability Models
> Kenneth Flamm

> Spring 2020

#### Linear probability models
* Fell out of fashion in the 1980s and 1990s
    
* Back in fashion among economists and econometricians
    
    * Can be viewed as approximation to logit or probit (which generally yield nearly identical results)
   [von Hippel article](https://statisticalhorizons.com/linear-vs-logistic)
    
    * should give similar results for probabilities in .2 to .8 range
    
    * generally does give similar empirical results to logit or probit for non-rare events
    
    * huge gains in ease of use
    
    * not obviously worse as approximation to unknown distribution than logit or probit parametric assumptions

#### Good vehicle for taking about heteroskedasticity

* linear probability model is inherently heteroskedastic
Can be good when you have a complex dataset with endogeneity and is easier to use and interpret 

* LPM: 
* probability of event occurring is approximately linear function of x's
    * $P(y=1 | x) = \beta_0 + \sum_i \beta_i x_i$ , so
    * $ E(y | x) = \beta_0 + \sum_i \beta_i x_i$
    * and if we change x_i while holding all other factors fixed, then
    * $\Delta P (y=1 | x) = \beta_i \Delta x_i$
        * can interpret $\beta_i $ as marginal effect of 1 unit change in $x_i$ on probability of event
        
* a binary random variable y=1 with P(y=1|x) has variance  = $ P(y=1|x) * (1-P(y=1|x))  $
    * This is obviously not constant, and will vary with x.
    * Heteroskedasticity guaranteed!
    
* See appropriate sections in Wooldridge, chaps. 7 & 8.

Let's take another look at that apple data.
  

In [2]:
#use apple.data
apdf=pd.read_stata('http://fmwww.bc.edu/ec-p/data/wooldridge/apple.dta') 
apdf.tail()

Unnamed: 0,id,educ,date,state,regprc,ecoprc,inseason,hhsize,male,faminc,age,reglbs,ecolbs,numlt5,num5_17,num18_64,numgt64
655,13892,14,20298,MD,0.59,0.59,0,5,0,65,37,1.333333,1.333333,1,2,2,0
656,13893,16,20398,OH,0.59,0.59,0,4,0,65,47,0.0,2.0,0,2,2,0
657,13908,16,20398,IN,0.89,1.09,0,2,0,75,51,1.0,0.0,0,0,2,0
658,13916,12,20298,NY,0.59,0.59,0,1,0,15,45,0.0,2.666667,0,0,1,0
659,13921,18,20798,MA,1.19,1.39,0,3,1,25,24,2.0,0.0,0,0,3,0


* Lots of zeros in apple consumption data
* a lot of mothers apparently didn't tell their children "an apple a day..."

* Let's code up a y-variable for buying any eco apples at all.


In [3]:
apdf.loc[:,'buy_eco']=0
apdf.loc[(apdf.ecolbs > 0),'buy_eco']=1
apdf.loc[:,'buy_reg']=0
apdf.loc[(apdf.reglbs > 0),'buy_reg']=1
apdf[['buy_eco','buy_reg']].describe()

Unnamed: 0,buy_eco,buy_reg
count,660.0,660.0
mean,0.624242,0.484848
std,0.484685,0.500149
min,0.0,0.0
25%,0.0,0.0
50%,1.0,0.0
75%,1.0,1.0
max,1.0,1.0


> **38% of households wouldn't buy any eco apples given apple prices they were informed of!**

> **52% of households wouldn't buy any reg apples at these prices!**

> model probability of buying an eco apple:

In [4]:
#linear probability model
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import statsmodels.api as sm
from statsmodels.compat import lzip # useful for printing out complicated test statistics, see below
mod=smf.ols('buy_eco ~ ecoprc + regprc + faminc + hhsize + educ + age',apdf)
res=mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,buy_eco,R-squared:,0.11
Model:,OLS,Adj. R-squared:,0.102
Method:,Least Squares,F-statistic:,13.43
Date:,"Wed, 15 Apr 2020",Prob (F-statistic):,2.18e-14
Time:,14:59:23,Log-Likelihood:,-419.6
No. Observations:,660,AIC:,853.2
Df Residuals:,653,BIC:,884.6
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4237,0.165,2.568,0.010,0.100,0.748
ecoprc,-0.8026,0.109,-7.336,0.000,-1.017,-0.588
regprc,0.7193,0.132,5.464,0.000,0.461,0.978
faminc,0.0006,0.001,1.042,0.298,-0.000,0.002
hhsize,0.0238,0.013,1.902,0.058,-0.001,0.048
educ,0.0248,0.008,2.960,0.003,0.008,0.041
age,-0.0005,0.001,-0.401,0.689,-0.003,0.002

0,1,2,3
Omnibus:,4015.36,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,69.344
Skew:,-0.411,Prob(JB):,8.75e-16
Kurtosis:,1.641,Cond. No.,724.0


* Let's test for heteroskedasticity using the Breusch-Pagan test and White test
    * discussed in Wooldridge, chap. 8.

In [5]:
sms.het_breuschpagan?

[1;31mSignature:[0m [0msms[0m[1;33m.[0m[0mhet_breuschpagan[0m[1;33m([0m[0mresid[0m[1;33m,[0m [0mexog_het[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Breusch-Pagan Lagrange Multiplier test for heteroscedasticity

The tests the hypothesis that the residual variance does not depend on
the variables in x in the form

.. :math: \sigma_i = \sigma * f(\alpha_0 + \alpha z_i)

Homoscedasticity implies that :math:`\alpha=0`.

Parameters
----------
resid : array_like
    For the Breusch-Pagan test, this should be the residual of a
    regression. If an array is given in exog, then the residuals are
    calculated by the an OLS regression or resid on exog. In this case
    resid should contain the dependent variable. Exog can be the same as x.
exog_het : array_like
    This contains variables suspected of being related to
    heteroscedasticity in resid.

Returns
-------
lm : float
    lagrange multiplier statistic
lm_pvalue :float
    p-value of lagrange multipl

In [6]:
# Breusch-Pagan test
# null hypothesis is homoskedasticity
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
test = sms.het_breuschpagan(res.resid,res.model.exog)
lzip(name, test)

[('Lagrange multiplier statistic', 41.98900145126039),
 ('p-value', 1.8479883523450348e-07),
 ('f-value', 7.394371624469461),
 ('f p-value', 1.137751543075782e-07)]

We reject the null BECAUSE the p-value is super low .

In [7]:
sms.het_white?

[1;31mSignature:[0m [0msms[0m[1;33m.[0m[0mhet_white[0m[1;33m([0m[0mresid[0m[1;33m,[0m [0mexog[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
White's Lagrange Multiplier Test for Heteroscedasticity.

Parameters
----------
resid : array_like
    The residuals. The squared residuals are used as the endogenous
    variable.
exog : array_like
    The explanatory variables for the variance. Squares and interaction
    terms are automatically included in the auxiliary regression.

Returns
-------
lm : float
    The lagrange multiplier statistic.
lm_pvalue :float
    The p-value of lagrange multiplier test.
fvalue : float
    The f-statistic of the hypothesis that the error variance does not
    depend on x. This is an alternative test variant not the original
    LM test.
f_pvalue : float
    The p-value for the f-statistic.

Notes
-----
Assumes x contains constant (for counting dof).

question: does f-statistic make sense? constant ?

References
----------
Gre

In [8]:
# White test
# null hypothesis is homoskedasticity
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
test = sms.het_white(res.resid,res.model.exog)
lzip(name, test)

[('Lagrange multiplier statistic', 79.88554888540143),
 ('p-value', 3.930216500851468e-07),
 ('f-value', 3.2233528834391363),
 ('f p-value', 1.257704163461744e-07)]

In [9]:
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
test = sms.het_white(res.resid,res.model.exog)
lzip(name, test)

[('Lagrange multiplier statistic', 79.88554888540143),
 ('p-value', 3.930216500851468e-07),
 ('f-value', 3.2233528834391363),
 ('f p-value', 1.257704163461744e-07)]

* we reject null (homoskedasticity) decisively!

##### Solution #1: robust standard errors

* What are we now assuming?

    * Different observations have different variances
    
    * Disturbance terms uncorrelated across observations still assumed though
    
* What do we lose?

    * Efficiency: no longer minimum variance linear estimator
   

In [10]:
#linear probability model
# with robust standard errors
mod_r=smf.ols('buy_eco ~ ecoprc + regprc + faminc + hhsize + educ + age',apdf)
res_r=mod_r.fit(cov_type='HC3')
res_r.summary()

0,1,2,3
Dep. Variable:,buy_eco,R-squared:,0.11
Model:,OLS,Adj. R-squared:,0.102
Method:,Least Squares,F-statistic:,14.75
Date:,"Wed, 15 Apr 2020",Prob (F-statistic):,7.63e-16
Time:,15:14:44,Log-Likelihood:,-419.6
No. Observations:,660,AIC:,853.2
Df Residuals:,653,BIC:,884.6
Df Model:,6,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.4237,0.169,2.506,0.012,0.092,0.755
ecoprc,-0.8026,0.106,-7.551,0.000,-1.011,-0.594
regprc,0.7193,0.131,5.492,0.000,0.463,0.976
faminc,0.0006,0.001,1.033,0.302,-0.000,0.002
hhsize,0.0238,0.013,1.891,0.059,-0.001,0.049
educ,0.0248,0.009,2.907,0.004,0.008,0.041
age,-0.0005,0.001,-0.393,0.694,-0.003,0.002

0,1,2,3
Omnibus:,4015.36,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,69.344
Skew:,-0.411,Prob(JB):,8.75e-16
Kurtosis:,1.641,Cond. No.,724.0


#### Note that coefficient estimates identical to OLS, only SEs have changed

* Not that different in this case, but can be bigger or smaller, in general

* Biggest problem with LPM is when probability p is close to zero or one

* In this case, linear model allows p to be negative or > 1, which is an issue for predictions

* Do we have this problem?

In [11]:
res_r.fittedvalues.describe()

count    660.000000
mean       0.624242
std        0.160623
min        0.240317
25%        0.487370
50%        0.627557
75%        0.754041
max        1.070860
dtype: float64

> No negative values!

> But we do have at least one observation with predicted prob > 0

In [12]:
res_r.fittedvalues[(res_r.fittedvalues>1)].head(20)

167    1.070860
493    1.054372
dtype: float64

> So we have two such predictions that are out of bounds....

#### Solution #2: weighted least squares, AKA Feasible GLS

* Idea: we estimate size of variance, observation by observation, using a statistical model

    * See Wooldridge, chap. 8.

* If predicted variance conditional on x is $h \sigma^2$

* transform all variables (including constant) by multiplying by $\frac{1}{\sqrt{h}}$
    * resulting transformed model has homoskedastic disturbances
    * Can then estimate model using OLS
    * satisfies conditions guaranteeing minimum variance unbiased linear estimator
    * statsmodels will do all this automatically using **WLS** estimation command
        * you pass it a vector of weights  $\frac{1}{{h}}$ i.e., the inverse of the variance, not the sd
        * WLS requires that the weights are proportional to the inverse of the error variance

* ***But,*** 
    * if function used to estimate h is wrong
        * no guarantee estimate will be more efficient than OLS, could be less
        * disturbance term will still be heteroskedastic, inference incorrect
        * but you could then use robust standard errors to hedge your bets!
    * even if the function used to estimate h is right
        * all estimated standard errors, statistical inference is only asymptotically correct
        * because you estimated heteroskedasticity (not known with certainty)
            * you are relying on consistency of variance estimate
            * to deliver estimated variance close to true value, requires large sample
        
        

##### Applying this approach to LPM:

* variance of y (probability) is just p * (1-p) = $ \hat{y} * (1 - \hat{y})$
* so $ h = \hat{y} * (1 - \hat{y})$

* Steps:
    1. get $\hat{y}$
    2. replace $\hat{y}$ with .99 if > 1, .01 if < 0
    3. estimate model using WLS after passing weights $\frac{1}{h}$
* Example:

In [13]:
apdf.loc[:,'p_hat']=res_r.fittedvalues

In [14]:
apdf.p_hat.describe()

count    660.000000
mean       0.624242
std        0.160623
min        0.240317
25%        0.487370
50%        0.627557
75%        0.754041
max        1.070860
Name: p_hat, dtype: float64

In [15]:
apdf.loc[(apdf.p_hat>1),'p_hat']=.99
apdf.loc[:,'w']=1/(apdf.p_hat*(1-apdf.p_hat))
apdf[['p_hat','w']].describe()

Unnamed: 0,p_hat,w
count,660.0,660.0
mean,0.624022,5.45989
std,0.160069,5.687243
min,0.240317,4.0
25%,0.48737,4.088832
50%,0.627557,4.364407
75%,0.754041,5.407975
max,0.99,101.010101


In [16]:
mod_w=sm.WLS.from_formula('buy_eco ~ ecoprc + regprc + faminc + hhsize + educ + age',
              apdf,weights=apdf.w)
res_w=mod_w.fit()
res_w.summary()

0,1,2,3
Dep. Variable:,buy_eco,R-squared:,0.099
Model:,WLS,Adj. R-squared:,0.091
Method:,Least Squares,F-statistic:,12.02
Date:,"Wed, 15 Apr 2020",Prob (F-statistic):,7.81e-13
Time:,15:26:35,Log-Likelihood:,-439.02
No. Observations:,660,AIC:,892.0
Df Residuals:,653,BIC:,923.5
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5559,0.160,3.481,0.001,0.242,0.869
ecoprc,-0.8115,0.112,-7.274,0.000,-1.031,-0.592
regprc,0.8037,0.136,5.909,0.000,0.537,1.071
faminc,-0.0009,0.000,-1.918,0.056,-0.002,2.08e-05
hhsize,0.0164,0.012,1.419,0.156,-0.006,0.039
educ,0.0151,0.008,1.913,0.056,-0.000,0.031
age,0.0003,0.001,0.254,0.799,-0.002,0.003

0,1,2,3
Omnibus:,153.597,Durbin-Watson:,2.104
Prob(Omnibus):,0.0,Jarque-Bera (JB):,468.94
Skew:,-1.113,Prob(JB):,1.48e-102
Kurtosis:,6.477,Cond. No.,837.0


In [17]:
mod_wr=sm.WLS.from_formula('buy_eco ~ ecoprc + regprc + faminc + hhsize + educ + age',
              apdf,weights=apdf.w)
res_wr=mod_wr.fit(cov_type='HC3')
res_wr.summary()

0,1,2,3
Dep. Variable:,buy_eco,R-squared:,0.099
Model:,WLS,Adj. R-squared:,0.091
Method:,Least Squares,F-statistic:,11.69
Date:,"Wed, 15 Apr 2020",Prob (F-statistic):,1.85e-12
Time:,15:28:52,Log-Likelihood:,-439.02
No. Observations:,660,AIC:,892.0
Df Residuals:,653,BIC:,923.5
Df Model:,6,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5559,0.201,2.761,0.006,0.161,0.951
ecoprc,-0.8115,0.114,-7.140,0.000,-1.034,-0.589
regprc,0.8037,0.136,5.910,0.000,0.537,1.070
faminc,-0.0009,0.002,-0.578,0.563,-0.004,0.002
hhsize,0.0164,0.013,1.245,0.213,-0.009,0.042
educ,0.0151,0.012,1.208,0.227,-0.009,0.039
age,0.0003,0.001,0.239,0.811,-0.002,0.003

0,1,2,3
Omnibus:,153.597,Durbin-Watson:,2.104
Prob(Omnibus):,0.0,Jarque-Bera (JB):,468.94
Skew:,-1.113,Prob(JB):,1.48e-102
Kurtosis:,6.477,Cond. No.,837.0


#### Compare estimated coefficients

In [18]:
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
beta = np.vstack([[res.params], [res_r.params], [res_w.params],[res_wr.params]])
beta = np.round(beta,5)
colnames = ['const', 'ecoprc','regprc','faminc','hhsize','educ','age']
rownames = ['OLS', 'OLS_rob', 'WLS','WLS_HC3']
tabl = SimpleTable(beta, colnames, rownames, txt_fmt=default_txt_fmt)
tabl

0,1,2,3,4,5,6,7
,const,ecoprc,regprc,faminc,hhsize,educ,age
OLS,0.42369,-0.80262,0.71927,0.00055,0.02382,0.02478,-0.0005
OLS_rob,0.42369,-0.80262,0.71927,0.00055,0.02382,0.02478,-0.0005
WLS,0.55591,-0.81151,0.80373,-0.00088,0.01635,0.01505,0.00033
WLS_HC3,0.55591,-0.81151,0.80373,-0.00088,0.01635,0.01505,0.00033


##### Compare various standard errors

In [19]:
se = np.vstack([[res.bse], [res_r.bse], [res_w.bse],[res_wr.bse],[res.HC0_se], 
                [res.HC1_se], [res.HC2_se], [res.HC3_se]])
se = np.round(se,5)
colnames = ['const', 'ecoprc','regprc','faminc','hhsize','educ','age']
rownames = ['OLS', 'OLS_rob', 'WLS','WLS_HC3','OLS_HC0', 'OLS_HC1', 'OLS_HC2', 'OLS_HC3']
tabl = SimpleTable(se, colnames, rownames, txt_fmt=default_txt_fmt)
print(tabl)

         const   ecoprc  regprc  faminc  hhsize   educ    age  
---------------------------------------------------------------
OLS     0.16497  0.1094 0.13164 0.00053 0.01253 0.00837 0.00125
OLS_rob 0.16904 0.10629 0.13096 0.00053  0.0126 0.00852 0.00128
WLS     0.15968 0.11156 0.13601 0.00046 0.01153 0.00787  0.0013
WLS_HC3 0.20134 0.11365   0.136 0.00152 0.01313 0.01246 0.00138
OLS_HC0 0.16686 0.10511 0.12954 0.00052  0.0124 0.00841 0.00126
OLS_HC1 0.16775 0.10567 0.13023 0.00052 0.01247 0.00846 0.00127
OLS_HC2 0.16795 0.10569 0.13025 0.00053  0.0125 0.00847 0.00127
OLS_HC3 0.16904 0.10629 0.13096 0.00053  0.0126 0.00852 0.00128
---------------------------------------------------------------


#### In class Exercise: Define variable for buy reg apple, estimate LPM using OLS with robust se, and WLS with robust se


OLS with robust standard errors: 

In [21]:
mod=smf.ols('buy_reg ~ ecoprc + regprc + faminc + hhsize + educ + age',apdf)
res=mod.fit(cov_type='HC3')
res.summary()

0,1,2,3
Dep. Variable:,buy_reg,R-squared:,0.051
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,5.89
Date:,"Wed, 15 Apr 2020",Prob (F-statistic):,5.33e-06
Time:,15:42:59,Log-Likelihood:,-461.55
No. Observations:,660,AIC:,937.1
Df Residuals:,653,BIC:,968.5
Df Model:,6,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5775,0.178,3.252,0.001,0.229,0.925
ecoprc,0.6386,0.116,5.508,0.000,0.411,0.866
regprc,-0.7118,0.139,-5.115,0.000,-0.985,-0.439
faminc,-0.0007,0.001,-1.127,0.260,-0.002,0.000
hhsize,-0.0064,0.013,-0.472,0.637,-0.033,0.020
educ,-0.0056,0.009,-0.611,0.541,-0.023,0.012
age,-0.0005,0.001,-0.349,0.727,-0.003,0.002

0,1,2,3
Omnibus:,3071.326,Durbin-Watson:,2.136
Prob(Omnibus):,0.0,Jarque-Bera (JB):,88.23
Skew:,0.047,Prob(JB):,6.94e-20
Kurtosis:,1.211,Cond. No.,724.0


WLS with robust standard errors: 

In [22]:
mod_wr=sm.WLS.from_formula('buy_reg ~ ecoprc + regprc + faminc + hhsize + educ + age',
              apdf,weights=apdf.w)
res_wr=mod_wr.fit(cov_type='HC3')
res_wr.summary()

0,1,2,3
Dep. Variable:,buy_reg,R-squared:,0.06
Model:,WLS,Adj. R-squared:,0.052
Method:,Least Squares,F-statistic:,6.036
Date:,"Wed, 15 Apr 2020",Prob (F-statistic):,3.68e-06
Time:,15:44:07,Log-Likelihood:,-486.55
No. Observations:,660,AIC:,987.1
Df Residuals:,653,BIC:,1019.0
Df Model:,6,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.6724,0.252,2.667,0.008,0.178,1.167
ecoprc,0.6645,0.123,5.421,0.000,0.424,0.905
regprc,-0.7716,0.150,-5.137,0.000,-1.066,-0.477
faminc,0.0010,0.001,0.809,0.419,-0.001,0.003
hhsize,-0.0131,0.019,-0.682,0.495,-0.051,0.025
educ,-0.0127,0.013,-0.943,0.345,-0.039,0.014
age,-0.0014,0.001,-0.960,0.337,-0.004,0.001

0,1,2,3
Omnibus:,33.663,Durbin-Watson:,2.143
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.873
Skew:,0.317,Prob(JB):,1.78e-05
Kurtosis:,2.374,Cond. No.,837.0
