In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import GMM
from statsmodels.base.model import GenericLikelihoodModel

from scipy.stats import norm
from scipy.stats import multivariate_normal

In [2]:
#load data into memory
data = pd.DataFrame(data = np.genfromtxt('ps2.dat', delimiter='  '), columns=['y','x1','x2','z'])

print data.mean()

y      0.568393
x1    42.537849
x2    12.286853
z      9.029880
dtype: float64


# Part a

An economic story where $x_{2i}$ is correlated with $\epsilon_i$ involves simultaneity between the decision of an education level and years they want to work. Women who intend to stay in the labor force longer may have select more education.

# Part b

# Part c

$\rho$ represents the correlation between the two error terms. The intrument is relevant it should be non-zero. Since, one would expect parents education to be positively related to your education, you would expect $\rho$ to be positive.

# Part d

In order to estimate the model we must derive the likelihood function.

$p(y_i,x_{2i} | x_{1i}, z_i, \theta) = p(x_{2i} | x_{1i}, z_i, \theta) p(y_i | x_{1i},x_{2i}, z_i, \theta)  = p(x_{2i} | x_{1i}, z_i, \theta) p(y_i | x_{1i},x_{2i}, z_i, \eta_i, \theta)$

1. Preforming a change of variable of $\eta_i$ for $x_{2i}$, we can write

$p(x_{2i} | x_{1i}, z_i, \theta) = p(\eta_i|x_i,z_i,\theta)\dfrac{dx_{2i}}{d\eta_i} = \phi(\dfrac{\eta_i}{\sigma})  \dfrac{1}{\sigma}$

2. We can derive an analytic experession for  $p(y_i | x_{1i},x_{2i}, z_i, \theta)$ below

$p(y_i | x_{1i},x_{2i}, z_i, \theta)$

When $y_i =1$ we have, $p(y_i | x_{1i},x_{2i}, z_i, \theta) = E(\textbf{1}(\epsilon_i + \theta_2 \eta_i + \theta_0 + \theta_2\theta_3 + (\theta_1 + \theta_2\theta_4) + \theta_2\theta_5 z_i > 0 ) | \eta_i ) =  1 - P(\epsilon_i + \gamma_i > 0) $

For notational convenience we have let, $\gamma_i = \theta_2 \eta_i + \theta_0 + \theta_2\theta_3 + (\theta_1 + \theta_2\theta_4) + \theta_2\theta_5 z_i$

When $y_i = 0$ we have, $p(y_i | x_{1i},x_{2i}, z_i, \theta) = E(\textbf{1}(\epsilon_i + \theta_2 \eta_i + \theta_0 + \theta_2\theta_3 + (\theta_1 + \theta_2\theta_4) + \theta_2\theta_5 z_i < 0 ) | \eta_i ) =  P(\epsilon_i + \gamma_i > 0) $

So, we have $p(y_i | x_{1i},x_{2i}, z_i, \theta) = (1-y_i) P(\epsilon_i + \gamma_i > 0) + y_i (1 - P(\epsilon_i + \gamma_i > 0) ) $

Using results about the distribution of conditional normals we know

$\epsilon_i|\eta_i \sim N(\eta_i \dfrac{\rho}{\sigma_\eta^2}, 1 - \dfrac{\rho^2}{\sigma_\eta^2})$


So, $p(y_i | x_{1i},x_{2i}, z_i, \theta) = y_i (1 -\Phi(\dfrac{- \gamma_i - \frac{\rho}{\sigma_\eta^2}}{1 - \frac{\rho^2}{\sigma_\eta^2}})) + (1-y_i)\Phi(\dfrac{- \gamma_i - \frac{\rho}{\sigma_\eta^2}}{1 - \frac{\rho^2}{\sigma_\eta^2}}) $

In [4]:
model_d_stage1 = sm.OLS(data['x2'],sm.add_constant(data[['x1','z']]) )
model_d_stage1_fit = model_d_stage1.fit()
T3, T4, T5 = model_d_stage1_fit.params #initialize these

print model_d_stage1_fit.summary()

                            OLS Regression Results                            
Dep. Variable:                     x2   R-squared:                       0.245
Model:                            OLS   Adj. R-squared:                  0.243
Method:                 Least Squares   F-statistic:                     121.7
Date:                Sat, 27 Oct 2018   Prob (F-statistic):           1.63e-46
Time:                        16:22:05   Log-Likelihood:                -1582.8
No. Observations:                 753   AIC:                             3172.
Df Residuals:                     750   BIC:                             3185.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.1255      0.493     18.507      0.0

In [10]:
class part_d(GenericLikelihoodModel):
    """class for evaluating question 1 part b"""
    
    def nloglikeobs(self, params, v=False):
        
        t0,t1,t2,t3,t4,t5,rho,sigma = params

        y,x2 = self.endog.transpose()
        x1,z = self.exog.transpose()
        
        eta = x2 - t3 - t4*x1 - t5*z
        
        mu_epsilon = (rho/sigma**2)*eta
        var_epsilon = np.sqrt(abs(1 - (rho/sigma)**2))
        
        #pr(eta | ... )
        recalc_eta = x2 - t3 - t4*x1 - t5*z
        pr_eta = norm(0,sigma).pdf(recalc_eta)/sigma
        
        #pr(y|x2 ... )
        gamma = t0 + t2*t3 + (t1 + t2*t4)*x1 + t2*t5*z + t2*eta
        
        pr_epsilon = (y*(1 - norm(mu_epsilon,var_epsilon).cdf(-gamma))
                      + (1-y)*norm(mu_epsilon,var_epsilon).cdf(-gamma))
        #if v: print pr_epsilon.max(), pr_epsilon.min(), pr_epsilon.mean(), '\n'
        
        #likelihood = np.log( pr_eta * np.maximum(pr_epsilon,.0001) )
        likelihood = np.log( pr_epsilon*pr_eta )

        
        if v: raise Exception('Stop drop and roll')
        
        return -( likelihood.sum() ) 
    
    
    def fit(self, start_params=None, maxiter=2000, maxfun=5000, **kwds):
        # we have one additional parameter and we need to add it for summary
        if start_params == None:
            start_params = [-.391,.3,.2,T3, T4, T5 ,.12,1.33]
            
        return super(part_d, self).fit(start_params=start_params,
                                       maxiter=maxiter, maxfun=maxfun, **kwds)

    
model_d = part_d(data[['y','x2']],data[['x1','z']])

result_d = model_d.fit()
print(result_d.summary(xname=['theta_0', 'theta_1', 'theta_2',
                              'theta_3','theta_4','theta_5',
                              'rho', 'sigma']))

Optimization terminated successfully.
         Current function value: 3.256370
         Iterations: 761
         Function evaluations: 1159
                                part_d Results                                
Dep. Variable:            ['y', 'x2']   Log-Likelihood:                -2452.0
Model:                         part_d   AIC:                             4908.
Method:            Maximum Likelihood   BIC:                             4917.
Date:                Sat, 27 Oct 2018                                         
Time:                        16:38:00                                         
No. Observations:                 753                                         
Df Residuals:                     751                                         
Df Model:                           1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------