In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import GMM
from statsmodels.base.model import GenericLikelihoodModel

from scipy.stats import norm
from scipy.stats import multivariate_normal

In [6]:
#load data into memory
data = pd.DataFrame(data = np.genfromtxt('ps2.dat', delimiter='  '), columns=['y','x1','x2','z'])

print data.mean()

y      0.568393
x1    42.537849
x2    12.286853
z      9.029880
dtype: float64


# Part a

An economic story where $x_{2i}$ is correlated with $\epsilon_i$ involves simultaneity between the decision of an education level and years they want to work. Women who intend to stay in the labor force longer may have select more education.

# Part b

# Part c

$\rho$ represents the correlation between the two error terms. The intrument is relevant it should be non-zero. Since, one would expect parents education to be positively related to your education, you would expect $\rho$ to be positive.

# Part d

In [17]:
class part_d(GenericLikelihoodModel):
    """class for evaluating question 1 part b"""
    
    def nloglikeobs(self, params):
        
        t0,t1,t2,t3,t4,t5,rho,sigma = params

        y,x2 = self.endog.transpose()
        x1,z = self.exog.transpose()
        
        #pr(eta | ....)
        eta = x2 - t3 - t4*x1 - t5*z
        pr_eta = norm(0,sigma).logpdf(eta).sum()
        
        #pr(y|x2 ... )
        c = t0 + t2*t3 + (t1 + t2*t4)*x1 + t2*t5*z
        pr_epsilon = norm(eta*rho/sigma,1-rho**2/sigma).logcdf(t2*eta + c).sum()
        
        return -(pr_eta+pr_epsilon)
    
    
    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        # we have one additional parameter and we need to add it for summary
        if start_params == None:
            start_params = [.5,.5,.5,.5,.5,.5,.5,.5]
        return super(part_d, self).fit(start_params=start_params,
                                       maxiter=maxiter, maxfun=maxfun, **kwds)

    
model_d = part_d(data[['y','x2']],data[['x1','z']])

result_d = model_d.fit()
print(result_d.summary(xname=['theta_0', 'theta_1', 'theta_2',
                              'theta_3','theta_4','theta_5',
                              'rho', 'sigma']))

Optimization terminated successfully.
         Current function value: 2.101981
         Iterations: 1586
         Function evaluations: 2391
                                part_d Results                                
Dep. Variable:            ['y', 'x2']   Log-Likelihood:                -1582.8
Model:                         part_d   AIC:                             3170.
Method:            Maximum Likelihood   BIC:                             3179.
Date:                Sat, 20 Oct 2018                                         
Time:                        12:32:38                                         
No. Observations:                 753                                         
Df Residuals:                     751                                         
Df Model:                           1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------



In [19]:
var = multivariate_normal(mean=[0,0], cov=[[1,0],[0,1]])
joint_distr = multivariate_normal(mean=[0,0], cov=[[1,rho],[rho,sigma]]) 
a = np.array([[0,1,3],[1,1,3]]).transpose()
np.log(var.pdf(a)).sum()

-16.013631199228037

In [13]:
print norm(0,[1,2,3]).cdf([1,1,1])

[0.84134475 0.69146246 0.63055866]
