In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import GMM
from statsmodels.base.model import GenericLikelihoodModel

from scipy.stats import norm
from scipy.stats import multivariate_normal

In [39]:
#load data into memory
data = pd.DataFrame(data = np.genfromtxt('ps2.dat', delimiter='  '), columns=['y','x1','x2','z'])

print data.mean()

y      0.568393
x1    42.537849
x2    12.286853
z      9.029880
dtype: float64


# Part a

An economic story where $x_{2i}$ is correlated with $\epsilon_i$ involves simultaneity between the decision of an education level and years they want to work. Women who intend to stay in the labor force longer may have select more education.

# Part b

# Part c

$\rho$ represents the correlation between the two error terms. The intrument is relevant it should be non-zero. Since, one would expect parents education to be positively related to your education, you would expect $\rho$ to be positive.

# Part d

In [48]:
class part_d(GenericLikelihoodModel):
    """class for evaluating question 1 part b"""
    
    def nloglikeobs(self, params, v=False):
        
        t0,t1,t2,t3,t4,t5,rho,sigma = params

        y,x2 = self.endog.transpose()
        x1,z = self.exog.transpose()
        
        #pr(eta | ....)
        eta = x2 - t3 - t4*x1 - t5*z
        pr_eta = norm(0,sigma).pdf(eta)/sigma
        if v: print pr_eta.max(), pr_eta.min(), pr_eta.mean(), '\n'
        
        
        #pr(y|x2 ... )
        c = t0 + t2*t3 + (t1 + t2*t4)*x1 + t2*t5*z
        if v: print c.max(), c.min(), c.mean(), '\n'
        
        mu_epsilon = rho/sigma**2*eta
        
        var_epsilon = np.sqrt(abs(1 - (rho/sigma)**2))
        
        pr_epsilon = 1 - norm(mu_epsilon,var_epsilon).cdf(-t2*eta - c)
        if v: print pr_epsilon.max(), pr_epsilon.min(), pr_epsilon.mean(), '\n'
        
        likelihood = np.log( pr_eta * np.maximum(pr_epsilon,.0001) )
        if v: print likelihood.shape
        if v: print likelihood.max(), likelihood.min(), likelihood.mean(), '\n'
        
        if v: raise Exception('Stop drop and roll')
        
        print -( likelihood.sum() ) 
        return -( likelihood.sum() ) 
    
    
    def fit(self, start_params=None, maxiter=100, maxfun=5000, **kwds):
        # we have one additional parameter and we need to add it for summary
        if start_params == None:
            start_params = [-.4,.3,.2,10.0,.4,.7,.8,.9]
        return super(part_d, self).fit(start_params=start_params,
                                       maxiter=maxiter, maxfun=maxfun, **kwds)

    
model_d = part_d(data[['y','x2']],data[['x1','z']])

result_d = model_d.fit()
print(result_d.summary(xname=['theta_0', 'theta_1', 'theta_2',
                              'theta_3','theta_4','theta_5',
                              'rho', 'sigma']))

219742.4002820071
219746.24887507013
219621.9838658158
219713.04845280948
229719.1549372624
237293.8935433175
226056.8617192151
219858.01637873688
199543.19094565162
201749.35973695474
202034.96538812126
199697.675687254
200260.15946565327
196019.58442836796
184857.6800717637
187663.4054029584
180131.84369700338
161781.04116548764
167063.68515781558
173287.65426625218
166732.48427510582
161398.75033967005
144682.32865705376
148410.27540515902
136695.82762858467
112764.2860593588
129775.7626524539
121082.21389370612
117572.1027396378
112998.98968241202
101311.22990665043
78774.81153882685
88654.92532669308
83054.50154078614
75152.75110352674
53269.67442133356
66481.98166525146
60851.68251056162
53008.6675509364
33934.72794935721
39651.85180682632
31380.58827239682
15063.372790844383
24556.860892811826
19425.340507227163
13537.267005236678
4072.3112335509113
7920.066270482124
4843.135115178287
2883.5788072419273
3338.6047786621725
2402.660185742795
3482.2966111174233
2054.870164251537
60



In [19]:


#theta_start = np.array([1,1,1])
#res = minimize(neg_loglike, theta_start, method = 'Nelder-Mead', options={'disp': True})

-16.013631199228037

In [13]:
print norm(0,[1,2,3]).cdf([1,1,1])

[0.84134475 0.69146246 0.63055866]


array([2, 0, 2])