In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy

from statsmodels.sandbox.regression.gmm import GMM
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

In [5]:
data = pd.read_stata('chile.dta')

In [6]:
ldata = data.copy()

ldata[['routput', 'totlab', 'renerg','realmats',
       'rcapstock']] = np.log(data[['routput','totlab', 'renerg','realmats','rcapstock']])

ldata = ldata.dropna()

In [7]:
d = ldata['routput'],ldata[['renerg','totlab']],ldata[['rcapstock','realmats']],ldata[['id','year']]

def np_resids(y,x):
    """residuals from lasso"""
    
    poly = PolynomialFeatures(degree=7)
    x_poly = poly.fit_transform(x)
    clf = Ridge(alpha=1.0)
    clf.fit(x_poly, y) 
    resid = y-clf.predict(x_poly)
    return resid

#print np_resids(ldata['routput'],ldata[['rcapstock','realmats']])

In [9]:
def robinson(dta):
    """y is dependent variable
    x has parametric coefficients
    z is nonparameteric variables"""
    
    y,x,z,label = dta
    
    y_resid = np_resids(y,z)
    x_resid = []
    for col in x.columns:
        resid = np_resids(x[col],z)
        x_resid.append(resid)
    x_resid = np.array(x_resid).transpose()
    x_resid = pd.DataFrame(data=x_resid,columns=x.columns)
    return sm.OLS(np.array(y_resid),x_resid)

    
stage1 = robinson(d)
stage1_res = stage1.fit()
print stage1_res.summary()

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                  0.191
Method:                 Least Squares   F-statistic:                     2445.
Date:                Mon, 13 May 2019   Prob (F-statistic):               0.00
Time:                        18:28:03   Log-Likelihood:                -3003.4
No. Observations:               20717   AIC:                             6011.
Df Residuals:                   20715   BIC:                             6027.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
renerg         0.0837      0.002     38.705      0.0

In [10]:
def stage2obj(zparams, xparams, dta):
    
    y,x,z,label = dta
    t = label['year']
    #period1 = np.tile((1979<=t) & (t<=1981),z.shape[1]+1).reshape(z.shape[0],z.shape[1]+1)
    #period2 = np.tile((1982<=t) & (t<=1983),z.shape[1]+1).reshape(z.shape[0],z.shape[1]+1)
    #period3 = np.tile((1984<=t) & (t<=1986),z.shape[1]+1).reshape(z.shape[0],z.shape[1]+1)
    
    #compute value added to production by z
    xparams_tile = np.tile(xparams,x.shape[0]).reshape(x.shape[0],x.shape[1])
    zvalue = y - (xparams_tile*x).sum(axis=1) 
    
    #compute predict value added ahead of time
    zcons = sm.add_constant(z)
    #z_period = np.concatenate((zcons*period1,zcons*period2,zcons*period3),axis=1)
    #zvalue_predict = sm.OLS(zvalue,z_period).fit().fittedvalues
    zvalue_predict = sm.OLS(zvalue,z).fit().fittedvalues
    
    #use the difference to learn the shock to production
    zparams_tile = np.tile(zparams,z.shape[0]).reshape(z.shape[0],z.shape[1])
    
    #calc shock (mostly formating correctly)
    shock = zvalue - (zparams_tile*z).sum(axis=1)
    shock = np.array(shock).reshape((shock.shape[0],1))
    
    shocklag = zvalue_predict - (zparams_tile*z).sum(axis=1)
    shocklag = np.array(shocklag).reshape((shocklag.shape[0],1))
    
    shocklag = np.concatenate((shocklag,label[['id','year']]),axis=1)
    shocklag = pd.DataFrame(shocklag,columns=['shock2','id','year'])
    shocklag = shocklag.set_index(['id','year'])
    shocklag = shocklag.groupby('id').shift(1)
    
    
    #clean 'exogenous' portion of production
    both_shocks = np.concatenate((shocklag,shock),axis=1)
    both_shocks = both_shocks[~np.isnan(both_shocks).any(axis=1)]
    shocklag = both_shocks[:,0].reshape(both_shocks.shape[0],1)
    npshocklag = np.concatenate((shocklag,shocklag**2,shocklag**3),axis=1)
    npshocklag = sm.add_constant(npshocklag)
    
    return sm.OLS(both_shocks[:,1],npshocklag).fit().resid
    
    
d = ldata['routput'],ldata[['renerg','totlab']],ldata[['rcapstock','realmats']],ldata[['id','year']]

stage2obj(np.array([.5,.5]),np.array([.0837,.1680]),d)

array([ 0.07214819, -0.20580687,  0.28069225, ...,  0.32109226,
        0.22290829, -0.59066547])

In [11]:
class LP(GMM):
    
    def __init__(self, dta, stage1_params, *args, **kwds):
        # set appropriate counts for moment conditions and parameters
        y,x,z,label = dta
        super(LP, self).__init__(y,x,z, z.shape[1] +x.shape[1],*args, **kwds)
        self.endog = y
        self.exog = x
        self.instr = z
        self.ids = label
        self.stage1_params = stage1_params
        
        
        #self up a lag variable
        exogs = np.concatenate((x, z, label[['id','year']]),axis=1)
        colnames = ['x'+str(i) for i in range(self.nmoms)]
        colnames = colnames + ['id','year']
        exogs = pd.DataFrame(exogs,columns= colnames )
        exogs = exogs .set_index(['id','year'])
        exogs = exogs .groupby('id').shift(1)
        
        self.exoglag = exogs.dropna()
        self.data.xnames = [col for col in z.columns]
        
    def momcond(self, params):
        d = self.endog, self.exog, self.instr, self.ids
        resids = stage2obj(params, self.stage1_params , d)
        resids_tile = np.repeat(resids,self.nmoms)
        resids_tile = resids_tile.reshape((resids.shape[0],self.nmoms))
        return resids_tile*self.exoglag
        
    

lpmodel = LP(d,np.array([.0837,.1680]))
lpresult = lpmodel.fit(np.array([.5,.5]))
print lpresult.summary()

Optimization terminated successfully.
         Current function value: 0.000081
         Iterations: 19
         Function evaluations: 24
         Gradient evaluations: 24
Optimization terminated successfully.
         Current function value: 0.000792
         Iterations: 13
         Function evaluations: 17
         Gradient evaluations: 17
Optimization terminated successfully.
         Current function value: 0.001166
         Iterations: 6
         Function evaluations: 10
         Gradient evaluations: 10
Optimization terminated successfully.
         Current function value: 0.001188
         Iterations: 1
         Function evaluations: 3
         Gradient evaluations: 3
Optimization terminated successfully.
         Current function value: 0.001188
         Iterations: 0
         Function evaluations: 1
         Gradient evaluations: 1
                                  LP Results                                  
Dep. Variable:                routput   Hansen J:                   