In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy

from statsmodels.sandbox.regression.gmm import GMM
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

In [2]:
# Read the Stata file and index every row by the (id, year) pair.
data = pd.read_stata('chile.dta').set_index(['id','year'])

In [3]:
# Columns that are replaced by their natural logarithm.
log_cols = ['routput', 'totlab', 'renerg', 'realmats', 'rcapstock']

ldata = data.copy()

# log(0) is -inf and dropna() only removes NaN, so map infinities to NaN
# first; otherwise rows with a zero input would survive into the regressions.
ldata[log_cols] = np.log(data[log_cols]).replace([np.inf, -np.inf], np.nan)

# Keep only rows that are complete after the transformation.
ldata = ldata.dropna()

In [4]:
# (dependent variable, parametric regressors, nonparametric controls)
d = (
    ldata['routput'],
    ldata[['renerg','totlab']],
    ldata[['rcapstock','realmats']],
)

def np_resids(y,x):
    """Residuals of ``y`` after a cubic polynomial regression on ``x``.

    ``x`` is expanded into polynomial terms up to degree 3 and ``y`` is
    regressed on them with ``Ridge(alpha=0.0)``, i.e. with a zero penalty
    (this is not a lasso).

    Parameters
    ----------
    y : pandas Series or DataFrame
        Variable(s) to residualise.
    x : pandas DataFrame
        Regressors, with the same number of rows as ``y``.

    Returns
    -------
    Same type as ``y``
        ``y`` minus the fitted values, restricted to the rows in which
        neither ``y`` nor ``x`` has a missing value.
    """
    # Drop incomplete rows jointly. Dropping NaNs from y and x separately can
    # leave them with different lengths, or silently pair up rows that belong
    # to different observations.
    y_missing = np.asarray(y.isnull())
    if y_missing.ndim > 1:
        y_missing = y_missing.any(axis=1)
    x_missing = np.asarray(x.isnull())
    if x_missing.ndim > 1:
        x_missing = x_missing.any(axis=1)
    complete = ~(y_missing | x_missing)
    y, x = y[complete], x[complete]

    poly = PolynomialFeatures(degree=3)
    x_poly = poly.fit_transform(x)
    clf = Ridge(alpha=0.0)
    clf.fit(x_poly, y)
    return y - clf.predict(x_poly)

#print np_resids(ldata['routput'].shift(1),ldata[['rcapstock','realmats']].shift(1))

In [5]:
def robinson(dta):
    """Set up the regression of residualised y on residualised x.

    dta is a (y, x, z) tuple: y is the dependent variable, x holds the
    regressors that receive parametric coefficients, and z holds the
    nonparametric variables, which are partialled out of y and of every
    column of x through np_resids.

    Returns the unfitted sm.OLS model (no constant is added); call .fit()
    on it to obtain estimates.
    """
    y, x, z = dta

    endog = np.array(np_resids(y, z))

    # One residual series per parametric regressor, stacked as columns.
    per_column = [np_resids(x[name], z) for name in x.columns]
    exog = pd.DataFrame(data=np.array(per_column).T, columns=x.columns)

    return sm.OLS(endog, exog)

    
# Build and fit the first-stage regression, then show its summary table.
stage1 = robinson(d)
stage1_res = stage1.fit()
# print() with a single argument behaves the same on Python 2 and Python 3.
print(stage1_res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.195
Model:                            OLS   Adj. R-squared:                  0.195
Method:                 Least Squares   F-statistic:                     2512.
Date:                Tue, 14 May 2019   Prob (F-statistic):               0.00
Time:                        12:44:38   Log-Likelihood:                -3246.9
No. Observations:               20717   AIC:                             6498.
Df Residuals:                   20715   BIC:                             6514.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
renerg         0.0858      0.002     39.327      0.0

In [44]:
def stage2obj(zparams, xparams, dta):
    """Moment contributions of the second stage at candidate ``zparams``.

    dta is a (y, x, z) tuple of pandas objects that can be grouped by 'id'.
    xparams are the coefficients applied to the columns of x and zparams
    the coefficients applied to the columns of z.

    Returns a DataFrame with one column per column of x and z: each is the
    within-'id' one-step lag of that variable multiplied by the final
    shock, with rows containing missing values dropped.
    """
    y, x, z = dta

    # Linear contributions of x and z, each computed once and reused below.
    x_part = x.mul(xparams, axis=1).sum(axis=1)
    z_part = z.mul(zparams, axis=1).sum(axis=1)

    # Output net of the x contribution (what z plus the shock account for).
    zvalue = y - x_part
    # Production shock: output net of both contributions.
    shock1 = zvalue - z_part

    # Part of zvalue that the polynomial regression on z explains.
    zvalue_predict = zvalue - np_resids(zvalue, z)
    # Gap between that prediction and the linear z contribution.
    shock2 = zvalue_predict - z_part

    # Regress the current shock on the previous shock2, lagging within id.
    shock2lag = shock2.groupby('id').shift(1)
    # The two unnamed series become columns 0 and 1 of the combined frame.
    both_shocks = pd.concat((shock1, shock2lag), axis=1).dropna()
    unexplained = np_resids(both_shocks[[0]], both_shocks[[1]]).sum(axis=1)
    trend = shock1 - unexplained

    # Remove the expected trend from the production shock.
    shock3 = shock1 - trend

    # Interact the lagged inputs with the remaining shock.
    lagged_inputs = pd.concat((x, z), axis=1).groupby('id').shift(1)
    return lagged_inputs.mul(shock3, axis=0).dropna()
    
# Trial evaluation of the moment function at hand-entered parameter values.
d = (
    ldata['routput'],
    ldata[['renerg','totlab']],
    ldata[['rcapstock','realmats']],
)
stage2obj(zparams=np.array([.5,.5]), xparams=np.array([.0837,.1680]), dta=d)
-0.755991

-0.755991

In [47]:
class LP(GMM):
    """GMM model whose moment conditions come from stage2obj.

    The parameters being estimated are the coefficients on the columns of
    z; the coefficients on x are supplied up front and held fixed.
    """

    def __init__(self, dta, stage1_params, *args, **kwds):
        """
        Parameters
        ----------
        dta : tuple
            (y, x, z) pandas objects, as expected by stage2obj.
        stage1_params : array-like
            Coefficients on the columns of x, passed unchanged to stage2obj.
        *args, **kwds
            Forwarded to GMM.__init__ after the moment count.
        """
        y,x,z = dta

        # One moment condition per column of x and of z.
        k_moms = z.shape[1] + x.shape[1]

        # The estimated parameters are the z coefficients. Left unset, GMM
        # derives k_params from exog (the columns of x), which is only right
        # when x and z happen to have the same width. Respect an explicit
        # value if the caller passed one positionally or by keyword.
        if not args and 'k_params' not in kwds:
            kwds['k_params'] = z.shape[1]

        super(LP, self).__init__(y, x, z, k_moms, *args, **kwds)

        # Keep the pandas objects: stage2obj groups and shifts on the index.
        self.endog = y
        self.exog = x
        self.instr = z
        self.stage1_params = stage1_params

        # Label the reported parameters with the z column names.
        self.data.xnames = list(z.columns)

    def momcond(self, params):
        """Return the stage2obj moment contributions at ``params`` as an array."""
        d = self.endog, self.exog, self.instr
        return np.array(stage2obj(params, self.stage1_params, d))
        
        
# Estimate the z coefficients by GMM with optim_method='nm'.
d = ldata['routput'],ldata[['renerg','totlab']],ldata[['rcapstock','realmats']]
lpmodel = LP(d,stage1_res.params)
lpresult = lpmodel.fit(np.array([.1,.5]),optim_method='nm')
# print() with a single argument behaves the same on Python 2 and Python 3.
print(lpresult.summary())

Optimization terminated successfully.
         Current function value: 0.000147
         Iterations: 27
         Function evaluations: 53
Optimization terminated successfully.
         Current function value: 0.000842
         Iterations: 28
         Function evaluations: 53
Optimization terminated successfully.
         Current function value: 0.000842
         Iterations: 18
         Function evaluations: 35
Optimization terminated successfully.
         Current function value: 0.000842
         Iterations: 19
         Function evaluations: 36
                                  LP Results                                  
Dep. Variable:                routput   Hansen J:                        15.20
Model:                             LP   Prob (Hansen J):              0.000501
Method:                           GMM                                         
Date:                Tue, 14 May 2019                                         
Time:                        13:51:07                

In [46]:
# Now run with BFGS (no optim_method is passed, so fit() uses its default).

d = ldata['routput'],ldata[['renerg','totlab']],ldata[['rcapstock','realmats']]
lpmodel = LP(d,stage1_res.params)
lpresult = lpmodel.fit(np.array([.1,.5]))
# print() with a single argument behaves the same on Python 2 and Python 3.
print(lpresult.summary())

KeyboardInterrupt: 