In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
rawdata = pd.read_csv('data/milk.csv')
data = rawdata.copy()

#general house keeping
data = data[ (data['YEAR']>=1980)]
data = data[(data['MONTH'] !=0) & (data['DAY'] !=0) ] #need data with time index
data = data[(~np.isnan(data['WW']) ) & (~np.isnan(data['QWW']) )] #need data with QWW and WW

data = data.sort_values(['YEAR','MONTH','DAY'])
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)

#various keys
#milk =  ['WW','LFW','WC','LFC']
milk = ['WW']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
cts = ['FMO','GAS','POPUL','QWW']
dummies = ['COOLER','ESC']


#baseline stuff/logs
lmilk = ['L'+x for x in milk]
lcts = ['L'+x for x in cts]
data[lcts] = np.log(data[cts])
data[lmilk] = np.log(data[milk])


#set up lags
lags = 4
lagkeys = [l+str(i) for l in ['LWW_min','LWW_max'] for i in range(1,1+lags)]
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]

#note data is already sorted by date
min_lag = data.groupby(auct_key, as_index=False).min()
for t in range(1,1+lags):
    min_lagt = min_lag.shift(t)[lmilk]
    min_lagt = pd.concat((aucts, min_lagt), axis=1)
    data = pd.merge(data, min_lagt, how='left', on=auct_key, suffixes=('', '_min%s'%(t)) ) 
    
max_lag = data.groupby(auct_key, as_index=False).max()
for t in range(1,1+lags):
    max_lagt = max_lag.shift(t)[lmilk]
    max_lagt = pd.concat((aucts, max_lagt), axis=1)
    data = pd.merge(data, max_lagt, how='left', on=auct_key, suffixes=('', '_max%s'%(t)) )    

    
fe = ['FMOZONE']
#fe = ['COUNTY']
#fe = []
data.loc[(data['FMOZONE'] =='1A') , 'FMOZONE'] = '1'
#cheat and make fmozone 1a = 1


fekeys = []
for effect in fe:
    fes = pd.get_dummies(data[effect], drop_first=True)
    fekeys = fekeys+ list(fes.columns)
    data = pd.concat((data, fes), axis=1)

bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys 

reg1 = data.copy()[bid_key + lmilk + covariates + hist]
reg1 = reg1.dropna()
reg1.to_csv('data/clean_milk1.csv')

#also include win for summary stats
reg0 = data.copy()[bid_key + lmilk + covariates + hist + ['WIN']]
reg0 = reg0.dropna()
reg0.to_csv('data/clean_milk0.csv')

In [3]:
sm.OLS(reg1['LWW'], sm.add_constant(reg1[covariates+hist]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.195
Model:,OLS,Adj. R-squared:,0.191
Method:,Least Squares,F-statistic:,48.49
Date:,"Tue, 28 May 2019",Prob (F-statistic):,4.45e-163
Time:,18:24:40,Log-Likelihood:,4296.2
No. Observations:,3823,AIC:,-8552.0
Df Residuals:,3803,BIC:,-8427.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.5375,0.099,-15.506,0.000,-1.732,-1.343
LFMO,0.1245,0.026,4.751,0.000,0.073,0.176
LGAS,0.0167,0.004,4.196,0.000,0.009,0.024
LPOPUL,0.0038,0.002,2.024,0.043,0.000,0.007
LQWW,-0.0013,0.002,-0.675,0.500,-0.005,0.003
COOLER,0.0183,0.003,6.367,0.000,0.013,0.024
ESC,-0.0219,0.003,-8.117,0.000,-0.027,-0.017
3,-0.0692,0.004,-17.077,0.000,-0.077,-0.061
6,-0.0589,0.008,-7.403,0.000,-0.074,-0.043

0,1,2,3
Omnibus:,340.88,Durbin-Watson:,1.564
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1522.518
Skew:,0.326,Prob(JB):,0.0
Kurtosis:,6.022,Cond. No.,1250.0


In [4]:
sm.OLS(reg1['LWW'], sm.add_constant(reg1[covariates]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.158
Model:,OLS,Adj. R-squared:,0.156
Method:,Least Squares,F-statistic:,71.65
Date:,"Sun, 26 May 2019",Prob (F-statistic):,9.319999999999999e-135
Time:,19:33:33,Log-Likelihood:,4210.8
No. Observations:,3823,AIC:,-8400.0
Df Residuals:,3812,BIC:,-8331.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.3258,0.070,-33.110,0.000,-2.464,-2.188
LFMO,0.1855,0.026,7.086,0.000,0.134,0.237
LGAS,0.0167,0.004,4.164,0.000,0.009,0.025
LPOPUL,0.0071,0.002,3.838,0.000,0.003,0.011
LQWW,-0.0035,0.002,-1.769,0.077,-0.007,0.000
COOLER,0.0185,0.003,6.321,0.000,0.013,0.024
ESC,-0.0229,0.003,-8.334,0.000,-0.028,-0.018
3,-0.0671,0.004,-16.278,0.000,-0.075,-0.059
6,-0.0542,0.008,-6.707,0.000,-0.070,-0.038

0,1,2,3
Omnibus:,291.938,Durbin-Watson:,1.462
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1309.001
Skew:,0.229,Prob(JB):,5.679999999999999e-285
Kurtosis:,5.83,Cond. No.,849.0


In [5]:
def add_fe(df, dfvars, fe):
    #apply the within transformation ahead of time
    df = df.copy()
    dum = pd.get_dummies(df[fe])
    for var in dfvars:
        df[var] = sm.OLS(df[var],dum).fit().resid
        if var =='LWW':
            print sm.OLS(df[var],dum).fit().summary()
    return df, [var for var in dfvars]

reg1 = pd.read_csv('data/clean_milk1.csv')
reg2, fenames = add_fe(reg1.copy(), lmilk + lcts + dummies + hist, 'COUNTY')
reg2 = reg2.copy()[bid_key + fenames]
reg2.to_csv('data/clean_milk2.csv')
print fenames

                            OLS Regression Results                            
Dep. Variable:                    LWW   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                 4.190e-14
Date:                Sun, 26 May 2019   Prob (F-statistic):               1.00
Time:                        19:34:18   Log-Likelihood:                 4264.1
No. Observations:                3823   AIC:                            -8446.
Df Residuals:                    3782   BIC:                            -8190.
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
ATASCOSA   -2.238e-16      0.015  -1.48e-14      1.0

In [6]:
sm.OLS(reg2['LWW'],reg2[fenames[1:-9]]).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,27.01
Date:,"Sun, 26 May 2019",Prob (F-statistic):,1.06e-31
Time:,19:34:18,Log-Likelihood:,4343.6
No. Observations:,3823,AIC:,-8675.0
Df Residuals:,3817,BIC:,-8638.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
LFMO,0.1972,0.025,7.744,0.000,0.147,0.247
LGAS,0.0170,0.004,4.352,0.000,0.009,0.025
LPOPUL,0.0065,0.002,3.279,0.001,0.003,0.010
LQWW,-0.0051,0.002,-2.492,0.013,-0.009,-0.001
COOLER,0.0101,0.003,3.099,0.002,0.004,0.016
ESC,-0.0248,0.003,-8.982,0.000,-0.030,-0.019

0,1,2,3
Omnibus:,299.08,Durbin-Watson:,1.508
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1188.885
Skew:,0.299,Prob(JB):,6.87e-259
Kurtosis:,5.666,Cond. No.,29.0


In [7]:
sm.OLS(reg2['LWW'],reg2[fenames[1:]]).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.076
Model:,OLS,Adj. R-squared:,0.072
Method:,Least Squares,F-statistic:,20.89
Date:,"Sun, 26 May 2019",Prob (F-statistic):,2.64e-55
Time:,19:34:19,Log-Likelihood:,4415.3
No. Observations:,3823,AIC:,-8801.0
Df Residuals:,3808,BIC:,-8707.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
LFMO,0.1360,0.026,5.304,0.000,0.086,0.186
LGAS,0.0172,0.004,4.426,0.000,0.010,0.025
LPOPUL,0.0028,0.002,1.411,0.158,-0.001,0.007
LQWW,-0.0034,0.002,-1.644,0.100,-0.007,0.001
COOLER,0.0101,0.003,3.147,0.002,0.004,0.016
ESC,-0.0240,0.003,-8.810,0.000,-0.029,-0.019
INC,0.0108,0.006,1.798,0.072,-0.001,0.023
LWW_min1,-0.0174,0.019,-0.933,0.351,-0.054,0.019
LWW_min2,0.0562,0.019,3.027,0.002,0.020,0.093

0,1,2,3
Omnibus:,347.903,Durbin-Watson:,1.597
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1404.076
Skew:,0.377,Prob(JB):,1.2800000000000001e-305
Kurtosis:,5.872,Cond. No.,30.0
