In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
rawdata = pd.read_csv('data/milk.csv')

# General housekeeping: build each row filter once on the raw frame and apply
# them together.
from_1980 = rawdata['YEAR'] >= 1980
has_date = (rawdata['MONTH'] != 0) & (rawdata['DAY'] != 0)  # need data with time index
has_ww = ~(np.isnan(rawdata['WW']) | np.isnan(rawdata['QWW']))  # need data with QWW and WW

data = rawdata[from_1980 & has_date & has_ww].sort_values(['YEAR', 'MONTH', 'DAY'])

# Missing COOLER / ESC entries are filled with 0.
data = data.assign(COOLER=data['COOLER'].fillna(0),
                   ESC=data['ESC'].fillna(0))

# Column groups used throughout the notebook.
#milk =  ['WW','LFW','WC','LFC']
milk = ['WW']
auct_key = ['YEAR', 'MONTH', 'DAY', 'SYSTEM', 'FMOZONE']
cts = ['FMO', 'GAS', 'POPUL', 'QWW']
dummies = ['COOLER', 'ESC']


# Baseline logs: every continuous covariate and every milk column gets an
# 'L'-prefixed natural-log counterpart.
lmilk = ['L' + name for name in milk]
lcts = ['L' + name for name in cts]
for raw_cols, log_cols in ((cts, lcts), (milk, lmilk)):
    data[log_cols] = np.log(data[raw_cols])


#set up lags
lags = 4
# Lag column names are derived from lmilk so they always match the columns the
# merges below create: <log bid column>_<min|max><lag>.
lagkeys = [m + '_' + stat + str(i)
           for stat in ['min', 'max'] for m in lmilk for i in range(1, 1+lags)]
# One row per auction (keys only), in groupby order.
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]

#note data is already sorted by date
# Aggregate only the log-bid columns: the lags use nothing else, and this keeps
# min()/max() away from unrelated (e.g. non-numeric) columns.
bids_by_auct = data.groupby(auct_key, as_index=False)[lmilk]
min_lag = bids_by_auct.min()
max_lag = bids_by_auct.max()

for stat, per_auct in (('min', min_lag), ('max', max_lag)):
    for t in range(1, 1+lags):
        # Row i of the shifted frame holds the statistic from t rows earlier in
        # groupby order; aucts supplies the keys belonging to row i.
        lagt = pd.concat((aucts, per_auct[lmilk].shift(t)), axis=1)
        # Only the right-hand bid columns overlap with data, so only they get
        # the suffix (e.g. LWW_min1); the left side keeps its names.
        data = pd.merge(data, lagt, how='left', on=auct_key,
                        suffixes=('', '_%s%s' % (stat, t)))

    
fe = ['FMOZONE']
#fe = ['COUNTY']
#fe = []
#cheat and make fmozone 1a = 1
# NOTE(review): this recode runs after the lag columns were merged in, so the
# lags were grouped with '1A' as its own zone -- confirm that is intended.
data.loc[data['FMOZONE'] == '1A', 'FMOZONE'] = '1'


fekeys = []
for effect in fe:
    # Pin the dummy dtype. pandas >= 2.0 defaults to bool, and a frame that
    # mixes bool dummies with float regressors converts to an object array,
    # which statsmodels refuses. uint8 was the default before pandas 2.0, so
    # the 0/1 values written to the CSVs stay the same.
    fes = pd.get_dummies(data[effect], drop_first=True, dtype=np.uint8)
    fekeys.extend(fes.columns)
    data = pd.concat((data, fes), axis=1)

bid_key = auct_key + ['VENDOR', 'COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys

# Regression sample: identifiers, log bids and all regressors, keeping only
# rows with no missing values.
reg1_cols = bid_key + lmilk + covariates + hist
reg1 = data[reg1_cols].dropna()
reg1.to_csv('data/clean_milk1.csv')

# Same columns plus WIN, for summary stats.
reg0 = data[reg1_cols + ['WIN']].dropna()
reg0.to_csv('data/clean_milk0.csv')

In [3]:
# OLS of LWW on a constant, the covariates and the history terms in `hist`.
exog_with_hist = sm.add_constant(reg1[covariates + hist])
sm.OLS(reg1['LWW'], exog_with_hist).fit().summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,LWW,R-squared:,0.194
Model:,OLS,Adj. R-squared:,0.19
Method:,Least Squares,F-statistic:,48.17
Date:,"Thu, 06 Jun 2019",Prob (F-statistic):,4.89e-162
Time:,10:59:51,Log-Likelihood:,4293.8
No. Observations:,3823,AIC:,-8548.0
Df Residuals:,3803,BIC:,-8423.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.5334,0.099,-15.443,0.000,-1.728,-1.339
LFMO,0.1242,0.026,4.734,0.000,0.073,0.176
LGAS,0.0155,0.004,3.865,0.000,0.008,0.023
LPOPUL,0.0042,0.002,2.299,0.022,0.001,0.008
LQWW,-0.0013,0.002,-0.674,0.500,-0.005,0.003
COOLER,0.0184,0.003,6.376,0.000,0.013,0.024
ESC,-0.0219,0.003,-8.088,0.000,-0.027,-0.017
3,-0.0685,0.004,-16.933,0.000,-0.076,-0.061
6,-0.0469,0.012,-4.003,0.000,-0.070,-0.024

0,1,2,3
Omnibus:,342.446,Durbin-Watson:,1.563
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1519.831
Skew:,0.331,Prob(JB):,0.0
Kurtosis:,6.017,Cond. No.,1250.0


In [4]:
# OLS of LWW on a constant and the covariates only (no `hist` terms).
exog_covariates = sm.add_constant(reg1[covariates])
sm.OLS(reg1['LWW'], exog_covariates).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.157
Model:,OLS,Adj. R-squared:,0.155
Method:,Least Squares,F-statistic:,71.04
Date:,"Thu, 06 Jun 2019",Prob (F-statistic):,1.21e-133
Time:,10:59:51,Log-Likelihood:,4208.2
No. Observations:,3823,AIC:,-8394.0
Df Residuals:,3812,BIC:,-8326.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.3272,0.070,-33.108,0.000,-2.465,-2.189
LFMO,0.1861,0.026,7.104,0.000,0.135,0.237
LGAS,0.0151,0.004,3.732,0.000,0.007,0.023
LPOPUL,0.0076,0.002,4.122,0.000,0.004,0.011
LQWW,-0.0035,0.002,-1.765,0.078,-0.007,0.000
COOLER,0.0185,0.003,6.336,0.000,0.013,0.024
ESC,-0.0229,0.003,-8.301,0.000,-0.028,-0.018
3,-0.0664,0.004,-16.136,0.000,-0.074,-0.058
6,-0.0425,0.012,-3.586,0.000,-0.066,-0.019

0,1,2,3
Omnibus:,292.683,Durbin-Watson:,1.46
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1303.709
Skew:,0.234,Prob(JB):,8e-284
Kurtosis:,5.822,Cond. No.,849.0


In [6]:
def add_fe(df, dfvars, fe):
    """Partial the fixed effect `fe` out of every column in `dfvars`.

    Each listed column is replaced by the residuals from an OLS regression of
    that column on a full set of dummies for `df[fe]` (no constant, no dropped
    level), i.e. the within transformation is applied ahead of time.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied; the caller's frame is not modified.
    dfvars : list of str
        Names of the columns to transform.
    fe : str
        Column whose levels define the fixed effect.

    Returns
    -------
    tuple (pandas.DataFrame, list of str)
        The transformed copy of `df` and a new list with the names in `dfvars`.

    Notes
    -----
    When 'LWW' is in `dfvars`, the summary of a regression of the already
    residualised LWW on the dummies is printed as a diagnostic.
    """
    df = df.copy()
    # Pin a float dtype so the design matrix is numeric whatever pandas'
    # default dummy dtype is (bool since pandas 2.0).
    dum = pd.get_dummies(df[fe], dtype=float)
    for var in dfvars:
        df[var] = sm.OLS(df[var], dum).fit().resid
        if var == 'LWW':
            # Refit on the residuals purely for the printed diagnostic.
            print(sm.OLS(df[var], dum).fit().summary())
    return df, list(dfvars)

# Reload the cleaned regression sample from disk, partial COUNTY effects out
# of the outcome and regressors, and save the result.
reg1 = pd.read_csv('data/clean_milk1.csv')
within_vars = lmilk + lcts + dummies + hist
reg2, fenames = add_fe(reg1, within_vars, 'COUNTY')  # add_fe works on its own copy
reg2 = reg2[bid_key + fenames]
reg2.to_csv('data/clean_milk2.csv')
print(fenames)

                            OLS Regression Results                            
Dep. Variable:                    LWW   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                 4.190e-14
Date:                Thu, 06 Jun 2019   Prob (F-statistic):               1.00
Time:                        11:00:48   Log-Likelihood:                 4264.1
No. Observations:                3823   AIC:                            -8446.
Df Residuals:                    3782   BIC:                            -8190.
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
ATASCOSA   -2.238e-16      0.015  -1.48e-14      1.0

In [None]:
# Within regression of LWW on the continuous covariates and dummies only.
# The regressors are named explicitly; slicing fenames as [1:-9] silently
# depended on hist having exactly 9 entries (lags == 4) and on a single milk
# column.
sm.OLS(reg2['LWW'], reg2[lcts + dummies]).fit().summary()

In [None]:
# Within regression of LWW on the covariates, dummies and history terms.
# The regressors are named explicitly; slicing fenames as [1:] only dropped
# the outcome while lmilk held a single column.
sm.OLS(reg2['LWW'], reg2[lcts + dummies + hist]).fit().summary()