In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
rawdata = pd.read_csv('data/milk.csv')
data = rawdata.copy()

#general house keeping
data = data[ (data['YEAR']>=1980)]
data = data[(data['MONTH'] !=0) & (data['DAY'] !=0) ] #need data with time index
data = data[(~np.isnan(data['WW']) ) & (~np.isnan(data['QWW']) )] #need data with QWW and WW

data = data.sort_values(['YEAR','MONTH','DAY'])
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)

#various keys
#milk =  ['WW','LFW','WC','LFC']
milk = ['WW']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
cts = ['FMO','GAS','POPUL','QWW','MEALS']
dummies = ['COOLER','ESC']


#baseline stuff/logs
lmilk = ['L'+x for x in milk]
lcts = ['L'+x for x in cts]
data[lcts] = np.log(data[cts])
data[lmilk] = np.log(data[milk])


#set up lags
lags = 3
lagkeys = [l+str(i) for l in ['LWW_min','LWW_max'] for i in range(1,1+lags)]
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]

#note data is already sorted by date
min_lag = data.groupby(auct_key, as_index=False).min()
for t in range(1,1+lags):
    min_lagt = min_lag.shift(t)[lmilk]
    min_lagt = pd.concat((aucts, min_lagt), axis=1)
    data = pd.merge(data, min_lagt, how='left', on=auct_key, suffixes=('', '_min%s'%(t)) ) 
    
max_lag = data.groupby(auct_key, as_index=False).max()
for t in range(1,1+lags):
    max_lagt = max_lag.shift(t)[lmilk]
    max_lagt = pd.concat((aucts, max_lagt), axis=1)
    data = pd.merge(data, max_lagt, how='left', on=auct_key, suffixes=('', '_max%s'%(t)) )    

    
fe = ['FMOZONE']
#fe = []
fekeys = []
for effect in fe:
    fes = pd.get_dummies(data[effect],drop_first=True)
    fekeys = fekeys+ list(fes.columns)
    data = pd.concat((data, fes), axis=1)

bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys 

reg1 = data.copy()[bid_key + lmilk + covariates + hist]
reg1 = reg1.dropna()
reg1.to_csv('data/clean_milk1.csv')

In [3]:
sm.OLS(reg1['LWW'],sm.add_constant(reg1[covariates+hist]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.211
Model:,OLS,Adj. R-squared:,0.207
Method:,Least Squares,F-statistic:,51.39
Date:,"Sun, 12 May 2019",Prob (F-statistic):,1.18e-171
Time:,15:00:10,Log-Likelihood:,4134.1
No. Observations:,3665,AIC:,-8228.0
Df Residuals:,3645,BIC:,-8104.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.6805,0.097,-17.383,0.000,-1.870,-1.491
LFMO,0.1600,0.026,6.066,0.000,0.108,0.212
LGAS,0.0102,0.004,2.496,0.013,0.002,0.018
LPOPUL,0.0210,0.003,6.652,0.000,0.015,0.027
LQWW,0.0016,0.002,0.782,0.434,-0.002,0.006
LMEALS,-0.0207,0.003,-6.783,0.000,-0.027,-0.015
COOLER,0.0186,0.003,6.338,0.000,0.013,0.024
ESC,-0.0190,0.003,-6.899,0.000,-0.024,-0.014
1A,-0.0150,0.019,-0.799,0.424,-0.052,0.022

0,1,2,3
Omnibus:,325.97,Durbin-Watson:,1.585
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1517.025
Skew:,0.307,Prob(JB):,0.0
Kurtosis:,6.092,Cond. No.,1540.0


In [4]:
sm.OLS(reg1['LWW'],sm.add_constant(reg1[covariates]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.182
Method:,Least Squares,F-statistic:,68.99
Date:,"Sun, 12 May 2019",Prob (F-statistic):,3.54e-152
Time:,15:00:10,Log-Likelihood:,4073.6
No. Observations:,3665,AIC:,-8121.0
Df Residuals:,3652,BIC:,-8040.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2691,0.072,-31.461,0.000,-2.410,-2.128
LFMO,0.2099,0.026,8.007,0.000,0.159,0.261
LGAS,0.0092,0.004,2.239,0.025,0.001,0.017
LPOPUL,0.0255,0.003,8.078,0.000,0.019,0.032
LQWW,-0.0001,0.002,-0.059,0.953,-0.004,0.004
LMEALS,-0.0227,0.003,-7.388,0.000,-0.029,-0.017
COOLER,0.0178,0.003,5.999,0.000,0.012,0.024
ESC,-0.0198,0.003,-7.062,0.000,-0.025,-0.014
1A,-0.0246,0.019,-1.296,0.195,-0.062,0.013

0,1,2,3
Omnibus:,285.809,Durbin-Watson:,1.513
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1336.635
Skew:,0.22,Prob(JB):,5.670000000000001e-291
Kurtosis:,5.926,Cond. No.,1140.0


In [5]:
reg2 = pd.read_csv('data/clean_milk1.csv')

def add_fe(df, dfvars,groups):
    #apply the within transformation
    df = df.copy()
    for var in dfvars: #,
        df[var+'_fe'] = df[var]
        for fe in groups:
            dum = pd.get_dummies(data[fe])
            means = np.tile(data[var],len(dum.columns)).reshape(len(dum.columns),data.shape[0])
            means = np.array(means.transpose()*dum)
            means = means.sum(axis=0)/dum.sum(axis=0)
            means = np.tile(means,data.shape[0]).reshape(data.shape[0],len(dum.columns))
            means = (means*dum).sum(axis=1)
            df[var+'_fe'] = df[var+'_fe'] - means
    return df, [var+'_fe' for var in dfvars]


reg2, fenames = add_fe(reg1, lmilk + lcts + dummies + hist, ['COUNTY'])
reg2 = reg2.copy()[bid_key + fenames]
reg2.to_csv('data/clean_milk2.csv')
print fenames

['LWW_fe', 'LFMO_fe', 'LGAS_fe', 'LPOPUL_fe', 'LQWW_fe', 'LMEALS_fe', 'COOLER_fe', 'ESC_fe', 'INC_fe', 'LWW_min1_fe', 'LWW_min2_fe', 'LWW_min3_fe', 'LWW_max1_fe', 'LWW_max2_fe', 'LWW_max3_fe']


In [6]:
sm.OLS(reg2['LWW_fe'],sm.add_constant(reg2[fenames[1:]]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW_fe,R-squared:,0.072
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,20.12
Date:,"Sun, 12 May 2019",Prob (F-statistic):,8.83e-50
Time:,15:00:11,Log-Likelihood:,4232.5
No. Observations:,3665,AIC:,-8435.0
Df Residuals:,3650,BIC:,-8342.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5037,0.053,9.458,0.000,0.399,0.608
LFMO_fe,0.1712,0.026,6.616,0.000,0.120,0.222
LGAS_fe,0.0153,0.004,3.882,0.000,0.008,0.023
LPOPUL_fe,0.0075,0.002,3.422,0.001,0.003,0.012
LQWW_fe,-0.0022,0.002,-1.039,0.299,-0.006,0.002
LMEALS_fe,-0.0052,0.001,-4.623,0.000,-0.007,-0.003
COOLER_fe,0.0109,0.003,3.279,0.001,0.004,0.017
ESC_fe,-0.0231,0.003,-8.305,0.000,-0.029,-0.018
INC_fe,0.0176,0.007,2.467,0.014,0.004,0.032

0,1,2,3
Omnibus:,335.773,Durbin-Watson:,1.605
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1384.016
Skew:,0.372,Prob(JB):,2.92e-301
Kurtosis:,5.917,Cond. No.,594.0
