In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [5]:
rawdata = pd.read_csv('data/milk.csv')
data = rawdata.copy()

#general house keeping
data = data[ (data['YEAR']>=1980)]
data = data[(data['MONTH'] !=0) & (data['DAY'] !=0) ] #need data with time index
data = data[(~np.isnan(data['WW']) ) & (~np.isnan(data['QWW']) )] #need data with QWW and WW

data = data.sort_values(['YEAR','MONTH','DAY'])
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)

#various keys
#milk =  ['WW','LFW','WC','LFC']
milk = ['WW']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
cts = ['FMO','GAS','POPUL','QWW','MEALS']
dummies = ['COOLER','ESC']


#baseline stuff/logs
lmilk = ['L'+x for x in milk]
lcts = ['L'+x for x in cts]
data[lcts] = np.log(data[cts])
data[lmilk] = np.log(data[milk])


#set up lags
lags = 4
lagkeys = [l+str(i) for l in ['LWW_min','LWW_max'] for i in range(1,1+lags)]
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]

#note data is already sorted by date
min_lag = data.groupby(auct_key, as_index=False).min()
for t in range(1,1+lags):
    min_lagt = min_lag.shift(t)[lmilk]
    min_lagt = pd.concat((aucts, min_lagt), axis=1)
    data = pd.merge(data, min_lagt, how='left', on=auct_key, suffixes=('', '_min%s'%(t)) ) 
    
max_lag = data.groupby(auct_key, as_index=False).max()
for t in range(1,1+lags):
    max_lagt = max_lag.shift(t)[lmilk]
    max_lagt = pd.concat((aucts, max_lagt), axis=1)
    data = pd.merge(data, max_lagt, how='left', on=auct_key, suffixes=('', '_max%s'%(t)) )    

    
fe = ['FMOZONE']
#fe = []
fekeys = []
for effect in fe:
    fes = pd.get_dummies(data[effect],drop_first=True)
    fekeys = fekeys+ list(fes.columns)
    data = pd.concat((data, fes), axis=1)

bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys 

reg1 = data.copy()[bid_key + lmilk + covariates + hist]
reg1 = reg1.dropna()
reg1.to_csv('data/clean_milk1.csv')

In [6]:
sm.OLS(reg1['LWW'],sm.add_constant(reg1[covariates+hist]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.212
Model:,OLS,Adj. R-squared:,0.208
Method:,Least Squares,F-statistic:,46.77
Date:,"Mon, 13 May 2019",Prob (F-statistic):,5.36e-171
Time:,09:11:50,Log-Likelihood:,4132.4
No. Observations:,3662,AIC:,-8221.0
Df Residuals:,3640,BIC:,-8084.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.6008,0.103,-15.599,0.000,-1.802,-1.400
LFMO,0.1529,0.027,5.756,0.000,0.101,0.205
LGAS,0.0101,0.004,2.479,0.013,0.002,0.018
LPOPUL,0.0206,0.003,6.541,0.000,0.014,0.027
LQWW,0.0015,0.002,0.745,0.456,-0.002,0.006
LMEALS,-0.0204,0.003,-6.692,0.000,-0.026,-0.014
COOLER,0.0185,0.003,6.287,0.000,0.013,0.024
ESC,-0.0190,0.003,-6.889,0.000,-0.024,-0.014
1A,-0.0147,0.019,-0.787,0.432,-0.051,0.022

0,1,2,3
Omnibus:,327.298,Durbin-Watson:,1.588
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1530.981
Skew:,0.307,Prob(JB):,0.0
Kurtosis:,6.107,Cond. No.,1640.0


In [7]:
sm.OLS(reg1['LWW'],sm.add_constant(reg1[covariates]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW,R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.182
Method:,Least Squares,F-statistic:,68.95
Date:,"Mon, 13 May 2019",Prob (F-statistic):,4.42e-152
Time:,09:11:50,Log-Likelihood:,4069.2
No. Observations:,3662,AIC:,-8112.0
Df Residuals:,3649,BIC:,-8032.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2699,0.072,-31.458,0.000,-2.411,-2.128
LFMO,0.2103,0.026,8.016,0.000,0.159,0.262
LGAS,0.0091,0.004,2.230,0.026,0.001,0.017
LPOPUL,0.0255,0.003,8.083,0.000,0.019,0.032
LQWW,-0.0002,0.002,-0.081,0.936,-0.004,0.004
LMEALS,-0.0227,0.003,-7.381,0.000,-0.029,-0.017
COOLER,0.0178,0.003,6.014,0.000,0.012,0.024
ESC,-0.0198,0.003,-7.080,0.000,-0.025,-0.014
1A,-0.0246,0.019,-1.297,0.195,-0.062,0.013

0,1,2,3
Omnibus:,285.558,Durbin-Watson:,1.513
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1334.239
Skew:,0.22,Prob(JB):,1.88e-290
Kurtosis:,5.924,Cond. No.,1140.0


In [8]:
reg2 = pd.read_csv('data/clean_milk1.csv')

def add_fe(df, dfvars,groups):
    #apply the within transformation
    df = df.copy()
    for var in dfvars: #,
        df[var+'_fe'] = df[var]
        for fe in groups:
            dum = pd.get_dummies(data[fe])
            means = np.tile(data[var],len(dum.columns)).reshape(len(dum.columns),data.shape[0])
            means = np.array(means.transpose()*dum)
            means = means.sum(axis=0)/dum.sum(axis=0)
            means = np.tile(means,data.shape[0]).reshape(data.shape[0],len(dum.columns))
            means = (means*dum).sum(axis=1)
            df[var+'_fe'] = df[var+'_fe'] - means
    return df, [var+'_fe' for var in dfvars]


reg2, fenames = add_fe(reg1, lmilk + lcts + dummies + hist, ['COUNTY'])
reg2 = reg2.copy()[bid_key + fenames]
reg2.to_csv('data/clean_milk2.csv')
print fenames

['LWW_fe', 'LFMO_fe', 'LGAS_fe', 'LPOPUL_fe', 'LQWW_fe', 'LMEALS_fe', 'COOLER_fe', 'ESC_fe', 'INC_fe', 'LWW_min1_fe', 'LWW_min2_fe', 'LWW_min3_fe', 'LWW_min4_fe', 'LWW_max1_fe', 'LWW_max2_fe', 'LWW_max3_fe', 'LWW_max4_fe']


In [9]:
sm.OLS(reg2['LWW_fe'],sm.add_constant(reg2[fenames[1:]]) ).fit().summary()

0,1,2,3
Dep. Variable:,LWW_fe,R-squared:,0.074
Model:,OLS,Adj. R-squared:,0.07
Method:,Least Squares,F-statistic:,18.22
Date:,"Mon, 13 May 2019",Prob (F-statistic):,2.19e-50
Time:,09:11:50,Log-Likelihood:,4232.5
No. Observations:,3662,AIC:,-8431.0
Df Residuals:,3645,BIC:,-8325.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5800,0.059,9.808,0.000,0.464,0.696
LFMO_fe,0.1623,0.026,6.232,0.000,0.111,0.213
LGAS_fe,0.0150,0.004,3.813,0.000,0.007,0.023
LPOPUL_fe,0.0077,0.002,3.477,0.001,0.003,0.012
LQWW_fe,-0.0022,0.002,-1.049,0.294,-0.006,0.002
LMEALS_fe,-0.0054,0.001,-4.833,0.000,-0.008,-0.003
COOLER_fe,0.0106,0.003,3.183,0.001,0.004,0.017
ESC_fe,-0.0231,0.003,-8.273,0.000,-0.029,-0.018
INC_fe,0.0179,0.007,2.516,0.012,0.004,0.032

0,1,2,3
Omnibus:,336.73,Durbin-Watson:,1.61
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1410.473
Skew:,0.368,Prob(JB):,5.2399999999999994e-307
Kurtosis:,5.95,Cond. No.,668.0
