In [22]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [23]:
# Load the raw bid file. `rawdata` is never modified, so the filters below can
# be re-run without re-reading the CSV.
rawdata = pd.read_csv('data/milk.csv')

# General housekeeping: build each row filter once, then apply them together.
from_1980 = rawdata['YEAR'] >= 1980
april_to_sept = (rawdata['MONTH'] >= 4) & (rawdata['MONTH'] <= 9)
has_day = rawdata['DAY'] != 0  # need data with a usable time index
has_scores = (~np.isnan(rawdata['SCORE'])) & (rawdata['QSCORE'] != 0)  # need SCORE and a non-zero QSCORE

keep = from_1980 & april_to_sept & has_day & has_scores
data = rawdata[keep].sort_values(['YEAR', 'MONTH', 'DAY'])

# Missing COOLER / ESC flags are treated as 0.
for flag in ('COOLER', 'ESC'):
    data[flag] = data[flag].fillna(0)

# 1.0 when NUM equals 1, else 0.0.
single_bidder = data['NUM'] == 1
data['ONEBID'] = 1. * single_bidder

# Column groups reused by the rest of the notebook.
# Earlier alternative, currently disabled: milk = ['WW','LFW','WC','LFC']
milk = ['SCORE']
auct_key = ['YEAR', 'MONTH', 'DAY', 'SYSTEM', 'FMOZONE']
cts = ['FMO', 'GAS', 'POPUL', 'QSCORE']
dummies = ['COOLER', 'ESC', 'ONEBID', 'NUM']  # delete num?

# Baseline logs: every continuous regressor and every outcome gets an
# 'L'-prefixed natural-log column (continuous regressors first, then outcomes).
lmilk = ['L' + col for col in milk]
lcts = ['L' + col for col in cts]
for raw_col, log_col in zip(cts + milk, lcts + lmilk):
    data[log_col] = np.log(data[raw_col])


# Set up lags.
# `lags` lagged auctions are attached to the bid data; `lagkeys` names only the
# first `maxlag` of them, `alllagkeys` names all of them.
lags = 10
maxlag = 5
lag_stems = ['LSCORE_min', 'LSCORE_max']
lagkeys = [stem + str(i) for stem in lag_stems for i in range(1, 1 + maxlag)]
alllagkeys = [stem + str(i) for stem in lag_stems for i in range(1, 1 + lags)]

# One row per auction, in the sorted key order that groupby produces.
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]

# Per-auction minimum and maximum of the log outcome(s). Only the `lmilk`
# columns are aggregated: nothing else is used below, and this avoids taking
# min/max over unrelated (e.g. string) columns.
per_auction = data.groupby(auct_key, as_index=False)[lmilk]
min_lag = per_auction.min()
max_lag = per_auction.max()

# Shift the per-auction statistics down by t rows, so each auction sees the
# value of the auction t positions earlier in key order, and name the result
# '<outcome>_min<t>' / '<outcome>_max<t>'. `aucts`, `min_lag` and `max_lag`
# come from the same groupby keys, so their rows line up positionally.
# NOTE(review): "earlier" means earlier in (YEAR, MONTH, DAY, SYSTEM, FMOZONE)
# order, which can be a different SYSTEM/FMOZONE on the same date - confirm
# this is the intended notion of a lag.
lag_frames = [aucts]
for label, stat in (('min', min_lag), ('max', max_lag)):
    for t in range(1, 1 + lags):
        lag_frames.append(stat[lmilk].shift(t).add_suffix('_%s%s' % (label, t)))
auct_lags = pd.concat(lag_frames, axis=1)

# A single left merge attaches every lag column to every bid of the auction
# (all min lags first, then all max lags), instead of one merge per lag.
data = pd.merge(data, auct_lags, how='left', on=auct_key)

    
# Columns whose levels enter the regressions as fixed-effect dummies.
fe = ['FMOZONE']
#fe = ['COUNTY']
#fe = []

# Cheat and make fmozone 1a = 1: fold zone '1A' into zone '1' before the
# dummies are built.
data.loc[data['FMOZONE'] == '1A', 'FMOZONE'] = '1'

# One 0/1 column per level of each fixed effect; the first level is dropped as
# the reference category. The dummy columns are named after the level values
# and collected in `fekeys`.
fekeys = []
for effect in fe:
    # dtype is pinned to uint8 (the historic get_dummies default). Newer pandas
    # returns bool columns, which turn the mixed float/bool design matrix into
    # an object array that statsmodels' OLS rejects.
    fes = pd.get_dummies(data[effect], drop_first=True, dtype=np.uint8)
    fekeys = fekeys + list(fes.columns)
    data = pd.concat((data, fes), axis=1)

# Column groups for the exported regression samples.
bid_key = auct_key + ['VENDOR', 'COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys
histall = ['INC'] + alllagkeys

# Sample 1: first `maxlag` lags plus the WIN column; rows with any missing
# value among the selected columns are dropped before writing.
reg1_columns = bid_key + lmilk + covariates + hist + ['WIN']
reg1 = data[reg1_columns].dropna()
reg1.to_csv('data/clean_milk1.csv')

# Sample 2: every lag, no WIN column; same complete-case rule.
reg2_columns = bid_key + lmilk + covariates + histall
reg2 = data[reg2_columns].dropna()
reg2.to_csv('data/clean_milk2.csv')

In [24]:
# OLS of LSCORE on a constant, the covariates and the history regressors in `hist`.
design_with_hist = sm.add_constant(reg1[covariates + hist])
sm.OLS(reg1['LSCORE'], design_with_hist).fit().summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.208
Model:,OLS,Adj. R-squared:,0.203
Method:,Least Squares,F-statistic:,45.85
Date:,"Thu, 27 Jun 2019",Prob (F-statistic):,1.39e-183
Time:,16:27:16,Log-Likelihood:,4220.4
No. Observations:,4040,AIC:,-8393.0
Df Residuals:,4016,BIC:,-8242.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.3424,0.110,-12.194,0.000,-1.558,-1.127
LFMO,0.1247,0.028,4.378,0.000,0.069,0.181
LGAS,0.0221,0.004,5.231,0.000,0.014,0.030
LPOPUL,0.0091,0.003,2.808,0.005,0.003,0.015
LQSCORE,-0.0160,0.003,-4.735,0.000,-0.023,-0.009
COOLER,0.0174,0.003,5.723,0.000,0.011,0.023
ESC,-0.0257,0.003,-9.042,0.000,-0.031,-0.020
ONEBID,0.0081,0.006,1.280,0.201,-0.004,0.020
NUM,0.0066,0.002,4.232,0.000,0.004,0.010

0,1,2,3
Omnibus:,310.532,Durbin-Watson:,1.515
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1418.001
Skew:,0.225,Prob(JB):,1.2199999999999997e-308
Kurtosis:,5.867,Cond. No.,1450.0


In [25]:
# Same sample and dependent variable, but with the covariates only (no `hist` columns).
design_covariates = sm.add_constant(reg1[covariates])
sm.OLS(reg1['LSCORE'], design_covariates).fit().summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.166
Model:,OLS,Adj. R-squared:,0.163
Method:,Least Squares,F-statistic:,66.61
Date:,"Thu, 27 Jun 2019",Prob (F-statistic):,1.6300000000000002e-148
Time:,16:27:16,Log-Likelihood:,4115.2
No. Observations:,4040,AIC:,-8204.0
Df Residuals:,4027,BIC:,-8122.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2861,0.077,-29.625,0.000,-2.437,-2.135
LFMO,0.2101,0.028,7.430,0.000,0.155,0.266
LGAS,0.0237,0.004,5.561,0.000,0.015,0.032
LPOPUL,0.0151,0.003,4.625,0.000,0.009,0.022
LQSCORE,-0.0200,0.003,-5.822,0.000,-0.027,-0.013
COOLER,0.0185,0.003,5.954,0.000,0.012,0.025
ESC,-0.0273,0.003,-9.398,0.000,-0.033,-0.022
ONEBID,0.0181,0.006,3.107,0.002,0.007,0.030
NUM,0.0078,0.002,4.915,0.000,0.005,0.011

0,1,2,3
Omnibus:,274.708,Durbin-Watson:,1.416
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1276.677
Skew:,0.118,Prob(JB):,5.93e-278
Kurtosis:,5.744,Cond. No.,972.0


In [20]:
def add_fe(df, dfvars, fe):
    """Apply the within (fixed-effects) transformation ahead of time.

    Every column named in `dfvars` is replaced by its deviation from the mean
    of its `fe` group. These are the residuals of regressing the column on a
    full set of `fe` dummies, obtained by subtracting group means rather than
    fitting one dummy-variable regression per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, never modified.
    dfvars : iterable of str
        Names of the columns to demean.
    fe : str
        Name of the column that defines the groups.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The transformed copy of `df` and the list of transformed column names
        (same names, same order as `dfvars`).

    Notes
    -----
    Rows with a missing `fe` value belong to no group; drop them before
    calling.
    """
    df = df.copy()
    dfvars = list(dfvars)
    by_group = df.groupby(fe)
    for var in dfvars:
        # transform('mean') returns the group mean aligned row-by-row with df
        df[var] = df[var] - by_group[var].transform('mean')
    return df, dfvars

# Reload the first cleaned sample and sweep COUNTY fixed effects out of the
# outcome, the continuous regressors, the dummies and the history columns.
reg1 = pd.read_csv('data/clean_milk1.csv')
within_vars = lmilk + lcts + dummies + hist
reg2, fenames = add_fe(reg1, within_vars, 'COUNTY')  # add_fe works on its own copy

# Keep only the bid identifiers and the transformed columns, then export.
# NOTE(review): this rewrites data/clean_milk2.csv, the same path the cleaning
# cell writes its all-lags sample to - confirm which content downstream code
# expects.
reg2 = reg2[bid_key + fenames]
reg2.to_csv('data/clean_milk2.csv')
print(fenames)

['LSCORE', 'LFMO', 'LGAS', 'LPOPUL', 'LQSCORE', 'COOLER', 'ESC', 'ONEBID', 'NUM', 'INC', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_min5', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4', 'LSCORE_max5']


In [None]:
# OLS on the within-transformed data (no constant). The dependent variable is
# 'LSCORE': reg2 only holds bid_key + fenames, so the former 'LWW' lookup
# raised a KeyError.
# NOTE(review): with the 20 names currently in `fenames`, the [1:-9] slice
# keeps 'LFMO' ... 'NUM', 'INC' and 'LSCORE_min1' and drops the other nine lag
# columns - confirm this is the intended regressor set.
sm.OLS(reg2['LSCORE'], reg2[fenames[1:-9]]).fit().summary()

In [None]:
# OLS on the within-transformed data (no constant), using every transformed
# column except the dependent variable as a regressor. The dependent variable
# is 'LSCORE' (fenames[0]); reg2 has no 'LWW' column, so the former lookup
# raised a KeyError.
sm.OLS(reg2['LSCORE'], reg2[fenames[1:]]).fit().summary()