In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [8]:
rawdata = pd.read_csv('data/milk.csv')
data = rawdata.copy()

# ---- sample restrictions -------------------------------------------------
data = data[(data['YEAR'] >= 1980)]
# need data with a usable time index: months 4..9 and a non-zero DAY
data = data[(data['MONTH'] >= 4) & (data['MONTH'] <= 9) & (data['DAY'] != 0)]
# need rows with a non-missing SCORE and a non-zero QSCORE (both are logged below)
data = data[(~np.isnan(data['SCORE'])) & (data['QSCORE'] != 0)]

# missing COOLER / ESC flags are treated as 0
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)
data['ONEBID'] = 1.*(data['NUM'] == 1)

# ---- column groups --------------------------------------------------------
milk = ['SCORE']
auct_key = ['YEAR', 'MONTH', 'DAY', 'SYSTEM', 'FMOZONE']
cts = ['FMO', 'GAS', 'POPUL', 'QSCORE']
dummies = ['COOLER', 'ESC', 'NUM']  # delete num?

# ---- logs: each logged column is named 'L' + original name ----------------
lmilk = ['L' + x for x in milk]
lcts = ['L' + x for x in cts]
data[lcts] = np.log(data[cts])
data[lmilk] = np.log(data[milk])

# ---- lagged per-auction min / max of LSCORE -------------------------------
lags = 10
lagkeys = [l + str(i) for l in ['LSCORE_min', 'LSCORE_max'] for i in range(1, 1 + lags)]
# one row per auction (unique auct_key combination), key columns only
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]


def _merge_lags(frame, per_auction, tag, n_lags, within=None):
    """Left-merge lagged per-auction LSCORE aggregates onto ``frame``.

    For t = 1..n_lags the rows of ``per_auction`` are shifted by t (inside each
    ``within`` group when given, otherwise over the whole table), re-attached
    to the auction keys in ``aucts`` by index, and merged onto ``frame`` on
    ``auct_key``. The new column is named ``LSCORE_<tag><t>``.

    Parameters
    ----------
    frame : DataFrame
        Bid-level data carrying the ``auct_key`` columns and ``LSCORE``.
    per_auction : DataFrame
        One row per auction with the aggregated ``LSCORE``; its index must
        match the index of ``aucts`` (row order may differ).
    tag : str
        Suffix stem for the new columns, e.g. ``'min'`` or ``'maxy'``.
    n_lags : int
        Number of lags to attach.
    within : list of str, optional
        Columns to group by before shifting.

    Returns
    -------
    DataFrame
        ``frame`` with ``n_lags`` additional columns.
    """
    for t in range(1, 1 + n_lags):
        source = per_auction if within is None else per_auction.groupby(within)
        lagged = source.shift(t)[lmilk]
        # concat aligns on the index, so a re-sorted per_auction still lines up
        lagged = pd.concat((aucts, lagged), axis=1)
        frame = pd.merge(frame, lagged, how='left', on=auct_key,
                         suffixes=('', '_%s%s' % (tag, t)))
    return frame


# sort chronologically; groupby below orders auctions by auct_key
data = data.sort_values(['YEAR', 'MONTH', 'DAY'])
# only LSCORE is needed from the aggregates, so restrict the reduction to it
min_lag = data.groupby(auct_key, as_index=False)[lmilk].min()
data = _merge_lags(data, min_lag, 'min', lags)

max_lag = data.groupby(auct_key, as_index=False)[lmilk].max()
data = _merge_lags(data, max_lag, 'max', lags)


# ---- lags within SYSTEM ("prev year" lags) --------------------------------
yearlags = 1
year_lagkeys = [l + str(i) for l in ['LSCORE_miny', 'LSCORE_maxy'] for i in range(1, 1 + yearlags)]

# data is now ordered by SYSTEM first, then date
data = data.sort_values(['SYSTEM', 'YEAR', 'MONTH', 'DAY'])

# NOTE: these loops run over `lags`, not `yearlags`; limitedlagy below needs
# LSCORE_miny1..3 / LSCORE_maxy1..3, which yearlags = 1 would not provide.
year_min_lag = (data.groupby(auct_key, as_index=False)[lmilk].min()
                .sort_values(['SYSTEM', 'YEAR', 'MONTH', 'DAY']))
data = _merge_lags(data, year_min_lag, 'miny', lags, within=['SYSTEM'])

year_max_lag = (data.groupby(auct_key, as_index=False)[lmilk].max()
                .sort_values(['SYSTEM', 'YEAR', 'MONTH', 'DAY']))
data = _merge_lags(data, year_max_lag, 'maxy', lags, within=['SYSTEM'])


# ---- FMOZONE fixed-effect dummies -----------------------------------------
fe = ['FMOZONE']
# fold zone '1A' into zone '1' before building dummies
data.loc[(data['FMOZONE'] == '1A'), 'FMOZONE'] = '1'

fekeys = []
for effect in fe:
    # dtype=float: pandas >= 2.0 defaults to bool dummies, and a bool/float mix
    # turns the design matrix into object dtype, which sm.OLS refuses.
    fes = pd.get_dummies(data[effect], drop_first=True, dtype=float)
    fekeys = fekeys + list(fes.columns)
    data = pd.concat((data, fes), axis=1)


# ---- regression samples ---------------------------------------------------
bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys

# only drop data 5 periods back
maxlag = 5
limitedlag = [l + str(i) for l in ['LSCORE_min', 'LSCORE_max'] for i in range(1, 1 + maxlag)]

maxlagy = 3
limitedlagy = [l + str(i) for l in ['LSCORE_miny', 'LSCORE_maxy'] for i in range(1, 1 + maxlagy)]

limitedhist = ['INC'] + limitedlag + limitedlagy

# sample 1: rows complete on the limited history (5 lags + 3 within-SYSTEM lags)
reg1 = data.copy()[bid_key + lmilk + covariates + limitedhist + ['WIN']]
reg1 = reg1.dropna()
reg1.to_csv('data/clean_milk1.csv')

# sample 2: rows complete on the full 10-lag history
reg2 = data.copy()[bid_key + lmilk + covariates + hist + ['WIN']]
reg2 = reg2.dropna()
reg2.to_csv('data/clean_milk2.csv')

In [9]:
# Log score regressed on the covariates plus the limited bid-history lags, with an intercept.
hist_design = sm.add_constant(reg1[covariates + limitedhist])
hist_fit = sm.OLS(reg1['LSCORE'], hist_design).fit()
hist_fit.summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.424
Model:,OLS,Adj. R-squared:,0.418
Method:,Least Squares,F-statistic:,70.01
Date:,"Mon, 01 Jul 2019",Prob (F-statistic):,8.1e-294
Time:,19:17:33,Log-Likelihood:,3197.2
No. Observations:,2696,AIC:,-6336.0
Df Residuals:,2667,BIC:,-6165.0
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.8339,0.121,-6.900,0.000,-1.071,-0.597
LFMO,0.3279,0.029,11.187,0.000,0.270,0.385
LGAS,-0.0266,0.005,-4.841,0.000,-0.037,-0.016
LPOPUL,0.0110,0.004,2.780,0.005,0.003,0.019
LQSCORE,-0.0169,0.004,-4.103,0.000,-0.025,-0.009
COOLER,0.0011,0.003,0.326,0.745,-0.005,0.008
ESC,-0.0210,0.003,-6.909,0.000,-0.027,-0.015
NUM,0.0069,0.002,4.383,0.000,0.004,0.010
3,-0.0306,0.005,-6.051,0.000,-0.040,-0.021

0,1,2,3
Omnibus:,303.828,Durbin-Watson:,1.765
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1808.965
Skew:,0.355,Prob(JB):,0.0
Kurtosis:,6.95,Cond. No.,1560.0


In [10]:
# Log score regressed on the covariates only (no bid-history lags), with an intercept.
base_design = sm.add_constant(reg1[covariates])
base_fit = sm.OLS(reg1['LSCORE'], base_design).fit()
base_fit.summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.178
Model:,OLS,Adj. R-squared:,0.175
Method:,Least Squares,F-statistic:,52.89
Date:,"Mon, 01 Jul 2019",Prob (F-statistic):,4.549999999999999e-106
Time:,19:17:33,Log-Likelihood:,2718.9
No. Observations:,2696,AIC:,-5414.0
Df Residuals:,2684,BIC:,-5343.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.1723,0.091,-23.862,0.000,-2.351,-1.994
LFMO,0.2134,0.032,6.670,0.000,0.151,0.276
LGAS,-0.0097,0.006,-1.535,0.125,-0.022,0.003
LPOPUL,0.0196,0.005,4.230,0.000,0.010,0.029
LQSCORE,-0.0258,0.005,-5.333,0.000,-0.035,-0.016
COOLER,0.0181,0.004,4.711,0.000,0.011,0.026
ESC,-0.0300,0.004,-8.405,0.000,-0.037,-0.023
NUM,0.0092,0.002,5.085,0.000,0.006,0.013
3,-0.0688,0.006,-12.167,0.000,-0.080,-0.058

0,1,2,3
Omnibus:,239.397,Durbin-Watson:,1.273
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1270.385
Skew:,0.235,Prob(JB):,1.3800000000000001e-276
Kurtosis:,6.33,Cond. No.,937.0


In [20]:
def add_fe(df, dfvars, fe):
    """Apply the within (fixed-effects) transformation ahead of time.

    Every column named in ``dfvars`` is replaced by its residual from a
    least-squares regression (no intercept) on the dummy variables built from
    ``df[fe]``. All columns are residualised with a single least-squares
    solve against the shared dummy matrix instead of one model fit per column.

    Parameters
    ----------
    df : DataFrame
        Input data; it is copied, never modified.
    dfvars : iterable of str
        Columns to residualise.
    fe : str or list of str
        Column(s) passed to ``pd.get_dummies`` to build the effect dummies.

    Returns
    -------
    (DataFrame, list of str)
        The transformed copy and the list of transformed column names
        (same names and order as ``dfvars``).
    """
    df = df.copy()
    names = list(dfvars)
    # residualise each distinct column once, keeping first-seen order
    targets = list(dict.fromkeys(names))
    if targets:
        # float dummies: pandas >= 2.0 would otherwise return bool columns
        dum = np.asarray(pd.get_dummies(df[fe], dtype=float), dtype=float)
        values = np.asarray(df[targets], dtype=float)
        # one solve for all columns; fitted values are the projection on the dummies
        coefs = np.linalg.lstsq(dum, values, rcond=None)[0]
        df[targets] = values - dum.dot(coefs)
    return df, names

# Reload the limited-history sample and residualise it on COUNTY effects.
reg1 = pd.read_csv('data/clean_milk1.csv')
# clean_milk1.csv was written with lags 1..maxlag only, so the full `hist`
# list (lags 1..10) names columns that are not in the file and raises KeyError.
fe_hist = ['INC'] + limitedlag
reg2, fenames = add_fe(reg1, lmilk + lcts + dummies + fe_hist, 'COUNTY')
reg2 = reg2[bid_key + fenames].copy()
# NOTE: this overwrites the data/clean_milk2.csv written by the data-prep cell.
reg2.to_csv('data/clean_milk2.csv')
print(fenames)

['LSCORE', 'LFMO', 'LGAS', 'LPOPUL', 'LQSCORE', 'COOLER', 'ESC', 'ONEBID', 'NUM', 'INC', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_min5', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4', 'LSCORE_max5']


In [None]:
# reg2 holds bid_key + fenames, so the dependent variable is 'LSCORE' ('LWW' is not a column).
# NOTE(review): confirm the [1:-9] slice still selects the intended regressors.
sm.OLS(reg2['LSCORE'], reg2[fenames[1:-9]]).fit().summary()

In [None]:
# reg2 holds bid_key + fenames, so the dependent variable is 'LSCORE' ('LWW' is not a column);
# fenames[1:] is every residualised column except the dependent variable.
sm.OLS(reg2['LSCORE'], reg2[fenames[1:]]).fit().summary()