In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
# Load the raw bid-level data; work on a copy so `rawdata` stays untouched.
rawdata = pd.read_csv('data/milk.csv')
data = rawdata.copy()

#lag auctions within 1 vendor
# Only the distinct key combinations are needed here, so de-duplicate the key
# columns directly instead of averaging every column of the frame (a
# frame-wide .mean() raises TypeError on string columns in pandas >= 2.0).
# dropna() mirrors groupby, which discards rows with a missing key, and the
# preceding sort_values leaves the distinct keys in ascending key order.
vendor_auct_cols = ['VENDOR','YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(vendor_auct_cols)
bids = (data[vendor_auct_cols]
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True))
# VENDOR_LAG: SYSTEM of the preceding row of `bids` for the same VENDOR
# (NaN on each vendor's first row).
bids['VENDOR_LAG'] = bids.groupby('VENDOR')['SYSTEM'].shift(1)
data = pd.merge(data, bids, how='left',
                on=vendor_auct_cols, suffixes=('', '_LAG'))

#lag auctions
# Same idea across all vendors: one row per (date, system) key, and AUCT_LAG
# is the SYSTEM of the row immediately before it in that ordering.
auct_cols = ['YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(auct_cols)
aucts = (data[auct_cols]
         .dropna()
         .drop_duplicates()
         .reset_index(drop=True))
aucts['AUCT_LAG'] = aucts['SYSTEM'].shift(1)
data = pd.merge(data, aucts, how='left',
                on=auct_cols, suffixes=('', '_LAG'))

# PAST_AUCT is 1.0 where AUCT_LAG equals VENDOR_LAG and 0.0 otherwise
# (a NaN on either side never compares equal, so those rows get 0.0).
prev_auct = 1.*(data['AUCT_LAG'] == data['VENDOR_LAG'])
data['PAST_AUCT'] = prev_auct

In [3]:
#general house keeping
# Row filter: YEAR from 1980 on, MONTH in 4..9 with a non-zero DAY, a
# non-missing SCORE and a QSCORE different from 0.
# NOTE(review): rows whose QSCORE is NaN pass the `!= 0` test - confirm intended.
keep_rows = (
    (data['YEAR'] >= 1980)
    & (data['MONTH'] >= 4) & (data['MONTH'] <= 9) & (data['DAY'] != 0)  #need data with time index
    & (data['SCORE'].notna()) & (data['QSCORE'] != 0)  #need data with QWW and WW
)
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not raise SettingWithCopyWarning.
data = data[keep_rows].copy()

# Missing COOLER / ESC entries are treated as 0.
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)
# ONEBID is 1.0 where NUM equals 1, else 0.0.
data['ONEBID'] = 1.*(data['NUM'] == 1)

#various keys
milk = ['SCORE']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
cts = ['FMO','GAS','POPUL','QSCORE']
dummies = ['COOLER','ESC', 'NUM'] #delete num?


#baseline stuff/logs
# Each logged column is named 'L' + the original column name.
lmilk = ['L'+x for x in milk]
lcts = ['L'+x for x in cts]
for raw_col, log_col in zip(cts + milk, lcts + lmilk):
    data[log_col] = np.log(data[raw_col])

#set up lags
lags = 10
lagkeys = [l+str(i) for l in ['LSCORE_min','LSCORE_max'] for i in range(1,1+lags)]

#note data is already sorted by date
data = data.sort_values(['YEAR','MONTH','DAY'])

# Per-auction minimum and maximum of the log-score column(s). Only `lmilk` is
# aggregated: nothing else from these frames is read, and taking min/max of
# every column is wasted work that can fail on non-numeric columns.
scores_by_auct = data.groupby(auct_key, as_index=False)[lmilk]
min_lag = scores_by_auct.min()
max_lag = scores_by_auct.max()

# One row per auction, in the (sorted) order groupby emits the groups.
aucts = min_lag[auct_key].copy()

# Column '<col>_min<t>' / '<col>_max<t>' holds the min / max of the auction
# t rows earlier in that ordering. All lag columns are collected in a single
# table and merged once instead of issuing one merge per lag.
lag_table = aucts.copy()
for stat, per_auct in (('min', min_lag), ('max', max_lag)):
    for t in range(1, 1+lags):
        for col in lmilk:
            lag_table['%s_%s%s' % (col, stat, t)] = per_auct[col].shift(t)

data = pd.merge(data, lag_table, how='left', on=auct_key)

    
#set up lags but prev year
yearlags = 3
year_lagkeys = [l+str(i) for l in ['LSCORE_miny','LSCORE_maxy'] for i in range(1,1+yearlags)]


#note data is sorted by year now
system_order = ['SYSTEM','YEAR','MONTH','DAY']
data = data.sort_values(system_order)

# Per-auction min / max of the log-score column(s); only `lmilk` is aggregated
# because nothing else from these frames is read.
scores_by_auct = data.groupby(auct_key, as_index=False)[lmilk]
auct_min = scores_by_auct.min()
auct_max = scores_by_auct.max()

# Auction keys in groupby order. The sorted frames below keep these index
# labels, so assigning shifted columns back aligns each value with its own
# auction row.
year_lag_table = auct_min[auct_key].copy()

year_min_lag = auct_min.sort_values(system_order)
year_max_lag = auct_max.sort_values(system_order)

# Column '<col>_miny<t>' / '<col>_maxy<t>' holds the min / max of the auction
# t rows earlier within the same SYSTEM. The loop runs over `yearlags` so the
# columns created match `year_lagkeys` exactly (it previously ran over `lags`
# and produced extra columns that were never referenced).
for stat, per_auct in (('miny', year_min_lag), ('maxy', year_max_lag)):
    within_system = per_auct.groupby('SYSTEM')[lmilk]
    for t in range(1, 1+yearlags):
        shifted = within_system.shift(t)
        for col in lmilk:
            year_lag_table['%s_%s%s' % (col, stat, t)] = shifted[col]

data = pd.merge(data, year_lag_table, how='left', on=auct_key)
    

#pre processing to help fmozones
fe = ['FMOZONE']
# Relabel zone '1A' as '1' before building the indicator columns.
data.loc[(data['FMOZONE'] =='1A') , 'FMOZONE'] = '1'

# One 0/1 indicator column per level of each fixed effect, first level
# dropped. The dtype is pinned to uint8 because pandas >= 2.0 otherwise
# returns bool columns, which turn the regressor matrix into an object array
# that statsmodels rejects.
# Column names are the raw level labels (no prefix).
fekeys = []
for effect in fe:
    fes = pd.get_dummies(data[effect], drop_first=True, dtype=np.uint8)
    fekeys = fekeys + list(fes.columns)
    data = pd.concat((data, fes), axis=1)


# Column groups used when assembling the regression frames.
bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys

#setup interaction term for punishment
data['min*past'] = data['LSCORE_min1']*data['PAST_AUCT']

In [4]:
# Export the full-history regression frame: bid keys, log score, covariates,
# every yearly and auction lag, plus INC, PAST_AUCT and the interaction term.
reg0_cols = (
    bid_key
    + lmilk
    + covariates
    + year_lagkeys
    + lagkeys
    + ['INC','PAST_AUCT', 'min*past']
)
reg0 = data[reg0_cols].copy()
reg0.to_csv('data/clean_milk0.csv')

In [5]:
# Restrict the history to the first `maxlag` auction lags and the first
# `maxlagy` per-system lags; dropna() then only discards rows that are missing
# one of these selected columns.
maxlag = 4
maxlagy = 2

limitedlag = [
    '%s%d' % (stem, t)
    for stem in ('LSCORE_min','LSCORE_max')
    for t in range(1, maxlag + 1)
]
limitedlagy = [
    '%s%d' % (stem, t)
    for stem in ('LSCORE_miny','LSCORE_maxy')
    for t in range(1, maxlagy + 1)
]

limitedhist = ['PAST_AUCT','min*past'] + limitedlag + limitedlagy

# Complete-case regression sample, also written to disk.
reg1_cols = bid_key + lmilk + covariates + limitedhist + ['INC','WIN']
reg1 = data[reg1_cols].dropna()
reg1.to_csv('data/clean_milk1.csv')

In [6]:
# Baseline: OLS of LSCORE on a constant and the covariates only.
baseline_X = sm.add_constant(reg1[covariates])
baseline_fit = sm.OLS(reg1['LSCORE'], baseline_X).fit()
baseline_fit.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.166
Model:,OLS,Adj. R-squared:,0.163
Method:,Least Squares,F-statistic:,56.8
Date:,"Mon, 22 Jul 2019",Prob (F-statistic):,2.6e-115
Time:,19:05:58,Log-Likelihood:,3183.8
No. Observations:,3153,AIC:,-6344.0
Df Residuals:,3141,BIC:,-6271.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2682,0.085,-26.614,0.000,-2.435,-2.101
LFMO,0.2227,0.030,7.305,0.000,0.163,0.283
LGAS,0.0090,0.005,1.648,0.099,-0.002,0.020
LPOPUL,0.0159,0.004,3.853,0.000,0.008,0.024
LQSCORE,-0.0213,0.004,-4.938,0.000,-0.030,-0.013
COOLER,0.0177,0.004,4.984,0.000,0.011,0.025
ESC,-0.0294,0.003,-8.895,0.000,-0.036,-0.023
NUM,0.0078,0.002,4.709,0.000,0.005,0.011
3,-0.0697,0.005,-13.592,0.000,-0.080,-0.060

0,1,2,3
Omnibus:,268.816,Durbin-Watson:,1.198
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1444.596
Skew:,0.201,Prob(JB):,0.0
Kurtosis:,6.291,Cond. No.,946.0


In [7]:
# Baseline covariates plus the INC column.
incumbency_X = sm.add_constant(reg1[covariates + ['INC']])
incumbency_fit = sm.OLS(reg1['LSCORE'], incumbency_X).fit()
incumbency_fit.summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.183
Model:,OLS,Adj. R-squared:,0.18
Method:,Least Squares,F-statistic:,58.78
Date:,"Mon, 22 Jul 2019",Prob (F-statistic):,1.11e-128
Time:,19:05:58,Log-Likelihood:,3217.2
No. Observations:,3153,AIC:,-6408.0
Df Residuals:,3140,BIC:,-6330.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2384,0.084,-26.515,0.000,-2.404,-2.073
LFMO,0.2168,0.030,7.181,0.000,0.158,0.276
LGAS,0.0094,0.005,1.749,0.080,-0.001,0.020
LPOPUL,0.0154,0.004,3.777,0.000,0.007,0.023
LQSCORE,-0.0208,0.004,-4.872,0.000,-0.029,-0.012
COOLER,0.0174,0.004,4.928,0.000,0.010,0.024
ESC,-0.0304,0.003,-9.284,0.000,-0.037,-0.024
NUM,0.0053,0.002,3.172,0.002,0.002,0.009
3,-0.0708,0.005,-13.947,0.000,-0.081,-0.061

0,1,2,3
Omnibus:,285.446,Durbin-Watson:,1.101
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1685.007
Skew:,0.193,Prob(JB):,0.0
Kurtosis:,6.56,Cond. No.,947.0


In [8]:
#2 lines incumbency

In [9]:
# Covariates plus the lagged min/max log scores listed in `limitedlag`.
lagged_X = sm.add_constant(reg1[covariates + limitedlag])
lagged_fit = sm.OLS(reg1['LSCORE'], lagged_X).fit()
lagged_fit.summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.202
Model:,OLS,Adj. R-squared:,0.197
Method:,Least Squares,F-statistic:,41.69
Date:,"Mon, 22 Jul 2019",Prob (F-statistic):,6.81e-138
Time:,19:05:58,Log-Likelihood:,3253.1
No. Observations:,3153,AIC:,-6466.0
Df Residuals:,3133,BIC:,-6345.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.4875,0.118,-12.639,0.000,-1.718,-1.257
LFMO,0.1519,0.031,4.931,0.000,0.092,0.212
LGAS,0.0109,0.005,2.032,0.042,0.000,0.021
LPOPUL,0.0097,0.004,2.375,0.018,0.002,0.018
LQSCORE,-0.0176,0.004,-4.139,0.000,-0.026,-0.009
COOLER,0.0168,0.003,4.795,0.000,0.010,0.024
ESC,-0.0277,0.003,-8.509,0.000,-0.034,-0.021
NUM,0.0063,0.002,3.819,0.000,0.003,0.009
3,-0.0715,0.005,-14.186,0.000,-0.081,-0.062

0,1,2,3
Omnibus:,318.204,Durbin-Watson:,1.26
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1731.051
Skew:,0.319,Prob(JB):,0.0
Kurtosis:,6.573,Cond. No.,1350.0


In [10]:
# Covariates, the PAST_AUCT flag and its interaction with LSCORE_min1, then
# the lagged min/max log scores listed in `limitedlag`.
past_auct_cols = covariates + ['PAST_AUCT','min*past'] + limitedlag
past_auct_X = sm.add_constant(reg1[past_auct_cols])
past_auct_fit = sm.OLS(reg1['LSCORE'], past_auct_X).fit()
past_auct_fit.summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.204
Model:,OLS,Adj. R-squared:,0.199
Method:,Least Squares,F-statistic:,38.2
Date:,"Mon, 22 Jul 2019",Prob (F-statistic):,4.5699999999999996e-138
Time:,19:05:58,Log-Likelihood:,3257.4
No. Observations:,3153,AIC:,-6471.0
Df Residuals:,3131,BIC:,-6338.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.5475,0.120,-12.894,0.000,-1.783,-1.312
LFMO,0.1481,0.031,4.805,0.000,0.088,0.209
LGAS,0.0110,0.005,2.059,0.040,0.001,0.022
LPOPUL,0.0095,0.004,2.340,0.019,0.002,0.018
LQSCORE,-0.0176,0.004,-4.134,0.000,-0.026,-0.009
COOLER,0.0162,0.003,4.630,0.000,0.009,0.023
ESC,-0.0277,0.003,-8.529,0.000,-0.034,-0.021
NUM,0.0062,0.002,3.777,0.000,0.003,0.009
3,-0.0718,0.005,-14.213,0.000,-0.082,-0.062

0,1,2,3
Omnibus:,313.516,Durbin-Watson:,1.265
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1693.87
Skew:,0.312,Prob(JB):,0.0
Kurtosis:,6.536,Cond. No.,1400.0


In [11]:
# Covariates plus everything in `limitedhist` (PAST_AUCT, the interaction
# term, the auction lags and the per-system lags).
history_X = sm.add_constant(reg1[covariates + limitedhist])
history_fit = sm.OLS(reg1['LSCORE'], history_X).fit()
history_fit.summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.417
Model:,OLS,Adj. R-squared:,0.412
Method:,Least Squares,F-statistic:,89.35
Date:,"Mon, 22 Jul 2019",Prob (F-statistic):,0.0
Time:,19:05:58,Log-Likelihood:,3747.5
No. Observations:,3153,AIC:,-7443.0
Df Residuals:,3127,BIC:,-7286.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.0017,0.106,-9.428,0.000,-1.210,-0.793
LFMO,0.3218,0.027,11.882,0.000,0.269,0.375
LGAS,-0.0162,0.005,-3.464,0.001,-0.025,-0.007
LPOPUL,0.0086,0.003,2.459,0.014,0.002,0.015
LQSCORE,-0.0144,0.004,-3.929,0.000,-0.022,-0.007
COOLER,0.0011,0.003,0.364,0.716,-0.005,0.007
ESC,-0.0194,0.003,-6.920,0.000,-0.025,-0.014
NUM,0.0072,0.001,5.097,0.000,0.004,0.010
3,-0.0301,0.005,-6.597,0.000,-0.039,-0.021

0,1,2,3
Omnibus:,325.03,Durbin-Watson:,1.669
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1835.21
Skew:,0.318,Prob(JB):,0.0
Kurtosis:,6.683,Cond. No.,1470.0
