In [6]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [7]:
# Load the raw bid-level data. `rawdata` is left untouched; every
# transformation below is applied to the working copy `data`.
rawdata = pd.read_csv('data/milk.csv')
data = rawdata.copy()

#lag auctions within 1 vendor
vendor_auct_key = ['VENDOR','YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(vendor_auct_key)
# One row per distinct (vendor, date, system) combination. Only the key
# columns are needed, so the groups are enumerated with size() instead of
# mean(): averaging every column is wasted work and raises TypeError on
# non-numeric columns under pandas >= 2.0. Rows with a missing key are
# dropped by groupby, exactly as before.
bids = data.groupby(vendor_auct_key).size().reset_index()[vendor_auct_key]
# VENDOR_LAG = SYSTEM of the same vendor's previous row in `bids`
# (NaN for each vendor's first row).
sys_lag = bids.groupby(['VENDOR']).shift(1)
bids['VENDOR_LAG'] = sys_lag['SYSTEM']
data = pd.merge(data, bids, how='left',
                 on=vendor_auct_key, suffixes=('', '_LAG') )

#lag auctions
date_system_key = ['YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(date_system_key)
# One row per distinct (date, system) combination, in sorted key order.
aucts = data.groupby(date_system_key).size().reset_index()[date_system_key]
# AUCT_LAG = SYSTEM of the row immediately before this one in `aucts`
# (NaN for the very first row).
auct_lag = aucts.shift(1)
aucts['AUCT_LAG'] = auct_lag['SYSTEM']
data = pd.merge(data, aucts, how='left',
                 on=date_system_key, suffixes=('', '_LAG') )

# PAST_AUCT is 1.0 when AUCT_LAG equals VENDOR_LAG and 0.0 otherwise
# (including when either side is NaN, since NaN == NaN is False).
# NOTE(review): only SYSTEM values are compared, not dates, so a match with a
# much older bid by the vendor in that same system also counts -- confirm
# this is the intended definition.
prev_auct = 1.*(data['AUCT_LAG'] == data['VENDOR_LAG'])
data['PAST_AUCT'] = prev_auct

In [8]:
#general house keeping
# Build one row mask instead of three successive re-assignments of `data`;
# the conditions are AND-ed, so the surviving rows are the same.
keep_rows = (
    (data['YEAR'] >= 1980)
    # need data with time index
    & (data['MONTH'] >= 4) & (data['MONTH'] <= 9) & (data['DAY'] != 0)
    # need data with QWW and WW
    & data['SCORE'].notna() & (data['QSCORE'] != 0)
)
# .copy() makes `data` an independent frame, so the column assignments below
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
data = data[keep_rows].copy()


# Missing COOLER / ESC values are treated as 0.
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)
# ONEBID is 1.0 where NUM equals 1, else 0.0.
data['ONEBID'] = 1.*(data['NUM'] == 1)

#various keys
milk = ['SCORE']                                       # column logged into LSCORE
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']   # columns identifying one auction
cts = ['FMO','GAS','POPUL','QSCORE']                   # columns entered in logs
dummies = ['COOLER','ESC', 'NUM'] #delete num?


#baseline stuff/logs
# Each logged column is stored next to its source under an 'L' prefix
# (FMO -> LFMO, ..., SCORE -> LSCORE).
lcts = ['L' + name for name in cts]
lmilk = ['L' + name for name in milk]
for raw_col, log_col in zip(cts + milk, lcts + lmilk):
    data[log_col] = np.log(data[raw_col])

#set up lags
lags = 10
lagkeys = [stem+str(i) for stem in ['LSCORE_min','LSCORE_max'] for i in range(1,1+lags)]
# One row per auction (key columns only), in groupby-sorted key order.
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]


def _add_lagged_extremes(frame, keys, value_cols, how, n_lags, tag,
                         order=None, within=None):
    """Merge lagged per-auction extremes of ``value_cols`` onto ``frame``.

    The frame is collapsed to one row per ``keys`` combination using the
    aggregation ``how`` ('min' or 'max'). For each t in 1..n_lags the
    aggregated values are shifted down by t rows and left-merged back onto
    ``frame`` on ``keys``; because ``value_cols`` already exist in ``frame``,
    the merged copies receive the suffix ``_<tag><t>`` (e.g. LSCORE_min3).

    Only ``value_cols`` are aggregated. Aggregating every column, as the
    earlier inline loops did, is unnecessary and can fail on columns whose
    values cannot be ordered.

    Parameters
    ----------
    frame : DataFrame holding ``keys`` and ``value_cols``.
    keys : list of column names identifying one auction.
    value_cols : list of column names to aggregate and lag.
    how : aggregation name passed to ``GroupBy.agg``.
    n_lags : number of lags to create (1..n_lags).
    tag : text placed between '_' and the lag number in new column names.
    order : optional list of columns to sort the per-auction table by before
        shifting; by default the groupby key order is used.
    within : optional list of columns; when given, shifting is done
        separately inside each group of these columns.

    Returns
    -------
    DataFrame : ``frame`` with ``n_lags * len(value_cols)`` extra columns;
    row order and row count follow the left side of the merge.
    """
    per_auction = frame.groupby(keys, as_index=False)[value_cols].agg(how)
    if order is not None:
        per_auction = per_auction.sort_values(order)
    for t in range(1, 1 + n_lags):
        if within is None:
            shifted = per_auction[value_cols].shift(t)
        else:
            shifted = per_auction.groupby(within)[value_cols].shift(t)
        # Keys and shifted values come from the same table, so they share an
        # index and line up row-for-row without relying on a second frame.
        lagged = pd.concat((per_auction[keys], shifted), axis=1)
        frame = pd.merge(frame, lagged, how='left', on=keys,
                         suffixes=('', '_%s%s' % (tag, t)))
    return frame


# Order the bid rows by date. This only fixes the row order of `data`; the
# lag order itself comes from the sorted groupby keys inside the helper.
data = data.sort_values(['YEAR','MONTH','DAY'])
# LSCORE_min1..LSCORE_min<lags>, then LSCORE_max1..LSCORE_max<lags>:
# min / max LSCORE of the t-th preceding auction in auct_key order.
data = _add_lagged_extremes(data, auct_key, lmilk, 'min', lags, 'min')
data = _add_lagged_extremes(data, auct_key, lmilk, 'max', lags, 'max')


#set up lags but prev year
yearlags = 1
year_lagkeys = [stem+str(i) for stem in ['LSCORE_miny','LSCORE_maxy'] for i in range(1,1+yearlags)]


#note data is sorted by system, then date now
system_order = ['SYSTEM','YEAR','MONTH','DAY']
data = data.sort_values(system_order)

# LSCORE_miny<t> / LSCORE_maxy<t>: min / max LSCORE of the t-th preceding
# auction of the same SYSTEM.
# NOTE(review): as before, `lags` (not `yearlags`) lags are generated here.
# `limitedlagy` further down asks for two lags per stem, so switching to
# yearlags = 1 would leave those columns missing -- reconcile the constants.
data = _add_lagged_extremes(data, auct_key, lmilk, 'min', lags, 'miny',
                            order=system_order, within=['SYSTEM'])
data = _add_lagged_extremes(data, auct_key, lmilk, 'max', lags, 'maxy',
                            order=system_order, within=['SYSTEM'])
    

#pre processing to help fmozones
fe = ['FMOZONE']
# Zone '1A' is folded into zone '1' before the dummies are built.
data.loc[(data['FMOZONE'] =='1A') , 'FMOZONE'] = '1'

# Build one indicator column per level of each fixed-effect variable (first
# level dropped as the baseline) and remember the new column names.
fekeys = []
for effect in fe:
    # dtype is pinned to uint8 (the historical get_dummies default). Since
    # pandas 2.0 the default is bool, and a frame mixing bool and float
    # columns converts to an object array that sm.OLS rejects.
    fes = pd.get_dummies(data[effect], drop_first=True, dtype=np.uint8)
    fekeys = fekeys+ list(fes.columns)
    data = pd.concat((data, fes), axis=1)

    
# Column groups used to assemble the regression sample.
bid_key = auct_key + ['VENDOR', 'COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys

# Use only the first `maxlag` auction lags (4 per stem, not all `lags`):
# rows with NaN in any selected column are dropped below, so a shorter
# lag window selects fewer lag columns that can be NaN.
maxlag = 4
limitedlag = ['%s%d' % (stem, k)
              for stem in ('LSCORE_min', 'LSCORE_max')
              for k in range(1, maxlag + 1)]

# Same idea for the within-system lags, limited to `maxlagy`.
maxlagy = 2
limitedlagy = ['%s%d' % (stem, k)
               for stem in ('LSCORE_miny', 'LSCORE_maxy')
               for k in range(1, maxlagy + 1)]

limitedhist = ['INC', 'PAST_AUCT'] + limitedlag + limitedlagy

# Regression sample: identifiers, outcome, covariates, limited history and
# WIN, keeping complete rows only; the result is also written to disk.
reg1_columns = bid_key + lmilk + covariates + limitedhist + ['WIN']
reg1 = data[reg1_columns].dropna()
reg1.to_csv('data/clean_milk1.csv')

In [9]:
# OLS of LSCORE on the covariates plus the limited bid-history columns,
# with an intercept added to the design matrix.
exog_hist = sm.add_constant(reg1[covariates + limitedhist])
sm.OLS(reg1['LSCORE'], exog_hist).fit().summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.416
Model:,OLS,Adj. R-squared:,0.411
Method:,Least Squares,F-statistic:,88.97
Date:,"Thu, 11 Jul 2019",Prob (F-statistic):,0.0
Time:,11:37:25,Log-Likelihood:,3744.8
No. Observations:,3153,AIC:,-7438.0
Df Residuals:,3127,BIC:,-7280.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.9540,0.105,-9.129,0.000,-1.159,-0.749
LFMO,0.3240,0.027,11.957,0.000,0.271,0.377
LGAS,-0.0165,0.005,-3.522,0.000,-0.026,-0.007
LPOPUL,0.0087,0.003,2.484,0.013,0.002,0.016
LQSCORE,-0.0144,0.004,-3.905,0.000,-0.022,-0.007
COOLER,0.0015,0.003,0.498,0.619,-0.004,0.007
ESC,-0.0193,0.003,-6.870,0.000,-0.025,-0.014
NUM,0.0073,0.001,5.072,0.000,0.004,0.010
3,-0.0301,0.005,-6.585,0.000,-0.039,-0.021

0,1,2,3
Omnibus:,323.882,Durbin-Watson:,1.667
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1834.852
Skew:,0.314,Prob(JB):,0.0
Kurtosis:,6.684,Cond. No.,1430.0


In [10]:
# NOTE(review): this cell fits exactly the same specification as the cell
# directly before it (LSCORE on covariates + limitedhist); consider removing
# one of the two.
ols_hist_model = sm.OLS(endog=reg1['LSCORE'],
                        exog=sm.add_constant(reg1[covariates + limitedhist]))
ols_hist_model.fit().summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.416
Model:,OLS,Adj. R-squared:,0.411
Method:,Least Squares,F-statistic:,88.97
Date:,"Thu, 11 Jul 2019",Prob (F-statistic):,0.0
Time:,11:37:25,Log-Likelihood:,3744.8
No. Observations:,3153,AIC:,-7438.0
Df Residuals:,3127,BIC:,-7280.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.9540,0.105,-9.129,0.000,-1.159,-0.749
LFMO,0.3240,0.027,11.957,0.000,0.271,0.377
LGAS,-0.0165,0.005,-3.522,0.000,-0.026,-0.007
LPOPUL,0.0087,0.003,2.484,0.013,0.002,0.016
LQSCORE,-0.0144,0.004,-3.905,0.000,-0.022,-0.007
COOLER,0.0015,0.003,0.498,0.619,-0.004,0.007
ESC,-0.0193,0.003,-6.870,0.000,-0.025,-0.014
NUM,0.0073,0.001,5.072,0.000,0.004,0.010
3,-0.0301,0.005,-6.585,0.000,-0.039,-0.021

0,1,2,3
Omnibus:,323.882,Durbin-Watson:,1.667
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1834.852
Skew:,0.314,Prob(JB):,0.0
Kurtosis:,6.684,Cond. No.,1430.0


In [11]:
# OLS of LSCORE on the covariates alone (no bid-history columns), with an
# intercept, fitted on the same reg1 sample.
exog_cov = sm.add_constant(reg1[covariates])
sm.OLS(reg1['LSCORE'], exog_cov).fit().summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.166
Model:,OLS,Adj. R-squared:,0.163
Method:,Least Squares,F-statistic:,56.8
Date:,"Thu, 11 Jul 2019",Prob (F-statistic):,2.6e-115
Time:,11:37:25,Log-Likelihood:,3183.8
No. Observations:,3153,AIC:,-6344.0
Df Residuals:,3141,BIC:,-6271.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2682,0.085,-26.614,0.000,-2.435,-2.101
LFMO,0.2227,0.030,7.305,0.000,0.163,0.283
LGAS,0.0090,0.005,1.648,0.099,-0.002,0.020
LPOPUL,0.0159,0.004,3.853,0.000,0.008,0.024
LQSCORE,-0.0213,0.004,-4.938,0.000,-0.030,-0.013
COOLER,0.0177,0.004,4.984,0.000,0.011,0.025
ESC,-0.0294,0.003,-8.895,0.000,-0.036,-0.023
NUM,0.0078,0.002,4.709,0.000,0.005,0.011
3,-0.0697,0.005,-13.592,0.000,-0.080,-0.060

0,1,2,3
Omnibus:,268.816,Durbin-Watson:,1.198
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1444.596
Skew:,0.201,Prob(JB):,0.0
Kurtosis:,6.291,Cond. No.,946.0


In [19]:
# 'interact' equals PAST_AUCT on rows whose LSCORE_min1 is at or below
# log(0.155), and 0 on all other rows.
low_prev_min = reg1['LSCORE_min1'] <= np.log(.155)
reg1['interact'] = low_prev_min * reg1['PAST_AUCT']
#reg1['interact2'] = 1.*(reg1['LSCORE_min1']<=np.log(.155))
# OLS of LSCORE on covariates, the interaction and the limited auction lags.
interact_cols = covariates + ['interact'] + limitedlag
sm.OLS(reg1['LSCORE'], sm.add_constant(reg1[interact_cols])).fit().summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.205
Model:,OLS,Adj. R-squared:,0.199
Method:,Least Squares,F-statistic:,38.34
Date:,"Thu, 11 Jul 2019",Prob (F-statistic):,1.5e-138
Time:,11:42:14,Log-Likelihood:,3258.5
No. Observations:,3153,AIC:,-6473.0
Df Residuals:,3131,BIC:,-6340.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.5351,0.125,-12.236,0.000,-1.781,-1.289
LFMO,0.1515,0.031,4.910,0.000,0.091,0.212
LGAS,0.0110,0.005,2.046,0.041,0.000,0.021
LPOPUL,0.0097,0.004,2.390,0.017,0.002,0.018
LQSCORE,-0.0179,0.004,-4.196,0.000,-0.026,-0.010
COOLER,0.0162,0.003,4.644,0.000,0.009,0.023
ESC,-0.0276,0.003,-8.494,0.000,-0.034,-0.021
NUM,0.0061,0.002,3.734,0.000,0.003,0.009
3,-0.0720,0.005,-14.282,0.000,-0.082,-0.062

0,1,2,3
Omnibus:,311.843,Durbin-Watson:,1.264
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1678.585
Skew:,0.31,Prob(JB):,0.0
Kurtosis:,6.52,Cond. No.,1440.0
