In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
rawdata = pd.read_csv('data/milk.csv')
# Work on a copy so `rawdata` stays untouched and this cell can be re-run.
data = rawdata.copy()

#lag auctions within 1 vendor
# VENDOR_LAG = SYSTEM of the row that precedes the current (vendor, auction)
# row in that vendor's date-sorted list; NaN for a vendor's first row.
vendor_auct_cols = ['VENDOR','YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(vendor_auct_cols)
# One row per distinct (vendor, auction) key. size() only enumerates the
# groups; unlike mean() it never aggregates the non-key columns, so text
# columns in the raw file cannot make this step fail.
bids = data.groupby(vendor_auct_cols).size().reset_index()[vendor_auct_cols].copy()
sys_lag = bids.groupby(['VENDOR']).shift(1)
bids['VENDOR_LAG'] = sys_lag['SYSTEM']
data = pd.merge(data, bids, how='left',
                 on=vendor_auct_cols, suffixes=('', '_LAG') )

#lag auctions
# AUCT_LAG = SYSTEM of the row that precedes the current auction in the
# date-sorted list of all distinct auctions (any vendor); NaN for the first.
auct_cols = ['YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(auct_cols)
aucts = data.groupby(auct_cols).size().reset_index()[auct_cols].copy()
auct_lag = aucts.shift(1)
aucts['AUCT_LAG'] = auct_lag['SYSTEM']
data = pd.merge(data, aucts, how='left',
                 on=auct_cols, suffixes=('', '_LAG') )

# PAST_AUCT is 1.0 when the two lagged SYSTEM values agree and 0.0 otherwise.
# A missing lag on either side yields 0.0 because NaN == NaN is False.
# NOTE(review): only SYSTEM identifiers are compared, not auction dates --
# confirm this is the intended definition of "bid in the previous auction".
prev_auct = 1.*(data['AUCT_LAG'] == data['VENDOR_LAG'])
data['PAST_AUCT'] = prev_auct

In [3]:
#general house keeping
# All row filters are combined into one mask and the result is copied, so the
# column assignments below write to an independent frame instead of a slice
# of the previous `data` (which triggers pandas' SettingWithCopyWarning).
keep_rows = (
    (data['YEAR'] >= 1980)
    # need data with time index
    & (data['MONTH'] >= 4) & (data['MONTH'] <= 9) & (data['DAY'] != 0)
    # need data with QWW and WW
    & data['SCORE'].notna() & (data['QSCORE'] != 0)
)
data = data[keep_rows].copy()

# Missing COOLER / ESC flags are treated as 0.
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)
# 1.0 when NUM equals 1, else 0.0.
data['ONEBID'] = 1.*(data['NUM'] == 1)

#various keys
milk = ['SCORE']                                         # dependent variable
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']     # identifies one auction
cts = ['FMO','GAS','POPUL','QSCORE']                     # covariates entered in logs
dummies = ['COOLER','ESC', 'NUM'] #delete num?


#baseline stuff/logs
# Every logged column is named 'L' + original name (e.g. SCORE -> LSCORE).
lmilk = ['L'+x for x in milk]
lcts = ['L'+x for x in cts]
for raw_col, log_col in zip(cts + milk, lcts + lmilk):
    data[log_col] = np.log(data[raw_col])

#set up lags
lags = 10
lagkeys = [l+str(i) for l in ['LSCORE_min','LSCORE_max'] for i in range(1,1+lags)]
# One row per auction key (kept as a module-level name, as before).
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]


def _add_lagged_stat(frame, keys, value_cols, stat, tag, n_lags, within=None):
    """Left-merge lagged per-auction statistics of `value_cols` onto `frame`.

    Parameters
    ----------
    frame : DataFrame with one row per bid; must contain `keys` and `value_cols`.
    keys : list of column names identifying one auction.
    value_cols : list of columns to aggregate per auction.
    stat : aggregation name passed to ``GroupBy.agg`` ('min' or 'max').
    tag : text inserted into the new column names.
    n_lags : number of lags to create (1 .. n_lags).
    within : optional list of columns; when given, the per-auction rows are
        ordered by ``within + ['YEAR','MONTH','DAY']`` and shifted separately
        inside each `within` group. When None, rows are shifted in the
        key-sorted order produced by the groupby.

    Returns
    -------
    DataFrame: `frame` (row order preserved) with new columns named
    ``'<value_col>_<tag><t>'`` for t = 1 .. n_lags. Rows whose key has no
    t-th predecessor get NaN.
    """
    # Aggregate only the needed columns instead of every column of `frame`.
    per_auction = frame.groupby(keys, as_index=False)[value_cols].agg(stat)
    if within is None:
        shift_source = per_auction[value_cols]
    else:
        ordered = per_auction.sort_values(within + ['YEAR','MONTH','DAY'])
        shift_source = ordered.groupby(within)[value_cols]

    # Shifted pieces keep per_auction's index labels, so concat(axis=1)
    # re-attaches each lag to the auction key it belongs to.
    pieces = [per_auction[keys]]
    for t in range(1, 1 + n_lags):
        pieces.append(shift_source.shift(t).add_suffix('_%s%s' % (tag, t)))
    lagged = pd.concat(pieces, axis=1)

    # A single merge per statistic replaces one merge per lag.
    return pd.merge(frame, lagged, how='left', on=keys)


# Chronological lags: lag t is the statistic of the auction t rows earlier in
# auct_key order (YEAR, MONTH, DAY, SYSTEM, FMOZONE).
data = data.sort_values(['YEAR','MONTH','DAY'])
data = _add_lagged_stat(data, auct_key, lmilk, 'min', 'min', lags)
data = _add_lagged_stat(data, auct_key, lmilk, 'max', 'max', lags)


#set up lags but prev year
yearlags = 3
year_lagkeys = [l+str(i) for l in ['LSCORE_miny','LSCORE_maxy'] for i in range(1,1+yearlags)]

# Lags within one SYSTEM: lag t is the statistic of the t-th earlier auction
# of the same SYSTEM.
# NOTE(review): the "prev year" reading assumes one auction per SYSTEM per
# year -- confirm against the data.
# Only `yearlags` lags are built, matching `year_lagkeys`. The previous loops
# ran over `lags` and also created LSCORE_miny/maxy columns for
# yearlags+1 .. lags, which `year_lagkeys` never references.
data = data.sort_values(['SYSTEM','YEAR','MONTH','DAY'])
data = _add_lagged_stat(data, auct_key, lmilk, 'min', 'miny', yearlags, within=['SYSTEM'])
data = _add_lagged_stat(data, auct_key, lmilk, 'max', 'maxy', yearlags, within=['SYSTEM'])

    

#pre processing to help fmozones
fe = ['FMOZONE']
# Fold zone '1A' into zone '1' before building the zone dummies.
data.loc[(data['FMOZONE'] =='1A') , 'FMOZONE'] = '1'

# One indicator column per level of each fixed-effect variable; the first
# level is dropped to serve as the reference category. Column names are the
# raw level values.
fekeys = []
for effect in fe:
    # Pin an integer dtype: recent pandas returns bool dummies by default,
    # and a frame mixing bool and float columns is converted to an object
    # array by statsmodels, which then refuses to fit. uint8 was the
    # historical default, so older environments are unaffected.
    fes = pd.get_dummies(data[effect], drop_first=True, dtype=np.uint8)
    fekeys = fekeys+ list(fes.columns)
    data = pd.concat((data, fes), axis=1)


# Columns identifying a single bid, and the full regressor list.
bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys

In [4]:
# Save the regression frame, including every lag column, to CSV.
reg0_columns = (
    bid_key
    + lmilk
    + covariates
    + year_lagkeys
    + lagkeys
    + ['INC','PAST_AUCT']
)
reg0 = data[reg0_columns].copy()
reg0.to_csv('data/clean_milk0.csv')

# Original note: "only drop data 5 periods back" (TODO: clarify what this refers to).

In [6]:
# Regression sample without lag columns: keep only rows that are complete in
# the bid keys, the log score, the covariates and INC.
reg1_columns = bid_key + lmilk + covariates + ['INC']
reg1 = reg0[reg1_columns].dropna()

In [7]:
# Baseline: OLS of LSCORE on the covariates plus an intercept.
baseline_design = sm.add_constant(reg1[covariates])
baseline_fit = sm.OLS(reg1['LSCORE'], baseline_design).fit()
baseline_fit.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.163
Model:,OLS,Adj. R-squared:,0.161
Method:,Least Squares,F-statistic:,71.77
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,1.08e-147
Time:,16:23:30,Log-Likelihood:,4127.0
No. Observations:,4056,AIC:,-8230.0
Df Residuals:,4044,BIC:,-8154.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2626,0.077,-29.423,0.000,-2.413,-2.112
LFMO,0.2045,0.028,7.238,0.000,0.149,0.260
LGAS,0.0237,0.004,5.570,0.000,0.015,0.032
LPOPUL,0.0153,0.003,4.713,0.000,0.009,0.022
LQSCORE,-0.0201,0.003,-5.870,0.000,-0.027,-0.013
COOLER,0.0180,0.003,5.787,0.000,0.012,0.024
ESC,-0.0270,0.003,-9.308,0.000,-0.033,-0.021
NUM,0.0055,0.001,3.848,0.000,0.003,0.008
3,-0.0706,0.004,-16.172,0.000,-0.079,-0.062

0,1,2,3
Omnibus:,284.052,Durbin-Watson:,1.204
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1357.2
Skew:,0.122,Prob(JB):,1.94e-295
Kurtosis:,5.823,Cond. No.,970.0


In [8]:
# Incumbency: the baseline specification with INC added as a regressor.
incumbency_design = sm.add_constant(reg1[covariates + ['INC']])
incumbency_fit = sm.OLS(reg1['LSCORE'], incumbency_design).fit()
incumbency_fit.summary()

0,1,2,3
Dep. Variable:,LSCORE,R-squared:,0.18
Model:,OLS,Adj. R-squared:,0.178
Method:,Least Squares,F-statistic:,74.05
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,2.0100000000000002e-164
Time:,16:23:30,Log-Likelihood:,4168.3
No. Observations:,4056,AIC:,-8311.0
Df Residuals:,4043,BIC:,-8229.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.2448,0.076,-29.477,0.000,-2.394,-2.096
LFMO,0.1998,0.028,7.141,0.000,0.145,0.255
LGAS,0.0251,0.004,5.958,0.000,0.017,0.033
LPOPUL,0.0145,0.003,4.497,0.000,0.008,0.021
LQSCORE,-0.0188,0.003,-5.567,0.000,-0.025,-0.012
COOLER,0.0177,0.003,5.747,0.000,0.012,0.024
ESC,-0.0281,0.003,-9.769,0.000,-0.034,-0.022
NUM,0.0029,0.001,1.973,0.049,1.81e-05,0.006
3,-0.0718,0.004,-16.604,0.000,-0.080,-0.063

0,1,2,3
Omnibus:,300.428,Durbin-Watson:,1.109
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1564.605
Skew:,0.097,Prob(JB):,0.0
Kurtosis:,6.036,Cond. No.,970.0


In [9]:
#2 lines incumbency