In [16]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [17]:
# Read the raw records from data/milk.csv. `rawdata` is kept untouched so the
# cleaning steps can always be re-run from a pristine frame; all later
# transformations operate on the working copy `data`.
rawdata = pd.read_csv('data/milk.csv')
data = rawdata.copy()

#lag auctions within 1 vendor
# For each vendor, VENDOR_LAG is the SYSTEM of the auction that vendor appears in
# immediately before the current one (ordering: YEAR, MONTH, DAY, SYSTEM).
vendor_auct_key = ['VENDOR','YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(vendor_auct_key)

# One row per distinct (vendor, auction) key. The keys are taken directly instead
# of via groupby(...).mean(): averaging every column is wasted work and fails on
# the frame's non-numeric columns in recent pandas. dropna() mirrors groupby's
# default of discarding rows whose key contains a missing value.
bids = data[vendor_auct_key].dropna().drop_duplicates().reset_index(drop=True)

# Shift within vendor so the first auction of every vendor gets NaN.
sys_lag = bids.groupby(['VENDOR']).shift(1)
bids['VENDOR_LAG'] = sys_lag['SYSTEM']
data = pd.merge(data, bids, how='left',
                on=vendor_auct_key, suffixes=('', '_LAG'))

#lag auctions
# AUCT_LAG is the SYSTEM of the auction that precedes the current one when all
# auctions are ordered by YEAR, MONTH, DAY, SYSTEM (regardless of vendor).
auct_id_key = ['YEAR','MONTH','DAY','SYSTEM']
data = data.sort_values(auct_id_key)

# One row per distinct auction key. The keys are taken directly instead of via
# groupby(...).mean(): averaging every column is wasted work and fails on the
# frame's non-numeric columns in recent pandas. dropna() mirrors groupby's
# default of discarding rows whose key contains a missing value.
aucts = data[auct_id_key].dropna().drop_duplicates().reset_index(drop=True)

# Plain positional shift: the very first auction gets NaN.
auct_lag = aucts.shift(1)
aucts['AUCT_LAG'] = auct_lag['SYSTEM']
data = pd.merge(data, aucts, how='left',
                on=auct_id_key, suffixes=('', '_LAG'))

# PAST_AUCT is 1.0 when the SYSTEM of the vendor's previous auction equals the
# SYSTEM of the overall previous auction, else 0.0. A missing lag on either
# side compares unequal and therefore yields 0.0.
prev_auct = data['AUCT_LAG'].eq(data['VENDOR_LAG']).astype(float)
data = data.assign(PAST_AUCT=prev_auct)

In [18]:
#general house keeping
# Row filter, applied in one pass:
#   * YEAR from 1980 onwards
#   * MONTH between 4 and 9 inclusive, and a non-zero DAY (need data with time index)
#   * SCORE present and QSCORE non-zero (need data with QWW and WW)
keep_rows = (
    (data['YEAR'] >= 1980)
    & data['MONTH'].between(4, 9)
    & (data['DAY'] != 0)
    & data['SCORE'].notna()
    & (data['QSCORE'] != 0)
)
# Explicit copy: the column assignments below must write to an independent
# frame, not to a slice of the unfiltered data (avoids SettingWithCopyWarning).
data = data[keep_rows].copy()


# Missing COOLER / ESC values are treated as 0.
data['COOLER'] = data['COOLER'].fillna(0)
data['ESC'] = data['ESC'].fillna(0)
# ONEBID is 1.0 when NUM equals 1, else 0.0.
data['ONEBID'] = (data['NUM'] == 1).astype(float)

#various keys
# Column groups reused by the rest of the notebook.
milk = ['SCORE']                                        # dependent variable (levels)
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']    # identifies one auction
cts = ['FMO','GAS','POPUL','QSCORE']                    # continuous regressors (levels)
dummies = ['COOLER','ESC', 'NUM'] #delete num?


#baseline stuff/logs
# Every logged column is named 'L' + <original name>.
lmilk = ['L' + name for name in milk]
lcts = ['L' + name for name in cts]
# Natural logs, added one column at a time: continuous regressors first, then
# the dependent variable, so the new columns appear in that order.
for level_col, log_col in zip(cts + milk, lcts + lmilk):
    data[log_col] = np.log(data[level_col])

#set up lags
# For every auction, attach the min and max LSCORE of each of the `lags`
# preceding auctions as LSCORE_min1..LSCORE_min{lags} and
# LSCORE_max1..LSCORE_max{lags}.
lags = 10
lagkeys = [l+str(i) for l in ['LSCORE_min','LSCORE_max'] for i in range(1,1+lags)]
# One row per auction, in auct_key sort order; its RangeIndex lines up row for
# row with the grouped min/max frames built below.
aucts = data.groupby(auct_key, as_index=False)[milk].mean()[auct_key]

# Order the rows by date (the groupby below sorts its groups by auct_key itself).
data = data.sort_values(['YEAR','MONTH','DAY'])

# Aggregate only the logged score: it is the sole column used from these frames,
# and restricting the aggregation avoids min()/max() over unrelated (including
# string-valued) columns.
score_by_auct = data.groupby(auct_key, as_index=False)[lmilk]
min_lag = score_by_auct.min()
max_lag = score_by_auct.max()

# "Preceding" means t rows earlier in auct_key order across all auctions, so
# the first t auctions get NaN. The merge suffix renames the incoming LSCORE
# column to LSCORE_<tag><t>.
for tag, per_auct in (('min', min_lag), ('max', max_lag)):
    for t in range(1,1+lags):
        lagged = per_auct[lmilk].shift(t)
        lagged = pd.concat((aucts, lagged), axis=1)
        data = pd.merge(data, lagged, how='left', on=auct_key,
                        suffixes=('', '_%s%s'%(tag, t)) )

    
#set up lags but prev year
# For every auction, attach the min and max LSCORE of each of the `yearlags`
# preceding auctions of the SAME SYSTEM as LSCORE_miny1..LSCORE_miny{yearlags}
# and LSCORE_maxy1..LSCORE_maxy{yearlags}.
# NOTE(review): "prev year" presumably relies on each SYSTEM having one auction
# per year - confirm against the data.
yearlags = 3
year_lagkeys = [l+str(i) for l in ['LSCORE_miny','LSCORE_maxy'] for i in range(1,1+yearlags)]


#note data is sorted by system, then date, from here on
system_order = ['SYSTEM','YEAR','MONTH','DAY']
data = data.sort_values(system_order)

# Aggregate only the logged score (the sole column used). The grouped frames
# carry a RangeIndex in auct_key order that lines up with `aucts`; sorting by
# system permutes the rows but keeps those index labels.
score_by_auct = data.groupby(auct_key, as_index=False)[lmilk]
year_min_lag = score_by_auct.min().sort_values(system_order)
year_max_lag = score_by_auct.max().sort_values(system_order)

# The loops run over `yearlags`, matching year_lagkeys (they previously ran
# over `lags`, creating extra columns that were never selected).
for tag, per_auct in (('miny', year_min_lag), ('maxy', year_max_lag)):
    by_system = per_auct.groupby(['SYSTEM'])[lmilk]
    for t in range(1,1+yearlags):
        # Shift inside each system, then restore the auct_key row order so the
        # concat with `aucts` is aligned without an implicit index sort.
        lagged = by_system.shift(t).sort_index()
        lagged = pd.concat((aucts, lagged), axis=1)
        data = pd.merge(data, lagged, how='left', on=auct_key,
                        suffixes=('', '_%s%s'%(tag, t)) )
    

#pre processing to help fmozones
# Columns that get a set of fixed-effect dummies.
fe = ['FMOZONE']
# Fold zone '1A' into zone '1' before building the dummies.
data.loc[(data['FMOZONE'] =='1A') , 'FMOZONE'] = '1'

fekeys = []
for effect in fe:
    # drop_first omits the first category as the reference level. dtype=float
    # keeps the dummies numeric: recent pandas defaults to bool columns, which
    # turn the regression design matrix into object dtype.
    fes = pd.get_dummies(data[effect], drop_first=True, dtype=float)
    fekeys = fekeys+ list(fes.columns)
    data = pd.concat((data, fes), axis=1)


# Identifies a single bid: the auction key plus vendor and county.
bid_key = auct_key + ['VENDOR'] + ['COUNTY']
# Regressors shared by every specification.
covariates = lcts + dummies + fekeys

In [19]:
#write this to csvs with all the lags
# reg0 holds the bid identifiers, the logged score, the covariates, every lag
# column and the INC / PAST_AUCT / WIN columns; rows with missing lags are kept.
reg0_columns = (bid_key + lmilk + covariates + year_lagkeys + lagkeys
                + ['INC','PAST_AUCT', 'WIN'])
reg0 = data[reg0_columns].copy()
reg0.to_csv('data/clean_milk0.csv')

#only drop data 5 periods back

In [20]:
# Estimation sample: identifiers, logged score, covariates and INC only (no lag
# columns), keeping just the rows that are complete in all of those columns.
reg1_columns = bid_key + lmilk + covariates + ['INC']
reg1 = reg0[reg1_columns].dropna()

In [23]:
#baseline
# OLS of LSCORE on an intercept plus the shared covariates.
baseline_exog = sm.add_constant(reg1[covariates])
baseline_model = sm.OLS(reg1['LSCORE'], baseline_exog)
fit1 = baseline_model.fit()
print(fit1.summary())

                            OLS Regression Results                            
Dep. Variable:                 LSCORE   R-squared:                       0.163
Model:                            OLS   Adj. R-squared:                  0.161
Method:                 Least Squares   F-statistic:                     71.77
Date:                Thu, 25 Jul 2019   Prob (F-statistic):          1.08e-147
Time:                        16:23:50   Log-Likelihood:                 4127.0
No. Observations:                4056   AIC:                            -8230.
Df Residuals:                    4044   BIC:                            -8154.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2626      0.077    -29.423      0.0

In [24]:
#incumbency
# Same specification as the baseline with INC added as a regressor.
incumbency_exog = sm.add_constant(reg1[covariates + ['INC']])
incumbency_model = sm.OLS(reg1['LSCORE'], incumbency_exog)
fit2 = incumbency_model.fit()
print(fit2.summary())

                            OLS Regression Results                            
Dep. Variable:                 LSCORE   R-squared:                       0.180
Model:                            OLS   Adj. R-squared:                  0.178
Method:                 Least Squares   F-statistic:                     74.05
Date:                Thu, 25 Jul 2019   Prob (F-statistic):          2.01e-164
Time:                        16:23:59   Log-Likelihood:                 4168.3
No. Observations:                4056   AIC:                            -8311.
Df Residuals:                    4043   BIC:                            -8229.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2448      0.076    -29.477      0.0

In [26]:
#2 create a table.
# Side-by-side coefficients and HC0 standard errors for fit1 and fit2.
# Note the printed summaries use the default (nonrobust) covariance, so the
# standard errors here differ from the ones shown there.
# fit1 has no 'INC' row, so the indexes differ; sort=False keeps the outer join
# from sorting the row labels (the rows are explicitly ordered below anyway).
df = pd.concat([fit1.params, fit1.HC0_se, fit2.params, fit2.HC0_se],
               axis=1, sort=False)


# Summary rows: the statistic goes in the coefficient columns and the s.e.
# columns stay empty. np.nan is used because the np.NaN alias no longer exists
# in NumPy 2.
df.loc['$R^2$'] =  [fit1.rsquared, np.nan, fit2.rsquared, np.nan]
df.loc['Obs.'] =  [int(reg1.shape[0]), np.nan, int(reg1.shape[0]), np.nan]


# Display labels for the regressors; the keys '3', '6', '7', '9' are the
# FMOZONE dummy columns.
# NOTE(review): 'St. Angelo' is probably meant to be 'San Angelo' - confirm
# before changing the label, since it is written to baseline.csv.
nice_cov = {'const':'(Intercept)',
            'LFMO':'Raw milk',
            'LGAS':'Gas',
            'LPOPUL':'Population',
            'LQSCORE':'Quantity',
            'COOLER':'Cooler',
            'ESC':'Escalated',
            'NUM':'No. Bidders', #+ fekeys
            '3':'Waco','6':'St. Angelo', '7':'Austin', '9':'San Antonio',
            'INC':'Incumbency' }


#fix row order and column names
# Summary rows first, then the intercept, the covariates and finally INC.
df = df.reindex(index = ['Obs.','$R^2$', 'const'] + covariates + ['INC'])
# The concatenated unnamed Series arrive as columns 0..3.
df = df.rename(columns = {0:'fit1',1:'se1',2:'fit2',3:'se2'})
df = df.rename(index = nice_cov)
print(df)

# Four decimals; missing cells are written as empty strings.
df.to_csv('baseline.csv',  float_format='%.4f', na_rep = '')

                    fit1       se1         fit2       se2
Obs.         4056.000000       NaN  4056.000000       NaN
$R^2$           0.163326       NaN     0.180194       NaN
(Intercept)    -2.262603  0.076363    -2.244822  0.076179
Raw milk        0.204512  0.028012     0.199804  0.028032
Gas             0.023697  0.004212     0.025111  0.004159
Population      0.015321  0.003762     0.014480  0.003766
Quantity       -0.020052  0.003983    -0.018843  0.003989
Cooler          0.017962  0.003073     0.017659  0.003053
Escalated      -0.026995  0.002837    -0.028074  0.002817
No. Bidders     0.005542  0.001417     0.002872  0.001458
Waco           -0.070571  0.004733    -0.071764  0.004772
St. Angelo     -0.042726  0.010331    -0.040681  0.010791
Austin         -0.089732  0.023023    -0.090389  0.022093
San Antonio    -0.051186  0.005135    -0.053267  0.005111
Incumbency           NaN       NaN    -0.029550  0.003050


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until
