In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import xgboost as xgb
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# load training and test datasets
train_raw = pd.read_csv('input/train_set.csv', parse_dates=[2,])
test_raw = pd.read_csv('input/test_set.csv', parse_dates=[3,])

In [27]:
#regular model
ols_data = train_raw.copy()
ols_data['year'] = train_raw.quote_date.dt.year

baseline_ols = smf.ols('cost ~ annual_usage + min_order_quantity + bracket_pricing + quantity + \
                       year + supplier', data = ols_data)


baseline_fit = baseline_ols.fit()
print('R2: ', baseline_fit.rsquared)
print('MSE: ', baseline_fit.mse_model)

R2:  0.135932177588306
MSE:  55312.24146055766


In [4]:
point = dict(ols_data[['annual_usage','min_order_quantity', 'bracket_pricing',
                       'quantity','year','supplier']].loc[index])

print(baseline_fit.predict(exog=point))
print(ols_data['cost'].loc[index])

NameError: name 'index' is not defined

In [None]:
#regular model
ols_data = train_raw.copy()
ols_data['year'] = train_raw.quote_date.dt.year
print(ols_data.columns)

baseline_ols = smf.ols('np.log(cost) ~  np.log(quantity) + annual_usage +  min_order_quantity + bracket_pricing  + \
+ supplier', data = ols_data)
baseline_fit = baseline_ols.fit()
print('AIC: %s, R2: %s, params: %s'%(baseline_fit.aic, baseline_fit.rsquared,baseline_fit.params.shape))
print('quantity %s'%baseline_fit.params[2])




In [5]:
squaredterm_ols = smf.ols('np.log(cost) ~ np.log(quantity) + annual_usage +  min_order_quantity + bracket_pricing  + \
                        supplier + supplier*np.log(quantity) ', data = ols_data)

squaredterm_fit = squaredterm_ols.fit()
print('AIC: %s, R2: %s, params: %s'%( squaredterm_fit.aic, squaredterm_fit.rsquared,squaredterm_fit.params.shape))

base_params = len(baseline_fit.params)
restrict = len(squaredterm_fit.params)

#run an f-test (might be wrong?)
A = np.identity(restrict)
A = A[:base_params-1,:]
print(squaredterm_fit.f_test(A))
print('quantity %s'%baseline_fit.params[2])

AIC: 60694.38665779629, R2: 0.4920089391614567, params: (117,)
<F test: F=array([[3432.14215417]]), p=0.0, df_denom=30121, df_num=60>
quantity -83.7918929496086




In [6]:
train = train_raw.copy()
test = test_raw.copy()

# create some new features
train['year'] = train.quote_date.dt.year
train['month'] = train.quote_date.dt.month
train['dayofyear'] = train.quote_date.dt.dayofyear
train['dayofweek'] = train.quote_date.dt.dayofweek
train['day'] = train.quote_date.dt.day

test['year'] = test.quote_date.dt.year
test['month'] = test.quote_date.dt.month
test['dayofyear'] = test.quote_date.dt.dayofyear
test['dayofweek'] = test.quote_date.dt.dayofweek
test['day'] = test.quote_date.dt.day

# drop useless columns and create labels
idx = test.id.values.astype(int)
labels = train.cost.values

test = test.drop(['id', 'tube_assembly_id', 'quote_date'], axis = 1)
train = train.drop(['quote_date', 'cost', 'tube_assembly_id'], axis = 1)

# convert data to numpy array
column_names = list(train.columns)

train = np.array(train)
test = np.array(test)

# label encode the categorical variables
for i in range(train.shape[1]):
    if i in [0,3]:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[:,i]) + list(test[:,i]))
        train[:,i] = lbl.transform(train[:,i]) #give each label a number 1-67
        test[:,i] = lbl.transform(test[:,i])
        
        
# object array to float
train = train.astype(float)
test = test.astype(float)

In [7]:
label_log = np.log1p(labels)

# fit a random forest model
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["min_child_weight"] = 5
params["subsample"] = 1.0
params["scale_pos_weight"] = 1.0
params["silent"] = 1
params["max_depth"] = 7

plst = list(params.items())

xgtrain = xgb.DMatrix(train, label=label_log)
xgtest = xgb.DMatrix(test)
print(xgtest)


num_rounds = 120
model = xgb.train(plst, xgtrain, num_rounds)


<xgboost.core.DMatrix object at 0x7f1148324898>


In [29]:
mse = ( ( label_log - model.predict(xgtrain) )**2).sum()
r2 = mse/(( label_log - label_log.mean() )**2).sum()

print(r2)

0.14370526453776553


In [23]:
#predict a point

index = 3
point  = train[index,:].reshape(1,len(train[index,:]))
print(model.predict(xgb.DMatrix(point)))
print(labels[index])

[1.6555067]
4.6877695119712


In [None]:
# get predictions from the model, convert them and dump them!
preds = np.expm1(model.predict(xgtest))
preds = pd.DataFrame({"id": idx, "cost": preds})
preds.to_csv('benchmark.csv', index=False)
