In [2]:
import os
import csv
import pandas as pd
import numpy as np
import datetime
from sklearn import ensemble, preprocessing
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
import timeit
import math
from sklearn import cross_validation
from sklearn import datasets

##Load Data

In [4]:
train = pd.read_csv("competition_data/train_set.csv", parse_dates=[2,])
test = pd.read_csv("competition_data/test_set.csv", parse_dates=[3,])

tubes = pd.read_csv('competition_data/tube.csv')
tube_end = pd.read_csv('competition_data/tube_end_form.csv')

In [3]:
train = pd.merge(train,tubes,on='tube_assembly_id',how='inner')
test = pd.merge(test,tubes,on='tube_assembly_id',how='inner')

train = pd.merge(train,tube_end, left_on='end_a', right_on = 'end_form_id',how='left')
test = pd.merge(test,tube_end,left_on='end_a', right_on = 'end_form_id',how='left')

train['material_id'].fillna('SP-9999',inplace=True)
test['material_id'].fillna('SP-9999',inplace=True)

train['forming'].fillna('unknown', inplace = True)
test['forming'].fillna('unknown', inplace = True)

train['end_form_id'].fillna('unknown', inplace = True)
test['end_form_id'].fillna('unknown', inplace = True)

In [4]:
idx = test.id.values.astype(int)
train['year'] = train.quote_date.dt.year
train['month'] = train.quote_date.dt.month
train['dayofyear'] = train.quote_date.dt.dayofyear
train['dayofweek'] = train.quote_date.dt.dayofweek
train['day'] = train.quote_date.dt.day

test['year'] = test.quote_date.dt.year
test['month'] = test.quote_date.dt.month
test['dayofyear'] = test.quote_date.dt.dayofyear
test['dayofweek'] = test.quote_date.dt.dayofweek
test['day'] = test.quote_date.dt.day

In [5]:
idx = test.id.values.astype(int)
test = test.drop(['id', 'tube_assembly_id', 'quote_date'], axis = 1)
labels = train.cost.values
train = train.drop(['quote_date', 'cost', 'tube_assembly_id'], axis = 1)

In [6]:
# convert data to numpy array
train = np.array(train)
test = np.array(test)

# label encode the categorical variables
for i in range(train.shape[1]):
    if i in [0,3,5,11,12,13,14,15,16,20, 21]:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[:,i]) + list(test[:,i]))
        train[:,i] = lbl.transform(train[:,i])
        test[:,i] = lbl.transform(test[:,i])

In [7]:
# object array to float
train = train.astype(float)
test = test.astype(float)

In [8]:
label_log = np.log1p(labels)

###Set xgboost params

##Evaluation

###RMSLE

In [9]:
def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))

In [10]:
def rmsle_loop(y, y_pred):
	assert len(y) == len(y_pred)
	terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
	return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [11]:
params = {}
params["booster"] = "gblinear"
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["min_child_weight"] = 6
params["subsample"] = 0.85
params["scale_pos_weight"] = 1.0
params["max_depth"] = 10
params["nthread"] = 4
params["verbose"] = 1
params["gamma"] = 2
params["colsample_bytree"] = 0.75


In [12]:
plst = list(params.items())

In [13]:
num_rounds = 120

###Cross validation

In [39]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(train, label_log, test_size = 0.80, random_state = 31415)

In [40]:
validation_num_rounds = 120

In [41]:
xgtrain = xgb.DMatrix(x_train, label=y_train)
xgtest = xgb.DMatrix(x_test)
model = xgb.train(plst, xgtrain, num_rounds)

# get predictions from the model, convert them and dump them!
preds = np.expm1(model.predict(xgtest))

In [42]:
print rmsle(preds, y_test)
print rmsle_loop(preds, y_test)

1.123899778
1.12389977779


##Submission

In [44]:
xgtrain = xgb.DMatrix(train, label=label_log)
xgtest = xgb.DMatrix(test)
model = xgb.train(plst, xgtrain, num_rounds)

# get predictions from the model, convert them and dump them!
preds = np.expm1(model.predict(xgtest))
preds = pd.DataFrame({"id": idx, "cost": preds})
preds.to_csv('benchmark.csv', index=False)