# XGBoost
adapted from https://www.kaggle.com/gaborfodor/1-xgboost-baseline

In [1]:
%matplotlib inline
import os
import pandas as pd
import datetime as dt
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

In [2]:
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 14
pd.set_option('display.max_columns', 99)
start = dt.datetime.now()

In [3]:
full = pd.read_csv('./features.csv.gz')
full.shape

(2777744, 56)

In [4]:
TRAIN_SAMPLE_SIZE = 0.5

In [5]:
full['random'] = np.random.rand(len(full))

train = full[full.IsTrain == 1]
test = full[full.IsTrain == 0]

column_stats = pd.concat([
    pd.DataFrame(full.count()).rename(columns={0: 'cnt'}),
    pd.DataFrame(full.nunique()).rename(columns={0: 'unique'}),
], sort=True, axis=1)
column_stats.sort_values(by='unique')

train_columns = list(column_stats[column_stats.cnt < 10 ** 6].index)
print(train_columns)

target_columns = [
    'TotalTimeStopped_p20',
    'TotalTimeStopped_p50',
    'TotalTimeStopped_p80',
    'DistanceToFirstStop_p20',
    'DistanceToFirstStop_p50',
    'DistanceToFirstStop_p80',
]

do_not_use = train_columns + ['IsTrain', 'Path', 'RowId', 'IntersectionId',
                              'random', 'intersection_random', 'ValidationGroup']

feature_columns = [c for c in full.columns if c not in do_not_use]
print(len(feature_columns))
print(feature_columns)

Unnamed: 0,cnt,unique
SameStreet,2777744,2
Weekend,2777744,2
IsTrain,2777744,2
EntryStreetMissing,2777744,2
ExitStreetMissing,2777744,2
ValidationGroup,2777744,3
City,2777744,4
ExitType,2777744,5
EntryType,2777744,5
EntryHeading,2777744,8


['DistanceToFirstStop_p20', 'DistanceToFirstStop_p40', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p60', 'DistanceToFirstStop_p80', 'TimeFromFirstStop_p20', 'TimeFromFirstStop_p40', 'TimeFromFirstStop_p50', 'TimeFromFirstStop_p60', 'TimeFromFirstStop_p80', 'TotalTimeStopped_p20', 'TotalTimeStopped_p40', 'TotalTimeStopped_p50', 'TotalTimeStopped_p60', 'TotalTimeStopped_p80']
36
['City', 'EntryHeading', 'EntryStreetName', 'ExitHeading', 'ExitStreetName', 'Hour', 'Latitude', 'Longitude', 'Month', 'Weekend', 'Latitude3', 'Longitude3', 'EntryStreetMissing', 'ExitStreetMissing', 'CMWH', 'DiffHeading', 'Rainfall', 'Temperature', 'EntryType', 'ExitType', 'Intersection', 'SameStreet', 'LatitudeDist', 'LongitudeDist', 'CenterDistL1', 'CenterDistL2', 'Longitude3Count', 'Latitude3Count', 'ExitStreetNameCount', 'EntryStreetNameCount', 'IntersectionCount', 'PathCount', 'Longitude3UniqueIntersections', 'Latitude3UniqueIntersections', 'ExitStreetNameUniqueIntersections', 'EntryStreetNameUniqueInte

In [6]:
fix = {
    'lambda': 1., 'nthread': 4, 'booster': 'gbtree',
    'silent': 1, 'eval_metric': 'rmse',
    'objective': 'reg:squarederror'}
config = dict(min_child_weight=20,
              eta=0.05, colsample_bytree=0.6,
              max_depth=50, subsample=0.8)
config.update(fix)
nround = 200

In [7]:
total_mse = 0.0
submission_parts = []
for i, target in enumerate(target_columns):
    print(f'Training and predicting for target {target}')
    train_idx = train.random < TRAIN_SAMPLE_SIZE
    valid_idx = train.random >= TRAIN_SAMPLE_SIZE

    Xtr = train[train_idx][feature_columns]
    Xv = train[valid_idx][feature_columns]
    ytr = train[train_idx][target].values
    yv = train[valid_idx][target].values
    print(Xtr.shape, ytr.shape, Xv.shape, yv.shape)

    dtrain = xgb.DMatrix(Xtr, label=ytr)
    dvalid = xgb.DMatrix(Xv, label=yv)

    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(config, dtrain, nround, evals=watchlist,
                      verbose_eval=100, early_stopping_rounds=100)

    pv = model.predict(dvalid)
    mse = np.mean((yv - pv) ** 2)
    total_mse += mse / 6
    print(target, 'rmse', np.sqrt(mse))

    df = pd.DataFrame({
        'TargetId': test.RowId.astype(str) + '_' + str(i),
        'Target': model.predict(xgb.DMatrix(test[feature_columns]))})
    submission_parts.append(df)

Training and predicting for target TotalTimeStopped_p20
(429229, 36) (429229,) (428180, 36) (428180,)
[0]	train-rmse:7.06011	valid-rmse:7.0878
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:4.03814	valid-rmse:5.61322
[199]	train-rmse:3.15945	valid-rmse:5.65136
TotalTimeStopped_p20 rmse 5.6517304424800425
Training and predicting for target TotalTimeStopped_p50
(429229, 36) (429229,) (428180, 36) (428180,)
[0]	train-rmse:16.6505	valid-rmse:16.6514
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:6.77987	valid-rmse:9.68001
[199]	train-rmse:5.13267	valid-rmse:9.66619
TotalTimeStopped_p50 rmse 9.666541547480934
Training and predicting for target TotalTimeStopped_p80
(429229, 36) (429229,) (428180, 36) (428180,)
[0]	train-rmse:34.7746	valid-rmse:34.6761
Multiple eva

In [8]:
rmse = np.sqrt(total_mse)
print('Total rmse', rmse)
submission = pd.concat(submission_parts, sort=True)
submission.to_csv('XGBoostsubmission.csv', index=False)

Total rmse 41.165263860501845


In [9]:
end = dt.datetime.now()
print('Latest run {}.\nTotal time {}s'.format(end, (end - start).seconds))

Latest run 2019-12-11 13:24:00.717789.
Total time 3627s
