In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt

In [2]:
def create_feature_map(features):
    """Write a feature-map file for `features` to 'xgb.fmap'.

    One line is written per feature, in the form
    ``<index>\\t<name>\\tq``, where the index is the feature's position
    in `features`. An existing 'xgb.fmap' in the current working directory
    is overwritten.

    Parameters
    ----------
    features : iterable of str
        Feature names, in the column order used to build the DMatrix.
    """
    # The context manager closes the file even if a write raises.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def rmspe(y, yhat):
    """Root mean square percentage error between `y` and `yhat`.

    Computes ``sqrt(mean((yhat / y - 1) ** 2))`` element by element.

    Parameters
    ----------
    y : array-like
        Ground-truth values. Must not contain zeros (division by `y`).
    yhat : array-like
        Predicted values, same length as `y`.

    Returns
    -------
    float
        The RMSPE over all elements.
    """
    # Coerce to plain float arrays so that lists work and pandas Series are
    # compared by position rather than being re-aligned on their index labels.
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(yhat, dtype=float)
    return np.sqrt(np.mean((predicted / actual - 1) ** 2))

def rmspe_xg(yhat, y):
    """RMSPE evaluation callback for models trained on log1p targets.

    Parameters
    ----------
    yhat : array-like
        Predictions on the log1p scale.
    y : object exposing ``get_label()``
        Holds the true labels on the log1p scale (the DMatrix when this
        function is passed as `feval` to `xgb.train`).

    Returns
    -------
    tuple
        ``("rmspe", value)`` with the error computed on the original scale.
    """
    # Undo the log1p transform on both sides before scoring.
    actual_sales = np.expm1(y.get_label())
    predicted_sales = np.expm1(yhat)
    score = rmspe(actual_sales, predicted_sales)
    return "rmspe", score

# Gather some features

In [3]:
def build_features(features, data):
    """Add engineered feature columns to `data` in place and record their names.

    Parameters
    ----------
    features : list
        Extended in place with the names of the columns to feed the model.
    data : pandas.DataFrame
        Sales rows already joined with the store table. It must contain the
        columns read below (Open, Date, StoreType, Assortment, StateHoliday,
        CompetitionOpenSince*, Promo2Since*, PromoInterval, ...). The frame is
        modified in place; a helper column 'monthStr' is also added.

    Returns
    -------
    pandas.DataFrame
        The same `data` object that was passed in.
    """
    # remove NaNs
    # A missing 'Open' flag means "assume open". This has to run before the
    # blanket fill below, otherwise those NaNs have already been turned into 0.
    data.loc[data['Open'].isnull(), 'Open'] = 1
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        # Assign the encoded column back to the frame: an in-place replace on
        # an attribute-accessed Series may act on a temporary object and leave
        # `data` unchanged. Values without a mapping pass through untouched.
        data[column] = data[column].map(lambda value: mappings.get(value, value))

    # Calendar parts of the sale date
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    dates = data['Date'].dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    data['DayOfWeek'] = dates.dayofweek
    if hasattr(dates, 'isocalendar'):
        # `.dt.weekofyear` no longer exists in recent pandas; the ISO calendar
        # week is the same quantity. Cast from the nullable UInt32 result so
        # the arithmetic below stays in ordinary numpy dtypes.
        data['WeekOfYear'] = dates.isocalendar().week.astype('int64')
    else:
        # Older pandas without `isocalendar`
        data['WeekOfYear'] = dates.weekofyear

    # Competition open time in months
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = (12 * (data['Year'] - data['CompetitionOpenSinceYear'])
                               + (data['Month'] - data['CompetitionOpenSinceMonth']))

    # Promo open time in months
    features.append('PromoOpen')
    promo_open = (12 * (data['Year'] - data['Promo2SinceYear'])
                  + (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0)
    # Keep strictly positive durations, everything else becomes 0
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)
    # Promo2SinceYear == 0 is the filled-in "missing" marker from above
    data.loc[data['Promo2SinceYear'] == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    # Month abbreviations as they are compared against PromoInterval entries
    # (note 'Sept', not 'Sep').
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data['Month'].map(month2str)
    # Missing intervals were filled with 0 above; normalise them to ''
    data.loc[data['PromoInterval'] == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data['PromoInterval'].unique():
        if interval != '':
            in_interval = data['PromoInterval'] == interval
            in_month = data['monthStr'].isin(interval.split(','))
            data.loc[in_interval & in_month, 'IsPromoMonth'] = 1

    return data


In [None]:
# Load the raw CSV files into the notebook globals `train`, `test` and `store`.
print("Load the training, test and store data using pandas")
# Explicit dtypes passed to the train/test reads below; store.csv is read
# with pandas' inferred dtypes instead.
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
# parse_dates is positional here: column index 2 of train.csv and column
# index 3 of test.csv are parsed as datetimes.
# NOTE(review): presumably both indices point at the `Date` column that
# build_features reads through `.dt` — confirm against the CSV headers.
train = pd.read_csv("train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("store.csv")

Load the training, test and store data using pandas


In [10]:
# Clean the raw frames, join them with the store table and add the
# engineered features.
# NOTE(review): this cell reassigns `train` and `test` (filter + merge), so
# running it a second time without re-running the load cell merges the store
# table into the frames again.
print("Assume store open, if not provided")
# fillna(1) replaces every NaN in the frame with 1, not only those in "Open".
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
# rmspe divides by the true sales, so zero-sales rows are dropped here.
train = train[train["Sales"] > 0]

print("Join with store")
# Default (inner) merge on the shared 'Store' key.
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []

print("augment features")
# build_features mutates the frame it receives, so its return value is not
# needed. The second call gets a throwaway list so the feature names are
# recorded only once.
# NOTE(review): the saved output of this cell lists 'Promo2', which the
# build_features definition in this notebook never adds — the output looks
# stale; confirm by re-running.
build_features(features, train)
build_features([], test)
print(features)

Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
Join with store
augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']


In [11]:
# Hyper-parameters and boosting-round budget used by the xgb.train call in the
# training cell.
print('training data processed')

# NOTE(review): "reg:linear" and "silent" are legacy XGBoost parameter names;
# newer releases use "reg:squarederror" and "verbosity" — confirm against the
# installed xgboost version before changing them.
params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
# Upper bound on boosting rounds (early stopping may end training sooner).
num_boost_round = 300

training data processed


In [12]:
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']

# Fit the booster on a random train/validation split of `train`.
print("Train a XGBoost model")
# Hold out 1.2% of the rows for validation; random_state makes the split
# repeatable.
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
# The model is trained on log1p(Sales); rmspe_xg undoes this with expm1
# before scoring.
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

# Both sets are evaluated every round; rmspe_xg is the custom metric and
# training stops after 100 rounds without improvement (see the log output:
# 'eval-rmspe' is the metric used for early stopping).
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

Train a XGBoost model
[0]	train-rmse:5.79368	eval-rmse:5.79362	train-rmspe:0.99684	eval-rmspe:0.996838
Multiple eval metrics have been passed: 'eval-rmspe' will be used for early stopping.

Will train until eval-rmspe hasn't improved in 100 rounds.
[1]	train-rmse:4.06275	eval-rmse:4.06366	train-rmspe:0.981524	eval-rmspe:0.981525
[2]	train-rmse:2.85319	eval-rmse:2.85523	train-rmspe:0.938095	eval-rmspe:0.938159
[3]	train-rmse:2.00969	eval-rmse:2.01187	train-rmspe:0.856661	eval-rmspe:0.856648
[4]	train-rmse:1.42234	eval-rmse:1.42514	train-rmspe:0.74415	eval-rmspe:0.743779
[5]	train-rmse:1.01698	eval-rmse:1.02011	train-rmspe:0.619457	eval-rmspe:0.617958
[6]	train-rmse:0.741238	eval-rmse:0.744666	train-rmspe:0.503996	eval-rmspe:0.500151
[7]	train-rmse:0.55319	eval-rmse:0.556665	train-rmspe:0.411686	eval-rmspe:0.403568
[8]	train-rmse:0.430897	eval-rmse:0.434265	train-rmspe:0.348815	eval-rmspe:0.335589
[9]	train-rmse:0.352225	eval-rmse:0.35527	train-rmspe:0.311021	eval-rmspe:0.292855
[10]	tra

[96]	train-rmse:0.10148	eval-rmse:0.106777	train-rmspe:0.127281	eval-rmspe:0.113493
[97]	train-rmse:0.101183	eval-rmse:0.106541	train-rmspe:0.127007	eval-rmspe:0.113204
[98]	train-rmse:0.101056	eval-rmse:0.106426	train-rmspe:0.12692	eval-rmspe:0.113044
[99]	train-rmse:0.100873	eval-rmse:0.106307	train-rmspe:0.126697	eval-rmspe:0.112928
[100]	train-rmse:0.100556	eval-rmse:0.106075	train-rmspe:0.126348	eval-rmspe:0.112672
[101]	train-rmse:0.100179	eval-rmse:0.105782	train-rmspe:0.125751	eval-rmspe:0.112239
[102]	train-rmse:0.099906	eval-rmse:0.10548	train-rmspe:0.125471	eval-rmspe:0.111937
[103]	train-rmse:0.099658	eval-rmse:0.105214	train-rmspe:0.125252	eval-rmspe:0.111513
[104]	train-rmse:0.099494	eval-rmse:0.105093	train-rmspe:0.125105	eval-rmspe:0.111386
[105]	train-rmse:0.099302	eval-rmse:0.104929	train-rmspe:0.124931	eval-rmspe:0.111201
[106]	train-rmse:0.099129	eval-rmse:0.104842	train-rmspe:0.123911	eval-rmspe:0.111085
[107]	train-rmse:0.098929	eval-rmse:0.104684	train-rmspe:0.12

[192]	train-rmse:0.083801	eval-rmse:0.09486	train-rmspe:0.093695	eval-rmspe:0.099373
[193]	train-rmse:0.083734	eval-rmse:0.094777	train-rmspe:0.093574	eval-rmspe:0.099254
[194]	train-rmse:0.083657	eval-rmse:0.094769	train-rmspe:0.093479	eval-rmspe:0.09925
[195]	train-rmse:0.083553	eval-rmse:0.094757	train-rmspe:0.09339	eval-rmspe:0.099233
[196]	train-rmse:0.083362	eval-rmse:0.094637	train-rmspe:0.093185	eval-rmspe:0.099107
[197]	train-rmse:0.083217	eval-rmse:0.094506	train-rmspe:0.093001	eval-rmspe:0.098907
[198]	train-rmse:0.082988	eval-rmse:0.094368	train-rmspe:0.092773	eval-rmspe:0.098764
[199]	train-rmse:0.082905	eval-rmse:0.094342	train-rmspe:0.092689	eval-rmspe:0.098717
[200]	train-rmse:0.082812	eval-rmse:0.094307	train-rmspe:0.092596	eval-rmspe:0.098675
[201]	train-rmse:0.08268	eval-rmse:0.094218	train-rmspe:0.092447	eval-rmspe:0.09859
[202]	train-rmse:0.082572	eval-rmse:0.094144	train-rmspe:0.092298	eval-rmspe:0.098488
[203]	train-rmse:0.082503	eval-rmse:0.094106	train-rmspe:0.

[288]	train-rmse:0.074867	eval-rmse:0.090771	train-rmspe:0.080737	eval-rmspe:0.094959
[289]	train-rmse:0.07478	eval-rmse:0.090766	train-rmspe:0.080597	eval-rmspe:0.094956
[290]	train-rmse:0.074753	eval-rmse:0.090751	train-rmspe:0.080574	eval-rmspe:0.094943
[291]	train-rmse:0.074664	eval-rmse:0.090701	train-rmspe:0.080367	eval-rmspe:0.094896
[292]	train-rmse:0.074592	eval-rmse:0.090653	train-rmspe:0.08023	eval-rmspe:0.094788
[293]	train-rmse:0.074534	eval-rmse:0.090642	train-rmspe:0.080119	eval-rmspe:0.094768
[294]	train-rmse:0.074459	eval-rmse:0.09058	train-rmspe:0.080021	eval-rmspe:0.094689
[295]	train-rmse:0.0744	eval-rmse:0.090585	train-rmspe:0.079954	eval-rmspe:0.094701
[296]	train-rmse:0.074349	eval-rmse:0.090564	train-rmspe:0.079904	eval-rmspe:0.094681
[297]	train-rmse:0.074293	eval-rmse:0.090551	train-rmspe:0.07983	eval-rmspe:0.094658
[298]	train-rmse:0.074257	eval-rmse:0.090555	train-rmspe:0.079793	eval-rmspe:0.09466
[299]	train-rmse:0.074197	eval-rmse:0.090554	train-rmspe:0.07

In [13]:
# Re-score the held-out rows outside of the training loop.
print("Validating")
# Predictions are on the log1p scale (the model was fit on log1p(Sales)),
# so expm1 maps them back to sales before computing the RMSPE.
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
# `.values` passes a plain array so the comparison is positional.
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

Validating
RMSPE: 0.094663


In [14]:
# Predict the test set and write the submission file.
print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
# Despite the name, `test_probs` holds regression outputs on the log1p scale
# (the model was fit on log1p(Sales)), not probabilities.
test_probs = gbm.predict(dtest)
# Make Submission
# expm1 converts the predictions back to sales; rows stay paired with their
# "Id" because both come from the same `test` frame.
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("xgboost_10_submission.csv", index=False)


Make predictions on the test set


In [15]:
# Chart each feature's share of the booster's total f-score and save the
# figure to 'feature_importance_xgb.png'.
create_feature_map(features)
# (feature, fscore) pairs in ascending fscore order
importance = sorted(gbm.get_fscore(fmap='xgb.fmap').items(),
                    key=lambda pair: pair[1])

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalise so the scores sum to 1
df['fscore'] = df['fscore'].div(df['fscore'].sum())

featp = df.plot.barh(x='feature', y='fscore', legend=False, figsize=(6, 10))
featp.set_title('XGBoost Feature Importance')
featp.set_xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)