### Data preprocessing

In [None]:
def pre_process(n, split_date='2015-06-19'):
    """Load the sales data and build a chronological train/test split for one store.

    Reads ``data/train.csv`` and ``data/store.csv``, keeps only the rows of
    store ``n`` that are open and have positive sales, joins the store
    metadata, and augments the result with ``build_features``.

    Args:
        n: Value of the ``Store`` column to keep.
        split_date: Last date (inclusive) that belongs to the training part;
            rows with a later ``Date`` form the test part.

    Returns:
        A ``(train, test, features)`` tuple: the two DataFrames for store
        ``n`` and the list of feature column names filled in by
        ``build_features``. Both frames carry a fresh 0-based index from the
        merge restricted to this store.
    """
    print("Load the training, test and store data using pandas")
    types = {'CompetitionOpenSinceYear': np.dtype(int),
             'CompetitionOpenSinceMonth': np.dtype(int),
             'StateHoliday': np.dtype(str),
             'Promo2SinceWeek': np.dtype(int),
             'SchoolHoliday': np.dtype(int),
             'PromoInterval': np.dtype(str)}
    train = pd.read_csv("data/train.csv", parse_dates=[2], dtype=types)
    store = pd.read_csv("data/store.csv")

    # Narrow to the requested store *before* the merge and feature step: every
    # feature is computed row by row, so the selected rows end up identical,
    # but we no longer process the whole data set just to discard most of it.
    train = train[train["Store"] == n]

    print("Consider only open stores for training. Closed stores wont count into the score.")
    train = train[train["Open"] != 0]
    print("Use only Sales bigger then zero")
    train = train[train["Sales"] > 0]

    print("Join with store")
    train = pd.merge(train, store, on='Store')

    features = []

    print("augment features")
    train = build_features(features, train)
    print(features)

    print('training data processed')

    # Chronological hold-out: everything after split_date is the test part.
    is_test = train["Date"] > split_date
    test = train[is_test]
    train = train[~is_test]
    return train, test, features

### Feature Selection

In [None]:
#!/usr/bin/python
from __future__ import print_function
'''
Public Score :  0.11727 (previous 0.11771)
Private Validation Score : [1199]	train-rmspe:0.104377	eval-rmspe:0.093786
'''

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline

# Gather some features
def build_features(features, data):
    """Add model features to ``data`` in place and record their column names.

    Args:
        features: List that receives the names of the feature columns, in
            the order they are produced. It is extended in place.
        data: DataFrame holding the joined sales and store columns
            (``Open``, ``Date`` as datetime, ``StoreType``, ``Assortment``,
            ``StateHoliday``, the ``CompetitionOpenSince*`` and
            ``Promo2Since*`` columns, ``PromoInterval``, ...). It is
            modified in place.

    Returns:
        The same ``data`` object, with the feature columns added.
    """
    # A missing 'Open' flag is treated as "open". This has to happen before
    # the blanket fill below, which would otherwise turn it into 0 (closed).
    data.loc[data['Open'].isnull(), 'Open'] = 1
    # A missing promo interval means "no interval"; fill it with the empty
    # string directly so the text column never receives a numeric 0.
    data['PromoInterval'] = data['PromoInterval'].fillna('')
    # Remove the remaining NaNs.
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        # Assign the result back instead of mutating the accessed Series in
        # place, and cast all three columns so they are uniformly integer.
        data[column] = data[column].replace(mappings).astype(int)

    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    dates = data['Date'].dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    data['DayOfWeek'] = dates.dayofweek
    try:
        # Series.dt.weekofyear was removed in pandas 2.0; isocalendar() is
        # its replacement. Cast from the nullable unsigned dtype to a plain
        # signed integer so the subtraction below cannot wrap around.
        data['WeekOfYear'] = dates.isocalendar().week.astype('int64')
    except AttributeError:
        # pandas < 1.1 has no isocalendar(); use the old accessor there.
        data['WeekOfYear'] = dates.weekofyear

    # CompetitionOpen and PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Calculate time competition open time in months
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = (
        12 * (data['Year'] - data['CompetitionOpenSinceYear'])
        + (data['Month'] - data['CompetitionOpenSinceMonth'])
    )
    # Promo open time in months
    features.append('PromoOpen')
    promo_open = (
        12 * (data['Year'] - data['Promo2SinceYear'])
        + (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0
    )
    # Negative durations (promo starts in the future) are clamped to 0.
    data['PromoOpen'] = promo_open.clip(lower=0)
    # Year 0 is the fill value for "no Promo2 start recorded".
    data.loc[data['Promo2SinceYear'] == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    # 'Sept' (not 'Sep') is intentional: the abbreviations must match the
    # tokens used inside the PromoInterval strings.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data['Month'].map(month2str)
    # Callers may hand in a frame whose missing intervals are already 0.
    data.loc[data['PromoInterval'] == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data['PromoInterval'].unique():
        if interval == '':
            continue
        # One mask per distinct interval: rows carrying this interval whose
        # month abbreviation is one of its comma-separated tokens.
        in_promo_month = (
            (data['PromoInterval'] == interval)
            & data['monthStr'].isin(interval.split(','))
        )
        data.loc[in_promo_month, 'IsPromoMonth'] = 1

    return data

### Evaluation Metrics

In [None]:
def rmspe(y, yhat):
    """Return the root mean square percentage error of ``yhat`` against ``y``.

    Each error is expressed relative to the true value, so ``y`` must not
    contain zeros.
    """
    relative_error = (y - yhat) / y
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def rmse(y, yhat):
    """Return the root mean square error between ``y`` and ``yhat``."""
    residual = y - yhat
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

def rmspe_xg(yhat, y):
    """Evaluation callback computing RMSPE on the original (non-log) scale.

    ``yhat`` holds predictions and ``y`` is an object whose ``get_label()``
    returns the targets; both are mapped back with ``expm1`` before being
    scored. Returns the ``("rmspe", value)`` pair.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

### Random Forest

In [None]:
def random_forest(train, test, features):
    """Fit a random forest on log-transformed sales and predict the test rows.

    Args:
        train: DataFrame with the ``features`` columns and a ``Sales`` column.
        test: DataFrame with the ``features`` columns.
        features: List of column names used as model inputs.

    Returns:
        Array of predicted sales for ``test``, on the original sales scale.
    """
    print("Fit a random forest model")
    X_train = train[features]
    # The model is trained on log1p(Sales) ...
    y_train = np.log1p(train.Sales)
    regr = RandomForestRegressor(max_depth=2, random_state=0)
    regr.fit(X_train, y_train)
    # ... so predictions must be mapped back with expm1, the exact inverse of
    # log1p. Plain exp() would shift every prediction up by 1.
    test_probs = np.expm1(regr.predict(test[features]))
    return test_probs


def plot_results(test, test_probs):
    """Plot true values and predictions on a shared axis and show the figure.

    ``test`` is drawn against its positional index (0 .. len-1) and labelled
    'true'; ``test_probs`` is drawn with the default x positions and
    labelled 'pred'.
    """
    positions = range(test.shape[0])
    plt.plot(positions, test, label='true')
    plt.plot(test_probs, label='pred')
    plt.legend()
    plt.show()
    

# Stores evaluated individually; this list is reused by the later model loop.
store_list = [2, 3, 5, 85, 259, 262, 1, 4, 21, 13, 15, 18]
# Per-store true test sales and random-forest predictions, in store_list order.
rf_test_sales_list = []
rf_test_probs_list = []
for i in store_list:
    # pre_process re-reads the CSV files on every call, so each iteration
    # starts from freshly loaded data for store i.
    train, test, features = pre_process(i)
    test_probs = random_forest(train, test,features)
    
    rf_test_sales_list.append(test.Sales)
    rf_test_probs_list.append(test_probs)
    # One figure per store: true sales vs. predicted sales.
    plot_results(test.Sales, test_probs)

In [None]:
def xgboost(train, test, features):
    """Train a boosted-tree model on log-transformed sales and predict the test rows.

    A 2% random slice of ``train`` is held out as a validation set that
    drives early stopping (scored with ``rmspe_xg``).

    Args:
        train: DataFrame with the ``features`` columns and a ``Sales`` column.
        test: DataFrame with the ``features`` columns.
        features: List of column names used as model inputs.

    Returns:
        Array of predicted sales for ``test``, on the original sales scale.
    """
    # NOTE(review): "thread" does not look like a documented XGBoost parameter
    # (the documented name is "nthread") - confirm the intent before renaming,
    # since enabling it would change how many threads training uses.
    # NOTE(review): "reg:linear" and "silent" are deprecated spellings in newer
    # XGBoost releases - verify against the installed version.
    params = {"objective": "reg:linear",
              "booster" : "gbtree",
              "eta": 0.1,
              "max_depth": 10,
              "subsample": 0.85,
              "colsample_bytree": 0.4,
              "min_child_weight": 6,
              "silent": 1,
              "thread": 1,
              "seed": 1301
              }
    num_boost_round = 1200

    X_train, X_valid = train_test_split(train, test_size=0.02, random_state=10)
    # The model is trained on log1p(Sales); every prediction below is mapped
    # back with expm1, its exact inverse.
    y_train = np.log1p(X_train.Sales)
    y_valid = np.log1p(X_valid.Sales)
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=200, \
      feval=rmspe_xg, verbose_eval=True)

    # Report the hold-out error instead of computing it and throwing it away.
    yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
    error = rmspe(X_valid.Sales.values, np.expm1(yhat))
    print("Validation RMSPE: {:.6f}".format(error))

    dtest = xgb.DMatrix(test[features])
    # expm1, not exp: plain exp() would shift every prediction up by 1 and
    # disagree with the validation scoring above.
    test_probs = np.expm1(gbm.predict(dtest))
    return test_probs


# Per-store true test sales and boosted-tree predictions, in store_list order.
test_sales_list=[]
test_probs_list=[]
for i in store_list:
    # pre_process re-reads the CSV files on every call, so each iteration
    # starts from freshly loaded data for store i.
    train, test, features = pre_process(i)
    test_probs = xgboost(train, test, features)
    
    test_sales_list.append(test.Sales)
    test_probs_list.append(test_probs)
    # One figure per store: true sales vs. predicted sales.
    plot_results(test.Sales, test_probs)
    