# Import

In [1]:
# Standard Imports
import os
import pandas as pd
import numpy as np
import sys
import gc
from pathlib import Path
import pickle

# Modelling
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
import lightgbm as ltb
from sklearn.model_selection import (train_test_split, GridSearchCV)


# Visualisation
import plotly.express as px
import plotly.graph_objects as go

# Src modules
sys.path.append(os.path.dirname(os.getcwd())) # Add the parent directory to the Python path so we can import src modules
from src.data_setup import *
from src.DT_Data_Processing import *
from src.model_evaluation import model_eval_pipeline,calc_root_mean_squared_error,calc_root_mean_squared_log_error
from src.generic import *

#Path Variables
ROOT_PATH =Path(os.path.dirname(os.getcwd()))
DATA_PATH = ROOT_PATH / 'data'
SUBMISSION_PATH = DATA_PATH / 'submissions'
MODEL_PATH=ROOT_PATH/'models/'
DECISIONTREE_PATH=MODEL_PATH/'DecisionTree/'

# Load Data

In [2]:
# load train,val and test data
train, test, stores, transactions = get_data()
#daily_sales = get_daily_sales(train).reset_index()
print(f'train: {train.shape[0]:,d} rows')
train = clean_train(train)
print(f'cleaned train: {train.shape[0]:,d} rows')

loading pickled dataframes...
train: 3,000,888 rows
cleaned train: 2,780,316 rows


# Data Processing

In [3]:
#Transform train data to Decision Tree Input
df_train =Transform_Data_For_DT(train,60,True)
#Create basic, trend and seasonality features 
df_feats=DT_features(df_train,True)

#Save the data for speed purpose
df_train.to_csv(DATA_PATH/'processed/DT_Train.csv')
df_feats.to_csv(DATA_PATH/'processed/DT_Train_Features.csv')

#Load the data for speed purpose
#df_train = pd.read_csv(DATA_PATH/'processed/DT_Train.csv')
#df_feats= pd.read_csv(DATA_PATH/'processed/DT_Train_Features.csv')

#Create the split train/validation
df_training,df_validation=train_val_split(df_train)
df_feats_use,df_feats_validation=train_val_split(df_feats)

#Remove Date column for features dataframe
df_feats_validation=df_feats_validation.drop(columns={'date'})
df_feats_use=df_feats_use.drop(columns={'date'})

# Model Training/Loading

In [9]:
#Load Model if they exists
#lgbm = load_model(DECISIONTREE_PATH,'lgbm.pkl')
#rf = load_model(DECISIONTREE_PATH,'rf.pkl')
#gbr = load_model(DECISIONTREE_PATH,'gbr.pkl')

#If not, train the model
#lgbm = ltb.LGBMRegressor(random_state=64)
#lgbm.fit(df_feats_use, df_training['target'])
#rf= RandomForestRegressor(random_state=64)
#rf.fit(df_feats_use, df_training['target'])
gbr=GradientBoostingRegressor(random_state=64)
gbr.fit(df_feats_use, df_training['target'])

#If you had to train the models, save them:
#save_model(DECISIONTREE_PATH, lgbm, 'lgbm.pkl')
#save_model(DECISIONTREE_PATH, rf, 'rf.pkl')
save_model(DECISIONTREE_PATH, gbr, 'gbr.pkl')

Model has been saved in the following location C:/Users/lcauche/Documents/kaggle-store-sales/models/DecisionTree\gbr.pkl


()

# Test

Prediction

In [12]:
#Run each model on the validation data
lgbm_pred=lgbm.predict(df_feats_validation)
#rf_pred=rf.predict(df_feats_validation)
gbr_pred=gbr.predict(df_feats_validation)

#Zero all negative values
for i in range(len(lgbm_pred)):
    lgbm_pred[i]=max(0,lgbm_pred[i])
#for i in range(len(rf_pred)):
#    rf_pred[i]=max(0,rf_pred[i])
for i in range(len(gbr_pred)):
    gbr_pred[i]=max(0,gbr_pred[i])

#Measure RMSLE Score
lgbm_score = calc_root_mean_squared_log_error(df_validation['target'], lgbm_pred)
#rf_score = calc_root_mean_squared_log_error(df_validation['target'], rf_pred)
gbr_score = calc_root_mean_squared_log_error(df_validation['target'], gbr_pred)



In [14]:
gbr_score

0.9416879740521417

Visualisation

In [11]:
#Create a line to compare validation and prediction
hist=df_training.rename(columns={'target':'sales'})
val=df_validation.rename(columns={'target':'sales'})
reg_Prediction = val.copy()
#rf_Prediction=val.copy()
gbr_Prediction=val.copy()
reg_Prediction['sales']=lgbm_pred
#rf_Prediction['sales']=rf_pred
gbr_Prediction['sales']=gbr_pred
val = val.groupby("date").sales.sum().reset_index()
hist = hist.groupby("date").sales.sum().reset_index()
hist =hist[hist['date']>='2017-01-01']
reg_Prediction = reg_Prediction.groupby("date").sales.sum().reset_index()
#rf_Prediction = rf_Prediction.groupby("date").sales.sum().reset_index()
gbr_Prediction = gbr_Prediction.groupby("date").sales.sum().reset_index()
#merged_data = pd.concat([hist, val,reg_Prediction,rf_Prediction,gbr_Prediction], axis=0,keys=['Train', 'Validation','lgbm_pred','rf_pred','gbr_pred'])
merged_data = pd.concat([hist, val,reg_Prediction,gbr_Prediction], axis=0,keys=['Train', 'Validation','lgbm_pred','gbr_pred'])
merged_data = merged_data.reset_index().rename(columns={'level_0': 'Dataset','level 1': 'id'})
merged_data.to_pickle(DATA_PATH / 'results/DT_results.pkl')
fig = px.line(merged_data, x='date', y='sales', color='Dataset', title='Sales Data')
fig.show()

# Create Submission

Cascade prediction

In [None]:
# Concat train and test data
New_data=pd.concat([train,test],axis=0,keys=['Train','Test'])
New_data = New_data.reset_index().rename(columns={'level_0': 'Dataset'})
New_data=New_data.reset_index().drop(columns={'id'})
New_data = New_data.rename(columns={'level_1': 'id'})
#Assign a value to the NaN from Test
New_data.fillna("nan",inplace=True)
#Encode the family to 
family_list =New_data['family'].unique()
fam_le = pd.DataFrame(family_list, columns=['family'])
le=LabelEncoder()
fam_le['family_le']=le.fit_transform(fam_le['family'])
n=train.shape[0]
i=0
SIZE = 60 #we take 2 past months here for each time-series point
COLUMNS = ['t{}'.format(x) for x in range(SIZE)]
#Loop for all the test data
while n<New_data.shape[0]:
    obj=New_data.iloc[n]
    df_test= []
    fam_list= []
    sto_list = []
    date_list=[]
    fam =obj['family']
    sto =obj['store_nbr']
    le=fam_le[(fam_le['family']==fam)]['family_le'].values[0]
    tmp= New_data[(New_data['family']==fam) & (New_data['store_nbr']==sto) & (New_data['sales'] != 'nan')]
    tmp=tmp.reset_index()
    df_test.append(tmp.loc[tmp.shape[0]-SIZE:tmp.shape[0], 'sales'].tolist())
    fam_list.append(fam)
    sto_list.append(sto)
    date_list.append(obj['date'])
    df_test = pd.DataFrame(df_test, columns=COLUMNS)
    df_test['family']=le
    df_test['store_nbr']=sto_list
    df_test['date']=date_list
    df_test["year"] = df_test.date.dt.year
    df_test["month"] = df_test.date.dt.month
    df_test["daynumber"] = df_test.date.dt.day
    df_test['day_of_week'] = df_test['date'].dt.dayofweek

    # Create features for the new row
    temp=df_test.drop(columns={'family','store_nbr','date','year','month','daynumber','day_of_week'})
    df_feats=pd.DataFrame()
    df_feats['prev_1'] = temp.iloc[:,-2] #Here -2 as -1 is a target
    for win in [2, 3, 5, 7, 10, 14, 21, 28, 56]:
        tmp = temp.iloc[:,-1-win:-1]
        #General statistics for base level
        df_feats['mean_prev_{}'.format(win)] = tmp.mean(axis=1)
        df_feats['median_prev_{}'.format(win)] = tmp.median(axis=1)
        df_feats['min_prev_{}'.format(win)] = tmp.min(axis=1)
        df_feats['max_prev_{}'.format(win)] = tmp.max(axis=1)
        df_feats['std_prev_{}'.format(win)] = tmp.std(axis=1)
        #Capturing trend
        df_feats['mean_ewm_prev_{}'.format(win)] = tmp.T.ewm(com=9.5).mean().T.mean(axis=1)
        df_feats['last_ewm_prev_{}'.format(win)] = tmp.T.ewm(com=9.5).mean().T.iloc[:,-1]
        
        df_feats['avg_diff_{}'.format(win)] = (tmp - tmp.shift(1, axis=1)).mean(axis=1)
        #df_feats['avg_div_{}'.format(win)] = (tmp / tmp.shift(1, axis=1)).mean(axis=1)
    for win in [2, 3, 4, 8]:
        tmp = df_train.iloc[:,-1-win*7:-1:7] #7 for week
        #Features for weekly seasonality
        df_feats['week_mean_prev_{}'.format(win)] = tmp.mean(axis=1)
        df_feats['week_median_prev_{}'.format(win)] = tmp.median(axis=1)
        df_feats['week_min_prev_{}'.format(win)] = tmp.min(axis=1)
        df_feats['week_max_prev_{}'.format(win)] = tmp.max(axis=1)
        df_feats['week_std_prev_{}'.format(win)] = tmp.std(axis=1)
    df_feats['family']=df_test['family']
    df_feats['store_nbr']=df_test['store_nbr']
    #df_feats['date']=df_test['date']
    df_feats['year']=df_test['year']
    df_feats['month']=df_test['month']
    df_feats['day']=df_test['daynumber']
    df_feats['day_of_week']=df_test['day_of_week']

    #Prediction on the new row
    pred=lgbm.predict(df_feats)

    #Get prediction - could be in a list or just the value
    try:
        pred=pred[0]
    except:
        pred=pred
    pred = max(0,pred)
    New_data.loc[n,'sales']=pred

    #Go to next row
    n=n+1
    #update tracker
    i=i+1
    if i==100:
        i=0
        print("step number ",n-train.shape[0]," has been processed over", test.shape[0], " steps")



In [35]:
def Transform_Data_For_DT_Test(df,SIZE:60, enable_encode:False):
    """
    Take a dataframe, and transform it to train Decision Tree models
    -> Moving from 1 long line with time series data to many training samples with target values
    link: https://towardsdatascience.com/approaching-time-series-with-a-tree-based-model-87c6d1fb6603

    Encode the family column to be used by Decision Tree (if enable)
    Args:
        df (dataframe): dataframe with daily sales, family and store_nbr
        SIZE (integer): number of prior data points you want to use to train your model
        enable_encode (bool): enable the label encoder to encode the family columns

    Returns:
        df_train: new data frame with prior data points and a target sales
    """
    if SIZE <=15:
        print('SIZE parameter is too small, pick a bigger integer')
        return()
    else:
        COLUMNS = ['t{}'.format(x) for x in range(SIZE-15)] + ['target']
        df_train= []
        fam_list= []
        sto_list = []
        date_list=[]
        family_list =df['family'].unique()
        store_list =df['store_nbr'].unique()
        dataset_list = []
        for fam in family_list:
            for sto in store_list:
                tmp= df[(df['family']==fam) & (df['store_nbr']==sto)]
                tmp=tmp.reset_index()
                if tmp.shape[0]>0:
                    for i in range(SIZE, tmp.shape[0]):
                        if tmp.loc[i,'Dataset']=='Test':
                            df_train.append(tmp.loc[i-SIZE:i-15, 'sales'].tolist())
                            fam_list.append(fam)
                            sto_list.append(sto)
                            date_list.append(tmp.loc[i, 'date'])
                            dataset_list.append(tmp.loc[i,'Dataset'])
        df_train = pd.DataFrame(df_train, columns=COLUMNS)
        df_train['family']=fam_list
        df_train['store_nbr']=sto_list
        df_train['date']=date_list
        #df_train['Dataset']=dataset_list
        df_train['date']= pd.to_datetime(df_train['date'])
        df_train["year"] = df_train.date.dt.year
        df_train["month"] = df_train.date.dt.month
        df_train["daynumber"] = df_train.date.dt.day
        df_train['day_of_week'] = df_train['date'].dt.dayofweek
        columns= ['family', 'store_nbr']
        if enable_encode:
            columns= ['family']
            for col in columns:
                le=LabelEncoder()
                df_train[col]=le.fit_transform(df_train[col])
        
        return(df_train)

In [40]:
#Create new data set using train and test
New_data=pd.concat([train,test],axis=0,keys=['Train','Test'])
New_data = New_data.reset_index().rename(columns={'level_0': 'Dataset'})
New_data=New_data.reset_index().drop(columns={'id'})
New_data = New_data.rename(columns={'level_1': 'id'})
#Assign a value to the NaN from Test
New_data.fillna("nan",inplace=True)
#Transform train data to Decision Tree Input
df_test =Transform_Data_For_DT_Test(New_data,60,True)
#Create basic, trend and seasonality features 
df_feats_test=DT_features(df_test,True)
df_feats_test=df_feats_test.drop(columns={'date'})
pred=lgbm.predict(df_feats_test)

#Remove negative values
for i in range(len(pred)):
    pred[i]=max(0,pred[i])


Output

In [None]:
#Taking only the Test Data
Output = New_data[New_data['Dataset']=='Test']
Output['sales']=pred

#Saving the submission file !!!Update the name of the file!!!
Output.to_csv(SUBMISSION_PATH/'submission_lgbm_01.csv')