In [2]:
# BASE
# ------------------------------------------------------
import numpy as np
import pandas as pd
import os
import gc
import warnings

#!pip install xgboost
import sklearn
from sklearn import linear_model
from sklearn.linear_model  import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
import forecast_tools 
from forecast_tools.baseline import Naive1,SNaive,Average

# PACF - ACF
# ------------------------------------------------------
import statsmodels.api as sm
from statsmodels.tsa.statespace import sarimax

# DATA VISUALIZATION
# ------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# CONFIGURATIONS
# ------------------------------------------------------
# Show every column and two decimals when frames are displayed.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format
# NOTE(review): this silences ALL warnings, including pandas deprecation
# warnings that announce upcoming breaking changes.
warnings.filterwarnings('ignore')
# Data directory. Set the STORE_SALES_DATA_DIR environment variable to run the
# notebook on another machine; the original local path is kept as the default.
folder = os.environ.get(
    "STORE_SALES_DATA_DIR",
    "C:/Users/lcauche/OneDrive - FTI Consulting/Documents/IBEX/data/store-sales-time-series-forecasting/",
)
# File names are appended with `folder + "name.csv"`, so a trailing separator is required.
if not folder.endswith(("/", "\\")):
    folder += "/"

In [3]:
# Load the raw competition files from the configured data folder.
train = pd.read_csv(folder + "train.csv")
test = pd.read_csv(folder + "test.csv")
stores = pd.read_csv(folder + "stores.csv")
#sub = pd.read_csv("../input/store-sales-time-series-forecasting/sample_submission.csv")
transactions = pd.read_csv(folder + "transactions.csv")
# Transactions are ordered per store, then by date (sorted before the date column is parsed).
transactions = transactions.sort_values(["store_nbr", "date"])

# Parse the date column of every frame that has one.
train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])
transactions["date"] = pd.to_datetime(transactions["date"])

# Downcast numeric columns to smaller dtypes.
train["onpromotion"] = train["onpromotion"].astype("float16")
train["sales"] = train["sales"].astype("float32")
stores["cluster"] = stores["cluster"].astype("int8")

train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0.0
1,1,2013-01-01,1,BABY CARE,0.0,0.0
2,2,2013-01-01,1,BEAUTY,0.0,0.0
3,3,2013-01-01,1,BEVERAGES,0.0,0.0
4,4,2013-01-01,1,BOOKS,0.0,0.0


In [4]:
# Total sales across all stores and families for each calendar day.
daily_sales = train.groupby("date")["sales"].sum().reset_index()
px.line(
    daily_sales.sort_values(by="date"),
    x='date',
    y='sales',
    title="Daily Sales -- Jan 2013 - Aug 2017",
)

In [5]:
# Distribution of daily sales per year, split by month.
a = daily_sales.assign(
    year=daily_sales["date"].dt.year,
    month=daily_sales["date"].dt.month,
)
px.box(
    a,
    x="year",
    y="sales",
    color="month",
    title="Sales Seasonality -- Jan 2013 - Aug 2017",
)

In [6]:
# Mean daily sales for each (year, day of week); day of week is numbered 1-7.
a = (
    daily_sales
    .assign(
        year=daily_sales["date"].dt.year,
        dayofweek=daily_sales["date"].dt.dayofweek + 1,
    )
    .groupby(["year", "dayofweek"])["sales"]
    .mean()
    .reset_index()
)
px.line(
    a,
    x="dayofweek",
    y="sales",
    color="year",
    title="Average Sales per Day of Week",
)

In [7]:
#### Training frames shared by the models below
def _with_calendar_columns(frame):
    """Return a copy of `frame` with dayofweek (1-7), year and month derived from its 'date' column."""
    return frame.assign(
        dayofweek=frame["date"].dt.dayofweek + 1,
        year=frame["date"].dt.year,
        month=frame["date"].dt.month,
    )


# One row per day (all stores and families summed) ...
daily_sales = train.groupby("date")["sales"].sum().reset_index()
# ... and one row per (day, family, store).
daily_agg_sales = train.groupby(["date", "family", "store_nbr"])["sales"].sum().reset_index()

# "_origin" frames hold no extra features; "_seasonality" frames add calendar columns.
daily_sales_origin = daily_sales.copy()
daily_sales_seasonality = _with_calendar_columns(daily_sales)
daily_agg_sales_origin = daily_agg_sales.copy()
daily_agg_sales_seasonality = _with_calendar_columns(daily_agg_sales)



In [33]:
######  Baseline model comparison  ######
# Five models are scored on the same split of daily_agg_sales_seasonality
# (one row per date/family/store): the first 70% of the rows train the model,
# the remaining 30% are the test set. The frame comes from a groupby keyed on
# date first, so the row split is a split in time.
#
# Names consumed by later cells: result, result2 ... result5 (row-level
# results, `family` left un-encoded) and result_linear, result_rf,
# result_naive, result_Snaive, result_Average (daily totals).

TRAIN_FRACTION = 0.7


def _split_panel(panel, train_fraction=TRAIN_FRACTION):
    """Return (features, target, dates, n_train) for a frame with 'sales' and 'date' columns.

    Every column other than 'sales' and 'date' is treated as a feature.
    Rows [0, n_train) are the training rows.
    """
    features = panel.drop(columns=['sales', 'date'])
    n_train = int(len(panel) * train_fraction)
    return features, panel['sales'], panel['date'], n_train


def _label_encode_objects(features):
    """Return a copy of `features` with every object-dtype column label-encoded."""
    encoded = features.copy()
    for col in encoded.columns:
        if encoded[col].dtype == 'object':
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded


def _print_scores(title, importance_label, importance, y_true, y_hat):
    """Print MAE, MSE, RMSE, RMSLE and R2 of `y_hat` against `y_true` under `title`."""
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    rmse = np.sqrt(mse)
    # mean_squared_log_error rejects negative values; a linear model can emit
    # them, so predictions are floored at 0 for this metric only.
    rmsle = np.sqrt(mean_squared_log_error(y_true, np.clip(y_hat, 0, None)))
    r2 = r2_score(y_true, y_hat)
    print(title)
    print(f'{importance_label}: {importance}')
    print(f'MAE: {mae:.2f}')
    print(f'MSE: {mse:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(f'RMSLE: {rmsle:.2f}')
    print(f'R2 score: {r2:.2f}')


def _assemble_result(features, target, dates, n_train, y_hat):
    """Return the test rows with 'Actual', 'pred', 'Index' and 'date' columns, sorted by date."""
    # Work on an explicit copy so the columns are not assigned onto a slice.
    frame = features.iloc[n_train:].copy()
    frame['Actual'] = target.iloc[n_train:]
    frame['pred'] = y_hat
    frame['Index'] = frame.index
    # Index-aligned assignment replaces the per-row Python lookup into a list of dates.
    frame['date'] = dates.iloc[n_train:]
    return frame.sort_values(by='date')


def _daily_totals(result_frame):
    """Sum 'Actual' and 'pred' per date (other columns are not meaningful as sums)."""
    return result_frame.groupby('date')[['Actual', 'pred']].sum().reset_index()


def _univariate_forecast(model, history, horizon):
    """Fit a forecast_tools baseline on `history` and return its `horizon`-step forecast."""
    model.fit(history)
    return model.predict(horizon=horizon)


features_raw, target, dates, n = _split_panel(daily_agg_sales_seasonality)
features_le = _label_encode_objects(features_raw)
X_train, X_test = features_le.iloc[:n], features_le.iloc[n:]
y_train, y_test = target.iloc[:n], target.iloc[n:]

######  Linear Regression  ######
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
result = _assemble_result(features_raw, target, dates, n, y_pred)
result_linear = _daily_totals(result)
_print_scores('######  Linear Regression  ######', 'Coefficients',
              dict(zip(X_train.columns, regressor.coef_)), y_test, y_pred)

######  Random Forest  ######
regressor = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
result2 = _assemble_result(features_raw, target, dates, n, y_pred)
result_rf = _daily_totals(result2)
_print_scores('######  Random Forest  ######', 'Features Importances',
              dict(zip(X_train.columns, regressor.feature_importances_)), y_test, y_pred)

# NOTE(review): the three forecast_tools baselines below are fitted on the
# stacked panel (all family/store rows interleaved), not on one series per
# family/store, so SNaive's period=365 presumably counts rows rather than
# days -- confirm this is intended.

######  Naive  ######
y_pred = _univariate_forecast(Naive1(), y_train, len(y_test))
result3 = _assemble_result(features_raw, target, dates, n, y_pred)
result_naive = _daily_totals(result3)
_print_scores('######  Naive  ###### ', 'Features Importances',
              "No importance defined for Naive", y_test, y_pred)

######  SNaive  ######
y_pred = _univariate_forecast(SNaive(period=365), y_train, len(y_test))
result4 = _assemble_result(features_raw, target, dates, n, y_pred)
result_Snaive = _daily_totals(result4)
_print_scores('######  SNaive period=365 ######  ', 'Features Importances',
              "No importance defined for SNaive", y_test, y_pred)

######  Average  ######
y_pred = _univariate_forecast(Average(), y_train, len(y_test))
result5 = _assemble_result(features_raw, target, dates, n, y_pred)
result_Average = _daily_totals(result5)
_print_scores('######  Average  ###### ', 'Features Importances',
              "No importance defined for Average", y_test, y_pred)

# Actual daily totals as markers, one coloured line per model.
fig = px.scatter(result_linear, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
fig.data[0].name = 'Actual'
model_traces = [
    (result_linear, 'Linear Regression', 'red'),
    (result_rf, 'RF Regressor', 'green'),
    (result_naive, 'Naive', 'orange'),
    (result_Snaive, 'Snaive Period=365', 'purple'),
    (result_Average, 'Average', 'grey'),
]
for model_frame, model_name, model_color in model_traces:
    model_trace = px.line(model_frame, x='date', y='pred', labels={'pred': model_name}).data[0]
    model_trace.name = model_name
    model_trace.line.color = model_color
    fig.add_trace(model_trace)

fig.update_layout(title='Baseline regressors', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

######  Linear Regression  ######
Coefficients: {'family': -12.524954184793248, 'store_nbr': 2.255261904795519, 'dayofweek': 18.51776466399372, 'year': 81.27191027183633, 'month': 12.326779205136118}
MAE: 659.70
MSE: 1756162.86
RMSE: 1325.20
RMSLE: 3.72
R2 score: 0.01
######  Random Forest  ######
Features Importances: {'family': 0.5774976439823298, 'store_nbr': 0.3053240041289138, 'dayofweek': 0.03917300754094475, 'year': 0.051436290211029294, 'month': 0.026569054136782372}
MAE: 164.19
MSE: 342301.39
RMSE: 585.07
RMSLE: 0.91
R2 score: 0.81
######  Naive  ###### 
Features Importances: No importance defined for Naive
MAE: 462.68
MSE: 1994136.58
RMSE: 1412.14
RMSLE: 4.34
R2 score: -0.12
######  SNaive period=365 ######  
Features Importances: No importance defined for SNaive
MAE: 498.78
MSE: 1953758.86
RMSE: 1397.77
RMSLE: 3.64
R2 score: -0.10
######  Average  ###### 
Features Importances: No importance defined for Average
MAE: 558.98
MSE: 1802522.43
RMSE: 1342.58
RMSLE: 3.42
R2 score: -

Family breakdown - How far off are the predictions?

In [39]:
#### Merge the per-family totals of every model into one frame
# Columns are selected with a list: tuple-style selection (["Actual","pred"])
# on a groupby is rejected by pandas >= 2.0.
data = result.groupby("family")[["Actual", "pred"]].sum().reset_index()
data = data.rename(columns={'pred': 'pred_linear'})

# Append one prediction column per remaining model, in the original column order.
merged_data = data
for model_result, pred_column in [
    (result2, 'pred_RF'),
    (result3, 'pred_Naive'),
    (result4, 'pred_SNaive'),
    (result5, 'pred_Average'),
]:
    model_totals = model_result.groupby("family")["pred"].sum().rename(pred_column).reset_index()
    merged_data = merged_data.merge(model_totals, on='family')

# Long format: one row per (family, metric) so the metrics plot as grouped bars.
melted_data = merged_data.melt(id_vars="family", value_vars=["Actual","pred_linear", "pred_RF",'pred_Naive','pred_SNaive','pred_Average'], var_name="metric", value_name="value")

hist = px.bar(melted_data, x="family", y="value", color="metric", barmode="group")
hist.update_layout(title='Models Performance - Family View', xaxis_title='Family', yaxis_title='Sales')
hist.show()

In [41]:
### Ratio of each model's predicted total to the actual total, per family
# A family whose actual total is 0 would yield inf; treat it as missing so
# its bars are left out instead of distorting the chart.
actual_total = merged_data['Actual'].replace(0, np.nan)
for ratio_column, value_column in [
    ('Actual_Ratio', 'Actual'),
    ('Linear_Ratio', 'pred_linear'),
    ('RF_Ratio', 'pred_RF'),
    ('Naive_Ratio', 'pred_Naive'),
    ('SNaive_Ratio', 'pred_SNaive'),
    ('Average_Ratio', 'pred_Average'),
]:
    merged_data[ratio_column] = merged_data[value_column] / actual_total

# Long format: one row per (family, metric) so the ratios plot as grouped bars.
melted_data = merged_data.melt(id_vars="family", value_vars=["Actual_Ratio","Linear_Ratio", "RF_Ratio",'Naive_Ratio','SNaive_Ratio','Average_Ratio'], var_name="metric", value_name="value")

# The y range is capped at 10 so a few extreme ratios do not flatten the rest.
hist = px.bar(melted_data, x="family", y="value", color="metric", barmode="group", range_y= [0,10])
# The y axis shows prediction/actual ratios, not sales.
hist.update_layout(title='Models Performance '+ '%' + ' of Actual - Family view', xaxis_title='Family', yaxis_title='Prediction / Actual')
hist.show()

Store breakdown - How far off are the predictions?

In [36]:
#### Merge the per-store totals of every model into one frame
# Columns are selected with a list: tuple-style selection (["Actual","pred"])
# on a groupby is rejected by pandas >= 2.0.
data = result.groupby("store_nbr")[["Actual", "pred"]].sum().reset_index()
data = data.rename(columns={'pred': 'pred_linear'})

# Append one prediction column per remaining model, in the original column order.
merged_data = data
for model_result, pred_column in [
    (result2, 'pred_RF'),
    (result3, 'pred_Naive'),
    (result4, 'pred_SNaive'),
    (result5, 'pred_Average'),
]:
    model_totals = model_result.groupby("store_nbr")["pred"].sum().rename(pred_column).reset_index()
    merged_data = merged_data.merge(model_totals, on='store_nbr')

# Long format: one row per (store, metric) so the metrics plot as grouped bars.
melted_data = merged_data.melt(id_vars="store_nbr", value_vars=["Actual","pred_linear", "pred_RF",'pred_Naive','pred_SNaive','pred_Average'], var_name="metric", value_name="value")

hist = px.bar(melted_data, x="store_nbr", y="value", color="metric", barmode="group")
# The x axis is the store number (it was previously labelled 'Family').
hist.update_layout(title='Models Performance - Store_Nbr view', xaxis_title='Store number', yaxis_title='Sales')
hist.show()

In [37]:
### Ratio of each model's predicted total to the actual total, per store
# A store whose actual total is 0 would yield inf; treat it as missing so
# its bars are left out instead of distorting the chart.
actual_total = merged_data['Actual'].replace(0, np.nan)
for ratio_column, value_column in [
    ('Actual_Ratio', 'Actual'),
    ('Linear_Ratio', 'pred_linear'),
    ('RF_Ratio', 'pred_RF'),
    ('Naive_Ratio', 'pred_Naive'),
    ('SNaive_Ratio', 'pred_SNaive'),
    ('Average_Ratio', 'pred_Average'),
]:
    merged_data[ratio_column] = merged_data[value_column] / actual_total

# Long format: one row per (store, metric) so the ratios plot as grouped bars.
melted_data = merged_data.melt(id_vars="store_nbr", value_vars=["Actual_Ratio","Linear_Ratio", "RF_Ratio",'Naive_Ratio','SNaive_Ratio','Average_Ratio'], var_name="metric", value_name="value")

hist = px.bar(melted_data, x="store_nbr", y="value", color="metric", barmode="group")
# Axis labels describe what is plotted: store numbers against prediction/actual ratios.
hist.update_layout(title='Models Performance '+ '%' + ' of Actual - Store_Nbr view', xaxis_title='Store number', yaxis_title='Prediction / Actual')
hist.show()

Appendix below - Uncommented

LinearRegressor

In [6]:
# Linear regression on four feature sets, each trained on the first 70% of the
# rows and scored on the remaining 30%:
#   result  - daily totals, row index as the only feature
#   result2 - daily totals, calendar features
#   result3 - per family/store rows, family + store features, summed per date
#   result4 - per family/store rows, family + store + calendar, summed per date

def _linear_regression_variant(source, title, add_row_index=False, aggregate_by_date=False):
    """Fit LinearRegression on one feature set, print its scores and return the test rows.

    source            -- frame with 'sales' and 'date'; every other column is a feature.
    title             -- heading printed above the scores.
    add_row_index     -- add the row index as an 'Index' feature (used when the
                         frame has no feature column of its own).
    aggregate_by_date -- sum the returned frame per date.

    The returned frame holds the feature columns plus 'Actual', 'pred',
    'Index' and 'date', sorted by date. `source` is not modified.
    """
    target = source['sales']
    dates = source['date']
    design = source.drop(columns=['sales', 'date'])  # new frame, safe to modify
    if add_row_index:
        design['Index'] = design.index
    # Label-encode text columns; a no-op for frames that only hold numbers.
    for col in design.columns:
        if design[col].dtype == 'object':
            design[col] = LabelEncoder().fit_transform(design[col])

    n_train = int(len(source) * 0.7)
    X_train, X_test = design.iloc[:n_train], design.iloc[n_train:]
    y_train, y_test = target.iloc[:n_train], target.iloc[n_train:]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_hat)
    mse = mean_squared_error(y_test, y_hat)
    rmse = np.sqrt(mse)
    # mean_squared_log_error rejects negative values; floor predictions at 0
    # for this metric only.
    rmsle = np.sqrt(mean_squared_log_error(y_test, np.clip(y_hat, 0, None)))
    r2 = r2_score(y_test, y_hat)
    print(title)
    print(f'Features Importances: {dict(zip(X_train.columns, model.coef_))}')
    print(f'MAE: {mae:.2f}')
    print(f'MSE: {mse:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(f'RMSLE: {rmsle:.2f}')
    print(f'R2 score: {r2:.2f}')

    # Explicit copy so the columns are not assigned onto a slice.
    scored = X_test.copy()
    scored['Actual'] = y_test
    scored['pred'] = y_hat
    scored['Index'] = scored.index
    scored['date'] = dates.iloc[n_train:]  # index-aligned, replaces the per-row lookup
    scored = scored.sort_values(by='date')
    if aggregate_by_date:
        scored = scored.groupby(["date"]).sum().reset_index()
    return scored


result = _linear_regression_variant(
    daily_sales_origin, 'Baseline linear regression', add_row_index=True)
result2 = _linear_regression_variant(
    daily_sales_seasonality, 'seasonality features linear regression')
result3 = _linear_regression_variant(
    daily_agg_sales_origin, 'Family features linear regression', aggregate_by_date=True)
result4 = _linear_regression_variant(
    daily_agg_sales_seasonality, 'Family and seasonality features linear regression',
    aggregate_by_date=True)

# Actual daily totals as markers, one coloured line per feature set.
fig = px.scatter(result, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
fig.data[0].name = 'Actual'
for variant_frame, variant_name, variant_color in [
    (result, 'Baseline Prediction', 'red'),
    (result2, 'Seasonality Prediction', 'green'),
    (result3, 'Family Baseline Prediction', 'orange'),
    (result4, 'Family Seasonality Prediction', 'purple'),
]:
    variant_trace = px.line(variant_frame, x='date', y='pred', labels={'pred': variant_name}).data[0]
    variant_trace.name = variant_name
    variant_trace.line.color = variant_color
    fig.add_trace(variant_trace)

fig.update_layout(title='Baseline regressor', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

Baseline linear regression
Features Importances: {'Index': 409.7008948130734}
MAE: 170808.78
MSE: 39329079598.39
RMSE: 198315.61
RMSLE: 0.30
R2 score: -0.17
seasonality features linear regression
Features Importances: {'dayofweek': 33043.130204846595, 'year': 144944.3651236705, 'month': 21957.12841282118}
MAE: 141311.24
MSE: 30926208716.16
RMSE: 175858.49
RMSLE: 0.28
R2 score: 0.08
Family features linear regression
Features Importances: {'family': -12.529078282231131, 'store_nbr': 2.255148510677915}
MAE: 550.30
MSE: 1779484.20
RMSE: 1333.97
RMSLE: 3.38
R2 score: 0.00
Family and seasonality features linear regression
Features Importances: {'family': -12.524954184793248, 'store_nbr': 2.255261904795519, 'dayofweek': 18.51776466399372, 'year': 81.27191027183633, 'month': 12.326779205136118}
MAE: 659.70
MSE: 1756162.86
RMSE: 1325.20
RMSLE: 3.72
R2 score: 0.01


Same but with RandomForestRegressor

In [7]:
# Random forest regression on four feature sets, each trained on the first 70%
# of the rows and scored on the remaining 30%:
#   result  - daily totals, row index as the only feature
#   result2 - daily totals, calendar features
#   result3 - per family/store rows, family + store features, summed per date
#   result4 - per family/store rows, family + store + calendar, summed per date

def _random_forest_variant(source, title, add_row_index=False, aggregate_by_date=False):
    """Fit a RandomForestRegressor on one feature set, print its scores and return the test rows.

    source            -- frame with 'sales' and 'date'; every other column is a feature.
    title             -- heading printed above the scores.
    add_row_index     -- add the row index as an 'Index' feature (used when the
                         frame has no feature column of its own).
    aggregate_by_date -- sum the returned frame per date.

    The returned frame holds the feature columns plus 'Actual', 'pred',
    'Index' and 'date', sorted by date. `source` is not modified.
    """
    target = source['sales']
    dates = source['date']
    design = source.drop(columns=['sales', 'date'])  # new frame, safe to modify
    if add_row_index:
        design['Index'] = design.index
    # Label-encode text columns; a no-op for frames that only hold numbers.
    for col in design.columns:
        if design[col].dtype == 'object':
            design[col] = LabelEncoder().fit_transform(design[col])

    n_train = int(len(source) * 0.7)
    X_train, X_test = design.iloc[:n_train], design.iloc[n_train:]
    y_train, y_test = target.iloc[:n_train], target.iloc[n_train:]

    # Fixed random_state keeps the forest reproducible between runs.
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_hat)
    mse = mean_squared_error(y_test, y_hat)
    rmse = np.sqrt(mse)
    # mean_squared_log_error rejects negative values; floor predictions at 0
    # for this metric only.
    rmsle = np.sqrt(mean_squared_log_error(y_test, np.clip(y_hat, 0, None)))
    r2 = r2_score(y_test, y_hat)
    print(title)
    print(f'Features Importances: {dict(zip(X_train.columns, model.feature_importances_))}')
    print(f'MAE: {mae:.2f}')
    print(f'MSE: {mse:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(f'RMSLE: {rmsle:.2f}')
    print(f'R2 score: {r2:.2f}')

    # Explicit copy so the columns are not assigned onto a slice.
    scored = X_test.copy()
    scored['Actual'] = y_test
    scored['pred'] = y_hat
    scored['Index'] = scored.index
    scored['date'] = dates.iloc[n_train:]  # index-aligned, replaces the per-row lookup
    scored = scored.sort_values(by='date')
    if aggregate_by_date:
        scored = scored.groupby(["date"]).sum().reset_index()
    return scored


result = _random_forest_variant(
    daily_sales_origin, 'Baseline Random Forest regression', add_row_index=True)
result2 = _random_forest_variant(
    daily_sales_seasonality, 'seasonality features Random Forest regression')
result3 = _random_forest_variant(
    daily_agg_sales_origin, 'Family features Random Forest regression', aggregate_by_date=True)
result4 = _random_forest_variant(
    daily_agg_sales_seasonality, 'Family and seasonality features Random Forest regression',
    aggregate_by_date=True)

# Actual daily totals as markers, one coloured line per feature set.
fig = px.scatter(result, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
fig.data[0].name = 'Actual'
for variant_frame, variant_name, variant_color in [
    (result, 'Baseline Prediction', 'red'),
    (result2, 'Seasonality Prediction', 'green'),
    (result3, 'Family Baseline Prediction', 'orange'),
    (result4, 'Family Seasonality Prediction', 'purple'),
]:
    variant_trace = px.line(variant_frame, x='date', y='pred', labels={'pred': variant_name}).data[0]
    variant_trace.name = variant_name
    variant_trace.line.color = variant_color
    fig.add_trace(variant_trace)

fig.update_layout(title='Random Forest regressor', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

Baseline Random Forest regression
Features Importances: {'Index': 1.0}
MAE: 148037.48
MSE: 42082888809.14
RMSE: 205141.14
RMSLE: 0.29
R2 score: -0.26
seasonality features Random Forest regression
Features Importances: {'dayofweek': 0.2792120871930243, 'year': 0.47017885758546796, 'month': 0.25060905522150767}
MAE: 94084.28
MSE: 19021960974.13
RMSE: 137920.13
RMSLE: 0.25
R2 score: 0.43
Family features Random Forest regression
Features Importances: {'family': 0.6519631659122068, 'store_nbr': 0.34803683408779323}
MAE: 195.91
MSE: 527529.44
RMSE: 726.31
RMSLE: 0.91
R2 score: 0.70
Family and seasonality features Random Forest regression
Features Importances: {'family': 0.5774976439823298, 'store_nbr': 0.3053240041289138, 'dayofweek': 0.03917300754094475, 'year': 0.051436290211029294, 'month': 0.026569054136782372}
MAE: 164.19
MSE: 342301.39
RMSE: 585.07
RMSLE: 0.91
R2 score: 0.81


Same but with Naive1

In [10]:
# Baseline: Naive1 forecast on total daily sales (no explanatory features)
data = daily_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
# Positional 70/30 split -- assumes rows are in chronological order, TODO confirm
n = int(len(data) * 0.7)
data = data.drop(columns=['sales', 'date'])
data['Index'] = data.index
features = data.columns.values.tolist()
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# The forecaster is fitted on the target series only; the feature frames are not passed to it
regressor = Naive1()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Result frame for plotting; built from a copy so X_test is left untouched
result = X_test[['Index']].copy()
result['Actual'] = y_test
result['pred'] = y_pred
result['Index'] = result.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result['date'] = result['Index'].apply(lambda x: listofdate[x])
result = result.sort_values(by='date')

# The label names the model actually used (it previously said "Random Forest regression")
print('Baseline Naive1 forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Naive1 forecast on the daily-sales frame that carries seasonality features
# NOTE(review): the features are never passed to the forecaster, so this run is expected
# to match the baseline (the recorded metrics are identical) -- confirm it is still needed
data = daily_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
# Positional 70/30 split -- assumes rows are in chronological order, TODO confirm
n = int(len(data) * 0.7)
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
regressor = Naive1()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result2 = X_test.copy()
result2['Actual'] = y_test
result2['pred'] = y_pred
result2['Index'] = result2.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result2['date'] = result2['Index'].apply(lambda x: listofdate[x])
result2 = result2.sort_values(by='date')

# The label names the model actually used (it previously said "Random Forest regression")
print('seasonality features Naive1 forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Naive1 forecast on the family-level frame (daily_agg_sales_origin)
data = daily_agg_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the target stacks every row of the frame into one series, so the forecaster
# sees a single sequence rather than one series per group -- confirm this is intended
regressor = Naive1()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result3 = X_test.copy()
result3['Actual'] = y_test
result3['pred'] = y_pred
result3['Index'] = result3.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result3['date'] = result3['Index'].apply(lambda x: listofdate[x])
result3 = result3.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result3 = result3.groupby(["date"]).sum().reset_index()

# The label names the model actually used (it previously said "Random Forest regression")
print('Family features Naive1 forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Naive1 forecast on the family-level frame with seasonality features
data = daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the features are never passed to the forecaster and the target stacks all
# rows into one series -- confirm this run adds information over the family-only one
regressor = Naive1()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result4 = X_test.copy()
result4['Actual'] = y_test
result4['pred'] = y_pred
result4['Index'] = result4.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result4['date'] = result4['Index'].apply(lambda x: listofdate[x])
result4 = result4.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result4 = result4.groupby(["date"]).sum().reset_index()

# The label names the model actually used (it previously said "Random Forest regression")
print('Family and seasonality features Naive1 forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Plot the actual sales (scatter) together with the four Naive1 prediction curves (lines).
fig = px.scatter(result, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
fig.add_trace(px.line(result, x='date', y='pred', labels={'pred':'Baseline Prediction'}).data[0])
fig.add_trace(px.line(result2, x='date', y='pred', labels={'pred':'Seasonality Prediction'}).data[0])
fig.add_trace(px.line(result3, x='date', y='pred', labels={'pred':'Family Baseline Prediction'}).data[0])
fig.add_trace(px.line(result4, x='date', y='pred', labels={'pred':'Family Seasonality Prediction'}).data[0])
# One colour per prediction line; the 'Actual' scatter keeps plotly's default marker colour.
colors = ['red', 'green', 'orange', 'purple']
names = ['Actual','Baseline Prediction', 'Seasonality Prediction', 'Family Baseline Prediction', 'Family Seasonality Prediction']

# Name every trace: the scatter first, then the lines in the order they were added.
for trace, trace_name in zip(fig.data, names):
    trace.name = trace_name
# Colour only the line traces. The previous colors[i-1] lookup wrapped around to
# colors[-1] for the scatter trace (i=0) instead of skipping it.
for trace, line_color in zip(fig.data[1:], colors):
    trace.line.color = line_color

# The title names the model plotted here (it previously said "Random Forest regressor")
fig.update_layout(title='Naive1 forecast', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

Baseline Random Forest regression
Features Importances: No importance defined - To check
MAE: 141924.75
MSE: 34124281280.45
RMSE: 184727.59
RMSLE: 0.28
R2 score: -0.02
seasonality features Random Forest regression
Features Importances: No importance defined - To check
MAE: 141924.75
MSE: 34124281280.45
RMSE: 184727.59
RMSLE: 0.28
R2 score: -0.02
Family features Random Forest regression
Features Importances: No importance defined - To check
MAE: 462.68
MSE: 1994136.58
RMSE: 1412.14
RMSLE: 4.34
R2 score: -0.12
Family and seasonality features Random Forest regression
Features Importances: No importance defined - To check
MAE: 462.68
MSE: 1994136.58
RMSE: 1412.14
RMSLE: 4.34
R2 score: -0.12


In [20]:
# Naive1 forecast on the family-level frame with seasonality features.
# Same run as in the Naive1 comparison cell, with the raw forecast printed for inspection.
data = daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the target stacks all rows into one series and the recorded output of this
# cell shows the forecast as zeros -- a per-group forecast is probably what is wanted; confirm
regressor = Naive1()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
print(y_pred)
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result4 = X_test.copy()
result4['Actual'] = y_test
result4['pred'] = y_pred
result4['Index'] = result4.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result4['date'] = result4['Index'].apply(lambda x: listofdate[x])
result4 = result4.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result4 = result4.groupby(["date"]).sum().reset_index()

# The label names the model actually used (it previously said "Random Forest regression")
print('Family and seasonality features Naive1 forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

[0. 0. 0. ... 0. 0. 0.]
Family and seasonality features Random Forest regression
Features Importances: No importance defined - To check
MAE: 462.68
MSE: 1994136.58
RMSE: 1412.14
RMSLE: 4.34
R2 score: -0.12


SNaive

In [12]:
# Baseline: SNaive (seasonal naive) forecast on total daily sales (no explanatory features)
data = daily_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
# Positional 70/30 split -- assumes rows are in chronological order, TODO confirm
n = int(len(data) * 0.7)
data = data.drop(columns=['sales', 'date'])
data['Index'] = data.index
features = data.columns.values.tolist()
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): period=12 looks like a monthly-data setting; the frame name suggests daily
# rows, where a weekly cycle would be period=7 -- confirm the intended seasonality
regressor = SNaive(period=12)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Result frame for plotting; built from a copy so X_test is left untouched
result = X_test[['Index']].copy()
result['Actual'] = y_test
result['pred'] = y_pred
result['Index'] = result.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result['date'] = result['Index'].apply(lambda x: listofdate[x])
result = result.sort_values(by='date')

# The label names the model actually used (it previously said "Random Forest regression")
print('Baseline SNaive forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# SNaive forecast on the daily-sales frame that carries seasonality features
# NOTE(review): the features are never passed to the forecaster, so this run is expected
# to match the baseline (the recorded metrics are identical) -- confirm it is still needed
data = daily_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
# Positional 70/30 split -- assumes rows are in chronological order, TODO confirm
n = int(len(data) * 0.7)
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): period=12 on what appears to be daily data -- confirm the intended seasonality
regressor = SNaive(period=12)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result2 = X_test.copy()
result2['Actual'] = y_test
result2['pred'] = y_pred
result2['Index'] = result2.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result2['date'] = result2['Index'].apply(lambda x: listofdate[x])
result2 = result2.sort_values(by='date')

# The label names the model actually used (it previously said "Random Forest regression")
print('seasonality features SNaive forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# SNaive forecast on the family-level frame (daily_agg_sales_origin)
data = daily_agg_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the target stacks every row of the frame into one series, so period=12
# counts rows of that stacked series rather than calendar steps -- confirm this is intended
regressor = SNaive(period=12)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result3 = X_test.copy()
result3['Actual'] = y_test
result3['pred'] = y_pred
result3['Index'] = result3.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result3['date'] = result3['Index'].apply(lambda x: listofdate[x])
result3 = result3.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result3 = result3.groupby(["date"]).sum().reset_index()

# The label names the model actually used (it previously said "Random Forest regression")
print('Family features SNaive forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# SNaive forecast on the family-level frame with seasonality features
data = daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the features are never passed to the forecaster and the target stacks all
# rows into one series -- confirm this run adds information over the family-only one
regressor = SNaive(period=12)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result4 = X_test.copy()
result4['Actual'] = y_test
result4['pred'] = y_pred
result4['Index'] = result4.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result4['date'] = result4['Index'].apply(lambda x: listofdate[x])
result4 = result4.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result4 = result4.groupby(["date"]).sum().reset_index()

# The label names the model actually used (it previously said "Random Forest regression")
print('Family and seasonality features SNaive forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Plot the actual sales (scatter) together with the four SNaive prediction curves (lines).
fig = px.scatter(result, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
fig.add_trace(px.line(result, x='date', y='pred', labels={'pred':'Baseline Prediction'}).data[0])
fig.add_trace(px.line(result2, x='date', y='pred', labels={'pred':'Seasonality Prediction'}).data[0])
fig.add_trace(px.line(result3, x='date', y='pred', labels={'pred':'Family Baseline Prediction'}).data[0])
fig.add_trace(px.line(result4, x='date', y='pred', labels={'pred':'Family Seasonality Prediction'}).data[0])
# One colour per prediction line; the 'Actual' scatter keeps plotly's default marker colour.
colors = ['red', 'green', 'orange', 'purple']
names = ['Actual','Baseline Prediction', 'Seasonality Prediction', 'Family Baseline Prediction', 'Family Seasonality Prediction']

# Name every trace: the scatter first, then the lines in the order they were added.
for trace, trace_name in zip(fig.data, names):
    trace.name = trace_name
# Colour only the line traces. The previous colors[i-1] lookup wrapped around to
# colors[-1] for the scatter trace (i=0) instead of skipping it.
for trace, line_color in zip(fig.data[1:], colors):
    trace.line.color = line_color

# The title names the model plotted here (it previously said "Random Forest regressor")
fig.update_layout(title='SNaive forecast (period=12)', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

Baseline Random Forest regression
Features Importances: No importance defined - To check
MAE: 178721.64
MSE: 55102074490.39
RMSE: 234738.31
RMSLE: 0.33
R2 score: -0.64
seasonality features Random Forest regression
Features Importances: No importance defined - To check
MAE: 178721.64
MSE: 55102074490.39
RMSE: 234738.31
RMSLE: 0.33
R2 score: -0.64
Family features Random Forest regression
Features Importances: No importance defined - To check
MAE: 462.13
MSE: 1993276.40
RMSE: 1411.83
RMSLE: 4.05
R2 score: -0.12
Family and seasonality features Random Forest regression
Features Importances: No importance defined - To check
MAE: 462.13
MSE: 1993276.40
RMSE: 1411.83
RMSLE: 4.05
R2 score: -0.12


Average

In [13]:
# Baseline: Average forecaster on total daily sales (no explanatory features)
data = daily_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
# Positional 70/30 split -- assumes rows are in chronological order, TODO confirm
n = int(len(data) * 0.7)
data = data.drop(columns=['sales', 'date'])
data['Index'] = data.index
features = data.columns.values.tolist()
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# The forecaster is fitted on the target series only; the feature frames are not passed to it
regressor = Average()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Result frame for plotting; built from a copy so X_test is left untouched
result = X_test[['Index']].copy()
result['Actual'] = y_test
result['pred'] = y_pred
result['Index'] = result.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result['date'] = result['Index'].apply(lambda x: listofdate[x])
result = result.sort_values(by='date')

# The label names the model actually used (it previously said "Random Forest regression")
print('Baseline Average forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Average forecaster on the daily-sales frame that carries seasonality features
# NOTE(review): the features are never passed to the forecaster, so this run is expected
# to match the baseline (the recorded metrics are identical) -- confirm it is still needed
data = daily_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
# Positional 70/30 split -- assumes rows are in chronological order, TODO confirm
n = int(len(data) * 0.7)
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
regressor = Average()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result2 = X_test.copy()
result2['Actual'] = y_test
result2['pred'] = y_pred
result2['Index'] = result2.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result2['date'] = result2['Index'].apply(lambda x: listofdate[x])
result2 = result2.sort_values(by='date')

# The label names the model actually used (it previously said "Random Forest regression")
print('seasonality features Average forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Average forecaster on the family-level frame (daily_agg_sales_origin)
data = daily_agg_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the target stacks every row of the frame into one series, so the forecaster
# sees a single sequence rather than one series per group -- confirm this is intended
regressor = Average()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result3 = X_test.copy()
result3['Actual'] = y_test
result3['pred'] = y_pred
result3['Index'] = result3.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result3['date'] = result3['Index'].apply(lambda x: listofdate[x])
result3 = result3.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result3 = result3.groupby(["date"]).sum().reset_index()

# The label names the model actually used (it previously said "Random Forest regression")
print('Family features Average forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Average forecaster on the family-level frame with seasonality features
data = daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the features are never passed to the forecaster and the target stacks all
# rows into one series -- confirm this run adds information over the family-only one
regressor = Average()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result4 = X_test.copy()
result4['Actual'] = y_test
result4['pred'] = y_pred
result4['Index'] = result4.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result4['date'] = result4['Index'].apply(lambda x: listofdate[x])
result4 = result4.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result4 = result4.groupby(["date"]).sum().reset_index()

# The label names the model actually used (it previously said "Random Forest regression")
print('Family and seasonality features Average forecast')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Plot the actual sales (scatter) together with the four Average prediction curves (lines).
fig = px.scatter(result, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
fig.add_trace(px.line(result, x='date', y='pred', labels={'pred':'Baseline Prediction'}).data[0])
fig.add_trace(px.line(result2, x='date', y='pred', labels={'pred':'Seasonality Prediction'}).data[0])
fig.add_trace(px.line(result3, x='date', y='pred', labels={'pred':'Family Baseline Prediction'}).data[0])
fig.add_trace(px.line(result4, x='date', y='pred', labels={'pred':'Family Seasonality Prediction'}).data[0])
# One colour per prediction line; the 'Actual' scatter keeps plotly's default marker colour.
colors = ['red', 'green', 'orange', 'purple']
names = ['Actual','Baseline Prediction', 'Seasonality Prediction', 'Family Baseline Prediction', 'Family Seasonality Prediction']

# Name every trace: the scatter first, then the lines in the order they were added.
for trace, trace_name in zip(fig.data, names):
    trace.name = trace_name
# Colour only the line traces. The previous colors[i-1] lookup wrapped around to
# colors[-1] for the scatter trace (i=0) instead of skipping it.
for trace, line_color in zip(fig.data[1:], colors):
    trace.line.color = line_color

# The title names the model plotted here (it previously said "Random Forest regressor")
fig.update_layout(title='Average forecast', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

Baseline Random Forest regression
Features Importances: No importance defined - To check
MAE: 270060.51
MSE: 104964797366.06
RMSE: 323982.71
RMSLE: 0.46
R2 score: -2.13
seasonality features Random Forest regression
Features Importances: No importance defined - To check
MAE: 270060.51
MSE: 104964797366.06
RMSE: 323982.71
RMSLE: 0.46
R2 score: -2.13
Family features Random Forest regression
Features Importances: No importance defined - To check
MAE: 558.98
MSE: 1802522.43
RMSE: 1342.58
RMSLE: 3.42
R2 score: -0.01
Family and seasonality features Random Forest regression
Features Importances: No importance defined - To check
MAE: 558.98
MSE: 1802522.43
RMSE: 1342.58
RMSLE: 3.42
R2 score: -0.01


SNaive with different seasonality

In [22]:
# SNaive with period=365 on the family-level frame (daily_agg_sales_origin)
data = daily_agg_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the target stacks every row of the frame into one series, so the period
# counts rows of that stacked series rather than calendar days -- confirm this is intended
regressor = SNaive(period=365)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result = X_test.copy()
result['Actual'] = y_test
result['pred'] = y_pred
result['Index'] = result.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result['date'] = result['Index'].apply(lambda x: listofdate[x])
result = result.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result = result.groupby(["date"]).sum().reset_index()

# The label states the model and period so the four runs of this cell can be told apart
print('Family features SNaive forecast (period=365)')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')
# SNaive with period=1000 on the family-level frame (daily_agg_sales_origin)
data = daily_agg_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the period counts rows of the stacked target series, not calendar days -- confirm
regressor = SNaive(period=1000)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result2 = X_test.copy()
result2['Actual'] = y_test
result2['pred'] = y_pred
result2['Index'] = result2.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result2['date'] = result2['Index'].apply(lambda x: listofdate[x])
result2 = result2.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result2 = result2.groupby(["date"]).sum().reset_index()

# The label states the model and period so the four runs of this cell can be told apart
print('Family features SNaive forecast (period=1000)')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# SNaive with period=2000 on the family-level frame (daily_agg_sales_origin)
data = daily_agg_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the period counts rows of the stacked target series, not calendar days -- confirm
regressor = SNaive(period=2000)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result3 = X_test.copy()
result3['Actual'] = y_test
result3['pred'] = y_pred
result3['Index'] = result3.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result3['date'] = result3['Index'].apply(lambda x: listofdate[x])
result3 = result3.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result3 = result3.groupby(["date"]).sum().reset_index()

# The label states the model and period so the four runs of this cell can be told apart
print('Family features SNaive forecast (period=2000)')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# SNaive with period=3000 on the family-level frame (daily_agg_sales_origin)
data = daily_agg_sales_origin.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# NOTE(review): the period counts rows of the stacked target series, not calendar days -- confirm
regressor = SNaive(period=3000)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result4 = X_test.copy()
result4['Actual'] = y_test
result4['pred'] = y_pred
result4['Index'] = result4.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result4['date'] = result4['Index'].apply(lambda x: listofdate[x])
result4 = result4.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result4 = result4.groupby(["date"]).sum().reset_index()

# The label states the model and period so the four runs of this cell can be told apart
print('Family features SNaive forecast (period=3000)')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Plot the actual sales (scatter) against the SNaive forecasts for each period tried above.
fig = px.scatter(result, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
fig.add_trace(px.line(result, x='date', y='pred', labels={'pred':'SNaive period=365'}).data[0])
fig.add_trace(px.line(result2, x='date', y='pred', labels={'pred':'SNaive period=1000'}).data[0])
fig.add_trace(px.line(result3, x='date', y='pred', labels={'pred':'SNaive period=2000'}).data[0])
fig.add_trace(px.line(result4, x='date', y='pred', labels={'pred':'SNaive period=3000'}).data[0])
# One colour per prediction line; the 'Actual' scatter keeps plotly's default marker colour.
colors = ['red', 'green', 'orange', 'purple']
# Legend entries match the periods fitted in this cell (they previously read 1, 2, 6 and 12).
names = ['Actual','Period=365', 'Period=1000', 'Period=2000', 'Period=3000']

# Name every trace: the scatter first, then the lines in the order they were added.
for trace, trace_name in zip(fig.data, names):
    trace.name = trace_name
# Colour only the line traces. The previous colors[i-1] lookup wrapped around to
# colors[-1] for the scatter trace (i=0) instead of skipping it.
for trace, line_color in zip(fig.data[1:], colors):
    trace.line.color = line_color

# The title names the model plotted here (it previously said "Random Forest regressor")
fig.update_layout(title='SNaive forecast with different periods', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

Family features Random Forest regression
Features Importances: No importance defined - To check
MAE: 498.78
MSE: 1953758.86
RMSE: 1397.77
RMSLE: 3.64
R2 score: -0.10
Family features Random Forest regression
Features Importances: No importance defined - To check
MAE: 728.82
MSE: 3276862.04
RMSE: 1810.21
RMSLE: 3.63
R2 score: -0.84
Family features Random Forest regression
Features Importances: No importance defined - To check
MAE: 757.12
MSE: 3346179.05
RMSE: 1829.26
RMSLE: 3.67
R2 score: -0.88
Family features Random Forest regression
Features Importances: No importance defined - To check
MAE: 764.12
MSE: 3340620.12
RMSE: 1827.74
RMSLE: 3.65
R2 score: -0.88


Comparison of all models

In [17]:
# Linear regression on the family-level frame with seasonality features
data = daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
regressor = LinearRegression()
regressor.fit(X_train, y_train)
# For a linear model the "importance" reported here is the raw coefficient per feature
feature_importance = dict(zip(X_train.columns, regressor.coef_))
y_pred = regressor.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): mean_squared_log_error rejects negative values, which a linear model
# can predict -- confirm that y_pred stays non-negative on this data
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result = X_test.copy()
result['Actual'] = y_test
result['pred'] = y_pred
result['Index'] = result.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result['date'] = result['Index'].apply(lambda x: listofdate[x])
result = result.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result = result.groupby(["date"]).sum().reset_index()

print('Family and seasonality features linear regression')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')


# Random Forest regression on the family-level frame with seasonality features
data = daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n = int(len(data) * 0.7)  # positional 70/30 split
data = data.drop(columns=['sales', 'date'])
features = data.columns.values.tolist()
# Label-encode string columns so the feature frame is fully numeric
columns = [col for col in data[features].columns if data[col].dtype == 'object']
for col in columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
X_train = data.iloc[:n][features]
X_test = data.iloc[n:][features]
y_train = target.iloc[:n]
y_test = target.iloc[n:]
# Fixed random_state keeps the fitted forest reproducible between runs
regressor = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
regressor.fit(X_train, y_train)
feature_importance = dict(zip(X_train.columns, regressor.feature_importances_))
y_pred = regressor.predict(X_test)
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# Copy so that adding the result columns does not also add them to X_test
result2 = X_test.copy()
result2['Actual'] = y_test
result2['pred'] = y_pred
result2['Index'] = result2.index
# Index labels double as positions into listofdate (assumes a default RangeIndex -- TODO confirm)
result2['date'] = result2['Index'].apply(lambda x: listofdate[x])
result2 = result2.sort_values(by='date')
# One row per date for plotting; sum() is applied to every column, encoded ones included
result2 = result2.groupby(["date"]).sum().reset_index()

print('Family and seasonality features Random Forest regression')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

#Family  and seasonality features Naive
data=daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n=int(len(data)*0.7)
data=data.drop(columns=['sales','date'])
features=data.columns.values.tolist()
columns= [col for col in data[features].columns if data[col].dtype=='object']
for col in columns:
    le=LabelEncoder()
    data[col]=le.fit_transform(data[col])
X_train=data.iloc[:n][features]
X_test=data.iloc[n:][features]
y_train=target.iloc[:n]
y_test=target.iloc[n:]
#X_train,X_test,y_train,y_test = train_test_split(data[features],target, test_size=0.3, random_state=42)
regressor = Naive1()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
result3 = X_test
result3['Actual'] =y_test
result3['pred'] = y_pred
result3['Index'] = result3.index
result3['date'] = result3['Index'].apply(lambda x:listofdate[x])
result3=result3.sort_values(by='date')
result3 = result3.groupby(["date"]).sum().reset_index()

print('Family and seasonality features Naive')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')


#Family  and seasonality features SNaive
data=daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n=int(len(data)*0.7)
data=data.drop(columns=['sales','date'])
features=data.columns.values.tolist()
columns= [col for col in data[features].columns if data[col].dtype=='object']
for col in columns:
    le=LabelEncoder()
    data[col]=le.fit_transform(data[col])
X_train=data.iloc[:n][features]
X_test=data.iloc[n:][features]
y_train=target.iloc[:n]
y_test=target.iloc[n:]
#X_train,X_test,y_train,y_test = train_test_split(data[features],target, test_size=0.3, random_state=42)
regressor = SNaive(period=12)
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
result4 = X_test
result4['Actual'] =y_test
result4['pred'] = y_pred
result4['Index'] = result4.index
result4['date'] = result4['Index'].apply(lambda x:listofdate[x])
result4=result4.sort_values(by='date')
result4 = result4.groupby(["date"]).sum().reset_index()

print('Family and seasonality features SNaive period=12')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')


#Family  and seasonality features Average
data=daily_agg_sales_seasonality.copy()
target = data['sales']
listofdate = list(data['date'])
n=int(len(data)*0.7)
data=data.drop(columns=['sales','date'])
features=data.columns.values.tolist()
columns= [col for col in data[features].columns if data[col].dtype=='object']
for col in columns:
    le=LabelEncoder()
    data[col]=le.fit_transform(data[col])
X_train=data.iloc[:n][features]
X_test=data.iloc[n:][features]
y_train=target.iloc[:n]
y_test=target.iloc[n:]
#X_train,X_test,y_train,y_test = train_test_split(data[features],target, test_size=0.3, random_state=42)
regressor = Average()
regressor.fit(y_train)
feature_importance = "No importance defined - To check"
y_pred = regressor.predict(horizon=len(y_test))
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
result5 = X_test
result5['Actual'] =y_test
result5['pred'] = y_pred
result5['Index'] = result5.index
result5['date'] = result5['Index'].apply(lambda x:listofdate[x])
result5=result5.sort_values(by='date')
result5 = result5.groupby(["date"]).sum().reset_index()

print('Family and seasonality Average')
print(f'Features Importances: {feature_importance}')
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'RMSLE: {rmsle:.2f}')
print(f'R2 score: {r2:.2f}')

# Actual daily totals as markers, with one prediction line per model on top.
names = ['Actual','Linear Regression', 'RF Regressor', 'Naive', 'Snaive Period=12', 'Average']
# One colour per prediction line; the scatter trace keeps its plotly default.
colors = ['red', 'green', 'orange', 'purple','grey']

fig = px.scatter(result, x='date', y='Actual', labels={'Actual': 'Actual Sales'})
for model_result, model_name in zip([result, result2, result3, result4, result5], names[1:]):
    fig.add_trace(px.line(model_result, x='date', y='pred', labels={'pred': model_name}).data[0])

# Legend entries for every trace.
for trace, name in zip(fig.data, names):
    trace.name = name

# Colours are paired with the line traces only (fig.data[1:]); indexing
# colors[i-1] from i=0 used to wrap around to the last colour for trace 0.
for trace, color in zip(fig.data[1:], colors):
    trace.line.color = color

fig.update_layout(title='Baseline regressors', xaxis_title='Date', yaxis_title='Sales')
fig.update_traces(showlegend=True)
fig.show()

Family and seasonality features linear regression
Features Importances: {'family': -12.524954184793248, 'store_nbr': 2.255261904795519, 'dayofweek': 18.51776466399372, 'year': 81.27191027183633, 'month': 12.326779205136118}
MAE: 659.70
MSE: 1756162.86
RMSE: 1325.20
RMSLE: 3.72
R2 score: 0.01
Family and seasonality features Random Forest regression
Features Importances: {'family': 0.5774976439823298, 'store_nbr': 0.3053240041289138, 'dayofweek': 0.03917300754094475, 'year': 0.051436290211029294, 'month': 0.026569054136782372}
MAE: 164.19
MSE: 342301.39
RMSE: 585.07
RMSLE: 0.91
R2 score: 0.81
Family and seasonality features Naive
Features Importances: No importance defined - To check
MAE: 462.68
MSE: 1994136.58
RMSE: 1412.14
RMSLE: 4.34
R2 score: -0.12
Family and seasonality features SNaive period=12
Features Importances: No importance defined - To check
MAE: 462.13
MSE: 1993276.40
RMSE: 1411.83
RMSLE: 4.05
R2 score: -0.12
Family and seasonality Average
Features Importances: No importanc

In [22]:
from math import sqrt
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

# one-step sarima forecast
def sarima_forecast(history, config):
    """Fit a SARIMAX model on ``history`` and return its forecast for the next step.

    ``config`` is unpacked as ``(order, seasonal_order, trend)`` and handed to
    ``SARIMAX``; stationarity and invertibility are not enforced.
    """
    order, sorder, trend = config
    fitted = SARIMAX(
        history,
        order=order,
        seasonal_order=sorder,
        trend=trend,
        enforce_stationarity=False,
        enforce_invertibility=False,
    ).fit(disp=False)
    # Predict exactly one position: the first one after the end of the history.
    next_step = len(history)
    forecast = fitted.predict(next_step, next_step)
    return forecast[0]

# root mean squared error or rmse
def measure_rmse(actual, predicted):
    """Return the square root of the mean squared error between the two sequences."""
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split ``data`` into ``(train, test)`` where ``test`` is the last ``n_test`` items.

    NOTE: this definition replaces the scikit-learn ``train_test_split`` imported
    at the top of the notebook for every cell executed after it.

    Parameters
    ----------
    data : sliceable sequence (list, array, pandas Series, ...)
    n_test : int
        Number of trailing observations to hold out. ``0`` keeps everything in
        the training part; a value larger than ``len(data)`` puts everything in
        the test part.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError("n_test must be non-negative")
    # Slice from an explicit split position: data[:-0] would be empty and
    # data[-0:] would be the whole sequence.
    split = max(len(data) - n_test, 0)
    return data[:split], data[split:]

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    """Score one SARIMA config by one-step-ahead walk-forward validation.

    The last ``n_test`` observations are forecast one at a time; after each
    forecast the true observation is appended to the history, so the model is
    refitted on a growing window.

    Parameters
    ----------
    data : sequence or pandas Series
        The univariate series.
    n_test : int
        Number of trailing observations to forecast.
    cfg : sequence
        ``(order, seasonal_order, trend)`` as expected by ``sarima_forecast``.

    Returns
    -------
    float
        RMSE between the held-out observations and the forecasts.
    """
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = [x for x in train]
    # Materialise the hold-out values by position. Indexing a pandas Series
    # slice with test[i] is a label lookup and fails because the slice keeps
    # the labels of the original series.
    actuals = [x for x in test]
    # step over each time-step in the test set
    for observed in actuals:
        # fit model and make forecast for history
        yhat = sarima_forecast(history, cfg)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(observed)
    # estimate prediction error
    error = measure_rmse(actuals, predictions)
    print("error", error)
    return error

# score a model, return None on failure
def score_model(data, n_test, cfg, debug=False):
    """Evaluate one config and return ``(key, result)``.

    Parameters
    ----------
    data, n_test, cfg
        Forwarded to ``walk_forward_validation``.
    debug : bool
        When True, warnings are shown and any exception propagates. When False,
        warnings are silenced and a failing config is reported with a ``None``
        result instead of raising.

    Returns
    -------
    tuple
        ``(str(cfg), error)`` where ``error`` is ``None`` if validation failed.
    """
    result = None
    # convert config to a key
    key = str(cfg)
    # show all warnings and fail on exception if debugging
    if debug:
        result = walk_forward_validation(data, n_test, cfg)
    else:
        # one failure during model validation suggests an unstable config
        try:
            # never show warnings when grid searching, too noisy
            with catch_warnings():
                filterwarnings("ignore")
                result = walk_forward_validation(data, n_test, cfg)
        except Exception as exc:
            # Catch Exception only, so KeyboardInterrupt still stops a long
            # search, and report what went wrong instead of hiding it.
            print("Exception triggered for %s: %r" % (key, exc))
    # check for an interesting result
    if result is not None:
        print(' > Model[%s] %.3f' % (key, result))
    return (key, result)

# grid search configs
def grid_search(data, cfg_list, n_test, parallel=False):
    """Score every config in ``cfg_list`` and return the successful ones, best first.

    Parameters
    ----------
    data, n_test
        Forwarded to ``score_model`` for every config.
    cfg_list : iterable
        Configs to evaluate.
    parallel : bool
        When True the configs are scored through joblib with one job per CPU
        and the ``multiprocessing`` backend; otherwise sequentially.

    Returns
    -------
    list of tuple
        ``(key, error)`` pairs sorted by ascending error; configs whose score
        is ``None`` are dropped.
    """
    scores = None
    if parallel:
        print("In Parallel")
        # execute configs in parallel
        executor = Parallel(n_jobs=cpu_count(), backend='multiprocessing')
        tasks = (delayed(score_model)(data, n_test, cfg) for cfg in cfg_list)
        scores = executor(tasks)
        print("Done scores parallel")
    else:
        scores = [score_model(data, n_test, cfg) for cfg in cfg_list]
    print(scores)
    # remove empty results
    scores = [r for r in scores if r[1] is not None]
    # sort configs by error, asc
    scores.sort(key=lambda tup: tup[1])
    print("End of Grid Search")
    return scores

# create a set of sarima configs to try
def sarima_configs(seasonal=None, max_configs=100000, verbose=True):
    """Build the grid of SARIMA configurations to evaluate.

    Parameters
    ----------
    seasonal : sequence of int, optional
        Seasonal periods ``m`` to try. Defaults to ``[0, 12]``.
    max_configs : int
        Upper bound on the number of configs returned (the grid is cut off
        after this many entries).
    verbose : bool
        When True, print a running counter and the parameters of each config.

    Returns
    -------
    list
        Each element is ``[(p, d, q), (P, D, Q, m), t]``. The order is the one
        produced by nesting the loops p, d, q, t, P, D, Q, m (m varies fastest).
    """
    from itertools import islice, product

    # None instead of a list literal avoids a mutable default argument.
    if seasonal is None:
        seasonal = [0, 12]
    # define config lists
    p_params = [0, 1, 2]
    d_params = [0, 1]
    q_params = [0, 1, 2]
    t_params = ['n','c','t','ct']
    P_params = [0, 1, 2]
    D_params = [0, 1]
    Q_params = [0, 1, 2]
    m_params = seasonal

    grid = product(p_params, d_params, q_params, t_params,
                   P_params, D_params, Q_params, m_params)
    models = list()
    # create config instances
    for i, (p, d, q, t, P, D, Q, m) in enumerate(islice(grid, max_configs), start=1):
        models.append([(p,d,q), (P,D,Q,m), t])
        if verbose:
            print("Iteration number:  ", i )
            print(p,d,q,t,P,D,Q,m)
    return models

In [25]:
# define dataset
data = daily_sales["sales"]
print(data.head())
# data split: hold out the final 30% of the days, matching the 70/30
# train/test split used for the baseline models. Using 0.7 here would hold
# out 70% and refit the model once per held-out day for every config.
TEST_FRACTION = 0.3
n_test = int(daily_sales['date'].count() * TEST_FRACTION)
# model configs
cfg_list = sarima_configs()
# grid search
scores = grid_search(data, cfg_list, n_test)
print('done')
# list top 3 configs
for cfg, error in scores[:3]:
    print(cfg, error)

0     2511.62
1   496092.41
2   361461.22
3   354459.69
4   477350.12
Name: sales, dtype: float32
Iteration number:   1
0 0 0 n 0 0 0 0
Iteration number:   2
0 0 0 n 0 0 0 12
Iteration number:   3
0 0 0 n 0 0 1 0
Iteration number:   4
0 0 0 n 0 0 1 12
Iteration number:   5
0 0 0 n 0 0 2 0
Iteration number:   6
0 0 0 n 0 0 2 12
Iteration number:   7
0 0 0 n 0 1 0 0
Iteration number:   8
0 0 0 n 0 1 0 12
Iteration number:   9
0 0 0 n 0 1 1 0
Iteration number:   10
0 0 0 n 0 1 1 12
Iteration number:   11
0 0 0 n 0 1 2 0
Iteration number:   12
0 0 0 n 0 1 2 12
Iteration number:   13
0 0 0 n 1 0 0 0
Iteration number:   14
0 0 0 n 1 0 0 12
Iteration number:   15
0 0 0 n 1 0 1 0
Iteration number:   16
0 0 0 n 1 0 1 12
Iteration number:   17
0 0 0 n 1 0 2 0
Iteration number:   18
0 0 0 n 1 0 2 12
Iteration number:   19
0 0 0 n 1 1 0 0
Iteration number:   20
0 0 0 n 1 1 0 12
Iteration number:   21
0 0 0 n 1 1 1 0
Iteration number:   22
0 0 0 n 1 1 1 12
Iteration number:   23
0 0 0 n 1 1 2 0
Ite

In [None]:

# Daily sales per store joined with that store's daily transaction count.
temp = pd.merge(
    train.groupby(["date", "store_nbr"]).sales.sum().reset_index(),
    transactions,
    how="left",
    on=["date", "store_nbr"],
)
# Correlate only the two columns of interest, so the result does not depend on
# how DataFrame.corr treats the non-numeric `date` column.
sales_vs_transactions = temp[["sales", "transactions"]].corr("spearman").loc["sales", "transactions"]
print("Spearman Correlation between Total Sales and Transactions: {:,.4f}".format(sales_vs_transactions))
px.line(transactions.sort_values(["store_nbr", "date"]), x='date', y='transactions', color='store_nbr',title = "Transactions" )

In [None]:
# Transactions per year, split by calendar month.
a = transactions.assign(
    year=transactions.date.dt.year,
    month=transactions.date.dt.month,
)
px.box(a, x="year", y="transactions", color="month", title="Transactions")

In [None]:
# Store-day sales against transactions, with an OLS trend line drawn in red.
px.scatter(
    temp,
    x="transactions",
    y="sales",
    trendline="ols",
    trendline_color_override="red",
)


In [None]:
# Mean transactions per (year, day of week); day of week is shifted to 1..7.
a = (
    transactions
    .assign(
        year=transactions.date.dt.year,
        dayofweek=transactions.date.dt.dayofweek + 1,
    )
    .groupby(["year", "dayofweek"])["transactions"]
    .mean()
    .reset_index()
)
px.line(a, x="dayofweek", y="transactions", color="year", title="Transactions")

In [None]:
# Import
oil = pd.read_csv(folder+"oil.csv")
oil["date"] = pd.to_datetime(oil.date)
# Resample to one row per calendar day. min_count=1 leaves days without a
# quote as NaN directly, instead of summing them to 0 and mapping every 0
# back to NaN (which would also erase a genuine 0 value).
oil = oil.set_index("date").dcoilwtico.resample("D").sum(min_count=1).reset_index()
# Interpolate
oil["dcoilwtico_interpolated"] = oil.dcoilwtico.interpolate()
# Plot: long format with one series per price column
p = oil.melt(id_vars=['date'], var_name='Legend')
px.line(p.sort_values(["Legend", "date"], ascending = [False, True]), x='date', y='value', color='Legend',title = "Daily Oil Price" )

In [None]:
# Attach the daily oil price to the store-level sales/transactions frame.
# Oil columns left over from an earlier run of this cell are dropped first so
# the merge key is always just `date`.
oil_columns = ["dcoilwtico", "dcoilwtico_interpolated"]
temp = pd.merge(temp.drop(columns=oil_columns, errors="ignore"), oil, how="left", on="date")
print("Correlation with Daily Oil Prices")
# Only the numeric columns of interest go into the correlation matrix.
oil_corr = temp[["sales", "transactions", "dcoilwtico_interpolated"]].corr("spearman")
print(oil_corr.dcoilwtico_interpolated.loc[["sales", "transactions"]], "\n")


fig, axes = plt.subplots(1, 2, figsize = (15,5))
temp.plot.scatter(x = "dcoilwtico_interpolated", y = "transactions", ax=axes[0])
temp.plot.scatter(x = "dcoilwtico_interpolated", y = "sales", ax=axes[1], color = "r")
axes[0].set_title('Daily oil price & Transactions', fontsize = 15)
axes[1].set_title('Daily Oil Price & Sales', fontsize = 15);

In [None]:
# Daily sales per product family joined with the interpolated oil price.
a = pd.merge(
    train.groupby(["date", "family"]).sales.sum().reset_index(),
    oil.drop("dcoilwtico", axis = 1),
    how = "left",
    on = "date",
)
# Spearman correlation inside each family, restricted to the two numeric
# columns so the `date` column never enters the computation.
c = a.groupby("family")[["sales", "dcoilwtico_interpolated"]].corr("spearman").reset_index()
# Keep the sales-vs-oil coefficient of every family, weakest first.
c = c[c.level_1 == "dcoilwtico_interpolated"][["family", "sales"]].sort_values("sales")

In [None]:

# One scatter panel per product family (interpolated oil price vs. daily family
# sales), in the order of `c` (sorted by correlation). The dashed red line marks
# an oil price of 70. The grid position is derived from the panel order, which
# replaces the hand-written row/column arithmetic that had been commented out.
n_cols = 5
n_rows = max(1, -(-len(c) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize = (20,20), squeeze = False)
panels = axes.ravel()
for ax, fam, corr in zip(panels, c.family, c.sales):
    a[a.family == fam].plot.scatter(x = "dcoilwtico_interpolated", y = "sales", ax = ax)
    ax.set_title(fam+"\n Correlation:"+str(corr)[:6], fontsize = 12)
    ax.axvline(x=70, color='r', linestyle='--')
# Hide the grid cells that have no family assigned.
for ax in panels[len(c):]:
    ax.set_visible(False)

plt.tight_layout(pad=5)
plt.suptitle("Daily Oil Product & Total Family Sales \n", fontsize = 20);
plt.show()

In [None]:
# Correlation of sales between stores: rows are aligned by the running row
# number of each store, then stores are correlated column against column.
a = train[["store_nbr", "sales"]].copy()  # copy: a new column is added below
a["ind"] = a.groupby("store_nbr").cumcount() + 1
a = pd.pivot(a, index = "ind", columns = "store_nbr", values = "sales").corr()
# Hide the upper triangle and the diagonal; the mask only depends on the shape
# of the matrix, so no second correlation is needed to build it.
mask = np.triu(np.ones_like(a, dtype=bool))
plt.figure(figsize=(20, 20))
sns.heatmap(a,
        annot=True,
        fmt='.1f',
        cmap='coolwarm',
        square=True,
        mask=mask,
        linewidths=1,
        cbar=False)
plt.title("Correlations among stores",fontsize = 20)
plt.show()

In [None]:
print(train.shape)
# For each listed store, rows dated before the given cut-off are removed.
store_cutoffs = {
    52: "2017-04-20",
    22: "2015-10-09",
    42: "2015-08-21",
    21: "2015-07-24",
    29: "2015-03-20",
    20: "2015-02-13",
    53: "2014-05-29",
    36: "2013-05-09",
}
for store, cutoff in store_cutoffs.items():
    too_early = (train.store_nbr == store) & (train.date < cutoff)
    train = train[~too_early]
train.shape

In [None]:
# (store, family) combinations whose total sales over the training data are zero.
c = (
    train.groupby(["store_nbr", "family"])
    .sales.sum()
    .reset_index()
    .sort_values(["family", "store_nbr"])
)
c = c.loc[c.sales == 0]
c

In [None]:
print(train.shape)
# Anti Join: drop every training row whose (store, family) pair is listed in `c`.
zero_pairs = c.loc[c.sales == 0].drop(columns="sales")
outer_join = train.merge(zero_pairs, how='outer', indicator=True)
keep_rows = outer_join._merge != 'both'
train = outer_join.loc[keep_rows].drop(columns='_merge')
del outer_join, zero_pairs, keep_rows
gc.collect()
train.shape

In [None]:
# A zero forecast for every (store, family) pair in `c`, one row per day of the
# 2017-08-16 .. 2017-08-31 range.
forecast_days = pd.date_range("2017-08-16", "2017-08-31").tolist()
zero_prediction = pd.concat([
    pd.DataFrame({
        "date": forecast_days,
        "store_nbr": store,
        "family": family,
        "sales": 0,
    })
    for store, family in zip(c.store_nbr, c.family)
])
del c
gc.collect()
zero_prediction

In [None]:
# Example (store, family) sales series. Each title is generated from the pair
# that is actually filtered, so the label cannot drift from the data shown
# (the last panel was labelled "STORE 43" while plotting store 53).
examples = [
    (10, "LAWN AND GARDEN"),
    (36, "LADIESWEAR"),
    (6, "SCHOOL AND OFFICE SUPPLIES"),
    (14, "BABY CARE"),
    (53, "BOOKS"),
]
fig, ax = plt.subplots(1, len(examples), figsize = (20,4))
for axis, (store, family) in zip(ax, examples):
    selected = train[(train.store_nbr == store) & (train.family == family)]
    selected.set_index("date").sales.plot(ax = axis, title = f"STORE {store} - {family}")
plt.show()


In [None]:
# Sales of every family summed per calendar day.
a = (
    train.set_index("date")
    .groupby("family")
    .resample("D")
    .sales.sum()
    .reset_index()
)
px.line(a, x="date", y="sales", color="family", title="Daily total sales of the family")

In [None]:
# Mean sales per family, largest first.
family_mean = train.groupby("family").sales.mean()
a = family_mean.sort_values(ascending=False).reset_index()
px.bar(a, y="family", x="sales", color="family", title="Which product family preferred more?")

In [None]:
# Correlate only the two columns needed rather than building the correlation
# matrix of every column of `train`.
sales_vs_promotion = train[["sales", "onpromotion"]].corr("spearman").loc["sales", "onpromotion"]
print("Spearman Correlation between Sales and Onpromotion: {:,.4f}".format(sales_vs_promotion))

In [None]:
holidays = pd.read_csv(folder+"holidays_events.csv")
holidays["date"] = pd.to_datetime(holidays.date)


# Transferred Holidays
# tr1: holidays flagged as transferred; tr2: the rows of type "Transfer".
# Both are re-indexed from 0 and placed side by side, then column 5 (the first
# column of tr2) is combined with columns 1-4 of tr1.
# NOTE(review): presumably column 5 is the Transfer row's date and rows pair up
# one-to-one by position -- confirm against the CSV column order.
tr1 = holidays[(holidays.type == "Holiday") & (holidays.transferred == True)].drop("transferred", axis = 1).reset_index(drop = True)
tr2 = holidays[(holidays.type == "Transfer")].drop("transferred", axis = 1).reset_index(drop = True)
tr = pd.concat([tr1,tr2], axis = 1)
tr = tr.iloc[:, [5,1,2,3,4]]

holidays = holidays[(holidays.transferred == False) & (holidays.type != "Transfer")].drop("transferred", axis = 1)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
holidays = pd.concat([holidays, tr]).reset_index(drop = True)


# Additional Holidays
# Strip "-", "+" and digit runs from the description. regex is spelled out on
# every call because the default of str.replace differs between pandas versions.
holidays["description"] = (
    holidays["description"]
    .str.replace("-", "", regex=False)
    .str.replace("+", "", regex=False)
    .str.replace(r"\d+", "", regex=True)
)
holidays["type"] = np.where(holidays["type"] == "Additional", "Holiday", holidays["type"])

# Bridge Holidays
holidays["description"] = holidays["description"].str.replace("Puente ", "", regex=False)
holidays["type"] = np.where(holidays["type"] == "Bridge", "Holiday", holidays["type"])


# Work Day Holidays, that is meant to payback the Bridge.
work_day = holidays[holidays.type == "Work Day"]
holidays = holidays[holidays.type != "Work Day"]


# Split

# Events are national
events = holidays[holidays.type == "Event"].drop(["type", "locale", "locale_name"], axis = 1).rename({"description":"events"}, axis = 1)

holidays = holidays[holidays.type != "Event"].drop("type", axis = 1)
regional = holidays[holidays.locale == "Regional"].rename({"locale_name":"state", "description":"holiday_regional"}, axis = 1).drop("locale", axis = 1).drop_duplicates()
national = holidays[holidays.locale == "National"].rename({"description":"holiday_national"}, axis = 1).drop(["locale", "locale_name"], axis = 1).drop_duplicates()
local = holidays[holidays.locale == "Local"].rename({"description":"holiday_local", "locale_name":"city"}, axis = 1).drop("locale", axis = 1).drop_duplicates()

# Tag the origin of every row so train and test can be separated again later.
test['test/train'] = 'test'
train['test/train'] = 'train'


d = pd.merge(pd.concat([train, test]), stores)
d["store_nbr"] = d["store_nbr"].astype("int8")


# National Holidays & Events
#d = pd.merge(d, events, how = "left")
d = pd.merge(d, national, how = "left")
# Regional
d = pd.merge(d, regional, how = "left", on = ["date", "state"])
# Local
d = pd.merge(d, local, how = "left", on = ["date", "city"])


# Work Day: It will be removed when real work day colum created
d = pd.merge(d,  work_day[["date", "type"]].rename({"type":"IsWorkDay"}, axis = 1),how = "left")



# EVENTS
events["events"] =np.where(events.events.str.contains("futbol"), "Futbol", events.events)

def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode the object/category columns of ``df``.

    Returns the encoded frame (spaces in column names replaced by "_") and the
    list of all its column names, including the columns that were not encoded.
    """
    categorical_columns = df.select_dtypes(["category", "object"]).columns.tolist()
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    df.columns = df.columns.str.replace(" ", "_", regex=False)
    return df, df.columns.tolist()

events, events_cat = one_hot_encoder(events, nan_as_category=False)
events["events_Dia_de_la_Madre"] = np.where(events.date == "2016-05-08", 1,events["events_Dia_de_la_Madre"])
# NOTE(review): drops the row labelled 239 -- presumably a second event on
# 2016-05-08; confirm the label is still right if the CSV changes.
events = events.drop(239)

d = pd.merge(d, events, how = "left")
d[events_cat] = d[events_cat].fillna(0)



# New features
d["holiday_national_binary"] = np.where(d.holiday_national.notnull(), 1, 0)
d["holiday_local_binary"] = np.where(d.holiday_local.notnull(), 1, 0)
d["holiday_regional_binary"] = np.where(d.holiday_regional.notnull(), 1, 0)

#
d["national_independence"] = np.where(d.holiday_national.isin(['Batalla de Pichincha',  'Independencia de Cuenca', 'Independencia de Guayaquil', 'Independencia de Guayaquil', 'Primer Grito de Independencia']), 1, 0)
# na=False: str.contains yields NaN for rows without a local holiday, and
# np.where treats NaN as true, which flagged all of those rows with 1.
d["local_cantonizacio"] = np.where(d.holiday_local.str.contains("Cantonizacio", na=False), 1, 0)
d["local_fundacion"] = np.where(d.holiday_local.str.contains("Fundacion", na=False), 1, 0)
d["local_independencia"] = np.where(d.holiday_local.str.contains("Independencia", na=False), 1, 0)


holidays, holidays_cat = one_hot_encoder(d[["holiday_national","holiday_regional","holiday_local"]], nan_as_category=False)
d = pd.concat([d.drop(["holiday_national","holiday_regional","holiday_local"], axis = 1),holidays], axis = 1)


print(d[d['test/train'] == 'test'])

# All event / holiday indicator columns are stored as int8.
he_cols = d.columns[d.columns.str.startswith("events")].tolist() + d.columns[d.columns.str.startswith("holiday")].tolist() + d.columns[d.columns.str.startswith("national")].tolist()+ d.columns[d.columns.str.startswith("local")].tolist()
d[he_cols] = d[he_cols].astype("int8")

d[["test/train","family", "city", "state", "type"]] = d[["test/train","family", "city", "state", "type"]].astype("category")

del holidays, holidays_cat, work_day, local, regional, national, events, events_cat, tr, tr1, tr2, he_cols
gc.collect()
print(d.tail())

#d.head(5)

In [None]:
# Column dtypes of the merged feature frame (shown as the cell output).
d.dtypes
#d["date"]

In [None]:
def get_date_data():
    """Build the combined train+test frame with store and holiday/event features.

    Reads ``holidays_events.csv`` from ``folder``, concatenates the global
    ``train`` and ``test`` frames, merges them with ``stores`` and with the
    national / regional / local holidays, work days and events, and adds
    indicator columns derived from them.

    Side effects: adds a ``'test/train'`` column to the global ``train`` and
    ``test`` frames.

    Returns
    -------
    pandas.DataFrame
        The merged frame ``d``.
    """
    global test, train
    holidays = pd.read_csv(folder+"holidays_events.csv")
    holidays["date"] = pd.to_datetime(holidays.date)

    # Transferred Holidays
    # tr1: holidays flagged as transferred; tr2: the rows of type "Transfer".
    # Both are re-indexed from 0 and placed side by side, then column 5 (the
    # first column of tr2) is combined with columns 1-4 of tr1.
    # NOTE(review): presumably column 5 is the Transfer row's date and rows pair
    # up one-to-one by position -- confirm against the CSV column order.
    tr1 = holidays[(holidays.type == "Holiday") & (holidays.transferred == True)].drop("transferred", axis = 1).reset_index(drop = True)
    tr2 = holidays[(holidays.type == "Transfer")].drop("transferred", axis = 1).reset_index(drop = True)
    tr = pd.concat([tr1,tr2], axis = 1)
    tr = tr.iloc[:, [5,1,2,3,4]]

    holidays = holidays[(holidays.transferred == False) & (holidays.type != "Transfer")].drop("transferred", axis = 1)
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    holidays = pd.concat([holidays, tr]).reset_index(drop = True)


    # Additional Holidays
    # Strip "-", "+" and digit runs from the description. regex is spelled out
    # on every call because the default of str.replace differs between pandas
    # versions.
    holidays["description"] = (
        holidays["description"]
        .str.replace("-", "", regex=False)
        .str.replace("+", "", regex=False)
        .str.replace(r"\d+", "", regex=True)
    )
    holidays["type"] = np.where(holidays["type"] == "Additional", "Holiday", holidays["type"])

    # Bridge Holidays
    holidays["description"] = holidays["description"].str.replace("Puente ", "", regex=False)
    holidays["type"] = np.where(holidays["type"] == "Bridge", "Holiday", holidays["type"])


    # Work Day Holidays, that is meant to payback the Bridge.
    work_day = holidays[holidays.type == "Work Day"]
    holidays = holidays[holidays.type != "Work Day"]


    # Split

    # Events are national
    events = holidays[holidays.type == "Event"].drop(["type", "locale", "locale_name"], axis = 1).rename({"description":"events"}, axis = 1)

    holidays = holidays[holidays.type != "Event"].drop("type", axis = 1)
    regional = holidays[holidays.locale == "Regional"].rename({"locale_name":"state", "description":"holiday_regional"}, axis = 1).drop("locale", axis = 1).drop_duplicates()
    national = holidays[holidays.locale == "National"].rename({"description":"holiday_national"}, axis = 1).drop(["locale", "locale_name"], axis = 1).drop_duplicates()
    local = holidays[holidays.locale == "Local"].rename({"description":"holiday_local", "locale_name":"city"}, axis = 1).drop("locale", axis = 1).drop_duplicates()

    # Tag the origin of every row so train and test can be separated again later.
    test['test/train'] = 'test'
    train['test/train'] = 'train'


    d = pd.merge(pd.concat([train, test]), stores)
    d["store_nbr"] = d["store_nbr"].astype("int8")


    # National Holidays & Events
    #d = pd.merge(d, events, how = "left")
    d = pd.merge(d, national, how = "left")
    # Regional
    d = pd.merge(d, regional, how = "left", on = ["date", "state"])
    # Local
    d = pd.merge(d, local, how = "left", on = ["date", "city"])


    # Work Day: It will be removed when real work day colum created
    d = pd.merge(d,  work_day[["date", "type"]].rename({"type":"IsWorkDay"}, axis = 1),how = "left")

    # EVENTS
    events["events"] =np.where(events.events.str.contains("futbol"), "Futbol", events.events)

    def one_hot_encoder(df, nan_as_category=True):
        """One-hot encode the object/category columns of ``df``.

        Returns the encoded frame (spaces in column names replaced by "_") and
        the list of all its column names, including the columns that were not
        encoded.
        """
        categorical_columns = df.select_dtypes(["category", "object"]).columns.tolist()
        df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
        df.columns = df.columns.str.replace(" ", "_", regex=False)
        return df, df.columns.tolist()

    events, events_cat = one_hot_encoder(events, nan_as_category=False)
    events["events_Dia_de_la_Madre"] = np.where(events.date == "2016-05-08", 1,events["events_Dia_de_la_Madre"])
    # NOTE(review): drops the row labelled 239 -- presumably a second event on
    # 2016-05-08; confirm the label is still right if the CSV changes.
    events = events.drop(239)

    d = pd.merge(d, events, how = "left")
    d[events_cat] = d[events_cat].fillna(0)



    # New features
    d["holiday_national_binary"] = np.where(d.holiday_national.notnull(), 1, 0)
    d["holiday_local_binary"] = np.where(d.holiday_local.notnull(), 1, 0)
    d["holiday_regional_binary"] = np.where(d.holiday_regional.notnull(), 1, 0)

    #
    d["national_independence"] = np.where(d.holiday_national.isin(['Batalla de Pichincha',  'Independencia de Cuenca', 'Independencia de Guayaquil', 'Independencia de Guayaquil', 'Primer Grito de Independencia']), 1, 0)
    # na=False: str.contains yields NaN for rows without a local holiday, and
    # np.where treats NaN as true, which flagged all of those rows with 1.
    d["local_cantonizacio"] = np.where(d.holiday_local.str.contains("Cantonizacio", na=False), 1, 0)
    d["local_fundacion"] = np.where(d.holiday_local.str.contains("Fundacion", na=False), 1, 0)
    d["local_independencia"] = np.where(d.holiday_local.str.contains("Independencia", na=False), 1, 0)


    holidays, holidays_cat = one_hot_encoder(d[["holiday_national","holiday_regional","holiday_local"]], nan_as_category=False)
    d = pd.concat([d.drop(["holiday_national","holiday_regional","holiday_local"], axis = 1),holidays], axis = 1)



    # All event / holiday indicator columns are stored as int8.
    he_cols = d.columns[d.columns.str.startswith("events")].tolist() + d.columns[d.columns.str.startswith("holiday")].tolist() + d.columns[d.columns.str.startswith("national")].tolist()+ d.columns[d.columns.str.startswith("local")].tolist()
    d[he_cols] = d[he_cols].astype("int8")

    d[["test/train","family", "city", "state", "type"]] = d[["test/train","family", "city", "state", "type"]].astype("category")

    # Release the intermediate frames before handing back the result.
    del holidays, holidays_cat, work_day, local, regional, national, events, events_cat, tr, tr1, tr2, he_cols
    gc.collect()
    return d

#d.head(5)

In [None]:
# def AB_Test(dataframe, group, target): # data, column, sales
    
#     # Packages
#     from scipy.stats import shapiro
#     import scipy.stats as stats
#     # Split A/B
#     groupA = dataframe[dataframe[group] == 1][target]
#     groupB = dataframe[dataframe[group] == 0][target]
    
#     print(shapiro(groupA)[1])
#     print(shapiro(groupB)[1])

#     # Assumption: Normality
#     ntA = shapiro(groupA)[1] < 0.05
#     ntB = shapiro(groupB)[1] < 0.05
#     # H0: Distribution is Normal! - False
#     # H1: Distribution is not Normal! - True
    
#     if (ntA == False) & (ntB == False): # "H0: Normal Distribution"
#         # Parametric Test
#         # Assumption: Homogeneity of variances
#         print('# Parametric Test')
#         leveneTest = stats.levene(groupA, groupB)[1] < 0.05
#         # H0: Homogeneity: False
#         # H1: Heterogeneous: True
        
#         if leveneTest == False:
#             # Homogeneity
#             print('# Homogeneity')

#             ttest = stats.ttest_ind(groupA, groupB, equal_var=True)[1]
#             # H0: M1 == M2 - False
#             # H1: M1 != M2 - True
#         else:
#             # Heterogeneous
#             print('# Heterogeneous')
#             ttest = stats.ttest_ind(groupA, groupB, equal_var=False)[1]
#             # H0: M1 == M2 - False
#             # H1: M1 != M2 - True
#     else:
#         # Non-Parametric Test
#         print('# Non-Parametric Test')

#         ttest = stats.mannwhitneyu(groupA, groupB)[1] 
#         # H0: M1 == M2 - False
#         # H1: M1 != M2 - True
        
#     # Result
#     temp = pd.DataFrame({
#         "AB Hypothesis":[ttest < 0.05], 
#         "p-value":[ttest]
#     })
#     #print('tempstart \n')
#     #print(temp)
#     #print('tempover \n')

#     temp["Test Type"] = np.where((ntA == False) & (ntB == False), "Parametric", "Non-Parametric")
#     temp["AB Hypothesis"] = np.where(temp["AB Hypothesis"] == False, "Fail to Reject H0", "Reject H0")
#     temp["Comment"] = np.where(temp["AB Hypothesis"] == "Fail to Reject H0", "A/B groups are similar!", "A/B groups are not similar!")
#     temp["Feature"] = group
#     temp["GroupA_mean"] = groupA.mean()
#     temp["GroupB_mean"] = groupB.mean()
#     temp["GroupA_median"] = groupA.median()
#     temp["GroupB_median"] = groupB.median()
    
#     # Columns
#     if (ntA == False) & (ntB == False):
#         temp["Homogeneity"] = np.where(leveneTest == False, "Yes", "No")
#         temp = temp[["Feature","Test Type", "Homogeneity","AB Hypothesis", "p-value", "Comment", "GroupA_mean", "GroupB_mean", "GroupA_median", "GroupB_median"]]
#     else:
#         temp = temp[["Feature","Test Type","AB Hypothesis", "p-value", "Comment", "GroupA_mean", "GroupB_mean", "GroupA_median", "GroupB_median"]]
    
#     # Print Hypothesis
#     # print("# A/B Testing Hypothesis")
#     # print("H0: A == B")
#     # print("H1: A != B", "\n")
    
#     return temp
    
# # Apply A/B Testing
# he_cols = d.columns[d.columns.str.startswith("events")].tolist() + d.columns[d.columns.str.startswith("holiday")].tolist() \
#         + d.columns[d.columns.str.startswith("national")].tolist()+ d.columns[d.columns.str.startswith("local")].tolist()
# ab = []
# for i in he_cols:
#     print(i)
#     ab.append(AB_Test(dataframe=d[d.sales.notnull()], group = i, target = "sales"))
# ab = pd.concat(ab)
# ab

In [None]:
# Mean sales per (family, events_Futbol) pair; show the first 60 groups only.
d.groupby(["family", "events_Futbol"])["sales"].mean().iloc[:60]

In [None]:
# Time Related Features
def create_date_features(df):
    """Add calendar-derived feature columns to `df` and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named ``date``. The frame is
        modified in place (new columns are added) and also returned.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with these columns added: month, day_of_month,
        day_of_year, week_of_month, week_of_year, day_of_week (1=Monday ..
        7=Sunday), year, is_wknd, quarter, is_month_start/end,
        is_quarter_start/end, is_year_start/end and season.
    """
    dates = df.date.dt

    df['month'] = dates.month.astype("int8")
    df['day_of_month'] = dates.day.astype("int8")
    df['day_of_year'] = dates.dayofyear.astype("int16")
    # Days 1-7 -> week 1, days 8-14 -> week 2, ...
    df['week_of_month'] = ((df['day_of_month'] - 1) // 7 + 1).astype("int8")
    # `Series.dt.weekofyear` was removed in pandas 2.0; isocalendar().week
    # yields the same ISO-8601 week number.
    df['week_of_year'] = dates.isocalendar().week.astype("int8")
    df['day_of_week'] = (dates.dayofweek + 1).astype("int8")
    df['year'] = dates.year.astype("int32")
    # NOTE(review): weekday // 4 is 1 for weekday 4, 5 and 6, i.e. Friday is
    # flagged as weekend together with Saturday/Sunday -- confirm intended.
    df["is_wknd"] = (dates.weekday // 4).astype("int8")
    df["quarter"] = dates.quarter.astype("int8")

    for flag in ("is_month_start", "is_month_end",
                 "is_quarter_start", "is_quarter_end",
                 "is_year_start", "is_year_end"):
        df[flag] = getattr(dates, flag).astype("int8")

    # 0: Winter - 1: Spring - 2: Summer - 3: Fall
    # Assign the ndarray directly: wrapping it in pd.Series() would give it a
    # fresh 0..n-1 index and the assignment would then be re-aligned on df's
    # index, producing NaN whenever df does not carry a default RangeIndex.
    month = df["month"]
    season = np.select(
        [month.isin([12, 1, 2]), month.isin([6, 7, 8]), month.isin([9, 10, 11])],
        [0, 2, 3],
        default=1,
    )
    df["season"] = season.astype("int8")
    return df
d = get_date_data()
d = create_date_features(d)

# Workday column: 0 on any national/local/regional holiday or on a
# Saturday/Sunday (day_of_week 6/7), 1 otherwise ...
is_holiday = (
    (d.holiday_national_binary == 1)
    | (d.holiday_local_binary == 1)
    | (d.holiday_regional_binary == 1)
)
d["workday"] = np.where(is_holiday | d['day_of_week'].isin([6, 7]), 0, 1)
# ... except that rows with a non-null IsWorkDay are always forced to 1.
# The arrays are assigned directly (no pd.Series wrapper) so values stay
# positionally matched to d's rows whatever index d carries.
d["workday"] = np.where(d.IsWorkDay.notnull(), 1, d["workday"]).astype("int8")
d = d.drop(columns="IsWorkDay")

# Wages in the public sector are paid every two weeks on the 15th and on the last day of the month.
# Supermarket sales could be affected by this.
d["wageday"] = np.where((d['is_month_end'] == 1) | (d["day_of_month"] == 15), 1, 0).astype("int8")

# Only the last expression of a cell is rendered, so the tail must be
# displayed explicitly to appear next to the shape.
display(d.tail(15))
d.shape

In [None]:
# Check the resulting column dtypes of `d` (int8/int16/int32 date features, category columns).
d.dtypes
#d["date"]

In [None]:
# Rows flagged as belonging to the training partition.
d.loc[d['test/train'] == 'train']

In [None]:
# Keep the full frame as `dFull` and continue with a 5% random subsample.
# Sampling is done WITHOUT replacement: with replace=True the same row can be
# drawn several times, and the random train/test splits below would then put
# copies of one observation on both sides of the split.
# NOTE(review): re-running this cell alone re-samples the already sampled `d`;
# re-run the cells that build `d` first.
dFull = d
d = dFull.sample(frac=0.05, replace=False, random_state=1)
d.shape

In [None]:
# Baseline modelling frame: split marker, target, family, date and four
# calendar features taken from `d`.
dataTrainTestHold = d

cols = [
    'test/train', 'sales', 'family', 'date',
    'month', 'day_of_month', 'week_of_year', 'day_of_week',
]
dataTrainTest = dataTrainTestHold[cols]

# Partition on the 'test/train' marker, each part with a fresh 0..n-1 index.
dataTrain, dataTest = (
    dataTrainTest[dataTrainTest['test/train'] == part].reset_index(drop=True)
    for part in ('train', 'test')
)

dataTrain




In [None]:
# Separate the training partition into a fit set and a hold-out set (80/20).
# `family` is dropped without mutating `dataTrain`, so the cell can be re-run
# without a KeyError; the former `train_ = dataTrain` / `test_ = dataTest`
# assignments were removed because the split below overwrote both at once.
train_, test_ = train_test_split(
    dataTrain.drop(columns='family').dropna().reset_index(drop=True),
    test_size=0.2,
    random_state=42,
)

In [None]:
# List the training targets that are zero or negative.
print(train_.loc[train_['sales'] <= 0, 'sales'])

In [None]:
#!pip install xgboost
import sklearn
from sklearn import linear_model
from sklearn.linear_model  import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
# Do not truncate long cell contents when frames are displayed
# (e.g. the per-run feature lists stored in metrics_df['cols']).
pd.set_option('display.max_colwidth', None)

def predict_test(test_data, model):
    """Return `model.predict(test_data)`.

    `test_data` is passed to the model unchanged, so it must already have the
    layout the model was fitted on.
    """
    predictions = model.predict(test_data)
    return predictions


# Run log filled by regressive_model(): one dict per run in the list,
# one row per run in the frame.
model_metrics_full = []
metrics_df = pd.DataFrame(
    columns=['Model', 'cols', 'MAE', 'MSE', 'RMSE', 'RMSLE', 'R2']
)

# Placeholders for the module-level names that regressive_model() rebinds
# through its `global` statement.
X_train = y_train = X_test = y_test = y_pred = 0

def regressive_model(train_data, test_data, model, model_name):
    """Fit `model` on `train_data`, score it on `test_data`, and log the metrics.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Must contain the columns ``sales`` (target), ``date`` and
        ``test/train``; every other column is used as a feature, unscaled.
    model : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place.
    model_name : str
        Label stored with the logged metrics.

    Returns
    -------
    The fitted `model` (the same object that was passed in).

    Side effects
    ------------
    * Rebinds the module-level ``X_train``, ``y_train``, ``X_test``,
      ``y_test`` and ``y_pred``. After the call ``X_train`` additionally
      carries the diagnostic columns ``y_pred``, ``sales``, ``pred_diff`` and
      ``pred_diff_%``, and ``X_test`` carries ``y_pred``.
    * Appends one dict to ``model_metrics_full`` and one row to ``metrics_df``.
    * Prints the training-frame shape, the feature list and the metrics.
    """
    global metrics_df, X_train, y_train, X_test, y_test, y_pred

    non_feature_cols = ['sales', 'date', 'test/train']
    X_train = train_data.drop(non_feature_cols, axis=1)
    y_train = train_data.sales
    X_test = test_data.drop(non_feature_cols, axis=1)
    y_test = test_data.sales

    # Snapshot the feature names before the diagnostic columns are added to
    # X_train below, so the log records only what the model was trained on.
    feature_cols = list(X_train.columns)

    # Run regression model
    model.fit(X_train, y_train)

    # Negative predictions are floored at zero; mean_squared_log_error below
    # rejects negative values.
    train_pred = np.clip(model.predict(X_train), 0, None)
    y_pred = np.clip(model.predict(X_test), 0, None)

    # In-sample diagnostics kept on the global X_train for later inspection.
    X_train['y_pred'] = train_pred
    X_train['sales'] = y_train
    X_train['pred_diff'] = (X_train['sales'] - X_train['y_pred']).abs()
    # Ratio predicted/actual (not a percentage); inf or NaN where sales == 0.
    X_train['pred_diff_%'] = (X_train['y_pred'] / X_train['sales']).round(2)
    print(X_train.shape)

    X_test['y_pred'] = y_pred

    # Evaluate the model on the hold-out rows
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print('Columns:', feature_cols)
    print(f'MAE: {mae:.2f}')
    print(f'MSE: {mse:.2f}')
    print(f'RMSE: {rmse:.2f}')
    print(f'RMSLE: {rmsle:.2f}')
    print(f'R2 score: {r2:.2f}')

    model_metrics_full.append({
        model_name: {
            'Columns': np.array(feature_cols, dtype=object),
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'RMSLE': rmsle,
            'R2 score': r2,
        }
    })

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and
    # concatenate instead (the first run simply becomes the frame).
    run_row = pd.DataFrame(
        [{'Model': model_name, 'cols': feature_cols, 'MAE': mae, 'MSE': mse,
          'RMSE': rmse, 'RMSLE': rmsle, 'R2': r2}],
        columns=metrics_df.columns,
    )
    if metrics_df.empty:
        metrics_df = run_row
    else:
        metrics_df = pd.concat([metrics_df, run_row], ignore_index=True)

    return model


# Call model frame work for linear regression


In [None]:
# Baseline: ordinary least squares on the calendar features only.
model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=LinearRegression(),
    model_name='LinearRegression',
)

In [None]:
# Call model frame work for random forest regressor
# random_state pins the forest's internal randomness so the logged metrics
# are reproducible between runs.
model = regressive_model(train_, test_,
                 RandomForestRegressor(n_estimators=100,
                                       max_depth=10,
                                       random_state=42),
                                       'RandomForest')

In [None]:
# Fit and score an XGBoost regressor on the current split.
model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=xgb.XGBRegressor(learning_rate=0.2, n_estimators=100),
    model_name='XGBoost',
)

In [None]:
# Second feature set: calendar features plus an integer-encoded product family.
dataTrainTestHold = d
dataTrainTest = dataTrainTestHold[cols]

# Partition on the 'test/train' marker, each part with a fresh 0..n-1 index.
dataTrain, dataTest = (
    dataTrainTest[dataTrainTest['test/train'] == part].reset_index(drop=True)
    for part in ('train', 'test')
)

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Replace the `family` column by its integer code.
dataTrain = dataTrain.assign(
    family_encoded=encoder.fit_transform(dataTrain["family"])
).drop(columns='family')

train_, test_ = train_test_split(
    dataTrain.dropna().reset_index(drop=True),
    test_size=0.2,
    random_state=42,
)




In [None]:
# Ordinary least squares on the calendar + encoded-family features.
model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=LinearRegression(),
    model_name='LinearRegression',
)

In [None]:
# Call model frame work for random forest regressor
# random_state pins the forest's internal randomness so the logged metrics
# are reproducible between runs.
model = regressive_model(train_, test_,
                 RandomForestRegressor(n_estimators=100,
                                       max_depth=10,
                                       random_state=42),
                                       'RandomForest')

In [None]:
# Fit and score an XGBoost regressor on the current split.
model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=xgb.XGBRegressor(learning_rate=0.2, n_estimators=100),
    model_name='XGBoost',
)

In [None]:
# Inspect the `oil` frame that is merged on `date` in the following cells.
oil

In [None]:
# Third feature set: baseline columns plus the interpolated oil price.
dataTrainTestHold = d
dataTrainTest = dataTrainTestHold[cols]

# Shapes before/after the (inner) merge show how many rows have no oil record.
print(dataTrainTest.shape)
dataTrainTest = pd.merge(dataTrainTest, oil, on='date')
print(dataTrainTest.shape)

# Only the interpolated series is used as a feature.
dataTrainTest = dataTrainTest.drop(columns='dcoilwtico')

num_missing = dataTrainTest['dcoilwtico_interpolated'].isna().sum()
print(f"Number of rows with NaN values: {num_missing}")
# Drop only the rows that lack an oil price. A blanket dropna() also removes
# every row whose `sales` is NaN -- presumably all rows of the 'test'
# partition (verify against get_date_data) -- which left `dataTest` empty.
dataTrainTest = dataTrainTest.dropna(subset=['dcoilwtico_interpolated'])

# The training partition must still be NaN-free for model fitting.
dataTrain = dataTrainTest[dataTrainTest['test/train'] == 'train'].dropna().reset_index(drop=True)
dataTest = dataTrainTest[dataTrainTest['test/train'] == 'test'].reset_index(drop=True)

# Provisional aliases; the next cell replaces both with a random 80/20 split.
train_ = dataTrain
test_ = dataTest

dataTrain

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the `family` column by its integer code.
encoder = LabelEncoder()
dataTrain = dataTrain.assign(
    family_encoded=encoder.fit_transform(dataTrain["family"])
).drop(columns='family')

# Random 80/20 split of the training partition.
train_, test_ = train_test_split(
    dataTrain.reset_index(drop=True).dropna(),
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares on the calendar + oil + encoded-family features.
model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=LinearRegression(),
    model_name='LinearRegression',
)


In [None]:
# Call model frame work for random forest regressor
# random_state pins the forest's internal randomness so the logged metrics
# are reproducible between runs.
model = regressive_model(train_, test_,
                 RandomForestRegressor(n_estimators=100,
                                       max_depth=10,
                                       random_state=42),
                                       'RandomForest')

In [None]:
# Fit and score an XGBoost regressor (learning rate 0.4) on the current split.
model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=xgb.XGBRegressor(learning_rate=0.4, n_estimators=100),
    model_name='XGBoost',
)

In [None]:
# Fourth feature set: baseline columns + interpolated oil price + holiday locale.
holidays = pd.read_csv(folder+"holidays_events.csv")
holidays["date"] = pd.to_datetime(holidays.date)

dataTrainTestHold = d
dataTrainTest = dataTrainTestHold[cols]

print(dataTrainTest.shape)
dataTrainTest = pd.merge(dataTrainTest, oil, on='date')
print(dataTrainTest.shape)

# Only the interpolated series is used as a feature.
dataTrainTest = dataTrainTest.drop(columns='dcoilwtico')

num_missing = dataTrainTest['dcoilwtico_interpolated'].isna().sum()
print(f"Number of rows with NaN values: {num_missing}")
# Drop only the rows that lack an oil price. A blanket dropna() also removes
# every row whose `sales` is NaN -- presumably all rows of the 'test'
# partition (verify against get_date_data) -- which left `dataTest` empty.
dataTrainTest = dataTrainTest.dropna(subset=['dcoilwtico_interpolated'])

holidays = holidays[['date','locale_name']]
print('h',holidays.shape)
print('d',dataTrainTest.shape)
# Left join on one holiday row per date. The previous inner join on the raw
# table kept only dates that appear in the holiday file and repeated a sales
# row once per holiday entry on that date; those copies could then land on
# both sides of the random train/test split.
# NOTE(review): when a date has several entries, the first locale_name is
# kept -- confirm this is an acceptable simplification.
holiday_per_date = holidays.drop_duplicates(subset='date')
dataTrainTest = pd.merge(dataTrainTest, holiday_per_date, on='date', how='left')
# Non-holiday dates get an explicit label so the column can be label-encoded.
dataTrainTest['locale_name'] = dataTrainTest['locale_name'].fillna('NoHoliday')
print('d2',dataTrainTest.shape)

# The training partition must still be NaN-free for model fitting.
dataTrain = dataTrainTest[dataTrainTest['test/train'] == 'train'].dropna().reset_index(drop=True)
dataTest = dataTrainTest[dataTrainTest['test/train'] == 'test'].reset_index(drop=True)

# Provisional aliases; a later cell replaces both with a random 80/20 split.
train_ = dataTrain
test_ = dataTest

In [None]:
# Inspect the working frame `d`.
d

In [None]:
# Inspect the current training partition before it is encoded.
dataTrain

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Replace `family` by its integer code ...
dataTrain = dataTrain.assign(
    family_encoded=encoder.fit_transform(dataTrain["family"])
).drop(columns='family')

# ... then do the same for the holiday locale. The same encoder object is
# refitted here, so afterwards it holds the locale classes only.
dataTrain = dataTrain.assign(
    holiday_encoded=encoder.fit_transform(dataTrain["locale_name"])
).drop(columns='locale_name')

# Random 80/20 split of the training partition.
train_, test_ = train_test_split(
    dataTrain.reset_index(drop=True),
    test_size=0.2,
    random_state=42,
)

model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=LinearRegression(),
    model_name='LinearRegression',
)

In [None]:
# Call model frame work for random forest regressor
# random_state pins the forest's internal randomness so the logged metrics
# are reproducible between runs.
model = regressive_model(train_, test_,
                 RandomForestRegressor(n_estimators=100,
                                       max_depth=10,
                                       random_state=42),
                                       'RandomForest')

In [None]:
# Fit and score an XGBoost regressor (learning rate 0.4) on the current split.
model = regressive_model(
    train_data=train_,
    test_data=test_,
    model=xgb.XGBRegressor(learning_rate=0.4, n_estimators=100),
    model_name='XGBoost',
)

In [None]:
# Rank every logged run by RMSLE, lowest first (sort_values is ascending by default).
metrics_df.sort_values(by='RMSLE')

In [None]:
# Inspect the rows flagged 'test' in the latest modelling frame.
dataTest

In [None]:
# predict_test(test_data, model) needs the feature frame as its first argument;
# calling it with the model alone raised a TypeError.
# The hold-out features of the last regressive_model() run are used here
# (regressive_model stores them in the global X_test together with a `y_pred`
# column, which must be removed to restore the training layout).
# NOTE(review): `dataTest` cannot be passed as-is -- it still holds the raw
# `family`/`locale_name` columns plus `sales`, `date` and `test/train`; it
# would need the same encoding/column selection as the training data first.
predict_test(X_test.drop(columns='y_pred', errors='ignore'), model)