In [None]:
# Jovian Commit Essentials
# Please retain and execute this cell without modifying the contents for `jovian.commit` to work
!pip install jovian --upgrade -q
import jovian
jovian.set_project('walmart-sa-es')
jovian.set_colab_id('1_kq9jVSPW5z6VdRiiugqsDcLE2FknZ_I')

# walmart-sa-es

Use the "Run" button to execute the code.

In [None]:
!pip install jovian --upgrade --quiet

In [None]:
import jovian

In [None]:
# Execute this to save new versions of the notebook
# (commits the notebook to the Jovian project named below).
jovian.commit(project="walmart-sa-es")

LIBRARIES

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()

from sklearn import model_selection
from sklearn import metrics, ensemble, linear_model
import xgboost as xgb
import lightgbm as lgb
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

%matplotlib inline

DATA LOADING

In [None]:
!pip install opendatasets scikit-learn jovian --quiet --upgrade

In [None]:
import os
import opendatasets as od
import pandas as pd
pd.set_option("display.max_columns", 120)
pd.set_option("display.max_rows", 120)

In [None]:
# Download the competition archive from Kaggle into the working directory.
# The call prompts interactively for a Kaggle username and key.
od.download('https://www.kaggle.com/competitions/walmart-recruiting-store-sales-forecasting/data')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: darshanparamane
Your Kaggle Key: ··········
Downloading walmart-recruiting-store-sales-forecasting.zip to ./walmart-recruiting-store-sales-forecasting


100%|██████████| 2.70M/2.70M [00:00<00:00, 126MB/s]


Extracting archive ./walmart-recruiting-store-sales-forecasting/walmart-recruiting-store-sales-forecasting.zip to ./walmart-recruiting-store-sales-forecasting





In [None]:
# Show what the download step left in the dataset folder.
dataset_dir = 'walmart-recruiting-store-sales-forecasting'
os.listdir(dataset_dir)

['sampleSubmission.csv.zip',
 'test.csv.zip',
 'features.csv.zip',
 'train.csv.zip',
 'stores.csv']

In [None]:
# Load every competition table; the zipped CSVs are read directly by pandas.
data_dir = 'walmart-recruiting-store-sales-forecasting'

features = pd.read_csv(f'{data_dir}/features.csv.zip')
test = pd.read_csv(f'{data_dir}/test.csv.zip')
train = pd.read_csv(f'{data_dir}/train.csv.zip')
stores = pd.read_csv(f'{data_dir}/stores.csv')
sample_submission = pd.read_csv(f'{data_dir}/sampleSubmission.csv.zip')

In [None]:
# Attach the per-store columns to every feature row (inner join on Store).
feature_store = pd.merge(features, stores, how='inner', on='Store')

In [None]:
# Join the training rows with their store/date features, then order by store, department and date.
merge_keys = ['Store', 'Date', 'IsHoliday']
train_df = pd.merge(train, feature_store, how='inner', on=merge_keys)
train_df = train_df.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)

In [None]:
# Join the test rows with their store/date features, then order by store, department and date.
merge_keys = ['Store', 'Date', 'IsHoliday']
test_df = pd.merge(test, feature_store, how='inner', on=merge_keys)
test_df = test_df.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)

EXPLORATORY DATA ANALYSIS

EDA is one of the most important parts of the process, because it gives you an idea of the relationships between the features, their distributions, and so on.


In [None]:
# Transposed summary statistics: bars on the mean, colour gradients on std and median.
summary_styler = train_df.describe().T.style
summary_styler = summary_styler.bar(subset=['mean'], color='#205ff2')
summary_styler = summary_styler.background_gradient(subset=['std'], cmap='Reds')
summary_styler.background_gradient(subset=['50%'], cmap='coolwarm')

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Store,421570.0,22.200546,12.785297,1.0,11.0,22.0,33.0,45.0
Dept,421570.0,44.260317,30.492054,1.0,18.0,37.0,74.0,99.0
Weekly_Sales,421570.0,15981.258123,22711.183519,-4988.94,2079.65,7612.03,20205.8525,693099.36
Temperature,421570.0,60.090059,18.447931,-2.06,46.68,62.09,74.28,100.14
Fuel_Price,421570.0,3.361027,0.458515,2.472,2.933,3.452,3.738,4.468
MarkDown1,150681.0,7246.420196,8291.221345,0.27,2240.27,5347.45,9210.9,88646.76
MarkDown2,111248.0,3334.628621,9475.357325,-265.76,41.6,192.0,1926.94,104519.54
MarkDown3,137091.0,1439.421384,9623.07829,-29.1,5.08,24.6,103.99,141630.61
MarkDown4,134967.0,3383.168256,6292.384031,0.22,504.22,1481.31,3595.04,67474.85
MarkDown5,151432.0,4628.975079,5962.887455,135.16,1878.44,3359.45,5563.8,108519.28


In [None]:
# Rebuild the merged feature/store table from the raw frames before adding date features.
feature_store = features.merge(stores, how='inner', on="Store")

# Converting date column to datetime
feature_store['Date'] = pd.to_datetime(feature_store['Date'])
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

# Adding some basic datetime features
feature_store['Day'] = feature_store['Date'].dt.day
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0; the ISO
# calendar week is its replacement. Cast to a plain int so the column does not
# carry the nullable UInt32 dtype that isocalendar() returns.
feature_store['Week'] = feature_store['Date'].dt.isocalendar().week.astype(int)
feature_store['Month'] = feature_store['Date'].dt.month
feature_store['Year'] = feature_store['Date'].dt.year

In [None]:
# Re-run the training join so train_df picks up the datetime Date and the Day/Week/Month/Year columns.
merge_keys = ['Store', 'Date', 'IsHoliday']
train_df = pd.merge(train, feature_store, how='inner', on=merge_keys)
train_df = train_df.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)

In [None]:
# Re-run the test join so test_df picks up the datetime Date and the Day/Week/Month/Year columns.
merge_keys = ['Store', 'Date', 'IsHoliday']
test_df = pd.merge(test, feature_store, how='inner', on=merge_keys)
test_df = test_df.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)

In [None]:
# Per-week totals of the numeric columns. `numeric_only=True` is required on
# pandas >= 2.0, where summing the datetime `Date` column raises instead of
# being silently dropped.
df_weeks = train_df.groupby('Week').sum(numeric_only=True)

In [None]:
# Display the per-week totals (one row per week number).
df_weeks

Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Size,Day,Month,Year
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,130899,260501,87731210.0,0,222343.25,18872.529,14993970.0,67007390.0,353486.7,3078155.61,15506769.87,1008962.0,46396.361,806775433,38348,5903,11873906
2,131036,260045,82696760.0,0,204005.74,19098.337,10198580.0,29235310.0,218513.3,1565390.83,11416226.1,1007821.0,46330.934,805424262,79550,5894,11855800
3,130031,259381,82735640.0,0,222547.92,19167.202,7210967.0,9824350.0,214079.0,1942054.54,8410008.53,1006852.0,46151.605,804330161,120458,5877,11821606
4,129971,259111,79434830.0,0,221326.56,19229.283,4418107.0,5170161.0,213215.8,296603.35,7927497.89,1006330.0,46135.25,803454747,161436,5871,11809533
5,195904,392036,141989500.0,0,317172.19,27489.729,114073400.0,8936033.0,448553.2,90821661.61,18580144.27,1507341.0,71739.976,1213767653,35410,17716,17813460
6,196850,394786,145682300.0,8895,308692.32,27771.111,33881020.0,13748910.0,223095.0,28678531.29,18188378.72,1514441.0,71962.692,1219143417,97800,17790,17887890
7,197602,396508,147190200.0,0,362493.21,28048.496,34918200.0,24956590.0,55328.42,19735801.65,18139157.65,1520852.0,72321.353,1221309371,160692,17858,17956249
8,195853,390108,133865900.0,0,369896.71,28063.01,27937950.0,17553670.0,13370.54,8544934.4,17424405.8,1507838.0,71597.14,1210785752,221090,17688,17785294
9,197097,393893,140713100.0,0,388853.51,29131.976,49815560.0,4794496.0,85144.02,34262956.69,10572056.03,1517037.0,71990.147,1217495757,32532,26676,17881858
10,196989,393404,138033200.0,0,416255.07,29839.192,24783110.0,1436038.0,16858.28,9151534.41,14091768.95,1518232.0,72045.724,1216929950,94818,26676,17881832


Sales analysis

In [None]:
import plotly.io as pio
# Make "colab" the default Plotly renderer for the figures that follow.
# NOTE(review): presumably needs changing when the notebook runs outside Google Colab.
pio.renderers.default = "colab"

In [None]:
# Qualitative colour sequence passed to the Plotly Express charts via color_discrete_sequence.
palette = px.colors.qualitative.Safe

In [None]:
# Total weekly sales against the week number.
weekly_sales_fig = px.line(
    data_frame=df_weeks,
    x=df_weeks.index,
    y='Weekly_Sales',
    labels={'Weekly_Sales': 'Weekly Sales', 'x': 'Weeks'},
    title='Sales over weeks',
    color_discrete_sequence=palette,
)
weekly_sales_fig

Insights:
Sales are fairly stable across the year, with a plunge around week 42 and a recovery for the holidays.

**Markdowns relationship with sales**

In [None]:
# Execute this to save new versions of the notebook
# (commits the notebook to the Jovian project named below).
jovian.commit(project="walmart-sa-es")

[jovian] Detected Colab notebook...[0m
[jovian] Please enter your API key ( from https://jovian.ai/ ):[0m
API KEY: ··········
[jovian] Uploading colab notebook to Jovian...[0m
Committed successfully! https://jovian.ai/20je0301/walmart-sa-es


'https://jovian.ai/20je0301/walmart-sa-es'

In [None]:
# One line per markdown column plus total sales, all taken from the per-week sums.
trace_specs = [(f'MarkDown{i}', f'MarkDown{i}') for i in range(1, 6)]
trace_specs.append(('Weekly_Sales', 'Weekly Sales'))

fig = go.Figure()
for column, label in trace_specs:
    fig.add_trace(go.Scatter(x=df_weeks.index, y=df_weeks[column], name=label, mode='lines'))
fig.update_layout(title="Sales vs Markdown's", xaxis_title='Weeks')

**Mean sales comparison across the years**

In [None]:
# Mean and median weekly sales, overall per (Year, Week) and per week within each year.
sales_stats = {'Weekly_Sales': ['mean', 'median']}
row_year = train_df['Year']

weekly_sales = train_df.groupby(['Year', 'Week'], as_index=False).agg(sales_stats)
weekly_sales2010 = train_df[row_year == 2010].groupby(['Week']).agg(sales_stats)
weekly_sales2011 = train_df[row_year == 2011].groupby(['Week']).agg(sales_stats)
weekly_sales2012 = train_df[row_year == 2012].groupby(['Week']).agg(sales_stats)

In [None]:
# Mean weekly sales per year on a shared week axis, with the two big holidays labelled.
yearly_frames = ((2010, weekly_sales2010), (2011, weekly_sales2011), (2012, weekly_sales2012))

fig = go.Figure()
for year, yearly in yearly_frames:
    mean_sales = yearly['Weekly_Sales']['mean']
    fig.add_trace(go.Scatter(x=mean_sales.index, y=mean_sales, name=f'Mean Sales {year}', mode='lines'))
# Annotation positions are hand-placed near the week-47 and week-51 peaks.
fig.add_annotation(text="Thanksgiving", x=47, y=25000, showarrow=False)
fig.add_annotation(text="Christmas", x=51, y=29000, showarrow=False)
fig.update_layout(title='Sales 2010, 2011, 2012', xaxis_title='Weeks')

**Insights:
There is a clear pattern in sales across the years: around Thanksgiving and Christmas, sales rise by a huge margin.**

In [None]:
# Converting the temperature to celsius for a better interpretation
train_df['Temperature'] = train_df['Temperature'].apply(lambda x :  (x - 32) / 1.8)
train_df['Temperature'] = train_df['Temperature'].apply(lambda x :  (x - 32) / 1.8)

In [None]:
# 10% sample used for the heavier plots; seeded so the figures (and the
# insights written about them) are reproducible across runs.
train_plt = train_df.sample(frac=0.10, random_state=0)

In [None]:
import plotly.io as pio
pio.renderers.default = "colab"

# Weekly sales against temperature, split by the holiday flag.
temperature_fig = px.violin(
    train_plt,
    x='Temperature',
    y='Weekly_Sales',
    color='IsHoliday',
    color_discrete_sequence=palette,
)
temperature_fig

Insights:
There is a relationship between cold temperatures and sales, presumably because the USA is in the northern hemisphere and a large part of the country experiences cold weather at that time of year.

In [None]:
# Weekly sales against fuel price, split by the holiday flag.
fuel_fig = px.violin(
    train_plt,
    x='Fuel_Price',
    y='Weekly_Sales',
    color='IsHoliday',
    title='Fuel price and sales by holiday',
    color_discrete_sequence=palette,
)
fuel_fig

There is no clear pattern, although sales tend to be higher when fuel prices are lower.

In [None]:
# Weekly sales against CPI, split by the holiday flag.
cpi_fig = px.violin(
    train_plt,
    x='CPI',
    y='Weekly_Sales',
    color='IsHoliday',
    title='CPI and sales by holiday',
    color_discrete_sequence=palette,
)
cpi_fig

**There is no very clear pattern here either. Three groups are visible, but all of them show sales, even where the CPI is higher.**

In [None]:
# Weekly sales against the unemployment rate, split by the holiday flag.
unemployment_fig = px.violin(
    train_plt,
    x='Unemployment',
    y='Weekly_Sales',
    color='IsHoliday',
    title='Unemployment rate and sales by holiday',
    color_discrete_sequence=palette,
)
unemployment_fig

**Insights:
For unemployment, the lower the rate, the higher the sales, which makes sense.**

In [None]:
# Mean of each numeric column per store size. `numeric_only=True` is required
# on pandas >= 2.0 because the frame also holds the string `Type` column,
# which cannot be averaged.
sizes = train_plt.groupby('Size').mean(numeric_only=True)
px.line(sizes, x=sizes.index, y=sizes.Weekly_Sales,
        title='Store size and sales', color_discrete_sequence=palette)

Insights:
Size is an important factor when it comes to sales, as you can see here.

In [None]:
# Size distribution of the stores within each store type.
store_type = stores[['Type', 'Size']].copy()
px.box(
    store_type,
    x='Type',
    y='Size',
    color='Type',
    title='Store size and Store type',
    color_discrete_sequence=palette,
)

Insights:
Looking at size, we can see that there are 3 types of stores; type A is the most common.

In [None]:
# Every sales row already carries its store's Type from the feature/store
# merge, so take both columns from the same frame. Concatenating stores['Type']
# (one row per store) with train_df['Weekly_Sales'] aligns on the row index and
# pairs only the first len(stores) sales rows with unrelated store types.
# The 10% plotting sample keeps the figure light, like the neighbouring plots.
store_sale = train_plt[['Type', 'Weekly_Sales']]
px.box(store_sale.dropna(), x='Type', y='Weekly_Sales', color='Type',
       title='Store type and sales', color_discrete_sequence=palette)

**Insights:
In relation to the type of store we can see that although the C are the smallest ones, they are those that have the highest median sales.**

In [None]:
# Mean weekly sales per department, highest first.
# - `ascending` must be the boolean False: the string 'False' is truthy and is
#   rejected outright by recent pandas versions.
# - `numeric_only=True` is required on pandas >= 2.0 because of the string
#   `Type` column.
depts = (
    train_plt.groupby('Dept')
    .mean(numeric_only=True)
    .sort_values(by='Weekly_Sales', ascending=False)
)
bar = px.bar(depts, x=depts.index, y=depts.Weekly_Sales,
             title='Department and sales', color=depts.Weekly_Sales)
bar.update_layout(barmode='group', xaxis={'categoryorder': 'total descending'})

Insights:
Some departments contribute more to sales than others.

# **Heatmap and correlation between features**

In [None]:
# Pairwise correlations between the numeric columns. `numeric_only=True` is
# required on pandas >= 2.0, where the string `Type` column makes corr() raise.
corr = train_df.corr(numeric_only=True)
# Hide the upper triangle (including the diagonal) so each pair appears once.
mask = np.triu(np.ones_like(corr, dtype=bool))
df_mask = corr.mask(mask).round(2)

fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(),
                                  x=df_mask.columns.tolist(),
                                  y=df_mask.columns.tolist(),
                                  colorscale=px.colors.diverging.RdBu,
                                  hoverinfo="none",
                                  showscale=True, ygap=1, xgap=1
                                 )

fig.update_xaxes(side="bottom")

fig.update_layout(
    title_text='Heatmap',
    title_x=0.5,
    width=1000,
    height=1000,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)
# Masked cells are annotated with the text 'nan'; blank those labels out.
for i in range(len(fig.layout.annotations)):
    if fig.layout.annotations[i].text == 'nan':
        fig.layout.annotations[i].text = ""

fig.show()

In [None]:
# Correlation of every numeric feature with Weekly_Sales, strongest first.
# Select the target by label rather than by position, and drop its
# self-correlation by name, so the cell does not depend on column order.
# `numeric_only=True` is required on pandas >= 2.0 (string `Type` column).
weekly_sales_corr = train_df.corr(numeric_only=True)['Weekly_Sales']
corr_df = (
    weekly_sales_corr.drop('Weekly_Sales')
    .sort_values(ascending=False)
    .to_frame()
)
bar = px.bar(corr_df, x=corr_df.index, y='Weekly_Sales', color=corr_df.index, labels={'index': 'Features'},
             title='Feature correlation with sales', color_discrete_sequence=palette)
bar.update_traces(showlegend=False)

# Feature Engineering

In [None]:
# Work on copies so feature engineering leaves train_df / test_df untouched.
data_train, data_test = train_df.copy(), test_df.copy()

**Since Thanksgiving and Christmas are the most important holidays, I'm going to try some feature engineering on these holidays, and also on the Super Bowl and Labor Day.**

In [None]:
# Signed day counts from each training row's date to Nov 24 and Dec 24 of that row's year.
train_dates = pd.to_datetime(train_df["Date"], format="%Y-%m-%d")
train_years = train_df["Year"].astype(str)
nov_24 = pd.to_datetime(train_years + "-11-24", format="%Y-%m-%d")
dec_24 = pd.to_datetime(train_years + "-12-24", format="%Y-%m-%d")

data_train['Days_to_Thansksgiving'] = (nov_24 - train_dates).dt.days.astype(int)
data_train['Days_to_Christmas'] = (dec_24 - train_dates).dt.days.astype(int)

In [None]:
# Signed day counts from each test row's date to Nov 24 and Dec 24 of that row's year.
test_dates = pd.to_datetime(test_df["Date"], format="%Y-%m-%d")
test_years = test_df["Year"].astype(str)
nov_24 = pd.to_datetime(test_years + "-11-24", format="%Y-%m-%d")
dec_24 = pd.to_datetime(test_years + "-12-24", format="%Y-%m-%d")

data_test['Days_to_Thansksgiving'] = (nov_24 - test_dates).dt.days.astype(int)
data_test['Days_to_Christmas'] = (dec_24 - test_dates).dt.days.astype(int)

In [None]:
# 0/1 flags for the weeks that contain the four holidays.
# The flags are written to BOTH frames: the model feature list is derived from
# data_train's columns, so data_test must expose the same columns or any
# prediction on the full feature set fails with missing columns.
holiday_weeks = {'SuperBowlWeek': 6, 'LaborDay': 36, 'Tranksgiving': 47, 'Christmas': 52}
for flag_name, week_number in holiday_weeks.items():
    data_train[flag_name] = train_df['Week'].eq(week_number).astype(int)
    data_test[flag_name] = test_df['Week'].eq(week_number).astype(int)

In [None]:
# Total markdown per row, for train and test alike.
# Adding the columns with `+` yields NaN as soon as ANY markdown is missing,
# which the later fillna(0) turns into a total of 0 even when other markdowns
# are present. A row-wise sum skips missing values instead; `min_count=1`
# keeps NaN only for rows where all five markdowns are missing.
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
data_train['MarkdownsSum'] = train_df[markdown_cols].sum(axis=1, min_count=1)
data_test['MarkdownsSum'] = test_df[markdown_cols].sum(axis=1, min_count=1)

# PREPROCESSING
**Filling missing values**

In [None]:
# Training columns that contain missing values, most affected first.
train_missing = data_train.isna().sum()
train_missing[train_missing > 0].sort_values(ascending=False)

MarkdownsSum    324514
MarkDown2       310322
MarkDown4       286603
MarkDown3       284479
MarkDown1       270889
MarkDown5       270138
dtype: int64

In [None]:
# Test columns that contain missing values, most affected first.
test_missing = data_test.isna().sum()
test_missing[test_missing > 0].sort_values(ascending=False)

CPI             38162
Unemployment    38162
MarkdownsSum    37457
MarkDown2       28627
MarkDown4       12888
MarkDown3        9829
MarkDown1         149
dtype: int64

In [None]:
# Replace every remaining missing training value with 0.
data_train = data_train.fillna(0)

In [None]:
# Fill the missing CPI / Unemployment test values with the column mean.
# Assign the result back: `df[col].fillna(..., inplace=True)` operates on an
# intermediate Series and is not guaranteed to update the frame (pandas warns
# about it and it is a no-op under copy-on-write).
for column in ('CPI', 'Unemployment'):
    data_test[column] = data_test[column].fillna(data_test[column].mean())

In [None]:
# Replace every remaining missing test value with 0.
data_test = data_test.fillna(0)

In [None]:
# Encode the holiday flag as 1 (True) / 0 (anything else).
data_train['IsHoliday'] = (data_train['IsHoliday'] == True).astype(int)
data_test['IsHoliday'] = (data_test['IsHoliday'] == True).astype(int)

In [None]:
# Encode the store type as an integer: 'A' -> 1, 'B' -> 2, anything else -> 3.
train_type, test_type = data_train['Type'], data_test['Type']
data_train['Type'] = np.where(train_type == 'A', 1, np.where(train_type == 'B', 2, 3))
data_test['Type'] = np.where(test_type == 'A', 1, np.where(test_type == 'B', 2, 3))

In [None]:
# Model inputs: every training column except the raw date and the target.
excluded_columns = ('Date', 'Weekly_Sales')
features = [column for column in data_train.columns if column not in excluded_columns]

In [None]:
# Full design matrix and target.
X = data_train.loc[:, features].copy()
y = data_train['Weekly_Sales'].copy()

In [None]:
# 25% sample used for the quicker feature-importance and model-comparison runs.
# Seeded so the importances and RMSE figures are reproducible across runs.
data_sample = data_train.sample(frac=.25, random_state=0)
X_sample = data_sample[features].copy()
y_sample = data_sample.Weekly_Sales.copy()

In [None]:
# Hold out 15% of the sample for validation.
split_parts = model_selection.train_test_split(
    X_sample,
    y_sample,
    test_size=0.15,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# Default XGBoost regressor; it only serves as the model behind the permutation importances.
feat_model = xgb.XGBRegressor(random_state=0)
feat_model.fit(X_train, y_train)



In [None]:
!pip install eli5
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(feat_model, random_state=1).fit(X_valid, y_valid)
features = eli5.show_weights(perm, top=len(X_train.columns), feature_names = X_valid.columns.tolist())

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Execute this to save new versions of the notebook
# (commits the notebook to the Jovian project named below).
jovian.commit(project="walmart-sa-es")

[jovian] Detected Colab notebook...[0m
[jovian] Please enter your API key ( from https://jovian.ai/ ):[0m
API KEY: 

In [None]:
# Render the permutation-importance table for every feature.
feature_names = X_valid.columns.tolist()
n_features = len(X_train.columns)
features_weights = eli5.show_weights(perm, top=n_features, feature_names=feature_names)
features_weights

Weight,Feature
1.1403  ± 0.0157,Dept
0.3458  ± 0.0059,Size
0.0410  ± 0.0020,Store
0.0134  ± 0.0017,Week
0.0114  ± 0.0017,CPI
0.0088  ± 0.0023,Tranksgiving
0.0036  ± 0.0003,Type
0.0019  ± 0.0002,MarkDown3
0.0014  ± 0.0002,Days_to_Thansksgiving
0.0013  ± 0.0001,Unemployment


In [None]:
# Raw permutation importances as a Series indexed by feature name, largest first.
importance_by_feature = dict(zip(X_valid.columns.tolist(), perm.feature_importances_))
f_importances = pd.Series(importance_by_feature)
f_importances = f_importances.sort_values(ascending=False)
f_importances

Dept                     1.140262
Size                     0.345760
Store                    0.041045
Week                     0.013397
CPI                      0.011400
Tranksgiving             0.008782
Type                     0.003592
MarkDown3                0.001888
Days_to_Thansksgiving    0.001356
Unemployment             0.001289
IsHoliday                0.001257
Day                      0.000624
Temperature              0.000241
Christmas                0.000000
LaborDay                 0.000000
SuperBowlWeek            0.000000
Days_to_Christmas        0.000000
Month                    0.000000
Year                     0.000000
MarkDown4                0.000000
Fuel_Price               0.000000
MarkDown1                0.000000
MarkDown2                0.000000
MarkDown5                0.000000
MarkdownsSum             0.000000
dtype: float64

In [None]:
from io import StringIO

# Parse the HTML table rendered by eli5 back into a DataFrame.
weights = eli5.show_weights(perm, top=len(X_train.columns), feature_names=X_valid.columns.tolist())
# Passing literal HTML to read_html is deprecated since pandas 2.1; a file-like
# wrapper works on old and new versions alike.
result = pd.read_html(StringIO(weights.data))[0]
result

Unnamed: 0,Weight,Feature
0,1.1403 ± 0.0157,Dept
1,0.3458 ± 0.0059,Size
2,0.0410 ± 0.0020,Store
3,0.0134 ± 0.0017,Week
4,0.0114 ± 0.0017,CPI
5,0.0088 ± 0.0023,Tranksgiving
6,0.0036 ± 0.0003,Type
7,0.0019 ± 0.0002,MarkDown3
8,0.0014 ± 0.0002,Days_to_Thansksgiving
9,0.0013 ± 0.0001,Unemployment


# MODELLING

In [None]:
# Candidate regressors keyed by a display name (padded so the printed scores line up).
models = {}
models['    LGBM'] = lgb.LGBMRegressor(random_state=0)
models[' XGBoost'] = xgb.XGBRegressor(random_state=0, objective='reg:squarederror')
models['    HGBR'] = HistGradientBoostingRegressor(random_state=0)
models[' ExtraTr'] = ensemble.ExtraTreesRegressor(bootstrap=True, random_state=0)
models[' RandomF'] = ensemble.RandomForestRegressor(random_state=0)

In [None]:
def model_evaluation(name, model, models, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and return its validation RMSE.

    The model is fitted exactly once. Refitting the same estimator on the same
    data once per entry of ``models`` and averaging the resulting RMSEs only
    multiplies the run time; it does not make the estimate more robust.

    Parameters
    ----------
    name : str
        Display name of the model. Unused; kept so existing calls still work.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    models : Mapping
        Unused; kept so existing calls still work.
    X_train, y_train
        Training features and target passed to ``model.fit``.
    X_valid, y_valid
        Validation features passed to ``model.predict`` and the matching target.

    Returns
    -------
    numpy.float64
        Root mean squared error of the predictions on the validation split.
    """
    model.fit(X_train, y_train)
    y_preds = model.predict(X_valid)

    # Compare as plain arrays so the subtraction is positional and never
    # re-aligned on a pandas index.
    residuals = np.asarray(y_valid) - np.asarray(y_preds)
    return np.sqrt(np.mean(residuals ** 2))

In [None]:
# Score every candidate on the validation split and print one line per model.
for name, model in models.items():
    valid_rmse = model_evaluation(name, model, models, X_train, y_train, X_valid, y_valid)
    print(f'{name} Valid RMSE {valid_rmse:.4f}')

    LGBM Valid RMSE 6751.5386
 XGBoost Valid RMSE 11368.6662
    HGBR Valid RMSE 6842.5531
 ExtraTr Valid RMSE 5364.0854
 RandomF Valid RMSE 4393.2449


**RandomForest seems to be the best default baseline model, followed by ExtraTrees, but you can improve the scores of the boosting models with hyperparameter optimization. Also, for a more generalizable model, you can blend the best models at the end.**

Establish a baseline with the best model.

In [None]:
# Baseline: random forest on a reduced feature set, trained on 90% of the full data.
baseline_features = ['Store', 'Dept', 'IsHoliday', 'Size', 'Week', 'Type', 'Year', 'Day']
X_baseline = X[baseline_features].copy()
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X_baseline, y, random_state=0, test_size=0.1)
# Seed the forest like every other model in the notebook so the fitted model,
# and therefore the submission, is reproducible.
RF = ensemble.RandomForestRegressor(n_estimators=60, max_depth=25, min_samples_split=3,
                                    min_samples_leaf=1, random_state=0)
RF.fit(X_train, y_train)

RandomForestRegressor(max_depth=25, min_samples_split=3, n_estimators=60)

In [None]:
# Predict on the same eight columns the baseline forest was trained on.
# The matrix gets its own name: `test` is the raw test DataFrame loaded from
# disk, and rebinding it breaks any re-run of the cells that merge from it.
X_test = data_test[['Store','Dept','IsHoliday','Size','Week','Type','Year','Day']].copy()
predict_rf = RF.predict(X_test)

In [None]:
# Fill the submission template with the predictions (assigned positionally) and write it out.
submission_path = 'submission.csv'
sample_submission['Weekly_Sales'] = predict_rf
sample_submission.to_csv(submission_path, index=False)