In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

import plotly.express as px
import matplotlib
import plotly as plt
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings('ignore')


# IMPORT DATASET

In [2]:
df=pd.read_csv('Walmart_Store_sales.csv')
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


# Explore dataset

In [3]:
#Basic stats
print("Number of rows : {}".format(df.shape[0]))
print("Number of columns : {}".format(df.shape[1]))
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

Number of rows : 150
Number of columns : 8

Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

## Dropping null values in our target "weekly sales"

In [4]:
#We check how many rows with null value as target y (Weekly_Sales)
rows_to_drop=df['Weekly_Sales'].isnull().sum()
print(f"We have {rows_to_drop} null values in our target")

We have 14 null values in our target


In [5]:
# We drop these rows
df=df.dropna(subset = ['Weekly_Sales'])
df.Weekly_Sales.isnull().sum()

0

In [6]:
#Let's visualize the shape of our new dataset
df.shape

(136, 8)

# Study outliers

In [7]:
# We fix as outlier all the values up to the mean + 3 times the standard deviation for each feature
max_Temp=df['Temperature'].mean() + 3*df['Temperature'].std()
max_Fuel=df['Fuel_Price'].mean() + 3*df['Fuel_Price'].std()
max_CPI=df['CPI'].mean() + 3*df['CPI'].std()
max_Unempl=df['Unemployment'].mean() + 3*df['Unemployment'].std()
print('we can consider as outliers, values up to: ')
print(f'Temperature = {max_Temp}, Fuel_Price = {max_Fuel}, CPI = {max_CPI}, Unemployment = {max_Unempl}')

we can consider as outliers, values up to: 
Temperature = 116.39726255188084, Fuel_Price = 4.7556126415753965, CPI = 298.82045814693976, Unemployment = 12.523867092274017


In [8]:
# We fix as outlier all the values less than the mean + 3 times the standard deviation for each feature
min_Temp=df['Temperature'].mean() - 3*df['Temperature'].std()
min_Fuel=df['Fuel_Price'].mean() - 3*df['Fuel_Price'].std()
min_CPI=df['CPI'].mean() - 3*df['CPI'].std()
min_Unempl=df['Unemployment'].mean() - 3*df['Unemployment'].std()
print('we can consider as outliers, values less than: ')
print(f'Temperature = {min_Temp}, Fuel_Price = {min_Fuel}, CPI = {min_CPI}, Unemployment = {min_Unempl}')

we can consider as outliers, values less than: 
Temperature = 5.310671332416675, Fuel_Price = 1.8783712293923458, CPI = 57.36182994106019, Unemployment = 2.807296842152213


In [9]:
# Outliers visualization
plot_rows=1
plot_cols= 4
fig = make_subplots(rows=plot_rows, cols=plot_cols)

# add traces
variables=['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
for var in variables:
    fig.add_trace(go.Box(y=df[var], name=var, boxpoints="all"), row=1, col=variables.index(var)+1)


# Format and show fig
fig.update_layout(height=800, width=900, title_text="Outliers visualization with Boxplots", title_x=0.5,plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                        font=dict(color='#8a8d93'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)
fig.show()


In [10]:
# Dropping outliers in Unemployment fearure (using masks) (we keep the null values here)
print('Dropping outliers...')
to_keep_Unemp = (df['Unemployment'] < (df['Unemployment'].mean() + 3*df['Unemployment'].std())) | (df['Unemployment'].isnull())

Dropping outliers...


In [11]:
df = df.loc[to_keep_Unemp,:].reset_index(drop=True)
print('Done. Number of lines remaining : ', df.shape[0])
print()

Done. Number of lines remaining :  131



## EDA

In [12]:
#We will split this column by creating column for each: day, year, month....
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.dayofweek
df=df.drop(columns=['Date'])
df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,DayOfWeek
0,6.0,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
2,11.0,1244390.03,0.0,84.57,,214.556497,7.346,,,,
3,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
4,4.0,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


In [13]:
# We convert our temperature column to celsus degrees // we create a function
def fahr_to_celsius(temp_fahr):
    """Convert Fahrenheit to Celsius
    Return Celsius conversion of input"""
    temp_celsius = (temp_fahr - 32) * 5 / 9
    return temp_celsius

In [14]:
df.Temperature=fahr_to_celsius(df["Temperature"])
df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,DayOfWeek
0,6.0,1572117.54,,15.338889,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,1807545.43,0.0,5.766667,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
2,11.0,1244390.03,0.0,29.205556,,214.556497,7.346,,,,
3,6.0,1644470.66,0.0,26.05,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
4,4.0,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


In [15]:
#Let's visualize Weekly sales per store
sales_per_store=df["Weekly_Sales"].groupby(df["Store"]).sum().round(1).rename_axis('Store').reset_index(name='Count')

In [16]:
department = px.funnel(sales_per_store, x='Store', y='Count',
                       height=300, title='Weekly Sales per Store',
                       color_discrete_sequence=['#008B8B']
                       )
department.update_traces(textposition='auto', textfont=dict(color='#fff'))
department.update_layout(autosize=True,
                         margin=dict(t=110, b=50, l=70, r=40),
                         title_x=0.5,
                         xaxis_title=' ', yaxis_title=" ",
                         plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                         title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                         font=dict(color='#8a8d93'),
                         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
                        )

In [17]:
#Let's visualize total sales per Day of week
sales_per_Day=df.groupby(['Store', 'DayOfWeek'])['Weekly_Sales'].sum().reset_index(name='Count').sort_values(by='DayOfWeek', ascending=True)
sales_per_Day['Day of week']=sales_per_Day['DayOfWeek'].apply(lambda x: 'Monday' if x==0
                                                else 'Tuesday' if x==1
                                                else 'Wendsday' if x==2
                                                else 'Thursday' if x==3
                                                else 'Friday' if x==4
                                                else 'Saurday' if x==5
                                                else 'Sunday'

)

In [18]:
total_sales=df["Weekly_Sales"].groupby(df["DayOfWeek"]).sum().round(1).rename_axis('Day').reset_index(name='Total')

In [19]:
fig = px.funnel(total_sales, x=total_sales.Day, y=total_sales.Total,
                       height=300, title='Weekly Sales per Day of week',
                       color_discrete_sequence=['#008B8B']
                       )
fig.update_traces(textposition='auto', textfont=dict(color='#fff'))
fig.update_traces(marker = {'line': {'width': [1, 1,1, 3,1,1, 1], 'color': ['gray', 'gray', 'gray','blue','gray','gray','gray']}})
fig.update_layout(autosize=True,
                         margin=dict(t=110, b=50, l=70, r=40),
                         title_x=0.5,
                         xaxis_title=' ', yaxis_title=" ",
                         plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                         title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                         font=dict(color='#8a8d93'),
                         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
                        )

In [20]:
#Let's visualize total sales per store per Day of week
fig = px.histogram(sales_per_Day, x='Store',y='Count', color='Day of week', nbins=20,text_auto=True,
                         title='Sales per Store per Day of week', height=300,
                         color_discrete_sequence=['#DB6574', '#03DAC5','#8B0A50','#008B8B','#FFD700','#483D8B'],
                         )

fig.update_yaxes(showgrid=True, gridcolor="#008B8B")
fig.update_layout(margin=dict(t=100, b=0, l=70, r=40),
                        xaxis_title='Store', yaxis_title="Count of Sales ",title_x=0.5,
                        height = 500, width = 1200,
                        plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                        title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                        font=dict(color='#8a8d93'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
                        xaxis = dict(
                        tick0 = 0,
                        dtick = 1
   ))
fig.show()

In [21]:
sales_per_Month=df.groupby(['Store', 'Month'])['Weekly_Sales'].sum().reset_index(name='Count').sort_values(by='Month', ascending=True)

In [22]:
# Visualization of sales per month per Store
sales_per_Month=df.groupby(['Store', 'Month'])['Weekly_Sales'].sum().reset_index(name='Count').sort_values(by='Month', ascending=True)
fig = px.histogram(sales_per_Month, x='Store',y='Count', color='Month', nbins=20,
                         title='Sales per Store per Month', height=300,
                         color_discrete_sequence=['#DB6574', '#03DAC5','#8B0A50','#008B8B','#FFD700','#483D8B'],
                         )

fig.update_yaxes(showgrid=True, gridcolor="#008B8B")
fig.update_layout(margin=dict(t=100, b=0, l=70, r=40),
                        xaxis_title='Store', yaxis_title="Count of Sales ",title_x=0.5,
                        height = 500, width = 1200,
                        plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                        title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                        font=dict(color='#8a8d93'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
                        xaxis = dict(
                        tick0 = 0,
                        dtick = 1
   ))

fig.show()

In [23]:
sales_temp=df.groupby(['Temperature'])['Weekly_Sales'].sum().reset_index(name='Count').sort_values(by='Temperature', ascending=True)

In [24]:
#Let's visualize is sales are impacted by temperature
fig = px.histogram(sales_temp, x='Temperature',y='Count',nbins=40,text_auto=True,
                         title='Sales vs Temperature', height=400, width=800,color_discrete_sequence=['#800000']
                       )
fig.update_traces(textposition='auto', textfont=dict(color='#fff'))
fig.update_layout(autosize=True,
                         margin=dict(t=110, b=50, l=70, r=40),
                         title_x=0.5,
                         xaxis_title=' ', yaxis_title=" ",
                         plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                         title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                         font=dict(color='#8a8d93'),
                         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),xaxis = dict(
                        tick0 = 0,
                        dtick = 5
                        ))
fig.show()

In [25]:
# Visualize pairwise dependencies
fig = px.scatter_matrix(df, color_discrete_sequence=['#800000'])
fig.update_layout(plot_bgcolor='#8B7D7B', paper_bgcolor='#8B7D7B',
        title = go.layout.Title(text = "Bivariate analysis", x = 0.5), showlegend = False,
            autosize=False, height = 1200, width = 1200,
            title_font=dict(size=25, color='#5E2612', family="Muli, sans-serif"),
                        font=dict(color='#5E2612'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
            )

fig.show()

In [26]:
# Visualize pairwise dependencies without colums day/month/year/day-of-week

# We create a dataframe with just some columns
first_df=df.iloc[:,:7]
first_df.head()

fig = px.scatter_matrix(first_df, color_discrete_sequence=['#800000'])
fig.update_layout(plot_bgcolor='#8B7D7B', paper_bgcolor='#8B7D7B',
        title = go.layout.Title(text = "Bivariate analysis", x = 0.5), showlegend = False,
            autosize=False, height = 1200, width = 1200,
            title_font=dict(size=25, color='#5E2612', family="Muli, sans-serif"),
                        font=dict(color='#5E2612'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
            )
fig.show()

In [27]:
# Univariate analysis
# Distribution of each numeric variable
num_features = ['Weekly_Sales', 'Fuel_Price', 'Temperature', 'CPI', 'Unemployment']
for i in range(len(num_features)):
    fig = px.histogram(df[num_features[i]], nbins=20, color_discrete_sequence=['#008B8B'])
    fig.update_layout( yaxis_title="Total",plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
    xaxis_title=num_features[i],height = 400, width = 800,
    title="Univariate analysis",
    title_x=0.5,
    title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                        font=dict(color='#8a8d93'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1.7))

    fig.show()



In [28]:
# Correlation matrix
corr_matrix = df.corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()


## Pre Processing

In [29]:
# Separate target variable Y from features X
target_name = 'Weekly_Sales'
features_list = ["Store","Holiday_Flag","Temperature","Fuel_Price","CPI","Unemployment","Year","Month","Day","DayOfWeek"]

print("Separating labels from features...")
Y = df.loc[:,target_name]
X = df.drop(target_name, axis = 1) # All columns are kept, except the target
print("...Done.")
print(Y.head())
print()
print(X.head())
print()


Separating labels from features...
...Done.
0    1572117.54
1    1807545.43
2    1244390.03
3    1644470.66
4    1857533.70
Name: Weekly_Sales, dtype: float64

   Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0    6.0           NaN    15.338889       3.045  214.777523         6.858   
1   13.0           0.0     5.766667       3.435  128.616064         7.470   
2   11.0           0.0    29.205556         NaN  214.556497         7.346   
3    6.0           0.0    26.050000       2.759  212.412888         7.092   
4    4.0           0.0          NaN       2.756  126.160226         7.896   

     Year  Month   Day  DayOfWeek  
0  2011.0    2.0  18.0        4.0  
1  2011.0    3.0  25.0        4.0  
2     NaN    NaN   NaN        NaN  
3  2010.0    5.0  28.0        4.0  
4  2010.0    5.0  28.0        4.0  



In [30]:
# Divide dataset Train set & Test set
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [31]:
#Categorical variables : Store, Holiday_Flag, DayOfWeek
#Numerical variables : Temperature, Fuel_Price, CPI, Unemployment, Year, Month, Day
#Create pipeline for numeric features
numeric_features = ['Year', 'Month','Day', 'DayOfWeek','Temperature', 'Fuel_Price', 'CPI', 'Unemployment'] # Names of numeric columns in X_train/X_test

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),# missing values will be replaced by columns' median
    ('scaler', StandardScaler())
])


In [32]:
# Create pipeline for categorical features
categorical_features = ['Store', 'Holiday_Flag'] # Names of categorical columns in X_train/X_test
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

In [33]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [34]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()


Performing preprocessings on train set...
     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
118    4.0           NaN    29.216667       3.469  129.112500         5.644   
68     1.0           0.0    16.805556       3.308  218.220509         7.866   
16    18.0           0.0    -5.927778       2.788  131.527903         9.202   
96    18.0           0.0    20.622222       2.906  132.293936           NaN   
122    5.0           0.0    16.872222         NaN  212.560411         6.768   

       Year  Month   Day  DayOfWeek  
118  2011.0    8.0   7.0        6.0  
68   2011.0   11.0  18.0        4.0  
16      NaN    NaN   NaN        NaN  
96   2010.0    5.0  28.0        4.0  
122  2010.0   12.0  11.0        5.0  
...Done.
[[ 0.17418541  0.3878969  -1.200365    1.86210883  1.46315567  0.30806957
  -1.35931117 -1.84213463  0.          0.          1.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0

## Baseline Model

In [35]:
#Baseline Model: Linear Regression =>Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


In [36]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[2203436.37058301 1589845.9029752  1168578.90892836 1247509.51375566
  358692.13624493  831159.3616307  2095391.84683626 2011185.13613192
  416750.99825711 2122192.02920989 1418775.16463356  462580.20079644
 1979164.34501945 1549997.42751573 1447904.7753018  1984436.57528517
 2080965.79574837 2016221.82613351 1624103.68638465 1911079.67828424
 2009702.40028624 1789035.3176782  2012988.64805285  606152.35533527
 1554555.91531025  598468.80583955 1473869.95625612  531566.75184412
  451311.15330596 2012778.36896733 1320744.92070881 1010069.04891472
 1871025.30133024 2056018.08196474 1392003.614203   1906140.50306398
 1968577.69992149 1031787.99507656  479252.34284208  283881.87885134
 2003067.48584084 1975921.30781428 1546512.84534952  628300.50710952
 1618398.51283338 1487412.65243573  166808.95444565 1549497.64959907
 2093930.98889249  631992.66462026 2059331.82426486  836519.18062998
  379481.49298465  987601.90019882 1988757.12717957 1394829.691

In [37]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[ 411669.53198309  505427.44622631 1055504.33425131  940129.63229267
  325061.06510809 1881541.01700659 1459888.4109778  1041567.72587801
 1144936.96302564  896178.73795387 1834081.61462104 2385937.05010301
 1350084.2035416   749668.30784882 1193778.46666477  314665.58721801
 2290848.78865036  340081.59772004 1575001.63574422 1948974.42628948
  892710.04613905  453486.31672932  290048.22189393 1990406.31054897
 2001968.92017196 1682119.92552838  123083.5610687 ]



In [38]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.96900604569718
R2 score on test set :  0.9384398798887249


In [39]:
diff_scores=round((r2_score(Y_train, Y_train_pred))-(r2_score(Y_test, Y_test_pred)),4)
diff_scores

0.0306

In [40]:
# We obtain a R2 > on Train set
print("Let's check is this following difference between Train and Test score is significant {}".format(diff_scores))

Let's check is this following difference between Train and Test score is significant 0.0306


## Cross Validation

In [41]:
print("3-fold cross-validation...")
scores = cross_val_score(regressor, X_train, Y_train, cv=5)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.9326785700959924
The standard deviation is :  0.023274322998646187


In [42]:
print('The difference between R2 score on train and test is : {}'.format(diff_scores))
print('Which is more than the standard deviation found before, so Yes, we overfitt!')

The difference between R2 score on train and test is : 0.0306
Which is more than the standard deviation found before, so Yes, we overfitt!


In [43]:
# We have a look to the coefficient of our model
regressor.coef_

array([  -35393.12400149,    37030.35187209,   -28117.86944173,
          -2354.55531401,   -29889.63488091,   -29972.25858323,
          92112.1066835 ,   -94254.86346719,   365545.57972288,
       -1233006.69806167,   656944.99253022, -1395973.31665377,
         -54136.32064882,  -888385.53196513,  -848966.90453811,
       -1276808.31256484,   684020.96628748,    42586.24230567,
         581400.17334717,   675336.35568495,  -663814.80220216,
       -1111788.0573481 ,  -624940.95250282,  -133201.60726769,
          94627.95922619,   377686.17189414,   -73525.48935808])

In [44]:
preprocessor.transformers_

[('num',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                  ('scaler', StandardScaler())]),
  ['Year',
   'Month',
   'Day',
   'DayOfWeek',
   'Temperature',
   'Fuel_Price',
   'CPI',
   'Unemployment']),
 ('cat',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                  ('encoder', OneHotEncoder(drop='first'))]),
  ['Store', 'Holiday_Flag'])]

In [45]:
# We need to get back our feature names (before pre processing)
column_names = []
for name, pipeline, features_list in preprocessor.transformers_: # loop over pipelines
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names

print("Names of columns corresponding to each coefficient: ", column_names)


Names of columns corresponding to each coefficient:  ['Year', 'Month', 'Day', 'DayOfWeek', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0', 'x0_6.0', 'x0_7.0', 'x0_8.0', 'x0_9.0', 'x0_10.0', 'x0_11.0', 'x0_13.0', 'x0_14.0', 'x0_15.0', 'x0_16.0', 'x0_17.0', 'x0_18.0', 'x0_19.0', 'x0_20.0', 'x1_1.0']


In [46]:
regressor.coef_.shape

(27,)

In [47]:
# Create a pandas DataFrame
coefs_Lin = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
coefs_Lin

Unnamed: 0,coefficients
Year,-35393.12
Month,37030.35
Day,-28117.87
DayOfWeek,-2354.555
Temperature,-29889.63
Fuel_Price,-29972.26
CPI,92112.11
Unemployment,-94254.86
x0_2.0,365545.6
x0_3.0,-1233007.0


In [48]:
# Compute abs() and sort values
feature_importance = abs(coefs_Lin).sort_values(by ='coefficients')
feature_importance.sort_values(by ='coefficients',ascending=False)

Unnamed: 0,coefficients
x0_5.0,1395973.0
x0_9.0,1276808.0
x0_3.0,1233007.0
x0_16.0,1111788.0
x0_7.0,888385.5
x0_8.0,848966.9
x0_10.0,684021.0
x0_14.0,675336.4
x0_15.0,663814.8
x0_4.0,656945.0


In [49]:
# We visualize the feature important
fig = px.bar(feature_importance)
fig.show()

## Fighting overfitting

RIDGE

In [50]:
#PFight overfitting=>Ridge
Ridge1 = Ridge()

Ridge1.fit(X_train, Y_train)
# Print R^2 scores
print("R2 score on training set : ", Ridge1.score(X_train, Y_train))
print("R2 score on test set : ", Ridge1.score(X_test, Y_test))


R2 score on training set :  0.9290743181924257
R2 score on test set :  0.9075293388096283


LASSO

In [51]:
#Fight overfitting =>Lasso
Lasso1 = Lasso()

Lasso1.fit(X_train, Y_train)
# Print R^2 scores
print("R2 score on training set : ", Lasso1.score(X_train, Y_train))
print("R2 score on test set : ", Lasso1.score(X_test, Y_test))


R2 score on training set :  0.9690060431967218
R2 score on test set :  0.9384563972265778


In [52]:
# Perform grid search with Ridge
print("Grid search...")
regressorR = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.0, 0.1, 0.5, 1.0, 2.0,2.5, 3.0, 4.0] # 0 corresponds to no regularization
}
gridsearchR = GridSearchCV(regressorR, param_grid = params, cv=10) # cv : the number of folds to be used for CV
gridsearchR.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearchR.best_params_)
print("Best R2 score : ", gridsearchR.best_score_)


Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.0}
Best R2 score :  0.9298425920618832


In [53]:
# Perform grid search with Lasso
print("Grid search...")
regressorL = Lasso()
# Grid of values to be tested
params = {
    'alpha': [0.0, 0.1, 0.5, 5, 10, 13,15, 20, 30,50,100,200,300,400,500,600,700,800,900,1000, 1200] # 0 corresponds to no regularization
}
gridsearchL = GridSearchCV(regressorL, param_grid = params, cv=10) # cv : the number of folds to be used for CV
gridsearchL.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearchL.best_params_)
print("Best R2 score : ", gridsearchL.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 700}
Best R2 score :  0.9315461178517195


In [54]:
#Last train with Lasso and our best parameter
Lasso_last = Lasso(alpha=700)

Lasso_last.fit(X_train, Y_train)
# Print R^2 scores
print("R2 score on training set : ", Lasso_last.score(X_train, Y_train))
print("R2 score on test set : ", Lasso_last.score(X_test, Y_test))

R2 score on training set :  0.9678361285378274
R2 score on test set :  0.9465228410570303


In [55]:
diff=(Lasso_last.score(X_train, Y_train))-(Lasso_last.score(X_test, Y_test))
diff

0.021313287480797083

In [56]:
print("this difference is not significant, we are under the standard deviation calculated before of 0.023274")

this difference is not significant, we are under the standard deviation calculated before of 0.023274


## Conclusion

We can keep a linear regression using ridge or lasso with the optimal hyparameter defined by our grid search but we still semm to be on overfittig
It would be interesting to have more information to upgrade our model like the store size, the marketing budget, the location area...etc.
We can also continue to tune our models to get better results