Importing Libraries

In [1]:

import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from src.utils import check_missing_data
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from keras.models import load_model
from keras.losses import MeanSquaredError
from sklearn.preprocessing import MinMaxScaler
import pickle
from datetime import datetime
import joblib

Load data

In [2]:
# Load data
df_train = pd.read_csv(r'../data/train.csv', low_memory=False)
df_test = pd.read_csv(r'../data/test.csv', low_memory=False)
df_store = pd.read_csv(r'../data/store.csv', low_memory=False)

# Preprocessing the Data

checking for missing values

In [3]:
missing_data_df_store = check_missing_data(df_store)
print(missing_data_df_store)

                 Column Name  Missing Values  Percentage Missing
3        CompetitionDistance               3            0.269058
4  CompetitionOpenSinceMonth             354           31.748879
5   CompetitionOpenSinceYear             354           31.748879
7            Promo2SinceWeek             544           48.789238
8            Promo2SinceYear             544           48.789238
9              PromoInterval             544           48.789238


In [4]:
# Handle missing values
df_store['CompetitionDistance'].fillna(df_store['CompetitionDistance'].median(), inplace=True)
df_store['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
df_store['CompetitionOpenSinceYear'].fillna(0, inplace=True)
df_store['Promo2SinceWeek'].fillna(0, inplace=True)
df_store['Promo2SinceYear'].fillna(0, inplace=True)
df_store['PromoInterval'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_store['CompetitionDistance'].fillna(df_store['CompetitionDistance'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_store['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work

In [5]:
# checking for missing values
missing_data_df_store = check_missing_data(df_store)
print(missing_data_df_store)

Success: No missing values.


In [6]:
# Remove leading and trailing whitespaces
df_train['StateHoliday'] = df_train['StateHoliday'].str.strip()

In [7]:
# checking for missing values
missing_data_df_train = check_missing_data(df_train)
print(missing_data_df_train)

Success: No missing values.


In [8]:
# Handle missing values
df_train['StateHoliday'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['StateHoliday'].fillna(0, inplace=True)


In [9]:
# checking for missing values
missing_data_df_train = check_missing_data(df_train)
print(missing_data_df_train)

Success: No missing values.


In [10]:
missing_data_df_test = check_missing_data(df_test)
print(missing_data_df_test)

  Column Name  Missing Values  Percentage Missing
4        Open              11            0.026772


In [11]:
# Handle missing values
df_test['Open'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Open'].fillna(0, inplace=True)


In [12]:
# checking for missing values
missing_data_df_test = check_missing_data(df_test)
print(missing_data_df_test)

Success: No missing values.


Converting Non-Numeric Columns to Numeric

In [13]:
df_store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,0.0,0.0,
4,5,a,a,29910.0,4.0,2015.0,0,0.0,0.0,


In [15]:
# Convert non-numeric columns to numeric
df_store['StoreType'] = df_store['StoreType'].astype('category').cat.codes
df_store['Assortment'] = df_store['Assortment'].astype('category').cat.codes
df_store['PromoInterval'] = df_store['PromoInterval'].astype('category').cat.codes
df_train['StateHoliday'] = df_train['StateHoliday'].astype('category').cat.codes
df_test['StateHoliday'] = df_test['StateHoliday'].astype('category').cat.codes

In [16]:
df_store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,2,0,1270.0,9.0,2008.0,0,0.0,0.0,0
1,2,0,0,570.0,11.0,2007.0,1,13.0,2010.0,2
2,3,0,0,14130.0,12.0,2006.0,1,14.0,2011.0,2
3,4,2,2,620.0,9.0,2009.0,0,0.0,0.0,0
4,5,0,0,29910.0,4.0,2015.0,0,0.0,0.0,0


In [17]:
df_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [18]:
df_test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


Generating New Features

In [19]:
# Extract features from the date column in the test dataset
df_test['Date'] = pd.to_datetime(df_test['Date'])
df_test['Year'] = df_test['Date'].dt.year
df_test['Month'] = df_test['Date'].dt.month
df_test['Day'] = df_test['Date'].dt.day
df_test['WeekOfYear'] = df_test['Date'].dt.isocalendar().week
df_test['DayOfWeek'] = df_test['Date'].dt.dayofweek

In [21]:

df_test['DayOfWeek'] = df_train['DayOfWeek'].astype('category').cat.codes

In [22]:
# Extract features from the date column in the test dataset
df_train['Date'] = pd.to_datetime(df_train['Date'])
df_train['Year'] = df_train['Date'].dt.year
df_train['Month'] = df_train['Date'].dt.month
df_train['Day'] = df_train['Date'].dt.day
df_train['WeekOfYear'] = df_train['Date'].dt.isocalendar().week
df_train['DayOfWeek'] = df_train['Date'].dt.dayofweek

In [23]:
df_train['DayOfWeek'] = df_train['DayOfWeek'].astype('category').cat.codes

Merge the store data with train and test data

In [24]:
#  Merge the store data with train and test data
df_train = df_train.merge(df_store, on='Store', how='left')
df_test = df_test.merge(df_store, on='Store', how='left')


In [25]:
df_train_copy = df_train.copy()


In [26]:
# Drop unnecessary columns
df_train.drop(['Date'], axis=1, inplace=True)
df_test.drop(['Date'], axis=1, inplace=True)

In [27]:
# checking for non-numeric columns
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 21 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   DayOfWeek                  1017209 non-null  int8   
 2   Sales                      1017209 non-null  int64  
 3   Customers                  1017209 non-null  int64  
 4   Open                       1017209 non-null  int64  
 5   Promo                      1017209 non-null  int64  
 6   StateHoliday               1017209 non-null  int8   
 7   SchoolHoliday              1017209 non-null  int64  
 8   Year                       1017209 non-null  int32  
 9   Month                      1017209 non-null  int32  
 10  Day                        1017209 non-null  int32  
 11  WeekOfYear                 1017209 non-null  UInt32 
 12  StoreType                  1017209 non-null  int8   
 13  Assortment  

Define feature matrix X and target y

In [28]:
# Define feature matrix X and target y
X = df_train.drop(['Sales', 'Customers'], axis=1)
y = df_train['Sales']

# Split the data into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Scaling data

In [29]:
# Scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


In [31]:
X_train

array([[-6.66209938e-01,  1.50336285e+00, -2.21121666e+00, ...,
         1.52694010e-01,  9.97634270e-01,  9.41509822e-01],
       [-1.39928275e+00,  1.50336285e+00, -2.21121666e+00, ...,
        -7.60559561e-01, -1.00161903e+00, -9.06630382e-01],
       [-8.05990771e-01,  5.01980696e-01,  4.52239718e-01, ...,
        -7.60559561e-01, -1.00161903e+00, -9.06630382e-01],
       ...,
       [-6.07191364e-01,  1.00267177e+00,  4.52239718e-01, ...,
        -7.60559561e-01, -1.00161903e+00, -9.06630382e-01],
       [ 4.73780411e-01,  1.28961791e-03,  4.52239718e-01, ...,
        -4.34397572e-01,  9.96640111e-01,  9.41509822e-01],
       [-4.17710679e-01, -1.50078362e+00,  4.52239718e-01, ...,
         1.84873636e+00,  9.97634270e-01,  9.41509822e-01]])

Building Models with Sklearn Pipelines

In [32]:
# Build Random Forest Regressor model in a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse:.2f}')


Mean Squared Error: 676802.21


Post-Prediction Analysis

In [33]:
# Get feature importance from the Random Forest model
importances = pipeline.named_steps['model'].feature_importances_
features = X.columns
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                      Feature  Importance
2                        Open    0.459827
12        CompetitionDistance    0.107883
0                       Store    0.095762
3                       Promo    0.073461
14   CompetitionOpenSinceYear    0.040378
1                   DayOfWeek    0.038660
13  CompetitionOpenSinceMonth    0.035552
8                         Day    0.025746
9                  WeekOfYear    0.022147
10                  StoreType    0.019131
17            Promo2SinceYear    0.017532
11                 Assortment    0.016217
16            Promo2SinceWeek    0.015398
7                       Month    0.009839
6                        Year    0.008875
18              PromoInterval    0.007731
5               SchoolHoliday    0.003315
15                     Promo2    0.001709
4                StateHoliday    0.000838


Serialize Models

In [34]:
# Save the model with a timestamp
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
filename = f'rf_model_{timestamp}.pkl'
with open(filename, 'wb') as file:
    pickle.dump(pipeline, file)

Building model with deep learning

Deep Learning with LSTM

Prepare Data for LSTM

In [35]:
# Scaling the features for LSTM model
scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Reshape data for LSTM (samples, time steps, features)
X_train_lstm = np.reshape(X_train_scaled, (X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_lstm = np.reshape(X_val_scaled, (X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))


In [36]:
X_train_lstm

array([[[-3.84201077e-01,  1.00000000e+00, -1.00000000e+00, ...,
         -4.40000000e-01,  9.96029777e-01,  3.33333333e-01]],

       [[-8.07899461e-01,  1.00000000e+00, -1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e+00, -1.00000000e+00]],

       [[-4.64991023e-01,  3.33333333e-01,  1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e+00, -1.00000000e+00]],

       ...,

       [[-3.50089767e-01,  6.66666667e-01,  1.00000000e+00, ...,
         -1.00000000e+00, -1.00000000e+00, -1.00000000e+00]],

       [[ 2.74685817e-01, -1.48102017e-16,  1.00000000e+00, ...,
         -8.00000000e-01,  9.95037221e-01,  3.33333333e-01]],

       [[-2.40574506e-01, -1.00000000e+00,  1.00000000e+00, ...,
          6.00000000e-01,  9.96029777e-01,  3.33333333e-01]]])

Building LSTM Model

In [37]:
# Build LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(50, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm_model.add(LSTM(50))
lstm_model.add(Dense(1))

# Compile the model
lstm_model.compile(optimizer='adam', loss='mse')

# Fit the model
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=64, validation_data=(X_val_lstm, y_val))

# Predict using the LSTM model
y_pred_lstm = lstm_model.predict(X_val_lstm)


  super().__init__(**kwargs)


Epoch 1/10
[1m12716/12716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 3ms/step - loss: 46604644.0000 - val_loss: 42444996.0000
Epoch 2/10
[1m12716/12716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 3ms/step - loss: 41251316.0000 - val_loss: 37384196.0000
Epoch 3/10
[1m12716/12716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - loss: 36231848.0000 - val_loss: 32725350.0000
Epoch 4/10
[1m12716/12716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - loss: 31800756.0000 - val_loss: 28475228.0000
Epoch 5/10
[1m12716/12716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 4ms/step - loss: 27635218.0000 - val_loss: 24629802.0000
Epoch 6/10
[1m12716/12716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 4ms/step - loss: 23819974.0000 - val_loss: 21188460.0000
Epoch 7/10
[1m12716/12716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 4ms/step - loss: 20492472.0000 - val_loss: 18151834.0000
Epoch 8/10
[1m12716/12716