Importing Libraries

In [1]:

import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from src.utils import check_missing_data
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from keras.models import load_model
from keras.losses import MeanSquaredError
from sklearn.preprocessing import MinMaxScaler
import pickle
from datetime import datetime
import joblib

Load data

In [2]:
# Load data
df_train = pd.read_csv(r'../data/train.csv', low_memory=False)
df_test = pd.read_csv(r'../data/test.csv', low_memory=False)
df_store = pd.read_csv(r'../data/store.csv', low_memory=False)

checking for missing values

In [3]:
missing_data_df_store = check_missing_data(df_store)
print(missing_data_df_store)

                 Column Name  Missing Values  Percentage Missing
3        CompetitionDistance               3            0.269058
4  CompetitionOpenSinceMonth             354           31.748879
5   CompetitionOpenSinceYear             354           31.748879
7            Promo2SinceWeek             544           48.789238
8            Promo2SinceYear             544           48.789238
9              PromoInterval             544           48.789238


In [4]:
# Handle missing values
df_store['CompetitionDistance'].fillna(df_store['CompetitionDistance'].median(), inplace=True)
df_store['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
df_store['CompetitionOpenSinceYear'].fillna(0, inplace=True)
df_store['Promo2SinceWeek'].fillna(0, inplace=True)
df_store['Promo2SinceYear'].fillna(0, inplace=True)
df_store['PromoInterval'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_store['CompetitionDistance'].fillna(df_store['CompetitionDistance'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_store['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work

In [5]:
# checking for missing values
missing_data_df_store = check_missing_data(df_store)
print(missing_data_df_store)

Success: No missing values.


In [6]:
# Remove leading and trailing whitespaces
df_train['StateHoliday'] = df_train['StateHoliday'].str.strip()

In [7]:
# checking for missing values
missing_data_df_train = check_missing_data(df_train)
print(missing_data_df_train)

Success: No missing values.


In [8]:
# Handle missing values
df_train['StateHoliday'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['StateHoliday'].fillna(0, inplace=True)


In [9]:
# checking for missing values
missing_data_df_train = check_missing_data(df_train)
print(missing_data_df_train)

Success: No missing values.


In [10]:
missing_data_df_test = check_missing_data(df_test)
print(missing_data_df_test)

  Column Name  Missing Values  Percentage Missing
4        Open              11            0.026772


In [11]:
# Handle missing values
df_test['Open'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Open'].fillna(0, inplace=True)


In [12]:
# checking for missing values
missing_data_df_test = check_missing_data(df_test)
print(missing_data_df_test)

Success: No missing values.


Converting Non-Numeric Columns to Numeric

In [13]:
df_store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,0.0,0.0,
4,5,a,a,29910.0,4.0,2015.0,0,0.0,0.0,


In [15]:
# Convert non-numeric columns to numeric
df_store['StoreType'] = df_store['StoreType'].astype('category').cat.codes
df_store['Assortment'] = df_store['Assortment'].astype('category').cat.codes
df_store['PromoInterval'] = df_store['PromoInterval'].astype('category').cat.codes
df_train['StateHoliday'] = df_train['StateHoliday'].astype('category').cat.codes
df_test['StateHoliday'] = df_test['StateHoliday'].astype('category').cat.codes

In [16]:
df_store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,2,0,1270.0,9.0,2008.0,0,0.0,0.0,0
1,2,0,0,570.0,11.0,2007.0,1,13.0,2010.0,2
2,3,0,0,14130.0,12.0,2006.0,1,14.0,2011.0,2
3,4,2,2,620.0,9.0,2009.0,0,0.0,0.0,0
4,5,0,0,29910.0,4.0,2015.0,0,0.0,0.0,0


In [17]:
df_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [18]:
df_test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0
