### Importing the necessary libraries

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,cross_val_predict, KFold,train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

First, we set the paths to the processed data files.

In [2]:
# Locate the processed train/test CSV files under ../data/processed.
processed_data_path = os.path.join(os.path.pardir, "data", "processed")
processed_train_file_path, processed_test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ("processed_train_store_df.csv", "processed_test_store_df.csv")
)

Next, we import the data.

In [3]:
# Load the processed train and test CSVs into DataFrames.
processed_train_store = pd.read_csv(processed_train_file_path)
processed_test_store = pd.read_csv(processed_test_file_path)

Let's have a look at what our data looks like.

In [4]:
# Preview the first five rows of the training frame.
processed_train_store.head()

Unnamed: 0,Store,Date,day,month,year,DayOfWeek,WeekOfYear,Sales,Customers,Open,...,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,SalePerCustomer
0,1,2015-07-31,31,7,2015,5,31,5263,555,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.482883
1,1,2015-07-30,30,7,2015,4,31,5020,546,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.194139
2,1,2015-07-29,29,7,2015,3,31,4782,523,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.143403
3,1,2015-07-28,28,7,2015,2,31,5011,560,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,8.948214
4,1,2015-07-27,27,7,2015,1,31,6102,612,1,...,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0,9.970588


In [5]:
# Preview the first two rows of the test frame.
processed_test_store.head(2)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,day,month,year,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1.0,1,o,0,17,9,2015,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0
1,857,1,3,2015-09-16,1.0,1,o,0,16,9,2015,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0.0


In [6]:
# Print the (rows, columns) shape of each frame.
print("Shape of processed_train_store data is: ",processed_train_store.shape)
print("Shape of processed_test_store data is: ",processed_test_store.shape)

Shape of processed_train_store data is:  (844338, 23)
Shape of processed_test_store data is:  (41088, 20)


In [7]:
# Tabulate the dtype of every training column as a one-column frame for display.
processed_train_store_dtypes = processed_train_store.dtypes.to_frame(name="Data_type")
processed_train_store_dtypes

Unnamed: 0,Data_type
Store,int64
Date,object
day,int64
month,int64
year,int64
DayOfWeek,int64
WeekOfYear,int64
Sales,int64
Customers,int64
Open,int64


In [8]:
# Count the distinct (non-null) values in every training column.
# DataFrame.nunique() computes all columns in a single call; this replaces a Python loop
# that grew the result frame one row at a time through .loc assignments.
unique_values = processed_train_store.nunique().to_frame(name="Unique Values")

In [9]:
# Display the per-column distinct-value counts.
unique_values

Unnamed: 0,Unique Values
Store,1115
Date,942
day,31
month,12
year,3
DayOfWeek,7
WeekOfYear,52
Sales,21733
Customers,4083
Open,1


Since our data is preprocessed, next we are going to extract some additional features from the `Date` variable that will be useful in our modelling.
* `Processed_train_store`
Since this dataframe already has the day, month and year extracted, we will just extract the week, the quarter and the season.

In [10]:
# Convert the 'Date' column from "YYYY-MM-DD" strings to datetime values.
processed_train_store["Date"] = pd.to_datetime(processed_train_store["Date"],format = "%Y-%m-%d")

In [11]:
# We consider four seasons, assigned by calendar month:
#   Spring runs from March 1 to May 31;
#   Summer runs from June 1 to August 31;
#   Fall (autumn) runs from September 1 to November 30; and
#   Winter runs from December 1 to February 28 (February 29 in a leap year).

# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week returns the same ISO week number. It comes back as UInt32, so it is
# cast to int64 to keep the dtype the rest of the notebook expects.
# NOTE(review): the cast assumes 'Date' has no missing values — confirm.
processed_train_store["Week"] = processed_train_store["Date"].dt.isocalendar().week.astype("int64")
processed_train_store["Quarter"] = processed_train_store["Date"].dt.quarter

# Months outside 1-12 (which should not occur) fall through to the string "None".
# The column is still named "month" here; it is renamed to "Month" in a later cell.
processed_train_store["Season"] = np.where(
    processed_train_store["month"].isin([12, 1, 2]), "Winter",
    np.where(
        processed_train_store["month"].isin([6, 7, 8]), "Summer",
        np.where(
            processed_train_store["month"].isin([9, 10, 11]), "Fall",
            np.where(processed_train_store["month"].isin([3, 4, 5]), "Spring", "None"),
        ),
    ),
)

In [12]:
# Capitalise the date-part column names so they match the other column headers.
processed_train_store = processed_train_store.rename(
    columns={"day": "Day", "month": "Month", "year": "Year"}
)

* `Processed_test_store`
Since this dataframe already has the day, month and year extracted, we will just extract the week, the quarter and the season.

In [13]:
# Capitalise the date-part column names so they match the training frame.
processed_test_store = processed_test_store.rename(
    columns={"day": "Day", "month": "Month", "year": "Year"}
)

In [14]:
# Convert the 'Date' column from "YYYY-MM-DD" strings to datetime values.
processed_test_store["Date"] = pd.to_datetime(processed_test_store["Date"],format = "%Y-%m-%d")

In [15]:
# We consider four seasons, assigned by calendar month:
#   Spring runs from March 1 to May 31;
#   Summer runs from June 1 to August 31;
#   Fall (autumn) runs from September 1 to November 30; and
#   Winter runs from December 1 to February 28 (February 29 in a leap year).

# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week returns the same ISO week number. It comes back as UInt32, so it is
# cast to int64 to keep the dtype the rest of the notebook expects.
# NOTE(review): the cast assumes 'Date' has no missing values — confirm.
processed_test_store["Week"] = processed_test_store["Date"].dt.isocalendar().week.astype("int64")
processed_test_store["Quarter"] = processed_test_store["Date"].dt.quarter

# Months outside 1-12 (which should not occur) fall through to the string "None".
processed_test_store["Season"] = np.where(
    processed_test_store["Month"].isin([12, 1, 2]), "Winter",
    np.where(
        processed_test_store["Month"].isin([6, 7, 8]), "Summer",
        np.where(
            processed_test_store["Month"].isin([9, 10, 11]), "Fall",
            np.where(processed_test_store["Month"].isin([3, 4, 5]), "Spring", "None"),
        ),
    ),
)

In [16]:
# Row count per quarter in the test frame.
processed_test_store["Quarter"].value_counts()

3    41088
Name: Quarter, dtype: int64

In [17]:
# Row count per quarter in the training frame.
processed_train_store["Quarter"].value_counts()

1    252549
2    244396
3    192308
4    155085
Name: Quarter, dtype: int64

In [18]:
# Names of the training columns stored with object (string) dtype.
categorical_cols = list(processed_train_store.select_dtypes(include="object").columns)
categorical_cols

['StateHoliday', 'StoreType', 'Assortment', 'Season']

In [19]:
# Names of the training columns stored as int64 or float64.
numeric_dtypes_frame = processed_train_store.select_dtypes(include=["int64", "float64"])
num_cols = list(numeric_dtypes_frame.columns)
num_cols

['Store',
 'Day',
 'Month',
 'Year',
 'DayOfWeek',
 'WeekOfYear',
 'Sales',
 'Customers',
 'Open',
 'Promo',
 'SchoolHoliday',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'PromoInterval',
 'SalePerCustomer',
 'Week',
 'Quarter']

In [20]:
# Column to predict. Kept as a one-element list, so frame[target] selects a one-column DataFrame.
target = ["Sales"]

In [21]:
# Columns fed to the model as-is.
numeric_columns = [
    "Customers",
    "Open",
    "Promo",
    "Promo2",
    "SchoolHoliday",
    "CompetitionDistance",
]

# Columns that are one-hot encoded before modelling.
categorical_columns = [
    "DayOfWeek",
    "Quarter",
    "Month",
    "Year",
    "StoreType",
    "Assortment",
    "Season",
    "StateHoliday",
]

In [22]:
# Takes a DataFrame and a column name and returns that column one-hot encoded as a DataFrame.
def create_ohe(df, col):
    """Return a one-hot encoded DataFrame for a single column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    col : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    pandas.DataFrame
        One float64 indicator column per distinct value of ``df[col]``, named
        ``"<col>_<value>"`` and ordered by sorted value. The result has a fresh
        0..n-1 index, not the index of ``df``. Rows whose value is missing get
        zeros in every indicator column (the ``pd.get_dummies`` default).
    """
    # pd.get_dummies replaces the LabelEncoder + OneHotEncoder(sparse=False) pair:
    # the `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
    # removed in 1.4, so the old call raises TypeError on current releases.
    indicators = pd.get_dummies(df[col], dtype=float)
    indicators.columns = [col + "_" + str(level) for level in indicators.columns]
    # Drop the source index so callers can concatenate positionally, as before.
    return indicators.reset_index(drop=True)

In [23]:
# create_ohe encodes one column at a time, so build the final feature frame from the
# numeric columns followed by the one-hot block of every categorical column.
# All blocks are collected first and concatenated once; concatenating inside the loop
# would re-copy the growing frame on every iteration.
encoded_blocks = [create_ohe(processed_train_store, column) for column in categorical_columns]
processed_train_store_final = pd.concat(
    [processed_train_store[numeric_columns]] + encoded_blocks, axis=1
)

In [24]:
# Sanity check: frame dimensions and the set of dtypes present after encoding.
print("Shape of Data:",processed_train_store_final.shape)
print("Distinct Datatypes:",processed_train_store_final.dtypes.unique())

Shape of Data: (844338, 47)
Distinct Datatypes: [dtype('int64') dtype('float64')]


In [25]:
# Attach the target column to the encoded frame.
# NOTE: from here on processed_train_store_final contains 'Sales', so it must be dropped
# before the frame is used as model input.
processed_train_store_final['Sales']=processed_train_store['Sales']

In [26]:
# Preview the encoded frame, including the appended target column.
processed_train_store_final.head()

Unnamed: 0,Customers,Open,Promo,Promo2,SchoolHoliday,CompetitionDistance,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,...,Assortment_c,Season_Fall,Season_Spring,Season_Summer,Season_Winter,StateHoliday_a,StateHoliday_b,StateHoliday_c,StateHoliday_o,Sales
0,555,1,1,0,1,1270.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,5263
1,546,1,1,0,1,1270.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,5020
2,523,1,1,0,1,1270.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,4782
3,560,1,1,0,1,1270.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,5011
4,612,1,1,0,1,1270.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,6102


In [28]:
# Model inputs: every column of the final frame except the target.
features = processed_train_store_final.drop(columns=["Sales"])

# Seed NumPy's global RNG and set up 5-fold cross-validation.
np.random.seed(1234)
kfold = KFold(n_splits=5)

# Scale, then fit a small 5-tree forest.
# Author's note: using the pipeline gave a higher mean squared error.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("forest", RandomForestRegressor(n_estimators=5)),
])

In [29]:
# Hold out 20% of the rows for testing, then standardise the features.
# The scaler is fitted on the training split only and then applied to both splits.
# (An older copy of this split, kept as a string literal and referring to an undefined
# `targets` variable, has been removed.)
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(
    features, processed_train_store_final[target], test_size=0.2, random_state=1234
)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
X_train.shape

(675470, 47)

In [None]:

# Baseline: 5-fold cross-validated score of the scaler + small-forest pipeline.
# `features` is passed rather than processed_train_store_final, because the latter also
# holds the 'Sales' column and would leak the target into the inputs.
# np.ravel turns the one-column target frame into the 1-D array scikit-learn expects.
score = cross_val_score(pipe, features, np.ravel(processed_train_store[target]),
                        n_jobs=-1, cv=kfold)

# Randomised search over the number of trees, scored by negative MSE.
# The `iid` argument was removed in scikit-learn 0.24 (its former False setting is now the
# only behaviour), so it is no longer passed.
param = {'n_estimators': range(1, 200, 10)}
tree = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=param,
                          cv=kfold, random_state=1234, scoring='neg_mean_squared_error')
tree.fit(X_train, np.ravel(y_train))

# Negative MSE of the best estimator on the held-out split (was `x_test`, an undefined name).
tree.score(X_test, y_test)

In [None]:
# Fit the reference forest that later cells refer to as `rf`.
# This definition was commented out, which left `rf` undefined on a fresh kernel.
# NOTE: 500 trees on the full training split is slow. n_jobs=-1 builds trees on all cores
# and does not change the fitted model for a fixed random_state.
rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0, n_jobs=-1)
rf.fit(X_train, np.ravel(y_train))

### Predict and check for accuracy
The above model is used to predict the values of the dependent variable in the Test dataset. We also check the model’s performance.

In [12]:
# R^2 of the fitted forest on the held-out split.
# The split cell creates `y_test` (lower-case y); `Y_test` was never defined.
RF_model_pred = rf.predict(X_test)
metrics.r2_score(y_test, RF_model_pred)

0.8166465503002451

### Tuning Hyperparameters

### Grid Search

First, we define our four parameters.

In [None]:
# Search space for the grid search: four hyperparameters of RandomForestRegressor.
# max_features=1.0 (use all features) replaces 'auto', which meant the same thing for
# regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
param_grid = {"max_depth": [3, 5, 6, 7, 8, 9],
              "max_features": [1.0, 'sqrt', 'log2'],
              "min_samples_split": [2, 3, 5, 7],
              "min_samples_leaf": [1, 3, 5, 6]}

#### Initializing, Building and Fitting Model

In this step, we initialize and build the Random Forest Regression model using GridSearchCV and fit it on the Train dataset.

In [None]:
# Exhaustive grid search around the reference forest `rf`.
# The split cell creates `y_train` (lower-case y); `Y_train` was never defined.
# np.ravel turns the one-column target frame into the 1-D array scikit-learn expects.
# NOTE: every grid point refits `rf` once per CV fold, so this cell is expensive.
model_RF_GS = GridSearchCV(rf, param_grid)
model_RF_GS.fit(X_train, np.ravel(y_train))

### Best Parameters
We now check for the best combination of parameters.

In [None]:
# Hyperparameter combination with the best cross-validated score.
model_RF_GS.best_params_

In [None]:
# Display the fitted search object and its configuration.
model_RF_GS

We use this model to predict the dependent variable in the test data set and check its accuracy.

In [None]:
# R^2 of the grid-search winner on the held-out split.
# The split cell creates `y_test` (lower-case y); `Y_test` was never defined.
pred_RF_GS = model_RF_GS.predict(X_test)
metrics.r2_score(y_test, pred_RF_GS)

In [None]:

# Re-create the CV splitter, pipeline and search space, then score the randomised-search
# model (`tree`, fitted in an earlier cell) on the held-out split.
np.random.seed(1234)
kfold = KFold(n_splits=5)
# Author's note: using the pipeline gave a higher mean squared error.
pipe = Pipeline([("scaler", StandardScaler()), ("forest", RandomForestRegressor(n_estimators=5))])

param = {'n_estimators': range(1, 200, 10)}

# This cell scored on `x_val`/`y_val`, which are never created anywhere in the notebook;
# the only held-out data is X_test / y_test.
tree.score(X_test, y_test)