# Store_3

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing Required Modules

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np
import pylab

In [3]:
# Functions useful for Splitting & Normalization
from sklearn.preprocessing import  MinMaxScaler
from sklearn.model_selection import train_test_split

# Functions for Regression modeling 
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Regression Error Metrics - scikit-learn
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

# Regression Error Metrics 
from statsmodels.tools.eval_measures import rmse
from statsmodels.tools.eval_measures import mse
from statsmodels.tools.eval_measures import meanabs

  import pandas.util.testing as tm


# Importing Dataset for Store_3

In [4]:
df_3=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/store3.csv')

In [5]:
df_3.head()

Unnamed: 0,date_o,sales,promo,oil price,Holiday,transactions,cluster
0,2013-01-01,0.0,0,93.14,False,0.0,
1,2013-01-02,24060.348,0,93.14,False,3487.0,
2,2013-01-03,18570.745025,0,92.97,False,3026.0,
3,2013-01-04,17392.097995,0,93.12,False,3188.0,
4,2013-01-05,22700.872005,0,93.2,False,3623.0,


In [6]:
df_3=df_3.drop(['cluster'],axis=1)

In [7]:
# holiday = pd.read_csv('/content/drive/MyDrive/holidays_events.csv')
# oil =  pd.read_csv('/content/drive/MyDrive/oil.csv')
# stores = pd.read_csv('/content/drive/MyDrive/stores.csv')
# transactions = pd.read_csv('/content/drive/MyDrive/transactions.csv')
# train = pd.read_csv('/content/drive/MyDrive/train.csv')
# test = pd.read_csv('/content/drive/MyDrive/test (1).csv')

# Extracting Day, Month & Year from Date

In [8]:
df_3['date_o'] = df_3['date_o'].apply(pd.to_datetime)

In [9]:
df_3['Month']=df_3['date_o'].dt.strftime('%m')

In [10]:
df_3['Year']=df_3['date_o'].dt.strftime('%Y')

In [11]:
df_3['Day']=df_3['date_o'].dt.strftime('%d')

In [12]:
df_3=df_3.drop(['date_o'],axis=1)

In [13]:
df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1684 entries, 0 to 1683
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sales         1684 non-null   float64
 1   promo         1684 non-null   int64  
 2   oil price     1684 non-null   float64
 3   Holiday       1684 non-null   bool   
 4   transactions  1684 non-null   float64
 5   Month         1684 non-null   object 
 6   Year          1684 non-null   object 
 7   Day           1684 non-null   object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 93.9+ KB


# Defining X & y

In [14]:
X=df_3.drop(['sales'],axis=1)
y=df_3['sales']

# Splitting Dataset into Training & Test Sets

In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)

In [16]:
X_train_temp=X_train.drop(['Holiday','Month', 'Year', 'Day'],axis=1)
X_test_temp=X_test.drop(['Holiday','Month', 'Year', 'Day'],axis=1)


In [17]:
col_names=X_train_temp.columns

# Transforming the Distribution of Continuous Variables

In [18]:
from sklearn.preprocessing import PowerTransformer

power = PowerTransformer(method='yeo-johnson')
X_train_temp=power.fit_transform(X_train_temp)
X_test_temp=power.transform(X_test_temp)

# Standardizing the Continuous Variables

In [19]:
scaler=MinMaxScaler()
X_train_temp=scaler.fit_transform(X_train_temp)
X_test_temp=scaler.transform(X_test_temp)

In [20]:
X_train_temp=pd.DataFrame(X_train_temp,columns=col_names)
X_test_temp=pd.DataFrame(X_test_temp,columns=col_names)

In [21]:
# Adding Categorical Variables back for Modeling

X_train_temp[['Holiday','Month', 'Year', 'Day']]=X_train[['Holiday','Month', 'Year', 'Day']].to_numpy()
X_test_temp[['Holiday','Month', 'Year', 'Day']]=X_test[['Holiday','Month', 'Year', 'Day']].to_numpy()

In [22]:
X_train = np.array(X_train_temp)
X_test = np.array(X_test_temp)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Modeling & Evaluation

## Linear Regression

In [23]:
lr=LinearRegression()

In [24]:
# Fitting the Model
lr=lr.fit(X_train,y_train)

In [25]:
# Predictions
preds=lr.predict(X_test)
preds_train=lr.predict(X_train)


* MAPE

In [26]:
print(mean_absolute_percentage_error(y_test,preds))

0.1818015099815326


* R2

In [27]:
print(r2_score(y_test,preds))

0.5474579966175941


* RMSE

In [28]:
print(rmse(y_test,preds))

8329.623569707905


## Random Forest Regressor

In [29]:
from sklearn.ensemble import RandomForestRegressor

In [30]:
rf=RandomForestRegressor(random_state=1)

In [31]:
rf=rf.fit(X_train,y_train)

In [32]:
# Predictions
rf_preds=rf.predict(X_test)

* MAPE

In [33]:
print(mean_absolute_percentage_error(y_test,rf_preds))

0.12158319525577928


* R2

In [34]:
print(r2_score(y_test,rf_preds))

0.6905501202521441


* RMSE

In [35]:
print(rmse(y_test,rf_preds))

6887.967461997684


### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

rf=RandomForestRegressor(random_state=1)
param_grid = {"n_estimators":  [200, 500],
              "max_features": [2,3,4],
              "max_depth":[3,5,7,9],
              "min_samples_split":[10,20,25],"min_samples_leaf":[1,2,3]}
grid_cv_rf = GridSearchCV(rf, param_grid, cv=5)
grid_cv_rf.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1),
             param_grid={'max_depth': [3, 5, 7, 9], 'max_features': [2, 3, 4],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [10, 20, 25],
                         'n_estimators': [200, 500]})

In [None]:
print("R2: {}".format(grid_cv_rf.best_score_))
print()
print("Best Hyperparameters:{}".format(grid_cv_rf.best_params_))

R2: 0.7391458910977544

Best Hyperparameters:{'max_depth': 9, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}


In [51]:
# Fitting Model with Best Parameters

rf=RandomForestRegressor(n_estimators=500,max_depth=9,max_features=4,min_samples_leaf=1,min_samples_split=10,random_state=1)

In [52]:
rf=rf.fit(X_train,y_train)

In [53]:
rf_preds=rf.predict(X_test)

* MAPE

In [54]:
print(mean_absolute_percentage_error(y_test,rf_preds))

0.1290694467690628


* R2

In [55]:
print(r2_score(y_test,rf_preds))

0.6969408390975318


* RMSE

In [56]:
print(rmse(y_test,rf_preds))

6816.471699902139


## XGBoost Regressor

In [57]:
from xgboost import XGBRegressor

In [58]:
xgb=XGBRegressor(random_state=1)

In [59]:
xgb=xgb.fit(X_train,y_train)



In [60]:
xgb_preds=xgb.predict(X_test)

* MAPE

In [61]:
print(mean_absolute_percentage_error(y_test,xgb_preds))

0.1334794562118471


* R2

In [62]:
print(r2_score(y_test,xgb_preds))

0.7245772479746879


### Hyperparameter Tuning

In [63]:
from sklearn.model_selection import GridSearchCV

xgb=XGBRegressor(random_state=1,silent=1)
param_grid = {"eta":  [0.02,0.05,0.1],
              "subsample": [0.5,1],
              "max_depth":[3,5,7,9]}
grid_cv_xgb = GridSearchCV(xgb,param_grid, cv=5)
grid_cv_xgb.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=XGBRegressor(random_state=1, silent=1),
             param_grid={'eta': [0.02, 0.05, 0.1], 'max_depth': [3, 5, 7, 9],
                         'subsample': [0.5, 1]})

In [64]:
print("R2: {}".format(grid_cv_xgb.best_score_))
print()
print("Best Hyperparameters:{}".format(grid_cv_xgb.best_params_))

R2: 0.7508156719582454

Best Hyperparameters:{'eta': 0.02, 'max_depth': 5, 'subsample': 1}


In [65]:
# Fitting Model with Best Parameters

xgb=XGBRegressor(random_state=1,silent=1,eta=0.02,max_depth=5,subsample=1)

In [66]:
xgb=xgb.fit(X_train,y_train)

In [67]:
xgb_preds=xgb.predict(X_test)
xgb_preds_train=xgb.predict(X_train)

* MAPE

In [68]:
print(mean_absolute_percentage_error(y_test,xgb_preds))

0.11224590232472248


* R2

In [69]:
print(r2_score(y_test,xgb_preds))

0.7517512440491648


In [70]:
print(rmse(y_test,xgb_preds))

6169.350835111602
