# Store_14

In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Required Modules

In [37]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np
import pylab

In [38]:
# Functions useful for Splitting & Normalization
from sklearn.preprocessing import  MinMaxScaler
from sklearn.model_selection import train_test_split

# Functions for Regression modeling 
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Regression Error Metrics - scikit-learn
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

# Regression Error Metrics 
from statsmodels.tools.eval_measures import rmse
from statsmodels.tools.eval_measures import mse
from statsmodels.tools.eval_measures import meanabs

# Importing Dataset for Store_14

In [39]:
df_14=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/store14.csv')

In [40]:
df_14.head()

Unnamed: 0,date_o,sales,promo,oil price,Holiday,transactions,cluster
0,2013-01-01,0.0,0,93.14,False,0.0,
1,2013-01-02,7698.253,0,93.14,False,2002.0,
2,2013-01-03,6158.575996,0,92.97,False,1823.0,
3,2013-01-04,5777.689,0,93.12,False,1641.0,
4,2013-01-05,7974.621,0,93.2,False,2052.0,


In [41]:
df_14=df_14.drop(['cluster'],axis=1)

In [42]:
# holiday = pd.read_csv('/content/drive/MyDrive/holidays_events.csv')
# oil =  pd.read_csv('/content/drive/MyDrive/oil.csv')
# stores = pd.read_csv('/content/drive/MyDrive/stores.csv')
# transactions = pd.read_csv('/content/drive/MyDrive/transactions.csv')
# train = pd.read_csv('/content/drive/MyDrive/train.csv')
# test = pd.read_csv('/content/drive/MyDrive/test (1).csv')

# Extracting Day, Month & Year from Date

In [43]:
df_14['date_o'] = df_14['date_o'].apply(pd.to_datetime)

In [44]:
df_14['Month']=df_14['date_o'].dt.strftime('%m')

In [45]:
df_14['Year']=df_14['date_o'].dt.strftime('%Y')

In [46]:
df_14['Day']=df_14['date_o'].dt.strftime('%d')

In [47]:
df_14=df_14.drop(['date_o'],axis=1)

In [48]:
df_14.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1684 entries, 0 to 1683
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sales         1684 non-null   float64
 1   promo         1684 non-null   int64  
 2   oil price     1684 non-null   float64
 3   Holiday       1684 non-null   bool   
 4   transactions  1684 non-null   float64
 5   Month         1684 non-null   object 
 6   Year          1684 non-null   object 
 7   Day           1684 non-null   object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 93.9+ KB


# Defining X & y

In [49]:
X=df_14.drop(['sales'],axis=1)
y=df_14['sales']

# Splitting Dataset into Training & Test Sets

In [50]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)

In [51]:
X_train_temp=X_train.drop(['Holiday','Month', 'Year', 'Day'],axis=1)
X_test_temp=X_test.drop(['Holiday','Month', 'Year', 'Day'],axis=1)


In [52]:
col_names=X_train_temp.columns

# Transforming the Distribution of Continuous Variables

In [53]:
from sklearn.preprocessing import PowerTransformer

power = PowerTransformer(method='yeo-johnson')
X_train_temp=power.fit_transform(X_train_temp)
X_test_temp=power.transform(X_test_temp)

# Standardizing the Continuous Variables

In [54]:
scaler=MinMaxScaler()
X_train_temp=scaler.fit_transform(X_train_temp)
X_test_temp=scaler.transform(X_test_temp)

In [55]:
X_train_temp=pd.DataFrame(X_train_temp,columns=col_names)
X_test_temp=pd.DataFrame(X_test_temp,columns=col_names)

In [56]:
# Adding Categorical Variables back for Modeling

X_train_temp[['Holiday','Month', 'Year', 'Day']]=X_train[['Holiday','Month', 'Year', 'Day']].to_numpy()
X_test_temp[['Holiday','Month', 'Year', 'Day']]=X_test[['Holiday','Month', 'Year', 'Day']].to_numpy()

In [57]:
X_train = np.array(X_train_temp)
X_test = np.array(X_test_temp)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Modeling & Evaluation

## Linear Regression

In [58]:
lr=LinearRegression()

In [59]:
# Fitting the Model
lr=lr.fit(X_train,y_train)

In [60]:
# Predictions
preds=lr.predict(X_test)
preds_train=lr.predict(X_train)


* MAPE

In [61]:
print(mean_absolute_percentage_error(y_test,preds))

1.751166430963419e+16


* R2

In [62]:
print(r2_score(y_test,preds))

0.6579147365509612


* RMSE

In [63]:
print(rmse(y_test,preds))

1499.52524263299


## Random Forest Regressor

In [64]:
from sklearn.ensemble import RandomForestRegressor

In [65]:
rf=RandomForestRegressor(random_state=1)

In [66]:
rf=rf.fit(X_train,y_train)

In [67]:
# Predictions
rf_preds=rf.predict(X_test)

* MAPE

In [68]:
print(mean_absolute_percentage_error(y_test,rf_preds))

0.10189646328494462


* R2

In [69]:
print(r2_score(y_test,rf_preds))

0.6894346544742771


* RMSE

In [70]:
print(rmse(y_test,rf_preds))

1428.7725397325419


### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

rf=RandomForestRegressor(random_state=1)
param_grid = {"n_estimators":  [200, 500],
              "max_features": [2,3,4],
              "max_depth":[3,5,7,9],
              "min_samples_split":[10,20,25],"min_samples_leaf":[1,2,3]}
grid_cv_rf = GridSearchCV(rf, param_grid, cv=5)
grid_cv_rf.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1),
             param_grid={'max_depth': [3, 5, 7, 9], 'max_features': [2, 3, 4],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [10, 20, 25],
                         'n_estimators': [200, 500]})

In [None]:
print("R2: {}".format(grid_cv_rf.best_score_))
print()
print("Best Hyperparameters:{}".format(grid_cv_rf.best_params_))

R2: 0.7110471136566054

Best Hyperparameters:{'max_depth': 9, 'max_features': 4, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}


In [72]:
# Fitting Model with Best Parameters

rf=RandomForestRegressor(n_estimators=200,max_depth=9,max_features=4,min_samples_leaf=2,min_samples_split=10,random_state=1)

In [73]:
rf=rf.fit(X_train,y_train)

In [74]:
rf_preds=rf.predict(X_test)

In [75]:
y_test=y_test+1

* MAPE

In [76]:
print(mean_absolute_percentage_error(y_test,rf_preds))

1.196911633641406


* R2

In [77]:
print(r2_score(y_test,rf_preds))

0.7381440292115968


* RMSE

In [78]:
print(rmse(y_test,rf_preds))

1311.9516757456054


## XGBoost Regressor

In [79]:
from xgboost import XGBRegressor

In [80]:
xgb=XGBRegressor(random_state=1)

In [81]:
xgb=xgb.fit(X_train,y_train)



In [82]:
xgb_preds=xgb.predict(X_test)

* MAPE

In [83]:
print(mean_absolute_percentage_error(y_test,xgb_preds))

8.776359662404499


* R2

In [84]:
print(r2_score(y_test,xgb_preds))

0.7138154966696515


### Hyperparameter Tuning

In [85]:
from sklearn.model_selection import GridSearchCV

xgb=XGBRegressor(random_state=1,silent=1)
param_grid = {"eta":  [0.02,0.05,0.1],
              "subsample": [0.5,1],
              "max_depth":[3,5,7,9]}
grid_cv_xgb = GridSearchCV(xgb,param_grid, cv=5)
grid_cv_xgb.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=XGBRegressor(random_state=1, silent=1),
             param_grid={'eta': [0.02, 0.05, 0.1], 'max_depth': [3, 5, 7, 9],
                         'subsample': [0.5, 1]})

In [86]:
print("R2: {}".format(grid_cv_xgb.best_score_))
print()
print("Best Hyperparameters:{}".format(grid_cv_xgb.best_params_))

R2: 0.7026507980960275

Best Hyperparameters:{'eta': 0.02, 'max_depth': 7, 'subsample': 0.5}


In [87]:
# Fitting Model with Best Parameters

xgb=XGBRegressor(random_state=1,silent=1,eta=0.02,max_depth=7,subsample=0.5)

In [88]:
xgb=xgb.fit(X_train,y_train)

In [89]:
xgb_preds=xgb.predict(X_test)
xgb_preds_train=xgb.predict(X_train)

* MAPE

In [90]:
print(mean_absolute_percentage_error(y_test,xgb_preds))

4.07542958573668


* R2

In [91]:
print(r2_score(y_test,xgb_preds))

0.6634619816938914


In [92]:
print(rmse(y_test,xgb_preds))

1487.317420058186
