## Step 0: Import libraries and dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import datetime as dt
import re
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from google.cloud import bigquery
import warnings

warnings.filterwarnings('ignore')

In [2]:
%%bigquery train
SELECT *
FROM demoespecialidadgcp.demo_2_black_friday.raw_train;

Query is running:   0%|          |

Downloading:   0%|          |

In [3]:
%%bigquery test
SELECT *
FROM demoespecialidadgcp.demo_2_black_friday.raw_test;

Query is running:   0%|          |

Downloading:   0%|          |

## Step 3: Data preprocessing

### 3.1: Merging of train and test

In [4]:
train['source'] = 'train'
test['source'] = 'test'

In [5]:
dataset = pd.concat([train, test])

### 3.2: Replacing '+' in 'Age' and 'Stay_In_Current_City_Years'

In [6]:
dataset['Age'] = dataset['Age'].apply(lambda x : str(x).replace('55+', '55'))

In [7]:
dataset['Stay_In_Current_City_Years'] = dataset['Stay_In_Current_City_Years'].apply(lambda x : str(x).replace('4+', '4'))

### 3.3: Dropping irrelevant features

In [8]:
dataset.drop('Product_Category_3', axis = 1, inplace = True)

In [9]:
dataset.drop('User_ID', axis = 1, inplace = True)

In [10]:
dataset.drop('Product_ID', axis = 1, inplace = True)

### 3.4: Feature Encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
label_encoder_gender = LabelEncoder()
dataset['Gender'] = label_encoder_gender.fit_transform(dataset['Gender'])

In [13]:
label_encoder_age = LabelEncoder()
dataset['Age'] = label_encoder_age.fit_transform(dataset['Age'])

In [14]:
label_encoder_city = LabelEncoder()
dataset['City_Category'] = label_encoder_city.fit_transform(dataset['City_Category'])

### 3.5: Fixing null values in 'Product_Category_2'

In [12]:
dataset['Product_Category_2'].fillna(dataset['Product_Category_2'].median(), inplace = True)

### 3.6: Convert 'Stay_In_Current_City_Years' into numeric data type

In [13]:
dataset['Stay_In_Current_City_Years'] = dataset['Stay_In_Current_City_Years'].astype('int')

### 3.7: Separating dataset into train and test

In [14]:
train = dataset.loc[dataset['source'] == 'train']
test = dataset.loc[dataset['source'] == 'test']

In [15]:
train.drop('source', axis = 1, inplace = True)
test.drop('source', axis = 1, inplace = True)

### 3.8: Separating train into X and Y

In [16]:
X = train.drop("Purchase", axis = 1)

In [17]:
Y = train["Purchase"]

### 3.9: Feature Selection

In [21]:
from sklearn.ensemble import ExtraTreesRegressor
selector = ExtraTreesRegressor()

In [22]:
selector.fit(X, Y)

In [23]:
feature_imp = selector.feature_importances_

In [24]:
for index, val in enumerate(feature_imp):
    print(index, round((val * 100), 2))

0 0.47
1 2.04
2 4.84
3 0.72
4 2.55
5 0.62
6 81.21
7 7.55


In [18]:
X.drop(['Gender', 'City_Category', 'Marital_Status'], axis = 1, inplace = True)

### 3.10: Feature Scaling

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [20]:
X.columns

Index(['Age', 'Occupation', 'Stay_In_Current_City_Years', 'Product_Category_1',
       'Product_Category_2'],
      dtype='object')

In [21]:
X.head()

Unnamed: 0,Age,Occupation,Stay_In_Current_City_Years,Product_Category_1,Product_Category_2
0,26-35,0,3,3,4
1,26-35,0,3,5,14
2,26-35,0,3,2,5
3,26-35,0,3,8,9
4,26-35,0,3,8,14


In [27]:
for col in X.columns:
    X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))

### 3.11: Creating a train test split

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [29]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: (440054, 5)
X_test shape: (110014, 5)
Y_train shape: (440054,)
Y_test shape: (110014,)


## Step 4: Data Modelling

### 4.1: Linear Regression

In [30]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

In [31]:
lin_reg.fit(X_train, Y_train)

In [32]:
Y_pred_lin_reg = lin_reg.predict(X_test)

### 4.2: KNN Regression

In [33]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()

In [34]:
knn.fit(X_train, Y_train)

In [35]:
Y_pred_knn = knn.predict(X_test)

### 4.3: Decision Tree Regression

In [36]:
from sklearn.tree import DecisionTreeRegressor
dec_tree = DecisionTreeRegressor()

In [37]:
dec_tree.fit(X_train, Y_train)

In [38]:
Y_pred_dec = dec_tree.predict(X_test)

### 4.4: Random Forest Regressor

In [39]:
from sklearn.ensemble import RandomForestRegressor
ran_for = RandomForestRegressor()

In [40]:
ran_for.fit(X_train, Y_train)

In [41]:
Y_pred_ran_for = ran_for.predict(X_test)

### 4.5: XGB Regressor

In [43]:
from xgboost import XGBRegressor
xgb = XGBRegressor(random_state = 42)

In [44]:
xgb.fit(X_train, Y_train)

In [45]:
Y_pred_xgb = xgb.predict(X_test)

## Step 5: Model Evaluation

In [46]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [47]:
print("Linear Regression: ")
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_pred_lin_reg)))
print("R2 score:", r2_score(Y_test, Y_pred_lin_reg))

Linear Regression: 
RMSE: 4708.377655798458
R2 score: 0.11988246710333383


In [48]:
print("KNN regression: ")
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_pred_knn)))
print("R2 score:", r2_score(Y_test, Y_pred_knn))

KNN regression: 
RMSE: 3262.9372094531086
R2 score: 0.5773162754669178


In [49]:
print("Decision tree regression: ")
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_pred_dec)))
print("R2 score:", r2_score(Y_test, Y_pred_dec))

Decision tree regression: 
RMSE: 3045.508034263699
R2 score: 0.6317713364589213


In [50]:
print("Random forest regression: ")
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_pred_ran_for)))
print("R2 score:", r2_score(Y_test, Y_pred_ran_for))

Random forest regression: 
RMSE: 3015.0776750734926
R2 score: 0.6390931686104102


In [51]:
print("XGB regression: ")
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_pred_xgb)))
print("R2 score:", r2_score(Y_test, Y_pred_xgb))

XGB regression: 
RMSE: 2957.5738977987285
R2 score: 0.6527283727993626


## Step 6: Hyperparameter tuning

In [52]:
from sklearn.model_selection import RandomizedSearchCV

In [53]:
max_depth = [int(x) for x in np.linspace(start = 5, stop = 20, num = 15)]
learning_rate = ['0.01', '0.05', '0.1', '0.25', '0.5', '0.75', '1.0']
min_child_weight = [int(x) for x in np.linspace(start = 45, stop = 70, num = 15)]

In [54]:
params = {
 "learning_rate"    : learning_rate,
 "max_depth"        : max_depth,
 "min_child_weight" : min_child_weight,
 "gamma"            : [0.0, 0.1, 0.2 , 0.3, 0.4],
 "colsample_bytree" : [0.3, 0.4, 0.5 , 0.7]
}

In [55]:
xgb_tune = XGBRegressor(verbosity = 0, random_state = 42)

In [56]:
xgb_cv = RandomizedSearchCV(xgb_tune, param_distributions = params, cv = 5, random_state = 42)

In [57]:
xgb_cv.fit(X_train, Y_train)

In [58]:
xgb_cv.best_score_

0.6498849391694306

In [59]:
xgb_cv.best_params_

{'min_child_weight': 66,
 'max_depth': 11,
 'learning_rate': '1.0',
 'gamma': 0.3,
 'colsample_bytree': 0.7}

In [60]:
xgb_best = XGBRegressor(colsample_bytree = 0.7, gamma = 0.3, learning_rate = 1.0, max_depth = 11, min_child_weight = 66, verbosity = 0, random_state = 42)

In [61]:
xgb_best.fit(X_train, Y_train)

In [62]:
Y_pred_xgb_best = xgb_best.predict(X_test)

In [63]:
print("XGB regression: ")
print("RMSE:",np.sqrt(mean_squared_error(Y_test, Y_pred_xgb_best)))
print("R2 score:", r2_score(Y_test, Y_pred_xgb_best))

XGB regression: 
RMSE: 2968.4703845911495
R2 score: 0.6501647774100076


<div class="alert alert-block alert-info">

<h3 style="font-family:verdana;"> Conclusion:</h3>

<ul>
    
<li><p style="font-family:verdana;">
In this project, we tried to build a model using various algorithms such as Linear regression, KNN regression, Decision tree regression, Random forest and XGB regressor to get the best possible prediction.
</p></li>     
        
<li><p style="font-family:verdana;">
The hyperparameter tuned XGB regressor gives us the best rmse value and r2 score for this problem.
</p></li>    

   

</ul>

</div>

<div class="alert alert-block alert-info">

<h3 style="font-family:verdana;"> Future work:</h3>

<ul>
        
<li><p style="font-family:verdana;">
We have a large enough dataset, so we can use neural networks such as an artificial neural network to build a model which can result in better performance.
</p></li>    



</ul>

</div>