In [29]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from  sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, max_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import LinearSVR
from xgboost import XGBRegressor

In [30]:
data = pd.read_csv("../01-data/cleaned_black_friday.csv")
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,0,0-17,10,A,2,0,3,8,,8370
1,1000001,P00248942,0,0-17,10,A,2,0,1,6,14.0,15200
2,1000001,P00087842,0,0-17,10,A,2,0,12,8,,1422
3,1000001,P00085442,0,0-17,10,A,2,0,12,14,,1057
4,1000002,P00285442,1,55+,16,C,4,0,8,8,,7969


In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     537577 non-null  int64  
 1   Product_ID                  537577 non-null  object 
 2   Gender                      537577 non-null  int64  
 3   Age                         537577 non-null  object 
 4   Occupation                  537577 non-null  int64  
 5   City_Category               537577 non-null  object 
 6   Stay_In_Current_City_Years  537577 non-null  int64  
 7   Marital_Status              537577 non-null  int64  
 8   Product_Category_1          537577 non-null  int64  
 9   Product_Category_2          537577 non-null  int64  
 10  Product_Category_3          164278 non-null  float64
 11  Purchase                    537577 non-null  int64  
dtypes: float64(1), int64(8), object(3)
memory usage: 49.2+ MB


In [32]:
data.drop(labels=["User_ID","Product_ID","Product_Category_3"],axis=1,inplace=True)

data['Occupation'] = data['Occupation'].astype(np.object)
data['Gender'] = data['Gender'].astype(np.uint8)
data['Product_Category_1'] = data['Product_Category_1'].astype(np.object)
data['Product_Category_2'] = data['Product_Category_2'].astype(np.object)
data['Stay_In_Current_City_Years'] = data['Stay_In_Current_City_Years'].astype(np.uint8)

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   Gender                      537577 non-null  uint8 
 1   Age                         537577 non-null  object
 2   Occupation                  537577 non-null  object
 3   City_Category               537577 non-null  object
 4   Stay_In_Current_City_Years  537577 non-null  uint8 
 5   Marital_Status              537577 non-null  int64 
 6   Product_Category_1          537577 non-null  object
 7   Product_Category_2          537577 non-null  object
 8   Purchase                    537577 non-null  int64 
dtypes: int64(2), object(5), uint8(2)
memory usage: 29.7+ MB


In [34]:
data = pd.get_dummies(data)

In [35]:
data.head()

Unnamed: 0,Gender,Stay_In_Current_City_Years,Marital_Status,Purchase,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,...,Product_Category_2_9,Product_Category_2_10,Product_Category_2_11,Product_Category_2_12,Product_Category_2_13,Product_Category_2_14,Product_Category_2_15,Product_Category_2_16,Product_Category_2_17,Product_Category_2_18
0,0,2,0,8370,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2,0,15200,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,0,1422,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,0,1057,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,4,0,7969,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
y = data['Purchase']
del data['Purchase']

X = data

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 323, test_size = 0.30,shuffle=True)

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 69 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   Gender                      537577 non-null  uint8
 1   Stay_In_Current_City_Years  537577 non-null  uint8
 2   Marital_Status              537577 non-null  int64
 3   Age_0-17                    537577 non-null  uint8
 4   Age_18-25                   537577 non-null  uint8
 5   Age_26-35                   537577 non-null  uint8
 6   Age_36-45                   537577 non-null  uint8
 7   Age_46-50                   537577 non-null  uint8
 8   Age_51-55                   537577 non-null  uint8
 9   Age_55+                     537577 non-null  uint8
 10  Occupation_0                537577 non-null  uint8
 11  Occupation_1                537577 non-null  uint8
 12  Occupation_2                537577 non-null  uint8
 13  Occupation_3                537577 non-null 

In [40]:
random_forest = RandomForestRegressor(n_jobs=-1)
boost_tree = GradientBoostingRegressor()
bagg_tree = BaggingRegressor(n_jobs=-1)
xgb_tree = XGBRegressor()
base_tree = DecisionTreeRegressor()
#best_model = RandomForestRegressor(bootstrap=True, max_features=0.9000000000000001, min_samples_leaf=13, min_samples_split=12, n_estimators=100)

In [41]:
model_tuple = (random_forest, boost_tree, bagg_tree, xgb_tree, base_tree)

In [42]:
scores = cross_val_score(base_tree,X,y,scoring='neg_mean_squared_error',cv=5)
print('\n\n','RMSE:',np.mean(np.sqrt(-1 * scores)))
scores2 = cross_val_score(base_tree,X,y,scoring='neg_mean_absolute_error',cv=5)
print('\n\n','MAE:',np.mean(np.sqrt(-1 * scores2)))



 RMSE: 3139.649046580568


 MAE: 47.66193848940572


In [43]:
def model_function(model_name):
    
    "This fuction fits the model to train data and shows the test error "
    
    print(model_name,"\n",25*"*" )
    
    model_name.fit(X_train,y_train)
    print("\nModel score on train dataset is:",model_name.score(X_train,y_train))
    
    y_pred = model_name.predict(X_test)
    
    print("Mean absolute error: ", mean_absolute_error(y_test, y_pred))
    print("Mean squared error: ",mean_squared_error(y_test, y_pred))
    print("Root mean squared error: ",np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2 score: ",r2_score(y_test, y_pred))
    print("Max error: ", max_error(y_test, y_pred),"\n\n",25*"*")
    

In [44]:
#Results with one-hot encode 
for i in model_tuple:
    model_function(i)

RandomForestRegressor(n_jobs=-1) 
 *************************

Model score on train dataset is: 0.7372226037159422
Mean absolute error:  2224.2732402339398
Mean squared error:  9197349.723530311
Root mean squared error:  3032.7132610140234
R2 score:  0.6283744572312129
Max error:  17713.622166666664 

 *************************
GradientBoostingRegressor() 
 *************************

Model score on train dataset is: 0.6308016119200106
Mean absolute error:  2309.4282439553645
Mean squared error:  9146791.796099951
Root mean squared error:  3024.3663462120376
R2 score:  0.6304172867187667
Max error:  14103.499452012355 

 *************************
BaggingRegressor(n_jobs=-1) 
 *************************

Model score on train dataset is: 0.7324764659268741
Mean absolute error:  2239.8968038397998
Mean squared error:  9348051.690153582
Root mean squared error:  3057.4583709600333
R2 score:  0.6222852356808553
Max error:  17930.23 

 *************************
XGBRegressor(base_score=None, boo