In [11]:
# Importing the libraries
%matplotlib inline

from sklearn.externals import joblib
import os

import pandas as pd
import numpy as np

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.ensemble import RandomForestRegressor

# Problem definition

Apply regression models to predict the amount of purchase

# Load the data

In [12]:
# Loading the data
df_original = pd.read_csv('data/black_friday_processed_30K.csv')
df = df_original.copy()
print(df.columns)
df.head()

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')


Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,0,0,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6,14,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,0,0,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14,0,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,0,0,7969


In [13]:
df.dtypes

User_ID                        int64
Product_ID                    object
Gender                        object
Age                           object
Occupation                     int64
City_Category                 object
Stay_In_Current_City_Years    object
Marital_Status                 int64
Product_Category_1             int64
Product_Category_2             int64
Product_Category_3             int64
Purchase                       int64
dtype: object

# Feature Engineering 

In [14]:
# Removing unnecessary columns
del df['User_ID']
del df['Product_ID']
del df['Product_Category_2']
del df['Product_Category_3']

# Transforming the categorical columns to numerical
for col in ['Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1']:
     df_dummies = pd.get_dummies(df[col], prefix=col)
     df = pd.concat([df, df_dummies], axis=1)
     # Removing the original columns
     del df[col]
df.head()

Unnamed: 0,Purchase,Gender_F,Gender_M,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,...,Product_Category_1_9,Product_Category_1_10,Product_Category_1_11,Product_Category_1_12,Product_Category_1_13,Product_Category_1_14,Product_Category_1_15,Product_Category_1_16,Product_Category_1_17,Product_Category_1_18
0,8370,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15200,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1422,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1057,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,7969,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df.dtypes

Purchase                         int64
Gender_F                         uint8
Gender_M                         uint8
Age_0-17                         uint8
Age_18-25                        uint8
Age_26-35                        uint8
Age_36-45                        uint8
Age_46-50                        uint8
Age_51-55                        uint8
Age_55+                          uint8
Occupation_0                     uint8
Occupation_1                     uint8
Occupation_2                     uint8
Occupation_3                     uint8
Occupation_4                     uint8
Occupation_5                     uint8
Occupation_6                     uint8
Occupation_7                     uint8
Occupation_8                     uint8
Occupation_9                     uint8
Occupation_10                    uint8
Occupation_11                    uint8
Occupation_12                    uint8
Occupation_13                    uint8
Occupation_14                    uint8
Occupation_15            

In [16]:
# Selecting the columns
X_columns = [x for x in df.columns if x != 'Purchase' and df.loc[:,x].dtype != object]
y_column = ['Purchase']
list(X_columns)

['Gender_F',
 'Gender_M',
 'Age_0-17',
 'Age_18-25',
 'Age_26-35',
 'Age_36-45',
 'Age_46-50',
 'Age_51-55',
 'Age_55+',
 'Occupation_0',
 'Occupation_1',
 'Occupation_2',
 'Occupation_3',
 'Occupation_4',
 'Occupation_5',
 'Occupation_6',
 'Occupation_7',
 'Occupation_8',
 'Occupation_9',
 'Occupation_10',
 'Occupation_11',
 'Occupation_12',
 'Occupation_13',
 'Occupation_14',
 'Occupation_15',
 'Occupation_16',
 'Occupation_17',
 'Occupation_18',
 'Occupation_19',
 'Occupation_20',
 'City_Category_A',
 'City_Category_B',
 'City_Category_C',
 'Stay_In_Current_City_Years_0',
 'Stay_In_Current_City_Years_1',
 'Stay_In_Current_City_Years_2',
 'Stay_In_Current_City_Years_3',
 'Stay_In_Current_City_Years_4+',
 'Marital_Status_0',
 'Marital_Status_1',
 'Product_Category_1_1',
 'Product_Category_1_2',
 'Product_Category_1_3',
 'Product_Category_1_4',
 'Product_Category_1_5',
 'Product_Category_1_6',
 'Product_Category_1_7',
 'Product_Category_1_8',
 'Product_Category_1_9',
 'Product_Category

# Model Training

In [17]:
# Spliting the data using sklearn train_test_split
threshold = 0.8
X = df[X_columns]
y = df[y_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)


X_train (24000, 58)
y_train (24000, 1)
X_test (6000, 58)
y_test (6000, 1)


# Model Evaluation

In [18]:
# Using the chosen regression model
model = RandomForestRegressor(100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


# Data App Files Creation

In [19]:
# Creating the model file
joblib.dump(model, 'model/model.joblib')

['model/model.joblib']

In [20]:
# Creating the X_columns file
joblib.dump(X_columns, 'model/X_columns.joblib')

['model/X_columns.joblib']