In [None]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Data

In [None]:
# Load the training data into `df`.
# On Google Colab the CSV is read from the mounted Drive. Anywhere else the
# `google.colab` module is missing, so fall back to a `train.csv` in the
# working directory instead of failing on the import. This lets the notebook
# run top to bottom in both environments without hand-editing this cell.
try:
    from google.colab import drive
except ImportError:
    df = pd.read_csv("train.csv")
else:
    drive.mount('/content/drive')
    df = pd.read_csv('/content/drive/train.csv')

In [48]:
# Separate the prediction target from the candidate features.
y = df[["Purchase"]]  # target, kept as a one-column DataFrame
x = df.drop(columns=["Purchase"])  # every other column

Print y

In [42]:
# Show the target as the cell output (bare last expression).
# NOTE(review): the saved output is a NumPy array, while `y` is built with
# `df.loc[:, ["Purchase"]]` (a DataFrame) — looks like a stale result from an
# out-of-order run; confirm by re-running on a fresh kernel.
y

array([[ 8370],
       [15200],
       [ 1422],
       ...,
       [  137],
       [  365],
       [  490]])

Print x

In [41]:
# Show the feature table as the cell output (bare last expression).
x

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000001,P00069042,F,0-17,10,A,2,0,3,,
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,,
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,
4,1000002,P00285442,M,55+,16,C,4+,0,8,,
...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,,
550064,1006035,P00375436,F,26-35,1,C,3,0,20,,
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,,
550066,1006038,P00375436,F,55+,1,C,2,0,20,,


In [63]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

from sklearn.ensemble import RandomForestRegressor

# Function for building and scoring Random Forest models
def get_random_forest_mae(X_trn, X_tst, y_trn, y_tst):
    """Fit a default random forest and return its mean absolute error on the test set.

    Parameters
    ----------
    X_trn, X_tst
        Training and test features, passed unchanged to ``fit`` and ``predict``.
    y_trn, y_tst
        Training and test targets. May be a DataFrame, Series or ndarray of
        any shape; both are flattened to 1-D before use.

    Returns
    -------
    The value of ``mean_absolute_error(y_tst, predictions)``.
    """
    # np.ravel works on DataFrames, Series and ndarrays alike. The `.ravel()`
    # method does not exist on a DataFrame, which is what `y` is when it is
    # built with `df.loc[:, ["Purchase"]]`.
    y_trn = np.ravel(y_trn)
    y_tst = np.ravel(y_tst)

    # Fixed seed so repeated runs give the same forest.
    mdlRfsMlb = RandomForestRegressor(random_state=1)
    mdlRfsMlb.fit(X_trn, y_trn)
    y_tst_prd = mdlRfsMlb.predict(X_tst)
    return mean_absolute_error(y_tst, y_tst_prd)


Building a model

## With the split above you cannot build a model yet, since some of the columns are non-numerical

Extracting numerical features

In [64]:
# Select numeric features
# Keep only the columns whose dtype is int64 or float64.
cols_num = [
    name
    for name, dtype in x.dtypes.items()
    if dtype in ['int64', 'float64']
]
Xnum = x[cols_num]

# Split the numeric-only table with the same sizes and seed as the full-feature split.
Xnum_train, Xnum_test, y_train, y_test = train_test_split(
    Xnum,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [65]:
# Baseline: random forest on the numeric columns exactly as they come out of
# the split (no imputation has been applied to them at this point).
get_random_forest_mae(Xnum_train, Xnum_test, y_train, y_test)

np.float64(2171.9411444677853)

Filling the empty cells with imputation

In [67]:
# Forward fill: each missing value takes the last non-missing value above it
# in the same column. `.ffill()` replaces `fillna(method='ffill')`, whose
# `method` argument is deprecated in pandas.
# NOTE(review): a column whose first row is missing keeps that gap after a
# forward fill — confirm the splits do not start with missing values.
Xnum_train_repnull = Xnum_train.ffill()
Xnum_test_repnull = Xnum_test.ffill()

print('MAE from Approach 2 (Replace missing values with forward fill):')
print(get_random_forest_mae(Xnum_train_repnull, Xnum_test_repnull, y_train, y_test))

MAE from Approach 2 (Replace missing values with forward fill):
2272.0102878011644


In [69]:
# Column means come from the training split only and are reused for the test
# split, so no test-set information goes into the fill values.
train_col_means = Xnum_train.mean()
Xnum_train_repnull = Xnum_train.fillna(train_col_means)
Xnum_test_repnull = Xnum_test.fillna(train_col_means)

print('MAE from Approach 2 (Replace missing values with mean):')
print(
    get_random_forest_mae(
        Xnum_train_repnull,
        Xnum_test_repnull,
        y_train,
        y_test,
    )
)


MAE from Approach 2 (Replace missing values with mean):
2193.456214200903


In [71]:
# Copy the imputed numeric columns back into the full-feature splits, in place.
# The assignment aligns rows on the index. x_train/x_test and
# Xnum_train/Xnum_test come from separate train_test_split calls that use the
# same sizes and random_state=1 — presumably the same rows; verify.
# NOTE(review): which imputation ends up here depends on the last cell that
# assigned Xnum_*_repnull (forward fill or mean) — confirm run order.
x_train[cols_num]=Xnum_train_repnull[cols_num]
x_test[cols_num]=Xnum_test_repnull[cols_num]

# Non-numeric values: Product_ID is removed since it is unnecessary for prediction

In [79]:
# Collect the object-dtype columns, then take the product identifier out of the list.
cols_obj = list(filter(lambda name: x[name].dtype == 'object', x.columns))
cols_obj.remove("Product_ID")
cols_obj


['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']

# Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so x_train / x_test keep their original text values.
Xle_train = x_train.copy()
Xle_test = x_test.copy()

# For each non-numeric column: learn the label -> integer mapping from the
# training split, then apply that same mapping to both splits.
label_encoder = LabelEncoder()
for col in cols_obj:
    label_encoder.fit(x_train[col])
    Xle_train[col] = label_encoder.transform(x_train[col])
    Xle_test[col] = label_encoder.transform(x_test[col])

Now we keep only the categorical columns. This step is not strictly necessary here, because once Product_ID is removed the remaining non-numerical columns are all categorical. Nevertheless:

In [84]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns: object dtype with fewer than 10 distinct values.
cols_cat = [
    name
    for name, dtype in x.dtypes.items()
    if dtype == 'object' and x[name].nunique() < 10
]

# Encode into copies so the un-encoded splits stay available.
Xle_train, Xle_test = x_train.copy(), x_test.copy()

# The mapping is fitted on the training values and reused for the test values.
label_encoder = LabelEncoder()
for col in cols_cat:
    train_codes = label_encoder.fit_transform(x_train[col])
    test_codes = label_encoder.transform(x_test[col])
    Xle_train[col], Xle_test[col] = train_codes, test_codes


In [85]:
# Score the random forest on the numeric columns plus the label-encoded categorical ones.
feature_cols = cols_num + cols_cat
mae = get_random_forest_mae(
    Xle_train[feature_cols],
    Xle_test[feature_cols],
    y_train,
    y_test,
)
print("MAE from Label Encoding all Categorical columns:")
print(mae)

MAE from Label Encoding all Categorical columns:
2154.210619732392


## Building Gradient Boosted Model

In [86]:
from xgboost import XGBRegressor

# Build and score a Gradient Boosting model with default hyperparameters,
# using the same numeric + label-encoded columns as the random forest.
mdlXgbMlb = XGBRegressor()
mdlXgbMlb.fit(Xle_train[cols_num + cols_cat], y_train)
y_test_pred = mdlXgbMlb.predict(Xle_test[cols_num + cols_cat])
# Arguments are (y_true, y_pred), matching the scikit-learn signature and the
# call inside get_random_forest_mae.
mae = mean_absolute_error(y_test, y_test_pred)

print("MAE from default XGBoost model:")
print(mae)

MAE from default XGBoost model:
2090.9818629360047


In [87]:
# Build and score a Gradient Boosting model with hand-picked hyperparameters:
# many trees (5000) with a small learning rate (0.01) and depth capped at 5.
# NOTE(review): the saved outputs show this configuration at MAE 2100.40
# against 2090.98 for the default model, so it did not improve on the defaults.
mdlXgbMlb = XGBRegressor(n_estimators=5000, learning_rate=0.01, max_depth=5)
mdlXgbMlb.fit(Xle_train[cols_num + cols_cat], y_train)
y_test_pred = mdlXgbMlb.predict(Xle_test[cols_num + cols_cat])
# Arguments are (y_true, y_pred), matching the scikit-learn signature and the
# call inside get_random_forest_mae.
mae = mean_absolute_error(y_test, y_test_pred)

print("MAE from tuned XGBoost model:")
print(mae)

MAE from tuned XGBoost model:
2100.3951026451473


# Takeaways

* Very simple datasets consist only of integer and floating-point columns; for those we do not need label encoding
* As the data becomes more complex and includes non-numerical values, we need to label-encode and categorize them. The most important part of this is picking columns with low granularity, meaning the column should take only a small number of distinct values across the entries
* To deal with missing values we simply filled them with the column mean
* After that we used a Gradient Boosted Model, which produced a lower MAE on this data