In [152]:
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [122]:
df_original = pd.read_csv("/content/drive/My Drive/train.csv")

In [123]:
df_original.head(6)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227


In [124]:
df_original.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [125]:
df = df_original.dropna().reset_index()
df.head(6)

Unnamed: 0,index,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
1,6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
2,13,1000005,P00145042,M,26-35,20,A,1,1,1,2.0,5.0,15665
3,14,1000006,P00231342,F,51-55,9,A,1,0,5,8.0,14.0,5378
4,16,1000006,P0096642,F,51-55,9,A,1,0,2,3.0,4.0,13055
5,18,1000007,P00036842,M,36-45,1,B,1,1,1,14.0,16.0,11788


In [126]:
df = df.drop(columns=['index','User_ID', 'Product_ID'])

In [127]:
df.head(6)

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,F,0-17,10,A,2,0,1,6.0,14.0,15200
1,M,46-50,7,B,2,1,1,8.0,17.0,19215
2,M,26-35,20,A,1,1,1,2.0,5.0,15665
3,F,51-55,9,A,1,0,5,8.0,14.0,5378
4,F,51-55,9,A,1,0,2,3.0,4.0,13055
5,M,36-45,1,B,1,1,1,14.0,16.0,11788


Some features are categorical so we need to encode them in order for these features to be meaningful in our model. Some columns are already transformed by the people who prepared the data. These columns are Occupation, Product_Category_1, Product_Category_2 and Product_Category_3 columns. The columns that need encoding are Gender, Age, City_Category, Stay_In_Current_City_Years and Marital_Status. Stay_In_Current_City_Years is transformed as after 4 years it is stated as 4+ which indicates that there is no meaningful difference after 4+ years.  

In [128]:
from sklearn.preprocessing import OneHotEncoder

In [129]:
encoder = OneHotEncoder(sparse_output=False)

transformed_data = encoder.fit_transform(
    df[
        [
            "Gender",
            "Age",
            "City_Category",
            "Stay_In_Current_City_Years",
            "Marital_Status",
        ]
    ]
)

onehot_df = pd.DataFrame(
    transformed_data,
    columns=encoder.get_feature_names_out(
        [
            "Gender",
            "Age",
            "City_Category",
            "Stay_In_Current_City_Years",
            "Marital_Status",
        ]
    ),
)

encoded_df = pd.concat([onehot_df, df], axis=1).drop(
    ["Gender", "Age", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"],
    axis=1,
)

In [130]:
encoded_df.head(6)

Unnamed: 0,Gender_F,Gender_M,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_A,...,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+,Marital_Status_0,Marital_Status_1,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,10,1,6.0,14.0,15200
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,7,1,8.0,17.0,19215
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,20,1,2.0,5.0,15665
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,9,5,8.0,14.0,5378
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,9,2,3.0,4.0,13055
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,1,14.0,16.0,11788


In [131]:
from sklearn.model_selection import train_test_split

X = encoded_df.drop(columns=["Purchase"])
y = encoded_df["Purchase"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

In [132]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=1)

rf.fit(X_train, y_train)

In [133]:
y_pred = rf.predict(X_test)

In [134]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)

In [135]:
mean_purchase = df['Purchase'].mean()
std_purchase = df['Purchase'].std()

In [136]:
print("Mean Purchase Amount:")
print(mean_purchase)
print('Standart Deviation of Purchase Amount:')
print(std_purchase)
print("Mean Absolute Error for Purchase Amount of Customers With Random Forest Regression Model:")
print(mae)

Mean Purchase Amount:
11658.114979528957
Standart Deviation of Purchase Amount:
5082.28795904627
Mean Absolute Error for Purchase Amount of Customers With Random Forest Regression Model:
2864.5216135382925


The MAE for the Random Forest Regression Model is around 2800 and the ratios between the mae-mean and mae-std are 0.246 and 0.564 respectively. The error is not too bad but the ratios show that there is still room for improvement. Replacing the NaN values with the mean of the respective column can be a step for improvement. This will preserve the original data size which means that the model will be trained with more data. More data usually leads to better predictions.

In [137]:
def get_random_forest_mae(X_trn, X_tst, y_trn, y_tst):
    mdlRfsMlb = RandomForestRegressor(random_state=1)
    mdlRfsMlb.fit(X_trn, y_trn)
    y_tst_prd = mdlRfsMlb.predict(X_tst)
    mae = mean_absolute_error(y_tst, y_tst_prd)
    return (mae)

In [140]:
df_original.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [141]:
df_original.shape

(550068, 12)

If we drop the NaN containing columns we drop more than half of the existing data whihc significantly affects model performance. Since the NaN values are only in two columns we can try removing those or fill them with mean values.

In [144]:
df_no_nan = df_original.drop(columns=['User_ID','Product_ID','Product_Category_2','Product_Category_3'])

In [147]:
encoder = OneHotEncoder(sparse_output=False)

transformed_data = encoder.fit_transform(
    df[
        [
            "Gender",
            "Age",
            "City_Category",
            "Stay_In_Current_City_Years",
            "Marital_Status",
        ]
    ]
)

onehot_df = pd.DataFrame(
    transformed_data,
    columns=encoder.get_feature_names_out(
        [
            "Gender",
            "Age",
            "City_Category",
            "Stay_In_Current_City_Years",
            "Marital_Status",
        ]
    ),
)

encoded_df = pd.concat([onehot_df, df_no_nan], axis=1).drop(
    ["Gender", "Age", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"],
    axis=1,
)

In [148]:
X = encoded_df.drop(columns=["Purchase"])
y = encoded_df["Purchase"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

In [149]:
print("MAE with columns dropped:")
print(get_random_forest_mae(X_train, X_test, y_train, y_test))

MAE with columns dropped:
2315.4527656802343


With this approach, the MAE is lower so the model is improved. Now lets try with filling NaN values with the mean.

In [154]:
df_na_filled = df_original
df_na_filled["Product_Category_2"].fillna(
    df_na_filled["Product_Category_2"].mean(), inplace=True
)
df_na_filled["Product_Category_3"].fillna(
    df_na_filled["Product_Category_2"].mean(), inplace=True
)

In [155]:
df_na_filled = df_original.drop(columns=['User_ID','Product_ID'])

In [157]:
encoder = OneHotEncoder(sparse_output=False)

transformed_data = encoder.fit_transform(
    df[
        [
            "Gender",
            "Age",
            "City_Category",
            "Stay_In_Current_City_Years",
            "Marital_Status",
        ]
    ]
)

onehot_df = pd.DataFrame(
    transformed_data,
    columns=encoder.get_feature_names_out(
        [
            "Gender",
            "Age",
            "City_Category",
            "Stay_In_Current_City_Years",
            "Marital_Status",
        ]
    ),
)

encoded_df = pd.concat([onehot_df, df_na_filled], axis=1).drop(
    ["Gender", "Age", "City_Category", "Stay_In_Current_City_Years", "Marital_Status"],
    axis=1,
)

In [158]:
X = encoded_df.drop(columns=["Purchase"])
y = encoded_df["Purchase"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

In [160]:
print("MAE with NaN Values Filled:")
print(get_random_forest_mae(X_train, X_test, y_train, y_test))

MAE with NaN Values Filled:
2258.0843538305726


The error is even lower with mean imputation. This is the final dataset and model that should be used.