In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Baseline

### Random Guessing

In [2]:
# Load the e-commerce churn data, using the customer id as the row index.
df = pd.read_csv("data/ECommerceDataset.csv", index_col="CustomerID")

# Target is the `Churn` column; every other column is a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4504, 18), (1126, 18), (4504,), (1126,))

In [3]:
from sklearn.metrics import accuracy_score

In [4]:
# Random-guess baseline: predict 0 or 1 uniformly at random for every row.
# A seeded generator makes the reported accuracies reproducible across runs;
# drawing from the unseeded global RNG gives different numbers on every execution.
rng = np.random.default_rng(42)
random_pred_train = rng.choice([0, 1], size=len(y_train))
random_pred_test = rng.choice([0, 1], size=len(y_test))

train_acc = accuracy_score(y_train, random_pred_train)
test_acc = accuracy_score(y_test, random_pred_test)
print(train_acc, test_acc)

0.4928952042628774 0.4955595026642984


### Threshold

In [5]:
# Count how many rows carry each value of the Complain flag.
df["Complain"].value_counts()

0    4026
1    1604
Name: Complain, dtype: int64

Asumsi awal: semua pelanggan yang complain akan churn, dan semua pelanggan yang tidak complain tidak akan churn.<br>
1 : complain<br>
0 : no complain

In [6]:
# Rule-based baseline: the predicted churn label is the Complain flag itself
# (0 -> 0, 1 -> 1); no model is fitted.
complain_to_churn = {0: 0, 1: 1}
pred_train = X_train["Complain"].map(complain_to_churn)
pred_test = X_test["Complain"].map(complain_to_churn)

train_acc, test_acc = (
    accuracy_score(y_train, pred_train),
    accuracy_score(y_test, pred_test),
)
print(train_acc, test_acc)

0.7291296625222025 0.7193605683836589


# Benchmark

In [7]:
# Import data: reload the raw CSV (CustomerID as index) and preview the first rows.
csv_path = "data/ECommerceDataset.csv"
df = pd.read_csv(csv_path, index_col="CustomerID")
df.head()

Unnamed: 0_level_0,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,160
50002,1,,Phone,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,121
50003,1,,Phone,1,30.0,Debit Card,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120
50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134
50005,1,0.0,Phone,1,12.0,CC,Male,,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,130


### Mini EDA

In [8]:
# Share of missing values per column, printed as a rounded percentage.
for column_name in df.columns:
    missing_share = df[column_name].isnull().mean()
    print('{}: {}%'.format(column_name, round(missing_share * 100)))

Churn: 0.0%
Tenure: 5.0%
PreferredLoginDevice: 0.0%
CityTier: 0.0%
WarehouseToHome: 4.0%
PreferredPaymentMode: 0.0%
Gender: 0.0%
HourSpendOnApp: 5.0%
NumberOfDeviceRegistered: 0.0%
PreferedOrderCat: 0.0%
SatisfactionScore: 0.0%
MaritalStatus: 0.0%
NumberOfAddress: 0.0%
Complain: 0.0%
OrderAmountHikeFromlastYear: 5.0%
CouponUsed: 5.0%
OrderCount: 5.0%
DaySinceLastOrder: 5.0%
CashbackAmount: 0.0%


Tiap kolom hanya memiliki sekitar 4–5% missing values, jadi nanti kita impute di dalam column transformer.

In [9]:
# Class balance: relative frequency of each Churn label.
df["Churn"].value_counts().div(len(df))

0    0.831616
1    0.168384
Name: Churn, dtype: float64

Data tidak seimbang (imbalanced): sekitar 83% kelas 0 dan 17% kelas 1, sehingga untuk sementara kita gunakan F1 score sebagai metrik evaluasi.

### Simple Model

In [10]:
# Dataset Splitting
# Target is the `Churn` column; every other column is a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4504, 18), (1126, 18), (4504,), (1126,))

In [11]:
# Show the feature column names in X (the frame with Churn dropped).
X.columns

Index(['Tenure', 'PreferredLoginDevice', 'CityTier', 'WarehouseToHome',
       'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
       'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
       'MaritalStatus', 'NumberOfAddress', 'Complain',
       'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
       'DaySinceLastOrder', 'CashbackAmount'],
      dtype='object')

In [24]:
# Preprocessor
# Columns are grouped by how they are treated rather than by dtype:
# CityTier and Complain are routed to the categorical branch and one-hot encoded.
numeric_columns = [
    "Tenure", "WarehouseToHome", "HourSpendOnApp", "NumberOfDeviceRegistered",
    "SatisfactionScore", "NumberOfAddress", "OrderAmountHikeFromlastYear",
    "CouponUsed", "OrderCount", "DaySinceLastOrder", "CashbackAmount",
]
categoric_columns = [
    "PreferredLoginDevice", "CityTier", "PreferredPaymentMode", "Gender",
    "PreferedOrderCat", "MaritalStatus", "Complain",
]

# Numeric branch: fill gaps with the column mean, then rescale with min-max.
# KNN is distance-based, so features should share the same order of magnitude.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='mean')),
    ("scaler", MinMaxScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent when the encoder
# was fitted as all zeros instead of raising, which can otherwise happen inside
# a cross-validation fold or on the test split.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, numeric_columns),
    ('categoric', categorical_pipeline, categoric_columns),
])


In [25]:
# Pipeline
# Chain the preprocessing step and a KNN classifier into a single estimator,
# so both are fitted together on whatever data the pipeline is given.
pipeline = Pipeline(steps=[
    ("prep", preprocessor),
    ("algo", KNeighborsClassifier()),
])

In [26]:
# Parameter Tuning
# Keys use the "<step>__<param>" form so they address the "algo" pipeline step.
parameter = dict(
    algo__n_neighbors=np.arange(1, 51, 2),  # odd neighbour counts only: 1, 3, ..., 49
    algo__weights=['uniform', 'distance'],  # equal votes vs. distance-weighted votes
    algo__p=[1, 2],                         # distance metric: 1 = Manhattan, 2 = Euclidean
)

In [27]:
# Exhaustive search over the parameter grid, scored by F1 with 3-fold CV
# and run on all available cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

# Evaluation: best parameter combination, then F1 on the training split,
# the best mean cross-validation F1, and F1 on the held-out test split.
print(model.best_params_)
train_f1 = model.score(X_train, y_train)
cv_f1 = model.best_score_
test_f1 = model.score(X_test, y_test)
print(train_f1, cv_f1, test_f1)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.1min finished


{'algo__n_neighbors': 1, 'algo__p': 1, 'algo__weights': 'uniform'}
1.0 0.7382693273031599 0.8041775456919059


Oke, hasilnya cukup overfit: F1 score di data train 1.0, sedangkan di data test sekitar 0.80, sementara best model score (F1 hasil cross-validation) kita di angka sekitar 73,8%. Next, kita bisa gunakan beberapa metric evaluasi lain serta menambahkan polynomial feature, class_weight, transform, atau mencoba beberapa algoritma lainnya.