### 🖋 **Notebook Contents**

0. Initial Setup
1. Feature Engineering
2. Modelling
    - Benchmark Model
    - Hyperparameter Tuning
    - Final Model
3. Cluster Analysis
4. Conclusion
5. Recommendation

****

## `Initial Setup`

In [25]:
import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, RobustScaler, StandardScaler
import category_encoders as ce

# mlflow and dagshub tracking
import mlflow
import dagshub

# Make the project's ../src/ directory importable, then resolve the MLflow experiment id by name.
# NOTE(review): this lookup runs before the next cell sets MLFLOW_TRACKING_URI — confirm that
# get_experiment_id targets the DagsHub tracking server rather than a default local store.
sys.path.append(os.path.abspath("../src/"))
from get_or_create_mlflow_experiments import get_experiment_id
EXP_ID = get_experiment_id('Credit_Card_Clustering') # experiment id, passed to mlflow.start_run(experiment_id=...) in the modelling cell

# model
from sklearn.cluster import KMeans, MeanShift # centroid-based
from sklearn.cluster import DBSCAN, OPTICS # density-based
from sklearn.cluster import AgglomerativeClustering # hierarchical-based
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import Birch
from hdbscan import HDBSCAN # hierarchical density-based

# metric evaluation
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the imported libraries — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [26]:
# DagsHub / MLflow tracking configuration.
# SECURITY: the access token must never be written into the notebook. It is read from the
# DAGSHUB_TOKEN environment variable instead (export it in the shell that starts Jupyter).
# Any token that was previously committed in this cell should be revoked on DagsHub.
DAGSHUB_USERNAME = 'fnkhairudin'
DAGSHUB_EMAIL = 'fnkhairudin@gmail.com'
DAGSHUB_REPO = 'Credit-Card-Clustering-Customer-Segmentation'
DAGSHUB_TOKEN = os.environ.get('DAGSHUB_TOKEN')  # None when the variable is not set
# The (misspelled) name DASGHUB_URI is kept because other cells reference it.
DASGHUB_URI = f'https://dagshub.com/{DAGSHUB_USERNAME}/{DAGSHUB_REPO}.mlflow'

# MLflow reads its tracking endpoint and HTTP basic-auth credentials from these variables.
os.environ['MLFLOW_TRACKING_USERNAME'] = DAGSHUB_USERNAME
os.environ['MLFLOW_TRACKING_URI'] = DASGHUB_URI
if DAGSHUB_TOKEN:
    os.environ['MLFLOW_TRACKING_PASSWORD'] = DAGSHUB_TOKEN
else:
    # Leave any pre-existing MLFLOW_TRACKING_PASSWORD untouched and tell the reader what is missing.
    print('DAGSHUB_TOKEN is not set: export it before running the tracking cells, '
          'otherwise requests to the tracking server may be rejected.')

In [27]:
# Read the processed CSV into a DataFrame, then preview ten randomly chosen rows
CLEAN_DATA_PATH = "../data/processed/clean_cc_data.csv"
data = pd.read_csv(CLEAN_DATA_PATH)
data.sample(n=10)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
4160,71.345309,1.0,1166.76,0.0,1166.76,0.0,1.0,0.0,1.0,0.0,0.0,12.0,3000.0,1137.865785,162.610445,0.75,12.0
1959,5715.854635,1.0,2173.99,1556.19,617.8,0.0,0.666667,0.583333,0.5,0.0,0.0,33.0,8500.0,1828.965006,1313.883915,0.0,12.0
8158,2.668569,0.333333,0.0,0.0,0.0,91.849463,0.0,0.0,0.0,0.111111,2.0,0.0,1200.0,288.42282,45.299293,1.0,9.0
8267,1929.069939,1.0,0.0,0.0,0.0,973.827667,0.0,0.0,0.0,0.166667,2.0,0.0,2500.0,786.014779,513.327849,0.0,12.0
2138,1571.382204,1.0,1849.26,1616.76,232.5,0.0,1.0,1.0,0.5,0.0,0.0,19.0,7000.0,1522.895942,414.775083,0.0,12.0
8684,1929.555023,1.0,621.11,621.11,0.0,6691.286753,0.333333,0.333333,0.0,0.666667,29.0,5.0,6000.0,2406.905963,549.02708,0.0,12.0
5191,833.541256,1.0,0.0,0.0,0.0,1422.637361,0.0,0.0,0.0,0.333333,8.0,0.0,2500.0,180.104404,282.085369,0.0,12.0
2519,1890.946618,1.0,0.0,0.0,0.0,4561.090005,0.0,0.0,0.0,0.25,7.0,0.0,2500.0,4847.513095,507.635447,0.090909,12.0
6675,1438.959962,1.0,242.01,122.01,120.0,36.887704,0.416667,0.083333,0.333333,0.083333,1.0,6.0,1500.0,318.521195,432.876404,0.0,12.0
3689,2784.364948,1.0,1178.31,1178.31,0.0,0.0,0.583333,0.583333,0.0,0.0,0.0,8.0,8500.0,4188.764712,776.323383,0.0,12.0


In [28]:
print(f'Jumlah baris dan kolom: {data.shape[0]} baris dan {data.shape[1]} kolom')

# Per-column overview of the dataset: dtype, share of nulls, cardinality, presence of
# zero / negative values, value range and a sample of the distinct values.
# The negative-value check is limited to numeric columns; is_numeric_dtype is used so that
# every numeric width (int32, float32, ...) is covered, not only the platform-default ones.
pd.DataFrame(
    {
        'kolom': data.columns.values,
        'data_type': data.dtypes.values,
        'null_value(%)': data.isna().mean().values * 100,
        'n_unique': data.nunique().values,
        'zero_value': [bool((data[col] == 0).any()) for col in data.columns],
        'neg_value': [
            pd.api.types.is_numeric_dtype(data[col]) and bool((data[col] < 0).any())
            for col in data.columns
        ],
        'min': data.min().values,
        'max': data.max().values,
        'sample_unique': [data[col].unique() for col in data.columns],
    }
)

Jumlah baris dan kolom: 8950 baris dan 17 kolom


Unnamed: 0,kolom,data_type,null_value(%),n_unique,zero_value,neg_value,min,max,sample_unique
0,BALANCE,float64,0.0,8871,True,False,0.0,19043.13856,"[40.900749, 3202.467416, 2495.148862, 1666.670..."
1,BALANCE_FREQUENCY,float64,0.0,43,True,False,0.0,1.0,"[0.818182, 0.909091, 1.0, 0.636364, 0.545455, ..."
2,PURCHASES,float64,0.0,6203,True,False,0.0,49039.57,"[95.4, 0.0, 773.17, 1499.0, 16.0, 1333.28, 709..."
3,ONEOFF_PURCHASES,float64,0.0,4014,True,False,0.0,40761.25,"[0.0, 773.17, 1499.0, 16.0, 6402.63, 661.49, 1..."
4,INSTALLMENTS_PURCHASES,float64,0.0,4452,True,False,0.0,22500.0,"[95.4, 0.0, 1333.28, 688.38, 436.2, 200.0, 920..."
5,CASH_ADVANCE,float64,0.0,4323,True,False,0.0,47137.21176,"[0.0, 6442.945483, 205.788017, 346.81139, 2301..."
6,PURCHASES_FREQUENCY,float64,0.0,47,True,False,0.0,1.0,"[0.166667, 0.0, 1.0, 0.083333, 0.666667, 0.333..."
7,ONEOFF_PURCHASES_FREQUENCY,float64,0.0,47,True,False,0.0,1.0,"[0.0, 1.0, 0.083333, 0.166667, 0.25, 0.916667,..."
8,PURCHASES_INSTALLMENTS_FREQUENCY,float64,0.0,47,True,False,0.0,1.0,"[0.083333, 0.0, 0.583333, 1.0, 0.25, 0.916667,..."
9,CASH_ADVANCE_FREQUENCY,float64,0.0,47,True,False,0.0,1.0,"[0.0, 0.25, 0.083333, 0.166667, 0.333333, 0.36..."


In [29]:
# Dataset used for modelling: an independent deep copy, so edits to it do not touch `data`
data_modelling = data.copy(deep=True)

## `Feature Engineering (Experiment)`

In [30]:
# Preview ten random rows of the modelling dataset
data_modelling.sample(n=10)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
6654,1299.554504,1.0,0.0,0.0,0.0,3955.560549,0.0,0.0,0.0,0.5,11.0,0.0,2500.0,2084.109173,267.774368,0.166667,8.0
8082,2676.253141,0.909091,0.0,0.0,0.0,2832.570661,0.0,0.0,0.0,0.083333,2.0,0.0,3000.0,0.0,3356.729389,0.0,12.0
677,983.339069,1.0,5601.54,3073.82,2527.72,36.336457,1.0,0.75,1.0,0.083333,1.0,182.0,7500.0,4807.552996,207.134142,0.166667,12.0
1509,75.012641,0.545455,480.02,480.02,0.0,0.0,0.083333,0.083333,0.0,0.0,0.0,1.0,1950.0,485.738057,86.502541,0.0,12.0
8351,26.143768,0.363636,0.0,0.0,0.0,490.493,0.0,0.0,0.0,0.090909,4.0,0.0,1000.0,1132.076982,42.063009,0.333333,11.0
2222,684.658511,1.0,184.68,0.0,184.68,0.0,1.0,0.0,1.0,0.0,0.0,12.0,2900.0,2837.204108,350.27437,0.4,12.0
6615,1100.505764,1.0,220.0,220.0,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,2.0,1200.0,344.856423,623.734943,0.0,12.0
2144,139.449391,0.727273,843.03,0.0,843.03,0.0,0.583333,0.0,0.583333,0.0,0.0,7.0,2200.0,1312.375596,179.548143,0.0,12.0
3952,174.214883,0.363636,0.0,0.0,0.0,956.089092,0.0,0.0,0.0,0.083333,1.0,0.0,1200.0,238.06524,200.254118,0.0,12.0
7327,24.660951,0.363636,0.0,0.0,0.0,398.729042,0.0,0.0,0.0,0.090909,1.0,0.0,500.0,757.357928,132.251286,0.5,11.0


In [31]:
# Continuous columns (the ones rescaled in the feature-engineering step)
cont_column = [
    'BALANCE',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
]

In [41]:
# One instance of each candidate scaler; only the min-max scaler is applied in this cell
ss, rs, mm = StandardScaler(), RobustScaler(), MinMaxScaler()

# Fit the min-max scaler on the columns listed in `cont_column` and write the scaled values
# back in place; every other column of `data_modelling` is left as it is.
# NOTE(review): columns outside `cont_column` (e.g. PURCHASES_TRX, TENURE) keep their raw
# scale — confirm this is intended for the distance-based models in this experiment.
scaled_values = mm.fit_transform(data_modelling[cont_column])
data_modelling[cont_column] = scaled_values
data_modelling.sample(n=10)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
3188,0.001168,1.0,0.004818,0.005797,0.0,0.0,0.833333,0.833333,0.0,0.0,0.0,10.0,0.131886,0.005096,0.002009,0.545455,12.0
4140,0.971245,1.0,0.107837,0.089725,0.072488,0.0,1.0,0.583333,1.0,0.0,0.0,76.0,0.732888,0.083715,0.055324,0.0,12.0
4299,0.261937,1.0,0.174987,0.113854,0.175131,0.004783,1.0,0.666667,1.0,0.083333,1.0,94.0,0.348915,0.03099,0.014838,0.0,12.0
2901,0.001467,0.727273,0.004199,0.0,0.009152,0.0,0.666667,0.0,0.666667,0.0,0.0,24.0,0.038397,0.006452,0.001443,0.181818,12.0
654,0.0,0.0,0.0,0.0,0.0,0.006595,0.0,0.0,0.0,0.083333,1.0,0.0,0.298831,0.0,0.001548,0.0,12.0
4107,0.002404,0.636364,0.000245,0.0,0.000533,0.003525,1.0,0.0,0.916667,0.083333,3.0,12.0,0.038397,0.00078,0.000609,0.0,12.0
7893,1e-05,0.090909,0.005445,0.0,0.011867,0.0,1.0,0.0,1.0,0.0,0.0,12.0,0.131886,0.003915,0.0016,0.0,12.0
3302,0.06879,1.0,0.010288,0.005093,0.013196,0.059417,0.833333,0.166667,0.75,0.583333,16.0,18.0,0.248748,0.035402,0.003896,0.083333,12.0
447,0.002721,0.454545,0.018421,0.022162,0.0,0.0,0.25,0.25,0.0,0.0,0.0,6.0,0.499165,0.010624,0.00054,0.0,12.0
6917,0.049496,1.0,0.003059,0.0,0.006667,0.022628,0.333333,0.0,0.25,0.083333,4.0,6.0,0.038397,0.028501,0.006444,0.0,12.0


## `Modelling`

In [33]:
# Train one clustering estimator and record the run (tags, params, metrics, model) in MLflow
def log_training_model(est, model_name:str, score_result:list, dataset:pd.DataFrame, name_experiment:str):
    """
    Fit a clustering estimator on `dataset` and log the outcome to MLflow.

    Tags, parameters, metrics and the fitted model are logged through the module-level
    `mlflow` API, i.e. to whichever run is active when this function is called.

    Parameters
    ----------
    est : estimator object exposing `get_params()` and `fit_predict()`
    model_name : display name of the estimator (used as tag and in `score_result`)
    score_result : list that receives one dict per call with the keys
        'Model', 'silhouette_score', 'calinski_harabasz_score', 'davies_bouldin_score'
    dataset : data the estimator is fitted on
    name_experiment : free-text experiment label stored in the 'experiment-n' tag

    Returns
    -------
    None. `score_result` is extended in place.

    Notes
    -----
    The three metrics are only defined when the labelling has at least 2 clusters and
    fewer clusters than samples; otherwise NaN is recorded for all of them.
    """
    # tags and hyper-parameters shown in the MLflow UI (parameters are read before fitting)
    mlflow.set_tags({"model_name": model_name, "experiment-n": name_experiment})
    mlflow.log_params(est.get_params())

    print(f"training model {model_name} ......")
    # cluster label assigned to every row of the dataset
    labels = est.fit_predict(dataset)

    # Compute each metric exactly once (silhouette is quadratic in the number of rows)
    n_clusters = len(np.unique(labels))
    if 1 < n_clusters < len(labels):
        scores = {
            'silhouette': silhouette_score(dataset, labels),
            'calinski_harabasz': calinski_harabasz_score(dataset, labels),
            'davies_bouldin': davies_bouldin_score(dataset, labels),
        }
    else:
        # metrics are undefined for this labelling; plain float NaN replaces the removed np.float alias
        scores = {
            'silhouette': float('nan'),
            'calinski_harabasz': float('nan'),
            'davies_bouldin': float('nan'),
        }

    # row for the in-notebook comparison table
    score_result.append({
        'Model': model_name,
        'silhouette_score': scores['silhouette'],
        'calinski_harabasz_score': scores['calinski_harabasz'],
        'davies_bouldin_score': scores['davies_bouldin'],
    })

    # metrics and fitted model for the MLflow UI
    mlflow.log_metrics(scores)
    mlflow.sklearn.log_model(est, "sk_model")
    print(f"training model {model_name} is DONE.")

In [43]:
# Fixed seed so the KMeans initialisation, and therefore its scores, are repeatable across reruns
RANDOM_STATE = 42

# candidate estimators (library defaults unless stated otherwise)
kmeans = KMeans(n_clusters=5, random_state=RANDOM_STATE)
meanshift = MeanShift()
dbscan = DBSCAN()
optics = OPTICS()
agglomerative = AgglomerativeClustering()
birch = Birch()
hdbscan = HDBSCAN()

# name -> estimator mapping that drives the benchmark loop
# (AffinityPropagation is not part of this benchmark)
model = {
    'KMeans': kmeans,
    'MeanShift': meanshift,
    'DBSCAN': dbscan,
    'OPTICS': optics,
    'AgglomerativeClustering': agglomerative,
    'Birch' : birch,
    'HDBSCAN' : hdbscan
}

# evaluation-metric rows, one dict per model; reset on every run of this cell
metric_result = []

# point MLflow at the tracking server used for the experiment results
mlflow.set_tracking_uri(DASGHUB_URI)

# one MLflow run per estimator; the run is named after the model so it can be found in the UI
for name, estimator in model.items():
    with mlflow.start_run(experiment_id=EXP_ID, run_name=name):
        # train the model and log params / metrics / model inside the active run
        log_training_model(est=estimator,
                           model_name=name,
                           score_result=metric_result,
                           dataset=data_modelling,
                           name_experiment='experiment-4'
                           )

training model KMeans ......
training model KMeans is DONE.
training model MeanShift ......
training model MeanShift is DONE.
training model DBSCAN ......




training model DBSCAN is DONE.
training model OPTICS ......




training model OPTICS is DONE.
training model AgglomerativeClustering ......




training model AgglomerativeClustering is DONE.
training model Birch ......
training model Birch is DONE.
training model HDBSCAN ......




training model HDBSCAN is DONE.


In [40]:
# CSS applied to the highlighted cell of each metric column
PROPS = 'color:#080808;font-weight:bold;background-color:#85bb65'

# Show the collected training results as a styled table: the minimum of the Davies-Bouldin
# column and the maximum of the silhouette / Calinski-Harabasz columns are highlighted.
# Note that `metric_result_df` holds the styled object returned by `.style`, not a plain DataFrame.
metric_result_df = (
    pd.DataFrame(metric_result)
    .style
    .highlight_min(subset='davies_bouldin_score', props=PROPS)
    .highlight_max(subset=['silhouette_score', 'calinski_harabasz_score'], props=PROPS)
)
metric_result_df

Unnamed: 0,Model,silhouette_score,calinski_harabasz_score,davies_bouldin_score
0,KMeans,0.408905,9714.779014,0.763659
1,MeanShift,0.410949,732.981306,0.686635
2,DBSCAN,-0.50829,8.999433,1.663686
3,OPTICS,-0.634957,7.444035,1.678579
4,AgglomerativeClustering,0.573559,6213.919554,0.831023
5,Birch,0.525965,9063.723162,0.740156
6,HDBSCAN,-0.450107,21.783492,1.846151
