## HDBSCAN Clustering for Credit Card Fraud Data

In [1]:
## Logger configuration
from loguru import logger

## Remove every handler registered so far, so records only go to the file sink added below
logger.remove()

## File sink settings: record DEBUG and above as "time | level | message" lines
sink_options = {
    "level": "DEBUG",
    "format": "{time:HH:mm:ss} | {level} | {message}",
}

logger.add("logs/hdbscan.log", **sink_options)

1

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load the raw transactions
data = pd.read_csv("../creditcard.csv")

## Keep a 15% sample, stratified on Class so the fraud ratio is preserved
sampled_df, _ = train_test_split(data,
                                 test_size = 0.85,
                                 stratify = data['Class'],
                                 random_state = 42)


## Select either sampled or unsampled data
# source_df = data
source_df = sampled_df

## Untouched copy of the selected rows (all original columns)
df_orig = source_df.copy()

## Save labels for later (same index as the feature frame)
label_df = source_df[["Class"]].copy()

## Drop unneeded features into a NEW frame: the selected source frame
## (sampled_df or data) is left intact instead of being modified in place
df = source_df.drop(columns = ["Class", "Time"])

## Scale the amount feature
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])

print(f"Dataset shape: {df.shape}")

Dataset shape: (42721, 29)


## HDBSCAN function definitions

### HDBSCAN Training Function

In [3]:
from sklearn.cluster import HDBSCAN
from time import time

def hdbscan_clustering(X, params, logging = True):

    """
    Cluster the input data with HDBSCAN and return the cluster labels.

    Inputs:
        X: a dataframe object with training features
        params: a dictionary with model parameters (None is treated as an
                empty dictionary). The dictionary is NOT modified; a copy
                with n_jobs = -1 is passed to the model.
        logging: True or False on whether training information should be logged

    Returns:
        The cluster label outputs from hdbscan.fit_predict()

    When logging is enabled, the training time (formatted MM:SS) and the
    parameters actually passed to the model are written through the logger.
    """

    ## Copy the parameters so the caller's dict is left untouched, then
    ## force n_jobs = -1 to use all available cores
    model_params = dict(params or {})
    model_params["n_jobs"] = -1

    ## Cluster it up
    start_time = time()                        # Record start time
    hdbscan = HDBSCAN(**model_params)          # Instantiate model with input parameters
    cluster_labels = hdbscan.fit_predict(X)    # Assign cluster labels
    end_time = time()                          # Record end time

    ## Record model training time and parameters used
    if logging:
        total_training_time_seconds = end_time - start_time
        minutes, seconds = divmod(total_training_time_seconds, 60)
        formatted_time = f"{int(minutes):02}:{int(seconds):02}"
        logger.info(f"HBDScan Model Trained | Train Time (Minutes): {formatted_time} | Parameters | {model_params} |")

    return cluster_labels

### Cluster Analysis Function

In [4]:
def analyze_clusters(params, label_df, cluster_labels):

    """
    Summarise one clustering run and write the summary through the logger.

    Parameters:
    - params: The parameter dictionary used for the run (only echoed in the log line)
    - label_df: A dataframe object with a "Class" column (rows equal to 1 are counted as fraud)
    - cluster_labels: The label outputs from hdbscan.fit_predict(), one per
      row of label_df; the label -1 is treated as noise

    For each set of parameters, records:
    - the parameters
    - the number of clusters (the noise label -1 is not counted as a cluster)
    - the % noise
    - the count of fraud inside clusters, and
    - the count of fraud in noise

    Returns nothing; the results are emitted as a single logger.info record.
    """

    ## Adds cluster labels to common df with class labels
    eval_df = label_df.copy()
    eval_df["Cluster"] = cluster_labels

    ## Boolean masks are used instead of a pivot-table lookup so that a run
    ## with no noise points (or no fraud rows) yields 0 rather than a KeyError
    is_noise = eval_df["Cluster"] == -1
    is_fraud = eval_df["Class"] == 1

    ## Assemble counts
    total_rows = len(eval_df)
    cluster_count = eval_df.loc[~is_noise, "Cluster"].nunique()   # -1 is noise, not a cluster
    total_noise_count = int(is_noise.sum())
    total_fraud_count = int(is_fraud.sum())
    noise_fraud_count = int((is_noise & is_fraud).sum())

    ## Guard against an empty input to avoid dividing by zero
    noise_pct = round(total_noise_count / total_rows * 100, 2) if total_rows else 0.0

    ## Results
    logger.info(f"Parameters: {params} | Clusters: {cluster_count} | Noise: {noise_pct}% | Fraud in cluster: {total_fraud_count - noise_fraud_count} | Fraud in noise: {noise_fraud_count}")

### Grid Search Function

In [17]:
import itertools

def cluster_grid_search(X, param_grid, label_df):

    """
    Run an HDBSCAN grid search over every combination in a parameter grid.

    Inputs:
        X: a dataframe object with training features
        param_grid: a dictionary mapping each parameter name to a list of
                    values to try. An empty dictionary yields a single run
                    with no caller-supplied parameters.
        label_df: a dataframe with the class labels, passed to analyze_clusters

    Uses the defined hdbscan_clustering and analyze_clusters functions to
    train, assess, and log data on each parameter set. Returns nothing.
    """

    # Extract parameters and generate all combinations.
    # itertools.product() with no iterables yields one empty tuple, so an
    # empty grid produces exactly one empty parameter set instead of raising.
    keys = list(param_grid)
    iter_param = [dict(zip(keys, combo)) for combo in itertools.product(*param_grid.values())]

    # Execute Grid Search
    combinations = len(iter_param)
    logger.info(f"Grid Search Initiated | Combinations to attempt: {combinations}")

    for counter, p in enumerate(iter_param, start = 1):
        logger.info(f"Search {counter} / {combinations}")
        cluster_labels = hdbscan_clustering(X, p, logging = False)
        analyze_clusters(p, label_df, cluster_labels)

## Execution 

In [None]:
## Baseline run: an empty dict means the caller supplies no HDBSCAN settings
params = {}
## logging = True also records the training time and parameters via the logger
cluster_labels = hdbscan_clustering(df, params, logging = True)

In [None]:
## Log the cluster / noise / fraud summary for the labels produced by the run above
analyze_clusters(params, label_df, cluster_labels)

In [18]:
## Run Grid Search
## Sweep cluster_selection_epsilon over 11..15 while holding the other two settings fixed
epsilon_candidates = list(range(11, 16))

param_grid = dict(
    min_cluster_size = [2],
    min_samples = [1],
    cluster_selection_epsilon = epsilon_candidates,
)

cluster_grid_search(df, param_grid, label_df)