In [10]:
from sklearn.cluster import KMeans, DBSCAN
import hdbscan
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd
import re 
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

In [5]:
# load dataset 
df = pd.read_csv("data/final_personal_finance_dataset.csv")

# Combine Merchant + Transaction Description 
df['text_raw'] = (
    df['Merchant'].fillna('') + " " + df['Transaction Description'].fillna('')
)

# Clean text 
def clean_text(text):
    text = text.lower()                              
    text = re.sub(r'[^a-z\s]', ' ', text)              
    text = re.sub(r'\s+', ' ', text).strip()           
    return text

df['text_clean'] = df['text_raw'].apply(clean_text)

# Convert into embeddings (Sentence-BERT) 
model = SentenceTransformer("all-MiniLM-L6-v2")  
embeddings = model.encode(df['text_clean'].tolist(), show_progress_bar=True)

print("✅ Step 1 completed: text cleaned and embedded")
print("Embeddings shape:", embeddings.shape)


Batches: 100%|██████████| 493/493 [00:43<00:00, 11.25it/s]


✅ Step 1 completed: text cleaned and embedded
Embeddings shape: (15767, 384)


In [6]:
# --- Step 2A: K-Means ---
print("\n🔹 Running K-Means...")
kmeans = KMeans(n_clusters=15, random_state=42)   # you can tune n_clusters
kmeans_labels = kmeans.fit_predict(embeddings)
df['kmeans_cluster'] = kmeans_labels

# Evaluate with silhouette score
if len(set(kmeans_labels)) > 1:
    kmeans_score = silhouette_score(embeddings, kmeans_labels)
else:
    kmeans_score = -1
print("K-Means Silhouette Score:", kmeans_score)


🔹 Running K-Means...
K-Means Silhouette Score: 0.12818874418735504


In [7]:
# --- Step 2B: DBSCAN ---
print("\n🔹 Running DBSCAN...")
dbscan = DBSCAN(eps=1.5, min_samples=10, metric='euclidean')
dbscan_labels = dbscan.fit_predict(embeddings)
df['dbscan_cluster'] = dbscan_labels

# Evaluate
if len(set(dbscan_labels)) > 1:
    dbscan_score = silhouette_score(embeddings, dbscan_labels)
else:
    dbscan_score = -1
print("DBSCAN Silhouette Score:", dbscan_score)



🔹 Running DBSCAN...
DBSCAN Silhouette Score: -1


In [8]:
# --- Step 2C: HDBSCAN ---
print("\n🔹 Running HDBSCAN...")
hdb = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean')
hdb_labels = hdb.fit_predict(embeddings)
df['hdbscan_cluster'] = hdb_labels

# Evaluate
if len(set(hdb_labels)) > 1:
    hdbscan_score = silhouette_score(embeddings, hdb_labels)
else:
    hdbscan_score = -1
print("HDBSCAN Silhouette Score:", hdbscan_score)


🔹 Running HDBSCAN...




HDBSCAN Silhouette Score: 0.6177774667739868


In [9]:
# --- Step 2D: Summary ---
print("\n✅ Clustering Finished!")
print("K-Means clusters:", len(set(kmeans_labels)))
print("DBSCAN clusters:", len(set(dbscan_labels)))
print("HDBSCAN clusters:", len(set(hdb_labels)))


✅ Clustering Finished!
K-Means clusters: 15
DBSCAN clusters: 1
HDBSCAN clusters: 410


In [12]:
# Step 3A: Prepare features + labels
X = embeddings  
y = df['Category'].fillna("Unknown")

# Drop categories with <2 samples
category_counts = y.value_counts()
valid_cats = category_counts[category_counts > 1].index
mask = y.isin(valid_cats)

X = X[mask]
y = y[mask]

# Train/test split (stratified, now safe)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [13]:
# --- Step 3B: Train Models ---

## Random Forest
print("\n🌲 Training Random Forest...")
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

## LightGBM
print("\n💡 Training LightGBM...")
lgbm = LGBMClassifier(n_estimators=200, random_state=42)
lgbm.fit(X_train, y_train)
lgbm_preds = lgbm.predict(X_test)

## Neural Network (MLP)
print("\n🧠 Training Neural Network...")
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=20, random_state=42)
mlp.fit(X_train, y_train)
mlp_preds = mlp.predict(X_test)


🌲 Training Random Forest...

💡 Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97881
[LightGBM] [Info] Number of data points in the train set: 12611, number of used features: 384
[LightGBM] [Info] Start training from score -4.044162
[LightGBM] [Info] Start training from score -8.749178
[LightGBM] [Info] Start training from score -5.804739
[LightGBM] [Info] Start training from score -3.718740
[LightGBM] [Info] Start training from score -6.551953
[LightGBM] [Info] Start training from score -7.832887
[LightGBM] [Info] Start training from score -3.598780
[LightGBM] [Info] Start training from score -8.343712
[LightGBM] [Info] Start training from score -2.180398
[LightGBM] [Info] Start training from score -7.245100
[LightGBM] [Info] Start training from score -7.139740
[LightGBM] [Info] Start training from score -8.749178
[LightGB




🧠 Training Neural Network...




In [14]:
# --- Step 3C: Evaluate Models ---
def evaluate_model(name, y_true, y_pred):
    print(f"\n🔎 Results for {name}:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1-score (macro):", f1_score(y_true, y_pred, average="macro"))
    print(classification_report(y_true, y_pred, zero_division=0))

evaluate_model("Random Forest", y_test, rf_preds)
evaluate_model("LightGBM", y_test, lgbm_preds)
evaluate_model("Neural Net", y_test, mlp_preds)


🔎 Results for Random Forest:
Accuracy: 0.9359340310815096
F1-score (macro): 0.6447509820822757
                           precision    recall  f1-score   support

           Alcohol & Bars       1.00      1.00      1.00        55
                  Apparel       1.00      0.89      0.94         9
           Auto Insurance       1.00      1.00      1.00        77
                   Beauty       1.00      0.75      0.86         4
                    Bonus       0.00      0.00      0.00         1
             Coffee Shops       1.00      0.99      0.99        86
                     Cook       0.00      0.00      0.00         1
      Credit Card Payment       1.00      1.00      1.00       356
                  Culture       0.00      0.00      0.00         2
Dividend earned on Shares       0.00      0.00      0.00         2
                Education       1.00      0.75      0.86         4
   Electronics & Software       1.00      0.81      0.89        21
            Entertainment       