In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
import hdbscan
import numpy as np
import time
from sklearn.preprocessing import StandardScaler


In [2]:
# Step 2: Load your Excel dataset (make sure the file is in the same directory)
df = pd.read_excel('nadil_dataset_final.xlsx')  # Change the path if needed


In [3]:
# Step 3: Data Refinement - Text Preprocessing

# Create a dictionary of abbreviations
abbreviations = {
    'PYT': 'Payment',
    'TRF': 'Transfer',
    'DEP': 'Deposit',
    'WDL': 'Withdrawal',
    'WD': 'Withdrawal',
    'POS': 'Point of Sale',
    'ATM': 'ATM Withdrawal',
    'CHQ': 'Cheque',
    'DD': 'Demand Draft',
    'BT': 'Bank Transfer',
    'ACH': 'Automated Clearing House',
    'NEFT': 'National Electronic Funds Transfer',
    'RTGS': 'Real-Time Gross Settlement',
    'IMPS': 'Immediate Payment Service',
    'UPI': 'Unified Payments Interface',
    'INT': 'Interest',
    'CHG': 'Charge',
    'FEE': 'Fee',
    'TXN': 'Transaction',
    'REV': 'Reversal',
    'EMI': 'Equated Monthly Installment',
    'CC': 'Credit Card',
    'POS REF': 'Point of Sale Refund',
    'BIL': 'Bill Payment',
    'BILP': 'Bill Payment',
    'INV': 'Investment',
    'REF': 'Refund',
    'SAL': 'Salary Credit',
    'SL': 'Salary Credit',
    'TFR': 'Transfer'
}

# Step 3.1: Normalize Capitalization and Expand Abbreviations
def clean_text(text, abbr_dict):
    # Convert text to lowercase
    text = text.lower()
    
    # Expand abbreviations
    for abbr, full_form in abbr_dict.items():
        text = re.sub(rf'\b{abbr.lower()}\b', full_form.lower(), text)
    
    return text

# Apply text cleaning to 'Particulars' column
df['cleaned_particulars'] = df['Discription'].apply(lambda x: clean_text(str(x), abbreviations))


In [4]:
# Step 4: Use Sentence Transformers to Create Embeddings

# Initialize the sentence transformer model (you can choose any model you prefer)
model = SentenceTransformer('gtr-t5-large')

# Generate embeddings for the cleaned text
embeddings = model.encode(df['cleaned_particulars'].tolist())


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

In [5]:
# Step 5: Clustering using HDBSCAN

# Step 5.1: Normalize the embeddings (this is optional but can improve HDBSCAN performance)
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)

# Step 5.2: Apply HDBSCAN Clustering
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)  # Adjust min_cluster_size as needed
labels = hdbscan_model.fit_predict(embeddings_scaled)


In [6]:
# Step 6: Add the cluster labels to the dataframe
df['Cluster'] = labels


In [7]:
# Step 7: Print all clusters with their data

# Loop through all unique clusters (ignoring noise points labeled as -1)
unique_clusters = set(labels)

for cluster in unique_clusters:
    print(f"\nCluster {cluster} Transactions:")
    
    # Filter rows for the current cluster
    cluster_data = df[df['Cluster'] == cluster]
    
    # Print the details of each transaction in the cluster
    print(cluster_data[['Discription', 'Payments', 'Receipts', 'Balance']])



Cluster 0 Transactions:
    Discription  Payments  Receipts  Balance
0           NaN       NaN       NaN      NaN
64          NaN       NaN       NaN      NaN
274         NaN       NaN       NaN      NaN
306         NaN       NaN       NaN      NaN
337         NaN       NaN       NaN      NaN
368         NaN       NaN       NaN      NaN
399         NaN       NaN       NaN      NaN
430         NaN       NaN       NaN      NaN

Cluster 1 Transactions:
                  Discription  Payments  Receipts   Balance
42                       INT        NaN     12.16   2017.70
67                        INT       NaN      4.64  18484.64
73                        INT       NaN  20240.00  31777.14
86                        INT       NaN    114.05  68565.20
93                        INT       NaN    185.71  73200.91
104                       INT       NaN    186.69  75148.32
119                       INT       NaN    176.17  57752.66
123                       INT       NaN    151.07  51824.93
136  