In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load your dataset (Make sure the Excel file is in the same directory or provide full path)
df = pd.read_excel('nadil_dataset_final.xlsx')

# View the first few rows to ensure data is loaded correctly
print(df.head())


        Date               Discription  Payments  Receipts   Balance
0        NaT                       NaN       NaN       NaN       NaN
1 2024-02-02           Initial Balance       NaN       NaN  12319.54
2 2024-02-02  0004199071001696 ATM WDR    1030.0       NaN  11289.54
3 2024-02-03       0710620915 BILL PYT     250.0       NaN  11039.54
4 2024-02-03          0443073 Fund Trf    6030.0       NaN   5009.54


In [2]:
# Load pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for the 'Particulars' column
embeddings = model.encode(df['Discription'].tolist(), show_progress_bar=True)

# Output total embeddings and a sample embedding to verify
print(f"Total embeddings generated: {len(embeddings)}")
print(f"Sample embedding: {embeddings[0]}")


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Total embeddings generated: 443
Sample embedding: [-4.60676998e-01  5.25866568e-01 -2.41259933e-02 -4.13401932e-01
 -2.75495768e-01 -4.99183804e-01  3.25387985e-01  2.76551574e-01
  4.22429055e-01  4.85774279e-01  4.52098340e-01 -3.62290174e-01
  2.87673265e-01  1.34070694e-01 -2.08634928e-01 -2.68298626e-01
 -4.22179818e-01  2.64252394e-01  1.46887943e-01  2.36614004e-01
  1.32615402e-01 -7.07727149e-02  5.12070954e-01 -4.68115568e-01
  4.39195037e-01  4.16274428e-01  4.52370882e-01  1.78833939e-02
  4.04462427e-01 -7.66960561e-01 -1.78880453e-01  2.68172652e-01
 -3.34261090e-01  1.80803418e-01 -3.99300575e-01  7.62040675e-01
 -3.32320064e-01 -2.43698835e-01  2.11875048e-02  2.69779056e-01
 -5.52161522e-02  2.39256859e-01  7.27366865e-01 -1.61491290e-01
  2.07896039e-01  4.33389336e-01  4.05823499e-01 -7.50954673e-02
  3.83172423e-01 -6.77770451e-02 -5.60731530e-01 -7.59779289e-02
  3.66573721e-01  1.42221704e-01  5.04500985e-01 -6.88692704e-02
  4.50987577e-01 -3.21172386e-01 -8.7844

In [3]:
from sklearn.decomposition import PCA

# Initialize PCA to reduce dimensions (you can change n_components based on your needs)
pca = PCA(n_components=50)  # You can adjust this value (e.g., 50 components)
reduced_embeddings = pca.fit_transform(embeddings)

# Check reduced embeddings shape
print(f"Reduced embeddings shape: {reduced_embeddings.shape}")


Reduced embeddings shape: (443, 50)


In [6]:
from sklearn.cluster import DBSCAN
import time

# Set DBSCAN parameters
eps = 0.5  # Maximum distance between two samples for them to be considered as in the same neighborhood
min_samples = 5  # The number of samples in a neighborhood for a point to be considered a core point

# Track the time taken for clustering
start_time = time.time()

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(reduced_embeddings)

# Track time taken for clustering
end_time = time.time()
print(f"Clustering took {end_time - start_time} seconds")

# Add the cluster labels to the original dataframe
df['Cluster'] = labels

# Print the number of clusters (including noise, labeled as -1)
num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f"Number of clusters found: {num_clusters}")


Clustering took 0.5504100322723389 seconds
Number of clusters found: 13


In [7]:
# Print transactions in each cluster
for cluster_num in range(n_clusters):
    print(f"\n--- Cluster {cluster_num} ---")
    cluster_transactions = df[df['Cluster'] == cluster_num]
    print(cluster_transactions[['Date', 'Discription', 'Payments', 'Receipts', 'Balance']])
    print("\n")



--- Cluster 0 ---
    Date Discription  Payments  Receipts  Balance
0    NaT         NaN       NaN       NaN      NaN
64   NaT         NaN       NaN       NaN      NaN
274  NaT         NaN       NaN       NaN      NaN
306  NaT         NaN       NaN       NaN      NaN
337  NaT         NaN       NaN       NaN      NaN
368  NaT         NaN       NaN       NaN      NaN
399  NaT         NaN       NaN       NaN      NaN
430  NaT         NaN       NaN       NaN      NaN



--- Cluster 1 ---
          Date               Discription  Payments  Receipts   Balance
2   2024-02-02  0004199071001696 ATM WDR    1030.0       NaN  11289.54
6   2024-02-05  0004199071001696 ATM WDR    1030.0       NaN   3003.54
7   2024-02-06  0004199071001696 ATM WDR    1530.0       NaN   1473.54
8   2024-02-07  0004199071001696 ATM WDR     430.0       NaN   1043.54
11  2024-02-09  0004199071001696 ATM WDR    2530.0       NaN   3470.54
..         ...                       ...       ...       ...       ...
400 2024-12-1