In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load your dataset (Make sure the Excel file is in the same directory or provide full path)
df = pd.read_excel('shihara_dataset.xlsx')

# View the first few rows to ensure data is loaded correctly
print(df.head())


        Date                   Particulars Payments  Receipts   Balance
0 2022-12-27  IB CEFT CHGS K A Kumaranayek       30       NaN   8449.35
1 2022-12-27  PURCHASE UNIVERSITY GRANTS C       50       NaN   8399.35
2 2022-12-28  PURCHASE KEELLS SUPER - MORA     5000       NaN   3399.35
3 2022-12-30                      INTEREST      NaN     41.57   3440.92
4 2023-01-04                      NOV RENT     3000  75000.00  78440.92


In [4]:
# Load pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for the 'Particulars' column
embeddings = model.encode(df['Particulars'].tolist(), show_progress_bar=True)

# Output total embeddings and a sample embedding to verify
print(f"Total embeddings generated: {len(embeddings)}")
print(f"Sample embedding: {embeddings[0]}")


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Total embeddings generated: 557
Sample embedding: [-0.35194734  0.08775935 -0.7808948   0.19885507 -0.11037867 -0.20677426
  0.693455    0.21924816  0.30343384 -0.28919327  0.21753198 -0.7188075
 -0.13196921 -0.31510085 -0.3956512  -0.16168134 -0.3047715   0.42435867
  0.12592866 -0.42875928  0.26307696  0.14003891 -0.0738509   0.15381841
 -0.05272217 -0.08566177  0.26523694 -0.20333847  0.09461237 -0.28640062
  0.3491362   0.37835565  0.18152794  0.10556078 -0.21564557  0.20489258
 -0.14693919 -0.02645201  0.06703614 -0.23356733 -0.08021606  0.06440441
  0.1601198   0.2572478   0.38956878  0.4814644  -0.0979213  -0.06341886
 -0.64547986  0.11043025 -0.07851804 -0.4570948   0.18084401 -0.08879726
  0.333129   -0.4177953  -0.48403984  0.0347426   0.19139534 -0.05281593
  0.32812232  0.24269643  0.21454683  0.58562243  0.66526836 -0.14525904
 -0.017019   -0.5633745  -0.27783203  0.15057778  0.16967377 -0.12429614
 -0.19961871 -0.09146208 -0.18468475  0.20078586 -0.3507503   0.51084363
 -

In [5]:
from sklearn.decomposition import PCA

# Initialize PCA to reduce dimensions (you can change n_components based on your needs)
pca = PCA(n_components=50)  # You can adjust this value (e.g., 50 components)
reduced_embeddings = pca.fit_transform(embeddings)

# Check reduced embeddings shape
print(f"Reduced embeddings shape: {reduced_embeddings.shape}")


Reduced embeddings shape: (557, 50)


In [10]:
from sklearn.cluster import MiniBatchKMeans
import time

# Set number of clusters (you can adjust this)
n_clusters = 30  # Adjust number of clusters as needed

# Track the time taken for clustering
start_time = time.time()

# Apply MiniBatchKMeans clustering
mini_batch_kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=100)
labels = mini_batch_kmeans.fit_predict(reduced_embeddings)

# Track time taken for clustering
end_time = time.time()
print(f"Clustering took {end_time - start_time} seconds")

# Add the cluster labels to the original dataframe
df['Cluster'] = labels


Clustering took 0.1986081600189209 seconds




In [11]:
# Print transactions in each cluster
for cluster_num in range(n_clusters):
    print(f"\n--- Cluster {cluster_num} ---")
    cluster_transactions = df[df['Cluster'] == cluster_num]
    print(cluster_transactions[['Date', 'Particulars', 'Payments', 'Receipts', 'Balance']])
    print("\n")



--- Cluster 0 ---
          Date               Particulars Payments    Receipts     Balance
12  2023-01-11     Funds from 3008007576      NaN   213388.42   306739.34
337 2024-02-28     Funds from 3016524874      NaN  3800000.00  9807123.76
355 2024-03-11  INTEREST FROM 3015011429      NaN    28658.76   891697.36
356 2024-03-11  INTEREST FROM 3015011474      NaN    20470.54  912.167.90
443 2024-07-10  INTEREST FROM 3015011429      NaN    17781.92   107782.19
444 2024-07-10  INTEREST FROM 3015011474      NaN    12701.37   120483.56
445 2024-07-10     Funds from 3015011429      NaN   700000.00   820483.56
446 2024-07-10     Funds from 3015011474      NaN   500000.00  1320483.56



--- Cluster 1 ---
          Date                    Particulars Payments  Receipts    Balance
124 2023-06-12         PURCHASE SUPER FASHION     4780   25000.0   37286.47
133 2023-06-20    PURCHASE PIYARA FASHION PVT     2280       NaN   51501.47
164 2023-08-02   PURCHASE PIYARA FASHION (PVT     6490       NaN  