In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

import tqdm

from ollama import Client
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

## Create Embeddings

In [2]:
# Client for the Ollama server used below to compute text embeddings.
OLLAMA_HOST = "http://localhost:7869"
ollama = Client(host=OLLAMA_HOST)

In [3]:
# Read the combined product records (JSON Lines: one JSON object per line)
# and preview the first two rows.
df_products = pd.read_json(path_or_buf="../data/clean/combined.jsonl", lines=True)
df_products.head(n=2)

Unnamed: 0,id,product_name,source,price,series_model,brand,processor,memory_gb,storage_gb,storage_type,graphics_card
0,83451581,LAPTOP ADMINISTRASI PERKANTORAN AXIOO,LKPP,12500000,LAPTOP ADMINISTRASI PERKANTORAN AXIOO,AXIOO,Intel Core I5- 3.70 GHz - Frekuensi Turbo | Co...,8.0,,SSD NVME,Integrated
1,84521755,ACER LAPTOP TRAVELMATE P214 CORE I5/8 GB/512 G...,LKPP,18300000,TravelMate P214,ACER,Intel Core i5-1335U,8.0,512.0,HDD,Integrated


In [4]:
# Draw a fixed-seed sample of 1000 rows from every source, without replacement,
# then show the per-source row counts of the sample.
per_source = df_products.groupby("source")
df_sampled = per_source.sample(n=1000, replace=False, random_state=21)
df_sampled["source"].value_counts()

source
LKPP         1000
Lazada       1000
Tokopedia    1000
Name: count, dtype: int64

In [5]:
# Embed every sampled product name with the nomic-embed-text model served by
# Ollama. Each name is sent in its own request, prefixed with "clustering: ",
# and the single returned vector (embeddings[0]) is kept. The vectors are
# stacked into one array with one row per product, in df_sampled row order.
product_names = df_sampled["product_name"].tolist()
embeddings = np.array(
    [
        ollama.embed("nomic-embed-text:latest", "clustering: " + name).embeddings[0]
        for name in tqdm.tqdm(product_names)
    ]
)

100%|██████████| 3000/3000 [01:14<00:00, 40.04it/s]


## Clustering

In [11]:
# Reduce the embeddings to 3 dimensions with t-SNE; the result feeds both the
# K-Means clustering and the 3-D scatter plot below.
# t-SNE is stochastic, so random_state is pinned (same seed value as the
# sampling step) to make the coordinates reproducible on a fresh kernel run.
tsne = TSNE(n_components=3, perplexity=30, random_state=21)
X_reduce = tsne.fit_transform(embeddings)
X_reduce.shape

(3000, 3)

In [12]:
# Elbow scan: fit K-Means for k = 2..99 on the 3-D t-SNE coordinates and record
# the inertia of each fit.
# random_state is pinned so the curve does not change between runs because of
# random centroid initialisation.
# The name `distorsions` (sic) is kept because the plotting cell reads it.
distorsions = []
for k in range(2, 100):
    kmeans = KMeans(n_clusters=k, random_state=21)
    kmeans.fit(X_reduce)
    distorsions.append(kmeans.inertia_)

In [13]:
# Elbow curve: K-Means inertia against the number of clusters k (2..99).
# Axis labels and a title are set so the figure is readable on its own.
px.line(
    x=list(range(2, 100)),
    y=distorsions,
    labels={"x": "number of clusters (k)", "y": "inertia"},
    title="K-Means elbow curve on the 3-D t-SNE coordinates",
)

In [14]:
# Final clustering of the 3-D t-SNE coordinates into 50 clusters
# (k presumably picked from the elbow curve above — TODO confirm).
# random_state is pinned so cluster ids are stable across re-runs.
kmeans = KMeans(n_clusters=50, random_state=21)
kmeans.fit(X_reduce)

In [15]:
# Combine the identifying columns of the sample with the three t-SNE
# coordinates and the K-Means label of each row. The arrays are attached
# positionally, so they must be in the same row order as df_sampled.
df_reduced = df_sampled[["id", "product_name", "source"]].assign(
    d1=X_reduce[:, 0],
    d2=X_reduce[:, 1],
    d3=X_reduce[:, 2],
    cluster=kmeans.labels_,
)

In [16]:
# Write the sampled rows with their 3-D coordinates to a JSON Lines file
# (one record per line); the cluster column is dropped before export.
(
    df_reduced
    .drop(columns=["cluster"])
    .to_json(
        "../data/clean/embedding_small.jsonl",
        orient="records",
        lines=True,
    )
)

In [18]:
# 3-D scatter of the t-SNE coordinates, one point per product, coloured by
# K-Means cluster. Cluster ids are labels, not quantities, so they are cast to
# strings on a temporary copy to get discrete colours and a per-cluster legend
# instead of a continuous colour scale. df_reduced itself is not modified.
px.scatter_3d(
    df_reduced.assign(cluster=df_reduced["cluster"].astype(str)),
    x="d1",
    y="d2",
    z="d3",
    color="cluster",
    hover_data=["product_name"],
    title="Product-name embeddings (t-SNE, 3-D) coloured by K-Means cluster",
)