<a href="https://colab.research.google.com/github/fikrifaizz/indo-sentiment-engine/blob/main/notebooks/vector_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch
import numpy as np
from tqdm.auto import tqdm
import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel, PeftConfig

In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Device: {device}")

Device: mps


In [6]:
BASE_MODEL = "indobenchmark/indobert-base-p1"
LORA_PATH = "../models/indobert-lora-finetuned"

print("Loading Model & LoRA Adapter...")

tokenizer = AutoTokenizer.from_pretrained(LORA_PATH)

try:
    print("Mencoba memuat Base Model dari cache lokal...")
    base_model = AutoModel.from_pretrained(BASE_MODEL, local_files_only=True)
except Exception as e:
    print("Gagal load lokal, mencoba download ulang dari HuggingFace (Internet Required)...")
    base_model = AutoModel.from_pretrained(BASE_MODEL, local_files_only=False)

peft_model = PeftModel.from_pretrained(base_model, LORA_PATH)

print("Merging LoRA weights into Base Model...")
model = peft_model.merge_and_unload()

model.to(device)
model.eval()

Loading Model & LoRA Adapter...
Mencoba memuat Base Model dari cache lokal...
Merging LoRA weights into Base Model...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(50000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    token_embeddings = outputs.last_hidden_state
    
    attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * attention_mask, 1)
    sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
    
    embedding = sum_embeddings / sum_mask 
    
    return embedding.cpu().numpy()[0].tolist()

# Tes fungsi
dummy_vec = get_embedding("Tes barang bagus")
print(f"Dimensi Vektor: {len(dummy_vec)} (Harus 768)")

Dimensi Vektor: 768 (Harus 768)


In [8]:
DB_PATH = "../data/chroma_db"
client = chromadb.PersistentClient(path=DB_PATH)

collection_name = "lazada_reviews"
try:
    client.delete_collection(name=collection_name)
    print("Koleksi lama dihapus.")
except:
    pass

collection = client.create_collection(
    name=collection_name,
    metadata={"hnsw:space": "cosine"} # Kita pakai Cosine Similarity
)
print(f"ChromaDB Collection '{collection_name}' siap!")

ChromaDB Collection 'lazada_reviews' siap!


In [9]:
df_test = pd.read_parquet("../data/final/test.parquet")
df_sample = df_test.head(1000).copy().reset_index(drop=True)

print(f"\nSedang meng-embed {len(df_sample)} review ke ChromaDB...")

batch_size = 32
total_batches = len(df_sample) // batch_size + 1

for i in tqdm(range(0, len(df_sample), batch_size)):
    batch = df_sample.iloc[i : i+batch_size]
    
    # Generate Embeddings
    ids = [str(x) for x in batch.index.tolist()]
    documents = batch['clean_text'].tolist()
    metadatas = [{"rating": int(r), "label": int(l)} for r, l in zip(batch['rating'], batch['label'])]
    
    embeddings = [get_embedding(doc) for doc in documents]
    
    # Masukkan ke ChromaDB
    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=documents,
        metadatas=metadatas
    )

print("Indexing Selesai!")


Sedang meng-embed 1000 review ke ChromaDB...


  0%|          | 0/32 [00:00<?, ?it/s]

Indexing Selesai!


In [11]:
def search_reviews(query, top_k=3):
    print(f"\nQuery: '{query}'")
    
    # Ubah query user jadi vektor
    query_vec = get_embedding(query)
    
    # Cari tetangga terdekat di ChromaDB
    results = collection.query(
        query_embeddings=[query_vec],
        n_results=top_k
    )
    
    # Tampilkan hasil
    for i in range(top_k):
        review = results['documents'][0][i]
        meta = results['metadatas'][0][i]
        score = results['distances'][0][i] # Cosine Distance (Makin kecil makin mirip)
        
        label_map = {0: 'Negatif', 1: 'Netral', 2: 'Positif'}
        print(f"   [{i+1}] {review}")
        print(f"       Rating: {meta['rating']} | Sentimen: {label_map[meta['label']]}")
        print(f"       DIST: {score:.4f}")

# Coba cari sesuatu yang spesifik
search_reviews("pengiriman lama banget sampe seminggu")
search_reviews("barang pecah pas sampe")
search_reviews("kurir ramah sopan")


Query: 'pengiriman lama banget sampe seminggu'
   [1] pengirimannya lama banget
       Rating: 5 | Sentimen: Positif
       DIST: 0.1222
   [2] barangnya alhamdulillah bagus cuma lama banget pengirimanya
       Rating: 2 | Sentimen: Negatif
       DIST: 0.1702
   [3] pengiriman cepat banget pesan malem besoknya dikirim mksh lazada
       Rating: 5 | Sentimen: Positif
       DIST: 0.2338

Query: 'barang pecah pas sampe'
   [1] barang minggu langsung rusak
       Rating: 1 | Sentimen: Negatif
       DIST: 0.1879
   [2] barang fah sampai berfungsi
       Rating: 5 | Sentimen: Positif
       DIST: 0.2153
   [3] cepat sampai barang ori
       Rating: 5 | Sentimen: Positif
       DIST: 0.2271

Query: 'kurir ramah sopan'
   [1] pengiriman sangat cepat
       Rating: 5 | Sentimen: Positif
       DIST: 0.2533
   [2] pengiriman cepat pengemasan rapi
       Rating: 5 | Sentimen: Positif
       DIST: 0.2611
   [3] barang bagus sesuai deskripsi kurir ramah mudah an awet
       Rating: 5 | Sentimen