<a href="https://colab.research.google.com/github/elangbijak4/LLM-SLM-Examples/blob/main/Demo_Kode_Basisdata_Vektor_untuk_Data_Terstruktur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymilvus

In [None]:
import torch
from pymilvus import Collection, CollectionSchema, DataType, FieldSchema, connections
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer

# Load the pretrained tokenizer and encoder from the same checkpoint name.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)

# Sample table data: every row is flattened into one space-separated string.
_rows = (
    ("1", "product1", "10.99"),
    ("2", "product2", "15.49"),
    ("3", "product3", "7.99"),
)
records = [" ".join(row) for row in _rows]

# Column names of the table, flattened the same way as the rows.
table_schema = " ".join(("id", "name", "price"))

# Turn a piece of text into a single embedding vector.
def get_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    Args:
        text: The string to encode (for example a flattened record or the
            flattened column names of a table).

    Returns:
        A NumPy array with one row per input text, holding the average of
        the encoder's final hidden states over all tokens of that text.
    """
    # Truncate so an over-long input is clipped to the model's maximum
    # sequence length instead of raising inside the encoder.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over the token axis (dim=1); move to CPU before converting so
    # the conversion also works if the model was placed on another device.
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Encode every table row into its own embedding.
record_embeddings = list(map(get_embedding, records))

# Encode the flattened column names of the table.
schema_embedding = get_embedding(table_schema)

# Describe the collection layout: an integer primary key plus a float vector
# whose dimension is read from the first record embedding.
embedding_dim = record_embeddings[0].shape[1]
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True)
vector_field = FieldSchema(
    name="vector",
    dtype=DataType.FLOAT_VECTOR,
    dim=embedding_dim,
)
fields = [id_field, vector_field]
schema = CollectionSchema(fields, description="Product collection")

# Milvus endpoint used by this demo; point these at the running server.
MILVUS_HOST = "localhost"
MILVUS_PORT = "19530"

# A connection has to be registered before any Collection is constructed;
# Collection resolves the "default" alias and fails if it does not exist.
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)

# Create the collection that stores the record vectors.
collection = Collection(name="products", schema=schema)

# Insert column-wise: one list per field, in the order the schema declares them.
# Primary keys are parsed from the leading token of each record so they stay
# in step with the table data instead of being hard-coded.
data = [
    [int(record.split(maxsplit=1)[0]) for record in records],  # id
    [embedding[0] for embedding in record_embeddings],  # vector
]
collection.insert(data)

# Keep the table-schema embedding in a separate collection. It reuses the
# same field layout because the vector has the same dimension as the records.
schema_collection = Collection(name="schemas", schema=schema)
schema_data = [
    [1],  # id; arbitrary, there is only one entry
    [schema_embedding[0]],  # vector
]
schema_collection.insert(schema_data)

# Embed the column names of a second table for comparison.
other_table_schema = "user_id user_name user_age"
other_schema_embedding = get_embedding(other_table_schema)

# Cosine similarity between the two schema vectors. Each vector is passed as
# a single-row matrix, so the result is a 1x1 matrix.
similarity = cosine_similarity(
    schema_embedding[0].reshape(1, -1),
    other_schema_embedding[0].reshape(1, -1),
)
print("Similarity antara schema tabel:", similarity[0, 0])