## Praktikum 4
Pada percobaan kali ini, kita akan melihat perbedaan ketiga model yang telah kita bahas (Annoy, FAISS, dan HNSW) serta membandingkan hasilnya.

In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
# Benchmark script: builds an Annoy, a FAISS (flat) and an HNSW index over the
# same random data, runs one k-nearest-neighbour query against each, and prints
# the build time, the query time and the first five neighbour ids.
#
# All durations are measured with time.perf_counter(): the query times are
# well below a millisecond, and time.time() is a wall-clock value that has
# lower resolution and may jump if the system clock is adjusted.
n_data = 1_000_000   # try 100_000 first if RAM is limited
dim = 5
X = np.random.random((n_data, dim)).astype(np.float32)

# Query point (kept 2-D, shape (1, dim), because FAISS and hnswlib are given
# the whole array; Annoy is given the single row query[0])
query = np.random.random((1, dim)).astype(np.float32)
k = 10

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

# Build time covers both inserting every vector and building the trees.
start = time.perf_counter()
for i, vector in enumerate(X):
    ann_index.add_item(i, vector)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is indexed as neighbors[0] for the
# ids (used in the print below).
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# Build time covers index initialisation and insertion of all items.
start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is set outside the timed regions so it affects neither figure.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")


=== Annoy ===
Build time: 19.997549295425415 detik
Query time: 0.00022077560424804688 detik
Neighbors: [57442, 369591, 406820, 890823, 244484] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.006882905960083008 detik
Query time: 0.006097316741943359 detik
Neighbors: [ 57442 369591 406820 890823 244484] ...

=== HNSW (hnswlib) ===
Build time: 156.05241107940674 detik
Query time: 0.00023412704467773438 detik
Neighbors: [ 57442 369591 406820 890823 244484] ...


Lakukan percobaan dengan metrik jarak (distance metric) yang berbeda. Catat hasilnya pada tabel yang Anda buat sendiri seperti pada Praktikum 1.



In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib
import pandas as pd
from IPython.display import display, HTML

# ======================================================
# 1. Initial configuration
# ======================================================
# Experiment script: for every distance metric, build an Annoy, a FAISS and an
# HNSW index over freshly generated random data, run one k-NN query against
# each, and collect build/query timings into `results`, which is then rendered
# as a styled pandas table.
n_data = 100_000   # start with 100k; raise to 1 million if the machine copes
dim = 5
k = 10
metrics = ['euclidean', 'angular']
results = []

# ======================================================
# 2. Run the experiment for each metric
# ======================================================
for metric in metrics:
    print(f"\n===== Percobaan Metric: {metric.upper()} =====")
    X = np.random.random((n_data, dim)).astype(np.float32)
    query = np.random.random((1, dim)).astype(np.float32)
    is_angular = metric == 'angular'

    # ---------------- Annoy ----------------
    ann_index = AnnoyIndex(dim, metric)
    # perf_counter() is used for every timing: query durations here are far
    # below a millisecond, which time.time() does not resolve reliably.
    start = time.perf_counter()
    for i, vector in enumerate(X):
        ann_index.add_item(i, vector)
    ann_index.build(10)
    build_time_annoy = time.perf_counter() - start

    start = time.perf_counter()
    idx_annoy = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
    query_time_annoy = time.perf_counter() - start

    results.append({
        "Metode": "Annoy",
        "Distance Metric": metric.title(),
        "Jumlah Data": f"{n_data:,}",
        "Dimensi": dim,
        "Waktu Build (s)": round(build_time_annoy, 4),
        # 6 decimals: at 4 decimals sub-millisecond query times all collapse
        # to 0.0001 / 0.0002 and cannot be compared.
        "Waktu Query (s)": round(query_time_annoy, 6),
        "Contoh Index Terdekat": str(idx_annoy[0][:5])
    })

    # ---------------- FAISS ----------------
    # FAISS has no dedicated angular index. The standard recipe is an
    # inner-product index over L2-normalised vectors, which ranks neighbours
    # by cosine similarity. The previous code used `continue` here, which
    # skipped FAISS *and* the HNSW section below for the angular metric.
    if is_angular:
        faiss_index = faiss.IndexFlatIP(dim)
        # Normalise copies so X / query stay untouched for the HNSW run.
        faiss_data = X.copy()
        faiss_query = query.copy()
        faiss.normalize_L2(faiss_query)
    else:
        faiss_index = faiss.IndexFlatL2(dim)
        faiss_data = X
        faiss_query = query

    start = time.perf_counter()
    if is_angular:
        # Counted as build work: this normalisation is required before the
        # data can be indexed for this metric.
        faiss.normalize_L2(faiss_data)
    faiss_index.add(faiss_data)
    build_time_faiss = time.perf_counter() - start

    start = time.perf_counter()
    distances, indices = faiss_index.search(faiss_query, k)
    query_time_faiss = time.perf_counter() - start

    results.append({
        "Metode": "FAISS",
        "Distance Metric": metric.title(),
        "Jumlah Data": f"{n_data:,}",
        "Dimensi": dim,
        "Waktu Build (s)": round(build_time_faiss, 4),
        "Waktu Query (s)": round(query_time_faiss, 6),
        "Contoh Index Terdekat": str(indices[0][:5])
    })

    # ---------------- HNSW ----------------
    # hnswlib's 'cosine' space is used for the angular metric.
    hnsw_index = hnswlib.Index(space='cosine' if is_angular else 'l2', dim=dim)
    start = time.perf_counter()
    hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
    hnsw_index.add_items(X)
    build_time_hnsw = time.perf_counter() - start

    hnsw_index.set_ef(50)
    start = time.perf_counter()
    labels, distances = hnsw_index.knn_query(query, k=k)
    query_time_hnsw = time.perf_counter() - start

    results.append({
        "Metode": "HNSW",
        "Distance Metric": metric.title(),
        "Jumlah Data": f"{n_data:,}",
        "Dimensi": dim,
        "Waktu Build (s)": round(build_time_hnsw, 4),
        "Waktu Query (s)": round(query_time_hnsw, 6),
        "Contoh Index Terdekat": str(labels[0][:5])
    })

# ======================================================
# 3. Show the results as a clean, styled table
# ======================================================
df = pd.DataFrame(results)

styled = (
    df.style
    .set_table_styles([
        {"selector": "thead th",
         "props": [("background-color", "#4B8BBE"),
                   ("color", "white"),
                   ("font-weight", "bold"),
                   ("text-align", "center"),
                   ("font-size", "14px"),
                   ("border", "1px solid #ddd")]},
        {"selector": "tbody td",
         "props": [("text-align", "center"),
                   ("border", "1px solid #ddd"),
                   ("padding", "6px"),
                   ("font-size", "13px")]},
        {"selector": "tbody tr:nth-child(even)",
         "props": [("background-color", "#f9f9f9")]},
        {"selector": "tbody tr:hover",
         "props": [("background-color", "#f1f7ff")]}
    ])
    .set_caption("Tabel 1. Hasil Percobaan ANN dengan Metric Berbeda")
    .set_table_attributes('style="border-collapse: collapse; width: 100%; font-family: Arial;"')
)

display(HTML("<h3 style='font-family:Arial; color:#333;'>Hasil Percobaan ANN (Annoy, FAISS, HNSW)</h3>"))
display(styled)



===== Percobaan Metric: EUCLIDEAN =====

===== Percobaan Metric: ANGULAR =====


Unnamed: 0,Metode,Distance Metric,Jumlah Data,Dimensi,Waktu Build (s),Waktu Query (s),Contoh Index Terdekat
0,Annoy,Euclidean,100000,5,1.6455,0.0001,"[27609, 27983, 74899, 54345, 66763]"
1,FAISS,Euclidean,100000,5,0.0006,0.0007,[27609 27983 74899 54345 66763]
2,HNSW,Euclidean,100000,5,13.3506,0.0002,[27609 27983 74899 54345 66763]
3,Annoy,Angular,100000,5,3.025,0.0001,"[16891, 79174, 10984, 95644, 78500]"
