# **Praktikum 4**

In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
n_data = 1_000_000   # try 100_000 first if RAM is limited
dim = 5
# Fixed seed so the data, the query and the printed neighbours are
# reproducible from run to run.
np.random.seed(42)
X = np.random.random((n_data, dim)).astype(np.float32)

# Query point
query = np.random.random((1, dim)).astype(np.float32)
k = 10

# NOTE: all timings use time.perf_counter(), a monotonic high-resolution
# clock; time.time() is a wall clock and can be too coarse for the
# sub-millisecond query times measured here.

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

start = time.perf_counter()
# Annoy only accepts items one at a time, so the insertion loop is part of
# the measured build time.
for i, vec in enumerate(X):
    ann_index.add_item(i, vec)
ann_index.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is a pair (ids, distances).
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.perf_counter()
faiss_index.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
distances, indices = faiss_index.search(query, k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

start = time.perf_counter()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.perf_counter() - start

# Query-time search parameter; set outside the timed sections so it is
# counted in neither the build nor the query time.
hnsw_index.set_ef(50)

start = time.perf_counter()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")


=== Annoy ===
Build time: 22.598904609680176 detik
Query time: 0.0002491474151611328 detik
Neighbors: [222100, 507402, 94175, 827168, 509951] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.00824284553527832 detik
Query time: 0.006329536437988281 detik
Neighbors: [222100 507402  94175 827168 509951] ...

=== HNSW (hnswlib) ===
Build time: 182.7708580493927 detik
Query time: 0.00029397010803222656 detik
Neighbors: [222100 507402  94175 827168 509951] ...


In [None]:
import numpy as np
import time
import annoy
import faiss
import hnswlib
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# ===============================
# 1. Experiment parameters
# ===============================
n_data = 1_000_000
dim = 5
k = 10
n_queries = 100  # use 100 queries so the measured time is more stable
metrics_to_test = ['L2', 'IP']

# Accumulates one result row (dict) per library/metric combination
results = []

print(f"Memulai eksperimen: {n_data} data, {dim} dimensi, {k} tetangga, {n_queries} query.")
print("-" * 50)

# ===============================
# 2. Run the experiment
# ===============================
# NOTE: all timings use time.perf_counter(), a monotonic high-resolution
# clock; time.time() is a wall clock and can be too coarse for the
# millisecond-scale query times measured here.

for metric_name in metrics_to_test:
    print(f"\n--- Menguji Metric: {metric_name} ---")

    # Regenerate the data for every metric from the same seed: the IP branch
    # below normalizes X and queries in place, so the arrays cannot be shared
    # between iterations.
    np.random.seed(42)
    X = np.random.random((n_data, dim)).astype(np.float32)
    queries = np.random.random((n_queries, dim)).astype(np.float32)

    # Pick the per-library settings for this metric
    if metric_name == 'L2':
        annoy_metric = 'euclidean'
        faiss_index = faiss.IndexFlatL2(dim)
        hnsw_space = 'l2'

    elif metric_name == 'IP':
        # Annoy is run with its 'angular' metric as the cosine/IP counterpart
        annoy_metric = 'angular'
        faiss_index = faiss.IndexFlatIP(dim)
        hnsw_space = 'ip'

        # Normalizing to unit length is what makes inner product and
        # cosine/angular comparable across the libraries.
        print("Normalisasi data untuk metrik IP/Angular...")
        faiss.normalize_L2(X)
        faiss.normalize_L2(queries)

    else:
        # Fail loudly instead of silently reusing the previous iteration's
        # settings (or hitting a NameError on the first iteration).
        raise ValueError(f"Unsupported metric: {metric_name!r}")

    # --- 3. Annoy ---
    print("Menguji Annoy...")
    ann_index = annoy.AnnoyIndex(dim, annoy_metric)

    start = time.perf_counter()
    # Annoy only accepts items one at a time, so the insertion loop is part
    # of the measured build time.
    for i, vec in enumerate(X):
        ann_index.add_item(i, vec)
    ann_index.build(10)  # 10 trees
    build_time = time.perf_counter() - start

    start = time.perf_counter()
    for q in queries:  # Annoy is queried one vector at a time
        ann_index.get_nns_by_vector(q, k)
    query_time = time.perf_counter() - start

    results.append({
        'Library': 'Annoy',
        'Metric': metric_name,
        'Build Time (s)': build_time,
        'Query Time (s)': query_time
    })

    # --- 4. FAISS (Flat / Exact) ---
    print("Menguji FAISS (Flat)...")

    start = time.perf_counter()
    faiss_index.add(X)
    build_time = time.perf_counter() - start

    start = time.perf_counter()
    distances, indices = faiss_index.search(queries, k)
    query_time = time.perf_counter() - start

    results.append({
        'Library': 'FAISS (Flat)',
        'Metric': metric_name,
        'Build Time (s)': build_time,
        'Query Time (s)': query_time
    })

    # --- 5. HNSW (hnswlib) ---
    print("Menguji HNSW...")
    hnsw_index = hnswlib.Index(space=hnsw_space, dim=dim)

    start = time.perf_counter()
    hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
    hnsw_index.add_items(X)
    build_time = time.perf_counter() - start

    # Query-time search parameter; set outside the timed sections so it is
    # counted in neither the build nor the query time.
    hnsw_index.set_ef(50)

    start = time.perf_counter()
    labels, distances = hnsw_index.knn_query(queries, k=k)
    query_time = time.perf_counter() - start

    results.append({
        'Library': 'HNSW',
        'Metric': metric_name,
        'Build Time (s)': build_time,
        'Query Time (s)': query_time
    })

# ===============================
# 3. Show the results
# ===============================
print("\n" + "=" * 50)
# Build the heading from the actual parameters so it stays correct when
# n_data or dim are changed.
print(f"HASIL EKSPERIMEN ({n_data:,} Data, {dim} Dimensi)")
print("=" * 50)

df = pd.DataFrame(results)
print(df.to_string())

Memulai eksperimen: 1000000 data, 5 dimensi, 10 tetangga, 100 query.
--------------------------------------------------

--- Menguji Metric: L2 ---
Menguji Annoy...
Menguji FAISS (Flat)...
Menguji HNSW...

--- Menguji Metric: IP ---
Normalisasi data untuk metrik IP/Angular...
Menguji Annoy...
Menguji FAISS (Flat)...
Menguji HNSW...

HASIL EKSPERIMEN (1 Juta Data, 5 Dimensi)
        Library Metric  Build Time (s)  Query Time (s)
0         Annoy     L2       22.129065        0.007965
1  FAISS (Flat)     L2        0.015058        0.990025
2          HNSW     L2      174.708445        0.004487
3         Annoy     IP       29.769129        0.007183
4  FAISS (Flat)     IP        0.008443        0.122989
5          HNSW     IP      156.178042        0.004334
