## Pengantar

Percobaan kali ini kita akan melihat perbedaan ketiga model yang telah kita bahas dan bandingkan hasilnya.


In [3]:
pip install annoy faiss-cpu hnswlib

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: annoy, hnswlib
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp312-cp312-li

In [4]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Buat dataset 1 juta data 5D
# ===============================
n_data = 1_000_000   # bisa coba 100_000 dulu jika RAM terbatas
dim = 5
X = np.random.random((n_data, dim)).astype(np.float32)

# Query point
query = np.random.random((1, dim)).astype(np.float32)
k = 10

# ===============================
# 2. Annoy
# ===============================
print("=== Annoy ===")
ann_index = AnnoyIndex(dim, 'euclidean')

start = time.time()
for i in range(n_data):
    ann_index.add_item(i, X[i])
ann_index.build(10)  # 10 trees
build_time = time.time() - start

start = time.time()
neighbors = ann_index.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", neighbors[0][:5], "...")

# ===============================
# 3. FAISS (Flat Index)
# ===============================
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index = faiss.IndexFlatL2(dim)

start = time.time()
faiss_index.add(X)
build_time = time.time() - start

start = time.time()
distances, indices = faiss_index.search(query, k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", indices[0][:5], "...")

# ===============================
# 4. HNSW (hnswlib)
# ===============================
print("\n=== HNSW (hnswlib) ===")
hnsw_index = hnswlib.Index(space='l2', dim=dim)

start = time.time()
hnsw_index.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index.add_items(X)
build_time = time.time() - start

hnsw_index.set_ef(50)

start = time.time()
labels, distances = hnsw_index.knn_query(query, k=k)
query_time = time.time() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors:", labels[0][:5], "...")

=== Annoy ===
Build time: 34.19833207130432 detik
Query time: 0.00027441978454589844 detik
Neighbors: [763135, 819453, 151432, 610697, 633217] ...

=== FAISS (IndexFlatL2) ===
Build time: 0.017019033432006836 detik
Query time: 0.0064296722412109375 detik
Neighbors: [763135 819453 151432 610697 633217] ...

=== HNSW (hnswlib) ===
Build time: 191.35134434700012 detik
Query time: 0.0002651214599609375 detik
Neighbors: [763135 819453 151432 610697 633217] ...


## Tugas

Lakukan percobaan pada metric distance yang berbeda. Catat hasilnya pada tabel yang anda buat sendiri seperti pada praktikum 1.

In [5]:
import numpy as np
import pandas as pd
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# --------------------------------------------------------------
# Konfigurasi dataset (10k sampel, dimensi 5)
# --------------------------------------------------------------
N = 10_000
D = 5
K = 10

np.random.seed(0)
data_vecs = np.random.rand(N, D).astype(np.float32)
query_vec = np.random.rand(1, D).astype(np.float32)

summary = []

# --------------------------------------------------------------
# Uji Annoy dengan beberapa metric
# --------------------------------------------------------------
annoy_distance_types = ["euclidean", "angular", "manhattan"]

for dist_type in annoy_distance_types:
    ann = AnnoyIndex(D, dist_type)

    # Penyusunan index + penambahan item
    t0 = time.time()
    for idx in range(N):
        ann.add_item(idx, data_vecs[idx])
    ann.build(10)
    t_build = time.time() - t0

    # Query
    t1 = time.time()
    _ = ann.get_nns_by_vector(query_vec[0], K)
    t_query = time.time() - t1

    summary.append({
        "Library": "Annoy",
        "Metric": dist_type,
        "Build (s)": round(t_build, 3),
        "Query (s)": round(t_query, 6)
    })

# --------------------------------------------------------------
# Uji FAISS (L2, L1, Inner Product)
# --------------------------------------------------------------
faiss_indices = {
    "L2" : faiss.IndexFlatL2(D),
    "L1" : faiss.IndexFlat(D, faiss.METRIC_L1),
    "IP" : faiss.IndexFlatIP(D)
}

for metric_name, idx_model in faiss_indices.items():
    t0 = time.time()
    idx_model.add(data_vecs)
    t_build = time.time() - t0

    t1 = time.time()
    _ = idx_model.search(query_vec, K)
    t_query = time.time() - t1

    summary.append({
        "Library": "FAISS",
        "Metric": metric_name,
        "Build (s)": round(t_build, 3),
        "Query (s)": round(t_query, 6)
    })

# --------------------------------------------------------------
# Uji HNSW (L2, Cosine, Inner Product)
# --------------------------------------------------------------
hnsw_spaces = ["l2", "cosine", "ip"]

for space_type in hnsw_spaces:
    hnsw = hnswlib.Index(space=space_type, dim=D)

    t0 = time.time()
    hnsw.init_index(max_elements=N, M=16, ef_construction=100)
    hnsw.add_items(data_vecs)
    t_build = time.time() - t0

    hnsw.set_ef(50)

    t1 = time.time()
    _ = hnsw.knn_query(query_vec, k=K)
    t_query = time.time() - t1

    summary.append({
        "Library": "HNSW",
        "Metric": space_type.upper(),
        "Build (s)": round(t_build, 3),
        "Query (s)": round(t_query, 6)
    })

# --------------------------------------------------------------
# Tampilkan tabel hasil
# --------------------------------------------------------------
df_results = pd.DataFrame(summary)
print(df_results.to_string(index=False))


Library    Metric  Build (s)  Query (s)
  Annoy euclidean      0.154   0.000103
  Annoy   angular      0.261   0.000079
  Annoy manhattan      0.184   0.000104
  FAISS        L2      0.000   0.000150
  FAISS        L1      0.000   0.002601
  FAISS        IP      0.000   0.000160
   HNSW        L2      0.631   0.000100
   HNSW    COSINE      0.807   0.000115
   HNSW        IP      1.420   0.000141
