## Install Library

Install hnswlib terlebih dahulu.

In [2]:
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp312-cp312-linux_x86_64.whl size=2528146 sha256=1de4ae9c1c6ca90c9dce422b278f7a51d09f744d84b1cab2cf4d7164e1826c86
  Stored in directory: /root/.cache/pip/wheels/ac/39/b3/cbd7f9cbb76501d2d5fbc84956e70d0b94e788aac87bda465e
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


## Percobaan

Percobaan berikut akan membandingkan exact NN dengan HNSW pada 1000 data 2D.

In [3]:
import hnswlib
import numpy as np
import time
from sklearn.neighbors import NearestNeighbors

# ===========================
# 1. Buat data 2D acak
# ===========================
num_elements = 1000
dim = 2
data = np.random.random((num_elements, dim)).astype(np.float32)

# Query point
query = np.array([[0.5, 0.5]], dtype=np.float32)
k = 5  # cari 5 tetangga terdekat

# ===========================
# 2. Exact NN (Brute Force)
# ===========================
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(data)

start = time.time()
distances, indices = nn.kneighbors(query)
end = time.time()

print("=== Exact NN ===")
print("Indices:", indices)
print("Distances:", distances)
print("Waktu:", end - start, "detik")

# ===========================
# 3. HNSW
# ===========================
# Inisialisasi index HNSW
p = hnswlib.Index(space='l2', dim=dim)

# Ukuran maksimum elemen yang bisa ditampung
p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Tambahkan data
p.add_items(data)

# Set parameter pencarian
p.set_ef(50)   # tradeoff speed vs accuracy

start = time.time()
labels, distances = p.knn_query(query, k=k)
end = time.time()

print("\n=== HNSW ===")
print("Indices:", labels)
print("Distances:", distances)
print("Waktu:", end - start, "detik")

=== Exact NN ===
Indices: [[153 947 251 281 328]]
Distances: [[0.01380907 0.028142   0.04482079 0.05125963 0.05424578]]
Waktu: 0.037459373474121094 detik

=== HNSW ===
Indices: [[153 947 251 281 328]]
Distances: [[0.00019069 0.00079197 0.0020089  0.00262755 0.0029426 ]]
Waktu: 0.00014781951904296875 detik


## Tugas

Lakukan percobaan pada metric distance yang berbeda, 1000 vs 1jt data, 2D vs 5D data. Catat hasilnya pada tabel yang anda buat sendiri seperti pada praktikum 1.

In [4]:
import pandas as pd
import numpy as np
import time
import hnswlib
from sklearn.neighbors import NearestNeighbors

# -------------------------------------------------------------------
# Fungsi Evaluasi: Exact NN vs HNSW
# -------------------------------------------------------------------
def evaluate_hnsw(dataset, q_point, k=10):
    feature_dim = dataset.shape[1]

    # --- Exact Nearest Neighbor (Brute Force) ---
    nbr = NearestNeighbors(n_neighbors=k, algorithm="brute", metric="euclidean")
    nbr.fit(dataset)

    t0 = time.time()
    dist_ex, idx_ex = nbr.kneighbors(q_point)
    t_exact = time.time() - t0

    # --- HNSW Indexing + Query ---
    index = hnswlib.Index(space="l2", dim=feature_dim)
    index.init_index(max_elements=len(dataset), ef_construction=100, M=16)
    index.add_items(dataset)
    index.set_ef(50)

    t1 = time.time()
    idx_hnsw, dist_hnsw = index.knn_query(q_point, k=k)
    t_hnsw = time.time() - t1

    # --- Recall ---
    set_exact = set(idx_ex[0])
    set_hnsw  = set(idx_hnsw[0])
    recall = len(set_exact & set_hnsw) / k

    return t_exact, t_hnsw, recall


# -------------------------------------------------------------------
# Konfigurasi Percobaan
# -------------------------------------------------------------------
experiments = [
    {"n": 1000,      "d": 2},
    {"n": 1_000_000, "d": 2},
    {"n": 1000,      "d": 5},
    {"n": 1_000_000, "d": 5},
]

records = []

# -------------------------------------------------------------------
# Eksekusi semua konfigurasi
# -------------------------------------------------------------------
for exp in experiments:
    N, D = exp["n"], exp["d"]

    np.random.seed(42)
    data_sample = np.random.rand(N, D).astype(np.float32)
    query_pt    = np.random.rand(1, D).astype(np.float32)

    exact_t, hnsw_t, rec = evaluate_hnsw(data_sample, query_pt)

    records.append({
        "Jumlah Data": N,
        "Dimensi": D,
        "Waktu Exact (s)": round(exact_t, 6),
        "Waktu HNSW (s)": round(hnsw_t, 6),
        "Recall@10": round(rec, 3)
    })


# -------------------------------------------------------------------
# Tabel Hasil
# -------------------------------------------------------------------
df_result = pd.DataFrame(records)
print(df_result.to_string(index=False))


 Jumlah Data  Dimensi  Waktu Exact (s)  Waktu HNSW (s)  Recall@10
        1000        2         0.000876        0.000054        1.0
     1000000        2         0.026257        0.000109        1.0
        1000        5         0.001352        0.000093        1.0
     1000000        5         0.031710        0.000139        1.0
