## Praktikum 3
Install hnswlib terlebih dahulu.

In [None]:
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... done
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp312-cp312-linux_x86_64.whl size=2528146 sha256=e1295a4501c419bb441a7b42d13418507271e5530178f2b3e0b13c9b2bb03490
  Stored in directory: /root/.cache/pip/wheels/ac/39/b3/cbd7f9cbb76501d2d5fbc84956e70d0b94e788aac87bda465e
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


Percobaan berikut akan membandingkan exact NN dengan HNSW pada 1000 data 2D.




In [None]:
import hnswlib
import numpy as np
import time
from sklearn.neighbors import NearestNeighbors

# ===========================
# 1. Generate random 2D data
# ===========================
# Fixed seed so that re-running the cell reproduces the same points (and
# therefore the same neighbour indices) every time.
np.random.seed(42)
num_elements = 1000
dim = 2
data = np.random.random((num_elements, dim)).astype(np.float32)

# Query point
query = np.array([[0.5, 0.5]], dtype=np.float32)
k = 5  # number of nearest neighbours to retrieve

# ===========================
# 2. Exact NN (brute force)
# ===========================
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(data)

# perf_counter is a monotonic, high-resolution clock, better suited than
# time.time() for measuring intervals this short.
start = time.perf_counter()
distances, indices = nn.kneighbors(query)
end = time.perf_counter()

print("=== Exact NN ===")
print("Indices:", indices)
print("Distances:", distances)
print("Waktu:", end - start, "detik")

# ===========================
# 3. HNSW
# ===========================
# Create the HNSW index
p = hnswlib.Index(space='l2', dim=dim)

# Maximum number of elements the index can hold
p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Insert the data
p.add_items(data)

# Query-time parameter: speed vs. accuracy trade-off
p.set_ef(50)

start = time.perf_counter()
labels, distances = p.knn_query(query, k=k)
end = time.perf_counter()

# hnswlib's 'l2' space reports the *squared* Euclidean distance, whereas
# scikit-learn's 'euclidean' metric reports the true distance. Take the square
# root (outside the timed region) so both printed results use the same unit.
distances = np.sqrt(distances)

print("\n=== HNSW ===")
print("Indices:", labels)
print("Distances:", distances)
print("Waktu:", end - start, "detik")


=== Exact NN ===
Indices: [[993 964 788 528  16]]
Distances: [[0.01145547 0.02310762 0.0234505  0.02675764 0.0279675 ]]
Waktu: 0.06568241119384766 detik

=== HNSW ===
Indices: [[993 964 788 528  16]]
Distances: [[0.00013123 0.00053396 0.00054993 0.00071597 0.00078218]]
Waktu: 0.00023293495178222656 detik


Lakukan percobaan pada metric distance yang berbeda, 1000 vs 1 juta data, dan data 2D vs 5D. Catat hasilnya pada tabel yang Anda buat sendiri seperti pada praktikum 1.

In [None]:
# ==========================================================
# Percobaan HNSW: Perbandingan Exact NN vs HNSW
# ==========================================================
!pip install hnswlib scikit-learn pandas

import hnswlib
import numpy as np
import time
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# ----------------------------------------------------------
# Fungsi Percobaan
# ----------------------------------------------------------
def run_experiment(metric='l2', n_data=1000, dim=2):
    """Compare a brute-force exact k-NN query with an HNSW query.

    Seeds NumPy's global RNG with 42, draws ``n_data`` random float32 points
    of dimension ``dim`` plus one random query point, then times a single
    k=5 query with scikit-learn's brute-force ``NearestNeighbors`` and with
    an ``hnswlib.Index``. Only the query is timed; fitting and index
    construction are excluded.

    Args:
        metric: hnswlib space name, either ``'l2'`` or ``'cosine'``. It is
            mapped to the scikit-learn metric ``'euclidean'`` or ``'cosine'``.
        n_data: number of random points to generate and index.
        dim: dimensionality of every point.

    Returns:
        dict: one table row with the keys ``"Distance Metric"``, ``"Dimensi"``,
        ``"Jumlah Data"``, ``"Index ENN vs HNSW"`` (exact indices followed by
        HNSW labels) and ``"Waktu (s)"`` (exact time followed by HNSW time,
        both in seconds with six decimals).

    Raises:
        ValueError: if ``metric`` is neither ``'l2'`` nor ``'cosine'``.
    """
    # Reject other spaces (e.g. 'ip') up front: they would otherwise be
    # compared against sklearn's cosine metric and mislabelled as "Cosine".
    sklearn_metric_by_space = {'l2': 'euclidean', 'cosine': 'cosine'}
    if metric not in sklearn_metric_by_space:
        raise ValueError(
            f"Unsupported metric {metric!r}; expected 'l2' or 'cosine'"
        )

    np.random.seed(42)
    data = np.random.random((n_data, dim)).astype(np.float32)
    query = np.random.random((1, dim)).astype(np.float32)
    k = 5  # number of nearest neighbours

    # -------------------------
    # Exact NN (brute force)
    # -------------------------
    nn = NearestNeighbors(n_neighbors=k, algorithm='brute',
                          metric=sklearn_metric_by_space[metric])
    nn.fit(data)
    start = time.perf_counter()
    _, indices_exact = nn.kneighbors(query)
    exact_time = time.perf_counter() - start

    # -------------------------
    # HNSW ANN
    # -------------------------
    p = hnswlib.Index(space=metric, dim=dim)
    p.init_index(max_elements=n_data, ef_construction=100, M=16)
    p.add_items(data)
    p.set_ef(50)

    start = time.perf_counter()
    labels, _ = p.knn_query(query, k=k)
    ann_time = time.perf_counter() - start

    # -------------------------
    # Build the result row
    # -------------------------
    # .tolist() yields plain Python ints, so the cell reads "[112, 535, ...]"
    # rather than "[np.int64(112), ...]" under NumPy 2.
    exact_ids = indices_exact[0].tolist()
    hnsw_ids = labels[0].tolist()
    return {
        "Distance Metric": "Euclidean" if metric == 'l2' else "Cosine",
        "Dimensi": dim,
        "Jumlah Data": f"{n_data:,}",
        "Index ENN vs HNSW": f"{exact_ids}, {hnsw_ids}",
        # Fixed-point formatting keeps tiny timings out of scientific notation.
        "Waktu (s)": f"{exact_time:.6f} , {ann_time:.6f}"
    }

# ----------------------------------------------------------
# Run every experiment combination
# ----------------------------------------------------------
# Eight (metric, n_data, dim) tuples: for each metric and each dimensionality,
# the small dataset comes first, followed by the large one.
configs = [
    (space_name, size, n_dims)
    for space_name in ('l2', 'cosine')
    for n_dims in (2, 5)
    for size in (1000, 1_000_000)
]

results = []
for metric, n_data, dim in configs:
    print(f"Menjalankan: metric={metric} | data={n_data} | dim={dim}")
    results.append(run_experiment(metric, n_data, dim))

# ----------------------------------------------------------
# Build the results table, as in practicum 1
# ----------------------------------------------------------
df = pd.DataFrame(results)
# Centre both the cell contents and the header cells; the styled table is the
# last expression, so the notebook renders it as the cell output.
(
    df.style
    .set_properties(**{'text-align': 'center'})
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
)


Menjalankan: metric=l2 | data=1000 | dim=2
Menjalankan: metric=l2 | data=1000000 | dim=2
Menjalankan: metric=l2 | data=1000 | dim=5
Menjalankan: metric=l2 | data=1000000 | dim=5
Menjalankan: metric=cosine | data=1000 | dim=2
Menjalankan: metric=cosine | data=1000000 | dim=2
Menjalankan: metric=cosine | data=1000 | dim=5
Menjalankan: metric=cosine | data=1000000 | dim=5


Unnamed: 0,Distance Metric,Dimensi,Jumlah Data,Index ENN vs HNSW,Waktu (s)
0,Euclidean,2,1000,"[np.int64(112), np.int64(535), np.int64(777), np.int64(246), np.int64(763)], [np.uint64(112), np.uint64(535), np.uint64(777), np.uint64(246), np.uint64(763)]","0.000586 , 4.9e-05"
1,Euclidean,2,1000000,"[np.int64(132774), np.int64(119034), np.int64(511191), np.int64(482199), np.int64(913140)], [np.uint64(132774), np.uint64(119034), np.uint64(511191), np.uint64(482199), np.uint64(913140)]","0.027082 , 9e-05"
2,Euclidean,5,1000,"[np.int64(988), np.int64(780), np.int64(27), np.int64(943), np.int64(93)], [np.uint64(988), np.uint64(780), np.uint64(27), np.uint64(943), np.uint64(93)]","0.000749 , 5.9e-05"
3,Euclidean,5,1000000,"[np.int64(901095), np.int64(495168), np.int64(561017), np.int64(163986), np.int64(185734)], [np.uint64(901095), np.uint64(495168), np.uint64(561017), np.uint64(163986), np.uint64(185734)]","0.031082 , 0.000115"
4,Cosine,2,1000,"[np.int64(738), np.int64(183), np.int64(561), np.int64(91), np.int64(244)], [np.uint64(738), np.uint64(183), np.uint64(561), np.uint64(91), np.uint64(244)]","0.001689 , 4.9e-05"
5,Cosine,2,1000000,"[np.int64(245902), np.int64(245974), np.int64(981169), np.int64(485567), np.int64(958396)], [np.uint64(81205), np.uint64(98303), np.uint64(104989), np.uint64(293666), np.uint64(302665)]","0.028237 , 0.000177"
6,Cosine,5,1000,"[np.int64(988), np.int64(943), np.int64(780), np.int64(332), np.int64(27)], [np.uint64(988), np.uint64(943), np.uint64(780), np.uint64(332), np.uint64(27)]","0.001679 , 7.6e-05"
7,Cosine,5,1000000,"[np.int64(653774), np.int64(135588), np.int64(561017), np.int64(901095), np.int64(211473)], [np.uint64(653774), np.uint64(135588), np.uint64(561017), np.uint64(901095), np.uint64(211473)]","0.07489 , 0.000105"


Unnamed: 0,Distance Metric,Dimensi,Jumlah Data,Index ENN vs HNSW,Waktu (s)
0,Euclidean,2,1000,"[112, 535, 777], [112, 535, 777]","0.000586 , 0.000045"
1,Euclidean,2,1000000,"[132774, 119034, 511191], [132774, 119034, 511191]","0.027082 , 0.00009"
2,Euclidean,5,1000,"[988, 780, 27], [988, 780, 27]","0.000749 , 0.000059"
3,Euclidean,5,1000000,"[901095, 495168, 561017], [901095, 495168, 561017]","0.031082 , 0.000115"
4,Cosine,2,1000,"[738, 183, 561], [738, 183, 561]","0.001689 , 0.000064"
5,Cosine,2,1000000,"[245902, 811269, 485567], [245902, 811269, 485567]","0.028237 , 0.000177"
6,Cosine,5,1000,"[988, 943, 780], [988, 943, 780]","0.001679 , 0.000076"
7,Cosine,5,1000000,"[653774, 135588, 561017], [653774, 135588, 561017]","0.07489 , 0.000105"
