# **Praktikum 6**

In [None]:
!pip install kaggle



In [None]:
from google.colab import files

# Ask the user to upload their Kaggle API token file (kaggle.json).
print("Unggah file kaggle.json Anda:")

# Bind the result instead of leaving the call as the last expression of the
# cell: `files.upload()` returns a {filename: bytes} dict, and echoing it as the
# cell output writes the raw API key into the saved notebook.
uploaded = files.upload()

# The next cell copies `kaggle.json` by name, so stop here with a clear message
# if the upload was cancelled or the file arrived under a different name.
if "kaggle.json" not in uploaded:
    raise FileNotFoundError(
        f"File 'kaggle.json' tidak ditemukan dalam unggahan: {list(uploaded)}"
    )

Unggah file kaggle.json Anda:


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"farrelmk","key":"<REDACTED - rotate this Kaggle API key>"}'}

In [None]:
# Place the uploaded API token under ~/.kaggle (presumably where the `kaggle`
# CLI used later in this notebook looks for its credentials — confirm against
# the Kaggle API docs).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: readable/writable by the owner only, since the file holds a secret.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Create the target folder for the dataset download (`-p`: create parents, no
# error if it already exists).
# NOTE(review): no visible cell mounts Google Drive. If /content/drive is not
# mounted, this creates an ordinary directory on the runtime's local disk
# rather than a folder on Drive — confirm Drive is mounted before running.
!mkdir -p /content/drive/MyDrive/Kaggle_Datasets/spotify

In [None]:
# Download the Spotify dataset with the Kaggle CLI into the
# Kaggle_Datasets/spotify folder and unzip it there.
# NOTE(review): the analysis cell in this notebook loads the data through
# `kagglehub`, not from this directory — confirm this download is still needed.
!kaggle datasets download -d bwandowando/spotify-songs-with-attributes-and-lyrics -p /content/drive/MyDrive/Kaggle_Datasets/spotify --unzip

Dataset URL: https://www.kaggle.com/datasets/bwandowando/spotify-songs-with-attributes-and-lyrics
License(s): CC-BY-NC-SA-4.0
Downloading spotify-songs-with-attributes-and-lyrics.zip to /content/drive/MyDrive/Kaggle_Datasets/spotify
100% 893M/894M [00:05<00:00, 126MB/s]
100% 894M/894M [00:05<00:00, 170MB/s]


In [None]:
import pandas as pd
import numpy as np
import time
import faiss
from annoy import AnnoyIndex
import hnswlib
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import kagglehub
from kagglehub import KaggleDatasetAdapter
import os

# -------------------------------
# Load dataset (fetched from Kaggle through kagglehub)
# -------------------------------
print("Memuat dataset dari Kaggle...")
dataset_handle = "bwandowando/spotify-songs-with-attributes-and-lyrics"

# CSV names to try inside the dataset, in order: the primary file first, then a
# fallback name.
candidate_files = ["songs_with_attributes_and_lyrics.csv", "spotify_songs.csv"]

for attempt, file_path_in_dataset in enumerate(candidate_files):
    try:
        df = kagglehub.dataset_load(
            KaggleDatasetAdapter.PANDAS,
            handle=dataset_handle,
            path=file_path_in_dataset,
        )
    # Broad catch is deliberate: any load failure (missing file, network, auth)
    # should trigger the fallback. The error is always reported, never dropped.
    except Exception as load_error:
        if attempt == len(candidate_files) - 1:
            print(f"Gagal memuat kedua file. Error: {load_error}")
            # Stop execution: nothing below can run without the data.
            raise
        next_file = candidate_files[attempt + 1]
        print(
            f"Gagal memuat '{file_path_in_dataset}' ({load_error}). "
            f"Mencoba '{next_file}'..."
        )
    else:
        print(f"Dataset '{file_path_in_dataset}' berhasil dimuat!")
        break

# -------------------------------
# Data preprocessing
# -------------------------------
# Numeric audio attributes that form the vector used for neighbour search.
features = ['danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Drop rows with a NaN in any feature so no NaN reaches the distance
# computations. Reset the index afterwards: every index built below identifies
# items by row position, so position i must be row i of `df_clean` even when
# rows were removed.
print(f"Data awal: {len(df)} lagu")
df_clean = df.dropna(subset=features).reset_index(drop=True)
print(f"Data setelah dibersihkan (dihapus NaN): {len(df_clean)} lagu")

X = df_clean[features].to_numpy(dtype='float32')

# Standardise each feature so no single attribute (e.g. tempo or loudness)
# dominates the Euclidean distance purely because of its scale.
scaler = StandardScaler()
# FAISS works on row-major (C-contiguous) float32 matrices; enforce that layout
# once here instead of relying on whatever layout pandas/sklearn hand back.
X_scaled = np.ascontiguousarray(scaler.fit_transform(X), dtype='float32')

k = 10  # number of nearest neighbours to retrieve per query
n_samples, d = X_scaled.shape
print(f"Data siap: {n_samples} lagu, {d} dimensi.")

# ==========================================================
# STEP 1: MEASURE BUILD TIME (constructing each index)
# ==========================================================
# Durations use time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring intervals, unlike time.time() which follows the system clock.
print("\n--- [ LANGKAH 1: Mengukur Waktu Build ] ---")
print(f"(Membangun index untuk {n_samples} item...)")

# -------------------------------
# Exact NN (Build)
# -------------------------------
# Brute force builds no search structure, so "build" is just storing the data.
start = time.perf_counter()
nn = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='euclidean')
nn.fit(X_scaled)
time_build_exact = time.perf_counter() - start
print(f"Exact NN build done in {time_build_exact:.3f} s")

# -------------------------------
# Annoy (Build)
# -------------------------------
ANNOY_SEED = 42  # fixed seed so the random-projection forest is reproducible
start = time.perf_counter()
index_annoy = AnnoyIndex(d, 'euclidean')
index_annoy.set_seed(ANNOY_SEED)  # must be set before the trees are built
for i, v in enumerate(X_scaled):
    index_annoy.add_item(i, v)
index_annoy.build(10)  # 10 trees
time_build_annoy = time.perf_counter() - start
print(f"Annoy build done in {time_build_annoy:.3f} s")

# -------------------------------
# HNSW (Build)
# -------------------------------
start = time.perf_counter()
p_hnsw = hnswlib.Index(space='l2', dim=d)
p_hnsw.init_index(max_elements=n_samples, ef_construction=200, M=16)
p_hnsw.add_items(X_scaled)
time_build_hnsw = time.perf_counter() - start
print(f"HNSW build done in {time_build_hnsw:.3f} s")

# -------------------------------
# FAISS IVF (Build)
# -------------------------------
start = time.perf_counter()
quantizer = faiss.IndexFlatL2(d)
nlist = 100  # number of clusters (inverted lists)

# IndexIVFFlat takes its arguments positionally; passing `nlist=` / `metric=`
# as keywords raises a TypeError.
index_faiss = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

index_faiss.train(X_scaled)  # learn the cluster centroids
index_faiss.add(X_scaled)
index_faiss.nprobe = 10  # clusters visited per query (used at search time)
time_build_faiss = time.perf_counter() - start
print(f"FAISS IVF build done in {time_build_faiss:.3f} s")

# ==========================================================
# STEP 2: MEASURE QUERY TIME (realistic benchmark)
# ==========================================================
# Durations use time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring intervals, unlike time.time() which follows the system clock.
print("\n--- [ LANGKAH 2: Mengukur Waktu Query ] ---")

# Benchmark a fixed batch of 1000 queries rather than every item in the data.
# The queries are the first rows of the indexed matrix itself, so each query's
# own row is expected among its neighbours.
n_queries = 1000
query_set = X_scaled[:n_queries]
print(f"(Menguji {n_queries} query...)")

# -------------------------------
# Exact NN (Query)
# -------------------------------
start = time.perf_counter()
dist_exact, idx_exact = nn.kneighbors(query_set)
time_query_exact = time.perf_counter() - start
print(f"Exact NN query done in {time_query_exact:.3f} s")

# -------------------------------
# Annoy (Query)
# -------------------------------
start = time.perf_counter()
# Annoy answers one vector per call; a plain loop over 1000 queries is enough.
idx_annoy = [index_annoy.get_nns_by_vector(v, k) for v in query_set]
time_query_annoy = time.perf_counter() - start
print(f"Annoy query done in {time_query_annoy:.3f} s")

# -------------------------------
# HNSW (Query)
# -------------------------------
# `ef` is a search-time setting; configure it before starting the clock so only
# the search itself is timed.
p_hnsw.set_ef(200)
start = time.perf_counter()
idx_hnsw, dist_hnsw = p_hnsw.knn_query(query_set, k=k)
time_query_hnsw = time.perf_counter() - start
print(f"HNSW query done in {time_query_hnsw:.3f} s")

# -------------------------------
# FAISS IVF (Query)
# -------------------------------
start = time.perf_counter()
dist_faiss, idx_faiss = index_faiss.search(query_set, k)
time_query_faiss = time.perf_counter() - start
print(f"FAISS IVF query done in {time_query_faiss:.3f} s")

# -------------------------------
# Print the timing summary
# -------------------------------
build_times = {
    "Exact NN": time_build_exact,
    "Annoy": time_build_annoy,
    "HNSW": time_build_hnsw,
    "FAISS": time_build_faiss,
}
query_times = {
    "Exact NN": time_query_exact,
    "Annoy": time_query_annoy,
    "HNSW": time_query_hnsw,
    "FAISS": time_query_faiss,
}

print("\n=== Ringkasan Waktu Build (detik) ===")
for method, seconds in build_times.items():
    print(f"{method:<9}: {seconds:.3f}")

# This header must be an f-string; without the `f` prefix the literal text
# "{n_queries}" is printed instead of the number of queries.
print(f"\n=== Ringkasan Waktu Query ({n_queries} item) (detik) ===")
for method, seconds in query_times.items():
    print(f"{method:<9}: {seconds:.3f}")

# -------------------------------
# Example: top-5 neighbours of the first item in the query set
# -------------------------------
first_query_neighbors = {
    "Exact NN": idx_exact[0][:5],
    "Annoy": idx_annoy[0][:5],
    "HNSW": idx_hnsw[0][:5],
    "FAISS": idx_faiss[0][:5],
}
print("\nTop-5 neighbors for first song in query set:")
for method, neighbor_ids in first_query_neighbors.items():
    # Pad "<method>:" to 10 characters so the id lists line up in a column.
    print(f"{method + ':':<10}{neighbor_ids}")

Memuat dataset dari Kaggle...
Using Colab cache for faster access to the 'spotify-songs-with-attributes-and-lyrics' dataset.
Dataset 'songs_with_attributes_and_lyrics.csv' berhasil dimuat!
Data awal: 955320 lagu
Data setelah dibersihkan (dihapus NaN): 955320 lagu
Data siap: 955320 lagu, 9 dimensi.

--- [ LANGKAH 1: Mengukur Waktu Build ] ---
(Membangun index untuk 955320 item...)
Exact NN build done in 0.018 s
Annoy build done in 14.265 s
HNSW build done in 159.251 s
FAISS IVF build done in 0.200 s

--- [ LANGKAH 2: Mengukur Waktu Query ] ---
(Menguji 1000 query...)
Exact NN query done in 3.100 s
Annoy query done in 0.218 s
HNSW query done in 0.121 s
FAISS IVF query done in 0.409 s

=== Ringkasan Waktu Build (detik) ===
Exact NN : 0.018
Annoy    : 14.265
HNSW     : 159.251
FAISS    : 0.200

=== Ringkasan Waktu Query ({n_queries} item) (detik) ===
Exact NN : 3.100
Annoy    : 0.218
HNSW     : 0.121
FAISS    : 0.409

Top-5 neighbors for first song in query set:
Exact NN: [     0 394553 76

1. Waktu Build (Biaya Awal)
Ini adalah "biaya" satu kali untuk mempersiapkan 955.320 lagu.

 * HNSW adalah yang paling lama (159.3s), menunjukkan bahwa membangun struktur graf berlapis-lapisnya adalah proses yang paling kompleks.

 * Exact NN nyaris instan (0.018s) karena tidak membangun index sama sekali; ia hanya menyimpan data.

2. Waktu Query (Tes Kecepatan Sebenarnya)
Ini adalah bagian terpenting, yang dimungkinkan oleh modifikasi efisiensi Anda (menguji 1.000 query).

 * Exact NN (Brute-force) adalah yang paling lambat (3.1s). Ini adalah baseline kita.

 * HNSW, Annoy, dan FAISS semuanya jauh lebih cepat (berkisar antara 0.12s - 0.41s).

Kesimpulan Efisiensi: Modifikasi Anda membuktikan bahwa dengan "membayar" biaya build di awal, metode ANN (Approximate Nearest Neighbor) mampu melakukan pencarian sekitar 7.6x hingga 25.6x lebih cepat (3.100s / 0.409s untuk FAISS, 3.100s / 0.121s untuk HNSW) daripada brute-force.

3. Akurasi (Hasil Pencarian)
Bagian "Top-5 neighbors" membuktikan trade-off ini:

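* Meskipun sekitar 7.6x hingga 25.6x lebih cepat pada run ini, untuk lagu pertama dalam query set semua metode ANN (Annoy, HNSW, FAISS) berhasil menemukan 5 tetangga yang 100% identik dengan Exact NN. Ini baru satu contoh query, bukan pengukuran recall atas seluruh 1.000 query.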