# TWORZENIE ZBIOROW: WALIDACYJNEGO, TESTOWEGO, TRENINGOWEGO + BENCHMARK KNN

## FUNKCJA POMOCNICZA + IMPORTY

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.metrics import accuracy_score, classification_report

In [3]:
def load_fma_data(tracks_path='~/studia/3_sem/priad/big_project/PROJEKT_PRiAD/data/tracks.csv', features_path='~/studia/3_sem/priad/big_project/PROJEKT_PRiAD/data/features.csv', subset='small'):

    print(f"ladowanie danych (subset: {subset})...")

    tracks = pd.read_csv(tracks_path, index_col=0, header=[0, 1], low_memory=False)

    if subset is not None:
        subset_mask = tracks[('set', 'subset')] == subset
        tracks = tracks[subset_mask]


    features = pd.read_csv(features_path, index_col=0, header=[0, 1, 2], low_memory=False)


    common_ids = tracks.index.intersection(features.index)
    tracks = tracks.loc[common_ids]
    features = features.loc[common_ids]

    print(f"zaladowano {len(tracks)} utworow.")
    return tracks, features

## PRZYGOTOWANIE ZBIOROW (TRAIN / VAL / TEST)

### LADOWANIE DANYCH

In [4]:
tracks, features = load_fma_data(subset='small')

ladowanie danych (subset: small)...
zaladowano 8000 utworow.


### WWYCIAGNIECIE ETYKIET

In [5]:
y = tracks[('track', 'genre_top')]

### WYCIAGNIECIE INFORMACJI O PODZIALE (SPLIT) Z METADANYCH

In [6]:
split_column = tracks[('set', 'split')]

### MASKA LOGICZNA DLA KAZDEGO ZBIORU

In [7]:
mask_train = split_column == 'training'
mask_val = split_column == 'validation'
mask_test = split_column == 'test'

### PODZIAL CECH I ETYKIET

In [8]:
X_train = features.loc[mask_train]
X_val = features.loc[mask_val]
X_test = features.loc[mask_test]

y_train = y.loc[mask_train]
y_val = y.loc[mask_val]
y_test = y.loc[mask_test]

In [9]:
print(f"liczebnosc zbiorow:")
print(f" - treningowy:  {X_train.shape[0]}")
print(f" - walidacyjny: {X_val.shape[0]}")
print(f" - testowy:     {X_test.shape[0]}")

liczebnosc zbiorow:
 - treningowy:  6400
 - walidacyjny: 800
 - testowy:     800


## PREPROCESSING (STANDARYZACJA)

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## BENCHMARK kNN (TRENING I WALIDACJA)

### WYSZUKIWANIE PARAMETRU K I NAJLEPSZEJ METRYKI

In [11]:
k_values = [5, 10, 20, 50, 70, 100]
best_k = 5
best_score = 0
best_metric = ""

print("szukanie optymalnego k i metryki na zbiorze walidacyjnym:")
for k in k_values:

    knn_cosine = KNeighborsClassifier(n_neighbors=k, metric='cosine')
    knn_euclidean = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    knn_cosine.fit(X_train_scaled, y_train)
    knn_euclidean.fit(X_train_scaled, y_train)

    score_cosine = knn_cosine.score(X_val_scaled, y_val)
    score_euclidean = knn_euclidean.score(X_test_scaled, y_test)
    print(f"k={k}: accuracy cosine (val) = {score_cosine:.4f}")
    print(f"k={k}: accuracy euclidean (val) = {score_euclidean:.4f}")

    score, metric = (
    (score_cosine, "cosine")
    if score_cosine > score_euclidean
    else (score_euclidean, "euclidean")
    )

    if score > best_score:
        best_score = score
        best_k = k
        best_metric = metric

print(f"\nnajlepsze k wybrane w walidacji: {best_k} dla metryki {best_metric}.")

szukanie optymalnego k i metryki na zbiorze walidacyjnym:


k=5: accuracy cosine (val) = 0.4662
k=5: accuracy euclidean (val) = 0.3225
k=10: accuracy cosine (val) = 0.4800
k=10: accuracy euclidean (val) = 0.3212
k=20: accuracy cosine (val) = 0.4850
k=20: accuracy euclidean (val) = 0.3563
k=50: accuracy cosine (val) = 0.4888
k=50: accuracy euclidean (val) = 0.3513
k=70: accuracy cosine (val) = 0.4888
k=70: accuracy euclidean (val) = 0.3538
k=100: accuracy cosine (val) = 0.4850
k=100: accuracy euclidean (val) = 0.3463

najlepsze k wybrane w walidacji: 50 dla metryki cosine.


### OSTATECZNY BENCHMARK

In [12]:
final_model = KNeighborsClassifier(n_neighbors=best_k, metric=best_metric)
final_model.fit(X_train_scaled, y_train)

y_pred = final_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"\nWYNIK KONCOWY (BENCHMARK)")
print(f"metoda: kNN (k={best_k}, metric={best_metric})")
print(f"accuracy na zbiorze testowym: {test_accuracy:.4f}")
print("\nraport klasyfikacji:")
print(classification_report(y_test, y_pred))


WYNIK KONCOWY (BENCHMARK)
metoda: kNN (k=50, metric=cosine)
accuracy na zbiorze testowym: 0.4213

raport klasyfikacji:
               precision    recall  f1-score   support

   Electronic       0.45      0.61      0.52       100
 Experimental       0.32      0.26      0.29       100
         Folk       0.24      0.37      0.29       100
      Hip-Hop       0.60      0.59      0.60       100
 Instrumental       0.33      0.35      0.34       100
International       0.45      0.38      0.41       100
          Pop       0.63      0.12      0.20       100
         Rock       0.58      0.69      0.63       100

     accuracy                           0.42       800
    macro avg       0.45      0.42      0.41       800
 weighted avg       0.45      0.42      0.41       800



## PRZYK≈ÅAD REKOMENDACJI (SYSTEM REKOMENDACYJNY)

In [13]:
print("\n--- symulacja systemu rekomendacyjnego ---")
recommender = NearestNeighbors(n_neighbors=5, metric='cosine')
recommender.fit(X_train_scaled)

# biore pierwszy utwor ze zbioru testowego jako "zapytanie" (query song)
query_index = 0
query_song_features = X_test_scaled[query_index].reshape(1, -1)
query_song_id = X_test.index[query_index]
query_genre = y_test.iloc[query_index]

distances, indices = recommender.kneighbors(query_song_features)

print(f"dla utworu ID: {query_song_id} (Gatunek: {query_genre}), rekomendowane utwory (ID ze zbioru treningowego):")

for i, idx in enumerate(indices[0]):
    neighbor_id = X_train.index[idx]
    neighbor_genre = y_train.iloc[idx]
    dist = distances[0][i]
    print(f"{i+1}. ID: {neighbor_id} | gatunek: {neighbor_genre} | dystans: {dist:.4f}")


--- symulacja systemu rekomendacyjnego ---
dla utworu ID: 182 (Gatunek: Rock), rekomendowane utwory (ID ze zbioru treningowego):
1. ID: 119086 | gatunek: Experimental | dystans: 0.5574
2. ID: 131770 | gatunek: Pop | dystans: 0.5708
3. ID: 4522 | gatunek: Electronic | dystans: 0.5765
4. ID: 126221 | gatunek: Experimental | dystans: 0.5768
5. ID: 58140 | gatunek: Folk | dystans: 0.5835
