### Preprocessing

In [1]:
import sys
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

1) Resize the data

In [3]:
from src.utils.resize_features import process_files
process_files("../data/da-tacos_benchmark_subset_hpcp", "../data/17K_data", 17000)

Processing Folders: 100%|██████████| 3000/3000 [01:55<00:00, 26.06folder/s]


2. Validate the length

In [2]:
from src.utils.check_length import check_hpcp_lengths

check_hpcp_lengths("../data/17K_data")


Unique HPCP frame lengths found: {17000}
All files have the same HPCP frame length: 17000


3. Make mappings

In [3]:
import json
from src.utils.make_mappings import get_all_folders, create_song_pairs,remove_duplicate_pairs,check_pair_balance

data = get_all_folders("../data/17K_data")
song_pairs = create_song_pairs(data, 10000)
song_pairs_no_dup = remove_duplicate_pairs(song_pairs)
check_pair_balance(song_pairs_no_dup)
with open("song_pairs.json", "w") as f:
    json.dump(song_pairs, f, indent=4)


Similar Pairs (Label 0): 9425
Dissimilar Pairs (Label 1): 9994


4. Split data

In [4]:
from src.utils.split_data import process_data

process_data(
    mapping_json="song_pairs.json",
    base_path="../data/17K_data",
    output_dir="../data/data_model",
    batch_size=100,
    sample_ratio=0.01,
    test_size=0.1,
    val_size=0.1
)

Dataset split: Train=160, Val=20, Test=20
Processed data saved in '../data/data_model'


### Train

In [5]:
import torch
import yaml
from torch.utils.data import DataLoader
from src.model import SiameseNetworkWithBatchNorm
from src.dataset import HpcpDataset
from src.train import train_siamese
from src.evaluate import evaluate_metrics

1) Initialize the device

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2. Initialize the dataloaders

In [7]:
train_dataset = HpcpDataset("../data/data_model/train_features.h5","../data/data_model/train_labels.h5")
val_dataset = HpcpDataset("../data/data_model/val_features.h5", "../data/data_model/val_labels.h5")
test_dataset = HpcpDataset("../data/data_model/test_features.h5", "../data/data_model/test_labels.h5") 

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

3. Initialize model

In [8]:
model = SiameseNetworkWithBatchNorm().to(device)

4. Start Training

In [9]:
train_siamese(
    device=device,
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    epochs=3,
    lr=0.001,
    patience=5
)

Training on mps
Training Started
Epoch [1/3] | Train Loss: 9.7710 | Train Acc: 0.5125 | Val Loss: 0.2947 | Val Acc: 0.5000
Model improved. Saving checkpoint.
Epoch [2/3] | Train Loss: 6.4795 | Train Acc: 0.5125 | Val Loss: 0.9537 | Val Acc: 0.5000
Early stopping patience: 1/5
Epoch [3/3] | Train Loss: 1.6999 | Train Acc: 0.5125 | Val Loss: 1.0788 | Val Acc: 0.5000
Early stopping patience: 2/5
Training Complete.
Starting Evaluation on Test Set.
Metrics saved to log/2025-02-11/metrics.txt


Note:
Check in the notebooks directory:
   - 2 log directories
   - 1 models directory

5. Evaluate on test set

In [10]:
evaluate_metrics(device, model, test_loader)

Test Metrics - Accuracy: 0.5500, Precision: 0.4706, Recall: 1.0000, F1: 0.6400, AUC: 0.6250


The metrics of this run should not be taken in mind as they were created by demostrating the pipeline of the code