In [4]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import librosa
from collections import Counter

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

In [5]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cuda'

In [6]:
# Dataset location and split configuration.
DATA_ROOT = "data"
SEED = 42
TEST_SIZE = 0.2

# Every sub-directory of DATA_ROOT is treated as one genre; sorting keeps the
# genre order (and therefore the label indices derived from it) stable.
genres = sorted(d for d in os.listdir(DATA_ROOT)
                if os.path.isdir(os.path.join(DATA_ROOT, d)))

print("Genres found:", genres)

# Collect (filepath, genre) pairs. os.listdir returns entries in arbitrary,
# filesystem-dependent order, so the filenames are sorted: any seeded split of
# `samples` is only repeatable if `samples` itself is built in a fixed order.
samples = []
for genre in genres:
  genre_dir = os.path.join(DATA_ROOT, genre)
  for fname in sorted(os.listdir(genre_dir)):
    if fname.lower().endswith((".wav", ".mp3")):
      fpath = os.path.join(genre_dir, fname)
      samples.append((fpath, genre))

print("Total tracks:", len(samples))
# Guard the preview so an empty data folder reports zero tracks instead of
# raising IndexError.
if samples:
  print("Example sample:", samples[0])

Genres found: ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
Total tracks: 1000
Example sample: ('data\\blues\\blues.00000.wav', 'blues')


In [7]:
def is_readable_audio(path):
  """Return True when a non-empty snippet of audio can be decoded from ``path``.

  Only the first 0.5 s is requested (mono, ``sr=None``) to keep the check
  cheap. Any exception raised while loading is treated as "not readable" and
  yields False rather than propagating.
  """
  try:
    snippet, _ = librosa.load(path, duration=0.5, mono=True, sr=None)
    if snippet is None:
      return False
    return len(snippet) > 0
  except Exception:
    return False

# Keep only the tracks that pass the readability probe.
good_samples = []
for path, genre_name in samples:
  if is_readable_audio(path):
    good_samples.append((path, genre_name))
print("Before:", len(samples), "After:", len(good_samples))

# From here on, `samples` refers to the readable tracks only.
samples = good_samples

Before: 1000 After: 999


  y, sr = librosa.load(path, sr=None, mono=True, duration=0.5)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [8]:
# Separate the (filepath, genre) pairs into two parallel lists.
X = [filepath for filepath, _ in samples]  # file paths
y = [genre_name for _, genre_name in samples]  # genre labels

# Stratifying on the labels keeps the genre proportions the same in both subsets.
split = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
X_train, X_test, y_train, y_test = split

print(f"\nTrain tracks: {len(X_train)}")
print(f"Test tracks: {len(X_test)}")

print(f"\nTrain per genre: {Counter(y_train)}")
print(f"Test per genre: {Counter(y_test)}")


Train tracks: 799
Test tracks: 200

Train per genre: Counter({'country': 80, 'rock': 80, 'hiphop': 80, 'pop': 80, 'reggae': 80, 'metal': 80, 'blues': 80, 'classical': 80, 'disco': 80, 'jazz': 79})
Test per genre: Counter({'classical': 20, 'blues': 20, 'metal': 20, 'jazz': 20, 'country': 20, 'rock': 20, 'hiphop': 20, 'reggae': 20, 'disco': 20, 'pop': 20})


In [9]:
# Genre name -> class index, following the order of `genres`.
label_to_idx = dict(zip(genres, range(len(genres))))
# Class index -> genre name, used to decode model predictions.
idx_to_label = dict(zip(label_to_idx.values(), label_to_idx.keys()))

print(f"\nLabel mapping: {label_to_idx}")


Label mapping: {'blues': 0, 'classical': 1, 'country': 2, 'disco': 3, 'hiphop': 4, 'jazz': 5, 'metal': 6, 'pop': 7, 'reggae': 8, 'rock': 9}


In [10]:
#audio loading
# Every track is resampled to this rate when it is loaded.
SAMPLE_RATE = 22050

def load_audio(path):
  """Load ``path`` as mono audio at SAMPLE_RATE and peak-normalize it.

  Returns ``(waveform, sample_rate)``. The waveform is divided by its largest
  absolute value; an all-zero waveform is returned unscaled so that no
  division by zero occurs.
  """
  waveform, rate = librosa.load(path, sr=SAMPLE_RATE, mono=True)

  # Peak normalization: scale so the loudest sample has magnitude 1.
  loudest = np.abs(waveform).max()
  if loudest > 0:
    waveform = waveform / loudest

  return waveform, rate

In [11]:
#data augmentation: the model sees 10-second clips
# Clip length in seconds, and the same length expressed in samples at SAMPLE_RATE.
CLIP_SECONDS = 10
CLIP_SAMPLES = SAMPLE_RATE * CLIP_SECONDS

def random_crop(y, clip_samples):
  """Return a randomly positioned window of exactly ``clip_samples`` samples.

  Args:
    y: 1-D waveform.
    clip_samples: required output length in samples.

  Returns:
    A slice of ``y`` of length ``clip_samples`` whose start is drawn uniformly
    with ``random.randint``. When ``y`` is shorter than ``clip_samples`` it is
    zero-padded at the end instead, so every returned clip has the same length
    and clips of different tracks can be stacked into one batch.
  """
  missing = clip_samples - len(y)
  if missing > 0:
    # Too short to crop: right-pad with silence up to the requested length.
    return np.pad(y, (0, missing))

  max_start = len(y) - clip_samples
  start = random.randint(0, max_start)  # randint is inclusive on both ends
  return y[start:start + clip_samples]

#for testing / inference: deterministic crop
def fixed_crop(y, clip_samples):
  """Return the first ``clip_samples`` samples of ``y``.

  When ``y`` is shorter than ``clip_samples`` the result is zero-padded at the
  end, so the returned clip always has exactly ``clip_samples`` samples (the
  same length guarantee as ``random_crop``) and can be batched with others.
  """
  head = y[:clip_samples]
  missing = clip_samples - len(head)
  if missing > 0:
    # Right-pad short recordings with silence up to the requested length.
    head = np.pad(head, (0, missing))
  return head

In [12]:
#mel spectrogram

N_FFT = 2048      # FFT window size, in samples
HOP_LENGTH = 512  # hop between consecutive FFT windows, in samples
N_MELS = 128      # number of mel frequency bands

def waveform_to_logmel(y, sr):
  """Convert waveform ``y`` (sample rate ``sr``) to a log-mel spectrogram.

  A power (``power=2.0``) mel spectrogram is computed with N_FFT, HOP_LENGTH
  and N_MELS, then converted to decibels relative to its own maximum
  (``ref=np.max``).
  """
  power_spec = librosa.feature.melspectrogram(
    y=y,
    sr=sr,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
    power=2.0,
  )
  return librosa.power_to_db(power_spec, ref=np.max)

In [13]:
class GTZANDataset(Dataset):
  """Serves ``(log-mel tensor, label index)`` pairs for a list of audio files.

  Args:
    filepaths: audio file paths, one per item.
    labels: genre names, parallel to ``filepaths``.
    label_to_idx: mapping from genre name to integer class index.
    train: when True each access takes a random crop of the track; when False
      the crop is always taken from the start of the track.
  """

  def __init__(self, filepaths, labels, label_to_idx, train=True):
    self.filepaths = filepaths
    self.labels = labels
    self.label_to_idx = label_to_idx
    self.train = train

  def __len__(self):
    """Number of tracks in the dataset."""
    return len(self.filepaths)

  def __getitem__(self, idx):
    """Load track ``idx`` and return ``(x, class_index)``.

    ``x`` is a float32 tensor holding the standardized log-mel spectrogram
    with a leading channel dimension added.
    """
    path, label = self.filepaths[idx], self.labels[idx]

    waveform, rate = load_audio(path)

    # Random window for training, fixed leading window otherwise.
    crop = random_crop if self.train else fixed_crop
    clip = crop(waveform, CLIP_SAMPLES)

    spec = waveform_to_logmel(clip, rate)
    # Per-clip standardization; the epsilon avoids dividing by zero.
    centered = spec - spec.mean()
    spec = centered / (spec.std() + 1e-9)

    # Add the channel dimension expected by Conv2d.
    x = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)

    return x, self.label_to_idx[label]

In [14]:
BATCH_SIZE = 16

# The training set uses random crops; the test set uses the fixed crop.
train_dataset = GTZANDataset(filepaths=X_train, labels=y_train, label_to_idx=label_to_idx, train=True)
test_dataset = GTZANDataset(filepaths=X_test, labels=y_test, label_to_idx=label_to_idx, train=False)

# Only the training loader reshuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [15]:
#shape check: pull one training batch and show the input/label tensor shapes
x_batch, y_batch = next(iter(train_loader))
tuple(t.shape for t in (x_batch, y_batch))

(torch.Size([16, 1, 128, 431]), torch.Size([16]))

In [16]:
class GenreClassifier(nn.Module):
  """Three-stage CNN mapping a 1-channel spectrogram to genre logits.

  Each stage is two (3x3 conv -> BatchNorm -> ReLU) layers followed by 2x2
  max-pooling, with 32, 64 and 128 channels respectively. The head applies
  global average pooling, dropout (p=0.4) and a linear layer, so the input
  height/width are not fixed by the architecture.

  Args:
    num_classes: number of output logits. Defaults to 10, which keeps
      ``GenreClassifier()`` and previously saved state dicts compatible.
  """

  def __init__(self, num_classes=10):
    super().__init__()
    # Layer order and attribute names are kept stable so that state_dict keys
    # (e.g. "block_1.0.weight") match checkpoints saved by earlier runs.
    self.block_1 = self._conv_block(1, 32)
    self.block_2 = self._conv_block(32, 64)
    self.block_3 = self._conv_block(64, 128)
    self.classifier = nn.Sequential(
      nn.AdaptiveAvgPool2d((1, 1)),
      nn.Flatten(),
      nn.Dropout(p=0.4),
      nn.Linear(in_features=128, out_features=num_classes)
    )

  @staticmethod
  def _conv_block(in_channels, out_channels):
    """Build one stage: two conv/BN/ReLU layers, then 2x2 max-pooling."""
    return nn.Sequential(
      nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1),
      nn.BatchNorm2d(num_features=out_channels),
      nn.ReLU(),
      nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1),
      nn.BatchNorm2d(num_features=out_channels),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2)
    )

  def forward(self, x):
    """Return raw (unnormalized) class logits for a batch ``x``."""
    x = self.block_1(x)
    x = self.block_2(x)
    x = self.block_3(x)
    x = self.classifier(x)
    return x

# Seed torch from the notebook-wide SEED (instead of repeating the literal)
# so the initial weights are repeatable, then build the model on `device`.
torch.manual_seed(SEED)
model_0 = GenreClassifier().to(device)
model_0

GenreClassifier(
  (block_1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block_2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block_3): Sequential(
    (0): Conv

In [17]:
# Cross-entropy on the raw logits, optimized with Adam over model_0's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_0.parameters(), lr=1e-3)

In [16]:
# Seed every RNG this loop draws from. torch.manual_seed does not touch
# Python's `random` module, which random_crop uses to pick crop positions, so
# it is seeded separately to make the training crops repeatable.
torch.manual_seed(SEED)
random.seed(SEED)

epochs = 50
log_every = 1  # print metrics every `log_every` epochs
best_test_acc = 0.0

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first save.
BEST_MODEL_PATH = "models/best_model.pth"
os.makedirs(os.path.dirname(BEST_MODEL_PATH), exist_ok=True)

for epoch in range(epochs):
  # ---- training pass ----
  model_0.train()
  epoch_loss = 0.0
  epoch_correct = 0
  epoch_total = 0

  for X_batch, y_batch in train_loader:
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)

    y_logits = model_0(X_batch)
    loss = loss_fn(y_logits, y_batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # `loss` is a batch mean; weight it by the batch size so the epoch average
    # stays correct when the last batch is smaller.
    epoch_loss += loss.item() * y_batch.size(0)
    preds = torch.argmax(y_logits, dim=1)
    epoch_correct += (preds == y_batch).sum().item()
    epoch_total += y_batch.size(0)

  train_loss = epoch_loss / epoch_total
  train_acc = 100 * epoch_correct / epoch_total

  # ---- evaluation pass ----
  model_0.eval()
  test_loss = 0.0
  test_correct = 0
  test_total = 0

  with torch.no_grad():
    for X_batch, y_batch in test_loader:
      X_batch = X_batch.to(device)
      y_batch = y_batch.to(device)

      y_logits = model_0(X_batch)
      loss = loss_fn(y_logits, y_batch)

      test_loss += loss.item() * y_batch.size(0)
      preds = torch.argmax(y_logits, dim=1)
      test_correct += (preds == y_batch).sum().item()
      test_total += y_batch.size(0)

  test_loss = test_loss / test_total
  test_acc = 100 * test_correct / test_total

  # NOTE(review): the checkpoint is selected on the test split, so
  # `best_test_acc` is an optimistic estimate; an unbiased number would need a
  # separate validation split.
  if test_acc > best_test_acc:
    best_test_acc = test_acc
    torch.save(model_0.state_dict(), BEST_MODEL_PATH)

  if epoch % log_every == 0:
    print(
      f"Epoch {epoch} | "
      f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
      f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.2f}%"
    )

Epoch 0 | Train Loss: 1.8946 | Train Acc: 31.91% | Test Loss: 1.7610 | Test Acc: 34.50%
Epoch 1 | Train Loss: 1.5580 | Train Acc: 44.93% | Test Loss: 1.4694 | Test Acc: 42.50%
Epoch 2 | Train Loss: 1.4941 | Train Acc: 45.31% | Test Loss: 1.3715 | Test Acc: 47.50%
Epoch 3 | Train Loss: 1.3706 | Train Acc: 51.19% | Test Loss: 1.7456 | Test Acc: 37.50%
Epoch 4 | Train Loss: 1.3192 | Train Acc: 53.69% | Test Loss: 1.6654 | Test Acc: 40.50%
Epoch 5 | Train Loss: 1.2537 | Train Acc: 58.82% | Test Loss: 1.1176 | Test Acc: 63.00%
Epoch 6 | Train Loss: 1.2359 | Train Acc: 57.45% | Test Loss: 1.0086 | Test Acc: 71.50%
Epoch 7 | Train Loss: 1.1076 | Train Acc: 60.70% | Test Loss: 1.1335 | Test Acc: 62.50%
Epoch 8 | Train Loss: 1.1728 | Train Acc: 60.20% | Test Loss: 1.4169 | Test Acc: 45.00%
Epoch 9 | Train Loss: 1.1150 | Train Acc: 59.32% | Test Loss: 1.3009 | Test Acc: 54.50%
Epoch 10 | Train Loss: 1.0578 | Train Acc: 63.70% | Test Loss: 1.5904 | Test Acc: 45.50%
Epoch 11 | Train Loss: 1.0923 |

In [17]:
#torch.save(model_0.state_dict(), "models/firstrun.pth")

In [18]:
#test inference

# Rebuild the architecture and restore the best checkpoint. map_location
# remaps the stored tensors onto the current `device`, so a checkpoint written
# on a GPU machine can still be loaded when only a CPU is available.
model_1 = GenreClassifier()
model_1.load_state_dict(torch.load("models/best_model.pth", map_location=device))
model_1.to(device)

GenreClassifier(
  (block_1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block_2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block_3): Sequential(
    (0): Conv

In [19]:
model_1.eval()

# Score the first test batch and turn the logits into class probabilities.
with torch.inference_mode():
  X_batch, y_batch = next(iter(test_loader))
  X_batch = X_batch.to(device)

  probs = torch.softmax(model_1(X_batch), dim=1)
  preds = torch.argmax(probs, dim=1)

# Iterate over the actual batch length rather than a hard-coded 16, so the
# cell still works when the batch size changes or the batch is smaller.
for i in range(len(y_batch)):
  true_i = y_batch[i].item()
  pred_i = preds[i].item()
  p_pred = probs[i, pred_i].item() * 100
  p_true = probs[i, true_i].item() * 100

  print(
    f"{i} | true={idx_to_label[true_i]:10s} "
    f"pred={idx_to_label[pred_i]:10s} "
    f"p(pred)={p_pred:6.2f}% p(true)={p_true:6.2f}%"
  )

#p(pred) is the probability the model assigned to its predicted class;
#p(true) is the probability it assigned to the actual (ground-truth) class

0 | true=classical  pred=classical  p(pred)= 99.67% p(true)= 99.67%
1 | true=blues      pred=country    p(pred)= 28.56% p(true)= 20.60%
2 | true=metal      pred=metal      p(pred)= 98.39% p(true)= 98.39%
3 | true=jazz       pred=jazz       p(pred)= 95.29% p(true)= 95.29%
4 | true=blues      pred=blues      p(pred)= 35.76% p(true)= 35.76%
5 | true=country    pred=classical  p(pred)= 57.43% p(true)= 27.49%
6 | true=rock       pred=rock       p(pred)= 53.92% p(true)= 53.92%
7 | true=country    pred=country    p(pred)= 73.33% p(true)= 73.33%
8 | true=hiphop     pred=disco      p(pred)= 77.38% p(true)=  0.41%
9 | true=jazz       pred=jazz       p(pred)= 97.42% p(true)= 97.42%
10 | true=blues      pred=blues      p(pred)= 72.26% p(true)= 72.26%
11 | true=blues      pred=blues      p(pred)= 67.75% p(true)= 67.75%
12 | true=reggae     pred=reggae     p(pred)= 86.80% p(true)= 86.80%
13 | true=blues      pred=blues      p(pred)= 67.90% p(true)= 67.90%
14 | true=rock       pred=rock       p(pred)

In [20]:
#Now training with the full dataset

# Train and test tracks are merged, so no held-out data remains for a model
# trained on `full_loader`.
X_full = X_train + X_test
y_full = y_train + y_test

full_dataset = GTZANDataset(X_full, y_full, label_to_idx, train=True)
# Reuse BATCH_SIZE so this loader stays in sync with the other loaders.
full_loader = DataLoader(full_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [21]:
# Seed before constructing the model (as done for model_0) so its initial
# weights do not depend on how much of the RNG stream earlier cells consumed.
torch.manual_seed(SEED)
model_2 = GenreClassifier().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model_2.parameters(), lr = 0.001)

In [22]:
# Seed torch and Python's `random` module: random_crop draws its crop
# positions from `random`, which torch.manual_seed does not seed.
torch.manual_seed(SEED)
random.seed(SEED)

epochs = 50
log_every = 1  # print metrics every `log_every` epochs

for epoch in range(epochs):
  model_2.train()
  epoch_loss = 0.0
  epoch_correct = 0
  epoch_total = 0

  for X_batch, y_batch in full_loader:
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)

    y_logits = model_2(X_batch)
    loss = loss_fn(y_logits, y_batch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # `loss` is a batch mean; weight it by the batch size so the epoch average
    # stays correct when the last batch is smaller.
    epoch_loss += loss.item() * y_batch.size(0)
    preds = torch.argmax(y_logits, dim=1)
    epoch_correct += (preds == y_batch).sum().item()
    epoch_total += y_batch.size(0)

  train_loss = epoch_loss / epoch_total
  train_acc = 100 * epoch_correct / epoch_total

  if epoch % log_every == 0:
    print(
      f"Epoch {epoch} | "
      f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%"
    )

Epoch 0 | Train Loss: 1.8060 | Train Acc: 37.14%
Epoch 1 | Train Loss: 1.5214 | Train Acc: 47.65%
Epoch 2 | Train Loss: 1.3880 | Train Acc: 51.85%
Epoch 3 | Train Loss: 1.2779 | Train Acc: 55.06%
Epoch 4 | Train Loss: 1.2113 | Train Acc: 60.56%
Epoch 5 | Train Loss: 1.1526 | Train Acc: 61.36%
Epoch 6 | Train Loss: 1.0858 | Train Acc: 62.56%
Epoch 7 | Train Loss: 1.0402 | Train Acc: 64.16%
Epoch 8 | Train Loss: 1.0294 | Train Acc: 64.36%
Epoch 9 | Train Loss: 0.9762 | Train Acc: 68.37%
Epoch 10 | Train Loss: 0.9666 | Train Acc: 67.37%
Epoch 11 | Train Loss: 1.0014 | Train Acc: 65.47%
Epoch 12 | Train Loss: 0.8719 | Train Acc: 72.97%
Epoch 13 | Train Loss: 0.8605 | Train Acc: 71.37%
Epoch 14 | Train Loss: 0.8526 | Train Acc: 72.07%
Epoch 15 | Train Loss: 0.8880 | Train Acc: 70.27%
Epoch 16 | Train Loss: 0.8521 | Train Acc: 72.17%
Epoch 17 | Train Loss: 0.8145 | Train Acc: 72.57%
Epoch 18 | Train Loss: 0.7816 | Train Acc: 73.27%
Epoch 19 | Train Loss: 0.7807 | Train Acc: 75.08%
Epoch 20 |

In [None]:
# Persist the full-data model: the following cell reloads it from this path,
# so the save has to run for the notebook to work from a fresh kernel.
os.makedirs("models", exist_ok=True)
torch.save(model_2.state_dict(), "models/final_model.pth")

In [30]:
# Restore the full-data model. map_location remaps the stored tensors onto the
# current `device`, so a GPU-trained checkpoint also loads on a CPU-only machine.
model_final = GenreClassifier()
model_final.load_state_dict(torch.load("models/final_model.pth", map_location=device))
model_final.to(device)

GenreClassifier(
  (block_1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block_2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block_3): Sequential(
    (0): Conv

In [31]:
USER_DIR = "user_songs"

def predict_folder(model, folder):
  """Print the predicted genre and class probabilities for each audio file in ``folder``.

  Args:
    model: classifier returning one row of logits per input spectrogram; it is
      switched to eval mode.
    folder: directory scanned (non-recursively) for ``.wav`` / ``.mp3`` files.

  Returns:
    None. Results are printed. Files are processed in sorted order so the
    output is stable across runs and platforms, and a file that cannot be
    loaded is reported and skipped instead of aborting the whole run.
  """
  model.eval()

  # os.listdir order is arbitrary; sort for a deterministic report.
  files = sorted(
    os.path.join(folder, fname)
    for fname in os.listdir(folder)
    if fname.lower().endswith((".wav", ".mp3"))
  )

  if len(files) == 0:
    print("No .wav or .mp3 files found in:", folder)
    return

  # Send inputs to wherever the model's weights live; fall back to the
  # notebook-level `device` for a model without parameters.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else device

  with torch.inference_mode():
    for path in files:
      try:
        y, sr = load_audio(path)
      except Exception as exc:
        print("\nFile:", os.path.basename(path))
        print("Skipped (could not read audio):", exc)
        continue

      # Same preprocessing as the evaluation dataset: fixed crop, log-mel,
      # per-clip standardization.
      y = fixed_crop(y, CLIP_SAMPLES)

      log_mel = waveform_to_logmel(y, sr)
      log_mel = (log_mel - log_mel.mean()) / (log_mel.std() + 1e-9)

      # Add batch and channel dimensions in front of the 2-D spectrogram.
      x = torch.tensor(log_mel, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
      x = x.to(target_device)

      logits = model(x)
      probs = torch.softmax(logits, dim=1).squeeze(0)

      pred_i = torch.argmax(probs).item()

      print("\nFile:", os.path.basename(path))
      print("Predicted:", idx_to_label[pred_i])
      for j, p in enumerate(probs):
        print(f"  {idx_to_label[j]}: {p.item() * 100:.2f}%")

In [39]:
# Classify every .wav/.mp3 file in USER_DIR with the model restored from
# models/final_model.pth and print the per-genre probabilities.
predict_folder(model_final, USER_DIR)


File: Bleed- Meshuggah (Full Version HD).mp3
Predicted: metal
  blues: 0.00%
  classical: 0.00%
  country: 0.00%
  disco: 0.00%
  hiphop: 0.34%
  jazz: 0.00%
  metal: 99.01%
  pop: 0.23%
  reggae: 0.00%
  rock: 0.41%

File: Holy Wars... The Punishment Due (2004 Remix).mp3
Predicted: metal
  blues: 0.00%
  classical: 0.13%
  country: 0.03%
  disco: 0.09%
  hiphop: 7.26%
  jazz: 0.11%
  metal: 69.57%
  pop: 10.20%
  reggae: 0.42%
  rock: 12.19%

File: Master of Puppets (Remastered).mp3
Predicted: metal
  blues: 0.05%
  classical: 0.00%
  country: 0.06%
  disco: 1.65%
  hiphop: 2.68%
  jazz: 0.00%
  metal: 78.12%
  pop: 0.52%
  reggae: 0.09%
  rock: 16.83%
