# Music Genre Classification with a PyTorch CNN
This notebook walks through the process of building a music genre classifier using Mel-spectrograms and a Convolutional Neural Network (CNN).
### Steps:
1.  **Data Preparation**: Load audio files and transform them into Mel-spectrograms.
2.  **Dataset & DataLoader**: Create a custom PyTorch dataset to handle the data.
3.  **Model Building**: Define and instantiate the CNN model.
4.  **Training**: Train the model on the spectrogram data.
5.  **Evaluation**: Evaluate the model's performance on a validation set.

In [None]:
import os
import json
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Import our custom CNN model
from model.cnn_model import GenreCNN

## 1. Data Preprocessing: From Audio to Spectrogram

We will process the raw `.wav` files from the GTZAN dataset. Each 30-second audio clip is split into 10 equal-length segments, and for every segment we generate a log-scaled (dB) Mel-spectrogram: a 2-D time–frequency representation that is suitable as input to a CNN.

**Important:** Update the `DATASET_PATH` to the location of your GTZAN dataset's `genres_original` folder.

In [None]:
# Folder that contains one sub-folder of audio files per genre.
DATASET_PATH = "path/to/your/genres_original"  # <--- IMPORTANT: CHANGE THIS

# File the extracted spectrograms and labels are written to / read from.
JSON_PATH = "data.json"

# Sampling rate (Hz) every audio file is loaded at.
SAMPLE_RATE = 22050

# Nominal length of one track, in seconds.
DURATION = 30

# Number of audio samples in one full-length track.
SAMPLES_PER_TRACK = DURATION * SAMPLE_RATE

def save_mel_spectrograms(dataset_path, json_path, n_mels=128, n_fft=2048, hop_length=512):
    """Extract log-Mel-spectrogram segments from a folder-per-genre dataset and save them as JSON.

    Every directory below ``dataset_path`` is treated as one genre. Each audio
    file in it is loaded at ``SAMPLE_RATE``, cut into 10 equal-length segments
    (sized from ``SAMPLES_PER_TRACK``), and every segment is converted to a
    dB-scaled Mel-spectrogram. Segments whose spectrogram does not have the
    expected number of frames (e.g. the tail of a track that is shorter than
    ``DURATION``) are skipped so that all stored spectrograms share one shape.

    The JSON document written to ``json_path`` has three keys:
    ``"mappings"`` (genre names; a label is an index into this list),
    ``"labels"`` (one integer label per stored segment) and
    ``"mel_spectrograms"`` (one nested list of shape ``(n_mels, frames)`` per
    stored segment).

    Args:
        dataset_path: Root folder containing one sub-folder per genre.
        json_path: Destination path of the JSON file.
        n_mels: Number of Mel bands per spectrogram.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop size (in samples) between successive frames.

    Files that cannot be loaded or processed are reported on stdout and
    skipped; they do not abort the run.
    """
    data = {
        "mappings": [],
        "labels": [],
        "mel_spectrograms": []
    }

    num_segments = 10  # segments per track
    num_samples_per_segment = SAMPLES_PER_TRACK // num_segments

    # librosa pads the signal so frames are centred by default, which yields
    # 1 + n_samples // hop_length frames for a full segment (130 with the
    # default settings). Deriving the value keeps the shape filter correct
    # when a non-default hop_length is passed.
    expected_shape = (n_mels, 1 + num_samples_per_segment // hop_length)

    # Normalise to str so the root comparison below is a value comparison that
    # also works for path-like arguments.
    root = os.fspath(dataset_path)

    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk visit genres in a reproducible order,
        # so the label assigned to each genre does not depend on the filesystem.
        dirnames.sort()

        if dirpath == root:
            continue  # the root itself is not a genre folder

        # Save the genre label; its position in "mappings" is the numeric label
        genre_label = os.path.basename(dirpath)
        data["mappings"].append(genre_label)
        label_index = len(data["mappings"]) - 1
        print(f"\nProcessing: {genre_label}")

        # Process all audio files in the genre sub-folder
        for f in tqdm(sorted(filenames)):
            file_path = os.path.join(dirpath, f)
            try:
                signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

                # Process segments of the audio file
                for s in range(num_segments):
                    start_sample = num_samples_per_segment * s
                    finish_sample = start_sample + num_samples_per_segment

                    # Generate Mel-spectrogram
                    mel_spec = librosa.feature.melspectrogram(
                        y=signal[start_sample:finish_sample],
                        sr=sr,
                        n_fft=n_fft,
                        n_mels=n_mels,
                        hop_length=hop_length
                    )
                    log_mel_spec = librosa.power_to_db(mel_spec)

                    if log_mel_spec.shape == expected_shape:  # Ensure consistent shape
                        data["mel_spectrograms"].append(log_mel_spec.tolist())
                        data["labels"].append(label_index)
            except Exception as e:
                # Deliberate best-effort: one unreadable or corrupt file must
                # not abort the whole preprocessing run.
                print(f"Could not process {file_path}: {e}")

    # Save to JSON file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
    print(f"\nData successfully saved to {json_path}")

# Reuse the JSON file when it is already on disk; otherwise build it now
if os.path.exists(JSON_PATH):
    print("data.json already exists. Skipping preprocessing.")
else:
    save_mel_spectrograms(DATASET_PATH, JSON_PATH)

In [None]:
# Load the stored data and display one sample spectrogram
with open(JSON_PATH, "r") as fp:
    sample_data = json.load(fp)

# Genre names; a label is an index into this list. `genre_map` is also used
# further down to size the model's output layer.
genre_map = sample_data["mappings"]

sample_index = 0
sample_spectrogram = np.array(sample_data["mel_spectrograms"][sample_index])
# Resolve the genre through the sample's own label instead of assuming that
# the first stored spectrogram belongs to the first genre.
sample_genre = genre_map[sample_data["labels"][sample_index]]

plt.figure(figsize=(10, 4))
librosa.display.specshow(sample_spectrogram, sr=SAMPLE_RATE, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title(f'Mel-spectrogram for a "{sample_genre}" track')
plt.tight_layout()
plt.show()

## 2. Creating a Custom PyTorch Dataset

We'll create a custom `Dataset` class to load our data and prepare it for the `DataLoader`.

In [None]:
class GenreDataset(Dataset):
    """In-memory dataset of spectrograms and their integer labels.

    Both arrays are converted to tensors and moved to ``device`` once, at
    construction time, so indexing returns tensors that already live there.

    Args:
        X: Array-like of spectrograms; the first axis indexes samples.
        y: Array-like of integer labels, one per sample.
        device: Device the tensors are moved to.
    """

    def __init__(self, X, y, device):
        features = torch.tensor(X, dtype=torch.float32)
        # Insert a singleton axis at position 1 (the channel dimension)
        self.X = features.unsqueeze(1).to(device)

        targets = torch.tensor(y, dtype=torch.long)
        self.y = targets.to(device)

    def __len__(self):
        # Number of samples is the size of the leading axis
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

def load_data(json_path):
    """Read spectrograms and labels from a JSON file.

    Args:
        json_path: Path of a JSON file with the keys ``"mel_spectrograms"``
            and ``"labels"``.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from those two keys.
    """
    with open(json_path, "r") as handle:
        payload = json.load(handle)

    return np.array(payload["mel_spectrograms"]), np.array(payload["labels"])

# Read the stored spectrograms/labels and hold out 20% of them for validation;
# stratifying on y keeps the label proportions the same in both splits
X, y = load_data(JSON_PATH)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Wrap each split in a dataset, then in a loader that yields batches of 32;
# only the training loader shuffles its samples
train_dataset = GenreDataset(X_train, y_train, device)
val_dataset = GenreDataset(X_val, y_val, device)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

## 3. Training the CNN Model

Now we define the training loop, loss function, and optimizer, and then train our model.

In [None]:
# Build the network, sized by the number of genre names, and move it to the device
model = GenreCNN(num_genres=len(genre_map))
model = model.to(device)
print(model)

# Cross-entropy loss, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Per-epoch accuracy history (in percent), appended to once per epoch
num_epochs = 50
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    # ---- Training pass: update the weights on every batch ----
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for inputs, labels in progress:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest output along dim 1
        predicted = torch.max(outputs.data, dim=1).indices
        total_train += labels.shape[0]
        correct_train += predicted.eq(labels).sum().item()

    train_accuracy = 100 * correct_train / total_train
    train_accuracies.append(train_accuracy)

    # ---- Validation pass: no gradient tracking, no weight updates ----
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            predicted = torch.max(outputs.data, dim=1).indices
            total_val += labels.shape[0]
            correct_val += predicted.eq(labels).sum().item()

    val_accuracy = 100 * correct_val / total_val
    val_accuracies.append(val_accuracy)

    # The reported loss is the mean of the per-batch training losses
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, "
          f"Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%")

print("Finished Training")
print(f"Final Validation Accuracy: {val_accuracies[-1]:.2f}%")

## 4. Visualizing Performance

In [None]:
# Plot both accuracy histories against the 1-based epoch number
epochs = range(1, num_epochs + 1)

plt.figure(figsize=(12, 5))
for series, legend_label in (
    (train_accuracies, 'Training Accuracy'),
    (val_accuracies, 'Validation Accuracy'),
):
    plt.plot(epochs, series, label=legend_label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.grid(True)
plt.show()