In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 데이터

In [None]:
!pip install torchlibrosa

Collecting torchlibrosa
  Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Installing collected packages: torchlibrosa
Successfully installed torchlibrosa-0.1.0


In [None]:
!wget https://os.unil.cloud.switch.ch/fma/fma_small.zip

--2023-12-10 04:26:00--  https://os.unil.cloud.switch.ch/fma/fma_small.zip
Resolving os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)... 86.119.28.16, 2001:620:5ca1:201::214
Connecting to os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)|86.119.28.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7679594875 (7.2G) [application/zip]
Saving to: ‘fma_small.zip’


2023-12-10 04:33:19 (16.8 MB/s) - ‘fma_small.zip’ saved [7679594875/7679594875]



In [None]:
!unzip -q fma_small.zip

In [None]:
!rm -rf fma_small.zip

In [None]:
!rm -rf fma_small/checksums

In [None]:
!rm -rf fma_small/README.txt

## Data Loader

In [None]:
import os
# Google Drive folder that tracks.csv is read from and the models/ checkpoints are written to.
drive_path = "/content/drive/MyDrive/Colab Notebook"

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ast

In [None]:
# Genre name -> integer class id. The id of a genre is its position in the tuple
# below, so the order of the entries must not change.
dict_label = {
    genre: class_id
    for class_id, genre in enumerate((
        'Hip-Hop',
        'Pop',
        'Rock',
        'Folk',
        'Jazz',
        'Electronic',
        'Experimental',
        'International',
        'Spoken',
        'Country',
        'Blues',
        'Old-Time / Historic',
        'Soul-RnB',
        'Classical',
        'Instrumental',
        'Easy Listening',
    ))
}

In [None]:
# Load the track metadata table from Drive. The CSV has a two-row header, so the
# resulting columns are addressed by (group, field) tuples such as ('track', 'tags').
tracks = pd.read_csv(os.path.join(drive_path, 'tracks.csv'), index_col=0, header=[0, 1])

# These columns are read as strings; ast.literal_eval turns each cell back into
# the Python literal it spells out.
COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
            ('track', 'genres'), ('track', 'genres_all')]
for column in COLUMNS:
    tracks[column] = tracks[column].map(ast.literal_eval)

# Parse the date-like columns into pandas datetimes.
COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
            ('album', 'date_created'), ('album', 'date_released'),
            ('artist', 'date_created'), ('artist', 'active_year_begin'),
            ('artist', 'active_year_end')]
for column in COLUMNS:
    tracks[column] = pd.to_datetime(tracks[column])

# Make the subset column an ordered categorical (small < medium < large).
# The first form is the legacy astype signature; when pandas rejects it the
# except branch builds the same dtype through pd.CategoricalDtype.
SUBSETS = ('small', 'medium', 'large')
try:
    tracks['set', 'subset'] = tracks['set', 'subset'].astype(
            'category', categories=SUBSETS, ordered=True)
except (ValueError, TypeError):
    # the categories and ordered arguments were removed in pandas 0.25
    tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                pd.CategoricalDtype(categories=SUBSETS, ordered=True))

# Remaining text columns are stored as (unordered) categoricals.
COLUMNS = [('track', 'genre_top'), ('track', 'license'),
            ('album', 'type'), ('album', 'information'),
            ('artist', 'bio')]
for column in COLUMNS:
    tracks[column] = tracks[column].astype('category')

In [None]:
import os
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import Resample, MelSpectrogram
from tqdm import tqdm
import torch
import torch.nn.functional as F

In [None]:
# STFT / mel-spectrogram settings.
n_fft = 1024
win_length = 1024
hop_length = 1024

n_mels = 128
n_mfcc = 128

# 22050 Hz is librosa's default loading rate; the previous value, 22040, was a
# typo for it (22040 is not a standard audio sample rate).
target_sample_rate = 22050
# Every spectrogram is padded/cropped to this many time frames by the dataset.
# NOTE(review): 1293 frames matches a 30 s clip at 22050 Hz with hop 512, not the
# hop of 1024 set above - confirm which hop length is intended.
max_len = 1293

In [None]:
from torchaudio.transforms import Resample, MelSpectrogram
import torchvision.transforms as transforms

In [None]:
import torch
import torchaudio
from torchvision import transforms
from torch.utils.data import Dataset
import torchaudio.transforms as T
import torch.nn.functional as F

def apply_codec(waveform, sample_rate, format, encoder=None):
    """Pass a waveform through a torchaudio ``AudioEffector`` for the given codec.

    Args:
        waveform: Audio data handed unchanged to ``AudioEffector.apply``.
        sample_rate: Sample rate forwarded together with ``waveform``.
        format: Value for the effector's ``format`` argument.
        encoder: Optional value for the effector's ``encoder`` argument.

    Returns:
        Whatever ``AudioEffector.apply`` returns for the input.
    """
    effector = torchaudio.io.AudioEffector(format=format, encoder=encoder)
    processed = effector.apply(waveform, sample_rate)
    return processed

class SpectrogramDataset(Dataset):
    """Dataset yielding ``(log-mel spectrogram, one-hot label)`` pairs for audio files.

    Each item is loaded with librosa, converted to a mel spectrogram in dB
    relative to the clip's own peak, cropped or right-padded to ``max_len``
    time frames and given a leading channel axis.

    Args:
        file_paths: Sequence of audio file paths.
        labels: Sequence of integer class ids, parallel to ``file_paths``.
        target_sample_rate: Rate the audio is resampled to on load
            (``None`` keeps the file's native rate).
        max_len: Number of time frames every spectrogram is fitted to.
        dict_label: Genre-name -> class-id mapping; its size determines the
            width of the one-hot label (16 is used if it is empty or ``None``).
        format: Stored on the instance; not used by this class.
        encoder: Stored on the instance; not used by this class.
        n_fft: FFT window size for the mel spectrogram.
        hop_length: Hop between successive frames.
        pad_value: Fill value for padded frames. ``None`` (default) pads with
            the clip's own minimum dB value; pass ``0.0`` to get the previous
            zero-padding behaviour.
    """

    def __init__(self, file_paths, labels, target_sample_rate, max_len, dict_label, format, encoder=None,
                 n_fft=1024, hop_length=1024, pad_value=None):
        self.file_paths = file_paths
        self.labels = labels
        self.target_sample_rate = target_sample_rate
        self.max_len = max_len
        self.dict_label = dict_label
        self.format = format
        self.encoder = encoder
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.pad_value = pad_value
        # Derive the label width from the mapping instead of hard-coding it.
        self.num_classes = len(dict_label) if dict_label else 16

    @staticmethod
    def _fit_length(spectrogram, max_len, pad_value):
        """Crop or right-pad the last (time) axis to exactly ``max_len`` frames."""
        n_frames = spectrogram.shape[-1]
        if n_frames >= max_len:
            return spectrogram[..., :max_len]
        return F.pad(spectrogram, (0, max_len - n_frames), value=pad_value)

    def __getitem__(self, index):
        audio_path = self.file_paths[index]
        # Honour the configured sample rate instead of librosa's implicit default.
        y, sr = librosa.load(audio_path, sr=self.target_sample_rate)

        # Build the mel spectrogram and convert power to dB relative to the clip's peak.
        spectrogram = librosa.feature.melspectrogram(
            y=y, sr=sr, n_fft=self.n_fft, hop_length=self.hop_length)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

        # Convert NumPy array to PyTorch tensor
        spectrogram = torch.tensor(spectrogram, dtype=torch.float32)

        # With ref=np.max, 0 dB is the loudest bin of the clip, so zero-padding
        # would append frames of maximum energy. Pad with the quietest value
        # present instead unless the caller asked for a specific fill value.
        if self.pad_value is not None:
            fill = float(self.pad_value)
        elif spectrogram.numel() > 0:
            fill = float(spectrogram.min())
        else:
            fill = 0.0

        # Ensure all spectrograms have the same length (max_len frames)
        spectrogram = self._fit_length(spectrogram, self.max_len, fill)

        # Add the channel axis the network's first convolution expects.
        spectrogram = spectrogram.unsqueeze(0)

        # One-hot encode labels
        label = F.one_hot(torch.tensor(self.labels[index]), num_classes=self.num_classes).float()

        return spectrogram, label

    def __len__(self):
        return len(self.file_paths)

In [None]:
# Collect audio file paths and integer genre labels for each official split.
root = '/content/fma_small'

train_data = []
train_label = []
val_data = []
val_label = []
test_data = []
test_label = []
# Files that are skipped when building the split lists.
err = ['001486.mp3','005574.mp3','065753.mp3','080391.mp3','098558.mp3','098559.mp3','098560.mp3','098565.mp3','098566.mp3','098567.mp3','098568.mp3','098569.mp3','098571.mp3','099134.mp3','105247.mp3','108924.mp3','108925.mp3','126981.mp3','127336.mp3','133297.mp3','143992.mp3']
skip_files = frozenset(err)  # O(1) membership tests inside the loop

# Split name in the metadata -> (paths list, labels list) it feeds.
split_buckets = {
    'training': (train_data, train_label),
    'validation': (val_data, val_label),
    'test': (test_data, test_label),
}

# Look the two metadata columns up once instead of once per file.
split_of = tracks['set', 'split']
genre_of = tracks['track', 'genre_top']

# sorted() makes the resulting lists independent of the file-system listing order.
for folder in tqdm(sorted(os.listdir(root))):
    path = os.path.join(root, folder)
    if not os.path.isdir(path):
        continue  # stray files next to the numbered folders
    for f in sorted(os.listdir(path)):
        if f in skip_files or not f.endswith('.mp3'):
            continue
        # The file stem is the track id used as the index of `tracks`.
        idx = int(f.split('.')[0])
        bucket = split_buckets.get(split_of[idx])
        if bucket is None:
            continue  # track not assigned to any of the three splits
        paths, targets = bucket
        paths.append(os.path.join(path, f))
        targets.append(dict_label[genre_of[idx]])

100%|██████████| 156/156 [00:00<00:00, 9718.27it/s]
100%|██████████| 156/156 [00:01<00:00, 103.32it/s]


In [None]:
# Create data loaders
batch_size = 4

# The dataset's `format` argument was previously filled by the bare name `format`,
# which is never assigned in this notebook and therefore resolved to Python's
# builtin function. The dataset only stores the value, so pass None explicitly.
train_dataset = SpectrogramDataset(train_data, train_label, target_sample_rate, max_len, dict_label, None, encoder=None)
val_dataset = SpectrogramDataset(val_data, val_label, target_sample_rate, max_len, dict_label, None, encoder=None)

# Shuffle only the training data; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Example: check the data and label shapes of the first training batch
for batch_data, batch_labels in train_loader:
    print("Batch 데이터 shape:", batch_data.shape)
    print("Batch 레이블 shape:", batch_labels.shape)
    break  # stop after the first batch; one batch is enough for the check

Batch 데이터 shape: torch.Size([4, 1, 128, 1293])
Batch 레이블 shape: torch.Size([4, 16])


## ResNet50

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet50

In [None]:
torch.cuda.empty_cache()

In [None]:
# NOTE(review): these rebind the STFT settings defined in the data-loading section
# to different values, and nothing in this cell reads them - confirm whether they
# are still needed before removing.
n_fft = 512
win_length = 320
hop_length = 320
n_mels = 128
# sample_rate = 192000

# Randomly initialised ResNet-50 with a 16-way classification head.
# `weights=None` replaces the deprecated `pretrained=False` spelling.
model = resnet50(weights=None, num_classes=16)
# Swap the stem convolution for a 1-input-channel one, since the spectrograms
# have a single channel.
model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)



In [None]:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss, optimizer and a scheduler that multiplies the learning rate by 0.4 once
# the monitored value (stepped with the validation loss) stops improving for 5 steps.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.4, patience=5, verbose=True)

# Epoch range for the training loop.
start_epoch, num_epochs = 0, 50

# Cadences: log every `print_iter` iterations, refresh the "latest" checkpoint
# every `save_iter` iterations, write a numbered checkpoint every `save_epoch`
# epochs and run validation every `test_epoch` epochs.
print_iter, save_iter = 100, 1000
save_epoch, test_epoch = 5, 1

In [None]:
# Run name; used as the prefix of the checkpoint file names.
name = 'FMA_ResNet50'

In [None]:
# Seed the "latest" checkpoint so the resume cell always has a file to load.
# It is written only when no checkpoint exists yet: the resume cell reads this
# same file, so overwriting it unconditionally would replace training progress
# with the untrained weights whenever the notebook is re-run from the top.
# Delete the file on Drive to start over from scratch.
checkpoint_path = os.path.join(drive_path, f"models/{name}_checkpoint_epoch_latest.pth")
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

if os.path.exists(checkpoint_path):
    print(f"Found existing checkpoint at {checkpoint_path}; leaving it untouched.")
else:
    torch.save({
        'epoch': start_epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler.state_dict': scheduler.state_dict(),
        'loss': 0,
    }, checkpoint_path)

## 모델 학습

In [None]:
# loading checkpoint
# map_location lets a checkpoint written on a GPU runtime be restored on whatever
# device this session uses; the file name is built from `name` so it always
# matches the path the saving cells write to.
checkpoint = torch.load(
    os.path.join(drive_path, f"models/{name}_checkpoint_epoch_latest.pth"),
    map_location=device,
)

# Checkpoints store the 1-based epoch that was in progress, so training resumes
# at that epoch.
start_epoch = checkpoint['epoch'] - 1

# strict=False tolerates key mismatches; report them instead of hiding them.
load_result = model.load_state_dict(checkpoint['model_state_dict'], strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Checkpoint key mismatch - missing: {load_result.missing_keys}, "
          f"unexpected: {load_result.unexpected_keys}")

optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler.state_dict'])

In [None]:
# Training loop
def _save_checkpoint(path, epoch, mean_loss):
    """Write model/optimizer/scheduler state plus the epoch number and a mean loss to `path`."""
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler.state_dict': scheduler.state_dict(),
        'loss': mean_loss,
    }, path)


for epoch in range(start_epoch, num_epochs):
    model.train()

    # Running sum over the whole epoch; used for the loss stored in checkpoints.
    epoch_loss = 0.0
    steps_done = 0
    # Running sums over the current logging window of `print_iter` iterations.
    verbose_loss = 0.0
    verbose_Acc = 0.0

    # `step` is 1-based; the name avoids shadowing the builtin `iter`.
    for step, (inputs, labels) in enumerate(train_loader, start=1):
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        verbose_loss += batch_loss
        steps_done = step

        # Labels are one-hot, so argmax recovers the class id; .item() keeps the
        # accumulator a plain Python float instead of a device tensor.
        verbose_Acc += (outputs.argmax(dim=1) == labels.argmax(dim=1)).float().mean().item()

        if step % print_iter == 0:
            print(f"Epoch {epoch + 1}/{num_epochs}, Iteration: {step}/{len(train_loader)} Training Loss: {verbose_loss / print_iter : .4f}, Training Accuracy: {verbose_Acc / print_iter : .4f}")
            # Start a fresh logging window.
            verbose_loss = 0.0
            verbose_Acc = 0.0

        if step % save_iter == 0:
            checkpoint_path = os.path.join(drive_path, f"models/{name}_checkpoint_epoch_latest.pth")
            # Store the mean loss over the iterations run so far in this epoch
            # (previously a 100-iteration window sum divided by the epoch length).
            _save_checkpoint(checkpoint_path, epoch + 1, epoch_loss / step)

    if (epoch + 1) % save_epoch == 0:
        checkpoint_path = os.path.join(drive_path, f"models/{name}_checkpoint_epoch_{epoch + 1}.pth")
        _save_checkpoint(checkpoint_path, epoch + 1, epoch_loss / max(steps_done, 1))

        print(f"Checkpoint saved at {checkpoint_path}")

    # Validation
    if (epoch + 1) % test_epoch == 0:
        model.eval()
        test_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass
                outputs = model(inputs)

                # Compute the loss for validation
                test_loss += criterion(outputs, labels).item()

                # Calculate accuracy
                predicted = outputs.argmax(dim=1)
                targets = labels.argmax(dim=1)

                total_samples += targets.size(0)
                correct_predictions += (predicted == targets).sum().item()

        # max(..., 1) avoids a ZeroDivisionError when the validation loader is empty.
        test_loss /= max(len(val_loader), 1)
        accuracy = correct_predictions / max(total_samples, 1)

        print(f"Validation Loss: {test_loss}, Accuracy: {accuracy}")
        # The plateau scheduler monitors the validation loss.
        scheduler.step(test_loss)


print("Training complete")

Epoch 29/50, Iteration: 100/1599 Training Loss:  0.0036, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 200/1599 Training Loss:  0.0022, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 300/1599 Training Loss:  0.0022, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 400/1599 Training Loss:  0.0018, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 500/1599 Training Loss:  0.0117, Training Accuracy:  0.9950
Epoch 29/50, Iteration: 600/1599 Training Loss:  0.0033, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 700/1599 Training Loss:  0.0092, Training Accuracy:  0.9975
Epoch 29/50, Iteration: 800/1599 Training Loss:  0.0031, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 900/1599 Training Loss:  0.0041, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 1000/1599 Training Loss:  0.0035, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 1100/1599 Training Loss:  0.0023, Training Accuracy:  1.0000
Epoch 29/50, Iteration: 1200/1599 Training Loss:  0.0039, Training Accurac