<a href="https://colab.research.google.com/github/drew0523/MIR-Music_Information_Retrieval/blob/main/notebooks/5th_week_autotagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Music Auto-tagging

In [1]:

import torch
import torchaudio
import IPython.display as ipd
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
!pip install --upgrade gdown
!gdown 15e9E3oZdudErkPKwb0rCAiZXkPxdZkV6
# !wget https://sogang365-my.sharepoint.com/:u:/g/personal/dasaem_jeong_o365_sogang_ac_kr/EdkHWV-qvxBEi-d0Ua73VG4BEp7EZO7HMvrXsWqeJvMJzg?e=GbYylV&download=1

!unzip -q mtat_8000.zip

Downloading...
From (original): https://drive.google.com/uc?id=15e9E3oZdudErkPKwb0rCAiZXkPxdZkV6
From (redirected): https://drive.google.com/uc?id=15e9E3oZdudErkPKwb0rCAiZXkPxdZkV6&confirm=t&uuid=596f8543-db06-4fa8-ba5d-3fd78c3467be
To: /content/mtat_8000.zip
100% 921M/921M [00:11<00:00, 78.7MB/s]


In [4]:
# check dataset
data_dir = Path('MTAT_SMALL')
assert data_dir.exists(), f'{data_dir} not found - run the download/unzip cell first'

# Collect every mp3 under the dataset directory, searching sub-directories recursively.
mp3_fns = list(data_dir.rglob('*.mp3'))
assert mp3_fns, f'no mp3 files found under {data_dir}'
# A bare expression in the middle of a cell is never displayed, so print the count explicitly.
print(len(mp3_fns), 'mp3 files found')

# Load one clip as a sanity check. The glob result is not sorted, so which file
# ends up first may differ between machines.
mp3_fn = mp3_fns[0]
y, sr = torchaudio.load(mp3_fn)
print(mp3_fn, sr)
ipd.Audio(y, rate=sr)

16000


In [25]:
# Load the metadata table; the first CSV column is used as the row index.
df = pd.read_csv('MTAT_SMALL/meta.csv', index_col=0)
# Peek at the raw values with the first and the last column sliced off.
df.values[:,1:-1]

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [7]:
# Path of the sample clip relative to the dataset root, as a plain string.
str(mp3_fn.relative_to('MTAT_SMALL/'))

'8/justin_bianco-phoenix-12-framework-0-29.mp3'

In [10]:
# Select the metadata row(s) whose `mp3_path` equals the sample clip's path relative to the dataset root.
df[df['mp3_path']==str(mp3_fn.relative_to('MTAT_SMALL/'))]

Unnamed: 0.1,Unnamed: 0,clip_id,singer,harpsichord,sitar,heavy,foreign,no piano,classical,female,...,rock,dance,cello,techno,flute,beat,soft,choir,baroque,mp3_path
4614,21858,48021,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,8/justin_bianco-phoenix-12-framework-0-29.mp3


In [24]:
class MTATDataset:
  """Audio clips and multi-hot tag labels described by ``<dir_path>/meta.csv``.

  Each row of ``meta.csv`` is assigned to a split by the first character of its
  ``mp3_path`` value. Indexing the dataset returns ``(audio, label)``: the clip
  loaded with torchaudio and averaged over its first dimension, plus the float
  label row of that clip.
  """

  def __init__(self, dir_path, split='train', num_max_data=6000, sr=16000):
    """Read the metadata, keep the rows of ``split`` and build the label tensor.

    Args:
      dir_path: Directory that contains ``meta.csv`` and the mp3 files.
      split: One of ``'train'``, ``'valid'`` or ``'test'``.
      num_max_data: Upper bound on the number of rows kept for the split.
      sr: Sample rate every loaded clip is expected to have.

    Raises:
      NotImplementedError: If ``split`` is not one of the three known names.
    """
    self.dir = Path(dir_path)
    self.labels = pd.read_csv(self.dir / "meta.csv", index_col=[0])
    self.sr = sr

    if split not in ('train', 'valid', 'test'):
      raise NotImplementedError
    # Leading character of mp3_path (the sub-directory id) -> split it belongs to.
    sub_dir_ids = {
      'train': list('0123456789abc'),
      'valid': ['d'],
      'test': ['e', 'f', 'g'],
    }[split]

    paths = self.labels['mp3_path'].values.astype('str')
    keep_row = [path[0] in sub_dir_ids for path in paths]
    # Keep only the rows of this split, then cap the row count so that the
    # dataset does not grow too large to handle.
    self.labels = self.labels.iloc[keep_row][:num_max_data]
    # Tag names: every column except the first and the last one.
    self.vocab = self.labels.columns.values[1:-1]
    self.label_tensor = self.convert_label_to_tensor()

  def convert_label_to_tensor(self):
    """Return the tag columns (all but the first and last) as a float tensor of 0s and 1s."""
    multi_hot = self.labels.values[:, 1:-1].astype('bool')
    return torch.tensor(multi_hot, dtype=torch.float)

  def __len__(self):
    """Number of rows kept for this split."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Load clip ``idx`` and return ``(audio averaged over dim 0, label)``."""
    row = self.labels.iloc[idx]
    waveform, file_sr = torchaudio.load(self.dir / row['mp3_path'])
    # Every clip must already be at the configured sample rate.
    assert file_sr == self.sr
    return waveform.mean(0), self.label_tensor[idx]

# Build the training split with the constructor defaults (split='train', num_max_data=6000, sr=16000).
train_set = MTATDataset('MTAT_SMALL')


In [26]:
# Fetch the first training example: __getitem__ returns (channel-averaged audio, label tensor).
audio, label = train_set[0]
# Play the clip at the dataset's sample rate; normalize=False asks IPython not to rescale the amplitude.
ipd.display(ipd.Audio(audio, rate=train_set.sr, normalize=False))
# Show the raw multi-hot label vector.
print(label)

tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])


In [20]:
# convert multi-hot label to readable tag
activated_tag_idx=torch.where(label)[0]
train_set.vocab[activated_tag_idx]

array(['female', 'quiet', 'choir'], dtype=object)

In [21]:
# Same inspection for the training example at index 100: load it, play it, print its label.
audio, label = train_set[100]
# Play the clip at the dataset's sample rate; normalize=False asks IPython not to rescale the amplitude.
ipd.display(ipd.Audio(audio, rate=train_set.sr, normalize=False))
# Show the raw multi-hot label vector.
print(label)

tensor([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [22]:
# convert multi-hot label to readable tag
activated_tag_idx=torch.where(label)[0]
train_set.vocab[activated_tag_idx]

array(['singer', 'harpsichord', 'male', 'vocal'], dtype=object)

In [27]:
import torch.nn as nn
import torch.nn.functional as F
class AutoTagger(nn.Module):
  """Front end of the auto-tagging model: converts a raw waveform into a mel spectrogram.

  The defaults reproduce the previously hard-coded configuration, so
  ``AutoTagger()`` behaves exactly as before.

  Args:
    sr: Sample rate of the input audio in Hz. Passed explicitly to the transform
      so the mel filterbank matches the dataset instead of relying on the
      library default.
    n_fft: FFT size used for the spectrogram.
    hop_length: Number of samples between successive frames.
    n_mels: Number of mel filterbank bins.
  """
  def __init__(self, sr=16000, n_fft=2048, hop_length=1024, n_mels=80):
    super().__init__()
    self.mel = torchaudio.transforms.MelSpectrogram(
      sample_rate=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)

  def forward(self, x):
    """Return the mel spectrogram of waveform ``x``; the last two output dims are (n_mels, frames)."""
    return self.mel(x)

# Instantiate the model and check the shape it produces for the clip loaded in the cell above.
model = AutoTagger()
model(audio).shape

torch.Size([80, 456])