In [67]:
from os import listdir
import os
from os.path import isfile, join
import numpy as np
import pandas as pd
import librosa
import json
from tqdm import tqdm
import csv

In [68]:
SPLITS = ('training', 'test', 'validation')
PERCENTAGES = (0.7, 0.15, 0.15)
f_split = lambda : random.choices(SPLITS, weights=PERCENTAGES)[0]

In [82]:
mypath = 'data/raw/Progressive_Rock_Songs/'
progressive_rock = [[os.path.join(dirpath,f), 1] for (dirpath, dirnames, filenames) in os.walk(mypath) for f in filenames if f.endswith('.mp3')]

In [83]:
mypath = 'data/raw/Not_Progressive_Rock/'
not_progressive_rock = [[os.path.join(dirpath,f), 0] for (dirpath, dirnames, filenames) in os.walk(mypath) for f in filenames if f.endswith('.mp3')]

In [84]:
dataset = not_progressive_rock + progressive_rock

In [86]:
my_df = pd.DataFrame(dataset)
my_df.to_csv('annotations_v2.csv', index=False, header=False)

In [87]:
pd.read_csv('annotations_v2.csv', header=None)

Unnamed: 0,0,1
0,data/raw/Not_Progressive_Rock/Other_Songs\01 -...,0
1,data/raw/Not_Progressive_Rock/Other_Songs\01 -...,0
2,data/raw/Not_Progressive_Rock/Other_Songs\01 -...,0
3,data/raw/Not_Progressive_Rock/Other_Songs\01 -...,0
4,data/raw/Not_Progressive_Rock/Other_Songs\01 -...,0
...,...,...
267,data/raw/Progressive_Rock_Songs/L'evoluzione.mp3,1
268,data/raw/Progressive_Rock_Songs/MIKE OLDFIELD ...,1
269,data/raw/Progressive_Rock_Songs/Pain of Salvat...,1
270,data/raw/Progressive_Rock_Songs/The Flower Kin...,1


In [6]:
def process_audio(path, sr=16000):
    x , sr = librosa.load(path, sr=sr)
    mel_spectrogram = librosa.feature.melspectrogram(y=x, n_fft=400, hop_length=160, n_mels=80, fmax=8000, sr=sr)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    return log_mel_spectrogram

In [7]:
def process_dataset(dataset, save_dir='data/process/'):
    os.makedirs(save_dir, exist_ok=True)
    data = []
    
    for i in tqdm(range(len(dataset))):
        try:
            path, label = dataset[i]        
            mat = process_audio(path)
            save_path = save_dir + f'{i}.npy'
            np.save(save_path, mat)
            data += [save_path,label]
        except:
            print(f"Exception in song {i}")
    
    with open("annotation.csv", "wb") as f:
        writer = csv.writer(f)
    

In [8]:
with open('dataset.json', 'r') as f:
    data = json.load(f)

In [44]:
b = process_audio(dataset[0][0])

In [45]:
b.shape

(80, 20618)

In [9]:
d = []
for i in range(int(len(data)/2)):
    d+= [[data[2*i], data[2*i+1]]]

In [10]:
import pandas as pd

In [11]:
df = pd.DataFrame(d)
df.to_csv('annotation.csv', index=False)

In [None]:
process_dataset(dataset)

  3%|████▏                                                                                                                             | 9/279 [01:02<26:47,  5.95s/it]

Exception in song 8


  4%|█████                                                                                                                            | 11/279 [01:11<24:31,  5.49s/it]

Exception in song 10


  4%|█████▌                                                                                                                           | 12/279 [01:17<24:25,  5.49s/it]

Exception in song 11


  5%|██████                                                                                                                           | 13/279 [01:32<36:44,  8.29s/it]

Exception in song 12


  5%|██████▍                                                                                                                          | 14/279 [01:41<38:10,  8.64s/it]

In [12]:
from torch import nn
import torch
import utils
from typing import Dict

In [13]:
from utils.model import ModelDimensions, AudioEncoder

In [14]:
dims = ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=384, n_audio_head=6, n_audio_layer=4)

In [21]:
class AudioClassifier(nn.Module):
    def __init__(self, dims: ModelDimensions):
        super().__init__()
        self.dims = dims
        self.encoder = AudioEncoder(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
        )
        
    def forward(self, mel: torch.Tensor) -> Dict[str, torch.Tensor]:
        return self.encoder(mel)

In [22]:
model = AudioClassifier(dims)

In [23]:
x = torch.zeros((10, 80, 1500))

In [24]:
y = model(x)

AssertionError: incorrect audio shape

In [19]:
import ffmpeg
import numpy as np
import torch
import os
from functools import lru_cache

In [20]:
SAMPLE_RATE = 16000
N_FFT = 400
N_MELS = 80
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000: number of samples in a chunk

In [35]:
def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
    Open an audio file and read as mono waveform, resampling as necessary
    Parameters
    ----------
    file: str
        The audio file to open
    sr: int
        The sample rate to resample the audio if necessary
    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    try:
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

In [36]:
np.savez_compressed(
            "assets/mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )

In [37]:
@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:
        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
    with np.load(os.path.join("assets","mel_filters.npz")) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)

In [38]:
def log_mel_spectrogram(audio, n_mels: int = N_MELS):
    """
    Compute the log-Mel spectrogram of
    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
    n_mels: int
        The number of Mel-frequency filters, only 80 is supported
    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    magnitudes = stft[:, :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec

In [46]:
a = log_mel_spectrogram(dataset[0][0])

In [50]:
c = load_audio(dataset[0][0])
c.shape

(3298878,)

In [41]:
dataset[8][0]

'data/raw/Not_Progressive_Rock/Other_Songs\\01.ArmenMiran-PreciousStory.mp3'

In [53]:
from torch.utils.data import Dataset, DataLoader
from random import random

In [63]:
class SongDatasetTest_v2(Dataset):
    def __init__(self):
        pass
    def __len__(self):
        return 99999999

    def __getitem__(self, idx):
        if random()>0.1: raise StopIteration()
        return np.array([0])  else None


In [64]:
data = SongDatasetTest_v2()

In [65]:
loader = DataLoader(data)

In [66]:
for i in loader:
    print(i)

tensor([[0]], dtype=torch.int32)
tensor([[0]], dtype=torch.int32)


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>