In [7]:
import csv
import os
from pathlib import Path
import librosa
import matplotlib.pyplot as plt
import numpy as np
import json
import torch
from sklearn.model_selection import train_test_split
import math

In [None]:
# Single sample track used in this cell to try out the feature extractor.
path = "Data/genres_original/blues/blues.00000.wav"


class FreatureExtractor:
    """Summary features for a single audio file.

    The file is decoded once at construction time; ``to_row`` then reduces
    the waveform to one scalar per entry in ``COLUMNS``.
    """

    # Column names, in the same order as the values returned by ``to_row``.
    COLUMNS = [
        "zero_crossings", "spectral_centroid_mean", "spectral_centroid_var"
    ]

    def __init__(self, path: os.PathLike):
        """Decode ``path`` with librosa's default loading settings.

        Stores the waveform in ``self.y`` and its sample rate in ``self.sr``.
        """
        self.y, self.sr = librosa.load(path)

    def to_row(self):
        """Return this file's feature values, ordered like ``COLUMNS``.

        Every value is converted to a built-in ``int``/``float`` so the row
        can be handed to ``csv`` or ``json`` without NumPy scalar types
        leaking through.
        """
        spectral_centroid = self.spectral_centroid()

        return [
            # count_nonzero on the boolean mask is the number of crossings;
            # it avoids iterating the array element by element in Python.
            int(np.count_nonzero(self.zero_crossings())),
            float(np.average(spectral_centroid)),
            float(np.var(spectral_centroid)),
        ]

    def zero_crossings(self):
        """Return librosa's zero-crossing mask for the waveform.

        ``pad=False`` is passed through to librosa, so the first sample is
        not itself treated as a crossing.
        """
        return librosa.zero_crossings(self.y, pad=False)

    def spectral_centroid(self):
        """Return the spectral centroid of the waveform.

        The ``[0]`` drops the leading axis of librosa's result.
        """
        return librosa.feature.spectral_centroid(y=self.y, sr=self.sr)[0]



# Try the extractor on the sample track and keep its spectral centroid
# around for inspection.
extractor = FreatureExtractor(path)
spectral = extractor.spectral_centroid()

In [4]:

# Genre names; a genre's position in this list is its integer class label.
labels = [
    "blues",
    "classical",
    "country",
    "disco",
    "hiphop",
    "jazz",
    "metal",
    "pop",
    "reggae",
    "rock",
]

# Directory holding one sub-folder of .wav files per genre.
base_path = Path("Data", "genres_original")
# JSON file the extracted MFCC features are written to and read back from.
data_path = Path("Data", "mfcc.json")


In [None]:

def extract_data(
    input_path: Path, 
    output_path: Path, 
    *,
    segments: int = 10,
    sample_rate: int = 22050,
    n_mfcc: int = 20, 
    n_fft: int = 2048, 
    hop_length: int = 512, 
    track_seconds: int = 30,
):
    """Extract per-segment MFCCs for every track and dump them as JSON.

    Each ``<genre>/<track>.wav`` under ``input_path`` is split into
    ``segments`` equal slices. Every slice that yields the expected number of
    MFCC frames is stored together with the index of its genre in the
    module-level ``labels`` list.

    Args:
        input_path: Directory containing one sub-directory per genre.
        output_path: JSON file to write; receives ``{"mfcc": ..., "labels": ...}``.
        segments: Number of slices each track is cut into.
        sample_rate: Rate every track is resampled to on load.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop between successive frames, in samples.
        track_seconds: Nominal track length in seconds, used only to size
            the slices.

    Raises:
        ValueError: If a genre directory name is not present in ``labels``.
    """
    features = []
    targets = []

    samples_per_seg = int((sample_rate * track_seconds) / segments)
    # librosa centres its analysis frames by default, so a slice of n samples
    # produces 1 + n // hop_length frames. (ceil(n / hop_length) is one frame
    # short whenever hop_length divides n exactly, which would make every
    # slice fail the length check below.)
    seg_len = 1 + samples_per_seg // hop_length

    # Sorted so the order of the stored examples does not depend on the
    # filesystem's directory listing order.
    for path in sorted(input_path.glob("*/*.wav")):
        if path.name == "jazz.00054.wav":
            # librosa fails to decode this file.
            # NOTE(review): presumably the known-corrupt track of the GTZAN
            # dataset rather than a file-size limit - confirm.
            continue

        genre = path.parent.name
        # Resolve the label before decoding so an unknown genre fails fast.
        label = labels.index(genre)
        y, sr = librosa.load(path, sr=sample_rate)

        for seg in range(segments):
            start = samples_per_seg * seg
            if start >= len(y):
                # Track ended before this slice starts; nothing left to cut.
                break
            end = start + samples_per_seg

            mfcc = librosa.feature.mfcc(
                y=y[start:end], 
                sr=sr,
                n_fft=n_fft,
                n_mfcc=n_mfcc,
                hop_length=hop_length,
            ).T

            # The slice can never be longer than samples_per_seg, but it is
            # shorter when the track runs out early. Such slices produce
            # fewer frames and are dropped so that every stored example has
            # the same shape.
            if len(mfcc) == seg_len:
                features.append(mfcc.tolist())
                targets.append(label)
    
    with open(output_path, "w") as f:
        json.dump(
            {'mfcc': features, 'labels': targets}, 
            f, 
            indent=2,
        )


# Build the MFCC dataset on disk. This decodes every track again each time
# the cell is run, even if the output file already exists.
extract_data(base_path, data_path)

In [12]:

def load_data(path):
    """Load a JSON feature dump and return it as a pair of tensors.

    Args:
        path: JSON file holding an object with ``"mfcc"`` and ``"labels"``
            entries, as written by ``extract_data``.

    Returns:
        ``(features, labels)``: two tensors built with ``torch.tensor`` from
        the ``"mfcc"`` and ``"labels"`` entries respectively.

    Raises:
        KeyError: If either entry is missing from the file.
    """
    with open(path, 'r') as f:
        data = json.load(f)

    return torch.tensor(data["mfcc"]), torch.tensor(data["labels"])

# Read the extracted features back from disk as tensors.
inputs, targets = load_data(data_path)

# Hold out 20% of the segments for testing. The seed is fixed so the split,
# and anything measured on it, is the same after a kernel restart.
# NOTE(review): the split is over segments, so slices of one track can land
# on both sides of it - confirm that is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, targets, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape)

torch.Size([7988, 130, 20]) torch.Size([1998, 130, 20])
