# Create Mel Spectrograms from BirdCLEF 2022 data

This notebook is based on Kkiller's Mels Computer notebook from 2021.

### Setup and config

In [None]:
###############
### Imports ###
###############

import joblib, json

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import librosa
import librosa.display
import soundfile
from  soundfile import SoundFile
from  IPython.display import Audio
from pathlib import Path
from tqdm.notebook import tqdm

from  sklearn.model_selection  import StratifiedKFold

In [None]:
# --- Metadata slice -------------------------------------------------------
# Rows START_INDEX (inclusive) to END_INDEX (exclusive) of the metadata
# are processed (they are applied with .iloc in create_meta_df).
START_INDEX = 0
END_INDEX = 14_852  # 14852 is the maximum

# --- Audio / spectrogram settings -----------------------------------------
SR = 32000      # sample rate
DURATION = 7    # multiplied by SR to get the number of samples per chunk
SEED = 261      # random_state of the StratifiedKFold split

N_FFT = SR // 10        # FFT window: one tenth of a second worth of samples
HOP_LENGTH = SR // 40   # hop: a quarter of the FFT window

FMIN = 0
FMAX = SR // 2
N_MELS = 128

# --- Paths ----------------------------------------------------------------
DATA_ROOT = Path("../input/birdclef-2022")
TRAIN_AUDIO_ROOT = Path("../input/birdclef-2022/train_audio")

# Output directory for the mel images; created up front if missing.
TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path("audio_images")
TRAIN_AUDIO_IMAGES_SAVE_ROOT.mkdir(parents=True, exist_ok=True)

### Utils

In [None]:
def get_audio_info(filepath):
    """Open an audio file with SoundFile and report its basic properties.

    Returns a dict with the keys ``frames`` (frame count), ``sr`` (sample
    rate) and ``duration`` (frames divided by sample rate, as a float).
    """
    with SoundFile(filepath) as audio_file:
        n_frames = audio_file.frames
        sample_rate = audio_file.samplerate

    return {
        "frames": n_frames,
        "sr": sample_rate,
        "duration": float(n_frames) / sample_rate,
    }

In [None]:
def create_meta_df(n_splits=5, seed=SEED, nrows=None):
    """Build the enriched training metadata frame and the label mapping.

    Reads ``train_metadata.csv`` from ``DATA_ROOT``, keeps the rows
    ``START_INDEX:END_INDEX`` and adds the columns ``label_id``, ``filepath``,
    ``frames``, ``sr``, ``duration`` and ``fold``.

    Args:
        n_splits: number of stratified folds.
        seed: random state of the fold split.
        nrows: forwarded to ``pd.read_csv`` (``None`` reads the whole file).

    Returns:
        ``(label_ids, df)`` where ``label_ids`` maps each primary label to an
        integer id and ``df`` is the enriched metadata frame. ``df`` keeps the
        row index of the CSV slice.
    """
    df = pd.read_csv(DATA_ROOT/"train_metadata.csv", nrows=nrows)

    # The mapping is built before slicing, so ids do not depend on
    # START_INDEX / END_INDEX.
    label_ids = {label: label_id for label_id, label in enumerate(sorted(df["primary_label"].unique()))}

    # Work on an explicit copy of the requested slice so the column
    # assignments below do not target a view of the full frame.
    df = df.iloc[START_INDEX:END_INDEX].copy()

    # add numeric label and path to the df
    df["label_id"] = df["primary_label"].map(label_ids)
    df["filepath"] = [str(TRAIN_AUDIO_ROOT/filename) for filename in df["filename"]]

    # get audio info (sr, frames, duration) for every file, in parallel
    pool = joblib.Parallel(4)
    mapper = joblib.delayed(get_audio_info)
    tasks = [mapper(filepath) for filepath in df.filepath]

    # concat(axis=1) aligns on the index, so the audio-info frame must carry
    # the same index as the sliced metadata; otherwise a non-zero START_INDEX
    # would misalign the rows and introduce NaN rows.
    audio_info = pd.DataFrame(pool(tqdm(tasks)), index=df.index)
    df = pd.concat([df, audio_info], axis=1, sort=False)

    # assign every row to the fold in which it is part of the validation set
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    splits = skf.split(np.arange(len(df)), y=df.label_id.values)
    df["fold"] = -1

    for fold, (_, val_positions) in enumerate(splits):
        # the split yields positions, which are translated to index labels
        df.loc[df.index[val_positions], "fold"] = fold

    return label_ids, df

### Extract Metadata 

In [None]:
# Build the enriched metadata (label ids, file paths, audio info, folds)
# and save it, together with the label -> id mapping, to files.
LABEL_IDS, meta_df = create_meta_df(nrows=None)

# index=True writes the frame's row index as the first CSV column
meta_df.to_csv("rich_train_metadata.csv", index=True)
with open("LABEL_IDS.json", "w") as f:
    json.dump(LABEL_IDS, f)

# preview of the first rows
meta_df.head()

### Data statistics

In [None]:
# number of audio files in each fold
meta_df["fold"].value_counts()

In [None]:
# number of training files per primary label
meta_df["primary_label"].value_counts()

In [None]:
# histogram of the audio durations (20 bins)
meta_df["duration"].hist(bins=20)

In [None]:
# duration quantiles from 0.00 to 0.99 in steps of 0.01, drawn as a curve
meta_df["duration"].quantile(np.arange(0, 1, 0.01)).plot()

### Mel Spectrogram Computer
Function for converting audio data to a mel spectrogram

In [None]:
def compute_mel(audio, sr=SR, n_mels=N_MELS, fmin=FMIN, fmax=FMAX, n_fft=N_FFT, hop_length=HOP_LENGTH):
    """Compute the mel spectrogram of ``audio`` in decibels.

    The spectrogram is produced by ``librosa.feature.melspectrogram`` with the
    given parameters, converted with ``librosa.power_to_db`` and returned as a
    float32 array.
    """
    mel_kwargs = dict(
        sr=sr,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    spectrogram = librosa.feature.melspectrogram(y=audio, **mel_kwargs)
    return librosa.power_to_db(spectrogram).astype(np.float32)

### Utilities for editing mel data

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize an array and rescale it to the uint8 range 0..255.

    Args:
        X: numeric numpy array (here: a mel spectrogram).
        eps: added to ``std`` to avoid division by zero; also the minimum
            value range required for rescaling.
        mean: value(s) subtracted from ``X``; defaults to ``X.mean()``.
            May be a scalar (including 0) or an array broadcastable to ``X``.
        std: value(s) ``X`` is divided by; defaults to ``X.std()``.
            May be a scalar or an array broadcastable to ``X``.

    Returns:
        uint8 array with the shape of ``X``. If the standardized values span
        a range of at most ``eps``, the result is all zeros.
    """
    # Explicit None checks: `mean or X.mean()` would discard an explicit 0
    # and raise on array-valued statistics (ambiguous truth value).
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()

    # normalize
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # min-max scale to 0..255; the uint8 cast truncates the fraction
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (nearly) constant input: nothing to scale
        V = np.zeros_like(X, dtype=np.uint8)

    return V

# used to make all data vectors in the same length
def crop_or_pad(y, length, is_train=True, start=None):
    """Return ``y`` with exactly ``length`` samples.

    Args:
        y: 1-D sequence of samples.
        length: desired number of samples.
        is_train: when cropping without an explicit ``start``, pick a random
            window if True, otherwise crop from the beginning.
        start: first sample of the cropped window; ``None`` means "not given".

    Returns:
        ``y`` zero-padded on the right if it is shorter than ``length``,
        a window of ``length`` samples if it is longer, and ``y`` unchanged
        if it already has the requested length.
    """
    n_samples = len(y)

    if n_samples < length:
        # Zero-pad on the right. (A former "repeat the signal" step that ran
        # after this padding was a no-op and has been removed.)
        y = np.concatenate([y, np.zeros(length - n_samples)])

    elif n_samples > length:
        # `start is None` instead of `start or ...`, so an explicit start=0
        # is honoured in train mode as well.
        if start is None:
            # +1: randint's upper bound is exclusive and the last valid
            # start is n_samples - length.
            start = np.random.randint(n_samples - length + 1) if is_train else 0

        y = y[start:start + length]

    return y

### Create mels from audio

In [None]:
def audio_to_image(audio):
    """Convert audio samples to a uint8 mel image (compute_mel, then mono_to_color)."""
    return mono_to_color(compute_mel(audio))
        
def process_audio_files(row, duration=DURATION, sr=SR, res_type="kaiser_fast",
                        resample=True, save=True, step=None):
    """Convert one audio file into a stack of mel images.

    Args:
        row: metadata record with the attributes ``filepath`` and ``filename``.
        duration: length of one chunk; ``duration * sr`` samples per chunk.
        sr: target sample rate.
        res_type: resampling method passed to ``librosa.resample``.
        resample: resample the audio if its sample rate differs from ``sr``.
        save: if True, save the images to
            ``TRAIN_AUDIO_IMAGES_SAVE_ROOT/<filename>.npy`` and return None;
            otherwise return ``(row.filename, images)``.
        step: offset in samples between consecutive chunks; defaults to the
            chunk length (no overlap).

    Notes:
        Only full-length windows are taken, so samples after the last full
        window are not used. Audio shorter than one chunk yields a single
        zero-padded chunk.
    """
    # int(): keeps slicing valid if duration is given as a float
    audio_length = int(duration * sr)
    step = step or audio_length

    # read the audio file as float32 samples
    audio, orig_sr = soundfile.read(row.filepath, dtype="float32")

    # if stereo - convert to mono by averaging the channels
    if np.ndim(audio) > 1:
        audio = np.mean(audio, axis=1)

    # resample if the audio sr is not the desired sr; the sample rates are
    # passed by keyword because recent librosa versions reject them positionally
    if resample and orig_sr != sr:
        audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=sr, res_type=res_type)

    # split long audio into same-length parts; max(1, ...) guarantees one
    # (short) part for audio shorter than a chunk
    stop = max(1, len(audio) - audio_length + 1)
    audios = [audio[i:i + audio_length] for i in range(0, stop, step)]

    # pad the last part if it is shorter than a chunk
    audios[-1] = crop_or_pad(audios[-1], length=audio_length)

    # convert the audio parts to images and combine them for easier saving
    images = np.stack([audio_to_image(chunk) for chunk in audios])

    # save or return images as np arrays
    if save:
        path = TRAIN_AUDIO_IMAGES_SAVE_ROOT/f"{row.filename}.npy"
        path.parent.mkdir(exist_ok=True, parents=True)
        np.save(str(path), images)
    else:
        return row.filename, images

In [None]:
def get_audios_as_images(meta_df):
    """Run process_audio_files on every row of ``meta_df`` in two parallel jobs.

    Consecutive chunks are offset by 0.666 of the chunk length, so they
    overlap. Nothing is returned; the result of the parallel run is discarded.
    """
    chunk_step = int(DURATION*0.666*SR)
    process = joblib.delayed(process_audio_files)

    jobs = []
    for record in meta_df.itertuples(index=False):
        jobs.append(process(record, step=chunk_step))

    joblib.Parallel(2)(tqdm(jobs))

In [None]:
# convert every file listed in meta_df to mel images (saved as .npy files)
get_audios_as_images(meta_df)

### Check output mels

In [None]:
row = meta_df.loc[meta_df.duration.idxmax()]
mels = np.load(str((TRAIN_AUDIO_IMAGES_SAVE_ROOT/row.filename).as_posix() + ".npy"))