<a href="https://colab.research.google.com/github/fdebrain/kaggle-free-audio-tagging-2019/blob/main/Kaggle_Freesound_Audio_Tagging_Feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Freesound Audio Tagging 2019 Kaggle Competition - Formatting & Extraction
Notebook written by [Frédéric Debraine](https://www.linkedin.com/in/fdebraine/)


[Official Competition Link](https://www.kaggle.com/c/freesound-audio-tagging-2019/overview)

In [None]:
import os
import time
import json
import h5py
import tables
import librosa
import pickle
import numpy as np
import pandas as pd
import IPython
import IPython.display as ipd
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm_notebook
from functools import partial
import multiprocessing
import tensorflow as tf

In [None]:
# Helper functions
def load_train_data_df(mode='curated'):
    """Load the labels of one training subset as a binary indicator table.

    Reads `train_{mode}.csv` (indexed by `fname`, with a `labels` column that
    holds comma-separated label names) and `sample_submission.csv` (whose
    columns give the expected label names) from the current directory.

    Args:
        mode: Suffix of the training CSV to read (e.g. 'curated' or 'noisy').

    Returns:
        pandas.DataFrame with one row per file name and one 0/1 column per
        label, with columns in the same order as `sample_submission.csv`.
    """
    # Raw labels: multi-labels are stored as one comma-separated string per file
    csv_path = f'train_{mode}.csv'
    raw_df = pd.read_csv(csv_path, index_col='fname')

    # The submission template defines the expected labels and their order
    sub = pd.read_csv('sample_submission.csv', index_col='fname')
    labels_list = sub.columns.values

    splitted_labels = [labels.split(',') for labels in raw_df['labels'].values]

    # Pin the encoder to the submission classes. Without `classes`, the encoder
    # orders its output columns by the sorted labels found in the data, which
    # only matches `labels_list` by coincidence and breaks outright when a
    # label is absent from this subset.
    encoder = MultiLabelBinarizer(classes=list(labels_list))
    encoded_labels = encoder.fit_transform(splitted_labels)

    # One row per file, one binary column per label
    labels_df = pd.DataFrame(data=encoded_labels, index=list(raw_df.index), columns=labels_list)

    return labels_df

def listen_sample(sample, sr=44100):
    """Show an inline audio player for `sample`, played back at `sr` Hz."""
    player = ipd.Audio(data=sample, rate=sr)
    return ipd.display(player)

def load_sample(path, resample=None, trim=True):
    """Decode a WAV file into a flat waveform array.

    Args:
        path: Path of the WAV file to read.
        resample: Target sampling rate in Hz. A falsy value (None/0) keeps the
            rate stored in the file.
        trim: If True, strip leading/trailing silence with
            `librosa.effects.trim`.

    Returns:
        1-D numpy array holding the decoded (optionally resampled and trimmed)
        samples.
    """
    input_data = tf.io.read_file(path)
    sample, file_sr = tf.audio.decode_wav(input_data)
    # NOTE(review): flatten() interleaves channels for multi-channel files;
    # this presumably assumes mono recordings - confirm for other datasets.
    sample = sample.numpy().flatten()

    if resample:
        # Resample from the rate declared in the WAV header rather than
        # assuming 44.1 kHz, so files recorded at another rate are not
        # silently pitch-shifted.
        sample = librosa.resample(sample, orig_sr=int(file_sr.numpy()), target_sr=resample)

    if trim:
        sample, _ = librosa.effects.trim(sample)
    return sample

def extract_mfcc(sample, n_mfcc=20, sr=44100):
    """Compute the MFCCs of a waveform.

    Args:
        sample: 1-D waveform array.
        n_mfcc: Number of coefficients to compute.
        sr: Sampling rate of `sample` in Hz.

    Returns:
        float32 array of shape (n_mfcc, n_frames). The number of frames is set
        by librosa's default hop length, which is not overridden here.
    """
    # `y` must be passed by keyword: recent librosa releases reject positional audio
    mfccs = librosa.feature.mfcc(y=sample, sr=sr, n_mfcc=n_mfcc)
    return mfccs.astype(np.float32)

def extract_mel(sample, n_mels=128, sr=44100, hop=347, log=False):
    """Compute the mel spectrogram of a waveform.

    Args:
        sample: 1-D waveform array.
        n_mels: Number of mel bands; the FFT window is set to 20 * n_mels.
        sr: Sampling rate of `sample` in Hz; the mel filters span 20 Hz to sr // 2.
        hop: Hop length in samples between successive frames.
        log: If True, convert the power spectrogram to decibels.

    Returns:
        float32 array of shape (n_mels, n_frames).
    """
    # `y` must be passed by keyword: recent librosa releases reject positional audio
    mel = librosa.feature.melspectrogram(y=sample, sr=sr, n_fft=20*n_mels, hop_length=hop, n_mels=n_mels, fmin=20, fmax=sr//2)
    if log:
        # Use the documented top-level entry point instead of the librosa.core alias
        mel = librosa.power_to_db(mel, ref=1.0, amin=1e-10, top_db=None)
    return mel.astype(np.float32)

def compress(x, eps=1e-6):
    """Min-max rescale `x` towards [0, 255] and cast it to uint8.

    `eps` is added to the value range so that a constant input does not
    divide by zero (it maps to all zeros instead).
    """
    lowest, highest = (reduce(x, keepdims=True) for reduce in (np.min, np.max))
    numerator = 255 * (x - lowest)
    denominator = highest - lowest + eps
    return (numerator / denominator).astype(np.uint8)

def save_h5(savepath, X):
    """Write every array of `X` into a new zlib-compressed HDF5 file.

    `X` maps file names to arrays. Each array is stored as a root-level
    CArray named 't' followed by the file name up to its first '.'.
    """
    zlib_filters = tables.Filters(complib='zlib', complevel=1)
    with tables.open_file(savepath, mode='w') as h5_file:
        for filename, x in X.items():
            node_name = 't' + filename.split('.')[0]
            h5_file.create_carray('/', node_name, obj=x, filters=zlib_filters)

def load_h5(filenames, h5_filename):
    """Read one or several datasets from an HDF5 file.

    Args:
        filenames: A single dataset key, or an iterable of keys (list, tuple,
            numpy array, ...). Keys are looked up as given, so they must match
            the names stored in the file.
        h5_filename: Path of the HDF5 file to open read-only.

    Returns:
        List of arrays, one per requested key, in the requested order.
    """
    # Only a lone key needs wrapping; any other iterable of keys is used as is
    # (previously a tuple or array of keys was wrapped and failed the lookup).
    if isinstance(filenames, (str, bytes)):
        filenames = [filenames]
    with h5py.File(h5_filename, mode='r') as dataset:
        return [dataset[name][()] for name in filenames]

def save_npy(savepath, X):
    """Save every array of `X` as an individual .npy file inside `savepath`.

    Args:
        savepath: Target directory; created (including missing parents) if needed.
        X: Mapping of file name -> array. Each array is written to
            `<savepath>/<name up to its first '.'>.npy`.
    """
    # makedirs handles nested targets and an already-existing directory in one
    # call, which avoids the isdir/mkdir race of a manual existence check.
    os.makedirs(savepath, exist_ok=True)
    for filename, x in X.items():
        np.save(os.path.join(savepath, f"{filename.split('.')[0]}.npy"), x)

def load_npy(filenames):
    """Load one or several .npy files.

    Args:
        filenames: A single path, or an iterable of paths.

    Returns:
        List of loaded arrays, one per path, in the given order.
    """
    # A lone path must be wrapped, not passed to list(): list('a.npy') would
    # split the string into single characters.
    if isinstance(filenames, (str, os.PathLike)):
        filenames = [filenames]
    return [np.load(filename) for filename in filenames]

def save_pkl(savepath, obj):
    """Serialize `obj` into the file `savepath` using the highest pickle protocol."""
    with open(savepath, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
    """Deserialize and return the object pickled in `filename`."""
    # NOTE(review): unpickling can execute arbitrary code - only load files you trust.
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

def save_features(wav_paths, savepath='train_curated', feat_type='mel', n_feats=128, hop=347, sr=44100, resample=22050, format='pkl', n_splits=1):
    """ Save compressed features (wav, mfcc or mel) into the specified format. Increase n_splits if you go out of RAM.
    In terms of compression ratio: h5 (best) > pkl > npy.
    In terms of read speed: pkl (best) > npy > h5.

    Args:
        wav_paths: Array-like of WAV file paths to process.
        savepath: Output prefix WITHOUT extension. The feature type (plus
            `n_feats` for mel/mfcc), an optional `_<split number>` and the
            format extension are appended to it.
        feat_type: 'wav' (raw waveform), 'mel' or 'mfcc'.
        n_feats: Number of mel bands / MFCC coefficients (unused for 'wav').
        hop: Hop length in samples for mel spectrograms.
        sr: Sampling rate of the waveforms when `resample` is falsy.
        resample: Target sampling rate; falsy keeps the original rate.
        format: 'pkl', 'h5' (one file per split) or 'npy' (one directory per split).
        n_splits: Number of chunks `wav_paths` is processed and saved in.
    """
    assert feat_type in ['wav', 'mel', 'mfcc'], 'Wrong feat_type argument !'
    assert format in ['pkl', 'npy', 'h5'], 'Wrong format argument !'

    start_time = time.time()
    savepath = f'{savepath}_{feat_type}'
    savepath += str(n_feats) if feat_type != 'wav' else ''

    # After load_sample resamples the waveform, its rate is `resample`, not
    # `sr`; features must be computed at the rate the signal actually has.
    feat_sr = resample if resample else sr

    for split_idx, split_paths in enumerate(np.array_split(wav_paths, n_splits)):
        X = {}
        for path in split_paths:
            filename = os.path.basename(path)
            x = load_sample(path, resample, trim=True)

            if feat_type == 'mfcc':
                x = extract_mfcc(x, n_feats, feat_sr)
            elif feat_type == 'mel':
                x = extract_mel(x, n_feats, feat_sr, hop)

            # Every feature type is quantized to uint8 before being stored
            X[filename] = compress(x)

        # Only number the output when there is more than one split
        savepath_split = savepath + (f'_{split_idx + 1}' if n_splits > 1 else '')
        if format == 'pkl':
            save_pkl(savepath_split + '.pkl', X)
        elif format == 'h5':
            save_h5(savepath_split + '.h5', X)
        elif format == 'npy':
            save_npy(savepath_split, X)

        # The reported duration is cumulative since the start of the call
        print(f'Successfully extracted {feat_type} features in {savepath_split} ! (took {time.time() - start_time:.2f}s).')

def fuse_h5(h5_filename1, h5_filename2):
    """Copy every root-level dataset of `h5_filename2` into `h5_filename1`.

    The first file is opened in append mode and receives each dataset as a
    zlib-compressed CArray under the same key; the second file is only read.
    """
    zlib_filters = tables.Filters(complib='zlib', complevel=1)
    with tables.open_file(h5_filename1, 'a') as h5_main, h5py.File(h5_filename2, 'r') as h5_bis:
        for key in list(h5_bis.keys()):
            h5_main.create_carray('/', key, obj=h5_bis[key][()], filters=zlib_filters)
    print(f'Successfully fused {h5_filename1} and {h5_filename2} !')

# 1. Download original data

For more info about the Kaggle API see https://www.kaggle.com/docs/api

The dataset can also be downloaded directly at https://www.kaggle.com/c/freesound-audio-tagging-2019/data

In [None]:
# Upload your kaggle.json file (might require a few seconds after upload)
!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
%%time

# Download every competition file from Kaggle into the current directory (~8 min).
# Requires the kaggle.json credentials to be installed under ~/.kaggle.
!kaggle competitions download -c freesound-audio-tagging-2019

Downloading sample_submission.csv to /content
  0% 0.00/569k [00:00<?, ?B/s]
100% 569k/569k [00:00<00:00, 35.7MB/s]
Downloading test.zip to /content
 99% 2.07G/2.08G [00:23<00:00, 68.8MB/s]
100% 2.08G/2.08G [00:23<00:00, 94.5MB/s]
Downloading train_noisy.zip to /content
100% 20.0G/20.0G [05:50<00:00, 44.7MB/s]
100% 20.0G/20.0G [05:50<00:00, 61.3MB/s]
Downloading train_curated.zip to /content
100% 2.24G/2.24G [00:39<00:00, 106MB/s] 
100% 2.24G/2.24G [00:39<00:00, 61.0MB/s]
Downloading train_noisy.csv to /content
  0% 0.00/571k [00:00<?, ?B/s]
100% 571k/571k [00:00<00:00, 78.7MB/s]
Downloading train_curated.csv to /content
  0% 0.00/140k [00:00<?, ?B/s]
100% 140k/140k [00:00<00:00, 147MB/s]
CPU times: user 3.44 s, sys: 848 ms, total: 4.29 s
Wall time: 7min


In [None]:
%%time

# Unzip all files (~9mn)
!mkdir -p data/train/curated
!mkdir -p data/train/noisy
!mkdir -p data/test

!unzip -q train_curated.zip -d data/train/curated/wav && rm train_curated.zip
!unzip -q train_noisy.zip -d data/train/noisy/wav && rm train_noisy.zip
!unzip -q test.zip -d data/test/wav && rm test.zip

CPU times: user 1.46 s, sys: 245 ms, total: 1.71 s
Wall time: 8min 51s


# 2. Data formatting - Compressed .wav files

Because of its considerable size (~23 GB), the original dataset is too large to load directly into memory.

In this section, we propose a way to reduce the memory footprint by converting the wav signals data type to uint8 and using the HDF5 file format. The loss in sound quality is barely noticeable.

In [None]:
# Label tables: binary indicators for both training subsets, submission template for the test set
test_labels = pd.read_csv('sample_submission.csv', index_col='fname')
curated_train_labels, noisy_train_labels = (
    load_train_data_df(mode=subset) for subset in ('curated', 'noisy')
)

# Summary of the training/testing sets
print(f'{len(curated_train_labels.columns)} possible classes.')
print(f'{len(curated_train_labels)} curated training samples.')
print(f'{len(noisy_train_labels)} noisy training samples.')
print(f'{len(test_labels)} test samples.')

# Path of every wav file: the folder prefix is prepended to each file name of the index
test_wav_paths = 'data/test/wav/' + test_labels.index.values
noisy_train_wav_paths = 'data/train/noisy/wav/' + noisy_train_labels.index.values
curated_train_wav_paths = 'data/train/curated/wav/' + curated_train_labels.index.values

80 possible classes.
4970 curated training samples.
19815 noisy training samples.
3361 test samples.


In [None]:
%%time
save_features(curated_train_wav_paths, savepath='train_curated', feat_type='wav', format='h5', n_splits=1, resample=22050) # 2mn
save_features(noisy_train_wav_paths, savepath='train_noisy', feat_type='wav', format='h5', n_splits=2, resample=22050) # 15mn
save_features(test_wav_paths, savepath='test', feat_type='wav', format='h5', n_splits=1, resample=22050) # 2mn

# fuse_h5('train_noisy_wav_1.h5', 'train_noisy_wav_2.h5')
# !mv train_noisy_wav_1.h5 train_noisy_wav.h5 && rm train_noisy_wav_2.h5

Successfully extracted wav features in train_curated_wav ! (took 1739.26s).
Successfully extracted wav features in train_noisy_wav_1 ! (took 6582.18s).


In [None]:
# Listen to audio quality after compression
with h5py.File('train_curated_wav.h5', mode='r') as h5_file:
    filenames = list(h5_file.keys())

samples = load_h5(filenames[0], 'train_curated_wav.h5')
# The stored waveforms were resampled to 22050 Hz; listen_sample defaults to
# 44100 Hz, which would play them back at double speed
listen_sample(samples[0], sr=22050)

In [None]:
# (OPTIONAL) Create Kaggle Dataset
%%time
!mkdir -p wav_dataset
!mv train_curated_wav.h5 test_wav.h5 train_noisy_wav.h5 wav_dataset
!kaggle datasets init -p wav_dataset

# Define metadata
meta = {}
meta["title"] = "Freesound Audio Tagging 2019 - HDF5 wav"
meta["subtitle"] = "Compressed .wav recordings in HDF5 format from the FAT2019 Dataset."
meta["licenses"] = [{"name": "CC0-1.0"}]
meta["id"] = "obione26/FAT2019Wav"
meta["keywords"] = ["audio", "classification", "features", "tagging"]

with open('wav_dataset/dataset-metadata.json', 'w', encoding='utf-8') as f:
    json.dump(meta, f, ensure_ascii=False, indent=4)

# Send data to kaggle (~10mn)
!kaggle datasets version -p wav_dataset -m "Fused noisy splits into single h5"

Data package template written to: wav_dataset/dataset-metadata.json
Starting upload for file train_noisy_wav.h5
100% 6.25G/6.25G [02:04<00:00, 54.0MB/s]
Upload successful: train_noisy_wav.h5 (6GB)
Starting upload for file test_wav.h5
100% 599M/599M [00:12<00:00, 48.8MB/s]
Upload successful: test_wav.h5 (599MB)
Starting upload for file train_curated_wav.h5
100% 665M/665M [00:13<00:00, 50.8MB/s]
Upload successful: train_curated_wav.h5 (665MB)
The following are not valid tags and could not be added to the dataset: [u'audio', u'features', u'tagging']
Dataset version is being created. Please check progress at https://www.kaggle.com/obione26/fat2019wav
CPU times: user 1.76 s, sys: 657 ms, total: 2.41 s
Wall time: 2min 35s


# 3. Feature extraction - Melspectrogram HDF5

In [None]:
# Feature parameters
sr = 44100
resample = None
n_feats = 128
n_fft = 20 * n_feats
hop_len = 347

# Extract features (~1h45)
save_features(curated_train_wav_paths, 'train_curated_mel128.h5', 'mel', n_feats, hop_len, sr, resample) # 900s (15mn)
!rm -r data/train/curated/wav

save_features(noisy_train_wav_paths, 'train_noisy_mel128.h5', 'mel', n_feats, hop_len, sr, resample) # 5000s (80mn)
!rm -r data/train/noisy/wav

save_features(test_wav_paths, 'test_mel128.h5', 'mel', n_feats, hop_len, sr, resample) # 700s (10mn)
!rm -r data/test/wav

80 possible classes.
4970 curated training samples.
19815 noisy training samples.
3361 test samples.
Successfully extracted mel features in train_curated_mel128.h5 ! (took 1055.64s).


# 3.1. Saving features

In [None]:
%%time

# Compress all (~25mn)
# NOTE(review): every command below is commented out, so this cell currently does nothing.
# The zip commands read from data/*/mel folders - presumably left over from an
# earlier per-file export; confirm those folders exist before re-enabling them.
# !zip -jq train_curated_mel.zip data/train/curated/mel/* && rm -r data/train/curated
# !zip -jq train_noisy_mel.zip data/train/noisy/mel/* && rm -r data/train/noisy
# !zip -jq test_mel.zip data/test/mel/* && rm -r data/test

# Size of each archive in megabytes
# !du -m train_curated_mel.zip
# !du -m train_noisy_mel.zip
# !du -m test_mel.zip

2102	train_curated_mel.zip
16273	train_noisy_mel.zip
1964	test_mel.zip
CPU times: user 4.05 s, sys: 462 ms, total: 4.51 s
Wall time: 24min 44s


In [None]:
%%time

# OPTIONAL - Copy files to your drive (~10mn)
# NOTE(review): assumes Google Drive is already mounted at /content/drive -
# no mount step is visible in this notebook.
!cp train_curated_mel128.h5 "/content/drive/My Drive/"
!cp train_noisy_mel128.h5 "/content/drive/My Drive/"
!cp test_mel128.h5 "/content/drive/My Drive/"

cp: cannot stat 'mel_dataset/train_curated_mel.zip': No such file or directory
cp: cannot stat 'mel_dataset/train_noisy_mel.zip': No such file or directory
cp: cannot stat 'mel_dataset/test_mel.zip': No such file or directory
CPU times: user 12.1 ms, sys: 21.5 ms, total: 33.6 ms
Wall time: 361 ms


# 4. Create Kaggle dataset

In [None]:
# Gather all data into a single folder
!mkdir -p mel_dataset
!mv train_curated_mel.zip test_mel.zip train_noisy_mel.zip mel_dataset

In [None]:
%%time

# Define metadata
!kaggle datasets init -p mel_dataset
meta = {}
meta["title"] = "Freesound Audio Tagging 2019 - Mel128"
meta["subtitle"] = "Mel spectrograms (n_mels=128) from the Freesound Audio Tagging 2019 Dataset."
meta["licenses"] = [{"name": "CC0-1.0"}]
meta["id"] = "obione26/mel128"
meta["keywords"] = ["audio", "classification", "melspectrogram", "features", "tagging"]

with open('mel_dataset/dataset-metadata.json', 'w', encoding='utf-8') as f:
    json.dump(meta, f, ensure_ascii=False, indent=4)

# Send data to kaggle (~10mn)
!kaggle datasets create -p mel_dataset

Data package template written to: mel_dataset/dataset-metadata.json
Starting upload for file test_mel.zip
100% 1.92G/1.92G [01:03<00:00, 32.5MB/s]
Upload successful: test_mel.zip (2GB)
Starting upload for file train_curated_mel.zip
100% 2.05G/2.05G [01:14<00:00, 29.8MB/s]
Upload successful: train_curated_mel.zip (2GB)
Starting upload for file train_noisy_mel.zip
100% 15.9G/15.9G [09:03<00:00, 31.4MB/s]
Upload successful: train_noisy_mel.zip (16GB)
Dataset creation error: Cannot update dataset usability rating without a current dataset version
CPU times: user 4.49 s, sys: 1.37 s, total: 5.86 s
Wall time: 11min 25s
