# Is it a banger? - Make your own dataset

### TODO

Discuss folder structure, `split_files.sh` script, duration of each clip, `p_include`

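#### Folder structure

Put the audio for each label in its own sub-directory of `../data`; the sub-directory name is used as the label.
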
```
../data
├── label_1
├── label_2
├──    ·
├──    ·
├──    ·
└── label_k
```

For the banger / not-a-banger example used in this notebook (**TODO: add link**), we simply have

```
../data
├── banger
└── not_a_banger
```

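#### File splitting

The script below finds every `.mp3` under `../data`, uses `ffmpeg` to cut it into `SEGMENT_TIME`-second segments written next to the original as `<name>NNN.wav`, deletes the original `.mp3`, and finally removes one segment per track with the aim of leaving only clips of uniform length.

**TODO:** update this listing if the script file changes.
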
```bash
#!/bin/bash
#
# Split every .mp3 under DATA_ROOT_DIR into fixed-length clips.
#
# For each source file `<name>.mp3` this writes `<name>000.wav`,
# `<name>001.wav`, ... next to it, then deletes the source file and the
# highest-numbered segment (the only one that can be shorter than
# SEGMENT_TIME) so that the remaining clips all have the same duration.
#
# Assumes fewer than 1000 segments per track, i.e. the %03d counter never
# grows beyond three digits.

SEGMENT_TIME=5 # in seconds
DATA_ROOT_DIR="../data"

# globstar lets ** recurse into the label sub-directories; nullglob makes a
# pattern with no matches expand to nothing instead of to the literal pattern.
shopt -s globstar nullglob

for FILE in "${DATA_ROOT_DIR}"/**/*.mp3
do
    echo "Processing ${FILE}"
    BASE="${FILE%.*}"

    # Keep the source file when ffmpeg fails so that no audio is lost.
    if ! ffmpeg -i "${FILE}" -f segment -segment_time "${SEGMENT_TIME}" -c copy "${BASE}"%03d.wav
    then
        echo "ffmpeg failed for ${FILE}; leaving it untouched" >&2
        continue
    fi
    rm "${FILE}"

    # Glob results are sorted by name, so the last entry is the
    # highest-numbered segment. Matching exactly three digits keeps segments
    # of another track that merely shares this prefix out of the list.
    SEGMENTS=("${BASE}"[0-9][0-9][0-9].wav)
    if (( ${#SEGMENTS[@]} > 0 ))
    then
        rm "${SEGMENTS[${#SEGMENTS[@]}-1]}" # remove last file so uniform length
    fi
done
```

In [141]:
import glob
import os
import librosa
import numpy as np
import pandas as pd

In [158]:
# Root folder that holds one sub-directory per label.
parent_dir = '../data'
parent_dir_contents = [os.path.join(parent_dir, dirname) for dirname in os.listdir(parent_dir)]
# Keep only directories. Sorted because os.listdir returns entries in
# arbitrary order, and the order here fixes the position of each label in the
# one-hot encoding.
sub_dirs = sorted(path for path in parent_dir_contents if os.path.isdir(path))
# The label of a sub-directory is its path relative to parent_dir.
labels_list = [os.path.relpath(path, parent_dir) for path in sub_dirs]

In [161]:
def extract_features(file_name, sample_rate=22050):
    """Load an audio file and compute its log-power spectrogram.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed to ``librosa.load``.
    sample_rate : int, optional
        Value passed to ``librosa.load`` as ``sr``.

    Returns
    -------
    dict
        ``"audio"`` holds the samples returned by ``librosa.load``;
        ``"log_specgram"`` holds the power spectrogram (``|STFT|**2``)
        converted to a log scale relative to its own maximum.
    """
    # The rate returned by librosa.load is not needed afterwards.
    audio, _ = librosa.load(file_name, sr=sample_rate)
    power_specgram = np.abs(librosa.stft(audio)) ** 2
    # librosa.logamplitude was replaced by power_to_db and later removed;
    # prefer the new name and fall back only on old installations.
    if hasattr(librosa, "power_to_db"):
        log_specgram = librosa.power_to_db(power_specgram, ref=np.max)
    else:
        log_specgram = librosa.logamplitude(power_specgram, ref_power=np.max)
    features = {"audio": audio, "log_specgram": log_specgram}
    return features

def one_hot_encode(label, labels_list):
    """Return a float vector marking where ``label`` occurs in ``labels_list``.

    The result has one entry per element of ``labels_list``: 1 at every
    position whose element equals ``label`` and 0 everywhere else. A label
    that does not occur yields an all-zero vector.
    """
    hit_positions = [
        position
        for position, candidate in enumerate(labels_list)
        if label == candidate
    ]
    encoding = np.zeros(len(labels_list))
    # An explicit integer dtype keeps the index valid when there are no hits.
    encoding[np.asarray(hit_positions, dtype=int)] = 1
    return encoding

def trim_file_list(fnames_list, p_include=1.0, seed=None):
    """Randomly keep each file name with probability ``p_include``.

    Every entry is kept or dropped independently, so the number of names
    returned varies from call to call.

    Parameters
    ----------
    fnames_list : sequence of str
        Candidate file names.
    p_include : float, optional
        Probability of keeping each name. With 1.0 every name is kept; with
        0.0 none is.
    seed : int, optional
        When given, the draws come from ``np.random.RandomState(seed)`` so
        the selection is repeatable. When ``None`` (default) the global
        NumPy random state is used.

    Returns
    -------
    numpy.ndarray
        The names that were kept, in their original order.
    """
    fnames_list = np.asarray(fnames_list)
    if seed is None:
        draws = np.random.rand(*fnames_list.shape)
    else:
        draws = np.random.RandomState(seed).rand(*fnames_list.shape)
    # Draws lie in [0, 1), so comparing against p_include keeps that fraction
    # on average.
    return fnames_list[draws < p_include]
    

def parse_audio_files(parent_dir, sub_dirs_list, labels_list, file_ext='*.wav', p_include=1.0):
    """Build a DataFrame of audio features for the files in each label folder.

    Parameters
    ----------
    parent_dir : str
        Not used by this function; kept so existing calls keep working.
    sub_dirs_list : sequence of str
        One directory per label, in the same order as ``labels_list``.
    labels_list : sequence of str
        Label for each entry of ``sub_dirs_list``.
    file_ext : str, optional
        Glob pattern, relative to each sub-directory, selecting the files to
        read.
    p_include : float, optional
        Probability of keeping each matching file, passed to
        ``trim_file_list``.

    Returns
    -------
    pandas.DataFrame
        One row per processed file, indexed by the file's base name. Columns
        are the entries returned by ``extract_features`` plus ``label`` and
        ``label_one_hot``.
    """
    data = []
    index = []
    for label_idx, sub_dir in enumerate(sub_dirs_list):
        label = labels_list[label_idx]
        # Honour file_ext (the pattern used to be hard-coded) and sort the
        # matches because glob returns them in arbitrary order.
        fnames_list = sorted(glob.glob(os.path.join(sub_dir, file_ext)))
        fnames_list = trim_file_list(fnames_list, p_include=p_include)
        for fname in fnames_list:
            print("Processing " + os.path.basename(fname))
            features = extract_features(fname)
            features['label'] = label
            # Encoded once per row so that no two rows share an array.
            features["label_one_hot"] = one_hot_encode(label, labels_list)
            data.append(features)
            index.append(os.path.basename(fname))
    return pd.DataFrame(data, index=index)

In [164]:
# Build the dataset from a random ~10% of the clips, shuffle the rows, and
# store the result next to the audio folders. No random seed is set here, so
# both the sample and the row order differ between runs.
dataset_path = os.path.join(parent_dir, 'processed_dataset.pkl')
df = parse_audio_files(parent_dir, sub_dirs, labels_list, p_include=0.1)
shuffled_order = np.random.permutation(len(df))
df = df.iloc[shuffled_order]
df.to_pickle(dataset_path)

Processing 808 State - In Yer Face (Bicep Remix)006.wav
Processing 808 State - In Yer Face (Bicep Remix)008.wav
Processing 808 State - In Yer Face (Bicep Remix)022.wav
Processing 808 State - In Yer Face (Bicep Remix)036.wav
Processing 808 State - In Yer Face (Bicep Remix)041.wav
Processing 808 State - In Yer Face (Bicep Remix)043.wav
Processing 808 State - In Yer Face (Bicep Remix)046.wav
Processing 808 State - In Yer Face (Bicep Remix)048.wav
Processing 808 State - In Yer Face (Bicep Remix)060.wav
Processing 808 State - In Yer Face (Bicep Remix)075.wav
Processing 808 State - In Yer Face (Bicep Remix)088.wav
Processing 808 State - In Yer Face (Bicep Remix)089.wav
Processing Pacific_State_Will_Bailey_Remix006.wav
Processing Pacific_State_Will_Bailey_Remix010.wav
Processing Pacific_State_Will_Bailey_Remix022.wav
Processing Pacific_State_Will_Bailey_Remix024.wav
Processing Pacific_State_Will_Bailey_Remix028.wav
Processing Pacific_State_Will_Bailey_Remix034.wav
Processing Pacific_State_Wil

In [166]:
# Preview the first ten rows of the shuffled dataset.
display(df.head(10))

Unnamed: 0,audio,label,label_one_hot,log_specgram
Twinkle Twinkle Little Star004.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[0.0, 1.0]","[[-80.0, -54.7389, -54.4667, -68.6597, -63.849..."
Twinkle Twinkle Little Star018.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[0.0, 1.0]","[[-68.1095, -57.2034, -51.3898, -52.9167, -58...."
Twinkle Twinkle Little Star003.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[0.0, 1.0]","[[-80.0, -57.1564, -54.7311, -68.209, -68.4011..."
808 State - In Yer Face (Bicep Remix)008.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[1.0, 0.0]","[[-41.8047, -41.0589, -64.2355, -47.7573, -42...."
Pacific_State_Will_Bailey_Remix034.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[1.0, 0.0]","[[-35.1649, -28.6896, -31.3873, -56.1729, -67...."
Twinkle Twinkle Little Star005.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[0.0, 1.0]","[[-80.0, -62.9963, -59.4128, -75.3882, -64.472..."
Pacific_State_Will_Bailey_Remix047.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[1.0, 0.0]","[[-54.9241, -63.8443, -61.8939, -74.689, -78.2..."
Twinkle Twinkle Little Star014.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[0.0, 1.0]","[[-62.2799, -38.1733, -36.6015, -46.6677, -53...."
808 State - In Yer Face (Bicep Remix)089.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[1.0, 0.0]","[[-41.4646, -37.1323, -34.7023, -40.0474, -52...."
808 State - In Yer Face (Bicep Remix)006.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[1.0, 0.0]","[[-64.5562, -62.4264, -69.4593, -80.0, -70.112..."
