# Is it a banger? - Make your own dataset

### TODO

Discuss folder structure, `split_files.sh` script, duration of each clip, `p_include`

#### Folder structure
```
data
├── label_1
├── label_2
├──    ·
├──    ·
├──    ·
└── label_k
```

For the given example (**TODO: add a link here**), we simply have:

```
data
├── banger
└── not_a_banger
```

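#### File splitting

The script below (`split_files.sh`) walks `../data` recursively and uses `ffmpeg` to cut every `.mp3` into `SEGMENT_TIME`-second clips (5 s here), written next to the source file. It then deletes the source file and one clip per track; the intent is to drop the final clip, which is usually shorter, so that the remaining clips have a uniform length.

**TODO:** explain the script in more detail, and update this section if the script changes.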
```bash
#!/bin/bash
#
# Split every MP3 below DATA_ROOT_DIR into SEGMENT_TIME-second clips.
#
# For each source file "<name>.mp3" the clips are written next to it as
# "<name>NNN.wav". After a successful split the source file is deleted, and
# the highest-numbered clip is removed as well because the final clip of a
# track is usually shorter than SEGMENT_TIME.
#
# Requires: bash >= 4 (globstar) and ffmpeg on PATH.

set -euo pipefail

readonly SEGMENT_TIME=5 # in seconds
readonly DATA_ROOT_DIR="../data"

# globstar lets ** descend into the label sub-directories; nullglob makes the
# loops below run zero times (instead of once on the literal pattern) when
# nothing matches.
shopt -s globstar nullglob

#######################################
# Split one audio file into clips and drop its final (partial) clip.
# Globals:   SEGMENT_TIME (read)
# Arguments: $1 - path of the source audio file
# Outputs:   progress line on stdout, diagnostics on stderr
# Returns:   0 on success, 1 if ffmpeg failed (the source file is then kept)
#######################################
split_file() {
    local src="$1"
    local stem="${src%.*}"
    local seg idx
    local last_seg=""
    local last_idx=-1

    echo "Processing ${src}"
    # -c copy passes the audio stream through without re-encoding.
    if ! ffmpeg -i "${src}" -f segment -segment_time "${SEGMENT_TIME}" -c copy "${stem}%03d.wav"; then
        echo "ffmpeg failed for ${src}; keeping the original file" >&2
        return 1
    fi
    rm -- "${src}"

    # Pick the clip with the numerically highest index. Comparing numbers
    # rather than names or timestamps stays correct once %03d overflows to
    # four digits ("1000" sorts before "999" as text).
    for seg in "${stem}"[0-9][0-9][0-9]*.wav; do
        idx="${seg#"${stem}"}"
        idx="${idx%.wav}"
        [[ "${idx}" =~ ^[0-9]+$ ]] || continue
        # 10# forces base 10 so zero-padded indices are not read as octal.
        if (( 10#${idx} > last_idx )); then
            last_idx=$(( 10#${idx} ))
            last_seg="${seg}"
        fi
    done

    if [[ -n "${last_seg}" ]]; then
        rm -- "${last_seg}" # remove last clip so the rest have uniform length
    fi
}

#######################################
# Split every MP3 found under DATA_ROOT_DIR.
# Globals:   DATA_ROOT_DIR (read)
# Returns:   0 if every file was split, 1 if any file failed
#######################################
main() {
    local file
    local status=0

    for file in "${DATA_ROOT_DIR}"/**/*.mp3; do
        split_file "${file}" || status=1
    done
    return "${status}"
}

main "$@"
```

In [9]:
import os
import glob
import librosa
import numpy as np
np.random.seed(1234)
import pandas as pd

In [13]:
# Root folder that holds one sub-directory per label.
parent_dir = '../data'
parent_dir_contents = [os.path.join(parent_dir, dirname) for dirname in os.listdir(parent_dir)]
# Keep only the directories. Filtering on isdir directly avoids the old
# filter(None.__ne__, ...) idiom, which relied on NotImplemented being truthy
# (deprecated since Python 3.9, a TypeError from 3.14).
sub_dirs = [path for path in parent_dir_contents if os.path.isdir(path)]
# Each label is the directory name relative to parent_dir, in the same order as sub_dirs.
labels_list = [os.path.relpath(path, parent_dir) for path in sub_dirs]

In [27]:
def extract_features(file_name, sample_rate=22050, segment_time=5, samples_to_clip=500):
    """Load one audio clip and compute its log-power spectrogram.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed to ``librosa.load``.
    sample_rate : int
        Sampling rate requested from ``librosa.load`` (its ``sr`` argument).
    segment_time : int or float
        Nominal clip duration in seconds.
    samples_to_clip : int
        Number of samples removed from the nominal clip length
        (``sample_rate * segment_time``) before truncating the waveform.

    Returns
    -------
    dict
        ``"audio"``: the waveform truncated to
        ``sample_rate * segment_time - samples_to_clip`` samples;
        ``"log_specgram"``: the squared STFT magnitude of that waveform on a
        log scale, referenced to its maximum.
    """
    audio, sample_rate = librosa.load(file_name, sr=sample_rate)
    # The clips are not strictly uniform in size, so cut every waveform to the
    # same length, a little shorter than nominal. int() keeps the slice index
    # valid when segment_time is a float.
    end_idx = int(sample_rate * segment_time) - samples_to_clip
    audio = audio[0:end_idx]
    power_specgram = np.abs(librosa.stft(audio)) ** 2
    # librosa.logamplitude was removed in librosa 0.6; power_to_db is its
    # replacement. Fall back to the old name on installations that lack it.
    if hasattr(librosa, "power_to_db"):
        log_specgram = librosa.power_to_db(power_specgram, ref=np.max)
    else:
        log_specgram = librosa.logamplitude(power_specgram, ref_power=np.max)
    features = {"audio": audio, "log_specgram": log_specgram}
    return features

def one_hot_encode(label, labels_list):
    """Return a float vector marking where ``label`` occurs in ``labels_list``.

    The vector has one entry per element of ``labels_list``: 1.0 at every
    position whose entry equals ``label`` and 0.0 everywhere else. A label
    that is not in the list therefore produces an all-zero vector.
    """
    flags = [1.0 if label == candidate else 0.0 for candidate in labels_list]
    return np.array(flags, dtype=float)

def trim_file_list(fnames_list, p_include=1.0):
    """Randomly keep a subset of the given file names.

    One uniform number in [0, 1) is drawn from NumPy's global random state per
    entry, and an entry is kept when its draw is below ``p_include``. The
    survivors are returned as a NumPy array in their original order.
    """
    candidates = np.asarray(fnames_list)
    keep_mask = np.random.rand(*candidates.shape) < p_include
    return candidates[keep_mask]
    

def parse_audio_files(parent_dir, sub_dirs_list, labels_list, file_ext='*.wav', p_include=1.0,
                      sample_rate=22050, segment_time=5, samples_to_clip=500):
    """Build a DataFrame of features and labels for the clips in each label folder.

    For every directory in ``sub_dirs_list`` the files matching ``file_ext``
    are listed, randomly thinned with ``trim_file_list`` (``p_include``) and
    passed to ``extract_features``. The label for a directory is the entry of
    ``labels_list`` at the same position.

    ``parent_dir`` is accepted but not used by this function.

    Returns a ``pandas.DataFrame`` indexed by file base name, with one row per
    processed clip holding the extracted features plus the ``label`` and
    ``label_one_hot`` columns.
    """
    rows = []
    row_names = []
    for position, folder in enumerate(sub_dirs_list):
        pattern = os.path.join(folder, file_ext)
        selected = trim_file_list(glob.glob(pattern), p_include=p_include)
        for path in selected:
            clip_name = os.path.basename(path)
            print("Processing " + clip_name)
            row = extract_features(path, segment_time=segment_time,
                                   sample_rate=sample_rate, samples_to_clip=samples_to_clip)
            label = labels_list[position]
            row['label'] = label
            row["label_one_hot"] = one_hot_encode(label, labels_list)
            rows.append(row)
            row_names.append(clip_name)
    return pd.DataFrame(rows, index=row_names)

In [28]:
# Extract features from a random ~10% of the clips (p_include=0.1).
df = parse_audio_files(parent_dir, sub_dirs, labels_list,
                       p_include=0.1, segment_time=5, samples_to_clip=1100)
# Shuffle the rows.
shuffled_order = np.random.permutation(len(df))
df = df.iloc[shuffled_order]
# Save the dataset next to the label folders.
output_path = os.path.join(parent_dir, 'processed_dataset.pkl')
df.to_pickle(output_path)

Processing Sleep On The Floor (LYRICS) - The Lumineers_0015.wav
Processing Enya   The Best Of Enya FULL ALBUM_0465.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1374.wav
Processing Enya   The Best Of Enya FULL ALBUM_0115.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0903.wav
Processing John Newman - Love Me Again_0033.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0308.wav
Processing Enya   The Best Of Enya FULL ALBUM_0505.wav
Processing The Lumineers - Big Parade_0019.wav
Processing Fun. - Some Nights [OFFICIAL VIDEO]_0034.wav
Processing John Legend - All of Me (Edited Video)_0000.wav
Processing Ed Sheeran - Give Me Love [Official Video]_0017.wav
Processing The Lumineers - Darlene [Lyrics in description]_0012.wav
Processing John Legend - All of Me (Edited Video)_0014.wav
Processing Enya   The Best Of Enya FULL ALBUM_0276.wav
Processing Enya   The Best Of Enya FULL ALBUM_051

Processing The Lumineers - In The Light [Lyrics]_0017.wav
Processing Passenger _ Let Her Go (Official Video)_0018.wav
Processing Scotland - The Lumineers (Lyrics)_0003.wav
Processing Avicii - Wake Me Up (Official Video)_0019.wav
Processing The Lumineers - 'Stubborn Love' (Official Video)_0027.wav
Processing George Ezra - Budapest (Official Video)_0023.wav
Processing The Lumineers - 'Stubborn Love' (Official Video)_0032.wav
Processing Fun. - Some Nights [OFFICIAL VIDEO]_0042.wav
Processing Scotland - The Lumineers (Lyrics)_0002.wav
Processing Imagine Dragons - Demons (Official)_0037.wav
Processing John Newman - Love Me Again_0045.wav
Processing The Lumineers - In The Light [Lyrics]_0016.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0140.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0785.wav
Processing The Lumineers   This Must Be The Place_0009.wav
Processing The Lumineers   This Must Be The Place_0021.wav
Proce

Processing John Legend - All of Me (Edited Video)_0043.wav
Processing Enya   The Best Of Enya FULL ALBUM_0584.wav
Processing The Lumineers - Blue Christmas_0016.wav
Processing Passenger _ Let Her Go (Official Video)_0010.wav
Processing The Lumineers - Flowers in your hair ( lyrics )_0012.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0808.wav
Processing Imagine Dragons - It's Time_0038.wav
Processing The Lumineers - Ain't Nobody's Problem_0026.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1243.wav
Processing The Lumineers - Morning Song_0048.wav
Processing The Lumineers - Flapper Girl_0018.wav
Processing Enya   The Best Of Enya FULL ALBUM_0618.wav
Processing Phillip Phillips - Home_0025.wav
Processing The Lumineers - My Eyes [Lyrics]_0027.wav
Processing Fun. - We Are Young ft. Janelle Monáe [OFFICIAL VIDEO]_0025.wav
Processing Enya   The Best Of Enya FULL ALBUM_0368.wav
Processing Bruno Mars - Locked Out Of Heav

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1432.wav
Processing Enya   The Best Of Enya FULL ALBUM_0492.wav
Processing The Lumineers - Slow It Down (Live on KEXP)_0049.wav
Processing Fun. - We Are Young ft. Janelle Monáe [OFFICIAL VIDEO]_0046.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0538.wav
Processing Enya   The Best Of Enya FULL ALBUM_0490.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0060.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0712.wav
Processing Enya   The Best Of Enya FULL ALBUM_0137.wav
Processing The Lumineers - Morning Song_0001.wav
Processing Capital Cities - Safe And Sound (Official Video)_0004.wav
Processing The Lumineers - Charlie Boy_0008.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0841.wav
Processing John Newman - Love Me Again_0039.wav
Processing The Lumineers - Angela_003

Processing Dennis Cruz - Rock & Roll (Original Mix)_0030.wav
Processing Best Of 2017 Tech House Yearmix_0196.wav
Processing Dennis Cruz - Rock & Roll (Original Mix)_0024.wav
Processing Deeper Purpose - Breathe & Stop (Free Download)_0028.wav
Processing Deeper Purpose - Breathe & Stop (Free Download)_0014.wav
Processing Technasia - I Am Somebody (Original Mix)_0003.wav
Processing Best Of 2017 Tech House Yearmix_0237.wav
Processing Selected New Year Mix_0305.wav
Processing Best Of 2017 Tech House Yearmix_0592.wav
Processing Patrick Topping - Be Sharp Say Nowt_0080.wav
Processing Eats Everything Ft. Green Velvet - The Duster_0012.wav
Processing Premiere - De La Swing - Creeper [Elrow Music]_0067.wav
Processing Premiere - Marco Strous - That's My Jam [Cr2 Records]_0013.wav
Processing Premiere - Anthony Attalla & Dqwon - Thick (Pirupa Remix) [Lapsus Music]_0071.wav
Processing Selected New Year Mix_0315.wav
Processing Carl Cox - I Want You (Forever) - Josh Butler Remix_0067.wav
Processing Si

Processing Premiere - De La Swing - Creeper [Elrow Music]_0017.wav
Processing Selected New Year Mix_0177.wav
Processing CamelPhat - Drop It_0006.wav
Processing Premiere - Pete Zorba - She Got Back [Lapsus Music]_0082.wav
Processing TECH-HOUSE - Max Chapman - La Fiesta [Hot Creations]_0035.wav
Processing Best Of 2017 Tech House Yearmix_0521.wav
Processing Best Of 2017 Tech House Yearmix_0253.wav
Processing Best Of 2017 Tech House Yearmix_0535.wav
Processing CamelPhat - Hangin' Out With Charlie (Original Mix)_0003.wav
Processing Technasia - I Am Somebody (Original Mix)_0073.wav
Processing Best Of 2017 Tech House Yearmix_0441.wav
Processing The Tribe Of Good - Loving You Baby (Weiss Remix)_0042.wav
Processing Homework - The Street_0026.wav
Processing Jay Lumen & Wade - Dirty Groove_0063.wav
Processing Dennis Cruz - Rock & Roll (Original Mix)_0040.wav
Processing 50 Cent - Just A Lil Bit (Paradox City Edit) (Free Download)_0002.wav
Processing Low Steppa - Vocal Loop (Premiere)_0057.wav
Proc

Processing Premiere - Raumakustik - Can't Get Enough [Toolroom Records]_0058.wav
Processing Best Of 2017 Tech House Yearmix_0474.wav
Processing Deeper Purpose - Breathe & Stop (Free Download)_0051.wav
Processing Jay Lumen & Wade - Dirty Groove_0056.wav
Processing Jay Lumen & Wade - Dirty Groove_0042.wav
Processing Hosse - Ready For This (Raffa FL Remix)_0074.wav
Processing Format B - The Scoop (Original Mix)_0049.wav
Processing Dennis Cruz - Rock & Roll (Original Mix)_0074.wav
Processing Best Of 2017 Tech House Yearmix_0461.wav
Processing CamelPhat - Hangin' Out With Charlie (Original Mix)_0037.wav
Processing Patrick Topping - Be Sharp Say Nowt_0007.wav
Processing Audiophonik - Pass Me (Raffa FL Remix)_0027.wav
Processing Stardust - Music Sounds Better (Mistrix Dub) (Free Download)_0052.wav
Processing Gorgon City - Grooves On The Vinyl (Official Video)_0039.wav
Processing Premiere - De La Swing - Creeper [Elrow Music]_0023.wav
Processing Gorgon City - Grooves On The Vinyl (Official Vid

In [25]:
# Preview the first ten rows of the dataset.
display(df.head(10))

Unnamed: 0,audio,label,label_one_hot,log_specgram
Deeper Purpose - Breathe & Stop (Free Download)_0010.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-80.0, -72.3026, -52.3942, -50.8262, -58.774..."
Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0112.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -80.0, -67.0795, -80.0, -53.5072, -60..."
Enya The Best Of Enya FULL ALBUM_0608.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -59.1552, -51.7495, -51.2771, -58.820..."
PAWSA - Groovy Cat_0057.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-80.0, -50.9362, -37.8347, -37.5939, -40.366..."
The Lumineers - Darlene [Lyrics in description]_0023.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -54.552, -39.0408, -33.541, -80.0, -4..."
Enya The Best Of Enya FULL ALBUM_0439.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -58.0433, -51.0994, -80.0, -64.7069, ..."
Best Of 2017 Tech House Yearmix_0419.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-80.0, -78.4149, -40.4557, -34.4459, -44.026..."
CamelPhat - Make 'Em Dance (Original Mix) [Suara]_0031.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-33.9356, -45.455, -41.9561, -53.6853, -51.4..."
Enya The Best Of Enya FULL ALBUM_0128.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -54.549, -44.0689, -36.7398, -57.6807..."
P!nk - Just Give Me A Reason ft. Nate Ruess_0022.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -80.0, -33.2159, -25.3378, -20.678, -..."
