# Is it a banger? - Make your own dataset

### TODO

Discuss folder structure, `split_files.sh` script, duration of each clip, `p_include`

#### Folder structure
```
data
├── label_1
├── label_2
├──    ·
├──    ·
├──    ·
└── label_k
```

For the given example **Need link here eventually**, we simply have

```
data
├── banger
└── not_a_banger
```

#### File splitting - EXPLAIN WHAT THIS DOES, UPDATE IF FILE CHANGED
```bash
#!/bin/bash

SEGMENT_TIME=5 # in seconds
DATA_ROOT_DIR="../data"

DIRS=$(find "${DATA_ROOT_DIR}" -maxdepth 1 -mindepth 1 -type d)

# Make sure globstar is enabled
shopt -s globstar

for FILE in "${DATA_ROOT_DIR}"/**/*.mp3
do 
    echo "Processing ${FILE}"
    ffmpeg -i "${FILE}" -f segment -segment_time ${SEGMENT_TIME} -c copy "${FILE%.*}"%03d.wav                
    rm "${FILE}"
    rm "$(ls -t "${FILE%.*}"*.wav | tail -n 1)" # remove last file so uniform length
done
```

In [9]:
import os
import glob
import librosa
import numpy as np
np.random.seed(1234)
import pandas as pd

In [10]:
parent_dir = '../data'
parent_dir_contents = [os.path.join(parent_dir, dirname) for dirname in os.listdir(parent_dir)]
sub_dirs = [filename if os.path.isdir(filename) else None for filename in parent_dir_contents]
sub_dirs = list(filter(None.__ne__, sub_dirs))
labels_list = [os.path.relpath(path, parent_dir) for path in sub_dirs]

In [11]:
def extract_features(file_name, sample_rate=22050, segment_time=1, samples_to_clip=500):
    audio, sample_rate = librosa.load(file_name, sr=sample_rate)
    end_idx = sample_rate * segment_time - samples_to_clip # remove some end samples as not strictly uniform size
    audio = audio[0:end_idx]
    log_specgram = librosa.logamplitude(np.abs(librosa.stft(audio))**2, ref_power=np.max)
    features = {"audio": audio, "log_specgram": log_specgram}
    return features

def one_hot_encode(label, labels_list):
    n_labels = len(labels_list)
    one_hot_encoded = np.zeros(n_labels)
    for idx, cmp in enumerate(labels_list):
        if label == cmp:
            one_hot_encoded[idx] = 1                     
    return one_hot_encoded

def trim_file_list(fnames_list, p_include=1.0):
    fnames_list = np.asarray(fnames_list)
    include = np.random.rand(*fnames_list.shape)
    fnames_list = fnames_list[include < p_include]
    return fnames_list
    

def parse_audio_files(parent_dir, sub_dirs_list, labels_list, file_ext='*.wav', p_include=1.0,\
                      sample_rate=22050, segment_time=1, samples_to_clip=500):
    data = []
    index = []
    for label_idx, sub_dir in enumerate(sub_dirs_list):
        fnames_list = glob.glob(os.path.join(sub_dir, "*.wav"))
        fnames_list = trim_file_list(fnames_list, p_include=p_include)
        for fname in fnames_list:
            print("Processing " + os.path.basename(fname))
            features = extract_features(fname)
            label = labels_list[label_idx]
            label_one_hot = one_hot_encode(label, labels_list)
            features['label'] = label
            features["label_one_hot"] = label_one_hot
            data.append(features)
            index.append(os.path.basename(fname))
    return pd.DataFrame(data, index=index)

In [12]:
df = parse_audio_files(parent_dir, sub_dirs, labels_list, p_include=0.2, segment_time=5)
df = df.iloc[np.random.permutation(len(df))] # shuffle rows
df.to_pickle(os.path.join(parent_dir, 'processed_dataset.pkl'))

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1176.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1348.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_955.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_821.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_404.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_362.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_389.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_377.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1003.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_834.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_954.wav
Processing Cliff Richard - Gr

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1261.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1275.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_920.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_707.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1315.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_288.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_273.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_529.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1113.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_717.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_065.wav
Processing Cliff Richard - G

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_291.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_044.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_642.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_871.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_468.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_441.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_864.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1331.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_045.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_737.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1319.wav
Processing Cliff Richard - Gre

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1226.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_146.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_620.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_967.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_783.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_797.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1346.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_032.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_218.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1150.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_556.wav
Processing Cliff Richard - Gr

Processing Best Of 2017 Tech House Yearmix_402.wav
Processing Premiere - Anthony Attalla & Dqwon - Thick (Pirupa Remix) [Lapsus Music]_037.wav
Processing Best Of 2017 Tech House Yearmix_416.wav
Processing Carl Cox - I Want You (Forever) - Josh Butler Remix_084.wav
Processing Best Of 2017 Tech House Yearmix_562.wav
Processing MK - 17 (CamelPhat Dub) [Audio]_074.wav
Processing Claptone - The Music Got Me (Darius Syrossian Remix)_016.wav
Processing Best Of 2017 Tech House Yearmix_238.wav
Processing 50 Cent - Just A Lil Bit (Paradox City Edit) (Free Download)_060.wav
Processing Audiophonik - Pass Me (Raffa FL Remix)_033.wav
Processing 50 Cent - Just A Lil Bit (Paradox City Edit) (Free Download)_048.wav
Processing Homework - The Street_047.wav
Processing Homework - The Street_090.wav
Processing Eats Everything Ft. Green Velvet - The Duster_045.wav
Processing KlangKuenstler - Dance With Me (Raffa FL Remix)_066.wav
Processing Homework - The Street_085.wav
Processing TECH-HOUSE - Max Chapman -

Processing Best Of 2017 Tech House Yearmix_388.wav
Processing Best Of 2017 Tech House Yearmix_607.wav
Processing Best Of 2017 Tech House Yearmix_175.wav
Processing Premiere - L.O.R.D.I.E - Alpha (Darius Syrossian Remix) [Griffintown Records]_045.wav
Processing Low Steppa - Vocal Loop (Premiere)_036.wav
Processing CamelPhat - Hangin' Out With Charlie (Original Mix)_033.wav
Processing Best Of 2017 Tech House Yearmix_149.wav
Processing Premiere - Kinnerman - Dominator [Elrow Music]_060.wav
Processing Groove Armada - House With Me (Andrea Oliva Remix)_059.wav
Processing Technasia - I Am Somebody (Original Mix)_065.wav
Processing Hosse - Ready For This (Raffa FL Remix)_024.wav
Processing Stardust - Music Sounds Better (Mistrix Dub) (Free Download)_078.wav
Processing CamelPhat - Drop It_025.wav
Processing Deeper Purpose - Pubs N Parties (Free Download)_039.wav
Processing Shiba San & Green Velvet - Chance_077.wav
Processing CamelPhat - Drop It_042.wav
Processing Technasia - I Am Somebody (Ori

Processing Stardust - Music Sounds Better (Mistrix Dub) (Free Download)_019.wav
Processing Shiba San & Green Velvet - Chance_071.wav
Processing Groove Armada - House With Me (Andrea Oliva Remix)_038.wav
Processing Caal & Baum - This Story (Raffa FL Remix)_053.wav
Processing Luigi Rocca, Federico Buratti - Secret Bass (Knober, Sylter Remix)_034.wav
Processing FISHER - Ya Kidding_026.wav
Processing Premiere - L.O.R.D.I.E - Alpha (Darius Syrossian Remix) [Griffintown Records]_030.wav
Processing Deeper Purpose - Breathe & Stop (Free Download)_073.wav
Processing Best Of 2017 Tech House Yearmix_672.wav
Processing Premiere - Pete Zorba - She Got Back [Lapsus Music]_017.wav
Processing The Shapeshifters - Lola's Theme (Mistrix Dub) (Free Download)_013.wav
Processing Route 94 - House & Pressure_015.wav
Processing Premiere - Anthony Attalla & Dqwon - Thick (Pirupa Remix) [Lapsus Music]_045.wav
Processing Best Of 2017 Tech House Yearmix_289.wav
Processing Thick Dick - Welcome To The Jungle (Andrea

KeyboardInterrupt: 

In [9]:
display(df[:10])

Unnamed: 0,audio,label,label_one_hot,log_specgram
The Lumineers - Ain't Nobody's Problem010.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -34.0311, -15.0564, -15.9555, -21.711..."
The Lumineers - Angela004.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -48.4892, -19.4601, -19.1463, -42.978..."
Imagine Dragons - Radioactive029.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-74.7518, -42.5359, -33.4131, -39.1166, -28...."
Fun. - Some Nights [OFFICIAL VIDEO]057.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -27.6447, -19.1384, -22.9659, -39.575..."
P!nk - Just Give Me A Reason ft. Nate Ruess003.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -71.0971, -45.9859, -73.1597, -35.874..."
Mumford & Sons - I Will Wait014.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -75.9527, -34.2853, -41.4846, -42.21,..."
Selected New Year Mix196.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-66.5509, -46.4885, -46.7563, -47.9261, -42...."
The Lumineers - Dead Sea006.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -80.0, -72.3932, -67.8201, -64.4327, ..."
The Lumineers - Big Parade005.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-67.1488, -59.6014, -37.0821, -27.2378, -37...."
Selected New Year Mix186.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-69.115, -37.6017, -29.8442, -33.0671, -47.0..."
