# Is it a banger? - Make your own dataset

### TODO

Discuss folder structure, `split_files.sh` script, duration of each clip, `p_include`

#### Folder structure
```
data
├── label_1
├── label_2
├──    ·
├──    ·
├──    ·
└── label_k
```

For the given example (**TODO: add link**), we simply have:

```
data
├── banger
└── not_a_banger
```

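#### File splitting

`split_files.sh` finds every `.mp3` below `../data` (recursively), uses `ffmpeg` to cut it into `SEGMENT_TIME`-second (5 s) `.wav` segments written next to the original, deletes the original `.mp3`, and then removes one segment per track. The intent of that last step is to drop the final clip, which is usually shorter than `SEGMENT_TIME`, so that all remaining clips have a uniform length.

**TODO:** update this listing if `split_files.sh` changes.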
```bash
#!/bin/bash
#
# Split every .mp3 below DATA_ROOT_DIR into fixed-length .wav segments.
#
# For each track: ffmpeg writes numbered segments next to the source file,
# the source .mp3 is deleted once ffmpeg has succeeded, and the
# highest-numbered segment (normally shorter than SEGMENT_TIME) is removed so
# that the remaining clips share one length.

SEGMENT_TIME=5 # in seconds
DATA_ROOT_DIR="../data"

# globstar: let ** recurse into sub-directories.
# nullglob: a pattern with no matches expands to nothing instead of itself,
# so the loops below simply do not run when there is nothing to process.
shopt -s globstar nullglob

#######################################
# Print the path of the highest-numbered segment that exists for one track.
# Arguments: $1 - segment path prefix (the source path without its extension)
# Outputs:   the "<prefix><digits>.wav" path with the largest number
# Returns:   0 if a segment was found, 1 otherwise
#######################################
last_segment() {
    local prefix=$1
    local candidate suffix
    local best_path=""
    local best_index=-1

    for candidate in "${prefix}"[0-9]*.wav; do
        [[ -e "${candidate}" ]] || continue
        suffix=${candidate#"${prefix}"}
        suffix=${suffix%.wav}
        # Ignore names whose remainder is not purely a segment number.
        [[ "${suffix}" =~ ^[0-9]+$ ]] || continue
        # 10# forces base 10 so zero-padded numbers such as 008 are not
        # rejected as invalid octal.
        if (( 10#${suffix} > best_index )); then
            best_index=$(( 10#${suffix} ))
            best_path=${candidate}
        fi
    done

    [[ -n "${best_path}" ]] || return 1
    printf '%s\n' "${best_path}"
}

for FILE in "${DATA_ROOT_DIR}"/**/*.mp3; do
    echo "Processing ${FILE}"
    if ! ffmpeg -i "${FILE}" -f segment -segment_time "${SEGMENT_TIME}" -c copy "${FILE%.*}"%03d.wav; then
        # Keep the source when splitting failed; deleting it would lose the track.
        echo "ffmpeg failed for ${FILE}; leaving it in place" >&2
        continue
    fi
    rm -- "${FILE}"
    # Remove the last segment so the remaining clips have a uniform length.
    # The segment is chosen by its number rather than by modification time.
    if LAST_SEGMENT=$(last_segment "${FILE%.*}"); then
        rm -- "${LAST_SEGMENT}"
    fi
done
```

In [9]:
import os
import glob
import librosa
import numpy as np
np.random.seed(1234)
import pandas as pd

In [13]:
# Root folder of the dataset; each label has its own sub-folder inside it.
parent_dir = '../data'
# Every entry directly inside parent_dir, as a path that includes parent_dir.
parent_dir_contents = [os.path.join(parent_dir, dirname) for dirname in os.listdir(parent_dir)]
# Keep only the directories. Order follows os.listdir, which is arbitrary, so
# the position of each label (and its one-hot index) can differ between machines.
sub_dirs = [path for path in parent_dir_contents if os.path.isdir(path)]
# A label is the sub-folder's path relative to parent_dir, i.e. its name.
labels_list = [os.path.relpath(path, parent_dir) for path in sub_dirs]

In [19]:
def extract_features(file_name, sample_rate=22050, segment_time=5, samples_to_clip=500):
    """Load one audio clip and compute its log-power spectrogram.

    Parameters
    ----------
    file_name : str
        Path handed to ``librosa.load``.
    sample_rate : int
        Sampling rate requested from ``librosa.load`` (its ``sr`` argument).
    segment_time : int or float
        Nominal clip length in seconds.
    samples_to_clip : int
        Number of samples cut from the nominal clip length, because the
        clips on disk are not strictly uniform in size.

    Returns
    -------
    dict
        ``"audio"``: the loaded samples truncated to at most
        ``sample_rate * segment_time - samples_to_clip`` samples.
        ``"log_specgram"``: log-scaled power spectrogram of that audio,
        referenced to its maximum.
    """
    audio, sample_rate = librosa.load(file_name, sr=sample_rate)
    # int() keeps the slice bound valid when segment_time is fractional.
    end_idx = int(sample_rate * segment_time) - samples_to_clip
    audio = audio[0:end_idx]
    power_specgram = np.abs(librosa.stft(audio)) ** 2
    # librosa.logamplitude was removed in librosa 0.6; power_to_db is its
    # replacement. Fall back to the old name on installations that lack it.
    if hasattr(librosa, "power_to_db"):
        log_specgram = librosa.power_to_db(power_specgram, ref=np.max)
    else:
        log_specgram = librosa.logamplitude(power_specgram, ref_power=np.max)
    features = {"audio": audio, "log_specgram": log_specgram}
    return features

def one_hot_encode(label, labels_list):
    """Return a float vector with a 1 at every position where ``label`` occurs.

    The vector has one entry per element of ``labels_list``. Positions whose
    entry equals ``label`` are set to 1; all others stay 0, so a label that
    is not in the list yields an all-zero vector.
    """
    encoded = np.zeros(len(labels_list))
    matching = [position for position, candidate in enumerate(labels_list) if label == candidate]
    encoded[matching] = 1
    return encoded

def trim_file_list(fnames_list, p_include=1.0):
    """Randomly keep a subset of ``fnames_list``.

    One uniform number in [0, 1) is drawn per entry with ``np.random.rand``;
    an entry is kept when its draw is below ``p_include``. The result is a
    NumPy array holding the kept entries in their original order.
    """
    candidates = np.asarray(fnames_list)
    draws = np.random.rand(*candidates.shape)
    keep_mask = draws < p_include
    return candidates[keep_mask]
    

def parse_audio_files(parent_dir, sub_dirs_list, labels_list, file_ext='*.wav', p_include=1.0,
                      sample_rate=22050, segment_time=5, samples_to_clip=500):
    """Build a DataFrame of audio features for the clips in each label folder.

    Parameters
    ----------
    parent_dir : str
        Not used by this function; kept so existing callers keep working.
    sub_dirs_list : list of str
        Folders to scan; the folder at position ``i`` is labelled
        ``labels_list[i]``.
    labels_list : list of str
        All label names, also used as the one-hot vocabulary.
    file_ext : str
        Glob pattern matched inside each folder.
    p_include : float
        Passed to ``trim_file_list`` to randomly subsample each folder.
    sample_rate, segment_time, samples_to_clip
        Forwarded unchanged to ``extract_features``.

    Returns
    -------
    pandas.DataFrame
        One row per processed file, indexed by the file's base name, with the
        keys returned by ``extract_features`` plus ``label`` and
        ``label_one_hot``.
    """
    data = []
    index = []
    for label_idx, sub_dir in enumerate(sub_dirs_list):
        fnames_list = glob.glob(os.path.join(sub_dir, file_ext))
        fnames_list = trim_file_list(fnames_list, p_include=p_include)
        for fname in fnames_list:
            print("Processing " + os.path.basename(fname))
            # Forward every extraction setting; previously only segment_time
            # was passed on, so the other two arguments had no effect.
            features = extract_features(fname, sample_rate=sample_rate,
                                        segment_time=segment_time,
                                        samples_to_clip=samples_to_clip)
            label = labels_list[label_idx]
            label_one_hot = one_hot_encode(label, labels_list)
            features['label'] = label
            features["label_one_hot"] = label_one_hot
            data.append(features)
            index.append(os.path.basename(fname))
    return pd.DataFrame(data, index=index)

In [20]:
# Extract features from a random 20% of the 5-second clips in every label folder.
df = parse_audio_files(
    parent_dir,
    sub_dirs,
    labels_list,
    segment_time=5,
    p_include=0.2,
)
# Shuffle the rows with a random permutation of their positions.
df = df.iloc[np.random.permutation(df.shape[0])]
# Store the shuffled dataset next to the label folders.
df.to_pickle(os.path.join(parent_dir, 'processed_dataset.pkl'))

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0268.wav
Processing Mumford & Sons - I Will Wait_0006.wav
Processing Sleep On The Floor (LYRICS) - The Lumineers_0015.wav
Processing Enya   The Best Of Enya FULL ALBUM_0303.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1360.wav
Processing Enya   The Best Of Enya FULL ALBUM_0698.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0903.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1200.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1228.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0650.wav
Processing The Lumineers - Patience [Lyrics]_0006.wav
Processing The Lumineers - Holdin' Out - Storks - Original Motion Picture Soundtrack_0016.wav
Processing The Lumineers - Patience [Lyrics]_0012.wav
Processing Cliff Richard - Greatest Hits 

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0292.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0709.wav
Processing Enya   The Best Of Enya FULL ALBUM_0058.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1211.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0682.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0696.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0866.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0325.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0457.wav
Processing Gun Song (LYRICS) - The Lumineers_0032.wav
Processing The Lumineers - 'Stubborn Love' (Official Video)_0055.wav
Processing The Lumineers - 'Stubborn Love' (Official Video)_0054.wav
Processing Cliff Richard - Greatest Hits 

Processing Theme - The Lumineers - Scotland_0007.wav
Processing The Lumineers - Classy Girls_0031.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0423.wav
Processing The Lumineers - 'Submarines' (Official Video)_0017.wav
Processing The Lumineers - Blue Christmas_0025.wav
Processing The Lumineers - Ophelia_0028.wav
Processing The Lumineers - Angela_0049.wav
Processing The Lumineers - Classy Girls_0018.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1072.wav
Processing The Lumineers - Flowers in your hair ( lyrics )_0009.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0344.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0813.wav
Processing Theme - The Lumineers - Scotland_0006.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1264.wav
Processing Enya   The Best Of Enya FULL ALBUM_0011.wav
Processing The Lumineers

Processing The Lumineers - Nobody Knows (From 'Pete's Dragon')_0038.wav
Processing The Lumineers - Nobody Knows (From 'Pete's Dragon')_0010.wav
Processing The Lumineers - Flapper Girl_0024.wav
Processing Enya   The Best Of Enya FULL ALBUM_0630.wav
Processing The Lumineers - Flapper Girl_0018.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0997.wav
Processing The Lumineers - Slow It Down (Live on KEXP)_0016.wav
Processing The Lumineers - Slow It Down (Live on KEXP)_0002.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0565.wav
Processing The Lumineers - My Eyes [Lyrics]_0033.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1121.wav
Processing The Lumineers - Cleopatra_0007.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0559.wav
Processing The Lumineers - Ho Hey (Official Video)_0029.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now 

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0818.wav
Processing Theme - The Lumineers - Scotland_0025.wav
Processing Theme - The Lumineers - Scotland_0019.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0367.wav
Processing The Lumineers - Angela_0042.wav
Processing Enya   The Best Of Enya FULL ALBUM_0230.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0415.wav
Processing Enya   The Best Of Enya FULL ALBUM_0595.wav
Processing The Lumineers - 'Stubborn Love' (Official Video)_0003.wav
Processing The Lumineers - 'Stubborn Love' (Official Video)_0017.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1093.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0428.wav
Processing The Lumineers - Flowers in your hair ( lyrics )_0003.wav
Processing The Lumineers - Flowers in your hair ( lyrics )_0017.wav
Processing The Luminee

Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1353.wav
Processing Enya   The Best Of Enya FULL ALBUM_0683.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1233.wav
Processing The Lumineers - Holdin' Out - Storks - Original Motion Picture Soundtrack_0019.wav
Processing Theme - The Lumineers - Scotland_0051.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0844.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_0313.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1019.wav
Processing Enya   The Best Of Enya FULL ALBUM_0244.wav
Processing The Lumineers - Big Parade_0002.wav
Processing Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1025.wav
Processing Enya   The Best Of Enya FULL ALBUM_0278.wav
Processing Enya   The Best Of Enya FULL ALBUM_0287.wav
Processing Gun Song (LYRICS) - The Lumineers_0011.wa

Processing Premiere - Anthony Attalla & Dqwon - Thick (Pirupa Remix) [Lapsus Music]_0060.wav
Processing KlangKuenstler - Dance With Me (Raffa FL Remix)_0069.wav
Processing Route 94 - House & Pressure_0010.wav
Processing CamelPhat - Make 'Em Dance (Original Mix) [Suara]_0065.wav
Processing Route 94 - House & Pressure_0004.wav
Processing Sean Roman - I Wanna_0042.wav
Processing MK - 17 (CamelPhat Dub) [Audio]_0070.wav
Processing Sean Roman - I Wanna_0056.wav
Processing MK - 17 (CamelPhat Dub) [Audio]_0058.wav
Processing Best Of 2017 Tech House Yearmix_0020.wav
Processing Best Of 2017 Tech House Yearmix_0593.wav
Processing Groove Armada - House With Me (Andrea Oliva Remix)_0037.wav
Processing Premiere - Pete Zorba - She Got Back [Lapsus Music]_0024.wav
Processing Groove Armada - House With Me (Andrea Oliva Remix)_0023.wav
Processing Best Of 2017 Tech House Yearmix_0587.wav
Processing Raumakustik - Dem A Pree (Patrick Topping Remix)_0072.wav
Processing Best Of 2017 Tech House Yearmix_0236.

Processing Premiere - Pete Zorba - She Got Back [Lapsus Music]_0022.wav
Processing TECH-HOUSE - Max Chapman - La Fiesta [Hot Creations]_0081.wav
Processing Solardo - Keep Pushing On_0027.wav
Processing Stardust - Music Sounds Better (Mistrix Dub) (Free Download)_0005.wav
Processing Best Of 2017 Tech House Yearmix_0224.wav
Processing TECH-HOUSE - Max Chapman - La Fiesta [Hot Creations]_0056.wav
Processing Patrick Topping - Be Sharp Say Nowt_0078.wav
Processing Caal & Baum - This Story (Raffa FL Remix)_0049.wav
Processing Claptone - The Music Got Me (Darius Syrossian Remix)_0003.wav
Processing Premiere - Raumakustik - Can't Get Enough [Toolroom Records]_0032.wav
Processing FISHER - Ya Kidding_0019.wav
Processing Carl Cox - I Want You (Forever) - Josh Butler Remix_0064.wav
Processing Simone Liberali - Panameras_0012.wav
Processing Premiere - Kinnerman - Dominator [Elrow Music]_0040.wav
Processing Best Of 2017 Tech House Yearmix_0436.wav
Processing Shiba San & Green Velvet - Chance_0054.wa

Processing Shadow Child - Ooh Tune_0046.wav
Processing Thick Dick - Welcome To The Jungle (Andrea Oliva Remix)_0016.wav
Processing Selected New Year Mix_0039.wav
Processing Thick Dick - Welcome To The Jungle (Andrea Oliva Remix)_0002.wav
Processing Thick Dick - Welcome To The Jungle (Andrea Oliva Remix)_0017.wav
Processing Best Of 2017 Tech House Yearmix_0122.wav
Processing Premiere - L.O.R.D.I.E - Alpha (Darius Syrossian Remix) [Griffintown Records]_0058.wav
Processing Format B - The Scoop (Original Mix)_0047.wav
Processing Best Of 2017 Tech House Yearmix_0693.wav
Processing 50 Cent - Just A Lil Bit (Paradox City Edit) (Free Download)_0039.wav
Processing Hosse - Ready For This (Raffa FL Remix)_0046.wav
Processing Latmun - Footsteps_0018.wav
Processing Best Of 2017 Tech House Yearmix_0334.wav
Processing Selected New Year Mix_0212.wav
Processing Latmun - Footsteps_0024.wav
Processing Deeper Purpose - Breathe & Stop (Free Download)_0063.wav
Processing Shiba San & Green Velvet - Chance_00

Processing 50 Cent - Just A Lil Bit (Paradox City Edit) (Free Download)_0017.wav
Processing Best Of 2017 Tech House Yearmix_0695.wav
Processing Latmun - Footsteps_0022.wav
Processing Premiere - Kinnerman - Dominator [Elrow Music]_0036.wav
Processing Best Of 2017 Tech House Yearmix_0326.wav
Processing Best Of 2017 Tech House Yearmix_0454.wav
Processing Best Of 2017 Tech House Yearmix_0332.wav
Processing Technasia - I Am Somebody (Original Mix)_0066.wav
Processing Green Velvet & Patrick Topping - Voicemail_0000.wav
Processing CamelPhat - Hangin' Out With Charlie (Original Mix)_0002.wav
Processing Best Of 2017 Tech House Yearmix_0508.wav
Processing PAWSA - Groovy Cat_0014.wav
Processing Selected New Year Mix_0360.wav
Processing Caal & Baum - This Story (Raffa FL Remix)_0003.wav
Processing Best Of 2017 Tech House Yearmix_0252.wav
Processing TECH-HOUSE - Max Chapman - La Fiesta [Hot Creations]_0020.wav
Processing Selected New Year Mix_0348.wav
Processing Best Of 2017 Tech House Yearmix_0246

Processing Technasia - I Am Somebody (Original Mix)_0082.wav
Processing Technasia - I Am Somebody (Original Mix)_0069.wav
Processing Luigi Rocca, Federico Buratti - Secret Bass (Knober, Sylter Remix)_0024.wav
Processing Carl Cox - I Want You (Forever) - Josh Butler Remix_0035.wav
Processing Simone Liberali - Panameras_0043.wav
Processing Premiere - Raumakustik - Can't Get Enough [Toolroom Records]_0063.wav
Processing Mark Jenkyns - Sirens_0088.wav
Processing Best Of 2017 Tech House Yearmix_0275.wav
Processing Raumakustik - Dem A Pree (Patrick Topping Remix)_0031.wav
Processing Caal & Baum - This Story (Raffa FL Remix)_0024.wav
Processing Premiere - Pete Zorba - She Got Back [Lapsus Music]_0067.wav
Processing Stardust - Music Sounds Better (Mistrix Dub) (Free Download)_0054.wav
Processing Audiophonik - Pass Me (Raffa FL Remix)_0021.wav
Processing Best Of 2017 Tech House Yearmix_0077.wav
Processing Premiere - De La Swing - Creeper [Elrow Music]_0025.wav
Processing Gorgon City - Grooves O

Processing Technasia - I Am Somebody (Original Mix)_0047.wav
Processing Green Velvet & Patrick Topping - Voicemail_0021.wav
Processing Simone Liberali - Panameras_0051.wav
Processing Green Velvet & Patrick Topping - Voicemail_0009.wav
Processing Luigi Rocca, Federico Buratti - Secret Bass (Knober, Sylter Remix)_0022.wav
Processing The Shapeshifters - Lola's Theme (Mistrix Dub) (Free Download)_0050.wav
Processing Patrick Topping - Be Sharp Say Nowt_0007.wav
Processing TECH-HOUSE - Max Chapman - La Fiesta [Hot Creations]_0029.wav
Processing Claptone - The Music Got Me (Darius Syrossian Remix)_0068.wav
Processing Best Of 2017 Tech House Yearmix_0515.wav
Processing Best Of 2017 Tech House Yearmix_0501.wav
Processing Best Of 2017 Tech House Yearmix_0267.wav
Processing Premiere - Pete Zorba - She Got Back [Lapsus Music]_0061.wav
Processing MK - 17 (CamelPhat Dub) [Audio]_0021.wav
Processing Premiere - Marco Strous - That's My Jam [Cr2 Records]_0043.wav
Processing Best Of 2017 Tech House Year

Processing TECH-HOUSE - Max Chapman - La Fiesta [Hot Creations]_0074.wav
Processing Best Of 2017 Tech House Yearmix_0560.wav
Processing The Shapeshifters - Lola's Theme (Mistrix Dub) (Free Download)_0031.wav
Processing Caal & Baum - This Story (Raffa FL Remix)_0057.wav
Processing Selected New Year Mix_0334.wav
Processing Caal & Baum - This Story (Raffa FL Remix)_0043.wav
Processing Carl Cox - I Want You (Forever) - Josh Butler Remix_0046.wav
Processing Simone Liberali - Panameras_0030.wav
Processing Premiere - Raumakustik - Can't Get Enough [Toolroom Records]_0038.wav
Processing Green Velvet & Patrick Topping - Voicemail_0040.wav
Processing Selected New Year Mix_0283.wav
Processing Selected New Year Mix_0297.wav
Processing Shiba San & Green Velvet - Chance_0076.wav
Processing Premiere - Kinnerman - Dominator [Elrow Music]_0076.wav
Processing Mark Jenkyns - Sirens_0004.wav
Processing Selected New Year Mix_0254.wav
Processing Best Of 2017 Tech House Yearmix_0428.wav
Processing The Tribe 

In [16]:
# Preview the first ten rows of the shuffled dataset.
display(df.iloc[:10])

Unnamed: 0,audio,label,label_one_hot,log_specgram
Enya The Best Of Enya FULL ALBUM_0089.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-73.9835, -45.8846, -40.6965, -59.8163, -50...."
Premiere - Marco Strous - That's My Jam [Cr2 Records]_0073.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-76.3698, -43.9466, -36.525, -40.4099, -57.7..."
Enya The Best Of Enya FULL ALBUM_0225.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -65.6183, -47.5027, -52.6115, -59.222..."
Cliff Richard - Greatest Hits 1958-1962 (Not Now Music) [Full Album]_1397.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -80.0, -57.261, -41.1285, -39.6702, -..."
Jay Lumen & Wade - Dirty Groove_0026.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-63.2814, -37.5433, -27.8069, -36.516, -37.7..."
Theme - The Lumineers - Scotland_0012.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -53.7615, -37.2244, -30.208, -27.7006..."
Low Steppa - Vocal Loop (Premiere)_0003.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-80.0, -54.548, -29.9076, -24.2944, -27.5968..."
The Lumineers - Big Parade_0062.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",not_a_banger,"[1.0, 0.0]","[[-80.0, -71.7206, -37.1355, -32.2714, -47.919..."
Sean Roman - I Wanna_0023.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-80.0, -63.3079, -41.5738, -39.5527, -48.835..."
Premiere - De La Swing - Creeper [Elrow Music]_0005.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",banger,"[0.0, 1.0]","[[-80.0, -80.0, -66.287, -56.6934, -41.0459, -..."
