<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Wavegan-drum-dataset" data-toc-modified-id="Wavegan-drum-dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Wavegan drum dataset</a></span></li><li><span><a href="#Musicradar-essential-drumkit" data-toc-modified-id="Musicradar-essential-drumkit-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Musicradar essential drumkit</a></span></li><li><span><a href="#200-Drum-Machines" data-toc-modified-id="200-Drum-Machines-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>200 Drum Machines</a></span></li><li><span><a href="#MDLib2.2" data-toc-modified-id="MDLib2.2-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>MDLib2.2</a></span></li><li><span><a href="#Fraunhofer-IDMT-SMT" data-toc-modified-id="Fraunhofer-IDMT-SMT-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Fraunhofer IDMT SMT</a></span></li></ul></div>

In [1]:
import glob
import os
import pandas as pd
from pandas import DataFrame

### Wavegan drum dataset
http://deepyeti.ucsd.edu/cdonahue/wavegan/data/drums.tar.gz
### Musicradar essential drumkit
http://cdn.mos.musicradar.com/audio/samples/musicradar-essential-drumkit-samples.zip
### 200 Drum Machines
http://www.hexawe.net/mess/200.Drum.Machines/
### MDLib2.2
Articulation dataset

http://www.mattprockup.com/percussion-dataset

https://www.dropbox.com/s/p736vokha3240e6/MDLib2.2.zip?dl=0
### Fraunhofer IDMT SMT

https://www.idmt.fraunhofer.de/en/business_units/m2d/smt/drums.html


In [2]:
base_dir = "./Samples"
!ls $base_dir

200-drum-machines  MDLib2.2				 Percussion
IDMT-SMT-DRUMS-V2  musicradar-essential-drumkit-samples  wavegan-drums


In [3]:
!ls {base_dir}/MDLib2.2/Sorted/*

./Samples/MDLib2.2/Sorted/Cymbals:
'Bright Crash'	'Dark Crash'   HiHat   Ride

'./Samples/MDLib2.2/Sorted/Floor Tom':
Buzz  Rim  Strike

'./Samples/MDLib2.2/Sorted/Floor Tom SNoff':
Buzz  Rim  Strike

./Samples/MDLib2.2/Sorted/Kick:
'Dead Kick'  'Long Kick'

'./Samples/MDLib2.2/Sorted/Kick SNoff':
'Dead Kick'  'Long Kick'

'./Samples/MDLib2.2/Sorted/Rack Tom':
Buzz  Rim  Strike

'./Samples/MDLib2.2/Sorted/Rack Tom SNoff':
Buzz  Rim  Strike

./Samples/MDLib2.2/Sorted/Snare:
 Buzz  'Cross Stick'   Rim   Strike

'./Samples/MDLib2.2/Sorted/Snare SNoff':
 Buzz  'Cross Stick'   Rim   Strike


In [4]:
wavegan = glob.glob(base_dir + '/wavegan-drums/*/*.wav')
musicradar = glob.glob(base_dir + '/musicradar-essential-drumkit-samples/*/*.wav')
drum200 = glob.glob(base_dir + '/200-drum-machines/*/*.wav')
mdlib1 = glob.glob(base_dir + '/MDLib2.2/Sorted/*/*/*.wav')
mdlib2 = glob.glob(base_dir + '/MDLib2.2/Sorted/*/*/*/*.wav')
idmt = glob.glob(base_dir + '/IDMT-SMT-DRUMS-V2/audio/*.wav')

In [5]:
def split(level=1):
    def _split(row):
        directory = row['path']
        for lvl in range(level):
            if lvl == 0:
                colname = 'filename'
            else:
                colname = 'dir_' + str(lvl)
            directory, row[colname] = os.path.split(directory)
        return row
    return _split

In [6]:
def set_classes(df, column, rules):
    for d_class, pattern in rules.items():
        mask = df[column].str.contains(pattern, case=False)
        df.loc[mask, d_class] = 1
        df.loc[~mask, d_class] = 0

In [7]:
wavegan_df = DataFrame(data=wavegan, columns=['path'])
wavegan_df = wavegan_df.apply(split(),axis=1)
wavegan_df

Unnamed: 0,path,filename
0,./Samples/wavegan-drums/valid/Tom_00041.wav,Tom_00041.wav
1,./Samples/wavegan-drums/valid/Snare_00133.wav,Snare_00133.wav
2,./Samples/wavegan-drums/valid/Hat_00049.wav,Hat_00049.wav
3,./Samples/wavegan-drums/valid/Snare_00179.wav,Snare_00179.wav
4,./Samples/wavegan-drums/valid/Tom_00343.wav,Tom_00343.wav
...,...,...
2997,./Samples/wavegan-drums/train/Tom_00655.wav,Tom_00655.wav
2998,./Samples/wavegan-drums/train/Clap_00886.wav,Clap_00886.wav
2999,./Samples/wavegan-drums/train/Tom_00803.wav,Tom_00803.wav
3000,./Samples/wavegan-drums/train/Snare_00743.wav,Snare_00743.wav


In [8]:
rules = {
    'class_Tom': 'Tom',
    'class_Snare': 'Snare',
    'class_Hat': 'Hat',
    'class_Clap': 'Clap',
    'class_Cowbell': 'Cowbell',
    'class_Cymbal': 'Ride|Crash',
    'subclass_Cymbal_Crash': 'Crash',
    'subclass_Cymbal_Ride': 'Ride',
    'class_Kick': 'Kick',
        }

set_classes(wavegan_df, 'filename', rules)

#wavegan_df['Cymbal'] = 0
#wavegan_df.loc[(wavegan_df['Cym_Crash'] == 1) | (wavegan_df['Cym_Ride'] == 1), 'Cymbal'] 
    
mask = wavegan_df.loc[:, [c for c in wavegan_df.columns if c.startswith('class_')] ].sum(axis=1) != 1
print('unlabelled:', len(wavegan_df[mask]))
print('labelled:', len(wavegan_df[~mask]))
print(list(wavegan_df[mask]['filename']))

unlabelled: 0
labelled: 3002
[]


In [9]:
musicradar_df = DataFrame(data=musicradar, columns=['path'])
musicradar_df = musicradar_df.apply(split(2),axis=1)
musicradar_df

Unnamed: 0,path,filename,dir_1
0,./Samples/musicradar-essential-drumkit-samples...,TR 808 Closed Hat 01.wav,TR 808 Kit
1,./Samples/musicradar-essential-drumkit-samples...,TR 808 Closed Hat 02.wav,TR 808 Kit
2,./Samples/musicradar-essential-drumkit-samples...,TR 808 Snare 01.wav,TR 808 Kit
3,./Samples/musicradar-essential-drumkit-samples...,TR 808 Rimshot.wav,TR 808 Kit
4,./Samples/musicradar-essential-drumkit-samples...,TR 808 Cymbal 01.wav,TR 808 Kit
...,...,...,...
489,./Samples/musicradar-essential-drumkit-samples...,D&B Drop FX 01.wav,Drum & Bass Kit
490,./Samples/musicradar-essential-drumkit-samples...,D&B Snare 03.wav,Drum & Bass Kit
491,./Samples/musicradar-essential-drumkit-samples...,D&B Kick 03.wav,Drum & Bass Kit
492,./Samples/musicradar-essential-drumkit-samples...,D&B Kick 01.wav,Drum & Bass Kit


In [10]:
rules ={
    'class_Tom': 'Tom',
    'class_Snare': 'Snare',
    'class_Kick': 'Kick',
    'class_Rimshot': 'rim',
    'class_Shaken': 'shake[nr]|tamb|mara?cas|cabas',
    'subclass_Shaken_Shaker': 'shaker',
    'subclass_Shaken_Tambourin': 'tamb',
    'subclass_Shaken_Maracas': 'mara?cas',
    'subclass_Shaken_Cabasa': 'cabas',
    'class_Cymbal': 'Cymbal',
    'class_Conga': 'Conga',
    'class_Bongo': 'Bongo',
    'class_Djembe': 'Djmbe',
    'class_Tabla': 'Tabla',
    'class_Darbuka': 'Darbuka',
    'class_Cajon': 'Cajon',
    'class_Timpani': 'timb',
    'class_Hat': 'hat',
    'subclass_Hat_Open': 'open hat',
    'subclass_Hat_Close': 'closed? hat',
    'class_Clap': 'clap',
    'class_Cowbell': 'cowbell',
    'class_Triangle': 'triangle',
    'class_Agogo': 'agogo',
       }

set_classes(musicradar_df, 'filename', rules)

mask = musicradar_df.loc[:, [c for c in musicradar_df.columns if c.startswith('class_')] ].sum(axis=1) != 1
print('unlabelled:', len(musicradar_df[mask]))
print('labelled:', len(musicradar_df[~mask]))
print(list(musicradar_df[mask]['filename']))

unlabelled: 67
labelled: 427
['TR 808 Clave.wav', 'Modular Perc 01.wav', 'Modular Perc 03.wav', 'Modular Perc 04.wav', 'Modular Perc 05.wav', 'Modular Perc 06.wav', 'Modular Perc 02.wav', 'Foley Bucket Perc.wav', 'Foley Can Perc.wav', 'Foley Tube Perc.wav', 'Foley Paint Tin Perc.wav', 'Vintage Perc 01.wav', 'Vintage Perc 04.wav', 'Vintage Perc 03.wav', 'Vintage Clave.wav', 'Vintage Perc 02.wav', 'ChipTune Arcade FX 06.wav', 'ChipTune Perc 05.wav', 'ChipTune Arcade FX 08.wav', 'ChipTune Arcade FX 01.wav', 'ChipTune Perc 01.wav', 'ChipTune Arcade FX 02.wav', 'ChipTune Perc 03.wav', 'ChipTune Perc 04.wav', 'ChipTune Arcade FX 03.wav', 'ChipTune Arcade FX 04.wav', 'ChipTune Perc 02.wav', 'ChipTune Arcade FX 05.wav', 'ChipTune Arcade FX 07.wav', '80s Digital Bonga 01.wav', 'Vermona Fx 05.wav', 'Vermona Perc 01.wav', 'Vermona Perc 09.wav', 'Vermona Perc 04.wav', 'Vermona Perc 05.wav', 'Vermona Fx 03.wav', 'Vermona Perc 06.wav', 'Vermona Fx 04.wav', 'Vermona Perc 02.wav', 'Vermona Fx 01.wav',

In [11]:
shaken_classes = [c for c in musicradar_df.columns if c.startswith('subclass_Shaken')]
mask = (musicradar_df['class_Shaken'] == 1) ^ (musicradar_df[shaken_classes] == 1).any(axis=1)
print('mislabelled:\n', musicradar_df[mask][['filename', 'class_Shaken'] + shaken_classes])
print('mislabelled:\n', list(musicradar_df[mask]['filename']))

mislabelled:
                  filename  class_Shaken  subclass_Shaken_Shaker  \
383  Shaken Rainstick.wav           1.0                     0.0   
387   Shaken Bells 01.wav           1.0                     0.0   
390      Shaken Gourd.wav           1.0                     0.0   
392    Shaken Jingles.wav           1.0                     0.0   
397    Shaken Shekere.wav           1.0                     0.0   
399   Shaken Bells 02.wav           1.0                     0.0   
402    Shaken Caxixi .wav           1.0                     0.0   

     subclass_Shaken_Tambourin  subclass_Shaken_Maracas  \
383                        0.0                      0.0   
387                        0.0                      0.0   
390                        0.0                      0.0   
392                        0.0                      0.0   
397                        0.0                      0.0   
399                        0.0                      0.0   
402                        0.0      

In [12]:
drum200_df = DataFrame(data=drum200, columns=['path'])
drum200_df = drum200_df.apply(split(2),axis=1)
drum200_df

Unnamed: 0,path,filename,dir_1
0,./Samples/200-drum-machines/Yamaha RX-11/MaxV ...,MaxV - RX11 OPHAT2.wav,Yamaha RX-11
1,./Samples/200-drum-machines/Yamaha RX-11/MaxV ...,MaxV - RX11 BD3.wav,Yamaha RX-11
2,./Samples/200-drum-machines/Yamaha RX-11/MaxV ...,MaxV - RX11 BD2.wav,Yamaha RX-11
3,./Samples/200-drum-machines/Yamaha RX-11/MaxV ...,MaxV - RX11 CLHAT2.wav,Yamaha RX-11
4,./Samples/200-drum-machines/Yamaha RX-11/MaxV ...,MaxV - RX11 RIDE.wav,Yamaha RX-11
...,...,...,...
6745,./Samples/200-drum-machines/Korg SR-120/MaxV -...,MaxV - HH Op.wav,Korg SR-120
6746,./Samples/200-drum-machines/Korg SR-120/KORG S...,KORG SR120 8.wav,Korg SR-120
6747,./Samples/200-drum-machines/Korg SR-120/KORG S...,KORG SR120 6.wav,Korg SR-120
6748,./Samples/200-drum-machines/Korg SR-120/KORG S...,KORG SR12011.wav,Korg SR-120


In [13]:
rules = {
    'class_Hat': 'hat|hh|open|close',
    'subclass_Hat_Open': 'op?.?(hi)?hat|hat.?o|close|pedal|c.?hh|hh.?c|foot',
    'subclass_Hat_Close': '((cl?)|p).?(hi)?hat|hat.?(c|p)|ope?n|o.?hh|hh.?o',
    'class_Bongo': 'Bongo',
    'class_Cymbal': 'cym|crash|ride|chi?n[^o]|spla?sh',
    'subclass_Cymbal_Crash': 'crash',
    'subclass_Cymbal_Ride': 'ride',
    'subclass_Cymbal_Chinese': 'chi?n[^o]',
    'subclass_Cymbal_Splash': 'spla?sh',
    'class_Tom': 'tom',
    'class_Conga': 'cong',
    'class_Kick': 'kic?k|bd',
    'class_Gong': 'Gong',
    'class_Cowbell': 'cow|Cowbell',
    'class_Snare': 'sna?r',
    'class_Rimshot': 'rim',
    'class_Clap': 'clap',
    'class_Shaken': 'shake|tamb|cabas',
    'subclass_Shaken_Shaker': 'shaker',
    'subclass_Shaken_Tambourin': 'tamb',
    'subclass_Shaken_Cabasa': 'cabas',
    'class_Clave': 'cla?ve',
    'class_Timpani': 'tim[pb]',
    'class_Agogo': 'agogo',
    'class_Triangle': 'triangle',
    'class_Djembe': 'djembe',
    'class_Tabla': 'tabla',
    'class_Darbuka': 'darbuka',
        }

set_classes(drum200_df, 'filename', rules)

  return func(self, *args, **kwargs)


In [14]:
mask = drum200_df.loc[:, [c for c in drum200_df.columns if c.startswith('class_')] ].sum(axis=1) < 1
print('unlabelled:', len(drum200_df[mask]))
print('labelled:', len(drum200_df[~mask]))
print('unlabelled:', list(drum200_df[mask]['filename']))

unlabelled: 3457
labelled: 3293
unlabelled: ['MaxV - RX11 MED SN.wav', 'MaxV - RX11 LITE SN.wav', 'MaxV - RX11 HEVY SN.wav', 'MaxV - Metal.wav', '13.wav', '06.wav', '05.wav', '09.wav', '08.wav', '01.wav', '00.wav', '14.wav', '11.wav', '02.wav', '12.wav', '03.wav', '10.wav', '04.wav', '07.wav', '15.wav', 'SR16Tank.wav', 'SR16FSnap.wav', 'SR16Guiro.wav', 'SR16Spark.wav', 'SR16Block.wav', 'Acoustic Bass Drum.wav', 'MaxV - EMU14.wav', 'MaxV - KDR21.wav', 'MaxV - EMU13.wav', 'MaxV - EMU20.wav', 'MaxV - EMU19.wav', 'MaxV - DRM03.wav', 'MaxV - SND24.wav', 'MaxV - EMU15.wav', 'MaxV - EMU09.wav', 'MaxV - DRM04.wav', 'MaxV - EMU12.wav', 'MaxV - CLD01.wav', 'MaxV - DRM08.wav', 'MaxV - DRM07.wav', 'MaxV - EMU17.wav', 'MaxV - OHD22.wav', 'MaxV - EMU16.wav', 'MaxV - EMU18.wav', 'MaxV - EMU_SD.wav', 'MaxV - DRM05.wav', 'MaxV - EMU11.wav', 'DrumulatorBlock.wav', 'MaxV - EMU10.wav', 'MaxV - DRM06.wav', 'MaxV - SND23.wav', 'MaxV - CLD02.wav', 'MaxV - Snap.wav', '06.wav', '05.wav', '01.wav', '00.wav', '0

In [15]:
mask = drum200_df.loc[:, [c for c in drum200_df.columns if c.startswith('class_')] ].sum(axis=1) > 1
print('overlabelled:', len(drum200_df[mask]))
display('overlabelled:', drum200_df[mask])
print('overlabelled:', list(drum200_df[mask]['filename']))

overlabelled: 22


'overlabelled:'

Unnamed: 0,path,filename,dir_1,class_Hat,subclass_Hat_Open,subclass_Hat_Close,class_Bongo,class_Cymbal,subclass_Cymbal_Crash,subclass_Cymbal_Ride,...,subclass_Shaken_Shaker,subclass_Shaken_Tambourin,subclass_Shaken_Cabasa,class_Clave,class_Timpani,class_Agogo,class_Triangle,class_Djembe,class_Tabla,class_Darbuka
1659,./Samples/200-drum-machines/Fairlight IIx/MaxV...,MaxV - HHTamb.wav,Fairlight IIx,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2325,./Samples/200-drum-machines/Kawai r-50/MaxV - ...,MaxV - ATOMBDEN.wav,Kawai r-50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2341,./Samples/200-drum-machines/Kawai r-50/MaxV - ...,MaxV - ATOMBDRV.wav,Kawai r-50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2378,./Samples/200-drum-machines/Kawai r-50/MaxV - ...,MaxV - ATOMBDGA.wav,Kawai r-50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3319,./Samples/200-drum-machines/Roland R-5/R-5 Rim...,R-5 Rimshot Snare 2.wav,Roland R-5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3345,./Samples/200-drum-machines/Roland R-5/R-5 Ope...,R-5 Open Low Conga.wav,Roland R-5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3347,./Samples/200-drum-machines/Roland R-5/R-5 Rim...,R-5 Rimshot Snare 3.wav,Roland R-5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3366,./Samples/200-drum-machines/Roland R-5/R-5 Rim...,R-5 Rimshot Snare 1.wav,Roland R-5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3376,./Samples/200-drum-machines/Roland R-5/R-5 Ope...,R-5 Open Snare.wav,Roland R-5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4010,./Samples/200-drum-machines/Roland SC-88/MaxV ...,MaxV - D#4 Open High Conga.wav,Roland SC-88,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


overlabelled: ['MaxV - HHTamb.wav', 'MaxV - ATOMBDEN.wav', 'MaxV - ATOMBDRV.wav', 'MaxV - ATOMBDGA.wav', 'R-5 Rimshot Snare 2.wav', 'R-5 Open Low Conga.wav', 'R-5 Rimshot Snare 3.wav', 'R-5 Rimshot Snare 1.wav', 'R-5 Open Snare.wav', 'MaxV - D#4 Open High Conga.wav', 'MaxV - A5 Open Triangle.wav', 'TOMSnare.wav', 'TOMHat_O.wav', 'TOMClap.wav', 'TOMCymbal.wav', 'TOMKick.wav', 'TOMHat_C.wav', 'MaxV - RIM-TOM3.wav', 'MaxV - RIM-TOM1.wav', 'MaxV - RIM-TOM2.wav', 'MaxV - RIM-TOM4.wav', 'HR16B 030 Open Conga.wav']


In [16]:
mask = (drum200_df['class_Hat'] == 1) ^ ( (drum200_df['subclass_Hat_Open'] ==1) | (drum200_df['subclass_Hat_Close'] ==1 ))
print('mislabelled:\n', drum200_df[mask][['filename', 'class_Hat', 'subclass_Hat_Open', 'subclass_Hat_Close']])
print('mislabelled:\n', list(drum200_df[mask]['filename']))

mislabelled:
                       filename  class_Hat  subclass_Hat_Open  \
196       MaxV - HiHat Mid.wav        1.0                0.0   
280              MaxV - hh.wav        1.0                0.0   
519              MaxV - HH.wav        1.0                0.0   
521             MaxV - HH2.wav        1.0                0.0   
1278   MaxV - HR16 MED HAT.wav        1.0                0.0   
...                        ...        ...                ...   
6343             MaxV - HH.wav        1.0                0.0   
6345            MaxV - HH2.wav        1.0                0.0   
6384               WAT_HH3.wav        1.0                0.0   
6387               WAT_HH2.wav        1.0                0.0   
6653  MaxV - Mattel Hi-Hat.wav        1.0                0.0   

      subclass_Hat_Close  
196                  0.0  
280                  0.0  
519                  0.0  
521                  0.0  
1278                 0.0  
...                  ...  
6343                 0.0  
6

In [17]:
mask = (drum200_df['class_Cymbal'] == 1) ^ ((drum200_df['subclass_Cymbal_Crash'] ==1) |
                                             (drum200_df['subclass_Cymbal_Ride'] ==1) |
                                             (drum200_df['subclass_Cymbal_Chinese'] ==1) |
                                             (drum200_df['subclass_Cymbal_Splash'] ==1) 
                                             )
#print('unlabelled:\n', drum200_df[mask][['filename', 'class_Cymbale', 'subclass_Cymbale_Crash', 'subclass_Cymbale_Ride']])
print('mislabeled:', len(drum200_df[mask]['filename']))
print(list(drum200_df[mask]['filename']))

mislabeled: 50
['DR-110Cymbal.wav', 'MaxV - EMU_CYMB.wav', 'TR606Cymb04.wav', 'TR606Cymb01.wav', 'TR606Cymb_OD.wav', 'TR606Cymb05.wav', 'TR606Cymb02.wav', 'TR606Cymb03.wav', 'MaxV - Cymbal.wav', 'MaxV - Cymbal.wav', '512cymb.wav', 'CR8000Cymbal.wav', 'MR10Cymbal.wav', 'MaxV - Cymshort.wav', 'MaxV - Cym2.wav', 'MaxV - Cym1.wav', 'MaxV - Cym3.wav', 'Cymbal2.wav', 'Cymbal1.wav', 'MaxV - 110_cymbal.wav', 'Ddm110Cymbal.wav', 'MaxV - Cymbal.wav', 'MaxV - Cymbal.wav', 'MaxV - Cymbal.wav', 'R8FingerCym.wav', 'R8CymbHand.wav', 'R-5 Back Cymbal.wav', 'MaxV - Cymbal.wav', 'TOMCymbal.wav', '202cym08.wav', '202cym03.wav', '202cym07.wav', '202cym06.wav', '202cym01.wav', '202cym04.wav', '202cym05.wav', '202cym02.wav', '202cym10.wav', '202cym09.wav', 'MaxV - CRSH-CYM.wav', 'MaxV - CRSHCYM1.wav', 'MaxV - CRSHCYM2.wav', 'MaxV - Cymbal.wav', 'MaxV - locymbal.wav', 'MaxV - hicymbal.wav', 'DR220eCymbal.wav', 'MaxV - Cymbal.wav', 'KPRCymbal.wav', 'MaxV - Mattel Cymbal.wav', 'MaxV - Cymbal.wav']


In [18]:
drum200_df[drum200_df.iloc[:,4:].sum(axis=1) > 1].iloc[:,5:]

Unnamed: 0,subclass_Hat_Close,class_Bongo,class_Cymbal,subclass_Cymbal_Crash,subclass_Cymbal_Ride,subclass_Cymbal_Chinese,subclass_Cymbal_Splash,class_Tom,class_Conga,class_Kick,...,subclass_Shaken_Shaker,subclass_Shaken_Tambourin,subclass_Shaken_Cabasa,class_Clave,class_Timpani,class_Agogo,class_Triangle,class_Djembe,class_Tabla,class_Darbuka
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6680,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6684,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6685,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6686,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
mdlib1_df = DataFrame(data=mdlib1, columns=['path'])
mdlib1_df = mdlib1_df.apply(split(3),axis=1)
mdlib1_df

Unnamed: 0,path,filename,dir_1,dir_2
0,./Samples/MDLib2.2/Sorted/Snare SNoff/Buzz/ST_...,ST_Snare_Buzz_0112.2.wav,Buzz,Snare SNoff
1,./Samples/MDLib2.2/Sorted/Snare SNoff/Buzz/DI_...,DI_Snare_Buzz_0224.3.wav,Buzz,Snare SNoff
2,./Samples/MDLib2.2/Sorted/Snare SNoff/Buzz/IN_...,IN_Snare_Buzz_0214.1.wav,Buzz,Snare SNoff
3,./Samples/MDLib2.2/Sorted/Snare SNoff/Buzz/ST_...,ST_Snare_Buzz_0333.3.wav,Buzz,Snare SNoff
4,./Samples/MDLib2.2/Sorted/Snare SNoff/Buzz/MN_...,MN_Snare_Buzz_0111.4.wav,Buzz,Snare SNoff
...,...,...,...,...
7483,./Samples/MDLib2.2/Sorted/Floor Tom/Strike/DI_...,DI_Floor Tom_Strike_1124.2.wav,Strike,Floor Tom
7484,./Samples/MDLib2.2/Sorted/Floor Tom/Strike/MN_...,MN_Floor Tom_Strike_1113.2.wav,Strike,Floor Tom
7485,./Samples/MDLib2.2/Sorted/Floor Tom/Strike/IN_...,IN_Floor Tom_Strike_1113.2.wav,Strike,Floor Tom
7486,./Samples/MDLib2.2/Sorted/Floor Tom/Strike/MN_...,MN_Floor Tom_Strike_1322.4.wav,Strike,Floor Tom


In [20]:
mdlib1_df['dir_2'].unique() , mdlib1_df['dir_2'].unique()

(array(['Snare SNoff', 'Rack Tom SNoff', 'Floor Tom SNoff', 'Snare',
        'Rack Tom', 'Floor Tom'], dtype=object),
 array(['Snare SNoff', 'Rack Tom SNoff', 'Floor Tom SNoff', 'Snare',
        'Rack Tom', 'Floor Tom'], dtype=object))

In [21]:
rules = {
    'class_Snare': 'Snare',
    'class_Tom': 'Tom',
        }

set_classes(mdlib1_df, 'dir_2', rules)

In [22]:
mask = mdlib1_df.loc[:, [c for c in mdlib1_df.columns if c.startswith('class_')] ].sum(axis=1) < 1
print('unlabelled:', len(mdlib1_df[mask]))
print('labelled:', len(mdlib1_df[~mask]))
print('unlabelled:', list(mdlib1_df[mask]['dir_2']))

unlabelled: 0
labelled: 7488
unlabelled: []


In [23]:
mdlib2_df = DataFrame(data=mdlib2, columns=['path'])
mdlib2_df = mdlib2_df.apply(split(4),axis=1)
mdlib2_df

Unnamed: 0,path,filename,dir_1,dir_2,dir_3
0,./Samples/MDLib2.2/Sorted/Cymbals/Bright Crash...,ST_Bright Crash_Crash_1112.3.wav,Crash,Bright Crash,Cymbals
1,./Samples/MDLib2.2/Sorted/Cymbals/Bright Crash...,DI_Bright Crash_Crash_1112.3.wav,Crash,Bright Crash,Cymbals
2,./Samples/MDLib2.2/Sorted/Cymbals/Bright Crash...,MN_Bright Crash_Crash_1112.4.wav,Crash,Bright Crash,Cymbals
3,./Samples/MDLib2.2/Sorted/Cymbals/Bright Crash...,ST_Bright Crash_Crash_1111.4.wav,Crash,Bright Crash,Cymbals
4,./Samples/MDLib2.2/Sorted/Cymbals/Bright Crash...,ST_Bright Crash_Crash_1111.3.wav,Crash,Bright Crash,Cymbals
...,...,...,...,...,...
3131,./Samples/MDLib2.2/Sorted/Kick SNoff/Long Kick...,IN_Long Kick_Release_0113.4.wav,Release,Long Kick,Kick SNoff
3132,./Samples/MDLib2.2/Sorted/Kick SNoff/Long Kick...,MN_Long Kick_Release_0113.4.wav,Release,Long Kick,Kick SNoff
3133,./Samples/MDLib2.2/Sorted/Kick SNoff/Long Kick...,DI_Long Kick_Release_0113.3.wav,Release,Long Kick,Kick SNoff
3134,./Samples/MDLib2.2/Sorted/Kick SNoff/Long Kick...,MN_Long Kick_Release_0111.5.wav,Release,Long Kick,Kick SNoff


In [24]:
rules = {
    'class_Snare': 'snare',
    'class_Tom': 'tom',
    'class_Cymbal': 'crash|ride',
    'subclass_Cymbal_Crash': 'crash',
    'subclass_Cymbal_Ride': 'ride',
    'class_Hat': 'HiHat',
    'class_Kick': 'Kick',
           }

set_classes(mdlib2_df, 'dir_2', rules)

In [25]:
mask = mdlib2_df.loc[:, [c for c in mdlib2_df.columns if c.startswith('class_')] ].sum(axis=1) < 1
print('unlabelled:', len(mdlib2_df[mask]))
print('labelled:', len(mdlib2_df[~mask]))
print('unlabelled:', list(mdlib2_df[mask]['dir_2']))

unlabelled: 0
labelled: 3136
unlabelled: []


In [26]:
# idmt_df = DataFrame(data=idmt, columns=['path'])
# idmt_df.apply(split(),axis=1)

In [27]:
df = pd.concat([#idmt_df,
                mdlib1_df,
                mdlib2_df,
                drum200_df,
                wavegan_df,
                musicradar_df],
               axis=0,
               sort=False,
               ignore_index=True
              )
df.shape

(20870, 35)

In [28]:
df.iloc[:,4:].sum().sort_values()

class_Cajon                     2.0
class_Darbuka                   4.0
subclass_Shaken_Maracas         4.0
class_Gong                      4.0
class_Djembe                    5.0
class_Triangle                  8.0
subclass_Cymbal_Splash          8.0
class_Tabla                     9.0
subclass_Cymbal_Chinese        11.0
subclass_Shaken_Cabasa         29.0
class_Agogo                    33.0
class_Bongo                    42.0
subclass_Shaken_Shaker         45.0
class_Clave                    47.0
class_Timpani                  50.0
subclass_Shaken_Tambourin      62.0
class_Conga                   105.0
class_Rimshot                 126.0
class_Cowbell                 132.0
class_Shaken                  152.0
subclass_Hat_Open             260.0
class_Clap                    272.0
subclass_Hat_Close            284.0
subclass_Cymbal_Ride          448.0
subclass_Cymbal_Crash         818.0
class_Cymbal                 1372.0
class_Hat                    2293.0
class_Kick                  

In [29]:
df[df[[c for c in df.columns if c.startswith('class_')]].sum(axis=1) > 1]

Unnamed: 0,path,filename,dir_1,dir_2,class_Snare,class_Tom,dir_3,class_Cymbal,subclass_Cymbal_Crash,subclass_Cymbal_Ride,...,subclass_Shaken_Cabasa,class_Clave,class_Timpani,class_Agogo,class_Triangle,class_Djembe,class_Tabla,class_Darbuka,subclass_Shaken_Maracas,class_Cajon
12283,./Samples/200-drum-machines/Fairlight IIx/MaxV...,MaxV - HHTamb.wav,Fairlight IIx,,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
12949,./Samples/200-drum-machines/Kawai r-50/MaxV - ...,MaxV - ATOMBDEN.wav,Kawai r-50,,0.0,1.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
12965,./Samples/200-drum-machines/Kawai r-50/MaxV - ...,MaxV - ATOMBDRV.wav,Kawai r-50,,0.0,1.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
13002,./Samples/200-drum-machines/Kawai r-50/MaxV - ...,MaxV - ATOMBDGA.wav,Kawai r-50,,0.0,1.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
13943,./Samples/200-drum-machines/Roland R-5/R-5 Rim...,R-5 Rimshot Snare 2.wav,Roland R-5,,1.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
13969,./Samples/200-drum-machines/Roland R-5/R-5 Ope...,R-5 Open Low Conga.wav,Roland R-5,,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
13971,./Samples/200-drum-machines/Roland R-5/R-5 Rim...,R-5 Rimshot Snare 3.wav,Roland R-5,,1.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
13990,./Samples/200-drum-machines/Roland R-5/R-5 Rim...,R-5 Rimshot Snare 1.wav,Roland R-5,,1.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
14000,./Samples/200-drum-machines/Roland R-5/R-5 Ope...,R-5 Open Snare.wav,Roland R-5,,1.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
14634,./Samples/200-drum-machines/Roland SC-88/MaxV ...,MaxV - D#4 Open High Conga.wav,Roland SC-88,,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [32]:
#df[df[[c for c in df.columns if c.startswith('class_')]].sum(axis=1) < 1]
overlabelled = df[[c for c in df.columns if c.startswith('class_')]].sum(axis=1) > 1
list(df[overlabelled]['filename'])

['MaxV - HHTamb.wav',
 'MaxV - ATOMBDEN.wav',
 'MaxV - ATOMBDRV.wav',
 'MaxV - ATOMBDGA.wav',
 'R-5 Rimshot Snare 2.wav',
 'R-5 Open Low Conga.wav',
 'R-5 Rimshot Snare 3.wav',
 'R-5 Rimshot Snare 1.wav',
 'R-5 Open Snare.wav',
 'MaxV - D#4 Open High Conga.wav',
 'MaxV - A5 Open Triangle.wav',
 'TOMSnare.wav',
 'TOMHat_O.wav',
 'TOMClap.wav',
 'TOMCymbal.wav',
 'TOMKick.wav',
 'TOMHat_C.wav',
 'MaxV - RIM-TOM3.wav',
 'MaxV - RIM-TOM1.wav',
 'MaxV - RIM-TOM2.wav',
 'MaxV - RIM-TOM4.wav',
 'HR16B 030 Open Conga.wav']

In [None]:
df.shape

In [None]:
unabelled = df[[c for c in df.columns if c.startswith('class_')]].sum(axis=1) < 1
list(df[unabelled]['filename'])

In [None]:
clean = df[[c for c in df.columns if c.startswith('class_')]].sum(axis=1) == 1
df_clean =df[clean]

In [None]:
df_clean.to_csv('data.csv')