In [1]:
import pandas as pd
import numpy as np
import os, pathlib
import librosa
from pathlib import Path
import glob
import pickle
import shutil
import datetime
import wave_manipulator
import DataLoad

p_home = Path.cwd() / 'data'
p_temp1 = Path.cwd() / 'data' / 'UIOWA' / 'tmp1'
p_wav = Path.cwd() / 'data' / 'UIOWA' / 'wav'




### Helper Functions

In [2]:
def analyze(data):

    tags = set(data.tags)

    count = []
    labels = []
    instruments = []
    for tag in tags:
        pad = data.loc[data.tags == tag]
        count.append(len(pad))
        labels.append(pad.iloc[0]['labels'])
        instruments.append(tag)

    analyze = pd.DataFrame(list(zip(labels, instruments, count)), columns = ['Label', 'Instrument', 'Nos']) 
    Label_dict = dict(zip(instruments, labels))
    
    return analyze, Label_dict

# IRMAS Dataset Base
___

### Create

In [37]:
_, df, label_dict = DataLoad.load_to_dataframe()
df.to_pickle(p_home / 'IRMAS_Base.pkl')


### Load

In [48]:
data = pd.read_pickle(p_home / 'IRMAS_Base.pkl')
overview, labeldict = analyze(data)

In [49]:
overview

Unnamed: 0,Label,Instrument,Nos
0,9,vio,580
1,2,flu,451
2,3,gac,637
3,8,tru,577
4,5,org,682
5,6,pia,721
6,1,cla,505
7,0,cel,388
8,10,voi,778
9,4,gel,760


In [47]:
labeldict

{'vio': 9,
 'flu': 2,
 'gac': 3,
 'tru': 8,
 'org': 5,
 'pia': 6,
 'cla': 1,
 'cel': 0,
 'voi': 10,
 'gel': 4,
 'sax': 7}

# IRMAS Dataset match IOWA
___

### Load Base

In [3]:
data = pd.read_pickle(p_home / 'IRMAS_Base.pkl')
data

Unnamed: 0,tags,wav_path,labels,raw_sounds,sample_rate
0,voi,data/IRMAS-TrainingData/voi/203__[voi][nod][co...,10,"[-0.10688782, -0.10713196, -0.10412598, -0.100...",44100
1,sax,data/IRMAS-TrainingData/sax/[sax][pop_roc]1626...,7,"[-0.022399902, -0.012954712, -0.0035705566, 0....",44100
2,voi,data/IRMAS-TrainingData/voi/186__[voi][nod][co...,10,"[-0.4929657, -0.45367432, -0.41244507, -0.3837...",44100
3,gel,data/IRMAS-TrainingData/gel/[gel][jaz_blu]0893...,4,"[0.081970215, 0.08126831, 0.08039856, 0.078536...",44100
4,pia,data/IRMAS-TrainingData/pia/[pia][jaz_blu]1318...,6,"[-0.07913208, -0.0715332, -0.06549072, -0.0613...",44100
...,...,...,...,...,...
6700,gel,data/IRMAS-TrainingData/gel/[gel][pop_roc]1036...,4,"[0.12220764, 0.230896, 0.027923584, 0.1275177,...",44100
6701,sax,data/IRMAS-TrainingData/sax/[sax][jaz_blu]1772...,7,"[-0.019760132, -0.020629883, -0.020523071, -0....",44100
6702,gel,data/IRMAS-TrainingData/gel/092__[gel][dru][ja...,4,"[0.002380371, 0.0067443848, -0.0069885254, -0....",44100
6703,gel,data/IRMAS-TrainingData/gel/[gel][jaz_blu]0991...,4,"[-0.014694214, -0.01399231, -0.013305664, -0.0...",44100


### Create

In [4]:
labels = []
tags = []
indexes = []
raw_data = []
wave_path = []
sr = []

for index, row in data.iterrows():
    indexes.append(index)
    raw_data.append(row.raw_sounds)
    wave_path.append(row.wav_path)
    sr.append(row.sample_rate)
    if row.labels == 0:
        labels.append(4)
        tags.append('cel')
    elif row.labels ==1:
        labels.append(1)
        tags.append('cla')
    elif row.labels == 2:
        labels.append(2)
        tags.append('flu')
    elif row.labels == 3:
        labels.append(3)
        tags.append('gac')
    elif row.labels == 4:
        labels.append(0)
        tags.append('noise')
    elif row.labels == 5:
        labels.append(0)
        tags.append('noise')
    elif row.labels == 6:
        labels.append(6)
        tags.append('pia')
    elif row.labels == 7:
        labels.append(7)
        tags.append('sax')
    elif row.labels == 8:
        labels.append(8)
        tags.append('tru')
    elif row.labels == 9:
        labels.append(5)
        tags.append('vio')
    elif row.labels == 10:
        labels.append(0)
        tags.append('noise')
        

        
        
        
        
data = pd.DataFrame(list(zip(labels, tags, wave_path, raw_data, sr)),
                    columns = ['labels', 'tags', 'wav_path', 'raw_sounds', 'sample_rate'])
       
data.to_pickle(p_home / 'IRMAS_match_IOWA.pkl')


### Load

In [5]:
data = pd.read_pickle(p_home / 'IRMAS_match_IOWA.pkl')
overview, labeldict = analyze(data)

### Analyse

In [6]:
overview.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
1,0,noise,2220
2,1,cla,505
8,2,flu,451
4,3,gac,637
7,4,cel,388
3,5,vio,580
0,6,pia,721
6,7,sax,626
5,8,tru,577


In [8]:
labeldict

{'pia': 6,
 'noise': 0,
 'cla': 1,
 'vio': 5,
 'gac': 3,
 'tru': 8,
 'sax': 7,
 'cel': 4,
 'flu': 2}

# UIOWA
___

# IOWA all Instruments
___

In [3]:
structure = pd.read_pickle(p_home / 'UIOWA_Structure.pkl')

instrument = []
Label_int = []
Label = []

restructure = structure.copy()

for index, row in restructure.iterrows():
    direc = row.Directory
    inst = direc.split('][')[2]
    if 'Pizzicato' in direc:
        instrument.append('noise')
    elif 'Percussion' in direc:
        instrument.append('noise')
    else:
        instrument.append(inst)

restructure['tags'] = instrument
restructure['Label'] = instrument

key = set(instrument)
labeldict = dict([[x, y] for [y, x] in enumerate(key)])


for index, row in restructure.iterrows():
    Label_int.append(labeldict[row.tags])
restructure['Label_int'] = Label_int

data = wave_manipulator.create_dataframe(p_wav, restructure)



In [4]:
data.to_pickle(p_home / ('UIOWA_All.pkl'))
overview, labeldict = analyze(data)

In [5]:
overview.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
0,0,Eb Clarinet,158
1,1,Bb Trumpet,283
2,2,Horn,140
3,3,Flute,303
4,4,Bass Flute,140
5,5,Double Bass,233
6,6,Eb Alto Saxophone,256
7,7,Alto Flute,135
8,8,Oboe,137
9,9,Cello,876


In [6]:
labeldict

{'Eb Clarinet': 0,
 'Bb Trumpet': 1,
 'Horn': 2,
 'Flute': 3,
 'Bass Flute': 4,
 'Double Bass': 5,
 'Eb Alto Saxophone': 6,
 'Alto Flute': 7,
 'Oboe': 8,
 'Cello': 9,
 'Piano': 10,
 'Bb Clarinet': 11,
 'Violin': 12,
 'Bass Trombone': 13,
 'Bb Soprano Saxophone': 14,
 'Tenor Trombone': 15,
 'Viola': 16,
 'Bass Clarinet': 17,
 'Tuba': 18,
 'Bassoon': 19,
 'noise': 20,
 'Guitar': 21}

# IOWA match IRMAS
___

### Create

In [31]:
structure = pd.read_pickle(p_home / 'UIOWA_Structure.pkl')
structure

instrument = []
Label = []
Label_int = []
restructure = structure.copy()

for index, row in restructure.iterrows():
    direc = row.Directory
    inst = direc.split('][')[2]
    if 'Pizzicato' in direc:
        Label.append('noise')
        instrument.append('various')
        Label_int.append(0)
    elif 'Percussion' in direc:
        Label.append('noise')
        instrument.append('various')
        Label_int.append(0)
    elif 'Trumpet' in direc:
        instrument.append(inst)
        Label.append('Trumpet')
        Label_int.append(8)
    elif 'Guitar' in direc:
        instrument.append(inst)
        Label.append('Guitar')
        Label_int.append(3)
    elif 'Piano' in direc:
        instrument.append(inst)
        Label.append('Piano')
        Label_int.append(6)
    elif 'Clarinet' in direc:
        instrument.append(inst)
        Label.append('Clarinet')
        Label_int.append(1)
    elif 'Saxophone' in direc:
        instrument.append(inst)
        Label.append('Saxophone')
        Label_int.append(7)
    elif 'Violin' in direc:
        instrument.append(inst)
        Label.append('Violine')
        Label_int.append(5)
    elif 'Flute' in direc:
        instrument.append(inst)
        Label.append('Flute')
        Label_int.append(2)
    elif 'Cello' in direc:
        instrument.append(inst)
        Label.append('Cello')
        Label_int.append(4)
        
    else:
        Label.append('noise')
        instrument.append('various')
        Label_int.append(0)
restructure['Label'] = Label
restructure['tags'] = instrument
restructure['Label_int'] = Label_int


data = wave_manipulator.create_dataframe(p_wav, restructure)

In [22]:
data.to_pickle(p_home / ('UIOWA_match_IRMAS.pkl'))

In [56]:
data = pd.read_pickle(p_home / 'UIOWA_match_IRMAS.pkl')
overview, labeldict = analyze(data)

In [26]:
overview.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
13,0,various,7277
0,1,Bb Clarinet,185
7,1,Eb Clarinet,158
9,1,Bass Clarinet,185
3,2,Alto Flute,135
4,2,Bass Flute,140
12,2,Flute,303
10,3,Guitar,736
11,4,Cello,876
1,5,Violin,325


# UIOWA only top class
___

In [3]:
structure = pd.read_pickle(p_home / 'UIOWA_Structure.pkl')

instrument = []
Label_int = []
Label = []

restructure = structure.copy()

for index, row in restructure.iterrows():
    direc = row.Directory
    inst = direc.split('][')[2]
    types = direc.split('][')[1]
    if 'Pizzicato' in direc:
        instrument.append('various')
        Label.append('noise')
    elif 'Percussion' in direc:
        instrument.append('various')
        Label.append('noise')
    else:
        instrument.append(inst)
        Label.append(types)

restructure['tags'] = Label
restructure['Label'] = instrument

key = set(Label)
labeldict = dict([[x, y] for [y, x] in enumerate(key)])


for index, row in restructure.iterrows():
    Label_int.append(labeldict[row.tags])
restructure['Label_int'] = Label_int

data = wave_manipulator.create_dataframe(p_wav, restructure)



In [4]:
data.to_pickle(p_home / ('UIOWA_Top_Class.pkl'))

In [5]:
data = pd.read_pickle(p_home / 'UIOWA_Top_Class.pkl')
overview, labeldict = analyze(data)

In [6]:
overview.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
0,0,Strings,1738
1,1,Brass,812
2,2,Woodwinds,1917
3,3,noise,5912
4,4,Guitar,736
5,5,Piano,206
