In [3]:
import pandas as pd
import numpy as np
import os, pathlib
import librosa
from pathlib import Path
import glob
import pickle
import shutil
import datetime
import wave_manipulator
import DataLoad
from sklearn.model_selection import train_test_split

p_home = Path.cwd() / 'data'
p_temp1 = Path.cwd() / 'data' / 'UIOWA' / 'tmp1'
p_wav = Path.cwd() / 'data' / 'UIOWA' / 'wav'




# IRMAS Dataset Base
___

### Create

In [4]:
Dataset = 'IRMAS_Data_Base'

In [5]:
_, df, label_dict = DataLoad.load_to_dataframe()
df.to_pickle(p_home / (Dataset + '.pkl'))


In [12]:
label_dict

{0: 'cel',
 1: 'cla',
 2: 'flu',
 3: 'gac',
 4: 'gel',
 5: 'org',
 6: 'pia',
 7: 'sax',
 8: 'tru',
 9: 'vio',
 10: 'voi'}

### Load

In [7]:
data = pd.read_pickle(p_home / (Dataset + '.pkl'))
overview, labeldict = wave_manipulator.analyze(data)

In [8]:
overview

Unnamed: 0,Label,Instrument,Nos
0,4,gel,760
1,10,voi,778
2,2,flu,451
3,0,cel,388
4,8,tru,577
5,7,sax,626
6,1,cla,505
7,6,pia,721
8,5,org,682
9,9,vio,580


In [9]:
labeldict

{'gel': 4,
 'voi': 10,
 'flu': 2,
 'cel': 0,
 'tru': 8,
 'sax': 7,
 'cla': 1,
 'pia': 6,
 'org': 5,
 'vio': 9,
 'gac': 3}

### Train test split
___

In [10]:
X_train, X_test, = train_test_split(data, test_size=0.3, stratify=data['labels'])

X_train.to_pickle(p_home / (Dataset + '_train.pkl'))
X_test.to_pickle(p_home / (Dataset + '_test.pkl'))
count, dict_ = wave_manipulator.analyze(X_train)

In [11]:
wave_manipulator.analyze(X_train)

(    Label Instrument  Nos
 0       4        gel  532
 1      10        voi  544
 2       2        flu  316
 3       7        sax  438
 4       8        tru  404
 5       0        cel  272
 6       1        cla  353
 7       6        pia  505
 8       5        org  477
 9       9        vio  406
 10      3        gac  446,
 {'gel': 4,
  'voi': 10,
  'flu': 2,
  'sax': 7,
  'tru': 8,
  'cel': 0,
  'cla': 1,
  'pia': 6,
  'org': 5,
  'vio': 9,
  'gac': 3})

# IRMAS Dataset match IOWA
___

In [33]:
Dataset = 'IRMAS_Data_match_IOWA'

### Load Base

In [34]:
data = pd.read_pickle(p_home / 'IRMAS_Data_Base.pkl')

### Create

In [42]:
labels = []
tags = []
indexes = []
raw_data = []
wave_path = []
sr = []

for index, row in data.iterrows():
    indexes.append(index)
    raw_data.append(row.raw_sounds)
    wave_path.append(row.wav_path)
    sr.append(row.sample_rate)
    if row.labels == 0:
        labels.append(4)
        tags.append('cel')
    elif row.labels ==1:
        labels.append(1)
        tags.append('cla')
    elif row.labels == 2:
        labels.append(2)
        tags.append('flu')
    elif row.labels == 3:
        labels.append(3)
        tags.append('gac')
    elif row.labels == 4:
        labels.append(0)
        tags.append('noise')
    elif row.labels == 5:
        labels.append(0)
        tags.append('noise')
    elif row.labels == 6:
        labels.append(6)
        tags.append('pia')
    elif row.labels == 7:
        labels.append(7)
        tags.append('sax')
    elif row.labels == 8:
        labels.append(8)
        tags.append('tru')
    elif row.labels == 9:
        labels.append(5)
        tags.append('vio')
    elif row.labels == 10:
        labels.append(0)
        tags.append('noise')
        

        
        
        
        
data = pd.DataFrame(list(zip(labels, tags, wave_path, raw_data, sr)),
                    columns = ['labels', 'tags', 'wav_path', 'raw_sounds', 'sample_rate'])
       
data.to_pickle(p_home / (Dataset + '.pkl'))


### Load

In [43]:
data = pd.read_pickle(p_home / (Dataset + '.pkl'))

In [44]:
count, dict_ = wave_manipulator.analyze(data)


### Train test split
___

In [46]:
X_train, X_test, = train_test_split(data, test_size=0.3, stratify=data['labels'])

X_train.to_pickle(p_home / (Dataset + '_train.pkl'))
X_test.to_pickle(p_home / (Dataset + '_test.pkl'))
count, dict_ = wave_manipulator.analyze(X_train)

### Analyse

In [47]:
count.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
2,0,noise,1554
6,1,cla,353
7,2,flu,316
3,3,gac,446
4,4,cel,271
1,5,vio,406
8,6,pia,505
0,7,sax,438
5,8,tru,404


In [28]:
dict_

{'noise': 0,
 'cel': 4,
 'cla': 1,
 'sax': 7,
 'tru': 8,
 'gac': 3,
 'flu': 2,
 'pia': 6,
 'vio': 5}

# UIOWA
___

# IOWA all Instruments
___

In [2]:
Dataset = 'UIOWA_Data_All'

In [3]:
structure = pd.read_pickle(p_home / 'UIOWA_Structure.pkl')

instrument = []
Label_int = []
Label = []

restructure = structure.copy()

for index, row in restructure.iterrows():
    direc = row.Directory
    inst = direc.split('][')[2]
    if 'Pizzicato' in direc:
        instrument.append('noise')
    elif 'Percussion' in direc:
        instrument.append('noise')
    else:
        instrument.append(inst)

restructure['tags'] = instrument
restructure['Label'] = instrument

key = set(instrument)
key_l = [[x, y] for [y, x] in enumerate(key)]
labeldict = dict(key_l)

for index, row in restructure.iterrows():
    Label_int.append(labeldict[row.tags])
restructure['Label_int'] = Label_int

data = wave_manipulator.create_dataframe(p_wav, restructure)



In [4]:
data.to_pickle(p_home / (Dataset + '.pkl'))

### Train test split

In [5]:
X_train, X_test, = train_test_split(data, test_size=0.3, stratify=data['labels'])
X_train.to_pickle(p_home / (Dataset + '_train.pkl'))
X_test.to_pickle(p_home / (Dataset + '_test.pkl'))
count, dict_ = wave_manipulator.analyze(X_train)

In [6]:
labeldict

{'Double Bass': 0,
 'Bass Clarinet': 1,
 'Guitar': 2,
 'Eb Clarinet': 3,
 'Flute': 4,
 'Horn': 5,
 'Piano': 6,
 'Eb Alto Saxophone': 7,
 'Violin': 8,
 'Bb Clarinet': 9,
 'Tenor Trombone': 10,
 'Tuba': 11,
 'Bassoon': 12,
 'Viola': 13,
 'Bb Soprano Saxophone': 14,
 'Bass Flute': 15,
 'Oboe': 16,
 'Bb Trumpet': 17,
 'Cello': 18,
 'noise': 19,
 'Alto Flute': 20,
 'Bass Trombone': 21}

In [7]:
count.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
0,0,Double Bass,163
1,1,Bass Clarinet,129
2,2,Guitar,515
3,3,Eb Clarinet,111
4,4,Flute,212
5,5,Horn,98
8,6,Piano,144
6,7,Eb Alto Saxophone,179
7,8,Violin,227
9,9,Bb Clarinet,130


# UIOWA match IRMAS
___

In [8]:
Dataset = 'UIOWA_Data_match_IRMAS'

### Create

In [9]:
structure = pd.read_pickle(p_home / 'UIOWA_Structure.pkl')


instrument = []
Label = []
Label_int = []
restructure = structure.copy()

for index, row in restructure.iterrows():
    direc = row.Directory
    inst = direc.split('][')[2]
    if 'Pizzicato' in direc:
        Label.append('noise')
        instrument.append('various')
        Label_int.append(0)
    elif 'Percussion' in direc:
        Label.append('noise')
        instrument.append('various')
        Label_int.append(0)
    elif 'Trumpet' in direc:
        instrument.append(inst)
        Label.append('Trumpet')
        Label_int.append(8)
    elif 'Guitar' in direc:
        instrument.append(inst)
        Label.append('Guitar')
        Label_int.append(3)
    elif 'Piano' in direc:
        instrument.append(inst)
        Label.append('Piano')
        Label_int.append(6)
    elif 'Clarinet' in direc:
        instrument.append(inst)
        Label.append('Clarinet')
        Label_int.append(1)
    elif 'Saxophone' in direc:
        instrument.append(inst)
        Label.append('Saxophone')
        Label_int.append(7)
    elif 'Violin' in direc:
        instrument.append(inst)
        Label.append('Violine')
        Label_int.append(5)
    elif 'Flute' in direc:
        instrument.append(inst)
        Label.append('Flute')
        Label_int.append(2)
    elif 'Cello' in direc:
        instrument.append(inst)
        Label.append('Cello')
        Label_int.append(4)
        
    else:
        Label.append('noise')
        instrument.append('various')
        Label_int.append(0)
restructure['Label'] = Label
restructure['tags'] = instrument
restructure['Label_int'] = Label_int


data = wave_manipulator.create_dataframe(p_wav, restructure)

In [10]:
data.to_pickle(p_home / (Dataset + '.pkl'))

In [11]:
data = pd.read_pickle(p_home / (Dataset + '.pkl'))
overview, labeldict = wave_manipulator.analyze(data)

### Train test split
___

In [12]:
X_train, X_test, = train_test_split(data, test_size=0.3, stratify=data['labels'])
X_train.to_pickle(p_home / (Dataset + '_train.pkl'))
X_test.to_pickle(p_home / (Dataset + '_test.pkl'))
count, dict_ = wave_manipulator.analyze(X_train)

In [13]:
count.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
6,0,various,5093
0,1,Bass Clarinet,115
10,1,Bb Clarinet,132
11,1,Eb Clarinet,123
3,2,Flute,216
9,2,Alto Flute,89
13,2,Bass Flute,100
4,3,Guitar,515
2,4,Cello,613
7,5,Violin,228


# UIOWA only top class
___

In [14]:
Dataset = 'UIOWA_Data_Topclass'

In [15]:
structure = pd.read_pickle(p_home / 'UIOWA_Structure.pkl')

instrument = []
Label_int = []
Label = []

restructure = structure.copy()

for index, row in restructure.iterrows():
    direc = row.Directory
    inst = direc.split('][')[2]
    types = direc.split('][')[1]
    if 'Pizzicato' in direc:
        instrument.append('various')
        Label.append('noise')
    elif 'Percussion' in direc:
        instrument.append('various')
        Label.append('noise')
    else:
        instrument.append(inst)
        Label.append(types)

restructure['tags'] = Label
restructure['Label'] = instrument

key = set(Label)
labeldict = {i[1] : i[0] for i in enumerate(key)}


for index, row in restructure.iterrows():
    Label_int.append(labeldict[row.tags])
restructure['Label_int'] = Label_int

data = wave_manipulator.create_dataframe(p_wav, restructure)



In [16]:
data.to_pickle(p_home / (Dataset + '.pkl'))

In [19]:
data = pd.read_pickle(p_home / 'UIOWA_Data_Topclass.pkl')
overview, labeldict = wave_manipulator.analyze(data)

In [20]:
overview.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
0,0,Brass,812
1,1,Woodwinds,1917
2,2,noise,5912
3,3,Guitar,736
4,4,Piano,206
5,5,Strings,1738


### Train test split
___

In [21]:
X_train, X_test, = train_test_split(data, test_size=0.3, stratify=data['labels'])
X_train.to_pickle(p_home / (Dataset + '_train.pkl'))
X_test.to_pickle(p_home / (Dataset + '_test.pkl'))
count, dict_ = wave_manipulator.analyze(X_train)

In [22]:
count.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
0,0,Brass,568
1,1,Woodwinds,1342
2,2,noise,4138
3,3,Guitar,515
4,4,Piano,144
5,5,Strings,1217


In [23]:
count, dict_ = wave_manipulator.analyze(X_test)
count.sort_values(by='Label')

Unnamed: 0,Label,Instrument,Nos
0,0,Brass,244
1,1,Woodwinds,575
2,2,noise,1774
3,3,Guitar,221
4,4,Piano,62
5,5,Strings,521


In [13]:
structure = pd.read_pickle(p_home / 'UIOWA_Structure.pkl')

In [14]:
structure

Unnamed: 0,Directory,Nos
0,[UIOWA][Piano][Piano][unused],206
1,[UIOWA][Guitar][Guitar][unused],736
2,[UIOWA][Strings][Viola][Arco],304
3,[UIOWA][Strings][Viola][Pizzicato],781
4,[UIOWA][Strings][Violin][Pizzicato],643
...,...,...
59,[UIOWA][Percussion][Xylophone][Xylophone],119
60,[UIOWA][Percussion][Xylophone][unused],380
61,[UIOWA][Strings][Cello][Arco],876
62,[UIOWA][Strings][Cello][Pizzicato],1081
