##### *Reminder - use daily log to track progress

In [2]:
#!pip install librosa

In [5]:
import pandas as pd
import numpy as np
import os, pathlib
from sklearn.model_selection import train_test_split
import librosa
import torch
import IPython.display as ipd
from random import randint

In [6]:
#!sudo apt-get install wget unzip

In [7]:
# Download training data
#!mkdir -p data && cd data && wget https://irmas-dataset.s3-eu-west-1.amazonaws.com/IRMAS-TrainingData.zip && unzip IRMAS-TrainingData.zip 

In [8]:
#Download testing data (Around 2GB each)
#wget https://irmas-dataset.s3-eu-west-1.amazonaws.com/IRMAS-TestingData-Part1.zip && IRMAS-TestingData-Part1.zip
#wget https://irmas-dataset.s3-eu-west-1.amazonaws.com/IRMAS-TestingData-Part2.zip && IRMAS-TestingData-Part2.zip
#wget https://irmas-dataset.s3-eu-west-1.amazonaws.com/IRMAS-TestingData-Part3.zip && IRMAS-TestingData-Part3.zip    

IRMAS dataset (training)
 
Audio files: 6705 audio files in 16 bit stereo wav format sampled at 44.1kHz. They are excerpts of 3 seconds from more than 2000 distinct recordings. 

Annotations: The annotation of the predominant instrument of each excerpt is both in the name of the containing folder, and in the file name: cello (cel), clarinet (cla), flute (flu), acoustic guitar (gac), electric guitar (gel), organ (org), piano (pia), saxophone (sax), trumpet (tru), violin (vio), and human singing voice (voi). The number of files per instrument are: cel(388), cla(505), flu(451), gac(637), gel(760), org(682), pia(721), sax(626), tru(577), vio(580), voi(778). 

Additionally, some of the files have annotations in the filename regarding the presence ([dru]) or non presence([nod]) of drums, and the musical genre: country-folk ([cou_fol]), classical ([cla]), pop-rock ([pop-roc]), latin-soul ([lat-sou]).

In [9]:
IRMAS_TRAINING = 'data/IRMAS-TrainingData'
base_path = pathlib.Path(IRMAS_TRAINING)
classes, paths = [], []

for p in base_path.glob('*/*'):
    relative_path = p.relative_to(base_path)
    classes.append(str(relative_path.parent))
    paths.append(p)
    
#classes = set(classes)
df = pd.DataFrame({"tags": classes, "wav_path": paths}).sample(frac=1)#.reset_index(drop=True)#, inplace=True)
#df_training.piano = df.tags.map({'pia': True, '': 0})
#df.tags = df.tags.map({'pia': True, })
df.reset_index(drop=True, inplace=True)
df.head(5)

Unnamed: 0,tags,wav_path
0,voi,data/IRMAS-TrainingData/voi/124__[voi][dru][la...
1,gac,data/IRMAS-TrainingData/gac/[gac][cla]0608__3.wav
2,gel,data/IRMAS-TrainingData/gel/023__[gel][dru][po...
3,gel,data/IRMAS-TrainingData/gel/057__[gel][dru][ja...
4,gac,data/IRMAS-TrainingData/gac/[gac][cla]0533__1.wav


In [10]:
df.shape

(6705, 2)

In [11]:
classes = set(classes)

In [12]:
classes

{'cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio', 'voi'}

In [13]:
amount_of_labels = len(classes)

In [14]:
classes = list(classes)

In [15]:
type(classes)

list

In [16]:
#inst_dict = {1:'cel', 2:'cla', 3:'flu', 4:'gac', 5:'gel',6:'org',7:'pia',8:'sax',9:'tru',10:'vio',11:'voi'}

In [17]:
class_dict = { i: classes[i] for i in range(0, len(classes))}

In [18]:
class_dict

{0: 'tru',
 1: 'cel',
 2: 'pia',
 3: 'voi',
 4: 'vio',
 5: 'cla',
 6: 'gel',
 7: 'gac',
 8: 'org',
 9: 'flu',
 10: 'sax'}

In [19]:
class_dict[9]

'flu'

In [20]:
df_train, df_test = train_test_split(df, test_size=0.3)

In [21]:
df_train.shape

(4693, 2)

In [22]:
df_test.shape

(2012, 2)

In [23]:
# Test if files are loading
successful, corrupted = [], []
len_x, sample_rates = [], []
raw_samples, sample_rates = [], []
for p in df.wav_path:
    try:
        x, sr = librosa.load(p, sr=None)
        successful.append(p)
        len_x.append(len(x))
        raw_samples.append(x)
        sample_rates.append(sr)
    except:
        corrupted.append(p)
#       print(p)
###df = df[~df.wav_path.isin(corrupted)]
assert len(successful) == len(raw_samples)
len(successful), len(corrupted), len(raw_samples), len(sample_rates)

(6705, 0, 6705, 6705)

In [24]:
set(len_x), set(sample_rates), 132299/44100

({132299}, {44100}, 2.9999773242630385)

In [25]:
df_raw = df.copy()

In [27]:
df_raw['raw_sounds'] = raw_samples
df_raw['sample_rate'] = sample_rates
df_raw

In [None]:
import json

#with open('Data/IRMAS_raw.json', 'w') as f:
#    json.dumps(df.T.to_dict(orient='list'), f)
#df_raw.to_json('Data/IRMAS_raw.json')
#df_raw.to_json('Data/IRMAS_raw.json')

In [19]:
class InstrumentClassificationDataset():
    def __init__(self, df):
        super().__init__()
        self.df = df.copy()
#        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sound = self.load_from_disk(index)
        label = self.load_label(index)
#        Xi = self.image_transform(Xi)
        return sound, label

    def load_to_librosa(self, path):
        image = librosa.load(path)
        return image

    def load_from_disk(self, index):
        wav_path = df.iloc[index].wav_path 
        return self.load_to_librosa(wav_path)

    def load_label(self, index):
        label = df.iloc[index].tags
        return label

In [20]:
class BaseSampler():
    def __init__(self, df, list_of_instruments, n_samples):
        self.df = df.copy()
        self.n_samples = n_samples
        self.instruments = list_of_instruments
        #df = df[self.instruments]
        df = df[df['tags'].isin(self.instruments)]
        
    def __iter__(self):
        return iter(self._get_sample())
        
    def __len__(self):
        return self.n_samples
    
    def _get_samples(self):
        return np.random.choice(len(self.df), self.n_samples, replace=False)

In [21]:
# define the instruments we are using
list_of_instruments = ['sax', 'pia']

In [22]:
bs = BaseSampler(df, list_of_instruments, 30)

In [23]:
bs._get_samples()

array([ 794,  254, 5836, 6570, 2048,  953, 3287, 5205, 5680,  859, 5639,
       3228, 1066,  489, 2200, 5034, 5284, 3052, 1599, 2621, 4725, 1140,
       2022, 6590, 5689, 1100, 1986, 5000,   76,  595])

In [24]:
batch_index = bs._get_samples()

In [25]:
ds_train = InstrumentClassificationDataset(df_train)

In [26]:
def sound_generator(df, inst, batch_size = 30):
    df = df[df['tags'].isin(inst)].copy()
    while True:
        batch_x, labels = [], []
        batch_index = np.random.choice(len(df), batch_size, replace=False)
        for idx in batch_index:
            wav_path = df.iloc[idx].wav_path 
            sound_frame, sr = librosa.load(wav_path, sr=None)
            batch_x.append(sound_frame)
            labels.append(df.iloc[idx].tags)
#        batch_x = np.array( batch_x )
#        labels = np.array (labels)
        yield( batch_x, labels )

In [27]:
sg  = sound_generator(df_train, list_of_instruments, 30)

In [28]:
sounds, labels = next(sg)

In [29]:
sounds = np.array(sounds)

In [30]:
sounds.shape

(30, 132299)

In [31]:
sounds[1], labels[1]

(array([-0.07740784, -0.06619263, -0.05935669, ..., -0.31373596,
        -0.28694153, -0.25541687], dtype=float32),
 'sax')

In [32]:
len(sounds[1]), min(sounds[1]), max(sounds[1])

(132299, -0.85964966, 0.82769775)

In [33]:
# Check if the sounds and labels match 
random = randint(0,29)
print(labels[random])
ipd.Audio(sounds[random], rate=44100)

sax


In [56]:
from tensorflow.keras import layers, models

model = models.Sequential()

#model.add(layers.Conv1D(1, kernel_size = 200, input_shape=(13299,1))) #
model.add(layers.Conv1D(filters = 64, kernel_size=200, activation='relu', input_shape=(13299,1)))
model.add(layers.MaxPooling1D(pool_size=4))
#model.add(LSTM(64))
model.add(layers.Dense(512, activation='linear'))
model.add(layers.Dense(amount_of_labels))
model.add(layers.Activation('softmax'))
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_13 (Conv1D)           (None, 13100, 64)         12864     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 3275, 64)          0         
_________________________________________________________________
dense_6 (Dense)              (None, 3275, 512)         33280     
_________________________________________________________________
dense_7 (Dense)              (None, 3275, 11)          5643      
_________________________________________________________________
activation (Activation)      (None, 3275, 11)          0         
Total params: 51,787
Trainable params: 51,787
Non-trainable params: 0
_________________________________________________________________
