<img src="header_anwender.png" align="left"/>

# Anwendungsbeispiel: Import von Audiodaten mit Klassifikation

Das Ziel dieses Beispiels ist es, die Arbeit mit Audiodaten – den Import, die Vorbereitung und die Klassifikation – zu erklären. Dabei werden folgende Schritte durchgeführt:

- Dynamisches Laden und Entpacken der Audiodaten von einer externen Quelle
- Review der Organisation auf dem Filesystem
- Laden der Daten
- Transformationen
- Training
- Analyse

Der verwendete Datensatz heisst ESC-50 [1] und enthält Geräusche aus 50 Klassen, die als einzelne Dateien organisiert sind. Jede Aufnahme ist 5 Sekunden lang; pro Klasse gibt es 40 Aufnahmen.


Der Code für das Beispiel ist aus [2],[3],[4] und [5] kombiniert.


Quellen für die Beispiele und Daten:

- [1] [https://github.com/karolpiczak/ESC-50/blob/master/LICENSE](https://github.com/karolpiczak/ESC-50/blob/master/LICENSE) (Hinweise auf Unterlizenzen der Daten)
- [2] [https://github.com/CarmineCella/esc50_keras/blob/master/esc50_keras.py](https://github.com/CarmineCella/esc50_keras/blob/master/esc50_keras.py)
- [3] [https://medium.com/@mikesmales/sound-classification-using-deep-learning-8bc2aa1990b7](https://medium.com/@mikesmales/sound-classification-using-deep-learning-8bc2aa1990b7)
- [4] [https://www.kaggle.com/msripooja/steps-to-convert-audio-clip-to-spectrogram](https://www.kaggle.com/msripooja/steps-to-convert-audio-clip-to-spectrogram)
- [5] [https://ipython-books.github.io/117-creating-a-sound-synthesizer-in-the-notebook/](https://ipython-books.github.io/117-creating-a-sound-synthesizer-in-the-notebook/)

Zitat der Datenquelle:
```
K. J. Piczak. ESC: Dataset for Environmental Sound Classification. Proceedings of the 23rd Annual ACM Conference on Multimedia, Brisbane, Australia, 2015.
```





In [None]:
import os
import fnmatch
import joblib
import librosa
import librosa.display
import numpy as np
import os.path
import zipfile
from urllib.request import urlretrieve

import pandas as pd

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD, Adam
from keras.callbacks import History
from keras.callbacks import LearningRateScheduler
from keras.callbacks import EarlyStopping

from keras.utils import np_utils
from sklearn.svm import SVC
from keras.preprocessing.image import ImageDataGenerator
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt


#
# Silence warning output for the rest of the notebook
#
from warnings import simplefilter

# Ignore FutureWarning explicitly ...
simplefilter('ignore', FutureWarning)
# ... and then every category derived from the Warning base class.
simplefilter('ignore', Warning)


In [None]:
#
# GPU support
#
import tensorflow as tf
print(tf.__version__)

# Only emit TensorFlow log messages of level ERROR.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Allocate GPU memory on demand instead of reserving it all up front.
# TensorFlow requires the memory-growth setting to be the same on all GPUs,
# so it is applied to every visible device, not only the first one.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

# Same on-demand allocation for the TF1-compat session used by the Keras backend.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))


# Hilfsfunktionen

In [None]:
# Remote archive and the local locations derived from the extraction folder.
urlDataSource = 'https://github.com/karoldvl/ESC-50/archive/master.zip'
localExtractionFolder = 'data/ESC-50'
localDataArchive = localExtractionFolder + '/master.zip'
audioData = '/'.join([localExtractionFolder, 'ESC-50-master', 'audio'])

# Sampling rate in Hz and the fixed clip length in samples (5 s at 22050 Hz).
sampleRate = 22050
sampleLen = 5 * sampleRate

In [None]:
#
# Download the data archive from a URL
#
def download_dataset(url, extraction_path, dataset_file_path):
    """Download the archive at ``url`` unless it is already stored locally.

    Args:
        url: Source URL of the archive.
        extraction_path: Directory that is created if it does not exist yet.
        dataset_file_path: Local file the archive is written to.
    """
    os.makedirs(extraction_path, exist_ok=True)

    # Check the path handed in by the caller rather than a notebook global,
    # so the function works for any target file.
    if os.path.exists(dataset_file_path):
        print("archive already downloaded.")
        return

    # urlretrieve does not create missing directories for the target file.
    os.makedirs(os.path.dirname(dataset_file_path) or '.', exist_ok=True)

    print("started loading archive from url {}".format(url))
    urlretrieve(url, dataset_file_path)
    print("finished loading archive from url {}".format(url))

def extract_dataset(dataset_file_path, extraction_directory):
    """Extract the zip archive ``dataset_file_path`` into ``extraction_directory``.

    The target directory is created if it does not exist yet.
    """
    os.makedirs(extraction_directory, exist_ok=True)
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(dataset_file_path) as archive:
        archive.extractall(path=extraction_directory)
    print("extraction of dataset from {} to {} done.".format(dataset_file_path,extraction_directory) )


# Laden der Daten

In [None]:
#
# Run the download
#
# download_dataset expects (url, extraction_path, dataset_file_path); the
# extraction folder was missing from the call, which raises a TypeError.
download_dataset(urlDataSource, localExtractionFolder, localDataArchive)

In [None]:
#
# Unpack the downloaded archive
#
extract_dataset(dataset_file_path=localDataArchive,
                extraction_directory=localExtractionFolder)

# Organisation von Audiodaten auf dem Filesystem

Die Audiodateien liegen alle in einem Verzeichnis. Die Zuordnung der Klasse ist im Dateinamen kodiert und zusätzlich in einer CSV-Metadatendatei (`meta/esc50.csv`) gespeichert.
Details dazu unter [[1]](https://github.com/karolpiczak/ESC-50)

<img src="info.png" align="left"/> 

In [None]:
#
# Read the metadata table
#
metadata_csv = localExtractionFolder + '/ESC-50-master/meta/esc50.csv'
df = pd.read_csv(metadata_csv)
df.head()

In [None]:
#
# Collect the class information
#
# Map every target id (as a string key) to its category name. The unique
# (target, category) pairs are taken straight from the table and sorted by
# target, so category names containing spaces cannot break the mapping and
# the printed order is deterministic.
class_pairs = df[['target', 'category']].drop_duplicates().sort_values('target')
classes = {
    str(target): category
    for target, category in zip(class_pairs['target'], class_pairs['category'])
}
print(classes)

# Analyse der Daten

In [None]:
#
# Read 5 files
#

# Number of clips to load for the visual checks below.
max_files = 5

x_check = []
count = 0
for root, _dirs, files in os.walk(audioData):
    # Sorted so the same clips are picked on every run.
    for item in sorted(fnmatch.filter(files, "*.wav")):
        soundFile = os.path.join(root, item)
        yt, sr = librosa.core.load (soundFile, mono=True)

        print('found file {} with data shape {} and sampling rate {}'.format(soundFile,yt.shape,sr))

        x_check.append(yt)
        count = count + 1
        if count >= max_files:
            break
    # The inner break only leaves the file loop; stop walking directories too.
    if count >= max_files:
        break

x_check = np.array(x_check)

In [None]:
#
# Show one clip as a PCM waveform
#
# Index of the loaded clip used for all plots below.
displayIndex = 1
x_show = x_check[displayIndex]

plt.figure(figsize=(11, 5))
# NOTE(review): newer librosa releases replaced waveplot with waveshow -
# confirm against the installed librosa version.
librosa.display.waveplot(x_show, sr=sampleRate)

In [None]:
# Short-time Fourier transform of the selected clip, magnitude converted to dB
X = librosa.stft(x_show)
Xdb = librosa.amplitude_to_db(np.abs(X))

# Spectrogram with a linear frequency axis
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar()

In [None]:
# Same dB spectrogram, this time with a logarithmic frequency axis
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
plt.colorbar()
# Waveform of the same clip in a second figure for comparison
plt.figure(figsize=(11.4, 5))
# NOTE(review): newer librosa releases replaced waveplot with waveshow -
# confirm against the installed librosa version.
librosa.display.waveplot(x_show, sr=sampleRate)

In [None]:
from IPython.display import (
    Audio, display, clear_output)

In [None]:
# Embed an audio player for the selected clip; playback starts automatically.
display(Audio(data=x_show, rate=sampleRate, autoplay=True))

# Erzeugen der Trainingsdaten

Als Feature wird die Constant-Q-Transformation (CQT) der Audiodaten verwendet, siehe https://en.wikipedia.org/wiki/Constant-Q_transform

<img src="info.png" align="left"/> 

In [None]:
#
# Compute the feature transformation for audio
# (constant-Q transform, CQT)
#

# hop length in samples; passed to get_features as `hop`
window = 1024
# number of CQT frequency bins
bins = 64

def get_features (file, hop, bins):
    """Return the log-scaled constant-Q magnitudes of one audio file.

    The file is loaded as mono audio and copied into a zero-filled buffer of
    ``sampleLen`` samples (longer signals are cut, shorter ones stay
    zero-padded). The result is ``log1p(1000 * |CQT|)`` computed with hop
    length ``hop`` and ``bins`` frequency bins.

    NOTE: for an empty file a message is printed and the scalar ``0`` is
    returned instead of an array.
    """
    signal, sr = librosa.core.load(file, mono=True)

    if len(signal) == 0:
        print('found empty file ' + file)
        return 0

    # Fixed-length buffer so every file yields features of the same shape.
    y = np.zeros(sampleLen)
    n_copy = min(len(y), len(signal))
    y[:n_copy] = signal[:n_copy]

    # Alternative feature (not used): MFCC, see
    # https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html
    # e.g. librosa.feature.mfcc(y=y, sr=sr, hop_length=hop, n_mfcc=bins)

    # https://librosa.github.io/librosa/generated/librosa.core.cqt.html
    spectrum = librosa.core.cqt(y=y, sr=sr, hop_length=hop, n_bins=bins)
    return np.log1p(1000 * np.abs(spectrum))

# On-disk cache for the feature computation, so repeated runs do not have to
# recompute the transform. joblib deprecated the `cachedir` argument in
# favour of `location`.
memory = joblib.Memory(location=localExtractionFolder+'/esc50_joblib', verbose=0)
cached_get_features = memory.cache(get_features)


def _iter_wav_files(root_path):
    """Yield the path of every ``*.wav`` file found below ``root_path``."""
    for root, _dirs, files in os.walk(root_path):
        for item in fnmatch.filter(files, "*.wav"):
            yield os.path.join(root, item)


def compute_features (root_path, max_samples=100):
    """Compute the feature tensor and labels for the audio files below ``root_path``.

    Args:
        root_path: Directory that is searched recursively for ``*.wav`` files.
        max_samples: Upper bound on the number of samples to collect;
            ``None`` processes every file. Defaults to 100, the limit that
            was previously hard-coded.

    Returns:
        Tuple ``(X_data, y_data, classes, samples)``: the features with shape
        ``(samples, d2, d3, 1)``, the integer class ids parsed from the file
        names, the number of classes (50) and the number of collected samples.

    Raises:
        ValueError: If no usable ``*.wav`` file was found.
    """
    classes = 50

    y_data = []
    X_data = []

    for path in _iter_wav_files(root_path):
        if max_samples is not None and len(X_data) >= max_samples:
            break

        # e.g. 2-39443-A-19.wav -> the class id is the fourth '-' separated field
        fileName = os.path.splitext(os.path.basename(path))[0]
        classID = int(fileName.split('-')[3])

        features = cached_get_features(path, window, bins)
        print(".",end='')

        # get_features returns the scalar 0 for empty files; such entries
        # cannot be stacked with the real feature arrays, so skip them.
        if not isinstance(features, np.ndarray):
            continue

        X_data.append(features)
        y_data.append(classID)

    samples = len(X_data)
    if samples == 0:
        raise ValueError('no usable .wav files found below {}'.format(root_path))

    # Stack along a leading sample axis ...
    X_data = np.stack(X_data, axis=0)
    print('shape features {}'.format(X_data.shape))

    # ... and append a trailing channel axis of size 1.
    X_data = X_data[..., np.newaxis]
    y_data = np.array(y_data)

    print('shape transformed {}'.format(X_data.shape))

    print ("samples = " + str (samples))

    return X_data, y_data, classes, samples

In [None]:
# NOTE: this rebinds `classes` (so far the id -> category dict) to the class
# count returned by compute_features.
x_data, y_data, classes, samples = compute_features (audioData)

In [None]:
# Features as 32-bit floats, labels as unsigned 8-bit integers
x_data = x_data.astype(np.float32)
y_data = y_data.astype(np.uint8)

# Anzeige der Features als Bild

In [None]:
# Drop the trailing channel axis instead of reshaping to a hard-coded
# (64, 108) size, so the cell keeps working when bins, hop length or clip
# length change.
x_show = x_data[displayIndex][:, :, 0]
plt.imshow(x_show)
plt.colorbar()

# Normalisieren der Daten

In [None]:
def standardize (x):
    """Standardize ``x`` along axis 0.

    The mean over axis 0 is subtracted and the result is divided by the
    standard deviation over axis 0; the float32 machine epsilon is added to
    the divisor so constant entries do not cause a division by zero.
    """
    center = np.mean(x, axis=0)
    spread = np.std(x, axis=0)
    guard = np.finfo('float32').eps
    return (x - center) / (guard + spread)


In [None]:
# Standardize over the sample axis. NOTE: the statistics are computed on the
# full data set, i.e. before the train/validation split further below.
x_data = standardize(x_data)

In [None]:
# convert the integer class ids to one-hot class matrices;
# `classes` is the class count returned by compute_features
y_data = np_utils.to_categorical(y_data, classes )

In [None]:
#
# Check again (after standardization)
#
# Drop the trailing channel axis instead of reshaping to a hard-coded
# (64, 108) size, so the cell keeps working when bins, hop length or clip
# length change.
x_show = x_data[displayIndex][:, :, 0]
plt.imshow(x_show)
plt.colorbar()


# Train und Test Split

In [None]:
#
# Split the data into training and test (validation) sets
#
(x_train, x_validation,
 y_train, y_validation) = train_test_split(
    x_data, y_data, test_size=0.25, random_state=42)

print('shapes {} {} {} {}'.format(
    *(part.shape for part in (x_train, x_validation, y_train, y_validation))))

# Model bauen

In [None]:
#
# Build a simple model
#
def createModel():
    """Build the convolutional classifier for the feature images.

    The input shape is taken from the global ``x_train`` (rows, columns, one
    channel) and the size of the softmax output from the global ``classes``.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    img_rows = x_train.shape[1]
    img_cols = x_train.shape[2]

    model = Sequential()

    # First convolution block. `padding` is the Keras 2 name of the former
    # `border_mode` argument, matching the Keras 2 `kernel_size` / `strides`
    # arguments already used here.
    model.add(Convolution2D(32, kernel_size=5, strides=1, padding='same', input_shape=(img_rows, img_cols, 1)))
    model.add(Activation('relu'))
    model.add(Convolution2D(32, kernel_size=3, strides=1))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.6))

    # Second convolution block
    model.add(Convolution2D(64, kernel_size=3, strides=1, padding='same'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, kernel_size=3, strides=1))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.4))

    # Dense classifier head with one softmax output per class
    model.add(Flatten())
    model.add(Dense(1024))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(classes))
    model.add(Activation('softmax'))
    return model


In [None]:
#
# Create the model and print its layer summary
#
model_cnn = createModel()
model_cnn.summary()

In [None]:
#
# Choose the optimizer and compile the model
#
optimizer = Adam()
model_cnn.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
#
# Create the data generator with augmentation
# (only random width/height shifts are enabled; everything else is off)
#
datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
            width_shift_range=0.4,  # randomly shift images horizontally (fraction of total width)
            height_shift_range=0.4,  # randomly shift images vertically (fraction of total height)
            horizontal_flip=False,  # randomly flip images horizontally
            vertical_flip=False)  # randomly flip images vertically

In [None]:
#
# Fit the generator on the training data
#
# NOTE(review): presumably only needed for the featurewise / ZCA options,
# which are all disabled above - confirm whether this call has any effect.
datagen.fit(x_train)

In [None]:
# Callbacks used during training: only the History recorder is active.
callbacks = [History()]
# Alternative with early stopping on the validation loss (disabled):
#callbacks = [EarlyStopping(monitor='val_loss', patience=4, verbose=1, mode='auto'), History()]

In [None]:
#
# Train the model
#
batch_size = 64

# Keras 2 replaced `samples_per_epoch` / `nb_epoch` with `steps_per_epoch` /
# `epochs`. `steps_per_epoch` counts batches, not samples, so it is the
# number of batches needed to see every training sample once.
history = model_cnn.fit_generator(
    datagen.flow(x_train, y_train, batch_size=batch_size),
    steps_per_epoch=int(np.ceil(x_train.shape[0] / batch_size)),
    epochs=2400,
    callbacks=callbacks,
    validation_data=(x_validation, y_validation))

In [None]:
#
# Evaluate the model on the validation data
#
score = model_cnn.evaluate(x_validation, y_validation, verbose=0)
# Index 1 is the first metric after the loss value.
metric_name, metric_value = model_cnn.metrics_names[1], score[1]
print("validation {} {:.3f}".format(metric_name, metric_value))

In [None]:
#
# Plot the training history
#
def summarize_diagnostics(history,modelname):
    """Plot loss and accuracy curves of a training run and save the figure.

    Args:
        history: Keras ``History`` object; its ``history`` dict is read.
        modelname: Prefix of the file written to
            ``results/<modelname>_plot.png``.
    """
    metrics = history.history
    # Older Keras releases record the metric as 'acc' instead of 'accuracy'.
    acc_key = 'accuracy' if 'accuracy' in metrics else 'acc'

    # The file imports matplotlib.pyplot as `plt`; the name `pyplot` is not
    # defined anywhere.
    plt.subplot(211)
    plt.title('Cross Entropy Loss')
    plt.plot(metrics['loss'], color='blue', label='train')
    plt.plot(metrics['val_loss'], color='green', label='test')
    plt.legend()  # the curve labels are only shown with a legend

    plt.subplot(212)
    plt.title('Classification Accuracy')
    plt.plot(metrics[acc_key], color='blue', label='train')
    plt.plot(metrics['val_' + acc_key], color='green', label='test')
    plt.legend()

    plt.subplots_adjust(hspace=0.5)
    # savefig does not create the output directory itself.
    os.makedirs('results', exist_ok=True)
    plt.savefig( 'results/' + modelname + '_plot.png')
    plt.show()
    plt.close()

In [None]:
summarize_diagnostics(history,'06_model_cnn')

In [None]:
#
# Speichern des Modelles
#
from keras.models import model_from_json
# Output files for the model architecture (JSON) and its weights (HDF5)
prefix = 'results/06_'
modelName = '{}model.json'.format(prefix)
weightName = '{}model.h5'.format(prefix)

In [None]:
# Saving is disabled; change the condition to True to write the model to disk.
if False:
    # serialize the architecture to JSON
    model_json = model_cnn.to_json()
    with open( modelName , "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model_cnn.save_weights( weightName )
    print("saved model to disk as {} {}".format(modelName,weightName))

In [None]:
#
# Load a pre-trained model
#
# Loading is disabled; change the condition to True to read the model from disk.
if False:
    # The context manager closes the file even if reading fails.
    with open(modelName, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(weightName)
    print("loaded model from disk")

In [None]:
#
# Evaluate the loaded model on the validation data
#
# `loaded_model` only exists if the (disabled by default) loading cell was
# run; without this guard the cell fails with a NameError.
if 'loaded_model' in globals():
    loaded_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    score = loaded_model.evaluate(x_validation, y_validation, verbose=0)
    print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
else:
    print("no loaded model available - enable the loading cell first")