## Проект по детекции эмоций по аудио

In [1]:
import pandas as pd
import numpy as np
import os
import sys

import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint

import joblib

#### Загрузка датасета и проверка классов

In [2]:
data_path = 'C:\projects\Python\ml\DAILY_WORK\project_audio\dataset'

In [8]:
for files in os.listdir(data_path):
    print(files)

OAF_angry
OAF_disgust
OAF_Fear
OAF_happy
OAF_neutral
OAF_Pleasant_surprise
OAF_Sad
YAF_angry
YAF_disgust
YAF_fear
YAF_happy
YAF_neutral
YAF_pleasant_surprised
YAF_sad


#### Распределение количества данных внутри каждого класса

In [30]:
data_amount_info = {}
data_duration_info = {}

for emotion in os.listdir(data_path):
    print(emotion[4:])
    emotion_path = os.path.join(data_path, files)
    for file in os.listdir(emotion_path):
        file_path = os.path.join(emotion_path, file)
        if file.endswith(".wav"):
            duration = librosa.get_duration(path=file_path)  # продолжительность файла
            data_duration_info[emotion] = data_duration_info.get(emotion, 0) + duration  # добавляем продолжительность к нужной эмоции

            data_amount_info[emotion] = data_amount_info.get(emotion, 0) + 1  # считаем количество файлов
        else:
            print('Найден файл с некорректным разрешением')

angry
disgust
Fear
happy
neutral
Pleasant_surprise
Sad
angry
disgust
fear
happy
neutral
pleasant_surprised
sad


In [31]:
# средняя продолжительность звукового фрагмента данных
for emotion in data_duration_info.keys():
    data_duration_info[emotion] /= data_amount_info[emotion]

In [32]:
data_amount_info

{'OAF_angry': 200,
 'OAF_disgust': 200,
 'OAF_Fear': 200,
 'OAF_happy': 200,
 'OAF_neutral': 200,
 'OAF_Pleasant_surprise': 200,
 'OAF_Sad': 200,
 'YAF_angry': 200,
 'YAF_disgust': 200,
 'YAF_fear': 200,
 'YAF_happy': 200,
 'YAF_neutral': 200,
 'YAF_pleasant_surprised': 200,
 'YAF_sad': 200}

In [33]:
data_duration_info

{'OAF_angry': 2.2683728598345207,
 'OAF_disgust': 2.2683728598345207,
 'OAF_Fear': 2.2683728598345207,
 'OAF_happy': 2.2683728598345207,
 'OAF_neutral': 2.2683728598345207,
 'OAF_Pleasant_surprise': 2.2683728598345207,
 'OAF_Sad': 2.2683728598345207,
 'YAF_angry': 2.2683728598345207,
 'YAF_disgust': 2.2683728598345207,
 'YAF_fear': 2.2683728598345207,
 'YAF_happy': 2.2683728598345207,
 'YAF_neutral': 2.2683728598345207,
 'YAF_pleasant_surprised': 2.2683728598345207,
 'YAF_sad': 2.2683728598345207}

В датасете одинаковое количество данных на каждую эмоцию и одинаковая средняя длина записи

#### Демонстрация случайных записей

In [34]:
def create_waveplot(data, sr, e, ax):
    plt.title('Waveplot for audio with {} emotion'.format(e), size=15)
    return librosa.display.waveshow(data, sr=sr, ax=ax)

def create_spectrogram(data, sr, e, ax):
    X = librosa.stft(data)
    Xdb = librosa.amplitude_to_db(abs(X))
    plt.title('Spectrogram for audio with {} emotion'.format(e), size=15)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz', ax=ax)
    plt.colorbar(ax=ax)

In [36]:
for emotion in os.listdir(data_path):
    emotion_path = os.path.join(data_path, emotion)
    for file in os.listdir(emotion_path):
        k = np.randint(0, data_amount_info[emotion])
        for _ in range(k): continue  # нужно чтобы брать случайный файл из датасета
        file_path = os.path.join(emotion_path, file)
        if file.endswith(".wav"):
            data, sr = librosa.load(file_path)
            create_waveplot(data, sr, emotion)
            create_spectrogram(data, sr, emotion)
            break
    break

TypeError: create_waveplot() missing 1 required positional argument: 'ax'

### Попытка построить нейронку

In [44]:
training_set, validation_set = keras.utils.audio_dataset_from_directory(
    directory=data_path,
    batch_size=16,
    validation_split=0.2,
    output_sequence_length=16_000,
    subset='both',
    seed=42
)

Found 2800 files belonging to 14 classes.
Using 2240 files for training.
Using 560 files for validation.


In [45]:
label_names = training_set.class_names
label_names

['OAF_Fear',
 'OAF_Pleasant_surprise',
 'OAF_Sad',
 'OAF_angry',
 'OAF_disgust',
 'OAF_happy',
 'OAF_neutral',
 'YAF_angry',
 'YAF_disgust',
 'YAF_fear',
 'YAF_happy',
 'YAF_neutral',
 'YAF_pleasant_surprised',
 'YAF_sad']

In [46]:
def squeeze(audio, labels):
    '''перевод звука в формат mono'''
    audio = tf.squeeze(audio, axis=-1)
    return audio, labels

In [47]:
training_set = training_set.map(squeeze, tf.data.AUTOTUNE)
validation_set = validation_set.map(squeeze, tf.data.AUTOTUNE)

In [48]:
def get_spectrogram(waveform): 
    '''Преобразование данных в спектрограмму'''
    # кратковременное преобразование Фурье STFT
    spectrogram = tf.signal.stft(waveform, frame_length=255, frame_step=128) 
    spectrogram = tf.abs(spectrogram)
    return spectrogram[..., tf.newaxis]

In [49]:
def get_spectrogram_dataset(dataset):
    dataset = dataset.map(lambda x, y: (get_spectrogram(x), y),
                          num_parallel_calls=tf.data.AUTOTUNE)
    return dataset

In [50]:
train_set = get_spectrogram_dataset(training_set) 
validation_set = get_spectrogram_dataset(validation_set) 
  
val_set = validation_set.take(validation_set.cardinality() // 2) 
test_set = validation_set.skip(validation_set.cardinality() // 2)

In [51]:
input_shape = next(iter(train_set))[0][0].shape 
print("Input shape:", input_shape) 
num_labels = len(label_names)

Input shape: (124, 129, 1)


In [35]:
model = tf.keras.Sequential([ 
        tf.keras.layers.Input(shape=input_shape), 

        tf.keras.layers.Resizing(64, 64), 
        tf.keras.layers.Normalization(), 
          
        tf.keras.layers.Conv2D(64, 3, activation='relu'), 
        tf.keras.layers.Conv2D(128, 3, activation='relu'), 
        tf.keras.layers.MaxPooling2D(), 
        tf.keras.layers.Dropout(0.5), 
        tf.keras.layers.Flatten(), 
          
        tf.keras.layers.Dense(256, activation='relu'), 
        tf.keras.layers.Dropout(0.5), 
          
        tf.keras.layers.Dense(num_labels, activation='softmax')
])

In [36]:
model.compile( 
    optimizer=tf.keras.optimizers.Adam(), 
    loss=tf.keras.losses.SparseCategoricalCrossentropy(), 
    metrics=['accuracy'] 
)

In [37]:
epochs = 10
verbose = True

monitor = 'val_loss'
min_delta = 0.01
patience = 3

callbacks = [keras.callbacks.EarlyStopping(monitor=monitor, min_delta=min_delta, patience=patience, verbose=verbose)]

In [34]:
history = model.fit(
    train_set,
    validation_data=val_set,
    epochs=epochs,
    callbacks=callbacks
)

Epoch 1/10

InvalidArgumentError: Graph execution error:

Header mismatch: Expected RIFF but found FORM
	 [[{{node DecodeWav}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_7828]