In [52]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from IPython.display import Audio
from keras import layers
from keras import models
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
from sklearn.model_selection import KFold
from pydub import AudioSegment
from pydub.utils import make_chunks
from tensorflow.keras.optimizers import Adam
import pickle

In [4]:
Path = "C:/Users/pc/Desktop/Dataset"

In [5]:
data = []
for wav in os.listdir(Path):
  emotion = wav.partition(".wav")[0].split('_')
  if emotion[0] == 'Abuse':
      data.append(('Abusive', Path+'/'+wav))
  if len(emotion) >= 3:
    # if emotion[2] == 'DRU':
    #   data.append(('Drunk', Path+'/'+wav))
    if emotion[2] == 'DIS':
      data.append(('Painful', Path+'/'+wav))
    elif emotion[2] == 'FEA':
      data.append(('Stressful', Path+'/'+wav))
    elif emotion[2] == 'HAP':
      data.append(('Prank', Path+'/'+wav))
    elif emotion[2] == 'ANG':
      data.append(('Angry', Path+'/'+wav))
    elif emotion[2] == 'SAD':
      data.append(('Sad', Path+'/'+wav))
    elif emotion[2] == 'NEU':
      data.append(('Neutral', Path+'/'+wav))
df = pd.DataFrame.from_dict(data)
df.rename(columns={0:'Emotion', 1:'Path'}, inplace=True)
df.head()

Unnamed: 0,Emotion,Path
0,Angry,C:/Users/pc/Desktop/Dataset/1001_DFA_ANG_XX.wav
1,Painful,C:/Users/pc/Desktop/Dataset/1001_DFA_DIS_XX.wav
2,Stressful,C:/Users/pc/Desktop/Dataset/1001_DFA_FEA_XX.wav
3,Prank,C:/Users/pc/Desktop/Dataset/1001_DFA_HAP_XX.wav
4,Neutral,C:/Users/pc/Desktop/Dataset/1001_DFA_NEU_XX.wav


In [6]:
df

Unnamed: 0,Emotion,Path
0,Angry,C:/Users/pc/Desktop/Dataset/1001_DFA_ANG_XX.wav
1,Painful,C:/Users/pc/Desktop/Dataset/1001_DFA_DIS_XX.wav
2,Stressful,C:/Users/pc/Desktop/Dataset/1001_DFA_FEA_XX.wav
3,Prank,C:/Users/pc/Desktop/Dataset/1001_DFA_HAP_XX.wav
4,Neutral,C:/Users/pc/Desktop/Dataset/1001_DFA_NEU_XX.wav
...,...,...
8561,Abusive,C:/Users/pc/Desktop/Dataset/Abuse_995.wav
8562,Abusive,C:/Users/pc/Desktop/Dataset/Abuse_996.wav
8563,Abusive,C:/Users/pc/Desktop/Dataset/Abuse_997.wav
8564,Abusive,C:/Users/pc/Desktop/Dataset/Abuse_998.wav


In [7]:
df.shape

(8566, 2)

In [48]:
%matplotlib inline

plt.style.use("ggplot")

In [106]:
plt.title("Count of emotions:")
sns.countplot(x=df["Emotion"])
sns.despine(top=True, right=True, left=False, bottom=False)

In [10]:
def create_waveplot(data, sr, e):
    plt.figure(figsize=(10, 3))
    plt.title(f'Waveplot for audio with {e} emotion', size=15)
    librosa.display.waveplot(data, sr=sr)
    plt.show()

def create_spectrogram(data, sr, e):
    # stft function converts the data into short term fourier transform
    X = librosa.stft(data)
    Xdb = librosa.amplitude_to_db(abs(X))
    plt.figure(figsize=(12, 3))
    plt.title('Spectrogram for audio with {} emotion'.format(e), size=15)
    librosa.display.specshow(Xdb, sr=44100, x_axis='time', y_axis='hz')
    #librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar()

In [11]:
def noise(data, random=False, rate=0.035, threshold=0.075):
    """Add some noise to sound sample. Use random if you want to add random noise with some threshold.
    Or use rate Random=False and rate for always adding fixed noise."""
    if random:
        rate = np.random.random() * threshold
    noise_amp = rate*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    """Stretching data with some rate."""
    return librosa.effects.time_stretch(data, rate)

def shift(data, rate=1000):
    """Shifting data with some rate"""
    shift_range = int(np.random.uniform(low=-5, high = 5)*rate)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7, random=False):
    """"Add some pitch to sound sample. Use random if you want to add random pitch with some threshold.
    Or use pitch_factor Random=False and rate for always adding fixed pitch."""
    if random:
        pitch_factor=np.random.random() * pitch_factor
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

In [12]:
n_fft = 2048
hop_length = 512

In [13]:
def zcr(data, frame_length=2048, hop_length=512):
    zcr = librosa.feature.zero_crossing_rate(y=data, frame_length=frame_length, hop_length=hop_length)
    return np.squeeze(zcr)

def rmse(data, frame_length=2048, hop_length=512):
    rmse = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)
    return np.squeeze(rmse)

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
    mfcc_feature = librosa.feature.mfcc(y=data, sr=sr)
    return np.squeeze(mfcc_feature.T) if not flatten else np.ravel(mfcc_feature.T)

In [14]:
path = np.array(df["Path"])[10]
data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
len(data)

30915

In [15]:
def extract_features(data, sr, frame_length=2048, hop_length=512):
    result = np.array([])
    result = np.hstack((result,
                        zcr(data, frame_length, hop_length),
                        rmse(data, frame_length, hop_length),
                        mfcc(data, sr, frame_length, hop_length)
                                    ))
    return result

In [16]:
def get_features(path, duration=2.5, offset=0.6):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=duration, offset=offset)

     # without augmentation
    res1 = extract_features(data, sample_rate)
    result = np.array(res1)

    # data with noise
    noise_data = noise(data, random=True)
    res2 = extract_features(noise_data, sample_rate)
    result = np.vstack((result, res2)) # stacking vertically

    # data with pitching
    pitched_data = pitch(data, sample_rate, random=True)
    res3 = extract_features(pitched_data, sample_rate)
    result = np.vstack((result, res3)) # stacking vertically

    # data with pitching and white_noise
    new_data = pitch(data, sample_rate, random=True)
    data_noise_pitch = noise(new_data, random=True)
    res3 = extract_features(data_noise_pitch, sample_rate)
    result = np.vstack((result, res3)) # stacking vertically

    return result

In [17]:
X, Y = [], []
print("Feature processing...")
for path, emotion, ind in zip(df.Path, df.Emotion, range(df.Path.shape[0])):
    features = get_features(path)
    if ind % 100 == 0:
        print(f"{ind} samples has been processed...")
    for ele in features:
        X.append(ele)
        # appending emotion 3 times as we have made 3 augmentation techniques on each audio file.
        Y.append(emotion)
print("Done.")

Feature processing...
0 samples has been processed...
100 samples has been processed...
200 samples has been processed...
300 samples has been processed...
400 samples has been processed...
500 samples has been processed...
600 samples has been processed...
700 samples has been processed...
800 samples has been processed...
900 samples has been processed...
1000 samples has been processed...
1100 samples has been processed...
1200 samples has been processed...
1300 samples has been processed...
1400 samples has been processed...
1500 samples has been processed...
1600 samples has been processed...
1700 samples has been processed...
1800 samples has been processed...
1900 samples has been processed...
2000 samples has been processed...
2100 samples has been processed...
2200 samples has been processed...
2300 samples has been processed...
2400 samples has been processed...
2500 samples has been processed...
2600 samples has been processed...
2700 samples has been processed...
2800 sampl

In [18]:
features_path = "./features.csv"

In [19]:
extracted_df = pd.DataFrame(X)
extracted_df["labels"] = Y
extracted_df.to_csv(features_path, index=False)
extracted_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2367,2368,2369,2370,2371,2372,2373,2374,2375,labels
0,0.03418,0.069336,0.115723,0.148926,0.14209,0.130859,0.101562,0.070801,0.069336,0.074219,...,,,,,,,,,,Angry
1,0.041992,0.083984,0.13623,0.164551,0.154785,0.136719,0.100586,0.071777,0.074219,0.080078,...,,,,,,,,,,Angry
2,0.038086,0.074219,0.121582,0.135254,0.12793,0.114258,0.083496,0.073242,0.066406,0.069824,...,,,,,,,,,,Angry
3,0.060059,0.114746,0.176758,0.19043,0.177734,0.14502,0.101562,0.087891,0.083496,0.090332,...,,,,,,,,,,Angry
4,0.041504,0.059082,0.070312,0.062988,0.080566,0.128906,0.145508,0.146484,0.125977,0.072266,...,,,,,,,,,,Painful


In [20]:
extracted_df = pd.read_csv(features_path)
print(extracted_df.shape)

(34264, 2377)


In [21]:
# Fill NaN with 0
extracted_df = extracted_df.fillna(0)
print(extracted_df.isna().any())
extracted_df.shape

0         False
1         False
2         False
3         False
4         False
          ...  
2372      False
2373      False
2374      False
2375      False
labels    False
Length: 2377, dtype: bool


(34264, 2377)

In [22]:
extracted_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2367,2368,2369,2370,2371,2372,2373,2374,2375,labels
0,0.03418,0.069336,0.115723,0.148926,0.14209,0.130859,0.101562,0.070801,0.069336,0.074219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Angry
1,0.041992,0.083984,0.13623,0.164551,0.154785,0.136719,0.100586,0.071777,0.074219,0.080078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Angry
2,0.038086,0.074219,0.121582,0.135254,0.12793,0.114258,0.083496,0.073242,0.066406,0.069824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Angry
3,0.060059,0.114746,0.176758,0.19043,0.177734,0.14502,0.101562,0.087891,0.083496,0.090332,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Angry
4,0.041504,0.059082,0.070312,0.062988,0.080566,0.128906,0.145508,0.146484,0.125977,0.072266,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Painful


In [23]:
X = extracted_df.drop(labels="labels", axis=1)
Y = extracted_df["labels"]

In [24]:
lb = LabelEncoder()
Y = np_utils.to_categorical(lb.fit_transform(Y))
print(lb.classes_)
Y

['Abusive' 'Angry' 'Neutral' 'Painful' 'Prank' 'Sad' 'Stressful']


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [25]:
scaler = StandardScaler()
newX = scaler.fit_transform(X)
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
for train_index, test_index in kf.split(newX):
    X_train, X_val_test = newX[train_index], newX[test_index]
    y_train, y_val_test = Y[train_index], Y[test_index]

    val_size = len(X_val_test) // 2
    X_val = X_val_test[:val_size]
    y_val = y_val_test[:val_size]
    X_test = X_val_test[val_size:]
    y_test = y_val_test[val_size:]

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(27412, 2376) (27412, 7) (3426, 2376) (3426, 7) (3426, 2376) (3426, 7)


In [73]:
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)
np.save('X_val.npy', X_val)
np.save('y_val.npy', y_val)
np.save('X_test.npy', X_test)
np.save('y_test.npy', y_test)

In [26]:
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                            patience=2,
                                            verbose=1,
                                            factor=0.2)

In [3]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [74]:
model = models.Sequential()
model.add(layers.Conv1D(64, kernel_size=5, strides=1,
                        padding="same", activation="relu",
                        input_shape=(X_train.shape[1], 1)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPool1D(pool_size=5, strides=2, padding="same"))
model.add(layers.Dropout(0.2))

model.add(layers.Conv1D(128, kernel_size=5, strides=1,
                        padding="valid", activation="relu"))
model.add(layers.BatchNormalization())
model.add(layers.MaxPool1D(pool_size=5, strides=2, padding="same"))
model.add(layers.Dropout(0.2))

model.add(layers.Conv1D(256, kernel_size=3, strides=1, padding='valid', activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=3, strides = 2, padding = 'same'))
model.add(layers.Dropout(0.2))

model.add(layers.Conv1D(512, kernel_size=3, strides=1, padding='valid', activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=3, strides = 2, padding = 'same'))
model.add(layers.Dropout(0.2))

model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(7, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc", f1_m])

In [75]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_20 (Conv1D)          (None, 2376, 64)          384       
                                                                 
 batch_normalization_25 (Bat  (None, 2376, 64)         256       
 chNormalization)                                                
                                                                 
 max_pooling1d_20 (MaxPoolin  (None, 1188, 64)         0         
 g1D)                                                            
                                                                 
 dropout_25 (Dropout)        (None, 1188, 64)          0         
                                                                 
 conv1d_21 (Conv1D)          (None, 1184, 128)         41088     
                                                                 
 batch_normalization_26 (Bat  (None, 1184, 128)       

In [30]:
EPOCHS = 20
batch_size = 64

In [76]:
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=batch_size, callbacks=[learning_rate_reduction])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 00017: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [77]:
model.save("my_model.h5")

In [97]:
print("Accuracy of our model on test data : " , model.evaluate(X_test,y_test)[1]*100 , "%")

Accuracy of our model on test data :  95.50496339797974 %
