In [3]:
# pip install librosa

In [2]:
'''
Please install this specific version of resampy for librosa to work without errors.
'''

'\nPlease install this specific version of resampy for librosa to work without errors.\n'

In [4]:
# pip install resampy==0.3.1

In [2]:
import soundfile
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import librosa
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
import warnings; warnings.filterwarnings('ignore')

In [3]:
emotions ={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

### Data for binary classification


In [4]:
def load_extract_features(data_path):

    '''
    load_extract_features() is a function that is used to load all the audio files one at a time, compute their features and return the features as well as the target values.

    There are around 8-10 audio files which are corrupted. We hardcode zero values for such files in order to maintain consistency.

    ['calm', 'happy'] emotion data is categorized into 'positive' and  ['angry', 'fearful'] into 'negative'

    Returns:
    1. Features
    2. Binary Target Values
    '''
    final_features,target_emotions, binary_label = [],[],[]
    count = 0
    
    for i in glob.glob(data_path + "/Actor_*/*.wav"): #Loop to read every file.
        
        name = os.path.basename(i)
        #We split the name of the file to understand the emotion associated with the file.
        split = name.split("-")
        #We know that the third identifier is associated with the emotion of the audio file. Hence, we use [2] as it represents the third identifier.
        emotion = emotions[split[2]]

        #Below is the code to categorize the emotions into two classes to make this a binary problem.
        if emotion in ['calm', 'happy']:
            binary_label.append(-1)
        elif emotion in ['angry', 'fearful']:
            binary_label.append(1)
        else:
            continue
        
        with soundfile.SoundFile(i) as audio:
            waveform = audio.read(dtype="float32")
            sr = audio.samplerate
            
            #Below is the code to extract the Mel spectrogram features
            #128 is the standard for machine learning applications using Mel spectrograms
            m_feature = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, fmax=sr / 2.0).T
            melspectrogram = np.mean(m_feature,axis=0)
            if melspectrogram.shape != (128,):
                melspectrogram = np.zeros(128)
            
            #Below is the code to extract the chromagram features
            stft_wave = librosa.stft(waveform)
            stft = np.abs(stft_wave)
            c_feature = librosa.feature.chroma_stft(S=stft, sr=sr).T
            chromagram = np.mean(c_feature,axis=0)
            
            #12 is the number of pitch classes
            if chromagram.shape != (12,):
                chromagram = np.zeros(12)
                
            features=np.array([])
            features=np.hstack((chromagram, melspectrogram))
        
            final_features.append(features)
            target_emotions.append(emotion)
            
            count += 1
            if count % 100 == 0:
                print("Processed Audio File Number: ", count)
    
    #We return the features and the binary target values.
    return np.array(final_features), np.array(binary_label)

In [5]:
#Please change the path below to the path of the folder saved on your computer.
data_path = './Audio_Speech_Actors_01-24'
X, binary_label = load_extract_features(data_path)

Processed Audio File Number:  100
Processed Audio File Number:  200
Processed Audio File Number:  300
Processed Audio File Number:  400
Processed Audio File Number:  500
Processed Audio File Number:  600
Processed Audio File Number:  700


In [6]:
# Reshaping binary_label to make it compatible with the train_test_split function.
binary_label = binary_label.reshape(-1,1)

### Splitting the data into training and test sets

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, binary_label, test_size=0.3, random_state=41)
print(X.shape)
print(binary_label.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(768, 140)
(768, 1)
(537, 140)
(231, 140)
(537, 1)
(231, 1)


# Training the SVM

### Define gradient descent function

In [34]:
def gradient_wrt_w(X, y, w, b, lambda_reg):
    '''
    gradient_wrt_w() is a function that is used to compute the gradient of the loss function with respect to the weights.

    Returns:
    1. Gradient of the loss function with respect to the weights.
    '''
    N = X.shape[0]
    grad_w = np.zeros(w.shape)

    for i in range(N):
        if y[i]*(np.dot(w, X[i]) + b) < 1:
            grad_w += lambda_reg*w - y[i]*X[i]
        else:
            grad_w += lambda_reg*w
    return grad_w / N

def gradient_wrt_b(X, y, w, b):
    '''
    gradient_wrt_b() is a function that is used to compute the gradient of the loss function with respect to the bias.

    Returns:
    1. Gradient of the loss function with respect to the bias.
    '''
    N = X.shape[0]
    grad_b = 0

    for i in range(N):
        if y[i]*(np.dot(w, X[i]) + b) < 1:
            grad_b += -y[i]
        else:
            grad_b += 0
    return grad_b / N

def loss(X, y, w, b, lambda_reg):
    '''
    loss() is a function that is used to compute the loss function.

    Returns:
    1. Loss function value.
    '''
    N = X.shape[0]
    loss = 0

    for i in range(N):
        if y[i]*(np.dot(w, X[i]) + b) < 1:
            loss += 1 - y[i]*(np.dot(w, X[i]) + b)
        else:
            loss += 0
    return 0.5*lambda_reg*np.dot(w.T, w) + loss / N

def gradient_descent(X, y, w, b, step_size, lambda_reg, num_iterations):
    '''
    gradient_descent() is a function that is used to perform gradient descent.

    Returns:
    1. Optimal Weights
    2. Optimal Bias
    '''
    for i in range(num_iterations):
        loss_pre = np.mean(loss(X, y, w, b, lambda_reg))
        
        # apply gradient descent step
        grad_w = gradient_wrt_w(X, y, w, b, lambda_reg)
        grad_b = gradient_wrt_b(X, y, w, b)
        w -= step_size * grad_w
        b -= step_size * grad_b

        loss_post = np.mean(loss(X, y, w, b, lambda_reg))

        # print every 100 iterations
        if i % 100 == 0:
            print("Iteration: ", i)
            print("Loss: ", loss_post)
        
        # stop the algorithm if the loss is changing very little
        if np.abs(loss_pre - loss_post) < 0.0000001:
            break
    return w, b

### Apply the gradient descent algorithm

In [35]:
# calculating optimal weights and bias using training set
w_init, b_init = np.zeros(X_train.shape[1]), 0
w_optimal, b_optimal = gradient_descent(X_train, y_train, w_init, b_init, step_size=0.01, lambda_reg=0.1, num_iterations=1000)

Iteration:  0
Loss:  0.9055390643174237
Iteration:  100
Loss:  0.7209351416725835
Iteration:  200
Loss:  0.6880736915001773
Iteration:  300
Loss:  0.6834789525186824
Iteration:  400
Loss:  0.6805489315893753


### Classify the test data and compute the accuracy

In [36]:
# TRAINING SET
train_data_size = X_train.shape[0]
y_pred_train = np.ones(train_data_size) * -1

# computing prediction for training set
for i in range(train_data_size):
    if np.dot(w_optimal, X_train[i]) + b_optimal > 0:
        y_pred_train[i] = 1

# computing accuracy for training set
train_correct = 0
for i in range(train_data_size):
    if y_pred_train[i] == y_train[i]:
        train_correct += 1

# TESTING SET
test_data_size = X_test.shape[0]
y_pred_test = np.ones(train_data_size) * -1

# computing prediction for test set
for i in range(test_data_size):
    if np.dot(w_optimal, X_test[i]) + b_optimal > 0:
        y_pred_test[i] = 1

# computing accuracy for test set
correct = 0
for i in range(test_data_size):
    if y_pred_test[i] == y_test[i]:
        correct += 1

# print accuracy in 2 decimal places
print("Training Accuracy: ", round(train_correct / train_data_size, 4))
print("Test Accuracy: ", round(correct / test_data_size, 4))


Training Accuracy:  0.6611
Test Accuracy:  0.6753


# Reducing Data using PCA

### Apply PCA to the data

In [37]:
# Compute the mean of the dataset
mean = np.mean(X, axis=0)

# Compute the covariance matrix of the dataset
cov_mat = np.cov(X.T)

# Compute the eigenvalues and eigenvectors of the covariance matrix
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# Sort the eigenvalues and eigen vectors in descending order
eig_vals_sorted = np.sort(eig_vals)[::-1]
eig_vecs_sorted = eig_vecs[:, eig_vals.argsort()[::-1]]

# determine the number of principal components
cut_off = 0.95 # 95% of the variance
k = -1

sum_of_eig_vals = sum(eig_vals)
eig_val_percentage_var = []
for i in eig_vals:
    eig_val_percentage_var.append(i/sum_of_eig_vals)

cumulative_sum = 0
for i in range(len(eig_val_percentage_var)):
    cumulative_sum += eig_val_percentage_var[i]
    if cumulative_sum >= cut_off:
        k = i + 1
        break

assert k != -1, "k is not set"
print("Number of principal components: ", k)

# Compute the matrix with k eigenvectors associated with the k largest eigenvalues
W = eig_vecs_sorted[:, :k]

# Compute the new dataset
new_X = np.dot(X - mean, W)

Number of principal components:  13


### Resplit the data into training and test sets

In [38]:
new_X_train, new_X_test, y_train, y_test = train_test_split(new_X, binary_label, test_size=0.3, random_state=41)
print(new_X.shape)
print(binary_label.shape)
print(new_X_train.shape)
print(new_X_test.shape)
print(y_train.shape)
print(y_test.shape)

(768, 13)
(768, 1)
(537, 13)
(231, 13)
(537, 1)
(231, 1)


### Apply the gradient descent algorithm

In [39]:
# calculating optimal weights and bias using reduced training set
w_init, b_init = np.zeros(new_X_train.shape[1]), 0
w_optimal, b_optimal = gradient_descent(new_X_train, y_train, w_init, b_init, step_size=0.01, lambda_reg=0.1, num_iterations=1000)

Iteration:  0
Loss:  0.9052105553409516
Iteration:  100
Loss:  0.6842276064975295
Iteration:  200
Loss:  0.680602334061041
Iteration:  300
Loss:  0.6792663108266637
Iteration:  400
Loss:  0.678128373294901
Iteration:  500
Loss:  0.6774215498611997
Iteration:  600
Loss:  0.6768703250950927
Iteration:  700
Loss:  0.6762519756616299
Iteration:  800
Loss:  0.6760084502791516
Iteration:  900
Loss:  0.6755019919893898


### Applying the SVM to the reduced data

In [40]:
# TRAINING SET
train_data_size = new_X_train.shape[0]
y_pred_train = np.ones(train_data_size) * -1

# computing prediction for training set
for i in range(train_data_size):
    if np.dot(w_optimal, new_X_train[i]) + b_optimal > 0:
        y_pred_train[i] = 1

# computing accuracy for training set
train_correct = 0
for i in range(train_data_size):
    if y_pred_train[i] == y_train[i]:
        train_correct += 1

# TESTING SET
test_data_size = new_X_test.shape[0]
y_pred_test = np.ones(train_data_size) * -1

# computing prediction for test set
for i in range(test_data_size):
    if np.dot(w_optimal, new_X_test[i]) + b_optimal > 0:
        y_pred_test[i] = 1

# computing accuracy for test set
correct = 0
for i in range(test_data_size):
    if y_pred_test[i] == y_test[i]:
        correct += 1

# print accuracy in 2 decimal places
print("Training Accuracy: ", round(train_correct / train_data_size, 4))
print("Test Accuracy: ", round(correct / test_data_size, 4))

Training Accuracy:  0.6574
Test Accuracy:  0.671
