# Recognition Of Emotional Well Being Using Speech Analysis -
## SRET Hackathon
### Team Number- HT009
### Our Team members:
Kumaresh N M - E0320004<br>
Adhithyan B - E0320005<br>
Dhrish S Kumar - E0320008<br>
Rohit A Ch - E0320022


***Importing Libraries***

In [1]:
import librosa
import soundfile
import os
import glob
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import pyaudio
import wave
from scipy.io.wavfile import write

***Defining Emotions in the RAVDESS Dataset***

In [2]:
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}
# Emotions to observe
observed_emotions = ['calm', 'happy', 'fearful', 'disgust']


***MLP Classifier***

In [3]:
# Initialize the Multi Layer Perceptron Classifier
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08,
                      hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)



***Function Definition for recording audio***

In [4]:
def recordAudio():
    chunk = 1024  # Record in chunks of 1024 samples
    sample_format = pyaudio.paInt16  # 16 bits per sample
    channels = 1
    fs = 48100  # Record at 44100 samples per second //as per ravdess dataset the frequecy is 48kHz
    seconds = 5
    filename = "C:\\Users\\Adhithyan Balajee\\Downloads\\Real-Time-Speech-Emotion-Recognition-master\\Real-Time-Speech-Emotion-Recognition-master\\Dataset\\speech-emotion-recognition-ravdess-data\\Actor_01\\03-01-08-02-02-02-01.wav"

    p = pyaudio.PyAudio()  # Create an interface to PortAudio

    print('Recording')

    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=fs,
                    frames_per_buffer=chunk,
                    input=True)

    frames = []  # Initialize array to store frames

    # Store data in chunks for 10 seconds
    for i in range(0, int(fs / chunk * seconds)):
        data = stream.read(chunk)
        frames.append(data)

    # Stop and close the stream
    stream.stop_stream()
    stream.close()
    # Terminate the PortAudio interface
    p.terminate()

    print('Finished recording')

    # Save the recorded data as a WAV file
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))
    wf.close()

***Function definition for extract feature***

In [5]:
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(
                y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(
                S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(
                X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
    return result


***Function Definition for loading the dataset***

In [6]:
def load_data(test_size=0.2):
    x, y = [], []
    for file in glob.glob("C:\\Users\\Adhithyan Balajee\\Downloads\\Real-Time-Speech-Emotion-Recognition-master\\Real-Time-Speech-Emotion-Recognition-master\\Dataset\\speech-emotion-recognition-ravdess-data\\Actor_*\\*.wav"):
   
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        
        #print("File name = {} , emotion = {}".format(file_name, emotion))
        
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)


***Function definition for training model***

In [7]:
def trainModel():

    
    x_train, x_test, y_train, y_test = load_data(test_size=0.25)

    # Get the shape of the training and testing datasets
    print((x_train.shape[0], x_test.shape[0]))

    # Get the number of features extracted
    print(f'Features extracted: {x_train.shape[1]}')

    # Train the model
    model.fit(x_train, y_train)

    # Predict for the test set
    y_pred = model.predict(x_test)

    # Calculate the accuracy of our model
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

    # Print the accuracy
    print("Accuracy: {:.2f}%".format(accuracy*100))


***Predicting the recorded audio***

In [8]:
def record_predictAudio():
    x_predictAudio = []
    recordAudio() #Record audio to predict
    file = "C:\\Users\\Adhithyan Balajee\\Downloads\\Real-Time-Speech-Emotion-Recognition-master\\Real-Time-Speech-Emotion-Recognition-master\\Dataset\\speech-emotion-recognition-ravdess-data\\Actor_01\\03-01-08-02-02-02-01.wav"
    
    #Recorded audio filepath
    featurePredictAudio = extract_feature(file, mfcc=True, chroma=True, mel=True) #extract features of recorded audio
    x_predictAudio.append(featurePredictAudio)
    y_predictAudio = model.predict(np.array(x_predictAudio))
    print("Emotion Predicted: {}".format(y_predictAudio))
    os.remove(file)


***Predicting the pre-recorded audio***

In [9]:

def predictAudio():
    file = input("Please enter path to your file.\n")
    x_predictAudio = []
    featurePredictAudio = extract_feature(file, mfcc=True, chroma=True, mel=True) #extract features of recorded audio
    x_predictAudio.append(featurePredictAudio)
    y_predictAudio = model.predict(np.array(x_predictAudio))
    print("Emotion Predicted: {}".format(y_predictAudio))



In [None]:

while True:
    choice = int(input("Enter 1 to create and train model. \nEnter 2 to predict on pre-recorded audio. \nEnter 3 to record and predict audio . \nEnter 4 to quit. \n"))
    if choice == 1:
        trainModel()
    elif choice == 2:
        predictAudio()
    elif choice == 3:
        record_predictAudio()
    else:
        quit()

Enter 1 to create and train model. 
Enter 2 to predict on pre-recorded audio. 
Enter 3 to record and predict audio . 
Enter 4 to quit. 
1


  mel = np.mean(librosa.feature.melspectrogram(
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 -3.0517578e-05 -3.0517578e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(li

 -9.1552734e-05 -6.1035156e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 3.0517578e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
  0.0000000e+00  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 -0.00024414] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
  3.0517578e-04  3.6621094e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(
 -0.00042725] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspe

(96, 33)
Features extracted: 180
Accuracy: 75.76%
Enter 1 to create and train model. 
Enter 2 to predict on pre-recorded audio. 
Enter 3 to record and predict audio . 
Enter 4 to quit. 
2
Please enter path to your file.
C:\Users\Adhithyan Balajee\Downloads\Real-Time-Speech-Emotion-Recognition-master\Real-Time-Speech-Emotion-Recognition-master\Dataset\speech-emotion-recognition-ravdess-data\Actor_01\03-01-02-01-01-01-01.wav


  mel = np.mean(librosa.feature.melspectrogram(


Emotion Predicted: ['calm']
Enter 1 to create and train model. 
Enter 2 to predict on pre-recorded audio. 
Enter 3 to record and predict audio . 
Enter 4 to quit. 
3
Recording


 -6.1035156e-05 -1.5258789e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(


Finished recording
Emotion Predicted: ['fearful']
