In [1]:
# Load the `dotenv` IPython extension and run its magic so that variables from a
# .env file are available to os.getenv() in the STT setup cell
# (presumably provided by the python-dotenv package — confirm in the environment).
%load_ext dotenv
%dotenv

## Imports

In [1]:
import collections
import json
import os
import pickle
import pprint
import random
import re
import subprocess

import keras
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import tensorflow as tf
# import text2emotion as te
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import AudioSource, RecognizeCallback
from pydub import AudioSegment
from sklearn.preprocessing import StandardScaler

In [2]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# The memory-growth setting has to be the same on every visible GPU, so it is
# applied to each device rather than only to the first one.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

## Setup STT Service

In [5]:
# Credentials for the IBM Watson Speech to Text service are read from the
# environment (e.g. from the .env file loaded in the first cell).
api_key = os.getenv("api_key")
url = os.getenv("url")

# Fail fast with an explicit message: a missing value would otherwise be passed
# on as None and only surface later as a less obvious error.
_missing_settings = [name for name, value in (("api_key", api_key), ("url", url)) if not value]
if _missing_settings:
    raise ValueError(
        "Missing environment variable(s) for the Speech to Text service: "
        + ", ".join(_missing_settings)
    )

# Setup Service
authenticator = IAMAuthenticator(api_key)
stt = SpeechToTextV1(authenticator=authenticator)
stt.set_service_url(url)

## Paths

In [3]:
# Directories used throughout the notebook, relative to the working directory.
# NOTE: the "pickels" key is misspelled, but other cells look it up by exactly
# this name, so it must stay as is.
paths = {
    "audiobooks": "../../assets/audiobooks",                      # source .mp3 audiobooks
    "clips": "../../assets/temp",                                 # scratch space for temporary clips
    "pickels": "../../assets/audio_sentiment_data_v2/pickles",    # pickled labels / scaler / validation data
    "models": "../../assets/audio_sentiment_data_v2/models",      # saved audio sentiment model(s)
}

## Helper Functions

In [6]:
def fetch_and_convert_audiobook_to_wav():
    """Convert the audiobook .mp3 found in paths['audiobooks'] to a .wav file.

    If the directory holds several .mp3 files, the last one in directory-listing
    order is used (same selection as before).

    Returns:
        Absolute path of the exported .wav file (written next to the .mp3).

    Raises:
        FileNotFoundError: if the directory contains no file with a '.mp3' extension.
    """
    audiobook_dir = paths['audiobooks']

    # Fetch the .mp3 audiobook file
    mp3_files = [entry for entry in os.listdir(audiobook_dir)
                 if os.path.splitext(entry)[1] == '.mp3']
    if not mp3_files:
        # Previously this case surfaced as an UnboundLocalError on `file_name`.
        raise FileNotFoundError(f"No .mp3 audiobook found in {audiobook_dir!r}")

    file_name = os.path.abspath(os.path.join(audiobook_dir, mp3_files[-1]))
    file_name_wav = os.path.splitext(file_name)[0] + '.wav'

    # Convert .mp3 to .wav
    sound = AudioSegment.from_mp3(file_name)
    sound.export(file_name_wav, format="wav")

    return file_name_wav

In [7]:
def text_sentiment_analysis(file):
    """Transcribe a .wav clip and return its weighted text-emotion vector.

    The clip is sent to the module-level `stt` Speech to Text client, the
    transcript is scored with text2emotion, and the scores are placed into a
    length-5 vector in alphabetical order of the emotion names, then multiplied
    by the text weight (0.65).

    Args:
        file: path of the .wav file to transcribe.

    Returns:
        numpy array of length 5 with the weighted text emotion scores. An
        all-zero vector is returned when the service recognises no speech.
    """
    # Imported lazily: the notebook-level `import text2emotion as te` is commented
    # out, so `te` would otherwise be undefined here on a fresh kernel.
    import text2emotion as te

    # Perform transcription
    with open(file, 'rb') as f:
        res = stt.recognize(audio=f, content_type='audio/wav', model='en-US_NarrowbandModel', continuous=True).get_result()

    results = res['results']
    if not results:
        # Nothing was transcribed (e.g. silence): the text contributes no evidence.
        return np.zeros(5)

    # Drop the last character of each transcript and join the pieces as sentences
    text = "".join(
        result['alternatives'][0]['transcript'][0:-1] + ". "
        for result in results
    )

    # Get emotions from transcript
    text_emotions = te.get_emotion(text)
    print(text_emotions)

    # Convert to weighted text emotions (weight = 65%)
    w_text_emotions = np.zeros(5)
    for index, emotion in enumerate(sorted(text_emotions)):
        w_text_emotions[index] = float(text_emotions[emotion])

    return w_text_emotions * 0.65

In [4]:
def load_audio_senti_model():
    """Load and return the tuned audio sentiment model stored under paths['models']."""
    saved_name = "hyperband_tuned_model_final_[0.260879248380661, 0.9069767594337463]"
    return tf.keras.models.load_model(f"{paths['models']}/{saved_name}")

In [5]:
def load_pickles():
    """Load the pickled labels object and scaler from paths['pickels'].

    Returns:
        dict with keys 'labels' and 'scaler' holding the unpickled objects.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    pickles = {}
    for key in ('labels', 'scaler'):
        # `with` closes the file handle, which was previously left open.
        with open(f"{paths['pickels']}/{key}.pickle", "rb") as pickle_in:
            pickles[key] = pickle.load(pickle_in)

    return pickles

In [9]:
def feature_extraction(y, sr):
    """Build the flat feature list for one audio block.

    Args:
        y: audio samples passed to librosa as `y`.
        sr: sample rate passed to librosa as `sr`.

    Returns:
        list of 37 values in this order: mean RMS, mean spectral centroid,
        mean spectral bandwidth, mean spectral rolloff, mean zero-crossing rate,
        12 chroma values and 20 MFCC values (each averaged along axis 1).
    """
    feat = librosa.feature

    # Five scalar descriptors, each averaged over the whole librosa output
    scalar_features = [
        np.mean(feat.rms(y=y)),
        np.mean(feat.spectral_centroid(y=y, sr=sr)),
        np.mean(feat.spectral_bandwidth(y=y, sr=sr)),
        np.mean(feat.spectral_rolloff(y=y, sr=sr)),
        np.mean(feat.zero_crossing_rate(y)),
    ]

    # Per-row averages of the chroma and MFCC matrices
    chroma_means = np.mean(feat.chroma_stft(y=y, sr=sr), axis=1)
    mfcc_means = np.mean(feat.mfcc(y=y, sr=sr), axis=1)

    # Explicit indexing (not slicing) so a shorter-than-expected array still raises
    chroma_features = [chroma_means[i] for i in range(12)]
    mfcc_features = [mfcc_means[i] for i in range(20)]

    return scalar_features + chroma_features + mfcc_features

In [10]:
def scale_features(X, pickles):
    """Return X transformed by the scaler stored under pickles['scaler']."""
    return pickles['scaler'].transform(X)

In [11]:
def audio_sentiment_analysis(file, model, pickles, weight=0.35):
    """Return the weighted audio-emotion probability vector for a clip.

    The clip is cut into consecutive 3-second windows (the last one may be
    shorter); features are extracted per window, scaled with the pickled
    scaler, and run through the model. The per-window predictions are averaged
    and multiplied by `weight`.

    A numeric vector (not a decoded label) is returned because main() adds the
    result element-wise to the weighted text-emotion vector before decoding.
    The earlier body decoded a label through an undefined global `labels` and
    mis-averaged clips that fit in a single window.

    Args:
        file: path of the audio file to analyse.
        model: object whose predict() maps scaled feature rows to class scores.
        pickles: dict holding the fitted scaler under 'scaler'.
        weight: factor applied to the averaged prediction. The default 0.35 is
            assumed to be the complement of the 0.65 text weight — confirm.

    Returns:
        1-D numpy array with one weighted score per model output class.

    Raises:
        ValueError: if the loaded audio contains no samples.
    """
    audio, sr = librosa.load(file, res_type='kaiser_fast', sr=22050*2)

    window = 3 * sr
    samples_total = len(audio)
    if samples_total == 0:
        raise ValueError(f"No audio samples found in {file!r}")

    # One feature row per 3-second window; slicing clamps the final window
    feature_rows = np.array([
        feature_extraction(audio[start:start + window], sr)
        for start in range(0, samples_total, window)
    ])

    # Scale and predict all windows in one batch
    scaled_features = scale_features(feature_rows, pickles)
    predictions = np.asarray(model.predict(scaled_features))

    # Keep an explicit (windows, classes) shape so a single-window clip is
    # averaged over windows rather than over classes.
    predictions = predictions.reshape(len(feature_rows), -1)
    audio_emotions = predictions.mean(axis=0)

    return audio_emotions * weight

## Main Function

In [13]:
def main():
    """Run text + audio sentiment analysis over the audiobook in 30-second clips.

    The audiobook is converted to .wav, split into consecutive 30-second clips
    (the last one may be shorter), and each clip is written to paths['clips'],
    analysed, and deleted again. The weighted text and audio vectors are added
    and the highest-scoring index is decoded with the pickled labels object.

    The addition requires audio_sentiment_analysis() to return a numeric vector
    aligned with the text-emotion vector.

    Returns:
        list with one decoded emotion per clip, in clip order.
    """
    audio_senti_model = load_audio_senti_model()
    pickles = load_pickles()
    labels = pickles['labels']
    wav_file = fetch_and_convert_audiobook_to_wav()
    audio, sr = librosa.load(wav_file, res_type='kaiser_fast', sr=22050*2)

    # The scratch directory may not exist yet (e.g. on a fresh checkout)
    os.makedirs(paths['clips'], exist_ok=True)

    emotions_list = []

    # Splitting the audiobook into 30 sec clips
    clip_len = 30 * sr
    for counter, start in enumerate(range(0, len(audio), clip_len), start=1):
        # Slicing clamps the final clip to the remaining samples
        block = audio[start:start + clip_len]
        out_file_name = "clip_" + str(counter) + ".wav"
        complete_name = os.path.join(paths['clips'], out_file_name)

        try:
            # Save the 30 sec clip
            sf.write(complete_name, block, sr)

            # Perform text sentiment analysis
            w_text_emotions = text_sentiment_analysis(os.path.abspath(complete_name))
            print("Text Sentiment Analysis Completed.")

            # Perform audio sentiment analysis
            w_audio_emotions = audio_sentiment_analysis(complete_name, audio_senti_model, pickles)
            print("Audio Sentiment Analysis Completed.")
        finally:
            # Delete the clip even when an analysis step fails, so no temp files pile up
            if os.path.exists(complete_name):
                os.remove(complete_name)

        # Add weighted emotions
        weighted_emotions = w_text_emotions + w_audio_emotions

        # Add emotion to emotions list
        best_index = np.array([weighted_emotions.argmax()]).astype(int)
        final_emotion = labels.inverse_transform(best_index)
        emotions_list.append(final_emotion[0])

    return emotions_list

## Run the Main Function

In [14]:
# Run the full pipeline and show the list of emotions returned by main().
if __name__ == "__main__":
    emotions_list = main()
    print(emotions_list)

{'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.4, 'Sad': 0.4, 'Fear': 0.2}
Text Sentiment Analysis Completed.
[0.24394405 0.15688714 0.09505951 0.17667837 0.3274309 ]
Audio Sentiment Analysis Completed.
{'Happy': 0.29, 'Angry': 0.0, 'Surprise': 0.24, 'Sad': 0.29, 'Fear': 0.18}
Text Sentiment Analysis Completed.
[0.10294437 0.18543912 0.09894231 0.47148782 0.14118639]
Audio Sentiment Analysis Completed.
{'Happy': 0.08, 'Angry': 0.0, 'Surprise': 0.5, 'Sad': 0.25, 'Fear': 0.17}
Text Sentiment Analysis Completed.
[0.0746337  0.2524653  0.16625951 0.23488536 0.27175602]
Audio Sentiment Analysis Completed.
{'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.11, 'Sad': 0.33, 'Fear': 0.56}
Text Sentiment Analysis Completed.
[0.27562848 0.36130312 0.069392   0.15604857 0.13762784]
Audio Sentiment Analysis Completed.
['surprise', 'sad', 'surprise', 'fear']


#### Validation of Audio Sentiment Analysis

In [6]:
# Load the saved model and the pickled labels/scaler for the validation cells below.
model = load_audio_senti_model()
pickles = load_pickles()



In [7]:
# Load the validation features (X_val), targets (y_val) and the labels object.
# Each file is opened in a `with` block so its handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f"{paths['pickels']}/X_val.pickle", "rb") as pickle_in:
    X_val = pickle.load(pickle_in)

with open(f"{paths['pickels']}/y_val.pickle", "rb") as pickle_in:
    y_val = pickle.load(pickle_in)
y_val = y_val.astype(int)

with open(f"{paths['pickels']}/labels.pickle", "rb") as pickle_in:
    labels = pickle.load(pickle_in)

In [8]:
def get_emotion(y):
    """Decode the highest-scoring index of y with the module-level `labels` object.

    Returns whatever labels.inverse_transform yields for the one-element index array.
    """
    top_index = y.argmax().astype(int).flatten()
    return labels.inverse_transform(top_index)

In [12]:
# Spot-check the model on a few randomly chosen validation samples.
# The sample index is drawn from the actual size of X_val instead of a
# hard-coded upper bound, so the check stays valid if the validation set changes.
testlen = 5
for _ in range(testlen):
    randsample = random.randrange(len(X_val))
    print(f"Testing validation clip {randsample}")
    print(f'Actual Emotion :{get_emotion(y_val[randsample])[0]}')
    prediction = model.predict(X_val[randsample].reshape(1, -1))
    print(f"Predicted Emotion = {get_emotion(prediction)[0]}")
    print("========================")

Testing validation clip 23
Actual Emotion :happy
Predicted Emotion = happy
Testing validation clip 271
Actual Emotion :happy
Predicted Emotion = happy
Testing validation clip 383
Actual Emotion :neutral
Predicted Emotion = neutral
Testing validation clip 334
Actual Emotion :neutral
Predicted Emotion = neutral
Testing validation clip 104
Actual Emotion :sad
Predicted Emotion = neutral
