In [72]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, RNN, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import numpy as np
import librosa
import tensorflow as tf
import tensorflow.keras as keras
import time

# tf.keras.backend.clear_session()

In [74]:
def clipped_relu(x):
    """Apply ReLU to ``x`` with the output capped at 20.0.

    Negative inputs map to 0 and anything above the cap maps to 20.0; the
    actual work is delegated to ``tf.keras.activations.relu``.
    """
    upper_bound = 20.0
    return tf.keras.activations.relu(x, max_value=upper_bound)

def clipped_sigmoid(x):
    """Return the sigmoid of ``x`` multiplied by 10.0.

    Despite the name, nothing is clipped here: the sigmoid output is simply
    rescaled by a factor of ten.
    """
    gate = tf.keras.activations.sigmoid(x)
    return gate * 10.0

In [75]:

@keras.utils.register_keras_serializable(package="Custom", name="LTCCell")
class LTCCell(Layer):
    """Custom recurrent cell that integrates its hidden state with Euler steps.

    Each call concatenates the input with the previous state, computes a
    tanh drive term from it, and nudges the state toward that drive for
    ``ode_unfolds`` explicit Euler steps of size 0.01. The result is passed
    through ``clipped_relu`` and then rescaled by
    ``clipped_sigmoid(C) / clipped_sigmoid(G)``.

    Args:
        units: Size of the hidden state (also the output size).
        ode_unfolds: Number of Euler steps performed per call.
        l2_reg: L2 regularization factor applied to the kernel ``W``.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, units, ode_unfolds=6, l2_reg=0.001, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.ode_unfolds = ode_unfolds
        self.state_size = units
        self.l2_reg = l2_reg  # L2 regularization parameter

    def build(self, input_shape):
        """Create the trainable weights once the input feature size is known."""
        self.input_dim = input_shape[-1]

        # NOTE: weight names and creation order are kept exactly as before so
        # that previously saved checkpoints continue to load.
        # Kernel acting on [inputs, previous state]; the only regularized weight.
        self.W = self.add_weight(
            shape=(self.input_dim + self.units, self.units),
            initializer='glorot_uniform',
            regularizer=l2(self.l2_reg),
            name='W'
        )
        self.bias = self.add_weight(
            shape=(self.units,),
            initializer='zeros',
            name='bias'
        )
        # Per-unit parameter; softplus(tau) divides the state derivative.
        self.tau = self.add_weight(
            shape=(self.units,),
            initializer='ones',
            name='tau'
        )
        # Per-unit output gains: the state is multiplied by clipped_sigmoid(C)
        # and divided by clipped_sigmoid(G) at the end of call().
        self.C = self.add_weight(
            shape=(self.units,),
            initializer='ones',
            name='C'
        )
        self.G = self.add_weight(
            shape=(self.units,),
            initializer='ones',
            name='G'
        )
        super().build(input_shape)

    def call(self, inputs, states):
        """Advance the cell by one time step.

        Args:
            inputs: Input tensor for the current time step.
            states: Sequence whose first element is the previous state.

        Returns:
            A ``(output, [new_state])`` pair; output and new state are the
            same tensor.
        """
        prev_state = states[0]
        concatenated = tf.concat([inputs, prev_state], axis=1)
        dt = 0.01  # Time step

        # The drive term is built from the state at the *start* of the step and
        # is therefore identical for every unfold; the same holds for the
        # softplus of tau. Both are computed once here instead of once per
        # unfold. The numerical result is unchanged, which keeps already
        # trained weights valid.
        drive = tf.nn.tanh(tf.matmul(concatenated, self.W) + self.bias)
        time_constant = tf.nn.softplus(self.tau)

        for _ in range(self.ode_unfolds):
            dh = (-prev_state + drive) / time_constant
            prev_state = prev_state + dt * dh

        # Apply custom activation functions
        prev_state = clipped_relu(prev_state)
        prev_state = prev_state * clipped_sigmoid(self.C)
        prev_state = prev_state / clipped_sigmoid(self.G)

        return prev_state, [prev_state]

    def get_config(self):
        """Return every constructor argument so the cell round-trips on save/load.

        Configs written before ``ode_unfolds`` and ``l2_reg`` were included
        still load: the missing keys fall back to the constructor defaults.
        """
        config = super().get_config()
        config.update({
            "units": self.units,
            "ode_unfolds": self.ode_unfolds,
            "l2_reg": self.l2_reg,
        })
        return config

In [86]:

# Load the saved Keras model from 'emotion_model_63.h5'. LTCCell is supplied
# through custom_objects so the custom cell class can be resolved by name
# while the model is being deserialized. The resulting `model` is a
# module-level object used by process_audio_file() below.
model = keras.models.load_model('emotion_model_63.h5', custom_objects={"LTCCell": LTCCell})

# Preprocessing function
def extract_features_from_audio(audio, sample_rate, expected_time_steps=911, n_mfcc=13):
    """Turn a waveform into a fixed-length MFCC feature matrix.

    MFCCs plus their first- and second-order deltas are stacked along the
    feature axis, transposed to time-major layout, and then truncated or
    zero-padded along the time axis to exactly ``expected_time_steps`` rows.

    Args:
        audio: 1-D waveform samples.
        sample_rate: Sampling rate of ``audio`` in Hz.
        expected_time_steps: Number of frames in the returned matrix
            (defaults to 911, the value previously hard-coded here).
        n_mfcc: Number of MFCC coefficients; the result has ``3 * n_mfcc``
            columns (39 with the default of 13).

    Returns:
        Array of shape ``(expected_time_steps, 3 * n_mfcc)``.
    """
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

    # Stack to (3 * n_mfcc, frames), then flip to (frames, 3 * n_mfcc).
    features = np.concatenate((mfcc, mfcc_delta, mfcc_delta2), axis=0).T

    # Truncate first (a no-op for short inputs), then pad whatever is missing.
    features = features[:expected_time_steps, :]
    missing_frames = expected_time_steps - features.shape[0]
    if missing_frames > 0:
        # Zero-pad only at the end of the time axis.
        features = np.pad(features, ((0, missing_frames), (0, 0)), mode='constant')

    return features

# Function to process an audio file
def process_audio_file(file_path):
    """Classify the emotion of each 2-second segment of an audio file.

    The file is loaded at 16 kHz and cut into consecutive, non-overlapping
    2-second segments; a trailing remainder shorter than 2 seconds is not
    processed. For every segment the detected emotion and the time spent on
    feature extraction plus prediction are printed.

    NOTE(review): with librosa's default hop length a 2-second segment
    presumably yields far fewer frames than the 911 the feature extractor
    pads to, so most of the model input would be zero padding. Confirm this
    matches how the model was trained.

    Args:
        file_path: Path to the audio file to analyse.

    Returns:
        A list with one ``(detected_emotion, latency_seconds)`` tuple per
        processed segment; empty when the audio is shorter than one segment.
    """
    # Load the audio file, resampling to 16 kHz if necessary
    audio, sample_rate = librosa.load(file_path, sr=16000)

    # Split the audio into 2-second segments
    segment_length = 2 * sample_rate
    segments = [
        audio[start:start + segment_length]
        for start in range(0, len(audio) - segment_length + 1, segment_length)
    ]

    if not segments:
        # Make the "nothing happened" case visible instead of returning silently.
        print("Audio is shorter than one 2-second segment; nothing to process.")
        return []

    # Index -> label mapping, built once rather than on every iteration.
    # Assumed to match the class order used at training time; TODO confirm.
    emotion_labels = ["angry", "happy", "sad", "neutral"]

    results = []
    for i, segment in enumerate(segments):
        print(f"Processing segment {i + 1}...")

        # perf_counter is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()

        # Extract features and add the batch dimension the model expects
        features = extract_features_from_audio(segment, sample_rate)
        features = np.expand_dims(features, axis=0)

        # Predict emotion and map the winning index to its label
        prediction = model.predict(features, verbose=0)
        emotion_index = np.argmax(prediction, axis=1)[0]
        detected_emotion = emotion_labels[emotion_index]

        latency = time.perf_counter() - start_time

        # Display result and latency
        print(f"Detected Emotion: {detected_emotion}")
        print(f"Latency: {latency:.4f} seconds")

        results.append((detected_emotion, latency))

    return results

# Entry point: runs only when this code executes as the main module.
if __name__ == "__main__":
    # Audio file to analyse, given relative to the current working directory.
    audio_file_path = "angry-2.mp3"  # Replace with your audio file path

    # Segment the file and print the detected emotion and latency per segment.
    process_audio_file(audio_file_path)



Processing segment 1...
Detected Emotion: neutral
Latency: 0.4017 seconds
Processing segment 2...
Detected Emotion: neutral
Latency: 0.1544 seconds
Processing segment 3...
Detected Emotion: angry
Latency: 0.1471 seconds
Processing segment 4...
Detected Emotion: neutral
Latency: 0.1434 seconds
Processing segment 5...
Detected Emotion: angry
Latency: 0.1526 seconds
Processing segment 6...
Detected Emotion: angry
Latency: 0.1450 seconds
Processing segment 7...
Detected Emotion: angry
Latency: 0.1621 seconds
Processing segment 8...
Detected Emotion: sad
Latency: 0.1679 seconds
Processing segment 9...
Detected Emotion: angry
Latency: 0.1416 seconds
Processing segment 10...
Detected Emotion: angry
Latency: 0.1451 seconds
Processing segment 11...
Detected Emotion: sad
Latency: 0.1469 seconds
Processing segment 12...
Detected Emotion: sad
Latency: 0.1505 seconds
Processing segment 13...
Detected Emotion: angry
Latency: 0.1536 seconds
Processing segment 14...
Detected Emotion: angry
Latency: 0.