In [21]:
import os
import librosa
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [22]:
# Augmentation function for audio
def augment_audio(audio, sr, pitch_factor=2, noise_factor=0.005):
    # Apply pitch shifting
    audio_shifted = librosa.effects.pitch_shift(audio, sr=sr, n_steps=pitch_factor)
    
    # Add random noise
    noise = np.random.randn(len(audio))
    audio_noisy = audio_shifted + noise_factor * noise
    
    # Clip the audio to ensure it's within a valid range
    audio_noisy = np.clip(audio_noisy, -1.0, 1.0)
    
    return audio_noisy


In [23]:
# Function to get the emotion from the filename
def extract_emotion_from_filename(filename):
    # Emotion mapping based on the filename (third component)
    emotion_mapping = {
        '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad', '05': 'angry', 
        '06': 'fearful', '07': 'disgust', '08': 'surprised'
    }
    
    emotion_code = filename.split('-')[2]  # Extract emotion from the filename
    return emotion_mapping.get(emotion_code, 'unknown')


In [24]:
selected_features = ['Energy', 'MFCC_1', 'MFCC_3', 'MFCC_4', 'MFCC_6', 'MFCC_2', 'MFCC_9', 
                     'MFCC_5', 'MFCC_10', 'MFCC_13', 'ZCR', 'MFCC_7', 'MFCC_8', 'MFCC_12', 
                     'MFCC_11', 'Pitch', 'Spectral_Rolloff']


# Function to extract the 17 selected features from audio
def extract_selected_features(audio, sr):
    # Feature 1: Energy
    energy = np.sum(librosa.feature.rms(y=audio)**2, axis=1)
    
    # Feature 2: MFCCs (13 MFCCs)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    
    # Feature 3: Zero Crossing Rate (ZCR)
    zcr = librosa.feature.zero_crossing_rate(y=audio)
    
    # Feature 4: Spectral Rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)
    
    # Feature 5: Pitch (Mean pitch)
    pitch, _ = librosa.core.piptrack(y=audio, sr=sr)
    pitch = np.mean(pitch[pitch > 0])  # Get mean pitch
    
    # Extract the 17 features based on the selected ones
    features = [
        energy.mean(),  # Energy
        mfccs[0, 0],  # MFCC_1
        mfccs[2, 0],  # MFCC_3
        mfccs[3, 0],  # MFCC_4
        mfccs[5, 0],  # MFCC_6
        mfccs[1, 0],  # MFCC_2
        mfccs[8, 0],  # MFCC_9
        mfccs[4, 0],  # MFCC_5
        mfccs[9, 0],  # MFCC_10
        mfccs[12, 0],  # MFCC_13
        zcr.mean(),  # ZCR
        mfccs[6, 0],  # MFCC_7
        mfccs[7, 0],  # MFCC_8
        mfccs[11, 0],  # MFCC_12
        mfccs[10, 0],  # MFCC_11
        pitch,  # Pitch
        spectral_rolloff.mean()  # Spectral Rolloff
    ]
    
    return features


In [25]:
# Define a function to augment audio files, extract features, and save them to a CSV
def augment_and_save_to_csv(audio_folder, output_csv='augmented_audio_features.csv', augment_count=100):
    data = []  # List to store the data
    count = 0
    
    # Iterate through all actor folders
    for actor_folder in os.listdir(audio_folder):
        actor_path = os.path.join(audio_folder, actor_folder)
        if os.path.isdir(actor_path):
            for file in os.listdir(actor_path):
                if file.endswith(".wav") and count < augment_count:
                    file_path = os.path.join(actor_path, file)
                    audio, sr = librosa.load(file_path, sr=None)
                    
                    # Apply augmentations
                    augmented_audio = augment_audio(audio, sr)
                    
                    # Extract 17 features from the augmented audio
                    features = extract_selected_features(augmented_audio, sr)
                    
                    # Extract emotion from filename
                    emotion = extract_emotion_from_filename(file)
                    
                    # Append the features and emotion label to the data list
                    data.append(features + [emotion])
                    
                    count += 1
                    if count >= augment_count:
                        break
    
    # Convert the data list to a pandas DataFrame
    columns = selected_features + ['Emotion']  # 17 features and the emotion label
    df = pd.DataFrame(data, columns=columns)
    
    # Save the DataFrame to CSV
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")


In [27]:
# Specify the audio folder location
audio_folder = 'Audio-data'  # Update with your folder path

# Run the augmentation and save to CSV
augment_and_save_to_csv(audio_folder, output_csv='augmented_17_audio_features.csv', augment_count=100)


Data saved to augmented_17_audio_features.csv


# Model Design 2

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [3]:
# Load the augmented data CSV
df = pd.read_csv('feature-importance-analysis/17_selected_features.csv')

# Separate features (X) and labels (y)
X = df.drop(columns=['Emotion'])  # Features: All columns except 'Emotion'
y = df['Emotion']  # Labels: Emotion column


In [4]:
# Convert emotion labels into numerical format using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert to torch tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)


In [5]:
class EmotionNN(nn.Module):
    def __init__(self):
        super(EmotionNN, self).__init__()
        self.fc1 = nn.Linear(17, 128)
        self.bn1 = nn.BatchNorm1d(128)  # Batch normalization
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)   # Batch normalization
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)   # Batch normalization
        self.fc4 = nn.Linear(32, 8)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))  # Apply batch norm before activation
        x = self.dropout(x)
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = torch.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        x = self.fc4(x)
        return x


In [9]:
# Initialize the model, loss function, and optimizer
model = EmotionNN()

# Loss function: Cross-Entropy Loss (for classification)
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Learning rate scheduler: StepLR reduces learning rate by a factor of 0.5 every 10 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # Reduce lr by 50% every 10 epochs


In [12]:
# Training loop
num_epochs = 100  # Keep the same number of epochs
batch_size = 32   # You can adjust this

train_data = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in train_loader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Track loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

    # Update learning rate using scheduler
    scheduler.step()  # Adjust learning rate after each epoch

    accuracy = 100 * correct_predictions / total_predictions
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%')


Epoch [1/100], Loss: 2.1089, Accuracy: 14.58%
Epoch [2/100], Loss: 2.1101, Accuracy: 15.54%
Epoch [3/100], Loss: 2.1131, Accuracy: 14.58%
Epoch [4/100], Loss: 2.1093, Accuracy: 17.19%
Epoch [5/100], Loss: 2.1221, Accuracy: 15.36%
Epoch [6/100], Loss: 2.1268, Accuracy: 14.84%
Epoch [7/100], Loss: 2.1320, Accuracy: 15.36%
Epoch [8/100], Loss: 2.1003, Accuracy: 13.98%
Epoch [9/100], Loss: 2.0962, Accuracy: 16.84%
Epoch [10/100], Loss: 2.1254, Accuracy: 15.89%
Epoch [11/100], Loss: 2.1211, Accuracy: 13.98%
Epoch [12/100], Loss: 2.1350, Accuracy: 14.93%
Epoch [13/100], Loss: 2.1219, Accuracy: 14.58%
Epoch [14/100], Loss: 2.1342, Accuracy: 14.15%
Epoch [15/100], Loss: 2.1032, Accuracy: 15.45%
Epoch [16/100], Loss: 2.1230, Accuracy: 13.72%
Epoch [17/100], Loss: 2.1379, Accuracy: 13.28%
Epoch [18/100], Loss: 2.1162, Accuracy: 14.32%
Epoch [19/100], Loss: 2.1119, Accuracy: 15.45%
Epoch [20/100], Loss: 2.1078, Accuracy: 16.32%
Epoch [21/100], Loss: 2.1186, Accuracy: 13.45%
Epoch [22/100], Loss: 

In [13]:
# Evaluate the model on the test set
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)
    
    correct_predictions = (predicted == y_test).sum().item()
    accuracy = 100 * correct_predictions / y_test.size(0)

print(f'Test Accuracy: {accuracy:.2f}%')


Test Accuracy: 17.36%
