
# **Xây dựng mô hình CNN**

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

In [None]:
# Global settings for the CNN experiment.
# DEVICE is kept as a plain string here; later cells compare it against "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Root folder of the pre-extracted feature arrays.
# NOTE(review): SpeechEmotionDataset carries its own copy of this path rather
# than reading BASE -- keep the two in sync.
BASE = "/content/drive/MyDrive/Speech_Emotion_Recognition/working/features"
BATCH_SIZE = 32
EPOCHS = 30
LR = 1e-3

In [None]:
class SpeechEmotionDataset(Dataset):
    """Dataset of pre-extracted mel / MFCC / chroma features for one split.

    Each split directory is expected to contain ``mel.npy``, ``mfcc.npy``,
    ``chroma.npy`` and ``labels.npy``. Every sample is padded or trimmed to a
    fixed shape and returned as channel-first tensors.

    Args:
        split: Name of the split sub-directory (e.g. ``"train"``, ``"val"``).
        base_dir: Folder that contains the split sub-directories. Defaults to
            the project's feature folder on Google Drive.

    Raises:
        ValueError: If the number of usable samples differs between the three
            feature files and the label file.
    """

    def __init__(self, split, base_dir="/content/drive/MyDrive/Speech_Emotion_Recognition/working/features"):
        split_dir = os.path.join(base_dir, split)

        def load_feature(path):
            """Load one feature file and return a list of float32 arrays."""
            # NOTE(review): allow_pickle=True executes pickled objects on load;
            # only use it on feature files produced by this project.
            arr = np.load(path, allow_pickle=True)
            fixed = []
            for x in arr:
                if isinstance(x, np.ndarray):
                    fixed.append(x.astype(np.float32))
                elif isinstance(x, list):
                    fixed.append(np.array(x, dtype=np.float32))
                # Any other entry is dropped; the count check below turns such
                # a drop into an explicit error instead of a silent shift.
            return fixed

        self.mel = load_feature(os.path.join(split_dir, "mel.npy"))
        self.mfcc = load_feature(os.path.join(split_dir, "mfcc.npy"))
        self.chroma = load_feature(os.path.join(split_dir, "chroma.npy"))
        self.labels = np.load(os.path.join(split_dir, "labels.npy"), allow_pickle=True)

        # __len__ is driven by the labels, so a dropped or missing feature
        # entry would pair samples with the wrong label (or raise IndexError
        # late, inside a DataLoader worker). Fail early and loudly instead.
        counts = {
            "mel": len(self.mel),
            "mfcc": len(self.mfcc),
            "chroma": len(self.chroma),
            "labels": len(self.labels),
        }
        if len(set(counts.values())) != 1:
            raise ValueError(
                f"Sample counts differ in split '{split}': {counts}. "
                "Features and labels must line up one-to-one."
            )

        # Target (height, width) of each feature after padding/trimming.
        self.mel_shape = (128, 128)
        self.mfcc_shape = (13, 128)
        self.chroma_shape = (12, 128)

    def pad_or_trim(self, x, target_shape):
        """Zero-pad or crop a 2-D array to ``target_shape``; returns float32.

        Padding is appended at the end of each axis; cropping keeps the
        leading rows/columns.
        """
        if not isinstance(x, np.ndarray):
            x = np.array(x, dtype=np.float32)

        h, w = x.shape
        H, W = target_shape

        # Width (second axis) first.
        if w < W:
            pad_w = ((0, 0), (0, W - w))
            x = np.pad(x, pad_w, mode='constant')
        else:
            x = x[:, :W]

        # Then height (first axis).
        if h < H:
            pad_h = ((0, H - h), (0, 0))
            x = np.pad(x, pad_h, mode='constant')
        else:
            x = x[:H, :]

        return x.astype(np.float32)

    def __len__(self):
        """Number of samples, taken from the label array."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(mel, mfcc, chroma, label)`` tensors for sample ``idx``."""
        mel = self.pad_or_trim(self.mel[idx], self.mel_shape)
        mfcc = self.pad_or_trim(self.mfcc[idx], self.mfcc_shape)
        chroma = self.pad_or_trim(self.chroma[idx], self.chroma_shape)

        # Add a leading channel axis so each feature is a 1-channel image.
        mel = torch.tensor(mel).unsqueeze(0)       # (1, 128, 128)
        mfcc = torch.tensor(mfcc).unsqueeze(0)     # (1, 13, 128)
        chroma = torch.tensor(chroma).unsqueeze(0) # (1, 12, 128)
        label = torch.tensor(self.labels[idx]).long()

        return mel, mfcc, chroma, label


In [None]:
# Smoke test: build the training split and look at its first sample.
ds = SpeechEmotionDataset("train")
print(len(ds))
mel, mfcc, chroma, label = ds[0]
# One channel-first tensor per feature plus the label tensor.
print(mel.shape, mfcc.shape, chroma.shape, label)

921
torch.Size([1, 128, 128]) torch.Size([1, 13, 128]) torch.Size([1, 12, 128]) tensor(4)


In [None]:
#model CNN
import torch
import torch.nn as nn

class MultiFeatureCNN(nn.Module):
    """Three-branch CNN that fuses mel, MFCC and chroma features.

    Each feature goes through its own stack of two conv blocks; the flattened
    branch outputs are concatenated and classified by a small MLP.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super(MultiFeatureCNN, self).__init__()

        def conv_block(in_c, out_c):
            """Conv(3x3) -> BatchNorm -> ReLU -> MaxPool(2) -> Dropout(0.3)."""
            return nn.Sequential(
                nn.Conv2d(in_c, out_c, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_c),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout(0.3)
            )

        # One CNN branch per feature type (mel / MFCC / chroma).
        self.branch_mel = nn.Sequential(conv_block(1, 16), conv_block(16, 32))
        self.branch_mfcc = nn.Sequential(conv_block(1, 16), conv_block(16, 32))
        self.branch_chroma = nn.Sequential(conv_block(1, 16), conv_block(16, 32))

        # Probe the branches with dummy inputs to size the first Linear layer.
        # The probe runs in eval mode: in training mode BatchNorm would fold
        # the all-zero dummy batch into its running statistics before any real
        # data has been seen.
        was_training = self.training
        self.eval()
        with torch.no_grad():
            dummy_mel = torch.zeros(1, 1, 128, 128)
            dummy_mfcc = torch.zeros(1, 1, 13, 128)
            dummy_chroma = torch.zeros(1, 1, 12, 128)

            mel_dim = self.branch_mel(dummy_mel).flatten(1).shape[1]
            mfcc_dim = self.branch_mfcc(dummy_mfcc).flatten(1).shape[1]
            chroma_dim = self.branch_chroma(dummy_chroma).flatten(1).shape[1]
            total_dim = mel_dim + mfcc_dim + chroma_dim
        self.train(was_training)

        # Classifier head over the concatenated branch features.
        self.fc = nn.Sequential(
            nn.Linear(total_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, num_classes)
        )

    def forward(self, mel, mfcc, chroma):
        """Return class logits for a batch of (mel, mfcc, chroma) inputs.

        The inputs must have the spatial sizes used by the probe in
        ``__init__`` (128x128, 13x128, 12x128), otherwise the flattened size
        will not match the first Linear layer.
        """
        f1 = self.branch_mel(mel)
        f2 = self.branch_mfcc(mfcc)
        f3 = self.branch_chroma(chroma)

        f1 = f1.flatten(1)
        f2 = f2.flatten(1)
        f3 = f3.flatten(1)

        x = torch.cat((f1, f2, f3), dim=1)
        x = self.fc(x)
        return x


# Build the datasets, loaders and model for the CNN experiment.
BATCH_SIZE = 32

train_dataset = SpeechEmotionDataset("train")
val_dataset = SpeechEmotionDataset("val")

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Number of classes = number of distinct label values in the training split.
num_classes = len(np.unique(train_dataset.labels))
print("Classes:", num_classes)

# Keep DEVICE a plain string, as in the configuration cell. Rebinding it to a
# torch.device made the later `DEVICE == "cuda"` checks evaluate to False,
# which silently turned mixed-precision training off.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = MultiFeatureCNN(num_classes).to(DEVICE)
print(model)


Classes: 8
MultiFeatureCNN(
  (branch_mel): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Dropout(p=0.3, inplace=False)
    )
    (1): Sequential(
      (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Dropout(p=0.3, inplace=False)
    )
  )
  (branch_mfcc): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kerne

In [None]:
from tqdm import tqdm
import torch
import torch.nn.functional as F
import torch.nn as nn

# Loss, optimizer and LR schedule (halve the LR after 3 epochs without a
# lower validation loss).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# DEVICE may be a string ("cuda") or a torch.device("cuda:0") depending on
# which cell set it last. Comparing a torch.device with the string "cuda" is
# False, so detect CUDA through the device type instead.
use_amp = torch.device(DEVICE).type == "cuda"
scaler = torch.cuda.amp.GradScaler() if use_amp else None

best_val_acc = 0.0

# NOTE(review): the run recorded under this cell stalls at a loss of about
# 2.06-2.08 (ln(8) ~= 2.079, i.e. uniform predictions over 8 classes) after the
# first epoch. Input normalisation and the learning rate are worth checking --
# the cause is not established by this notebook.
for epoch in range(EPOCHS):
    # ===== TRAINING =====
    model.train()
    total_loss, correct = 0.0, 0

    for mel, mfcc, chroma, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False):
        mel, mfcc, chroma, labels = mel.to(DEVICE), mfcc.to(DEVICE), chroma.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()

        # Forward pass (mixed precision on CUDA only).
        with torch.amp.autocast("cuda", enabled=use_amp):
            outputs = model(mel, mfcc, chroma)
            loss = criterion(outputs, labels)

        if scaler is not None:
            # Scaled backward/step keeps fp16 gradients from underflowing.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()

    # Loss is averaged over batches, accuracy over samples.
    avg_train_loss = total_loss / len(train_loader)
    train_acc = correct / len(train_dataset)

    # ===== VALIDATION =====
    model.eval()
    val_correct, val_loss = 0, 0.0

    with torch.no_grad():
        for mel, mfcc, chroma, labels in val_loader:
            mel, mfcc, chroma, labels = mel.to(DEVICE), mfcc.to(DEVICE), chroma.to(DEVICE), labels.to(DEVICE)
            outputs = model(mel, mfcc, chroma)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_correct += (outputs.argmax(1) == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_acc = val_correct / len(val_dataset)
    scheduler.step(avg_val_loss)

    print(f"[Epoch {epoch+1:02d}] "
          f"Train loss: {avg_train_loss:.4f} | Train acc: {train_acc:.4f} || "
          f"Val loss: {avg_val_loss:.4f} | Val acc: {val_acc:.4f}")

    # Keep the weights of the epoch with the best validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")
        print(f"New best model saved (val_acc={val_acc:.4f})")

print("Training complete. Best validation accuracy:", best_val_acc)




[Epoch 01] Train loss: 4.3063 | Train acc: 0.1433 || Val loss: 2.0835 | Val acc: 0.1342
New best model saved (val_acc=0.1342)




[Epoch 02] Train loss: 2.0824 | Train acc: 0.1336 || Val loss: 2.0813 | Val acc: 0.1212




[Epoch 03] Train loss: 2.0852 | Train acc: 0.1314 || Val loss: 2.0797 | Val acc: 0.1342




[Epoch 04] Train loss: 2.0795 | Train acc: 0.1336 || Val loss: 2.0781 | Val acc: 0.1342




[Epoch 05] Train loss: 2.0779 | Train acc: 0.1336 || Val loss: 2.0765 | Val acc: 0.1342




[Epoch 06] Train loss: 2.0767 | Train acc: 0.1336 || Val loss: 2.0752 | Val acc: 0.1342




[Epoch 07] Train loss: 2.0757 | Train acc: 0.1336 || Val loss: 2.0737 | Val acc: 0.1342




[Epoch 08] Train loss: 2.0818 | Train acc: 0.1346 || Val loss: 2.0725 | Val acc: 0.1342




[Epoch 09] Train loss: 2.0734 | Train acc: 0.1336 || Val loss: 2.0714 | Val acc: 0.1342




[Epoch 10] Train loss: 2.0724 | Train acc: 0.1336 || Val loss: 2.0703 | Val acc: 0.1342




[Epoch 11] Train loss: 2.0715 | Train acc: 0.1336 || Val loss: 2.0693 | Val acc: 0.1342




[Epoch 12] Train loss: 2.0707 | Train acc: 0.1336 || Val loss: 2.0685 | Val acc: 0.1342




[Epoch 13] Train loss: 2.0702 | Train acc: 0.1336 || Val loss: 2.0675 | Val acc: 0.1342




[Epoch 14] Train loss: 2.0693 | Train acc: 0.1336 || Val loss: 2.0668 | Val acc: 0.1342




[Epoch 15] Train loss: 2.0687 | Train acc: 0.1336 || Val loss: 2.0661 | Val acc: 0.1342




[Epoch 16] Train loss: 2.0681 | Train acc: 0.1336 || Val loss: 2.0653 | Val acc: 0.1342




[Epoch 17] Train loss: 2.0675 | Train acc: 0.1336 || Val loss: 2.0647 | Val acc: 0.1342




[Epoch 18] Train loss: 2.0671 | Train acc: 0.1336 || Val loss: 2.0641 | Val acc: 0.1342




[Epoch 19] Train loss: 2.0666 | Train acc: 0.1336 || Val loss: 2.0634 | Val acc: 0.1342




[Epoch 20] Train loss: 2.0663 | Train acc: 0.1336 || Val loss: 2.0630 | Val acc: 0.1342




[Epoch 21] Train loss: 2.0659 | Train acc: 0.1336 || Val loss: 2.0625 | Val acc: 0.1342




[Epoch 22] Train loss: 2.0656 | Train acc: 0.1336 || Val loss: 2.0620 | Val acc: 0.1342




[Epoch 23] Train loss: 2.0653 | Train acc: 0.1336 || Val loss: 2.0616 | Val acc: 0.1342




[Epoch 24] Train loss: 2.0650 | Train acc: 0.1336 || Val loss: 2.0612 | Val acc: 0.1342




[Epoch 25] Train loss: 2.0645 | Train acc: 0.1336 || Val loss: 2.0609 | Val acc: 0.1342




[Epoch 26] Train loss: 2.0645 | Train acc: 0.1336 || Val loss: 2.0606 | Val acc: 0.1342




[Epoch 27] Train loss: 2.0642 | Train acc: 0.1336 || Val loss: 2.0601 | Val acc: 0.1342




[Epoch 28] Train loss: 2.0642 | Train acc: 0.1336 || Val loss: 2.0598 | Val acc: 0.1342




[Epoch 29] Train loss: 2.0638 | Train acc: 0.1336 || Val loss: 2.0596 | Val acc: 0.1342




[Epoch 30] Train loss: 2.0634 | Train acc: 0.1336 || Val loss: 2.0595 | Val acc: 0.1342
Training complete. Best validation accuracy: 0.1341991341991342


In [None]:
# torch.save(model.state_dict(), "/content/drive/MyDrive/Speech_Emotion_Recognition/model_cnn_multi.pth")
# print("Model saved.")

In [None]:
#test model ---> run this cell
import os
import re
import torch
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn.functional as F

# ===== Configuration =====
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the training loop above writes "best_model.pth" and the cell
# that would write model_cnn_multi.pth is commented out -- confirm this file
# exists before running.
MODEL_PATH = "/content/drive/MyDrive/Speech_Emotion_Recognition/model_cnn_multi.pth" #<---- change to the path of your model .pth file
TEST_DIR = "/content/drive/MyDrive/Speech_Emotion_Recognition/working/processed/test" #<---- change to your test folder
OUTPUT_CSV = "/content/drive/MyDrive/Speech_Emotion_Recognition/predictions.csv" #<---- change to your output path
NUM_CLASSES = 8  # 8 emotions

# Maps the two-digit emotion code found in a file name to the emotion name.
# The insertion order is also used to turn a predicted class index into a name.
EMOTION_MAP = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

def pad_or_trim(feature, target_frames=128):
    """Force the second axis of a 2-D feature to exactly ``target_frames``.

    Shorter inputs are zero-padded at the end; inputs that are already long
    enough are cropped to the leading ``target_frames`` columns.
    """
    missing = target_frames - feature.shape[1]
    if missing <= 0:
        # Long enough already: keep only the leading frames.
        return feature[:, :target_frames]
    return np.pad(feature, ((0, 0), (0, missing)), mode='constant')

def load_model():
    """Build a MultiFeatureCNN, load the weights at MODEL_PATH, return it in eval mode."""
    net = MultiFeatureCNN(NUM_CLASSES)
    net = net.to(DEVICE)
    weights = torch.load(MODEL_PATH, map_location=DEVICE)
    net.load_state_dict(weights)
    # eval() returns the module itself.
    return net.eval()

def extract_emotion_code(filename):
    """Return the emotion name encoded in ``filename``, or ``"unknown"``.

    The name must start with ``NN-NN-NN-``; the third two-digit group is
    looked up in EMOTION_MAP.
    """
    found = re.match(r'^\d{2}-\d{2}-(\d{2})-', filename)
    if found is None:
        return "unknown"
    return EMOTION_MAP.get(found.group(1), "unknown")

def predict_emotion(audio_path, model):
    """Predict the emotion class of one audio file.

    Returns:
        ``(pred, confidence)``: the arg-max class index and its softmax
        probability.
    """
    y, sr = librosa.load(audio_path, sr=16000, mono=True)

    # Same three features the model was built for: mel (in dB), MFCC, chroma.
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128),
        ref=np.max,
    )
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)

    # Fix the frame count to 128, then add batch and channel axes.
    batch = [
        torch.tensor(pad_or_trim(feat, 128)).unsqueeze(0).unsqueeze(0).float().to(DEVICE)
        for feat in (mel_db, mfcc, chroma)
    ]

    with torch.no_grad():
        probs = F.softmax(model(*batch), dim=1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0, pred].item()

    return pred, confidence

def predict_all_test_files():
    """Predict every ``.wav`` file in TEST_DIR and save the results to OUTPUT_CSV.

    Files that fail to load or predict are reported on stdout and left out of
    the table, so one bad file does not abort the whole run.

    Returns:
        pandas.DataFrame with columns ``file_name``, ``file_path``,
        ``true_label``, ``predicted_label`` and ``confidence``.
    """
    model = load_model()
    results = []
    # os.listdir returns entries in arbitrary order; sort so the CSV rows are
    # reproducible from run to run.
    wav_files = sorted(f for f in os.listdir(TEST_DIR) if f.endswith(".wav"))
    # Class index -> emotion name, built once instead of once per file.
    # NOTE(review): this assumes the training label ids follow EMOTION_MAP's
    # order (0 = neutral ... 7 = surprised) -- confirm against labels.npy.
    index_to_label = list(EMOTION_MAP.values())

    for fname in tqdm(wav_files, desc="Predicting"):
        file_path = os.path.join(TEST_DIR, fname)
        try:
            pred_idx, confidence = predict_emotion(file_path, model)
            if 0 <= pred_idx < len(index_to_label):
                predicted_label = index_to_label[pred_idx]
            else:
                predicted_label = f"unknown({pred_idx})"
            true_label = extract_emotion_code(fname)
            results.append({
                "file_name": fname,
                "file_path": file_path,
                "true_label": true_label,
                "predicted_label": predicted_label,
                "confidence": round(confidence, 4)
            })
        except Exception as e:
            # Best effort: report the failing file and carry on.
            print(f"Error {file_path}: {e}")

    df = pd.DataFrame(results)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"\nSaved to {OUTPUT_CSV}")
    return df

# Run inference over the whole test folder and preview the first rows.
pred_df = predict_all_test_files()
pred_df.head()


# Sử dụng model pre-train
acc = 0.4545

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2FeatureExtractor
import pandas as pd
import librosa
from tqdm import tqdm

In [None]:
# Settings for the Wav2Vec2 experiment; these rebind the names used by the
# CNN cells above (DEVICE, BATCH_SIZE, EPOCHS, LR).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DATA_DIR = "/content/drive/MyDrive/Speech_Emotion_Recognition/working/processed" #<---- change this path
MODEL_SAVE_PATH = "/content/drive/MyDrive/Speech_Emotion_Recognition/wav2vec_emotion.pth" #<---- change this path
SAMPLE_RATE = 16000  # sampling rate (Hz) passed to librosa.load and the feature extractor
BATCH_SIZE = 8
EPOCHS = 20
LR = 1e-4

print("Device: ", DEVICE)

Thiết bị: cuda


In [None]:
# Load the feature extractor and the backbone from the same pretrained
# checkpoint. `base_model` is later wrapped by Wav2Vec2EmotionClassifier.
extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
base_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Read the train / validation metadata tables.
train_csv = os.path.join(DATA_DIR, "train_final.csv")
val_csv = os.path.join(DATA_DIR, "val_final.csv")

train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)

# Collect the label names and number them in sorted order.
# NOTE(review): the dataset class below reads the CSV's own `label_id` column,
# not this mapping -- confirm the two encodings agree before using id2label to
# decode predictions.
unique_labels = sorted(train_df["label"].unique())
label2id = {lbl: i for i, lbl in enumerate(unique_labels)}
id2label = {i: lbl for lbl, i in label2id.items()}
NUM_CLASSES = len(label2id)

print("Label mapping:", label2id)

Label mapping: {'angry': 0, 'calm': 1, 'disgust': 2, 'fearful': 3, 'happy': 4, 'neutral': 5, 'sad': 6, 'surprised': 7}


In [None]:
!pip install audiomentations



In [None]:
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch

# Waveform augmentation pipeline; each transform fires independently with its
# own probability `p`.
# NOTE(review): the SpeechEmotionDataset defined in the next cell takes no
# augment argument, so this pipeline appears unused in this section.
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.3),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.4),
    TimeStretch(min_rate=0.8, max_rate=1.2, p=0.3)
])


In [None]:
class SpeechEmotionDataset(Dataset):
    """Raw-audio dataset backed by a metadata table.

    Each row of ``df`` must provide a ``file_path`` and a ``label_id``. Items
    are ``(input_values, label)`` pairs, where ``input_values`` is the 1-D
    tensor produced by the feature extractor for that file.
    """

    def __init__(self, df, extractor):
        self.data = df
        self.extractor = extractor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        audio_path = record["file_path"]
        target = int(record["label_id"])

        # Decode the file as mono audio at the model's sampling rate.
        waveform, _ = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)

        # The extractor returns a batch of one; take its single row.
        encoded = self.extractor(
            waveform,
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )
        return encoded["input_values"][0], torch.tensor(target)


In [None]:
def collate_fn(batch):
    """Collate ``(input_values, label)`` pairs into one padded batch.

    Returns:
        ``(input_values, labels)``: waveforms padded to a common length by the
        module-level feature extractor, and the stacked label tensor.
    """
    waveforms = []
    targets = []
    for sample in batch:
        waveforms.append(sample[0])
        targets.append(sample[1])
    labels = torch.stack(targets)

    # Let the extractor pad every waveform to the longest one in the batch.
    padded = extractor.pad(
        {"input_values": waveforms},
        padding=True,
        return_tensors="pt"
    )

    return padded["input_values"], labels

In [None]:
class Wav2Vec2EmotionClassifier(nn.Module):
    """Frozen Wav2Vec2 backbone followed by a small trainable MLP head.

    The backbone is the module-level ``base_model``. Its parameters are frozen
    and it is kept in eval mode, so only ``self.classifier`` is trained.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.wav2vec = base_model
        for param in self.wav2vec.parameters():
            param.requires_grad = False  # frozen: only the head is trained
        self.wav2vec.eval()

        # Read the feature width from the backbone rather than hard-coding 768.
        hidden_size = self.wav2vec.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def train(self, mode=True):
        """Switch the head's mode but keep the frozen backbone in eval mode.

        ``model.train()`` would otherwise put the backbone in training mode as
        well, re-enabling its stochastic layers (dropout and, per the Hugging
        Face implementation, LayerDrop / time masking when configured) on a
        network that is only used as a fixed feature extractor.
        """
        super().train(mode)
        self.wav2vec.eval()
        return self

    def forward(self, input_values):
        """Return class logits for a batch of padded waveforms."""
        # The backbone is frozen, so no graph is needed for it.
        with torch.no_grad():
            features = self.wav2vec(input_values).last_hidden_state
        # Mean-pool over the time axis.
        # NOTE(review): no attention mask is passed, so frames that come from
        # zero padding are averaged in too -- confirm this is acceptable.
        x = features.mean(dim=1)
        return self.classifier(x)


In [None]:
# Wrap the metadata tables in datasets; both share the same feature extractor.
train_dataset = SpeechEmotionDataset(train_df, extractor)
val_dataset = SpeechEmotionDataset(val_df, extractor)

# Only the training loader shuffles; collate_fn pads each batch to a common length.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)


In [None]:
# Model, loss and optimizer for the Wav2Vec2 experiment.
model = Wav2Vec2EmotionClassifier(NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
# model.parameters() also yields the frozen backbone weights
# (requires_grad=False); only the classifier head receives gradients.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [None]:
# Train the classifier and keep the checkpoint with the best validation accuracy.
best_val_acc = 0

for epoch in range(EPOCHS):
    # ===== Training =====
    model.train()
    train_loss, correct = 0, 0

    for input_values, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        input_values, labels = input_values.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss and the number of correct predictions.
        train_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()

    # Accuracy is per sample; the printed loss is the mean over batches.
    train_acc = correct / len(train_dataset)
    print(f"[Epoch {epoch+1}] Train loss: {train_loss/len(train_loader):.4f} | acc: {train_acc:.4f}")

    # ===== Validation =====
    model.eval()
    val_correct, val_loss = 0, 0
    with torch.no_grad():
        for input_values, labels in val_loader:
            input_values, labels = input_values.to(DEVICE), labels.to(DEVICE)
            outputs = model(input_values)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_correct += (outputs.argmax(1) == labels).sum().item()

    val_acc = val_correct / len(val_dataset)
    print(f"Val loss: {val_loss/len(val_loader):.4f}, acc: {val_acc:.4f}")

    # Save only when validation accuracy strictly improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"Saved new best model ({best_val_acc:.4f})")

print(f"Training complete. Best val acc = {best_val_acc:.4f}")

Epoch 1/20: 100%|██████████| 116/116 [09:45<00:00,  5.05s/it]


[Epoch 1] Train loss: 2.0318 | acc: 0.2269
Val loss: 1.9771, acc: 0.3117
Saved new best model (0.3117)


Epoch 2/20: 100%|██████████| 116/116 [00:21<00:00,  5.38it/s]


[Epoch 2] Train loss: 1.9503 | acc: 0.2986
Val loss: 1.8977, acc: 0.3290
Saved new best model (0.3290)


Epoch 3/20: 100%|██████████| 116/116 [00:25<00:00,  4.58it/s]


[Epoch 3] Train loss: 1.8818 | acc: 0.3116
Val loss: 1.8364, acc: 0.3074


Epoch 4/20: 100%|██████████| 116/116 [00:21<00:00,  5.37it/s]


[Epoch 4] Train loss: 1.8221 | acc: 0.3301
Val loss: 1.7937, acc: 0.3074


Epoch 5/20: 100%|██████████| 116/116 [00:21<00:00,  5.42it/s]


[Epoch 5] Train loss: 1.7826 | acc: 0.3268
Val loss: 1.7647, acc: 0.3160


Epoch 6/20: 100%|██████████| 116/116 [00:21<00:00,  5.29it/s]


[Epoch 6] Train loss: 1.7525 | acc: 0.3290
Val loss: 1.7375, acc: 0.3593
Saved new best model (0.3593)


Epoch 7/20: 100%|██████████| 116/116 [00:25<00:00,  4.57it/s]


[Epoch 7] Train loss: 1.7279 | acc: 0.3344
Val loss: 1.7192, acc: 0.3723
Saved new best model (0.3723)


Epoch 8/20: 100%|██████████| 116/116 [00:24<00:00,  4.71it/s]


[Epoch 8] Train loss: 1.7124 | acc: 0.3594
Val loss: 1.7077, acc: 0.3723


Epoch 9/20: 100%|██████████| 116/116 [00:21<00:00,  5.31it/s]


[Epoch 9] Train loss: 1.6823 | acc: 0.3605
Val loss: 1.6950, acc: 0.3766
Saved new best model (0.3766)


Epoch 10/20: 100%|██████████| 116/116 [00:24<00:00,  4.67it/s]


[Epoch 10] Train loss: 1.6756 | acc: 0.3702
Val loss: 1.6747, acc: 0.3983
Saved new best model (0.3983)


Epoch 11/20: 100%|██████████| 116/116 [00:24<00:00,  4.72it/s]


[Epoch 11] Train loss: 1.6502 | acc: 0.3648
Val loss: 1.6623, acc: 0.3896


Epoch 12/20: 100%|██████████| 116/116 [00:21<00:00,  5.34it/s]


[Epoch 12] Train loss: 1.6316 | acc: 0.3724
Val loss: 1.6449, acc: 0.4286
Saved new best model (0.4286)


Epoch 13/20: 100%|██████████| 116/116 [00:24<00:00,  4.80it/s]


[Epoch 13] Train loss: 1.6237 | acc: 0.4007
Val loss: 1.6360, acc: 0.3939


Epoch 14/20: 100%|██████████| 116/116 [00:21<00:00,  5.35it/s]


[Epoch 14] Train loss: 1.6083 | acc: 0.3898
Val loss: 1.6172, acc: 0.3939


Epoch 15/20: 100%|██████████| 116/116 [00:21<00:00,  5.46it/s]


[Epoch 15] Train loss: 1.6024 | acc: 0.3952
Val loss: 1.6072, acc: 0.4069


Epoch 16/20: 100%|██████████| 116/116 [00:23<00:00,  4.87it/s]


[Epoch 16] Train loss: 1.5726 | acc: 0.4300
Val loss: 1.6018, acc: 0.3896


Epoch 17/20: 100%|██████████| 116/116 [00:21<00:00,  5.35it/s]


[Epoch 17] Train loss: 1.5639 | acc: 0.4017
Val loss: 1.5833, acc: 0.4286


Epoch 18/20: 100%|██████████| 116/116 [00:21<00:00,  5.36it/s]


[Epoch 18] Train loss: 1.5441 | acc: 0.4093
Val loss: 1.5826, acc: 0.4156


Epoch 19/20: 100%|██████████| 116/116 [00:21<00:00,  5.38it/s]


[Epoch 19] Train loss: 1.5465 | acc: 0.4169
Val loss: 1.5774, acc: 0.4199


Epoch 20/20: 100%|██████████| 116/116 [00:21<00:00,  5.34it/s]


[Epoch 20] Train loss: 1.5394 | acc: 0.4267
Val loss: 1.5687, acc: 0.4199
Training complete. Best val acc = 0.4286


In [None]:
# import os
# import torch
# import pandas as pd
# import librosa
# import numpy as np
# from torch import nn
# from torch.utils.data import Dataset, DataLoader
# from torch.nn.utils.rnn import pad_sequence
# from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
# from tqdm import tqdm
# from sklearn.metrics import accuracy_score, classification_report

# # =============================
# # 1Cấu hình
# # =============================
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# SAMPLE_RATE = 16000
# BATCH_SIZE = 8
# NUM_CLASSES = 8
# MODEL_PATH = "/content/drive/MyDrive/Speech_Emotion_Recognition/wav2vec2_aug_best.pth"
# TEST_CSV = "/content/drive/MyDrive/Speech_Emotion_Recognition/working/processed/test_final.csv"
# OUTPUT_CSV = "/content/drive/MyDrive/Speech_Emotion_Recognition/test_predictions.csv"
# BASE_DIR = "/content/drive/MyDrive"  # Đường dẫn gốc để nối vào CSV

# # =============================
# # 2. mapping cảm xúc
# # =============================
# ID2LABEL = {
#     0: "neutral",
#     1: "calm",
#     2: "happy",
#     3: "sad",
#     4: "angry",
#     5: "fearful",
#     6: "disgust",
#     7: "surprised"
# }

# # =============================
# # 3. Feature extractor
# # =============================
# extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# # =============================
# # 4. Dataset test
# # =============================
# class TestDataset(Dataset):
#     def __init__(self, csv_path, extractor, base_dir):
#         self.data = pd.read_csv(csv_path)
#         self.extractor = extractor
#         self.base_dir = base_dir

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         row = self.data.iloc[idx]
#         # Ghép đường dẫn tuyệt đối
#         rel_path = row["file_path"].lstrip("/")
#         full_path = os.path.join(self.base_dir, rel_path)
#         label = int(row["label_id"])

#         # Đọc file audio
#         y, sr = librosa.load(full_path, sr=SAMPLE_RATE, mono=True)
#         inputs = self.extractor(y, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
#         input_values = inputs.input_values.squeeze(0)
#         return {"input_values": input_values, "label": label, "file_path": full_path}

# def collate_fn(batch):
#     input_values = [item["input_values"] for item in batch]
#     labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)
#     file_paths = [item["file_path"] for item in batch]
#     input_values = pad_sequence(input_values, batch_first=True)
#     return {"input_values": input_values, "labels": labels, "file_paths": file_paths}

# test_dataset = TestDataset(TEST_CSV, extractor, BASE_DIR)
# test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
# print(f"Test samples: {len(test_dataset)}")

# # =============================
# # 5️. Mô hình
# # =============================
# class Wav2VecEmotion(nn.Module):
#     def __init__(self, num_classes=8):
#         super().__init__()
#         self.wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
#         self.classifier = nn.Sequential(
#             nn.Linear(self.wav2vec.config.hidden_size, 256),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(256, num_classes)
#         )

#     def forward(self, input_values):
#         outputs = self.wav2vec(input_values)
#         hidden_states = outputs.last_hidden_state
#         x = hidden_states.mean(dim=1)
#         logits = self.classifier(x)
#         return logits

# # =============================
# # 6️. Load model
# # =============================
# model = Wav2VecEmotion(NUM_CLASSES).to(DEVICE)
# model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
# model.eval()
# print("Model loaded successfully.")

# # =============================
# # 7️. Dự đoán
# # =============================
# preds, truths, files = [], [], []

# with torch.no_grad():
#     for batch in tqdm(test_loader, desc="Predicting"):
#         input_values = batch["input_values"].to(DEVICE)
#         labels = batch["labels"].cpu().numpy()

#         outputs = model(input_values)
#         pred_labels = outputs.argmax(dim=1).cpu().numpy()

#         preds.extend(pred_labels)
#         truths.extend(labels)
#         files.extend(batch["file_paths"])

# # =============================
# # 8️. Tính accuracy & xuất kết quả
# # =============================
# acc = accuracy_score(truths, preds)
# print(f"Test Accuracy: {acc:.4f}")
# print("Classification Report:")
# print(classification_report(truths, preds, target_names=list(ID2LABEL.values())))

# results = pd.DataFrame({
#     "file_path": files,
#     "true_label_id": truths,
#     "true_label": [ID2LABEL[i] for i in truths],
#     "pred_label_id": preds,
#     "pred_label": [ID2LABEL[i] for i in preds]
# })

# results.to_csv(OUTPUT_CSV, index=False)
# print(f"Saved predictions to: {OUTPUT_CSV}")
# print(results.head())


Test samples: 288




Model loaded successfully.


Predicting: 100%|██████████| 36/36 [08:23<00:00, 13.98s/it]

Test Accuracy: 0.1319
Classification Report:
              precision    recall  f1-score   support

     neutral       0.00      0.00      0.00        19
        calm       0.00      0.00      0.00        38
       happy       0.13      1.00      0.23        38
         sad       0.00      0.00      0.00        38
       angry       0.00      0.00      0.00        39
     fearful       0.00      0.00      0.00        39
     disgust       0.00      0.00      0.00        38
   surprised       0.00      0.00      0.00        39

    accuracy                           0.13       288
   macro avg       0.02      0.12      0.03       288
weighted avg       0.02      0.13      0.03       288

Saved predictions to: /content/drive/MyDrive/Speech_Emotion_Recognition/test_predictions.csv
                                           file_path  true_label_id  \
0  /content/drive/MyDrive/Speech_Emotion_Recognit...              2   
1  /content/drive/MyDrive/Speech_Emotion_Recognit...              7  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Run the model over the test loader and collect predictions with their file paths.
# NOTE(review): in the visible part of this notebook `test_loader` is only
# created inside the commented-out cell above, and batches are indexed as
# dicts with "input_values" / "file_paths" keys as produced by that cell's
# collate_fn. On a fresh kernel this cell fails unless that cell is restored
# and run first.
preds, files = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_values = batch["input_values"].to(DEVICE)
        outputs = model(input_values)
        # Class index with the highest logit for each sample.
        pred_labels = outputs.argmax(dim=1).cpu().numpy()

        preds.extend(pred_labels)
        files.extend(batch["file_paths"])


In [None]:
# Build the prediction table and write it to OUTPUT_CSV.
# NOTE(review): in the visible part of this notebook `ID2LABEL` is only
# defined inside the commented-out cell above, and the only live assignment of
# `OUTPUT_CSV` is in the CNN test cell -- confirm both hold the intended
# values before running.
results = pd.DataFrame({
    "file_path": files,
    "pred_label_id": preds,
    "pred_label_name": [ID2LABEL[i] for i in preds]
})

results.to_csv(OUTPUT_CSV, index=False)

# Sử dụng model pretrain với data augment

In [None]:
!pip install audiomentations




In [None]:
import os
import torch
import librosa
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Settings for the Wav2Vec2 run with data augmentation.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 16000  # sampling rate (Hz) used when loading audio
BATCH_SIZE = 8
EPOCHS = 20
LR = 1e-4
NUM_CLASSES = 8
# Destination of the best checkpoint for this run.
MODEL_SAVE_PATH = "/content/drive/MyDrive/Speech_Emotion_Recognition/wav2vec2_aug_best.pth"

print("Device:", DEVICE)


Device: cuda


In [None]:
# Emotion name -> integer class id, numbered 0..7 in the order listed.
EMOTION_MAP = {
    name: index
    for index, name in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        )
    )
}

In [None]:
# Waveform augmentation pipeline; each transform fires independently with
# probability 0.3. It is passed to the training dataset only (the validation
# dataset gets augment=None).
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.3),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.3),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3)
])


In [None]:
class SpeechEmotionDataset(Dataset):
    """Raw-audio emotion dataset driven by a CSV manifest.

    Audio is loaded from disk on every access, optionally augmented, and then
    passed through the supplied feature extractor.

    Note: this definition replaces the feature-based ``SpeechEmotionDataset``
    declared earlier in the notebook (same name).

    Args:
        csv_path: CSV read with ``pandas.read_csv``; ``__getitem__`` uses its
            ``file_path`` and ``label_id`` columns.
        extractor: Callable invoked on the waveform; its result must expose
            an ``input_values`` attribute.
        augment: Optional callable invoked as
            ``augment(samples=..., sample_rate=...)``; skipped when falsy.
    """

    def __init__(self, csv_path, extractor, augment=None):
        self.augment = augment
        self.extractor = extractor
        self.data = pd.read_csv(csv_path)

    def __len__(self):
        """Return the number of rows in the manifest."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Load, augment and featurize the sample at position ``idx``.

        Returns:
            dict with ``"input_values"`` (extractor output with its leading
            dimension squeezed away) and ``"labels"`` (scalar ``torch.long``
            tensor holding the row's ``label_id``).
        """
        record = self.data.iloc[idx]

        # Mono waveform resampled to SAMPLE_RATE; the returned rate is not needed.
        waveform, _ = librosa.load(record["file_path"], sr=SAMPLE_RATE, mono=True)

        if self.augment:
            waveform = self.augment(samples=waveform, sample_rate=SAMPLE_RATE)

        features = self.extractor(
            waveform,
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )

        return {
            # Drop the batch dimension added by the extractor -> (seq_len,)
            "input_values": features.input_values.squeeze(0),
            "labels": torch.tensor(int(record["label_id"]), dtype=torch.long),
        }


In [None]:
def collate_fn(batch):
    """Merge a list of dataset items into one padded batch.

    Args:
        batch: list of dicts, each with an ``"input_values"`` tensor and a
            ``"labels"`` tensor.

    Returns:
        dict with ``"input_values"`` (sequences zero-padded on the right to
        the longest one in the batch, batch dimension first) and ``"labels"``
        (the per-item label tensors stacked along a new first dimension).
    """
    targets = torch.stack([sample["labels"] for sample in batch])

    # Pad the audio tensors to a common length within the batch.
    waveforms = pad_sequence(
        [sample["input_values"] for sample in batch],
        batch_first=True,
    )

    return {"input_values": waveforms, "labels": targets}

In [None]:
# CSV manifests for the training and validation splits.
train_csv = "/content/drive/MyDrive/Speech_Emotion_Recognition/working/processed/train_final.csv"
val_csv   = "/content/drive/MyDrive/Speech_Emotion_Recognition/working/processed/val_final.csv"

# Feature extractor from the same checkpoint name the classifier's encoder uses.
extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# Augmentation is applied to the training split only; validation stays clean.
train_dataset = SpeechEmotionDataset(csv_path=train_csv, extractor=extractor, augment=augment)
val_dataset = SpeechEmotionDataset(csv_path=val_csv, extractor=extractor, augment=None)

# Only the training loader shuffles; both pad per batch through collate_fn.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

Train samples: 921, Val samples: 231


In [None]:
class Wav2VecEmotion(nn.Module):
    """Emotion classifier: pretrained wav2vec 2.0 encoder plus a small MLP head.

    The encoder's ``last_hidden_state`` is mean-pooled over dimension 1 and the
    pooled vector is mapped to ``num_classes`` logits.

    Args:
        num_classes: Size of the output logit vector.
    """

    def __init__(self, num_classes=8):
        super().__init__()
        # Pretrained encoder; every parameter stays trainable (nothing is frozen here).
        self.wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
        self.classifier = nn.Sequential(
            nn.Linear(self.wav2vec.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_values):
        """Compute class logits for a batch of waveforms.

        Args:
            input_values: Batch passed straight to the encoder; assumed to be
                (batch, samples) as produced by ``collate_fn`` - TODO confirm.

        Returns:
            Logits tensor with one row per input and ``num_classes`` columns.
        """
        # Gradient mode is deliberately NOT forced on here. The previous
        # `torch.set_grad_enabled(True)` wrapper overrode the caller's
        # `torch.no_grad()`, so validation/inference still built autograd graphs.
        hidden_states = self.wav2vec(input_values).last_hidden_state

        # Mean pooling over dimension 1.
        # NOTE(review): no mask is used, so frames from zero-padded audio are
        # averaged in as well.
        pooled = hidden_states.mean(dim=1)
        return self.classifier(pooled)


In [None]:
# Fine-tune the wav2vec2 classifier; keep the checkpoint with the best validation accuracy.
# NOTE(review): in the run logged under this cell, train/val accuracy stays around
# 0.12-0.14 (chance level for 8 classes) and the loss never leaves ~2.06, so the model is
# not learning with this setup. Candidate causes to check (unconfirmed): learning rate too
# high for fine-tuning the whole pretrained encoder, no warm-up, encoder not frozen.
model = Wav2VecEmotion(NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
best_val_acc = 0

print("-------------------------------------------------")
for epoch in range(EPOCHS):
    # ----- Training pass -----
    model.train()
    train_loss = 0
    correct = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        input_values, labels = batch["input_values"].to(DEVICE), batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_values)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Running sum of batch losses and count of correct predictions.
        train_loss += loss.item()
        hits = outputs.argmax(1) == labels
        correct += hits.sum().item()

    train_acc = correct / len(train_dataset)

    # ----- Validation pass -----
    model.eval()
    val_loss = 0
    val_correct = 0
    with torch.no_grad():
        for batch in val_loader:
            input_values, labels = batch["input_values"].to(DEVICE), batch["labels"].to(DEVICE)
            outputs = model(input_values)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            hits = outputs.argmax(1) == labels
            val_correct += hits.sum().item()

    val_acc = val_correct / len(val_dataset)

    # Losses are averaged over batches, accuracies over samples.
    print(f"[Epoch {epoch+1}] Train loss: {train_loss/len(train_loader):.4f} | acc: {train_acc:.4f} || "
          f"Val loss: {val_loss/len(val_loader):.4f} | acc: {val_acc:.4f}")

    # Checkpoint only on a strict improvement of validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"Saved new best model ({best_val_acc:.4f})")

print(f"Training complete. Best val acc = {best_val_acc:.4f}")



-------------------------------------------------


Epoch 1/20: 100%|██████████| 116/116 [08:56<00:00,  4.63s/it]


[Epoch 1] Train loss: 2.0777 | acc: 0.1336 || Val loss: 2.0606 | acc: 0.1342
Saved new best model (0.1342)


Epoch 2/20: 100%|██████████| 116/116 [01:31<00:00,  1.26it/s]


[Epoch 2] Train loss: 2.0732 | acc: 0.1314 || Val loss: 2.0610 | acc: 0.1299


Epoch 3/20: 100%|██████████| 116/116 [01:31<00:00,  1.26it/s]


[Epoch 3] Train loss: 2.0703 | acc: 0.1194 || Val loss: 2.0610 | acc: 0.1342


Epoch 4/20: 100%|██████████| 116/116 [01:32<00:00,  1.26it/s]


[Epoch 4] Train loss: 2.0696 | acc: 0.1281 || Val loss: 2.0612 | acc: 0.1342


Epoch 5/20: 100%|██████████| 116/116 [01:32<00:00,  1.26it/s]


[Epoch 5] Train loss: 2.0641 | acc: 0.1390 || Val loss: 2.0601 | acc: 0.1342


Epoch 6/20: 100%|██████████| 116/116 [01:32<00:00,  1.26it/s]


[Epoch 6] Train loss: 2.0651 | acc: 0.1281 || Val loss: 2.0604 | acc: 0.1342


Epoch 7/20: 100%|██████████| 116/116 [01:31<00:00,  1.26it/s]


[Epoch 7] Train loss: 2.0624 | acc: 0.1238 || Val loss: 2.0604 | acc: 0.1299


Epoch 8/20: 100%|██████████| 116/116 [01:31<00:00,  1.27it/s]


[Epoch 8] Train loss: 2.0655 | acc: 0.1249 || Val loss: 2.0602 | acc: 0.1342


Epoch 9/20: 100%|██████████| 116/116 [01:31<00:00,  1.27it/s]


[Epoch 9] Train loss: 2.0656 | acc: 0.1238 || Val loss: 2.0603 | acc: 0.1342


Epoch 10/20: 100%|██████████| 116/116 [01:32<00:00,  1.25it/s]


[Epoch 10] Train loss: 2.0652 | acc: 0.1292 || Val loss: 2.0600 | acc: 0.1342


Epoch 11/20: 100%|██████████| 116/116 [01:31<00:00,  1.26it/s]


[Epoch 11] Train loss: 2.0638 | acc: 0.1249 || Val loss: 2.0600 | acc: 0.1342


Epoch 12/20: 100%|██████████| 116/116 [01:32<00:00,  1.25it/s]


[Epoch 12] Train loss: 2.0752 | acc: 0.1238 || Val loss: 2.0606 | acc: 0.1299


Epoch 13/20: 100%|██████████| 116/116 [01:32<00:00,  1.25it/s]


[Epoch 13] Train loss: 2.0667 | acc: 0.1368 || Val loss: 2.0609 | acc: 0.1342


Epoch 14/20: 100%|██████████| 116/116 [01:31<00:00,  1.26it/s]


[Epoch 14] Train loss: 2.0640 | acc: 0.1260 || Val loss: 2.0605 | acc: 0.1342


Epoch 15/20: 100%|██████████| 116/116 [01:32<00:00,  1.25it/s]


[Epoch 15] Train loss: 2.0656 | acc: 0.1194 || Val loss: 2.0604 | acc: 0.1342


Epoch 16/20: 100%|██████████| 116/116 [01:31<00:00,  1.27it/s]


[Epoch 16] Train loss: 2.0664 | acc: 0.1183 || Val loss: 2.0599 | acc: 0.1299


Epoch 17/20: 100%|██████████| 116/116 [01:32<00:00,  1.25it/s]


[Epoch 17] Train loss: 2.0624 | acc: 0.1401 || Val loss: 2.0598 | acc: 0.1342


Epoch 18/20: 100%|██████████| 116/116 [01:31<00:00,  1.27it/s]


[Epoch 18] Train loss: 2.0641 | acc: 0.1292 || Val loss: 2.0604 | acc: 0.1342


Epoch 19/20: 100%|██████████| 116/116 [01:31<00:00,  1.27it/s]


[Epoch 19] Train loss: 2.0680 | acc: 0.1227 || Val loss: 2.0609 | acc: 0.1342


Epoch 20/20: 100%|██████████| 116/116 [01:31<00:00,  1.27it/s]


[Epoch 20] Train loss: 2.0677 | acc: 0.1129 || Val loss: 2.0606 | acc: 0.1342
Training complete. Best val acc = 0.1342
