In [None]:
# # =========================
# # 3. Duyệt file + lấy nhãn
# # =========================
# all_files = []
# all_labels = []

# # In ra các thư mục gốc mà os.walk sẽ đi qua
# print(f"Bắt đầu duyệt từ: {DATA_DIR}")

# for root, _, files in os.walk(DATA_DIR):
#     print(f"Đang duyệt trong thư mục: {root}") # Thêm dòng này để kiểm tra
#     for f in files:
#         if f.endswith(".wav"):
#             path = os.path.join(root, f)
#             try:
#                 code = f.split("-")[2]
#                 label = EMOTION_MAP[code]
#             except IndexError: # Thay đổi except chung thành IndexError cụ thể hơn
#                 print(f"File không chuẩn hoặc tên không đúng định dạng: {f}")
#                 continue
#             except KeyError: # Nếu code không có trong EMOTION_MAP
#                 print(f"Mã cảm xúc '{code}' không có trong EMOTION_MAP cho file: {f}")
#                 continue

#             # Chỉ để kiểm tra 1 vài file đầu tiên để không quá dài
#             if len(all_files) < 10: # Chỉ in 10 file đầu tiên
#                 print(f"  Thêm file: {path}")
#             all_files.append(path)
#             all_labels.append(label)

# print(f"Tổng số file sau khi duyệt: {len(all_files)}")

In [1]:
# =========================
# 1_preprocess_emotionid.py
# =========================
import os
import librosa
import soundfile as sf
import pandas as pd
from sklearn.model_selection import train_test_split

# =========================
# 1. Cấu hình
# =========================
DATA_DIR = "/kaggle/input/ravdess-emotional-speech-audio"  # dataset gốc
OUTPUT_DIR = "/kaggle/working/processed"  # thư mục lưu WAV + CSV
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mapping code -> nhãn text
EMOTION_MAP = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Mapping EmotionID -> label_id theo gốc (0-index)
EMOTION_ID_TO_NUM = {k: int(k)-1 for k in EMOTION_MAP.keys()}
# '01' -> 0, '02' -> 1, ..., '08' -> 7

# =========================
# 2. Hàm preprocess audio
# =========================
def preprocess_audio(file_path, target_sr=16000):
    try:
        y, sr = librosa.load(file_path, sr=None)
        if y.ndim > 1:
            y = librosa.to_mono(y)
        y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        return y_resampled, target_sr
    except Exception as e:
        print(f"Lỗi khi xử lý {file_path}: {e}")
        return None, None

# =========================
# 3. Duyệt file + lấy nhãn
# =========================
all_files = []
all_labels = []
all_label_ids = []

for root, _, files in os.walk(DATA_DIR):
    for f in files:
        if f.endswith(".wav"):
            path = os.path.join(root, f)
            try:
                code = f.split("-")[2]           # EmotionID từ filename
                label_text = EMOTION_MAP[code]   # nhãn text
                label_id = EMOTION_ID_TO_NUM[code]  # label_id theo gốc
            except:
                print(f"File không chuẩn: {f}")
                continue
            all_files.append(path)
            all_labels.append(label_text)
            all_label_ids.append(label_id)

print(f"Tổng số file: {len(all_files)}")

# =========================
# 4. Chia train/test
# =========================
train_files, test_files, train_labels, test_labels, train_label_ids, test_label_ids = train_test_split(
    all_files, all_labels, all_label_ids,
    test_size=0.2, stratify=all_label_ids, random_state=42
)

# =========================
# 5. Hàm lưu WAV + trả paths
# =========================
def save_wav(files, prefix):
    paths = []
    out_dir = os.path.join(OUTPUT_DIR, prefix)
    os.makedirs(out_dir, exist_ok=True)
    for i, path in enumerate(files):
        y, sr = preprocess_audio(path)
        if y is None:
            continue
        out_name = f"{prefix}_{i}.wav"
        out_path = os.path.join(out_dir, out_name)
        sf.write(out_path, y, sr)
        paths.append(out_path)
    return paths

train_paths = save_wav(train_files, "train")
test_paths = save_wav(test_files, "test")

# =========================
# 6. Lưu CSV final
# =========================
train_df = pd.DataFrame({
    "file_path": train_paths,
    "label": train_labels,
    "label_id": train_label_ids
})

test_df = pd.DataFrame({
    "file_path": test_paths,
    "label": test_labels,
    "label_id": test_label_ids
})

# Chia train -> validation
train_df, val_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df["label_id"], random_state=42
)

# Lưu CSV
train_df.to_csv(os.path.join(OUTPUT_DIR, "train_final.csv"), index=False)
val_df.to_csv(os.path.join(OUTPUT_DIR, "val_final.csv"), index=False)
test_df.to_csv(os.path.join(OUTPUT_DIR, "test_final.csv"), index=False)

print(f"Số file train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}")
print("✅ Hoàn tất preprocess, CSV sẵn sàng train mô hình")
print("Bảng label -> label_id:", dict(zip(EMOTION_MAP.values(), EMOTION_ID_TO_NUM.values())))


Tổng số file: 2880
Số file train: 1843, val: 461, test: 576
✅ Hoàn tất preprocess, CSV sẵn sàng train mô hình
Bảng label -> label_id: {'neutral': 0, 'calm': 1, 'happy': 2, 'sad': 3, 'angry': 4, 'fearful': 5, 'disgust': 6, 'surprised': 7}


# Extracting features

In [2]:
from pathlib import Path
import numpy as np
import json

In [3]:
# ----------- 1. Config -----------
SR = 16000             #16kHz
N_FFT = 1024           # ~64 ms
HOP = 256              # ~16 ms
WIN = 1024
N_MELS = 64
N_MFCC = 13

MAX_FRAMES = 500 

USE_LOGMEL = True
USE_MFCC39 = True
USE_CHROMA = True
DATA_DIR = Path("/kaggle/working/processed")
FEAT_DIR = DATA_DIR / "features"
FEAT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# ----------- 2. Utils -----------
def load_wav(path, sr = SR):
    # đọc wav, resample về SR,chuẩn hóa biên độ [-1, 1]
    y, _ = librosa.load(Path(path), sr=sr, mono=True)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    return y

def extract_logmel(y, sr = SR):
    S = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP, win_length=WIN))**2
    M = librosa.feature.melspectrogram(S=S, sr=sr, n_mels=N_MELS)
    logmel = librosa.power_to_db(M, ref=np.max)
    return logmel

def extract_mfcc_block(y, sr = SR):
    M = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT,
                                       hop_length=HOP, win_length=WIN,
                                       n_mels=N_MELS)
    db = librosa.power_to_db(M, ref=np.max)
    mfcc = librosa.feature.mfcc(S=db, n_mfcc=N_MFCC)
    d1 = librosa.feature.delta(mfcc)
    d2 = librosa.feature.delta(mfcc, order=2)
    return np.concatenate([mfcc, d1, d2], axis=0)

def extract_chroma(y, sr=SR):
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP, win_length=WIN)
    return chroma

def pad_or_trim_feat(X, T_target, pad_value=0.0):
    # X: (C, T) -> (C, T_target) bằng cách cắt hoặc đệm 0 ở cuối
    C, T = X.shape
    if T == T_target:
        return X
    if T > T_target:
        return X[:, :T_target]
    pad = np.full((C, T_target - T), pad_value, dtype=X.dtype)
    return np.concatenate([X, pad], axis=1)

def stack_features(y):
    """
    Ghép các khối đặc trưng theo trục kênh (C):
      - log-Mel: (N_MELS, T)
      - MFCC39: (39, T)
      - Chroma: (12, T)
    Trả về: (C, T)
    """
    feats = []
    if USE_LOGMEL:
        feats.append(extract_logmel(y))
    if USE_MFCC39:
        feats.append(extract_mfcc_block(y))
    if USE_CHROMA:
        feats.append(extract_chroma(y))
    if len(feats) == 0:
        raise ValueError("Bạn phải bật ít nhất một đặc trưng (LOGMEL/MFCC/CHROMA).")
    # Khớp T (trường hợp sai lệch 1-2 frame do làm tròn) bằng cách cắt theo T nhỏ nhất
    T_min = min(f.shape[1] for f in feats)
    feats = [f[:, :T_min] for f in feats]
    return np.concatenate(feats, axis=0)

def compute_norm_stats(X):
    """X: (N, C, T) -> tính mean/std theo từng kênh C trên toàn bộ frames."""
    N, C, T = X.shape
    flat = X.transpose(1, 0, 2).reshape(C, N*T)
    mean = flat.mean(axis=1)
    std = flat.std(axis=1) + 1e-8
    return mean, std

def apply_norm(X, mean, std):
    """Pre-emphasis theo kênh: (N, C, T) -> normalized."""
    return (X - mean[None, :, None]) / std[None, :, None]

def read_split(split_name):
    df = pd.read_csv(DATA_DIR / f"{split_name}_final.csv")
    # Đảm bảo đúng kiểu
    df["file_path"] = df["file_path"].astype(str)
    df["label_id"] = df["label_id"].astype(int)
    return df

In [5]:
# 3) Trích xuất cho 1 split
# =========================
def extract_split(split_name, T_target=None):
    """
    split_name: 'train' | 'val' | 'test'
    T_target: nếu None (thường cho train), sẽ chọn = min(T_max, MAX_FRAMES)
    Trả về: X_pad (N,C,T), y (N,), paths (list), T_target (int)
    """
    df = read_split(split_name)
    X_list, y_list, paths = [], [], []

    for path, label_id in zip(df["file_path"].tolist(), df["label_id"].tolist()):
        y = load_wav(path)
        F = stack_features(y)           # (C, T)
        X_list.append(F)
        y_list.append(int(label_id))
        paths.append(str(path))

    # Chọn số frame mục tiêu
    if T_target is None:
        T_max = max(x.shape[1] for x in X_list)
        T_target = min(T_max, MAX_FRAMES)

    # Pad/trim và xếp stack
    X_pad = np.stack([pad_or_trim_feat(x, T_target) for x in X_list], axis=0)  # (N, C, T_target)
    y = np.array(y_list, dtype=np.int64)

    return X_pad, y, paths, T_target

In [6]:
def main():
    # ---- Train ----
    X_train, y_train, train_paths, T_target = extract_split("train", T_target=None)
    mean, std = compute_norm_stats(X_train)
    X_train = apply_norm(X_train, mean, std)

    # ---- Val/Test: dùng cùng T_target & cùng mean/std của train ----
    X_val, y_val, val_paths, _ = extract_split("val", T_target=T_target)
    X_test, y_test, test_paths, _ = extract_split("test", T_target=T_target)

    X_val  = apply_norm(X_val,  mean, std)
    X_test = apply_norm(X_test, mean, std)

    # ---- Lưu npy ----
    np.save(FEAT_DIR / "X_train.npy", X_train)
    np.save(FEAT_DIR / "y_train.npy", y_train)
    np.save(FEAT_DIR / "X_val.npy",   X_val)
    np.save(FEAT_DIR / "y_val.npy",   y_val)
    np.save(FEAT_DIR / "X_test.npy",  X_test)
    np.save(FEAT_DIR / "y_test.npy",  y_test)

    # ---- Lưu meta ----
    channels = X_train.shape[1]
    meta = {
        "sr": SR,
        "n_fft": N_FFT,
        "hop": HOP,
        "win": WIN,
        "n_mels": N_MELS,
        "n_mfcc": N_MFCC,
        "use_logmel": USE_LOGMEL,
        "use_mfcc39": USE_MFCC39,
        "use_chroma": USE_CHROMA,
        "channels": int(channels),
        "frames_target": int(X_train.shape[2]),
        "max_frames_cap": MAX_FRAMES,
        "mean": [float(m) for m in mean],
        "std":  [float(s) for s in std],
        "train_size": int(X_train.shape[0]),
        "val_size": int(X_val.shape[0]),
        "test_size": int(X_test.shape[0]),
        "train_paths_head": train_paths[:3],  # để debug nhanh
        "val_paths_head":   val_paths[:3],
        "test_paths_head":  test_paths[:3],
    }
    with open(FEAT_DIR / "meta.json", "w") as f:
        json.dump(meta, f, indent=2)

    print("Done feature extraction.")
    print(f"Shapes -> X_train {X_train.shape}, X_val {X_val.shape}, X_test {X_test.shape}")
    print(f"Channels: {channels} (LOGMEL={USE_LOGMEL}, MFCC39={USE_MFCC39}, CHROMA={USE_CHROMA})")
    print(f"Norm stats saved at: {FEAT_DIR/'meta.json'}")

if __name__ == "__main__":
    main()

Done feature extraction.
Shapes -> X_train (1843, 115, 330), X_val (461, 115, 330), X_test (576, 115, 330)
Channels: 115 (LOGMEL=True, MFCC39=True, CHROMA=True)
Norm stats saved at: /kaggle/working/processed/features/meta.json


In [7]:
# === 4. LOAD DATA ===
import numpy as np, os, json, torch
from torch.utils.data import TensorDataset, DataLoader

FEATURES_DIR = "/kaggle/working/processed/features"  # hoặc đường dẫn bạn lưu
OUT_DIR = "/kaggle/working"
os.makedirs(OUT_DIR, exist_ok=True)

# Load đặc trưng và nhãn
X_train = np.load(os.path.join(FEATURES_DIR, "X_train.npy"))
y_train = np.load(os.path.join(FEATURES_DIR, "y_train.npy"))
X_val   = np.load(os.path.join(FEATURES_DIR, "X_val.npy"))
y_val   = np.load(os.path.join(FEATURES_DIR, "y_val.npy"))
X_test  = np.load(os.path.join(FEATURES_DIR, "X_test.npy"))
y_test  = np.load(os.path.join(FEATURES_DIR, "y_test.npy"))

with open(os.path.join(FEATURES_DIR, "meta.json")) as f:
    meta = json.load(f)

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Classes:", len(set(y_train)))

# Reshape cho CNN2D: (N, 1, C, T)
X_train = X_train[:, None, :, :].astype("float32")
X_val   = X_val[:, None, :, :].astype("float32")
X_test  = X_test[:, None, :, :].astype("float32")

# Dataloader
BATCH_SIZE = 32
device = "cuda" if torch.cuda.is_available() else "cpu"

train_dl = DataLoader(TensorDataset(torch.tensor(X_train), torch.tensor(y_train)),
                      batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(TensorDataset(torch.tensor(X_val), torch.tensor(y_val)),
                    batch_size=BATCH_SIZE)
test_dl = DataLoader(TensorDataset(torch.tensor(X_test), torch.tensor(y_test)),
                     batch_size=BATCH_SIZE)


Shapes: (1843, 115, 330) (461, 115, 330) (576, 115, 330)
Classes: 8


In [8]:
# === 5. MODEL DEFINITION ===
import torch.nn as nn
import torch.nn.functional as F

class SER_CNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, (5,7), padding=(2,3)), nn.BatchNorm2d(32), nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(32, 64, (3,5), padding=(1,2)), nn.BatchNorm2d(64), nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(64, 128, (3,5), padding=(1,2)), nn.BatchNorm2d(128), nn.ReLU(),
            nn.Dropout(0.25),
        )
        self.pool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.pool(x)
        return self.fc(x)

# Khởi tạo
num_classes = len(set(y_train))
model = SER_CNN(num_classes).to(device)
print(model)


SER_CNN(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(5, 7), stride=(1, 1), padding=(2, 3))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 5), stride=(1, 1), padding=(1, 2))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 5), stride=(1, 1), padding=(1, 2))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.25, inplace=False)
  )
  (pool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=128, out_features=128, bias=True)
    (2): ReLU()
    (3

In [9]:
# === 6. TRAINING SETUP ===
from sklearn.metrics import f1_score

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3)

def evaluate(model, dataloader):
    model.eval()
    all_true, all_pred = [], []
    with torch.no_grad():
        for xb, yb in dataloader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb).argmax(1)
            all_true.append(yb.cpu().numpy())
            all_pred.append(pred.cpu().numpy())
    y_true = np.concatenate(all_true); y_pred = np.concatenate(all_pred)
    f1 = f1_score(y_true, y_pred, average="macro")
    acc = (y_true == y_pred).mean()
    return acc, f1


In [10]:
# === 7. TRAIN LOOP ===
EPOCHS = 30
best_f1, patience, left = -1, 5, 5

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    train_loss = total_loss / len(train_dl.dataset)
    
    val_acc, val_f1 = evaluate(model, val_dl)
    scheduler.step(val_f1)

    print(f"Epoch {epoch:02d}: loss={train_loss:.4f}  val_acc={val_acc:.3f}  val_f1={val_f1:.3f}")

    if val_f1 > best_f1:
        best_f1, left = val_f1, patience
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "ser_best.pt"))
    else:
        left -= 1
        if left == 0:
            print("⏹ Early stopping.")
            break

print("Best val F1:", best_f1)


Epoch 01: loss=2.0270  val_acc=0.208  val_f1=0.106
Epoch 02: loss=1.9353  val_acc=0.269  val_f1=0.185
Epoch 03: loss=1.8284  val_acc=0.345  val_f1=0.244
Epoch 04: loss=1.7345  val_acc=0.371  val_f1=0.281
Epoch 05: loss=1.6596  val_acc=0.397  val_f1=0.310
Epoch 06: loss=1.6005  val_acc=0.230  val_f1=0.142
Epoch 07: loss=1.5712  val_acc=0.432  val_f1=0.342
Epoch 08: loss=1.5135  val_acc=0.299  val_f1=0.237
Epoch 09: loss=1.4921  val_acc=0.141  val_f1=0.043
Epoch 10: loss=1.4379  val_acc=0.330  val_f1=0.268
Epoch 11: loss=1.3965  val_acc=0.408  val_f1=0.347
Epoch 12: loss=1.3670  val_acc=0.221  val_f1=0.156
Epoch 13: loss=1.3204  val_acc=0.213  val_f1=0.128
Epoch 14: loss=1.3174  val_acc=0.219  val_f1=0.120
Epoch 15: loss=1.2788  val_acc=0.425  val_f1=0.351
Epoch 16: loss=1.2275  val_acc=0.210  val_f1=0.133
Epoch 17: loss=1.1801  val_acc=0.176  val_f1=0.094
Epoch 18: loss=1.1399  val_acc=0.443  val_f1=0.403
Epoch 19: loss=1.1282  val_acc=0.267  val_f1=0.129
Epoch 20: loss=1.1045  val_acc=

In [11]:
# === 8. TEST EVALUATION ===
from sklearn.metrics import classification_report, confusion_matrix

model.load_state_dict(torch.load(os.path.join(OUT_DIR, "ser_best.pt"), map_location=device))
acc, f1 = evaluate(model, test_dl)
print(f"\n Test Accuracy: {acc:.4f} | Macro-F1: {f1:.4f}")

# In báo cáo chi tiết
y_true, y_pred = [], []
model.eval()
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(1).cpu().numpy()
        y_true.append(yb.numpy()); y_pred.append(preds)
y_true = np.concatenate(y_true); y_pred = np.concatenate(y_pred)

print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))



 Test Accuracy: 0.5729 | Macro-F1: 0.5341

Classification Report:
               precision    recall  f1-score   support

           0     0.4000    0.1053    0.1667        38
           1     0.5000    1.0000    0.6667        76
           2     0.7667    0.2987    0.4299        77
           3     0.2895    0.4286    0.3455        77
           4     0.8824    0.7792    0.8276        77
           5     0.4839    0.3896    0.4317        77
           6     0.7531    0.7922    0.7722        77
           7     0.7288    0.5584    0.6324        77

    accuracy                         0.5729       576
   macro avg     0.6005    0.5440    0.5341       576
weighted avg     0.6143    0.5729    0.5587       576


Confusion Matrix:
 [[ 4 26  0  8  0  0  0  0]
 [ 0 76  0  0  0  0  0  0]
 [ 3  5 23 22  1 12  5  6]
 [ 2 32  0 33  0  4  6  0]
 [ 0  0  0  1 60  3  7  6]
 [ 0  5  1 37  0 30  0  4]
 [ 0  6  0  7  3  0 61  0]
 [ 1  2  6  6  4 13  2 43]]
