# 下载数据

## https://drive.google.com/file/d/1zHjG3F8msz9LBPhp_N5kp_O6G9F2Y5w9/view?usp=drive_link

In [None]:
# !gdown --id '1zHjG3F8msz9LBPhp_N5kp_O6G9F2Y5w9' --output Dataset.zip
# !unzip Dataset.zip

# 导入包

In [1]:
import os
import json
import torch
import random
from pathlib import Path
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split
from datetime import datetime

# dataset

In [2]:
class myDataset(Dataset):
  """Training dataset of pre-computed mel features labelled with speaker ids.

  The directory ``data_dir`` must contain:
    * ``mapping.json``  - has a ``"speaker2id"`` dict (speaker name -> id).
    * ``metadata.json`` - has a ``"speakers"`` dict mapping each speaker name
      to a list of utterance dicts, each carrying a ``"feature_path"`` that is
      relative to ``data_dir`` and loadable with ``torch.load``.
  """

  def __init__(self, data_dir, segment_len=128):
    """Index every utterance listed in ``metadata.json``.

    Args:
        data_dir (str): Directory holding the JSON files and the features.
        segment_len (int, optional): Maximum number of frames returned per
            sample. Longer utterances are randomly cropped to this length.
    """
    self.data_dir = data_dir
    self.segment_len = segment_len

    root = Path(data_dir)

    # Context managers make sure both file handles are closed right away.
    with (root / "mapping.json").open() as mapping_file:
      mapping = json.load(mapping_file)
    self.speaker2id = mapping["speaker2id"]

    with (root / "metadata.json").open() as metadata_file:
      metadata = json.load(metadata_file)["speakers"]

    # Number of distinct speakers listed in the metadata.
    self.speaker_num = len(metadata)

    # One [feature_path, speaker_id] entry per utterance.
    self.data = [
      [utterance["feature_path"], self.speaker2id[speaker]]
      for speaker, utterances in metadata.items()
      for utterance in utterances
    ]

  def __len__(self):
    """Return the number of indexed utterances."""
    return len(self.data)

  def __getitem__(self, index):
    """Load one utterance and return a (possibly cropped) segment of it.

    Args:
        index (int): Position in ``self.data``.

    Returns:
        tuple: ``(mel, speaker)`` where ``mel`` is a float tensor with at most
        ``segment_len`` rows and ``speaker`` is a long tensor of shape ``(1,)``.
    """
    feat_path, speaker = self.data[index]

    # Features were saved ahead of time with torch.save.
    mel = torch.load(os.path.join(self.data_dir, feat_path))

    if len(mel) > self.segment_len:
      # Random crop; randint is inclusive on both ends, so the last valid
      # start position (len - segment_len) can be drawn as well.
      start = random.randint(0, len(mel) - self.segment_len)
      mel = torch.FloatTensor(mel[start:start + self.segment_len])
    else:
      # Short utterances are returned whole.
      mel = torch.FloatTensor(mel)

    # Build the label directly as an integer tensor instead of going through
    # float32 first (which cannot represent every large integer exactly).
    speaker = torch.tensor([speaker], dtype=torch.long)

    return mel, speaker

  def get_speaker_number(self):
    """Return the number of speakers found in the metadata."""
    return self.speaker_num

# dataloader

In [3]:
import torch
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence


def collate_batch(batch):
  """Merge a list of ``(mel, speaker)`` samples into one padded batch.

  Args:
      batch: Iterable of ``(mel, speaker)`` pairs as produced by the dataset.

  Returns:
      tuple: ``(padded_mels, speaker_ids)``. ``padded_mels`` is batch-first,
      i.e. ``(batch, longest_length, feature_dim)``; ``speaker_ids`` is a long
      tensor built from all speaker entries.
  """
  features, speaker_ids = zip(*batch)

  # Shorter utterances are right-padded with -20 so every item in the batch
  # has the same length. NOTE(review): -20 presumably stands for a log-mel
  # value of about 1e-20, i.e. near silence - confirm against the feature
  # extraction code.
  padded = pad_sequence(features, batch_first=True, padding_value=-20)

  labels = torch.FloatTensor(speaker_ids).long()
  return padded, labels


def get_dataloader(data_dir, batch_size, n_workers):
  """Build the training and validation data loaders.

  The dataset is split randomly, 90% for training and the remainder for
  validation. Both loaders use ``collate_batch``, pinned memory and
  ``drop_last=True``; only the training loader shuffles.

  Args:
      data_dir (str): Directory passed on to ``myDataset``.
      batch_size (int): Number of samples per batch.
      n_workers (int): Number of DataLoader worker processes.

  Returns:
      tuple: ``(train_loader, valid_loader, speaker_num)``.
  """
  full_set = myDataset(data_dir)
  n_speakers = full_set.get_speaker_number()

  # 9:1 random split; the validation part takes whatever is left over.
  n_total = len(full_set)
  n_train = int(0.9 * n_total)
  train_part, valid_part = random_split(full_set, [n_train, n_total - n_train])

  # Options shared by both loaders.
  shared_kwargs = dict(
    batch_size=batch_size,
    drop_last=True,
    num_workers=n_workers,
    pin_memory=True,
    collate_fn=collate_batch,
  )

  train_loader = DataLoader(train_part, shuffle=True, **shared_kwargs)
  valid_loader = DataLoader(valid_part, **shared_kwargs)

  return train_loader, valid_loader, n_speakers

# 模型

In [4]:
class Classifier(nn.Module):
  """Speaker classifier: linear prenet -> one Transformer encoder layer ->
  mean pooling over time -> two-layer prediction head."""

  def __init__(self, d_model=80, n_spks=600, dropout=0.1):
    """Build the network.

    Args:
        d_model (int): Width of the encoder features.
        n_spks (int): Number of output classes (speakers).
        dropout (float): Dropout probability used inside the encoder layer.
    """
    super().__init__()

    # Project the 40-dimensional input features to d_model.
    self.prenet = nn.Linear(40, d_model)

    # TODO: replace the Transformer layer with a Conformer block
    # (https://arxiv.org/abs/2005.08100).
    self.encoder_layer = nn.TransformerEncoderLayer(
      d_model=d_model,
      dim_feedforward=256,
      nhead=2,
      # Forward the constructor argument; previously it was accepted but
      # never used, so the layer always ran with its built-in default.
      dropout=dropout,
    )
    # For a deeper encoder, wrap the layer:
    # self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

    # Prediction head: d_model -> d_model -> n_spks scores.
    self.pred_layer = nn.Sequential(
      nn.Linear(d_model, d_model),
      nn.ReLU(),
      nn.Linear(d_model, n_spks),
    )

  def forward(self, mels):
    """Classify a batch of mel segments.

    Args:
      mels: Tensor of shape (batch_size, length, 40).

    Return:
      out: Tensor of shape (batch_size, n_spks) with one score per speaker.
    """
    # (batch_size, length, 40) -> (batch_size, length, d_model)
    out = self.prenet(mels)

    # The encoder layer is created with the default batch_first=False, so it
    # expects (length, batch_size, d_model).
    out = out.permute(1, 0, 2)
    out = self.encoder_layer(out)

    # Back to (batch_size, length, d_model).
    out = out.transpose(0, 1)

    # Mean pooling over the time axis. No padding mask is applied, so every
    # time step (including padded ones) contributes to the average.
    stats = out.mean(dim=1)

    # (batch_size, d_model) -> (batch_size, n_spks)
    out = self.pred_layer(stats)

    return out

# 学习率

In [5]:
import math
import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR


def get_cosine_schedule_with_warmup(
  optimizer: Optimizer,
  num_warmup_steps: int,
  num_training_steps: int,
  num_cycles: float = 0.5,
  last_epoch: int = -1,
):
  """
  Build a LambdaLR schedule with linear warm-up followed by cosine decay.

  The learning-rate multiplier grows linearly from 0 to 1 over the first
  ``num_warmup_steps`` steps and then follows a cosine curve, clamped so it
  never drops below 0.

  Args:
    optimizer (:class:`~torch.optim.Optimizer`):
      Optimizer whose learning rate is scheduled.
    num_warmup_steps (:obj:`int`):
      Number of warm-up steps.
    num_training_steps (:obj:`int`):
      Total number of training steps.
    num_cycles (:obj:`float`, `optional`, defaults to 0.5):
      Number of cosine waves; 0.5 is a single half-wave from 1 down to 0.
    last_epoch (:obj:`int`, `optional`, defaults to -1):
      Index of the last epoch when resuming training.

  Return:
    :obj:`torch.optim.lr_scheduler.LambdaLR` using the schedule above.
  """
  # Both denominators are floored at 1 to avoid division by zero.
  warmup_span = float(max(1, num_warmup_steps))
  decay_span = float(max(1, num_training_steps - num_warmup_steps))

  def _multiplier(step):
    """Return the learning-rate multiplier for the given step."""
    if step >= num_warmup_steps:
      # Cosine decay phase.
      progress = float(step - num_warmup_steps) / decay_span
      cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
      return max(0.0, 0.5 * (1.0 + cosine))

    # Linear warm-up phase.
    return float(step) / warmup_span

  return LambdaLR(optimizer, _multiplier, last_epoch)

In [6]:
def model_fn(batch, model, criterion, device):
  """Run one batch through the model and score it.

  Args:
      batch: Pair ``(mels, labels)``.
      model: Callable mapping the mel batch to per-class scores.
      criterion: Loss function called as ``criterion(outputs, labels)``.
      device: Device both tensors are moved to before the forward pass.

  Returns:
      tuple: ``(loss, accuracy)``; accuracy is the fraction (0-1) of samples
      whose highest-scoring class equals the label.
  """
  inputs, targets = batch
  inputs, targets = inputs.to(device), targets.to(device)

  # Forward pass and loss.
  logits = model(inputs)
  loss = criterion(logits, targets)

  # A sample counts as a hit when its arg-max over dim 1 matches the label.
  hits = logits.argmax(1) == targets
  accuracy = torch.mean(hits.float())

  return loss, accuracy

In [7]:
def valid(dataloader, model, criterion, device):
  """Evaluate the model on a validation loader.

  Args:
      dataloader: Iterable of batches accepted by ``model_fn``.
      model: Network to evaluate.
      criterion: Loss function.
      device: Device the batches are moved to.

  Returns:
      tuple: ``(avg_loss, avg_accuracy)``, each the mean of the per-batch
      values over all batches.

  Raises:
      ValueError: If the loader yields no batches (for example because the
          validation split is smaller than one batch and ``drop_last`` is
          enabled), since no average can be computed.
  """
  # Remember the current mode so it can be restored afterwards instead of
  # unconditionally switching to training mode.
  was_training = model.training
  model.eval()

  running_loss = 0.0
  running_accuracy = 0.0
  n_batches = 0

  try:
    # No gradients are needed for evaluation.
    with torch.no_grad():
      for batch in dataloader:
        loss, accuracy = model_fn(batch, model, criterion, device)
        running_loss += loss.item()
        running_accuracy += accuracy.item()
        n_batches += 1
  finally:
    # Restore the mode even when a batch raises.
    model.train(was_training)

  if n_batches == 0:
    raise ValueError("validation dataloader produced no batches")

  avg_loss = running_loss / n_batches
  avg_accuracy = running_accuracy / n_batches
  return avg_loss, avg_accuracy

In [8]:
# Paths: the dataset is expected in ./Dataset under the working directory.
BASE_DIR = os.getcwd()
data_dir = os.path.join(BASE_DIR, 'Dataset')

# Data loading.
batch_size = 32
n_workers = 8

# Schedule, all counted in optimizer steps.
total_steps = 70000
warmup_steps = 1000
valid_steps = 2000
save_steps = 10000

# Histories collected during training.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
learning_rates = []

# Best-model bookkeeping.
best_val_accuracy = 0.0
best_epoch = 0
early_stop_counter = 0

In [None]:
# Create the results directory: <BASE_DIR>/results/<month-day_hour-minute>.
now_time = datetime.now()
time_str = now_time.strftime('%m-%d_%H-%M')
log_dir = os.path.join(BASE_DIR, "results", time_str)
# exist_ok avoids the race between a separate existence check and creation;
# it also covers re-running the cell within the same minute.
os.makedirs(log_dir, exist_ok=True)
print(f"结果保存目录: {log_dir}")

In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split

 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

train_loader, valid_loader, speaker_num = get_dataloader(data_dir, batch_size, n_workers)
train_iterator = iter(train_loader)
print("[Info]: Finish loading data!", flush=True)

model = Classifier(n_spks=speaker_num).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-3)
scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
print("[Info]: Finish creating model!", flush=True)

best_accuracy = -1.0
best_state_dict = None

# valid() reports accuracy as a fraction in [0, 1], so "within two percentage
# points of the best" is a margin of 0.02 (not 2.0).
near_best_margin = 0.02

# Training is step-based: the loop runs total_steps optimizer steps and
# validates every valid_steps steps. There is no epoch counter.
for step in range(total_steps):
  # Get data; restart the iterator once the loader is exhausted.
  try:
    batch = next(train_iterator)
  except StopIteration:
    train_iterator = iter(train_loader)
    batch = next(train_iterator)

  train_loss, train_accuracy = model_fn(batch, model, criterion, device)

  # Record plain floats only; keeping the loss tensor itself in a list would
  # hold on to it (and anything it references) for the whole run.
  batch_loss = train_loss.item()
  batch_accuracy = train_accuracy.item()
  train_losses.append(batch_loss)
  train_accuracies.append(batch_accuracy)

  # Update model
  train_loss.backward()
  optimizer.step()

  # Log the learning rate used for this step, then advance the schedule.
  current_lr = scheduler.get_last_lr()[0]
  learning_rates.append(current_lr)
  scheduler.step()
  optimizer.zero_grad()

  # Do validation
  if (step + 1) % valid_steps == 0:
    current_step = step + 1

    val_loss, val_accuracy = valid(valid_loader, model, criterion, device)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    if val_accuracy > best_val_accuracy:
      # New best model.
      best_val_accuracy = val_accuracy
      # best_epoch keeps its existing name but holds the step number.
      best_epoch = current_step
      early_stop_counter = 0

      # Keep an in-memory snapshot as well; tensors are cloned because
      # state_dict() returns references to the live parameters.
      best_accuracy = best_val_accuracy
      best_state_dict = {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
      }

      checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict(),
        # The "epoch" key is kept for layout compatibility; value is a step.
        "epoch": current_step,
        "best_val_accuracy": best_val_accuracy,
        "val_loss": val_loss,
        "train_accuracy": batch_accuracy,
        "train_loss": batch_loss,
      }
      path_checkpoint = os.path.join(log_dir, "checkpoint_best.pkl")
      torch.save(checkpoint, path_checkpoint)
      print(f"✅ 保存最佳模型，验证准确率: {best_val_accuracy * 100:.2f}%")

    elif val_accuracy > best_val_accuracy - near_best_margin:
      # Close to the best: keep the weights (e.g. for ensembling).
      checkpoint = {
        "model_state_dict": model.state_dict(),
        "val_accuracy": val_accuracy,
        "epoch": current_step,
      }
      torch.save(
        checkpoint,
        os.path.join(log_dir, f"checkpoint_step_{current_step}_acc_{val_accuracy * 100:.2f}.pth"),
      )

    else:
      early_stop_counter += 1

    # Progress report; accuracies are converted from fractions to percent.
    print(f'Step: {current_step:05d}/{total_steps}, '
          f'训练损失: {batch_loss:.4f}, 训练准确率: {batch_accuracy * 100:.2f}% , '
          f'验证损失: {val_loss:.4f}, 验证准确率: {val_accuracy * 100:.2f}% , '
          f'学习率: {current_lr:.6f}, '
          f'最佳: {best_val_accuracy * 100:.2f}% @ Step {best_epoch}')
    print('-' * 80)

使用设备: cpu


In [None]:
import os
import json
import torch
from pathlib import Path
from torch.utils.data import Dataset


class InferenceDataset(Dataset):
  """Test-time dataset: yields every utterance listed in ``testdata.json``."""

  def __init__(self, data_dir):
    """Read the utterance list.

    Args:
        data_dir: Directory containing ``testdata.json`` (with an
            ``"utterances"`` list) and the feature files it refers to.
    """
    testdata_path = Path(data_dir) / "testdata.json"
    # Context manager so the file handle is closed immediately after parsing.
    with testdata_path.open() as testdata_file:
      metadata = json.load(testdata_file)
    self.data_dir = data_dir
    self.data = metadata["utterances"]

  def __len__(self):
    """Return the number of test utterances."""
    return len(self.data)

  def __getitem__(self, index):
    """Return ``(feature_path, mel)`` for one utterance.

    The whole utterance is returned; nothing is cropped. ``feature_path`` is
    the path as written in ``testdata.json`` (relative to ``data_dir``).
    """
    utterance = self.data[index]
    feat_path = utterance["feature_path"]
    mel = torch.load(os.path.join(self.data_dir, feat_path))

    return feat_path, mel


def inference_collate_batch(batch):
  """Combine ``(feature_path, mel)`` samples into one batch.

  Returns a tuple of the feature paths and the mels stacked along a new
  leading dimension. No padding is done, so every mel in the batch must have
  the same shape.
  """
  unzipped = zip(*batch)
  feat_paths, mels = unzipped
  stacked = torch.stack(mels)
  return feat_paths, stacked

In [None]:
import json
import csv
from pathlib import Path
from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader

def parse_args():
  """Return the inference settings as a dict.

  The values are hard-coded; the keys match the parameter names of ``main``.
  """
  return dict(
    data_dir="./Dataset",
    model_path="./model.ckpt",
    output_path="./output.csv",
  )


def main(
  data_dir,
  model_path,
  output_path,
):
  """Run inference on the test set and write the predictions to a CSV file.

  Args:
      data_dir: Directory with ``mapping.json``, ``testdata.json`` and the
          feature files.
      model_path: File loadable with ``torch.load``. Either a bare model
          state dict, or a checkpoint dict that stores the weights under the
          ``"model_state_dict"`` key.
      output_path: Destination CSV. It gets an ``Id,Category`` header followed
          by one ``feature_path,speaker`` row per utterance.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"[Info]: Use {device} now!")

  mapping_path = Path(data_dir) / "mapping.json"
  # Context manager so the file handle is closed right after parsing.
  with mapping_path.open() as mapping_file:
    mapping = json.load(mapping_file)
  id2speaker = mapping["id2speaker"]

  dataset = InferenceDataset(data_dir)
  # batch_size=1: utterances are not padded at inference time, and the
  # collate function stacks tensors, which requires equal shapes.
  dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    drop_last=False,
    num_workers=8,
    collate_fn=inference_collate_batch,
  )
  print(f"[Info]: Finish loading data!",flush = True)

  speaker_num = len(id2speaker)
  model = Classifier(n_spks=speaker_num).to(device)

  # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
  # machine (and vice versa).
  state = torch.load(model_path, map_location=device)
  # Accept full training checkpoints as well as bare state dicts.
  if isinstance(state, dict) and "model_state_dict" in state:
    state = state["model_state_dict"]
  model.load_state_dict(state)
  model.eval()
  print(f"[Info]: Finish creating model!",flush = True)

  results = [["Id", "Category"]]
  with torch.no_grad():
    for feat_paths, mels in tqdm(dataloader):
      mels = mels.to(device)
      outs = model(mels)
      preds = outs.argmax(1).cpu().tolist()
      for feat_path, pred in zip(feat_paths, preds):
        # JSON object keys are strings, hence str(pred).
        results.append([feat_path, id2speaker[str(pred)]])

  with open(output_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(results)


# Entry point: the dict returned by parse_args() is expanded into main()'s
# keyword arguments (data_dir, model_path, output_path).
if __name__ == "__main__":
  main(**parse_args())