# 下载数据

## https://drive.google.com/file/d/1zHjG3F8msz9LBPhp_N5kp_O6G9F2Y5w9/view?usp=drive_link

In [None]:
# !gdown --id '1zHjG3F8msz9LBPhp_N5kp_O6G9F2Y5w9' --output Dataset.zip
# !unzip Dataset.zip

# 导入包

In [3]:
import os
import json
import torch
import random
import math
from pathlib import Path
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from datetime import datetime
import matplotlib.pyplot as plt


# 模型

In [4]:
class Classifier(nn.Module):
    """Speaker classifier: linear prenet, one Transformer encoder layer,
    mean pooling over time, and a two-layer prediction head."""

    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        """Build the classifier.

        Args:
            d_model (int): Feature width used inside the encoder. Defaults to 80.
            n_spks (int): Number of speakers, i.e. output classes. Defaults to 600.
            dropout (float): Dropout probability used inside the Transformer
                encoder layer. Defaults to 0.1.
        """
        super().__init__()

        # Project the 40-dimensional input features to d_model:
        # (batch_size, length, 40) -> (batch_size, length, d_model)
        self.prenet = nn.Linear(40, d_model)

        # TODO: replace the Transformer with a Conformer
        # (https://arxiv.org/abs/2005.08100).
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            dim_feedforward=256,
            nhead=2,
            # Previously the `dropout` argument was accepted but ignored, so
            # the layer always used its library default; forward it now.
            dropout=dropout,
        )
        # For a deeper encoder, wrap the layer:
        # self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

        # Prediction head mapping pooled features to one score per speaker.
        self.pred_layer = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, n_spks),
        )

    def forward(self, mels):
        """Compute speaker scores for a batch of feature sequences.

        Args:
            mels: Input features of shape (batch_size, length, 40).

        Returns:
            Tensor of shape (batch_size, n_spks) with one score per speaker.
        """
        # (batch_size, length, 40) -> (batch_size, length, d_model)
        out = self.prenet(mels)

        # The encoder layer is built with the default batch_first=False, so it
        # expects (length, batch_size, d_model).
        out = out.permute(1, 0, 2)
        out = self.encoder_layer(out)

        # Back to (batch_size, length, d_model).
        out = out.transpose(0, 1)

        # Mean pooling over the time axis -> (batch_size, d_model).
        stats = out.mean(dim=1)

        # (batch_size, d_model) -> (batch_size, n_spks)
        out = self.pred_layer(stats)

        return out

In [None]:
class SpeakerClassifier(nn.Module):
    """Speaker classification network.

    Pipeline: linear input projection, a single Transformer encoder layer,
    average pooling over the sequence axis, then a small MLP classifier.
    """

    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        """Create the sub-modules.

        Args:
            d_model (int): Width of the internal feature vectors.
            n_spks (int): Number of speakers (size of the output layer).
            dropout (float): Dropout probability for the encoder layer and
                the classifier head.
        """
        super().__init__()

        # 40 input features per frame -> d_model.
        self.prenet = nn.Linear(40, d_model)

        # One encoder layer operating on sequence-first tensors.
        encoder_options = dict(
            d_model=d_model,
            nhead=2,
            dim_feedforward=256,
            dropout=dropout,
            batch_first=False,
        )
        self.encoder_layer = nn.TransformerEncoderLayer(**encoder_options)

        # Classification head producing one logit per speaker.
        head_layers = [
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, n_spks),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, mels):
        """Run the network on a batch.

        Args:
            mels: Input features, shape (batch, seq_len, 40).

        Returns:
            Speaker logits, shape (batch, n_spks).
        """
        projected = self.prenet(mels)                  # (batch, seq_len, d_model)
        seq_first = projected.permute(1, 0, 2)         # (seq_len, batch, d_model)
        encoded = self.encoder_layer(seq_first)        # (seq_len, batch, d_model)
        batch_first = encoded.permute(1, 0, 2)         # (batch, seq_len, d_model)
        pooled = batch_first.mean(dim=1)               # (batch, d_model)
        return self.classifier(pooled)                 # (batch, n_spks)

## 创建模型的函数

In [5]:
def get_Classifier(d_model=80, n_spks=600, dropout=0.1):
    """Build a ``Classifier`` with the given hyperparameters.

    Args:
        d_model: Forwarded to ``Classifier`` as ``d_model``.
        n_spks: Forwarded to ``Classifier`` as ``n_spks``.
        dropout: Forwarded to ``Classifier`` as ``dropout``.

    Returns:
        The newly constructed ``Classifier`` instance.
    """
    return Classifier(d_model=d_model, n_spks=n_spks, dropout=dropout)

# 添加工具函数

## 绘制训练和验证损失曲线

In [7]:
def plot_loss_curves(train_losses, val_losses, save_path=None):
    """Plot training and validation loss on one figure.

    Each series is drawn against its own 0-based index.

    Args:
        train_losses: Sequence of training loss values.
        val_losses: Sequence of validation loss values.
        save_path: If truthy, the figure is written to this path and closed;
            otherwise the figure is shown.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='训练损失', linewidth=2)
    plt.plot(val_losses, label='验证损失', linewidth=2)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('训练和验证损失曲线')
    plt.legend()
    plt.grid(True, alpha=0.3)

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"图片已保存到: {save_path}")
        # Close the figure once saved so repeated calls do not accumulate
        # open figures (same behaviour as the other plot helpers here).
        plt.close()
    else:
        plt.show()

## 绘制训练和验证准确率曲线

In [8]:
def plot_accuracy_curves(train_accuracies, val_accuracies, save_path=None):
    """Plot training and validation accuracy and mark the best validation point.

    The training series is drawn at x = 1..len(train_accuracies). If the
    validation series has the same length it uses the same x positions.
    If the lengths differ (for example training is logged every step but
    validation only every few thousand steps), the validation points are
    spread evenly across the training x-range; this assumes validation was
    run at regular intervals.

    Values are plotted as given. The axis label and the annotation append
    '%', so callers are expected to pass percentages.

    Args:
        train_accuracies: Sequence of training accuracy values.
        val_accuracies: Sequence of validation accuracy values. May be empty,
            in which case no "best" marker is drawn.
        save_path: If truthy, the figure is written to this path and closed;
            otherwise the figure is shown.
    """
    # A list is required for .index() below (tuples work, arrays do not).
    val_accuracies = list(val_accuracies)
    n_train = len(train_accuracies)
    n_val = len(val_accuracies)
    epochs = range(1, n_train + 1)

    # x positions for the validation points (see docstring).
    if n_train in (0, n_val):
        val_x = list(range(1, n_val + 1))
    else:
        val_x = [(i + 1) * n_train / n_val for i in range(n_val)]

    plt.figure(figsize=(10, 6))
    plt.plot(epochs, train_accuracies, 'b-', label='训练准确率', linewidth=2)
    plt.plot(val_x, val_accuracies, 'r-', label='验证准确率', linewidth=2)

    plt.title('训练和验证准确率', fontsize=14, fontweight='bold')
    plt.xlabel('Epochs', fontsize=12)
    plt.ylabel('Accuracy (%)', fontsize=12)
    plt.legend(fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.ylim(bottom=0)

    # Annotate the best validation accuracy; skipped when there is no
    # validation data (max() of an empty sequence would raise).
    if val_accuracies:
        best_val_acc = max(val_accuracies)
        best_index = val_accuracies.index(best_val_acc)
        best_epoch = best_index + 1
        best_x = val_x[best_index]
        plt.axvline(x=best_x, color='gray', linestyle='--', alpha=0.7)
        plt.text(best_x, best_val_acc / 2, f'最佳: {best_val_acc:.2f}%\nEpoch: {best_epoch}',
                 ha='center', va='center', fontsize=10,
                 bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
    else:
        plt.show()

## 绘制综合曲线

In [9]:
def plot_training_curves(train_losses, val_losses, train_accuracies, val_accuracies, save_path=None):
    """Plot loss (top) and accuracy (bottom) for training and validation.

    The training series are drawn at x = 1..len(train_losses). A validation
    series of the same length uses the same x positions. A validation series
    of a different length is spread evenly across the training x-range,
    which assumes validation was run at regular intervals.

    Accuracy values are plotted as given. The axis label and the annotation
    append '%', so callers are expected to pass percentages.

    Args:
        train_losses: Sequence of training loss values.
        val_losses: Sequence of validation loss values.
        train_accuracies: Sequence of training accuracy values; expected to
            have the same length as ``train_losses``.
        val_accuracies: Sequence of validation accuracy values. May be empty,
            in which case no "best" marker is drawn.
        save_path: If truthy, the figure is written to this path and closed;
            otherwise the figure is shown.
    """
    # A list is required for .index() below.
    val_accuracies = list(val_accuracies)
    n_train = len(train_losses)
    epochs = range(1, n_train + 1)

    def _x_positions(n_points):
        """Return x positions for a validation series of n_points values."""
        if n_train in (0, n_points):
            return list(range(1, n_points + 1))
        return [(i + 1) * n_train / n_points for i in range(n_points)]

    val_loss_x = _x_positions(len(val_losses))
    val_acc_x = _x_positions(len(val_accuracies))

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

    # Loss curves.
    ax1.plot(epochs, train_losses, 'b-', label='训练损失', linewidth=2)
    ax1.plot(val_loss_x, val_losses, 'r-', label='验证损失', linewidth=2)
    ax1.set_title('训练和验证损失', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Loss', fontsize=12)
    ax1.legend(fontsize=12)
    ax1.grid(True, alpha=0.3)

    # Accuracy curves.
    ax2.plot(epochs, train_accuracies, 'b-', label='训练准确率', linewidth=2)
    ax2.plot(val_acc_x, val_accuracies, 'r-', label='验证准确率', linewidth=2)
    ax2.set_title('训练和验证准确率', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Epochs', fontsize=12)
    ax2.set_ylabel('Accuracy (%)', fontsize=12)
    ax2.legend(fontsize=12)
    ax2.grid(True, alpha=0.3)
    ax2.set_ylim(bottom=0)

    # Annotate the best validation accuracy; skipped when there is no
    # validation data (max() of an empty sequence would raise).
    if val_accuracies:
        best_val_acc = max(val_accuracies)
        best_x = val_acc_x[val_accuracies.index(best_val_acc)]
        ax2.axvline(x=best_x, color='gray', linestyle='--', alpha=0.7)
        ax2.text(best_x, best_val_acc / 2, f'最佳: {best_val_acc:.2f}%',
                 ha='center', va='center', fontsize=10,
                 bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.close()
    else:
        plt.show()

# dataset

In [2]:
class SpeakerDataset(Dataset):
    """Dataset of (feature tensor, speaker id) pairs read from a directory.

    The directory must contain ``mapping.json`` (with a ``"speaker2id"``
    mapping) and ``metadata.json`` (with a ``"speakers"`` mapping from speaker
    name to a list of utterance records, each holding a ``"feature_path"``
    relative to the directory).
    """

    def __init__(self, data_dir, segment_len=128):
        """Index every utterance listed in the metadata.

        Args:
            data_dir (str): Path of the dataset directory.
            segment_len (int): Maximum number of frames returned per item.
                Defaults to 128.
        """
        self.data_dir = data_dir
        self.segment_len = segment_len

        root = Path(data_dir)

        # Speaker name -> integer id. `with` closes the file handle, which
        # the bare open() calls used previously never did.
        with (root / "mapping.json").open(encoding="utf-8") as mapping_file:
            mapping = json.load(mapping_file)
        self.speaker2id = mapping["speaker2id"]

        # Speaker name -> list of utterance records.
        with (root / "metadata.json").open(encoding="utf-8") as metadata_file:
            metadata = json.load(metadata_file)["speakers"]

        self.speaker_num = len(metadata)

        # One [feature_path, speaker_id] entry per utterance.
        self.data = [
            [utterance["feature_path"], self.speaker2id[speaker]]
            for speaker, utterances in metadata.items()
            for utterance in utterances
        ]

    def __len__(self):
        """Return the number of utterances."""
        return len(self.data)

    def __getitem__(self, index):
        """Load one utterance.

        Features longer than ``segment_len`` frames are cropped to a random
        window of exactly ``segment_len`` frames; shorter ones are returned
        whole, so item lengths can differ.

        Returns:
            tuple: (float feature tensor, 0-dim long tensor with the speaker id).
        """
        feat_path, speaker = self.data[index]

        # Presumably a (frames, 40) tensor written with torch.save; the frame
        # axis must be the first one for the cropping below.
        mel = torch.load(os.path.join(self.data_dir, feat_path))

        if len(mel) > self.segment_len:
            # random.randint is inclusive on both ends, so the window always
            # fits inside the utterance.
            start = random.randint(0, len(mel) - self.segment_len)
            mel = torch.FloatTensor(mel[start:start + self.segment_len])
        else:
            mel = torch.FloatTensor(mel)

        speaker = torch.LongTensor([speaker]).squeeze()

        return mel, speaker

    def get_speaker_number(self):
        """Return the number of speakers listed in the metadata."""
        return self.speaker_num


# dataloader

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence


def collate_batch(batch):
    """Collate (features, speaker id) pairs into padded batch tensors.

    Feature sequences in one batch may have different lengths; they are
    right-padded to the longest one.

    Args:
        batch: Iterable of (feature tensor, speaker id) pairs. Speaker ids
            may be Python ints or 0-dim tensors.

    Returns:
        tuple: (padded features of shape (batch, max_len, feature_dim),
        1-D long tensor of speaker ids).
    """
    mel, speaker = zip(*batch)

    # Pad with -20; per the original author's note this stands for a
    # log-scale value of about 1e-20, i.e. effectively silence.
    mel = pad_sequence(mel, batch_first=True, padding_value=-20)

    # Build the label tensor directly as int64. The previous version went
    # through a float32 tensor first, which cannot represent every integer
    # above 2**24 exactly.
    labels = torch.stack([torch.as_tensor(s, dtype=torch.long) for s in speaker])

    return mel, labels


def get_dataloader(data_dir, batch_size, n_workers):
    """Create the training and validation data loaders.

    The dataset found at ``data_dir`` is split randomly: 90% of the items go
    to training, the remainder to validation.

    Args:
        data_dir (str): Path of the dataset directory.
        batch_size (int): Number of items per batch.
        n_workers (int): Number of DataLoader worker processes.

    Returns:
        tuple: (training loader, validation loader, number of speakers).
    """
    full_set = SpeakerDataset(data_dir)
    n_speakers = full_set.get_speaker_number()

    # 90/10 random split.
    n_total = len(full_set)
    n_train = int(0.9 * n_total)
    train_part, valid_part = random_split(full_set, [n_train, n_total - n_train])

    # Options common to both loaders: incomplete final batches are dropped,
    # memory is pinned, and variable-length items are padded by collate_batch.
    shared_options = dict(
        batch_size=batch_size,
        num_workers=n_workers,
        drop_last=True,
        pin_memory=True,
        collate_fn=collate_batch,
    )

    # Only the training loader reshuffles on every pass.
    train_loader = DataLoader(train_part, shuffle=True, **shared_options)
    valid_loader = DataLoader(valid_part, **shared_options)

    return train_loader, valid_loader, n_speakers

# 学习率

In [3]:
import math
import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR

def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5):
    """Create a LambdaLR schedule with linear warmup followed by cosine decay.

    The learning-rate multiplier rises linearly from 0 to 1 during the first
    ``num_warmup_steps`` steps and then follows a cosine curve over the
    remaining steps, clamped so it never goes below 0.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of warmup steps.
        num_training_steps: Total number of training steps.
        num_cycles: Number of cosine cycles in the decay phase (0.5 gives a
            single half-wave from 1 down to 0).

    Returns:
        The ``LambdaLR`` scheduler.
    """
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step):
        # Decay phase: cosine of the fraction of post-warmup steps completed.
        if current_step >= num_warmup_steps:
            progress = float(current_step - num_warmup_steps) / decay_span
            cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
            return max(0.0, 0.5 * (1.0 + cosine))
        # Warmup phase: linear ramp.
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda)

In [4]:
def model_fn(batch, model, criterion, device):
    """Run the model on one batch and score the result.

    Args:
        batch: Pair of (inputs, labels) tensors.
        model: Model called on the inputs.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: (loss tensor, accuracy tensor), where accuracy is the fraction
        of items whose arg-max output equals the label.
    """
    inputs, targets = (tensor.to(device) for tensor in batch)

    logits = model(inputs)
    loss = criterion(logits, targets)

    # Predicted class is the index of the largest output along dim 1.
    hits = logits.argmax(dim=1) == targets

    return loss, hits.float().mean()

In [5]:
def validate(dataloader, model, criterion, device):
    """Evaluate the model on a validation loader.

    The model is put in eval mode for the duration of the call and is then
    returned to whatever mode it was in before, even if evaluation raises.

    Args:
        dataloader: Validation data loader; must yield at least one batch.
        model: Model to evaluate.
        criterion: Loss function.
        device: Device used for the computation.

    Returns:
        tuple: (mean of the per-batch losses, mean of the per-batch
        accuracies). Every batch has equal weight.

    Raises:
        ValueError: If the loader has no batches (for example a validation
            split smaller than the batch size combined with ``drop_last``).
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Previously this surfaced as a ZeroDivisionError in the final average.
        raise ValueError("validate() needs at least one batch, but the dataloader is empty")

    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_accuracy = 0.0

    try:
        with torch.no_grad():
            for batch in dataloader:
                loss, accuracy = model_fn(batch, model, criterion, device)
                total_loss += loss.item()
                total_accuracy += accuracy.item()
    finally:
        # Restore the caller's mode rather than forcing train mode.
        model.train(was_training)

    return total_loss / num_batches, total_accuracy / num_batches


In [None]:
# Configuration
data_dir = "./Dataset"  # dataset directory passed to get_dataloader
batch_size = 32  # items per batch for both loaders
n_workers = 4  # DataLoader worker processes
valid_steps = 2000  # run validation every this many training steps
warmup_steps = 1000  # linear warmup length of the learning-rate schedule
save_steps = 10000  # write a checkpoint every this many training steps
total_steps = 70000  # total number of training steps

In [None]:
    # 创建保存目录
# Results go into ./results/<month-day_hour-minute>, created on demand.
now_time = datetime.now()
time_str = now_time.strftime('%m-%d_%H-%M')
log_dir = os.path.join("./results", time_str)
if not os.path.isdir(log_dir):
    os.makedirs(log_dir, exist_ok=True)
print(f"结果保存目录: {log_dir}")

In [None]:
# Pick the compute device: CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"使用设备: {device}")

In [None]:
# Build the training/validation loaders and get the number of speakers.
train_loader, val_loader, speaker_num = get_dataloader(data_dir, batch_size, n_workers)
print(f"[信息]: 完成数据加载! 说话人数量: {speaker_num}")

In [None]:
# Create the model, loss, optimizer and learning-rate schedule.
model = SpeakerClassifier(n_spks=speaker_num).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-3)
# Linear warmup for warmup_steps, then cosine decay until total_steps.
scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
print("[信息]: 完成模型创建!")

In [None]:
# Metric histories filled in by the training loop.
train_losses = []  # one entry per training step
val_losses = []  # one entry per validation run
train_accuracies = []  # one entry per training step
val_accuracies = []  # one entry per validation run
learning_rates = []  # one entry per training step

# Best validation accuracy seen so far and the matching model weights.
best_val_accuracy = 0.0
best_state_dict = None

In [None]:
# Training loop: runs a fixed number of steps, validating every valid_steps
# steps and checkpointing every save_steps steps.
model.train()
train_iterator = iter(train_loader)

print("开始训练...")
start_time = datetime.now()

for step in range(total_steps):
    # Fetch the next batch; when the loader is exhausted, start a new pass.
    try:
        batch = next(train_iterator)
    except StopIteration:
        train_iterator = iter(train_loader)
        batch = next(train_iterator)

    # Forward pass, backward pass, then optimizer and schedule update.
    train_loss, train_accuracy = model_fn(batch, model, criterion, device)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    scheduler.step()

    # Record per-step training metrics.
    train_losses.append(train_loss.item())
    train_accuracies.append(train_accuracy.item())
    current_lr = scheduler.get_last_lr()[0]
    learning_rates.append(current_lr)

    # Progress report every 1000 steps, with an ETA based on average speed.
    if (step + 1) % 1000 == 0:
        elapsed_time = (datetime.now() - start_time).total_seconds()
        steps_per_sec = (step + 1) / elapsed_time
        eta = (total_steps - step - 1) / steps_per_sec if steps_per_sec > 0 else 0

        print(f"步骤 {step+1}/{total_steps} | "
              f"训练损失: {train_loss.item():.4f} | "
              f"训练准确率: {train_accuracy.item():.4f} | "
              f"学习率: {current_lr:.2e} | "
              f"已用时间: {elapsed_time/60:.1f}分钟 | "
              f"剩余时间: {eta/60:.1f}分钟")

    # Validation.
    if (step + 1) % valid_steps == 0:
        val_loss, val_accuracy = validate(val_loader, model, criterion, device)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"验证结果 | 步骤 {step+1} | "
              f"验证损失: {val_loss:.4f} | "
              f"验证准确率: {val_accuracy:.4f}")

        # Keep the best model seen so far.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # state_dict() returns tensors that share storage with the live
            # parameters, and dict.copy() is shallow, so the old snapshot kept
            # changing as training continued. Clone every tensor so it stays
            # frozen at the best weights.
            best_state_dict = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

            # Save the best model to disk.
            model_path = os.path.join(log_dir, f"best_model_acc_{best_val_accuracy:.4f}.pth")
            torch.save({
                'model_state_dict': best_state_dict,
                'val_accuracy': best_val_accuracy,
                'step': step
            }, model_path)
            print(f"✅ 保存最佳模型! 验证准确率: {best_val_accuracy:.4f}")

    # Periodic checkpoint. It stores the CURRENT weights so they match the
    # optimizer and scheduler state saved next to them (the best weights are
    # in the best_model_* files). It is written whether or not a best model
    # exists yet.
    if (step + 1) % save_steps == 0:
        checkpoint_path = os.path.join(log_dir, f"checkpoint_step_{step+1}.pth")
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'step': step,
            'best_val_accuracy': best_val_accuracy
        }, checkpoint_path)
        print(f"💾 保存检查点: {checkpoint_path}")

# Training finished.
total_time = (datetime.now() - start_time).total_seconds()
print(f"🎉 训练完成! 总用时: {total_time/60:.1f}分钟")
print(f"最佳验证准确率: {best_val_accuracy:.4f}")

In [None]:
# Collect the training history.
training_history = {
    'train_losses': train_losses,
    'val_losses': val_losses,
    'train_accuracies': train_accuracies,
    'val_accuracies': val_accuracies,
    'learning_rates': learning_rates,
    'best_val_accuracy': best_val_accuracy,
    # 0-based index of the validation run with the highest accuracy (first
    # one on ties), or -1 when no validation was run. The previous value,
    # len(val_accuracies) - 1, was always the LAST validation run.
    'best_epoch': val_accuracies.index(max(val_accuracies)) if val_accuracies else -1
}

# Write the history into the results directory.
log_path = os.path.join(log_dir, "training_history.pth")
torch.save(training_history, log_path)
print(f"训练记录已保存: {log_path}")

In [None]:
# Plot the training curves.
print("开始绘制训练曲线...")

picture_path_loss = os.path.join(log_dir, 'loss_curves.png')
picture_path_acc = os.path.join(log_dir, 'accuracy_curves.png')
picture_path_combined = os.path.join(log_dir, 'training_curves.png')
# NOTE: no learning-rate figure is drawn below; this path is currently unused.
picture_path_lr = os.path.join(log_dir, 'learning_rate.png')


def _interval_means(values, interval):
    """Average each complete, consecutive chunk of `interval` values."""
    return [
        sum(values[start:start + interval]) / interval
        for start in range(0, len(values) - interval + 1, interval)
    ]


# Training metrics hold one value per step, validation metrics one value per
# valid_steps steps. Passing them together made the plot helpers fail on
# mismatched x/y lengths, so average the training metrics over each validation
# interval to get one training point per validation point.
plot_train_losses = _interval_means(train_losses, valid_steps)
# Accuracies are recorded as fractions in [0, 1] (mean of correct
# predictions); the plots label the axis 'Accuracy (%)', so convert here.
plot_train_accuracies = [100.0 * acc for acc in _interval_means(train_accuracies, valid_steps)]
plot_val_accuracies = [100.0 * acc for acc in val_accuracies]

plot_loss_curves(plot_train_losses, val_losses, picture_path_loss)
plot_accuracy_curves(plot_train_accuracies, plot_val_accuracies, picture_path_acc)
plot_training_curves(plot_train_losses, val_losses, plot_train_accuracies, plot_val_accuracies, picture_path_combined)
print(f"所有训练曲线已保存至: {log_dir}")