In [50]:
import torch
from torch import nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

In [51]:
# Load mnist dataset
train_dataset = torchvision.datasets.MNIST(
    root="../../../Other/datasets/PyTorch",
    train=True,
    transform=transforms.ToTensor(),
    download=True
)

test_dataset = torchvision.datasets.MNIST(
    root="../../../Other/datasets/PyTorch",
    train=False,
    transform=transforms.ToTensor(),
    download=True
)
# Create train and test dataloaders
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [52]:
class TeacherModel(nn.Module):
    """教师模型"""

    def __init__(self, in_channels=1, num_classes=10):
        super(TeacherModel, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=64,
            kernel_size=(3, 3),
            stride=(1, 1),
            padding=(1, 1),
        )
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(
            in_channels=64,
            out_channels=256,
            kernel_size=(3, 3),
            stride=(1, 1),
            padding=(1, 1),
        )
        self.fc1 = nn.Linear(256 * 7 * 7, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        return x


class StudentModel(nn.Module):
    """学生模型"""

    def __init__(self, in_channels=1, num_classes=10):
        super(StudentModel, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=8,
            kernel_size=(3, 3),
            stride=(1, 1),
            padding=(1, 1),
        )
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(
            in_channels=8,
            out_channels=16,
            kernel_size=(3, 3),
            stride=(1, 1),
            padding=(1, 1),
        )
        self.fc1 = nn.Linear(16 * 7 * 7, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        return x

In [53]:
def check_accuracy(loader, model, device):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)

            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

    model.train()
    return (num_correct / num_samples).item()  # 每一轮的平均准确率


def train_teacher(epochs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    teacher_model = TeacherModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(teacher_model.parameters(), lr=1e-4)

    for epoch in range(epochs):
        teacher_model.train()
        losses = []

        pbar = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch}")
        for data, targets in pbar:
            data = data.to(device)
            targets = targets.to(device)

            # forward
            scores = teacher_model(data)
            loss = criterion(scores, targets)
            losses.append(loss.item())
            # backward
            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

        avg_loss = sum(losses) / len(losses)
        acc = check_accuracy(test_loader, teacher_model, device)
        print(f"Loss:{avg_loss:.2f}\tAccuracy:{acc:.2f}")

    return teacher_model

In [54]:
teacher_model_ = train_teacher(3)
teacher_model_  # 训练好的教师模型

Epoch 0: 100%|██████████| 1875/1875 [00:12<00:00, 148.62it/s]


Loss:0.28	Accuracy:0.98


Epoch 1: 100%|██████████| 1875/1875 [00:12<00:00, 147.25it/s]


Loss:0.08	Accuracy:0.98


Epoch 2: 100%|██████████| 1875/1875 [00:13<00:00, 141.26it/s]


Loss:0.05	Accuracy:0.98


TeacherModel(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=12544, out_features=10, bias=True)
)

In [55]:
def train_student(epochs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    student_model = StudentModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-4)

    for epoch in range(epochs):
        student_model.train()
        losses = []

        pbar = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch}")
        for data, targets in pbar:
            data = data.to(device)
            targets = targets.to(device)

            # forward
            scores = student_model(data)
            loss = criterion(scores, targets)
            losses.append(loss.item())
            # backward
            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

        avg_loss = sum(losses) / len(losses)
        acc = check_accuracy(test_loader, student_model, device)
        print(f"Loss:{avg_loss:.2f}\tAccuracy:{acc:.2f}")

    return student_model

In [56]:
student_model_ = train_student(3)
student_model_  # 没有蒸馏的学生模型

Epoch 0: 100%|██████████| 1875/1875 [00:13<00:00, 137.47it/s]


Loss:0.85	Accuracy:0.91


Epoch 1: 100%|██████████| 1875/1875 [00:13<00:00, 141.05it/s]


Loss:0.30	Accuracy:0.93


Epoch 2: 100%|██████████| 1875/1875 [00:13<00:00, 143.09it/s]


Loss:0.23	Accuracy:0.94


StudentModel(
  (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=784, out_features=10, bias=True)
)

In [1]:
def train_step(
        teacher,  # 教师模型
        student,  # 学生模型
        optimizer,  # 学生模型优化器
        student_loss_fn,  # sort loss 损失函数
        divergence_loss_fn,  # hard loss 损失函数
        temp,  # 温度
        alpha,  # soft loss权重
        epoch,  # 训练轮数
        device):
    losses = []
    pbar = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch}")
    for data, targets in pbar:
        # Get data to cuda if possible
        data = data.to(device)
        targets = targets.to(device)

        # forward
        with torch.no_grad():
            teacher_preds = teacher(data)

        student_preds = student(data)
        student_loss = student_loss_fn(student_preds, targets)

        ditillation_loss = divergence_loss_fn(
            F.softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        # loss = alpha * 学生损失 + (1 - alpha) * 蒸馏损失
        loss = alpha * student_loss + (1 - alpha) * ditillation_loss  # 总损失
        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

    avg_loss = sum(losses) / len(losses)
    return avg_loss


def distill(epochs,  # 训练轮数
            teacher,  # 教师模型
            student,  # 学生模型
            temp=7,  # 温度
            alpha=0.3):  # soft loss权重
    """通过教师模型的输出蒸馏学生模型"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    teacher = teacher.to(device)
    student = student.to(device)
    student_loss_fn = nn.CrossEntropyLoss()  # soft loss 损失函数
    divergence_loss_fn = nn.KLDivLoss(reduction="batchmean")  # hard loss损失函数
    optimizer = torch.optim.Adam(student.parameters(), lr=1e-4)  # 学生模型优化器

    teacher.eval()
    student.train()
    for epoch in range(epochs):
        loss = train_step(teacher, student, optimizer, student_loss_fn,
            divergence_loss_fn, temp, alpha, epoch, device
        )
        acc = check_accuracy(test_loader, student, device)
        print(f"Loss:{loss:.2f}\tAccuracy:{acc:.2f}")
    return student

In [58]:
student_model_new = StudentModel()
distill(epochs=3, teacher=teacher_model_,
        student=student_model_new, temp=7, alpha=0.5)  # 学生网络的蒸馏模型

Epoch 0: 100%|██████████| 1875/1875 [00:14<00:00, 130.29it/s]


Loss:-0.61	Accuracy:0.92


Epoch 1: 100%|██████████| 1875/1875 [00:14<00:00, 128.20it/s]


Loss:-0.89	Accuracy:0.94


Epoch 2: 100%|██████████| 1875/1875 [00:14<00:00, 129.02it/s]


Loss:-0.93	Accuracy:0.95


StudentModel(
  (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=784, out_features=10, bias=True)
)