### 基础代码
手撕 KL散度

给出公式 $KL(P, Q)=\sum P(i) \log \frac{P(i)}{Q(i)}$

In [5]:
import torch
def KL_divergence(x, y, eps=1e-12):
    x = x / x.sum(dim=-1)
    y = y / y.sum(dim=-1)
    
    x = x + eps
    y = y + eps
    sum = x * torch.log(x / y)
    return torch.sum(sum)

In [6]:
x = torch.tensor([0.4, 0.6])
y = torch.tensor([0.5, 0.5])
KL_divergence(x, y)

tensor(0.0201)

In [7]:
x = torch.tensor([0.5, 0.5])
y = torch.tensor([0.5, 0.5])
KL_divergence(x, y)

tensor(0.)

手撕交叉熵损失

给出公式 $H(P, Q) = \sum Q(i)\log P(i) = H(P) + KL(P, Q)$

In [11]:
def cross_entropy(x, y):
    sum = torch.log(x) * y
    return -torch.sum(sum)

x = torch.tensor([0.1, 0.2, 0.6, 0.1])
y = torch.tensor([0, 1, 0, 0])
loss = cross_entropy(x, y)
loss

tensor(1.6094)

手撕优化器

手撕SGD优化器

$\theta_i = \theta_i - g_i$

带动量的优化器

$\hat{g}_i = \lambda g_{i-1} + g_i  $

$\theta_i = \theta_i - lr * \hat{g}_i$

In [None]:
import torch

class SGDOptimizer:
    def __init__(self, params, lr=1e-3):
        """
        params: 可迭代的模型参数
        lr: 学习率 α
        """
        self.params = list(params)
        self.lr = lr

    def step(self):
        for p in self.params:
            if p.grad is None:
                continue
            # 参数更新 θ ← θ - α * grad
            p.data -= self.lr * p.grad.data

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.detach_()
                p.grad.zero_()

class SGDMomentumOptimizer:
    def __init__(self, params, lr=1e-3, momentum=0.9):
        self.params = list(params)
        self.lr = lr
        self.momentum = momentum
        self.v = [torch.zeros_like(p) for p in self.params]  # 初始化动量

    def step(self):
        for i, p in enumerate(self.params):
            if p.grad is None:
                continue
            # v_t = μ v_{t-1} + grad
            self.v[i] = self.momentum * self.v[i] + p.grad.data
            # θ ← θ - α * v_t
            p.data -= self.lr * self.v[i]

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.detach_()
                p.grad.zero_()

手撕Adam优化器

公式：
参数 $\beta_1$, $\beta_2$, lr, eps

额外的，有一步权重衰减
$g_i = g_i + \lambda \theta_i$


$m_i = (1 - \beta_1) * g_i + m_{i+1}$

$v_i = (1 - \beta_2) * g_i^2 + v_{i+1}$


$\theta_i = \theta_{i-1} - lr * \frac{m_i / (1 - \beta_1^i)}{\sqrt(v_i / (1 - \beta_2^i)) + eps}$


Adamw的区别就在于权重衰减是在外面
$\theta_i = \theta_i - lr * \theta_i * w$

In [2]:
import torch

class Adam:
    def __init__(self, params, lr=5e-5, betas=(0.9, 0.99), eps=1e-10):
        self.beta_1, self.beta_2 = betas
        self.alpha = 1
        self.lr = lr

        self.m = [torch.zeros(para.shape) for para in params]
        self.v = [torch.zeros(para.shape) for para in params]

        self.params = params
        self.t = 1
    
    def step(self):
        for i, para in enumerate(self.params):
            m = (1 - self.beta1) * para.grad + self.m[i][-1]
            v = (1 - self.beta2) * (para.grad ** 2) + self.v[i][-1]
            self.m.append(m)
            self.v.append(v)
            m_hat = m / (1 - self.beta_1 ** self.t)
            v_hat = v / (1 - self.beta_2 ** self.t)
            para.data -= - m_hat / ( v_hat ** 0.5 + self.eps ) * self.lr

        self.t += 1

    def zero_step(self):



SyntaxError: incomplete input (3576334902.py, line 28)

手撕AdamW优化器

手撕AUC

$AUC = \frac{R_{pos} - \frac{n_{pos}(n_{pos} + 1)}{2}}{n_{pos} \cdot n_{neg}}$