
# IFT3395 Competition 2 - Advanced Ensemble (Max Performance)

高级策略组合：
- 扩大超参搜索空间（10+ 配置）
- 集成 top-5 模型（加权平均）
- 增强数据增强（旋转、缩放、混合）
- 伪标签（Pseudo-labeling）
- 更长的训练 + 学习率预热
- 多阶段训练策略


In [9]:

import csv
import math
import pickle
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

DATA_DIR = Path('data')
SEED = 5050
rng = np.random.default_rng(SEED)
np.set_printoptions(precision=4, suppress=True)


In [10]:

def load_split(split: str) -> Dict[str, np.ndarray]:
    path = DATA_DIR / f"{split}_data.pkl"
    with path.open('rb') as f:
        return pickle.load(f)

train_data = load_split('train')
test_data = load_split('test')

images = train_data['images'].astype(np.float32)
labels = train_data['labels'].reshape(-1).astype(int)
test_images = test_data['images'].astype(np.float32)
num_classes = len(np.unique(labels))
print(f"Train: {images.shape}, Test: {test_images.shape}, Classes: {num_classes}")


Train: (1080, 28, 28, 3), Test: (400, 28, 28, 3), Classes: 5


In [11]:

unique, counts = np.unique(labels, return_counts=True)
class_weights_base = counts.sum() / (counts.astype(np.float32) + 1e-6)
class_weights = class_weights_base ** 1.2
class_weights = class_weights / class_weights.mean()
print('Class counts:', dict(zip(unique, counts)))
print('Class weights:', class_weights)


Class counts: {np.int64(0): np.int64(486), np.int64(1): np.int64(128), np.int64(2): np.int64(206), np.int64(3): np.int64(194), np.int64(4): np.int64(66)}
Class weights: [0.2198 1.0898 0.6157 0.6617 2.413 ]


In [12]:

def flatten_and_normalize(imgs: np.ndarray) -> np.ndarray:
    flat = imgs.reshape(len(imgs), -1).astype(np.float32)
    return flat / 255.0

class StandardScaler:
    def __init__(self):
        self.mean_: Optional[np.ndarray] = None
        self.std_: Optional[np.ndarray] = None

    def fit(self, x: np.ndarray) -> 'StandardScaler':
        self.mean_ = x.mean(axis=0, keepdims=True)
        self.std_ = x.std(axis=0, keepdims=True) + 1e-5
        return self

    def transform(self, x: np.ndarray) -> np.ndarray:
        if self.mean_ is None or self.std_ is None:
            raise ValueError('Scaler not fitted')
        return (x - self.mean_) / self.std_

x_norm = flatten_and_normalize(images)
x_test_norm = flatten_and_normalize(test_images)
scaler = StandardScaler().fit(x_norm)
x_std = scaler.transform(x_norm)
x_test_std = scaler.transform(x_test_norm)
SCALE_MEAN = scaler.mean_
SCALE_STD = scaler.std_


def standardize_raw(raw_batch: np.ndarray) -> np.ndarray:
    return (raw_batch - SCALE_MEAN) / SCALE_STD


def train_val_split_indices(n: int, val_ratio: float, seed: int) -> Tuple[np.ndarray, np.ndarray]:
    rng = np.random.default_rng(seed)
    idx = rng.permutation(n)
    val_size = int(n * val_ratio)
    return idx[val_size:], idx[:val_size]


In [13]:

def augment_batch_advanced(raw_batch: np.ndarray, p_flip: float = 0.5, noise_std: float = 0.02, brightness: float = 0.1, mixup_alpha: float = 0.2) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    imgs = raw_batch.reshape(len(raw_batch), 28, 28, 3).copy()
    batch_size = len(imgs)
    y_aug = np.arange(batch_size)
    lam = np.ones(batch_size)

    if np.random.rand() < p_flip:
        imgs = imgs[:, :, ::-1, :]
    if np.random.rand() < p_flip * 0.5:
        imgs = imgs[:, ::-1, :, :]
    imgs += np.random.normal(0.0, noise_std, size=imgs.shape)
    imgs += (np.random.rand() - 0.5) * brightness
    if mixup_alpha > 0 and np.random.rand() < 0.5:
        indices = np.random.permutation(batch_size)
        lam = np.random.beta(mixup_alpha, mixup_alpha, size=batch_size)
        lam = np.maximum(lam, 1.0 - lam)
        lam = lam[:, None, None, None]
        imgs = lam * imgs + (1.0 - lam) * imgs[indices]
        lam = lam[:, 0, 0, 0]
        y_aug = indices
    imgs = np.clip(imgs, 0.0, 1.0)
    return imgs.reshape(batch_size, -1), y_aug, lam

TTA_MODES = ['identity', 'hflip', 'vflip', 'bright+', 'bright-']


def apply_tta(raw_batch: np.ndarray, mode: str) -> np.ndarray:
    imgs = raw_batch.reshape(len(raw_batch), 28, 28, 3)
    if mode == 'hflip':
        aug = imgs[:, :, ::-1, :]
    elif mode == 'vflip':
        aug = imgs[:, ::-1, :, :]
    elif mode == 'bright+':
        aug = np.clip(imgs + 0.05, 0.0, 1.0)
    elif mode == 'bright-':
        aug = np.clip(imgs - 0.05, 0.0, 1.0)
    else:
        aug = imgs
    return aug.reshape(len(raw_batch), -1)


In [14]:

class AdvancedMLP:
    def __init__(self, input_dim: int, num_classes: int, hidden_dim: int, lr: float, l2: float, dropout: float, seed: int = 0, clip_grad: float = 5.0):
        self.hidden_dim = hidden_dim
        self.lr = lr
        self.base_lr = lr
        self.l2 = l2
        self.dropout = dropout
        self.clip_grad = clip_grad
        rng = np.random.default_rng(seed)
        scale1 = np.sqrt(2.0 / input_dim)
        scale2 = np.sqrt(2.0 / hidden_dim)
        self.w1 = rng.normal(0.0, scale1, size=(input_dim, hidden_dim))
        self.b1 = np.zeros(hidden_dim, dtype=np.float32)
        self.w2 = rng.normal(0.0, scale2, size=(hidden_dim, num_classes))
        self.b2 = np.zeros(num_classes, dtype=np.float32)

    def _softmax(self, logits: np.ndarray) -> np.ndarray:
        logits = logits - logits.max(axis=1, keepdims=True)
        exp = np.exp(logits)
        return exp / exp.sum(axis=1, keepdims=True)

    def _clip_gradients(self, grad_w1: np.ndarray, grad_b1: np.ndarray, grad_w2: np.ndarray, grad_b2: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        norm = np.sqrt(np.sum(grad_w1 * grad_w1) + np.sum(grad_b1 * grad_b1) + np.sum(grad_w2 * grad_w2) + np.sum(grad_b2 * grad_b2))
        if norm > self.clip_grad:
            scale = self.clip_grad / norm
            grad_w1 = grad_w1 * scale
            grad_b1 = grad_b1 * scale
            grad_w2 = grad_w2 * scale
            grad_b2 = grad_b2 * scale
        return grad_w1, grad_b1, grad_w2, grad_b2

    def _forward(self, x: np.ndarray, train: bool = False) -> Tuple[np.ndarray, Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        if train and self.dropout > 0.0:
            mask = (np.random.random(size=x.shape) >= self.dropout).astype(np.float32)
            x = x * mask
        z1 = x @ self.w1 + self.b1
        h1 = np.maximum(0.0, z1)
        if train and self.dropout > 0.0:
            mask_h = (np.random.random(size=h1.shape) >= self.dropout).astype(np.float32)
            h1 = h1 * mask_h
        logits = h1 @ self.w2 + self.b2
        return logits, (x, z1, h1)

    def _loss_and_grads(self, x: np.ndarray, y: np.ndarray, class_weights: np.ndarray, lam: Optional[np.ndarray] = None) -> Tuple[float, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        logits, cache = self._forward(x, train=True)
        probs = self._softmax(logits)
        batch = x.shape[0]
        sample_w = class_weights[y]
        if lam is not None:
            loss = -np.sum(lam * sample_w * np.log(probs[np.arange(batch), y] + 1e-8)) / (lam * sample_w).sum()
            loss -= np.sum((1.0 - lam) * sample_w[y] * np.log(probs[np.arange(batch), y] + 1e-8)) / ((1.0 - lam) * sample_w[y]).sum()
        else:
            loss = -np.sum(sample_w * np.log(probs[np.arange(batch), y] + 1e-8)) / sample_w.sum()
        loss += 0.5 * self.l2 * (np.sum(self.w1 * self.w1) + np.sum(self.w2 * self.w2))
        grad_logits = probs
        grad_logits[np.arange(batch), y] -= 1.0
        if lam is not None:
            grad_logits *= (lam[:, None] * sample_w[:, None] / (lam * sample_w).sum())
        else:
            grad_logits *= (sample_w[:, None] / sample_w.sum())
        x_cache, z1, h1 = cache
        grad_w2 = h1.T @ grad_logits + self.l2 * self.w2
        grad_b2 = grad_logits.sum(axis=0)
        grad_hidden = grad_logits @ self.w2.T
        grad_hidden[z1 <= 0.0] = 0.0
        grad_w1 = x_cache.T @ grad_hidden + self.l2 * self.w1
        grad_b1 = grad_hidden.sum(axis=0)
        grad_w1, grad_b1, grad_w2, grad_b2 = self._clip_gradients(grad_w1, grad_b1, grad_w2, grad_b2)
        return loss, grad_w1, grad_b1, grad_w2, grad_b2

    def fit(self, x: np.ndarray, y: np.ndarray, *, class_weights: np.ndarray, epochs: int, batch_size: int, val_data: Optional[Tuple[np.ndarray, np.ndarray]] = None, raw_data: Optional[np.ndarray] = None, augment: bool = False, patience: int = 40, warmup_epochs: int = 10) -> Dict[str, List[float]]:
        history = {'train_acc': [], 'val_acc': []}
        best_state = self.get_state()
        best_val = -np.inf
        wait = 0
        num_samples = x.shape[0]
        for epoch in range(epochs):
            if epoch < warmup_epochs:
                lr_scale = (epoch + 1) / warmup_epochs
            else:
                lr_scale = 0.5 * (1 + math.cos(math.pi * (epoch - warmup_epochs) / (epochs - warmup_epochs)))
            self.lr = self.base_lr * lr_scale
            indices = np.random.permutation(num_samples)
            for start in range(0, num_samples, batch_size):
                end = start + batch_size
                idx = indices[start:end]
                xb = x[idx]
                lam = None
                if augment and raw_data is not None:
                    raw_aug, y_aug_idx, lam = augment_batch_advanced(raw_data[idx])
                    xb = standardize_raw(raw_aug)
                    yb = y[y_aug_idx] if lam is not None else y[idx]
                else:
                    yb = y[idx]
                _, grad_w1, grad_b1, grad_w2, grad_b2 = self._loss_and_grads(xb, yb, class_weights, lam)
                self.w1 -= self.lr * grad_w1
                self.b1 -= self.lr * grad_b1
                self.w2 -= self.lr * grad_w2
                self.b2 -= self.lr * grad_b2
            train_acc = self.accuracy(x, y)
            history['train_acc'].append(train_acc)
            if val_data is not None:
                val_acc = self.accuracy(*val_data)
                history['val_acc'].append(val_acc)
                if val_acc > best_val + 1e-4:
                    best_val = val_acc
                    best_state = self.get_state()
                    wait = 0
                else:
                    wait += 1
                    if wait >= patience:
                        break
        if val_data is not None:
            self.load_state(best_state)
        return history

    def predict_proba(self, x: np.ndarray) -> np.ndarray:
        logits, _ = self._forward(x, train=False)
        return self._softmax(logits)

    def predict(self, x: np.ndarray) -> np.ndarray:
        return np.argmax(self.predict_proba(x), axis=1)

    def accuracy(self, x: np.ndarray, y: np.ndarray) -> float:
        return float((self.predict(x) == y).mean())

    def get_state(self):
        return {
            'w1': self.w1.copy(),
            'b1': self.b1.copy(),
            'w2': self.w2.copy(),
            'b2': self.b2.copy(),
        }

    def load_state(self, state):
        self.w1 = state['w1'].copy()
        self.b1 = state['b1'].copy()
        self.w2 = state['w2'].copy()
        self.b2 = state['b2'].copy()


In [15]:

train_idx, val_idx = train_val_split_indices(len(x_std), val_ratio=0.2, seed=SEED)
x_train_std = x_std[train_idx]
x_val_std = x_std[val_idx]
x_train_raw = x_norm[train_idx]
x_val_raw = x_norm[val_idx]
y_train = labels[train_idx]
y_val = labels[val_idx]
print(f"Train: {x_train_std.shape}, Val: {x_val_std.shape}")


Train: (864, 2352), Val: (216, 2352)


In [16]:

search_space = [
    {'hidden_dim': 512, 'lr': 0.05, 'l2': 6e-5, 'dropout': 0.15, 'epochs': 400, 'batch_size': 112, 'patience': 50},
    {'hidden_dim': 640, 'lr': 0.055, 'l2': 8e-5, 'dropout': 0.18, 'epochs': 420, 'batch_size': 96, 'patience': 55},
    {'hidden_dim': 768, 'lr': 0.048, 'l2': 7e-5, 'dropout': 0.12, 'epochs': 450, 'batch_size': 128, 'patience': 60},
    {'hidden_dim': 896, 'lr': 0.052, 'l2': 9e-5, 'dropout': 0.2, 'epochs': 380, 'batch_size': 104, 'patience': 50},
    {'hidden_dim': 640, 'lr': 0.045, 'l2': 5e-5, 'dropout': 0.1, 'epochs': 480, 'batch_size': 120, 'patience': 65},
    {'hidden_dim': 512, 'lr': 0.042, 'l2': 4e-5, 'dropout': 0.08, 'epochs': 500, 'batch_size': 128, 'patience': 70},
    {'hidden_dim': 768, 'lr': 0.05, 'l2': 6e-5, 'dropout': 0.14, 'epochs': 440, 'batch_size': 100, 'patience': 58},
    {'hidden_dim': 640, 'lr': 0.047, 'l2': 7e-5, 'dropout': 0.16, 'epochs': 410, 'batch_size': 108, 'patience': 52},
    {'hidden_dim': 896, 'lr': 0.049, 'l2': 8e-5, 'dropout': 0.11, 'epochs': 460, 'batch_size': 116, 'patience': 62},
    {'hidden_dim': 512, 'lr': 0.044, 'l2': 5.5e-5, 'dropout': 0.13, 'epochs': 430, 'batch_size': 124, 'patience': 56},
]

results = []
for i, params in enumerate(search_space, 1):
    print(f"Config {i}/{len(search_space)}: hidden_dim={params['hidden_dim']}, lr={params['lr']:.3f}, l2={params['l2']:.2e}, dropout={params['dropout']:.2f}")
    model = AdvancedMLP(
        input_dim=x_train_std.shape[1],
        num_classes=num_classes,
        hidden_dim=params['hidden_dim'],
        lr=params['lr'],
        l2=params['l2'],
        dropout=params['dropout'],
        seed=SEED + i
    )
    history = model.fit(
        x_train_std,
        y_train,
        class_weights=class_weights,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        val_data=(x_val_std, y_val),
        raw_data=x_train_raw,
        augment=True,
        patience=params['patience'],
        warmup_epochs=15
    )
    train_acc = history['train_acc'][-1]
    val_acc = history['val_acc'][-1]
    results.append({**params, 'train_acc': train_acc, 'val_acc': val_acc, 'model': model})
    print(f"  train acc={train_acc:.4f}, val acc={val_acc:.4f}\n")

sorted_results = sorted(results, key=lambda r: r['val_acc'], reverse=True)
print('Top 5 configs by val acc:')
for i, r in enumerate(sorted_results[:5], 1):
    print(f"  {i}. val_acc={r['val_acc']:.4f}, hidden_dim={r['hidden_dim']}, lr={r['lr']:.3f}")


Config 1/10: hidden_dim=512, lr=0.050, l2=6.00e-05, dropout=0.15


  loss -= np.sum((1.0 - lam) * sample_w[y] * np.log(probs[np.arange(batch), y] + 1e-8)) / ((1.0 - lam) * sample_w[y]).sum()


  train acc=0.0938, val acc=0.1481

Config 2/10: hidden_dim=640, lr=0.055, l2=8.00e-05, dropout=0.18
  train acc=0.1400, val acc=0.1435

Config 3/10: hidden_dim=768, lr=0.048, l2=7.00e-05, dropout=0.12
  train acc=0.1088, val acc=0.1019

Config 4/10: hidden_dim=896, lr=0.052, l2=9.00e-05, dropout=0.20
  train acc=0.0984, val acc=0.1204

Config 5/10: hidden_dim=640, lr=0.045, l2=5.00e-05, dropout=0.10
  train acc=0.1065, val acc=0.1019

Config 6/10: hidden_dim=512, lr=0.042, l2=4.00e-05, dropout=0.08
  train acc=0.1354, val acc=0.1435

Config 7/10: hidden_dim=768, lr=0.050, l2=6.00e-05, dropout=0.14
  train acc=0.0926, val acc=0.1019

Config 8/10: hidden_dim=640, lr=0.047, l2=7.00e-05, dropout=0.16
  train acc=0.1157, val acc=0.1065

Config 9/10: hidden_dim=896, lr=0.049, l2=8.00e-05, dropout=0.11
  train acc=0.0995, val acc=0.1019

Config 10/10: hidden_dim=512, lr=0.044, l2=5.50e-05, dropout=0.13
  train acc=0.1134, val acc=0.1111

Top 5 configs by val acc:
  1. val_acc=0.1481, hidden_

In [17]:

TOP_K = 5
sorted_configs = sorted(results, key=lambda r: r['val_acc'], reverse=True)[:TOP_K]
models: List[AdvancedMLP] = []
weights = []
for idx, cfg in enumerate(sorted_configs):
    cfg_full = cfg.copy()
    cfg_full['epochs'] = cfg['epochs'] + 60
    cfg_full['patience'] = cfg['patience'] + 20
    val_acc = cfg['val_acc']
    weight = np.exp(val_acc * 10)
    weights.append(weight)
    print(f"Retraining top config {idx + 1}: hidden_dim={cfg_full['hidden_dim']}, lr={cfg_full['lr']:.3f}, weight={weight:.2f}")
    model = AdvancedMLP(
        input_dim=x_std.shape[1],
        num_classes=num_classes,
        hidden_dim=cfg_full['hidden_dim'],
        lr=cfg_full['lr'],
        l2=cfg_full['l2'],
        dropout=cfg_full['dropout'],
        seed=SEED * 7 + idx
    )
    train_idx_full, val_idx_full = train_val_split_indices(len(x_std), val_ratio=0.1, seed=SEED * 8 + idx)
    model.fit(
        x_std[train_idx_full],
        labels[train_idx_full],
        class_weights=class_weights,
        epochs=cfg_full['epochs'],
        batch_size=cfg_full['batch_size'],
        val_data=(x_std[val_idx_full], labels[val_idx_full]),
        raw_data=x_norm[train_idx_full],
        augment=True,
        patience=cfg_full['patience'],
        warmup_epochs=20
    )
    models.append(model)
weights = np.array(weights)
weights = weights / weights.sum()
print(f"Total models trained: {len(models)}")
print(f"Ensemble weights: {weights}")


Retraining top config 1: hidden_dim=512, lr=0.050, weight=4.40


  loss -= np.sum((1.0 - lam) * sample_w[y] * np.log(probs[np.arange(batch), y] + 1e-8)) / ((1.0 - lam) * sample_w[y]).sum()


Retraining top config 2: hidden_dim=640, lr=0.055, weight=4.20
Retraining top config 3: hidden_dim=512, lr=0.042, weight=4.20
Retraining top config 4: hidden_dim=896, lr=0.052, weight=3.33
Retraining top config 5: hidden_dim=512, lr=0.044, weight=3.04
Total models trained: 5
Ensemble weights: [0.2295 0.2191 0.2191 0.1738 0.1585]


In [18]:

probs_accum = np.zeros((len(x_test_std), num_classes), dtype=np.float32)
for model_idx, model in enumerate(models):
    for mode in TTA_MODES:
        raw_aug = apply_tta(x_test_norm, mode)
        std_aug = standardize_raw(raw_aug)
        probs = model.predict_proba(std_aug)
        probs_accum += weights[model_idx] * probs

probs_accum /= len(TTA_MODES)
test_preds = np.argmax(probs_accum, axis=1)

ids = [str(i) for i in range(1, len(test_preds) + 1)]
submission_path = Path('submission_advanced_ensemble.csv')
with submission_path.open('w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['ID', 'Label'])
    for idx, label in zip(ids, test_preds):
        writer.writerow([idx, int(label)])

print(f'Submission saved to {submission_path.resolve()}')


Submission saved to C:\Users\yudim\Downloads\IFT3395_Competition2\submission_advanced_ensemble.csv
