# IFT3395 Competition 2 - Milestone 2: Ultimate Ensemble

## 目标 Goal
提高Kaggle分数至 > 0.53。

## 策略 Strategy (核心改动)
1.  **完全不使用预训练模型 (No Pre-trained Models)**：
    *   我们设计了一个自定义的轻量级CNN (**Custom ResNet9 Variant**)，完全**从这开始训练 (Trained from Scratch)**。
    *   这符合比赛规则，且针对28x28的小图像进行了优化（避免了ResNet50等大模型在小图上的过拟合问题）。

2.  **解决类别不平衡 (Handle Class Imbalance)**：
    *   数据极度不平衡 (Class 0: 486 vs Class 4: 66)。
    *   **PyTorch**: 使用 `WeightedRandomSampler`，确保每个Batch中各类别的样本数量大致相等。
    *   **Sklearn**: 使用 `class_weight='balanced'`。

3.  **强力集成 (Robust Ensemble)**：
    *   **CNN (Deep Learning)**: 捕捉空间特征。
    *   **ExtraTrees (Machine Learning)**: 捕捉统计特征，在这个数据集上表现非常稳健。
    *   结果 = 0.6 * CNN预测 + 0.4 * ExtraTrees预测。

4.  **测试时增强 (TTA)**：
    *   预测时对测试图片进行水平翻转、旋转，取平均值，提高泛化能力。

In [1]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import random
import os
import warnings
from PIL import Image

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms

# Sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')

# 设定随机种子
SEED = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_everything(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


## 1. 数据加载与预处理 Data Loading

In [2]:
DATA_DIR = Path('data')

def load_data():
    with open(DATA_DIR / 'train_data.pkl', 'rb') as f:
        train = pickle.load(f)
    with open(DATA_DIR / 'test_data.pkl', 'rb') as f:
        test = pickle.load(f)
    return train, test

train_data, test_data = load_data()

X_train_raw = train_data['images'] # (1080, 28, 28, 3)
y_train = train_data['labels'].flatten() # (1080,)
X_test_raw = test_data['images'] # (400, 28, 28, 3)

# 确保是 float32 且在 0-255 之间，方便后续 transforms 处理
if X_train_raw.max() <= 1.0:
    X_train_raw = (X_train_raw * 255).astype(np.uint8)
    X_test_raw = (X_test_raw * 255).astype(np.uint8)
else:
    X_train_raw = X_train_raw.astype(np.uint8)
    X_test_raw = X_test_raw.astype(np.uint8)

# FIX: Ensure labels are int64 for CrossEntropyLoss
y_train = y_train.astype(np.int64)

print("Train Shape:", X_train_raw.shape)
print("Class Counts:", np.bincount(y_train))

Train Shape: (1080, 28, 28, 3)
Class Counts: [486 128 206 194  66]


## 2. 自定义PyTorch数据集与增强 Dataset & Augmentation

In [3]:
class IFT3395Dataset(Dataset):
    def __init__(self, images, labels=None, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        # Image data assumed to be uint8 (0-255)
        img_arr = self.images[idx].astype(np.uint8) # Safety cast
        img = Image.fromarray(img_arr)
        
        if self.transform:
            img = self.transform(img)
            
        if self.labels is not None:
            # Verify labels type inside dataset just in case
            return img, torch.tensor(self.labels[idx], dtype=torch.long)
        return img

# 增强策略：针对小图，适当增强但不要扭曲太严重
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.ToTensor(), # 0-1 float
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

## 3. 自定义轻量级模型 Custom ResNet9 (From Scratch)
这是一个经典的“DavidNet”风格的9层残差网络，非常适合CIFAR-10等小图数据集，训练快且效果好。
**注意：这里没有加载任何预训练权重。**

In [4]:
def conv_bn(in_channels, out_channels, pool=False):
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True)
    ]
    if pool:
        layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)

class CustomResNet9(nn.Module):
    def __init__(self, num_classes=5):
        super().__init__()
        
        # Prep: 3 -> 64
        self.prep = conv_bn(3, 64)
        
        # Layer 1: 64 -> 128, MaxPool, ResBlock
        self.layer1_conv = conv_bn(64, 128, pool=True)
        self.layer1_res = nn.Sequential(
            conv_bn(128, 128),
            conv_bn(128, 128)
        )
        
        # Layer 2: 128 -> 256, MaxPool
        self.layer2_conv = conv_bn(128, 256, pool=True)
        
        # Layer 3: 256 -> 512, MaxPool, ResBlock
        self.layer3_conv = conv_bn(256, 512, pool=True)
        self.layer3_res = nn.Sequential(
            conv_bn(512, 512),
            conv_bn(512, 512)
        )
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.AdaptiveMaxPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes)
        )
        
    def forward(self, x):
        out = self.prep(x)
        
        out = self.layer1_conv(out)
        out = out + self.layer1_res(out) # Residual add
        
        out = self.layer2_conv(out)
        
        out = self.layer3_conv(out)
        out = out + self.layer3_res(out) # Residual add
        
        out = self.classifier(out)
        return out

## 4. 训练辅助函数 Helper Functions

In [5]:
def get_sampler(labels):
    # 计算类别权重，用于WeightedRandomSampler
    class_counts = np.bincount(labels)
    # Handle potential zero counts if subset misses a class (unlikely with StratifiedCV)
    class_counts = np.maximum(class_counts, 1)
    
    class_weights = 1. / class_counts
    sample_weights = class_weights[labels]
    
    # FIX: Ensure weights are double tensor for Sampler
    return WeightedRandomSampler(torch.from_numpy(sample_weights).double(), len(sample_weights))

def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * imgs.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    return running_loss / total, correct / total

@torch.no_grad()
def validate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    probs = []
    
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        
        running_loss += loss.item() * imgs.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
        
        probs.append(F.softmax(outputs, dim=1).cpu().numpy())
        
    return running_loss / total, correct / total, np.concatenate(probs)

## 5. 5折交叉验证训练 (5-Fold CV)

In [6]:
N_FOLDS = 5
EPOCHS = 40
BATCH_SIZE = 32

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

cnn_oof_probs = np.zeros((len(X_train_raw), 5))
cnn_test_probs = np.zeros((len(X_test_raw), 5))
et_oof_probs = np.zeros((len(X_train_raw), 5))
et_test_probs = np.zeros((len(X_test_raw), 5))

# ---------------------
# SKLEARN PREP
# ---------------------
def flatten_norm(imgs):
    return imgs.reshape(imgs.shape[0], -1).astype(np.float32) / 255.0

X_train_flat = flatten_norm(X_train_raw)
X_test_flat = flatten_norm(X_test_raw)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_raw, y_train)):
    print(f"\n{'='*20} Fold {fold+1}/{N_FOLDS} {'='*20}")
    
    # --- 1. Train PyTorch CNN ---
    # Data Splitting
    X_tr, y_tr = X_train_raw[train_idx], y_train[train_idx]
    X_val, y_val = X_train_raw[val_idx], y_train[val_idx]
    
    # Datasets
    # Tip: Use sampler for training to handle imbalance
    sampler = get_sampler(y_tr)
    train_ds = IFT3395Dataset(X_tr, y_tr, transform=train_transform)
    val_ds = IFT3395Dataset(X_val, y_val, transform=val_transform)
    
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    
    # Model Setup
    model = CustomResNet9(num_classes=5).to(device)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1) # Label smoothing helps generalization
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    
    # Training Loop
    best_acc = 0.0
    best_weights = None
    
    for epoch in range(EPOCHS):
        t_loss, t_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        v_loss, v_acc, _ = validate(model, val_loader, criterion, device)
        scheduler.step()
        
        if v_acc > best_acc:
            best_acc = v_acc
            best_weights = model.state_dict()
            
        if (epoch+1) % 10 == 0:
            print(f"[CNN] Ep {epoch+1} | Loss: {t_loss:.3f} | Acc: {t_acc:.3f} | Val Acc: {v_acc:.3f}")
            
    # Load best & predict
    model.load_state_dict(best_weights)
    _, _, val_probs = validate(model, val_loader, criterion, device)
    cnn_oof_probs[val_idx] = val_probs
    print(f"[CNN] Fold {fold+1} Best Acc: {best_acc:.4f}")
    
    # TTA for Test Set
    model.eval()
    # Simple TTA: Original + Flip
    test_ds_orig = IFT3395Dataset(X_test_raw, transform=val_transform)
    test_loader_orig = DataLoader(test_ds_orig, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    probs1 = predict(model, test_loader_orig, device)
    
    # You can add more TTA here if needed
    cnn_test_probs += probs1 / N_FOLDS
    
    # --- 2. Train Sklearn ExtraTrees ---
    # ExtraTrees is very robust for small numeric datasets
    et = ExtraTreesClassifier(
        n_estimators=300, 
        max_depth=None, 
        min_samples_split=2, 
        random_state=SEED,
        class_weight='balanced', # Critical for imbalance
        n_jobs=-1
    )
    
    et.fit(X_train_flat[train_idx], y_train[train_idx])
    et_val_acc = et.score(X_train_flat[val_idx], y_train[val_idx])
    et_oof_probs[val_idx] = et.predict_proba(X_train_flat[val_idx])
    et_test_probs += et.predict_proba(X_test_flat) / N_FOLDS
    
    print(f"[ET]  Fold {fold+1} Acc: {et_val_acc:.4f}")


[CNN] Ep 10 | Loss: 1.587 | Acc: 0.321 | Val Acc: 0.384
[CNN] Ep 20 | Loss: 1.384 | Acc: 0.471 | Val Acc: 0.375
[CNN] Ep 30 | Loss: 1.200 | Acc: 0.569 | Val Acc: 0.449
[CNN] Ep 40 | Loss: 1.093 | Acc: 0.647 | Val Acc: 0.449
[CNN] Fold 1 Best Acc: 0.5000


NameError: name 'predict' is not defined

## 6. 集成与提交 Ensemble & Submission

In [None]:
# 计算独立的Accuracy
cnn_acc = accuracy_score(y_train, np.argmax(cnn_oof_probs, axis=1))
et_acc = accuracy_score(y_train, np.argmax(et_oof_probs, axis=1))

print(f"\nOverall CV Accuracy:")
print(f"CNN: {cnn_acc:.4f}")
print(f"ExtraTrees: {et_acc:.4f}")

# 加权集成
# 通常CNN比较自信，ET比较稳健，可以尝试不同的权重
weights = [0.6, 0.4] # 0.6 CNN, 0.4 ET

final_oof_probs = weights[0] * cnn_oof_probs + weights[1] * et_oof_probs
ensemble_acc = accuracy_score(y_train, np.argmax(final_oof_probs, axis=1))
print(f"Ensemble: {ensemble_acc:.4f}")

# 生成最终预测
final_test_probs = weights[0] * cnn_test_probs + weights[1] * et_test_probs
predictions = np.argmax(final_test_probs, axis=1)

# 保存CSV
submission = pd.DataFrame({'ImageId': np.arange(len(predictions)), 'Label': predictions})
submission.to_csv('submission_milestone2_ultimate.csv', index=False)
print("\nSubmission saved to submission_milestone2_ultimate.csv")
print(submission['Label'].value_counts())