# IFT3395 Competition 2 - Milestone 2 v2

## 目标 Goal
提高Kaggle分数至 > 0.525。

## 策略 Strategy
1. **自定义CNN (From Scratch)** - 轻量级ResNet风格网络
2. **处理类别不平衡** - WeightedRandomSampler + class_weight
3. **集成学习** - CNN + ExtraTrees + RandomForest
4. **数据增强** - 翻转、旋转、颜色抖动
5. **测试时增强 (TTA)** - 提高泛化能力

In [1]:
# ==================== Cell 1: Imports ====================
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import random
import os
import warnings
from PIL import Image

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms

# Sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')

# 设定随机种子
SEED = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_everything(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [2]:
# ==================== Cell 2: Load Data ====================
DATA_DIR = Path('data')

def load_data():
    with open(DATA_DIR / 'train_data.pkl', 'rb') as f:
        train = pickle.load(f)
    with open(DATA_DIR / 'test_data.pkl', 'rb') as f:
        test = pickle.load(f)
    return train, test

train_data, test_data = load_data()

X_train_raw = train_data['images']
y_train = train_data['labels'].flatten().astype(np.int64)
X_test_raw = test_data['images']

# 确保是 uint8 (0-255)
if X_train_raw.max() <= 1.0:
    X_train_raw = (X_train_raw * 255).astype(np.uint8)
    X_test_raw = (X_test_raw * 255).astype(np.uint8)
else:
    X_train_raw = X_train_raw.astype(np.uint8)
    X_test_raw = X_test_raw.astype(np.uint8)

print("Train Shape:", X_train_raw.shape)
print("Test Shape:", X_test_raw.shape)
print("Class Counts:", np.bincount(y_train))

Train Shape: (1080, 28, 28, 3)
Test Shape: (400, 28, 28, 3)
Class Counts: [486 128 206 194  66]


In [3]:
# ==================== Cell 3: Dataset & Transforms ====================
class IFT3395Dataset(Dataset):
    """Custom Dataset for labeled and unlabeled data."""
    def __init__(self, images, labels=None, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        img_arr = self.images[idx].astype(np.uint8)
        img = Image.fromarray(img_arr)
        
        if self.transform:
            img = self.transform(img)
            
        if self.labels is not None:
            return img, torch.tensor(self.labels[idx], dtype=torch.long)
        return img  # No label for test set

# 训练时增强
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 验证/测试时不增强
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# TTA: 水平翻转
tta_flip_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=1.0),  # Always flip
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

print("Datasets and transforms defined.")

Datasets and transforms defined.


In [4]:
# ==================== Cell 4: CNN Model ====================
def conv_bn(in_ch, out_ch, pool=False):
    layers = [
        nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.ReLU(inplace=True)
    ]
    if pool:
        layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)

class ResNet9(nn.Module):
    """Lightweight ResNet-9 for small images (28x28)."""
    def __init__(self, num_classes=5):
        super().__init__()
        self.prep = conv_bn(3, 64)
        
        self.layer1 = conv_bn(64, 128, pool=True)
        self.res1 = nn.Sequential(conv_bn(128, 128), conv_bn(128, 128))
        
        self.layer2 = conv_bn(128, 256, pool=True)
        
        self.layer3 = conv_bn(256, 512, pool=True)
        self.res2 = nn.Sequential(conv_bn(512, 512), conv_bn(512, 512))
        
        self.classifier = nn.Sequential(
            nn.AdaptiveMaxPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )
        
    def forward(self, x):
        x = self.prep(x)
        x = self.layer1(x)
        x = x + self.res1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = x + self.res2(x)
        x = self.classifier(x)
        return x

print("ResNet9 model defined.")

ResNet9 model defined.


In [5]:
# ==================== Cell 5: Helper Functions ====================
def get_sampler(labels):
    """Create WeightedRandomSampler for class imbalance."""
    class_counts = np.bincount(labels)
    class_counts = np.maximum(class_counts, 1)  # Avoid division by zero
    class_weights = 1.0 / class_counts
    sample_weights = class_weights[labels]
    return WeightedRandomSampler(
        torch.from_numpy(sample_weights).double(), 
        len(sample_weights)
    )

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Train for one epoch."""
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * imgs.size(0)
        _, preds = outputs.max(1)
        total += labels.size(0)
        correct += preds.eq(labels).sum().item()
        
    return total_loss / total, correct / total

@torch.no_grad()
def validate(model, loader, criterion, device):
    """Validate with labels."""
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    all_probs = []
    
    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        
        total_loss += loss.item() * imgs.size(0)
        _, preds = outputs.max(1)
        total += labels.size(0)
        correct += preds.eq(labels).sum().item()
        all_probs.append(F.softmax(outputs, dim=1).cpu().numpy())
        
    return total_loss / total, correct / total, np.concatenate(all_probs)

@torch.no_grad()
def predict(model, loader, device):
    """Predict on unlabeled test data."""
    model.eval()
    all_probs = []
    
    for imgs in loader:
        imgs = imgs.to(device)
        outputs = model(imgs)
        all_probs.append(F.softmax(outputs, dim=1).cpu().numpy())
        
    return np.concatenate(all_probs)

print("Helper functions defined.")

Helper functions defined.


In [6]:
# ==================== Cell 6: Training with K-Fold CV ====================
N_FOLDS = 5
EPOCHS = 50
BATCH_SIZE = 32
LR = 0.001

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Storage for predictions
cnn_oof_probs = np.zeros((len(X_train_raw), 5))
cnn_test_probs = np.zeros((len(X_test_raw), 5))

# Sklearn models storage
et_oof_probs = np.zeros((len(X_train_raw), 5))
et_test_probs = np.zeros((len(X_test_raw), 5))
rf_oof_probs = np.zeros((len(X_train_raw), 5))
rf_test_probs = np.zeros((len(X_test_raw), 5))

# Flatten data for sklearn
X_train_flat = X_train_raw.reshape(len(X_train_raw), -1).astype(np.float32) / 255.0
X_test_flat = X_test_raw.reshape(len(X_test_raw), -1).astype(np.float32) / 255.0

print(f"Starting {N_FOLDS}-Fold Cross Validation...")
print(f"Epochs: {EPOCHS}, Batch Size: {BATCH_SIZE}, LR: {LR}")
print("="*60)

Starting 5-Fold Cross Validation...
Epochs: 50, Batch Size: 32, LR: 0.001


In [7]:
# ==================== Cell 7: Main Training Loop ====================
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_raw, y_train)):
    print(f"\n{'='*20} Fold {fold+1}/{N_FOLDS} {'='*20}")
    
    # Split data
    X_tr, y_tr = X_train_raw[train_idx], y_train[train_idx]
    X_val, y_val = X_train_raw[val_idx], y_train[val_idx]
    
    # ========== CNN Training ==========
    sampler = get_sampler(y_tr)
    train_ds = IFT3395Dataset(X_tr, y_tr, transform=train_transform)
    val_ds = IFT3395Dataset(X_val, y_val, transform=val_transform)
    
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    
    # Model, loss, optimizer
    model = ResNet9(num_classes=5).to(device)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    
    best_acc = 0.0
    best_weights = None
    
    for epoch in range(EPOCHS):
        t_loss, t_acc = train_one_epoch(model, train_loader, criterion, optimizer, device)
        v_loss, v_acc, _ = validate(model, val_loader, criterion, device)
        scheduler.step()
        
        if v_acc > best_acc:
            best_acc = v_acc
            best_weights = model.state_dict().copy()
            
        if (epoch + 1) % 10 == 0:
            print(f"  Epoch {epoch+1:02d} | Train Acc: {t_acc:.3f} | Val Acc: {v_acc:.3f}")
    
    # Load best model
    model.load_state_dict(best_weights)
    _, _, val_probs = validate(model, val_loader, criterion, device)
    cnn_oof_probs[val_idx] = val_probs
    print(f"  [CNN] Best Val Acc: {best_acc:.4f}")
    
    # TTA for test set: Original + Flip
    test_ds_orig = IFT3395Dataset(X_test_raw, labels=None, transform=val_transform)
    test_ds_flip = IFT3395Dataset(X_test_raw, labels=None, transform=tta_flip_transform)
    
    test_loader_orig = DataLoader(test_ds_orig, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    test_loader_flip = DataLoader(test_ds_flip, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    
    probs_orig = predict(model, test_loader_orig, device)
    probs_flip = predict(model, test_loader_flip, device)
    
    # Average TTA predictions
    cnn_test_probs += (probs_orig + probs_flip) / 2 / N_FOLDS
    
    # ========== ExtraTrees ==========
    et = ExtraTreesClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        random_state=SEED + fold,
        class_weight='balanced',
        n_jobs=-1
    )
    et.fit(X_train_flat[train_idx], y_train[train_idx])
    et_val_acc = et.score(X_train_flat[val_idx], y_train[val_idx])
    et_oof_probs[val_idx] = et.predict_proba(X_train_flat[val_idx])
    et_test_probs += et.predict_proba(X_test_flat) / N_FOLDS
    print(f"  [ET]  Val Acc: {et_val_acc:.4f}")
    
    # ========== RandomForest ==========
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        random_state=SEED + fold,
        class_weight='balanced',
        n_jobs=-1
    )
    rf.fit(X_train_flat[train_idx], y_train[train_idx])
    rf_val_acc = rf.score(X_train_flat[val_idx], y_train[val_idx])
    rf_oof_probs[val_idx] = rf.predict_proba(X_train_flat[val_idx])
    rf_test_probs += rf.predict_proba(X_test_flat) / N_FOLDS
    print(f"  [RF]  Val Acc: {rf_val_acc:.4f}")
    
    fold_scores.append({'cnn': best_acc, 'et': et_val_acc, 'rf': rf_val_acc})

print("\n" + "="*60)
print("Training Complete!")


  Epoch 10 | Train Acc: 0.341 | Val Acc: 0.301
  Epoch 20 | Train Acc: 0.428 | Val Acc: 0.370
  Epoch 30 | Train Acc: 0.505 | Val Acc: 0.509
  Epoch 40 | Train Acc: 0.602 | Val Acc: 0.417
  Epoch 50 | Train Acc: 0.624 | Val Acc: 0.486
  [CNN] Best Val Acc: 0.5093
  [ET]  Val Acc: 0.4954
  [RF]  Val Acc: 0.5139

  Epoch 10 | Train Acc: 0.366 | Val Acc: 0.333
  Epoch 20 | Train Acc: 0.394 | Val Acc: 0.389
  Epoch 30 | Train Acc: 0.458 | Val Acc: 0.435
  Epoch 40 | Train Acc: 0.567 | Val Acc: 0.421
  Epoch 50 | Train Acc: 0.598 | Val Acc: 0.440
  [CNN] Best Val Acc: 0.4630
  [ET]  Val Acc: 0.5370
  [RF]  Val Acc: 0.5185

  Epoch 10 | Train Acc: 0.372 | Val Acc: 0.403
  Epoch 20 | Train Acc: 0.442 | Val Acc: 0.389
  Epoch 30 | Train Acc: 0.502 | Val Acc: 0.481
  Epoch 40 | Train Acc: 0.596 | Val Acc: 0.477
  Epoch 50 | Train Acc: 0.645 | Val Acc: 0.468
  [CNN] Best Val Acc: 0.5000
  [ET]  Val Acc: 0.5139
  [RF]  Val Acc: 0.5139

  Epoch 10 | Train Acc: 0.358 | Val Acc: 0.403


KeyboardInterrupt: 

In [None]:
# ==================== Cell 8: Ensemble & Evaluation ====================
# Calculate overall CV accuracy for each model
cnn_cv_acc = accuracy_score(y_train, np.argmax(cnn_oof_probs, axis=1))
et_cv_acc = accuracy_score(y_train, np.argmax(et_oof_probs, axis=1))
rf_cv_acc = accuracy_score(y_train, np.argmax(rf_oof_probs, axis=1))

print("=" * 50)
print("Cross-Validation Results:")
print(f"  CNN OOF Accuracy:        {cnn_cv_acc:.4f}")
print(f"  ExtraTrees OOF Accuracy: {et_cv_acc:.4f}")
print(f"  RandomForest OOF Accuracy: {rf_cv_acc:.4f}")
print("=" * 50)

# Try different ensemble weights
best_ensemble_acc = 0
best_weights = None

for w_cnn in np.arange(0.2, 0.8, 0.1):
    for w_et in np.arange(0.1, 0.6, 0.1):
        w_rf = 1.0 - w_cnn - w_et
        if w_rf < 0:
            continue
        
        oof_probs = w_cnn * cnn_oof_probs + w_et * et_oof_probs + w_rf * rf_oof_probs
        acc = accuracy_score(y_train, np.argmax(oof_probs, axis=1))
        
        if acc > best_ensemble_acc:
            best_ensemble_acc = acc
            best_weights = (w_cnn, w_et, w_rf)

print(f"\nBest Ensemble Weights: CNN={best_weights[0]:.1f}, ET={best_weights[1]:.1f}, RF={best_weights[2]:.1f}")
print(f"Best Ensemble OOF Accuracy: {best_ensemble_acc:.4f}")

In [None]:
# ==================== Cell 9: Generate Submission ====================
# Use best weights for final prediction
w_cnn, w_et, w_rf = best_weights
final_test_probs = w_cnn * cnn_test_probs + w_et * et_test_probs + w_rf * rf_test_probs
predictions = np.argmax(final_test_probs, axis=1)

# Create submission
submission = pd.DataFrame({
    'ImageId': np.arange(len(predictions)),
    'Label': predictions
})

submission.to_csv('submission_milestone2_v2.csv', index=False)
print("\nSubmission saved to 'submission_milestone2_v2.csv'")
print(f"\nPrediction Distribution:")
print(submission['Label'].value_counts().sort_index())

# Sanity check
print(f"\nTotal Predictions: {len(predictions)}")
print(f"Expected: {len(X_test_raw)}")