# IFT3395 Competition 2 - Milestone 2 v4.3

## 目标: Validation Accuracy > 0.53, 训练时间 < 15分钟

## 混合策略 (Hybrid Strategy)
结合 `milestone2_v4.2` 的 **Ensemble/Mixup** 框架与 `Milestone2.ipynb` 的 **SimpleCNN** 核心。

- **Architecture**: SimpleCNN (2-layer, 32->64 filters) - *From Milestone2.ipynb*
- **Normalization**: `(0.5, 0.5, 0.5)` - *From Milestone2.ipynb*
- **Training**: 60 Epochs (SimpleCNN is fast), OneCycleLR
- **Ensemble**: CNN + ET + RF + HGB + SVC (Diverse 5-model stack)
- **Augmentation**: Mixup + Lightweight TTA

In [1]:
# Cell 1: Imports
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import random, os, warnings, time
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

warnings.filterwarnings('ignore')

SEED = 42
def seed_all(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

seed_all(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
t0 = time.time()

Device: cpu


In [2]:
# Cell 2: Load Data
DATA_DIR = Path('data')
with open(DATA_DIR / 'train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open(DATA_DIR / 'test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)

X_train = train_data['images']
y_train = train_data['labels'].flatten().astype(np.int64)
X_test = test_data['images']

if X_train.max() <= 1.0:
    X_train = (X_train * 255).astype(np.uint8)
    X_test = (X_test * 255).astype(np.uint8)
else:
    X_train = X_train.astype(np.uint8)
    X_test = X_test.astype(np.uint8)

n_train, n_test = len(X_train), len(X_test)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Classes: {np.bincount(y_train)}")

Train: (1080, 28, 28, 3), Test: (400, 28, 28, 3)
Classes: [486 128 206 194  66]


In [3]:
# Cell 3: Feature Engineering (For Sklearn Models)
def extract_features(images):
    """Extract features: flatten pixels + color statistics"""
    features = []
    for img in images:
        # Flatten RGB pixels
        flat = img.flatten().astype(np.float32) / 255.0
        
        # Color statistics per channel
        stats = []
        for c in range(3):
            ch = img[:, :, c].astype(np.float32)
            stats.extend([ch.mean(), ch.std(), ch.min(), ch.max(),
                          np.percentile(ch, 25), np.percentile(ch, 75)])
        
        # Grayscale stats
        gray = 0.299*img[:,:,0] + 0.587*img[:,:,1] + 0.114*img[:,:,2]
        stats.extend([gray.mean(), gray.std(), gray.min(), gray.max()])
        
        # Combine
        features.append(np.concatenate([flat, np.array(stats, dtype=np.float32)]))
    return np.array(features, dtype=np.float32)

print("Extracting features...")
X_train_feat = extract_features(X_train)
X_test_feat = extract_features(X_test)
print(f"Feature shape: {X_train_feat.shape}")

Extracting features...
Feature shape: (1080, 2374)


In [4]:
# Cell 4: Dataset & SimpleCNN Model
class ImgDataset(Dataset):
    def __init__(self, images, labels=None, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform
        
    def __len__(self): return len(self.images)
    
    def __getitem__(self, i):
        img = Image.fromarray(self.images[i].astype(np.uint8))
        if self.transform: img = self.transform(img)
        if self.labels is not None:
            return img, torch.tensor(self.labels[i], dtype=torch.long)
        return img

# TRANSFORMS: Optimized to match Milestone2.ipynb stats (0.5, 0.5, 0.5)
train_tf = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(0.2, 0.2, 0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])
val_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

# SimpleCNN from Milestone2.ipynb
# Architecture: 32 -> 64 -> Dense
class SimpleCNN(nn.Module):
    def __init__(self, num_classes=5):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d(2, 2) 
        
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        
        self.flatten_dim = 64 * 7 * 7 
        
        self.fc1 = nn.Linear(self.flatten_dim, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)
        
    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = x.reshape(-1, self.flatten_dim)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

print("Model defined.")

Model defined.


In [5]:
# Cell 5: Helpers with Mixup and TTA
def get_sampler(y):
    w = 1.0 / np.maximum(np.bincount(y), 1)
    return WeightedRandomSampler(torch.from_numpy(w[y]).double(), len(y))

def mixup_data(x, y, alpha=0.4, device='cuda'):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size(0)
    index = torch.randperm(batch_size).to(device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def train_ep(model, loader, crit, opt, dev, use_mixup=True):
    model.train()
    for x, y in loader:
        x, y = x.to(dev), y.to(dev)
        opt.zero_grad()
        
        if use_mixup:
            x, y_a, y_b, lam = mixup_data(x, y, 0.4, dev)
            out = model(x)
            loss = lam * crit(out, y_a) + (1 - lam) * crit(out, y_b)
        else:
            loss = crit(model(x), y)
            
        loss.backward()
        opt.step()

@torch.no_grad()
def val_ep(model, loader, dev):
    model.eval()
    c, t, probs = 0, 0, []
    for x, y in loader:
        x, y = x.to(dev), y.to(dev)
        out = model(x)
        c += (out.argmax(1) == y).sum().item()
        t += y.size(0)
        probs.append(F.softmax(out, 1).cpu().numpy())
    return c / t, np.concatenate(probs)

@torch.no_grad()
def predict_tta(model, images, dev, tta=True):
    model.eval()
    # 1. Original
    ds = ImgDataset(images, None, val_tf)
    loader = DataLoader(ds, 64, shuffle=False, num_workers=0)
    probs = []
    for x in loader:
        probs.append(F.softmax(model(x.to(dev)), 1).cpu().numpy())
    p1 = np.concatenate(probs)
    
    if not tta: return p1
    
    # 2. Horizontal Flip
    # Note: val_tf normalizes, so we need to flip BEFORE normalization or use flip tensor
    # Safest is to flip numpy, then dataset process it
    images_flip = np.array([np.fliplr(img) for img in images])
    ds_flip = ImgDataset(images_flip, None, val_tf)
    loader_flip = DataLoader(ds_flip, 64, shuffle=False, num_workers=0)
    probs_flip = []
    for x in loader_flip:
        probs_flip.append(F.softmax(model(x.to(dev)), 1).cpu().numpy())
    p2 = np.concatenate(probs_flip)
    
    return 0.5 * (p1 + p2)

print("Helpers defined.")

Helpers defined.


In [6]:
# Cell 6: Training
N_FOLDS, EPOCHS, BS = 5, 60, 64  # Increased Epochs for simpler model
skf = StratifiedKFold(N_FOLDS, shuffle=True, random_state=SEED)

# Storage
cnn_oof = np.zeros((n_train, 5))
cnn_test = np.zeros((n_test, 5))
et_oof = np.zeros((n_train, 5))
et_test = np.zeros((n_test, 5))
rf_oof = np.zeros((n_train, 5))
rf_test = np.zeros((n_test, 5))
hgb_oof = np.zeros((n_train, 5))
hgb_test = np.zeros((n_test, 5))
svc_oof = np.zeros((n_train, 5))
svc_test = np.zeros((n_test, 5))

print(f"Training: {N_FOLDS} folds, {EPOCHS} epochs, Mixup=True, Model=SimpleCNN")
print("="*50)

for fold, (tr_i, val_i) in enumerate(skf.split(X_train, y_train)):
    t1 = time.time()
    print(f"\nFold {fold+1}/{N_FOLDS}")
    
    # CNN (SimpleCNN)
    tr_ds = ImgDataset(X_train[tr_i], y_train[tr_i], train_tf)
    val_ds = ImgDataset(X_train[val_i], y_train[val_i], val_tf)
    tr_ld = DataLoader(tr_ds, BS, sampler=get_sampler(y_train[tr_i]), num_workers=0)
    val_ld = DataLoader(val_ds, BS, shuffle=False, num_workers=0)
    
    model = SimpleCNN().to(device)
    crit = nn.CrossEntropyLoss(label_smoothing=0.1)
    opt = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
    sch = optim.lr_scheduler.OneCycleLR(opt, max_lr=0.003, steps_per_epoch=len(tr_ld), epochs=EPOCHS)
    
    best_acc, best_w = 0, None
    for ep in range(EPOCHS):
        train_ep(model, tr_ld, crit, opt, device, use_mixup=True)
        acc, _ = val_ep(model, val_ld, device)
        sch.step()
        if acc > best_acc:
            best_acc = acc
            best_w = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    
    model.load_state_dict(best_w)
    _, probs = val_ep(model, val_ld, device)
    cnn_oof[val_i] = probs
    cnn_test += predict_tta(model, X_test, device, tta=True) / N_FOLDS
    print(f"  CNN: {best_acc:.4f}")
    
    # Sklearn Models with cached features
    X_tr_f, y_tr_f = X_train_feat[tr_i], y_train[tr_i]
    X_val_f = X_train_feat[val_i]
    
    # ET
    et = ExtraTreesClassifier(200, max_depth=25, class_weight='balanced', random_state=SEED+fold, n_jobs=-1)
    et.fit(X_tr_f, y_tr_f)
    et_oof[val_i] = et.predict_proba(X_val_f)
    et_test += et.predict_proba(X_test_feat) / N_FOLDS
    print(f"  ET:  {et.score(X_val_f, y_train[val_i]):.4f}")
    
    # RF
    rf = RandomForestClassifier(200, max_depth=25, class_weight='balanced', random_state=SEED+fold, n_jobs=-1)
    rf.fit(X_tr_f, y_tr_f)
    rf_oof[val_i] = rf.predict_proba(X_val_f)
    rf_test += rf.predict_proba(X_test_feat) / N_FOLDS
    print(f"  RF:  {rf.score(X_val_f, y_train[val_i]):.4f}")
    
    # HGB
    hgb = HistGradientBoostingClassifier(max_iter=100, max_depth=10, random_state=SEED+fold)
    hgb.fit(X_tr_f, y_tr_f)
    hgb_oof[val_i] = hgb.predict_proba(X_val_f)
    hgb_test += hgb.predict_proba(X_test_feat) / N_FOLDS
    print(f"  HGB: {hgb.score(X_val_f, y_train[val_i]):.4f}")
    
    # SVC
    svc = make_pipeline(StandardScaler(), SVC(probability=True, class_weight='balanced', kernel='rbf', C=10))
    svc.fit(X_tr_f, y_tr_f)
    svc_oof[val_i] = svc.predict_proba(X_val_f)
    svc_test += svc.predict_proba(X_test_feat) / N_FOLDS
    print(f"  SVC: {svc.score(X_val_f, y_train[val_i]):.4f}")

    print(f"  Time: {(time.time()-t1)/60:.1f}min")

print(f"\nTotal: {(time.time()-t0)/60:.1f}min")

Training: 5 folds, 60 epochs, Mixup=True, Model=SimpleCNN

Fold 1/5
  CNN: 0.5000
  ET:  0.5324
  RF:  0.4815
  HGB: 0.5231
  SVC: 0.5278
  Time: 2.4min

Fold 2/5
  CNN: 0.4583
  ET:  0.5139
  RF:  0.5000
  HGB: 0.4815
  SVC: 0.4815
  Time: 2.7min

Fold 3/5
  CNN: 0.5093
  ET:  0.5185
  RF:  0.5231
  HGB: 0.5231
  SVC: 0.4954
  Time: 2.8min

Fold 4/5
  CNN: 0.5139
  ET:  0.4907
  RF:  0.5000
  HGB: 0.4954
  SVC: 0.5185
  Time: 2.8min

Fold 5/5
  CNN: 0.4583
  ET:  0.4722
  RF:  0.5093
  HGB: 0.4491
  SVC: 0.4537
  Time: 2.9min

Total: 13.8min


In [7]:
# Cell 7: Ensemble Weight Search
print("Individual CV Accuracy:")
print(f"  CNN: {accuracy_score(y_train, cnn_oof.argmax(1)):.4f}")
print(f"  ET:  {accuracy_score(y_train, et_oof.argmax(1)):.4f}")
print(f"  RF:  {accuracy_score(y_train, rf_oof.argmax(1)):.4f}")
print(f"  HGB: {accuracy_score(y_train, hgb_oof.argmax(1)):.4f}")
print(f"  SVC: {accuracy_score(y_train, svc_oof.argmax(1)):.4f}")

# Random Search for 5 weights
best_acc, best_w = 0, None
for _ in range(2000):
    w = np.random.dirichlet(np.ones(5))
    oof = w[0]*cnn_oof + w[1]*et_oof + w[2]*rf_oof + w[3]*hgb_oof + w[4]*svc_oof
    acc = accuracy_score(y_train, oof.argmax(1))
    if acc > best_acc:
        best_acc, best_w = acc, w

print(f"\nBest weights: CNN={best_w[0]:.2f}, ET={best_w[1]:.2f}, RF={best_w[2]:.2f}, HGB={best_w[3]:.2f}, SVC={best_w[4]:.2f}")
print(f"Ensemble CV: {best_acc:.4f}")

Individual CV Accuracy:
  CNN: 0.4880
  ET:  0.5056
  RF:  0.5028
  HGB: 0.4944
  SVC: 0.5139

Best weights: CNN=0.43, ET=0.25, RF=0.20, HGB=0.04, SVC=0.08
Ensemble CV: 0.5380


In [8]:
# Cell 8: Submission
w = best_w
final = w[0]*cnn_test + w[1]*et_test + w[2]*rf_test + w[3]*hgb_test + w[4]*svc_test
preds = final.argmax(1)

submission = pd.DataFrame({'ID': np.arange(1, n_test + 1), 'Label': preds})
submission.to_csv('submission_milestone2_v4.3.csv', index=False)

print("Saved: submission_milestone2_v4.3.csv")
print(f"\nDistribution:\n{submission['Label'].value_counts().sort_index()}")
print(f"\nTotal time: {(time.time()-t0)/60:.1f}min")

Saved: submission_milestone2_v4.3.csv

Distribution:
Label
0    227
1     51
2     49
3     72
4      1
Name: count, dtype: int64

Total time: 13.8min
