# 03_deep_features.ipynb

**Objective:**  
1. Remount Drive & set paths  
2. Choose backbone, pooling, and PCA mode via parameters  
3. Load images from the `augmented` (training) & `yale_test_augmented` (test) folders, extract CNN feature maps → pooled feature vectors  
4. Optionally apply PCA  
5. Save `.npz` files into a `features/` folder  
6. Verify shapes & sample values  


In [None]:
import torch
import joblib
import numpy as np
from torchvision import models, transforms, datasets
from sklearn.decomposition import PCA

In [None]:
# Cell Tag: parameters
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os
from pathlib import Path

# Paths: everything lives under the project folder on the mounted Drive.
ROOT = Path("/content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project")
DATA_DIR = ROOT / "data"
FEATURE_DIR = ROOT / "features"
# No parents=True on purpose: if ROOT is wrong or the Drive is not mounted this
# raises immediately instead of silently creating a new folder tree.
FEATURE_DIR.mkdir(exist_ok=True)

# Hyperparameters
ARCH       = "resnet50"  # name of the torchvision.models constructor to use
POOL       = "gap"       # key into POOLS in the model-setup cell ("gap" or "gmp")
PCA_MODE   = "95var"     # "95var" or "128"; any other value skips PCA
BATCH_SIZE = 32          # images per forward pass
# torch is imported in the imports cell, so use it directly rather than
# going through __import__.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Arch={ARCH}, Pool={POOL}, PCA={PCA_MODE}, Device={DEVICE}")
print("Saving features in:", FEATURE_DIR)

Mounted at /content/drive
Arch=resnet50, Pool=gap, PCA=95var, Device=cuda
Saving features in: /content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project/features


In [None]:
# Cell Tag: model-setup
# 1. Load backbone and cut off classification head
_model_fn = getattr(models, ARCH)
try:
    # torchvision >= 0.13 API. "IMAGENET1K_V1" is the checkpoint the legacy
    # `pretrained=True` flag selects, so extracted features stay comparable
    # with earlier runs.
    backbone = _model_fn(weights="IMAGENET1K_V1")
except TypeError:
    # Older torchvision releases only understand the legacy flag.
    backbone = _model_fn(pretrained=True)
backbone = backbone.to(DEVICE)
backbone.eval()
backbone.requires_grad_(False)  # inference only, no gradients needed

# Drop the last two children. For ResNet these are avgpool + fc, leaving
# everything up to layer4. Other architectures are laid out differently and
# may need their own slicing.
_feature_layers = list(backbone.children())[:-2]
if not _feature_layers:
    # Backbones with fewer than three children would otherwise produce an empty
    # Sequential, which just passes the input images through unchanged.
    raise ValueError(
        f"Cannot build a feature extractor for ARCH={ARCH!r}: "
        "dropping the last two children leaves no layers"
    )
# The layers were already moved to DEVICE with the backbone, so no second .to().
feature_extractor = torch.nn.Sequential(*_feature_layers)
feature_extractor.eval()

# 2. Define pooling functions
def pool_gap(x):
    """Global average pooling: mean of ``x`` over dimensions 2 and 3.

    Assumes ``x`` is a feature map laid out as (batch, channels, H, W);
    the result then has shape (batch, channels).
    """
    return torch.mean(x, dim=(2, 3))


def pool_gmp(x):
    """Global max pooling: maximum of ``x`` over dimensions 2 and 3.

    Assumes ``x`` is a feature map laid out as (batch, channels, H, W);
    the result then has shape (batch, channels).
    """
    return torch.amax(x, dim=(2, 3))
# Map each POOL setting to its pooling function and pick the configured one.
# An unknown POOL value raises KeyError here.
POOLS = dict(gap=pool_gap, gmp=pool_gmp)
pool_fn = POOLS[POOL]


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 202MB/s]


In [None]:
# Cell Tag: transforms
# Image preprocessing: resize, take the central 224x224 crop, convert to a
# tensor, then normalise each channel. The mean/std values are the ones
# torchvision documents for its ImageNet-pretrained models.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

_preproc_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
preproc = transforms.Compose(_preproc_steps)


In [None]:
# Cell Tag: extract
def extract_features(split):
    """Run every image under ``DATA_DIR / split`` through the feature extractor.

    Parameters
    ----------
    split : str
        Sub-folder of ``DATA_DIR``, read with ``datasets.ImageFolder``
        (one sub-directory per class).

    Returns
    -------
    X : np.ndarray
        Pooled feature vectors, one row per image, in dataset order.
    y : np.ndarray
        Class index of each image as assigned by ``ImageFolder``.
    classes : list
        ``ImageFolder.classes`` for this split.
    """
    dataset = datasets.ImageFolder(DATA_DIR / split, transform=preproc)
    # shuffle=False keeps the rows of X/y aligned with the dataset's own order.
    batches = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    feat_chunks, label_chunks = [], []
    with torch.no_grad():
        for images, targets in batches:
            pooled = pool_fn(feature_extractor(images.to(DEVICE)))
            feat_chunks.append(pooled.cpu().numpy())
            label_chunks.append(targets.numpy())

    X = np.vstack(feat_chunks)
    y = np.concatenate(label_chunks, axis=0)
    return X, y, dataset.classes

# PCA settings per PCA_MODE: constructor kwargs plus the label used in the log
# line. Any PCA_MODE that is not listed here leaves the features unreduced.
PCA_CONFIGS = {
    "95var": {"kwargs": {"n_components": 0.95, "svd_solver": "full"},
              "label": "95% var"},
    # random_state pins the result if scikit-learn picks a randomized solver.
    "128":   {"kwargs": {"n_components": 128, "random_state": 0},
              "label": "128 comps"},
}
TRAIN_SPLIT = "augmented"
TEST_SPLIT = "yale_test_augmented"

pca_cfg = PCA_CONFIGS.get(PCA_MODE)
pca = None  # fitted on TRAIN_SPLIT, then reused as-is for TEST_SPLIT

# TRAIN_SPLIT must come first so the PCA is fitted before the test data needs it.
for split in [TRAIN_SPLIT, TEST_SPLIT]:
    # NOTE(review): ImageFolder assigns label indices per split; confirm both
    # folders contain the same class sub-directories so y is comparable.
    X, y, classes = extract_features(split)
    print(f"{split}: X shape {X.shape}, y length {len(y)}")

    # Apply PCA consistently: fit on training data only, transform everything.
    if pca_cfg is not None:
        if split == TRAIN_SPLIT:
            pca = PCA(**pca_cfg["kwargs"])
            X = pca.fit_transform(X)
            # Persist the fitted PCA so downstream notebooks can project new data.
            joblib.dump(pca, FEATURE_DIR / f"pca_{PCA_MODE}.pkl")
            pca_label = pca_cfg["label"]
            print(f"  → PCA {pca_label} → new dim {X.shape[1]}")
        else:
            # Reuse the in-memory PCA fitted above; no need to reload the pickle.
            X = pca.transform(X)
            print(f"  → Test data transformed using saved PCA → new dim {X.shape[1]}")

    # Save feature set
    out_fn = FEATURE_DIR / f"{split}_{ARCH}_{POOL}_{PCA_MODE}.npz"
    np.savez(out_fn, X=X, y=y)
    print("Saved", out_fn)



augmented: X shape (2973, 2048), y length 2973
  → PCA 95% var → new dim 248
Saved /content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project/features/augmented_resnet50_gap_95var.npz
yale_test_augmented: X shape (858, 2048), y length 858
  → Test data transformed using saved PCA → new dim 248
Saved /content/drive/My Drive/Colab Notebooks/CPSC 381-581: Machine Learning/Final Project/features/yale_test_augmented_resnet50_gap_95var.npz


In [None]:
# Cell Tag: verify
# Load one back just to check.
# The extract cell writes "augmented_*" and "yale_test_augmented_*" files, so
# re-load the training one. A "train_*" file is never produced by this notebook
# and would be a stale leftover from an earlier run.
check_fn = FEATURE_DIR / f"augmented_{ARCH}_{POOL}_{PCA_MODE}.npz"
with np.load(check_fn) as data:
    X_chk, y_chk = data["X"], data["y"]

# Every feature row must have exactly one label.
assert X_chk.shape[0] == y_chk.shape[0], "X and y row counts differ"
print("Re-loaded X:", X_chk.shape, "y:", y_chk.shape)
print("Sample vector (first row):", X_chk[0][:5])


Re-loaded X: (1067, 199) y: (1067,)
Sample vector (first row): [-5.5701747 -6.7942414  4.3036103  1.7359451  2.0084238]
