In [None]:
import sys, os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import argparse
import wandb
import logging

# Shared constants for the notebook.
root_dir = r'../..'
SEED = 42
TOL_FP = 1e-12  # presumably a floating-point tolerance -- not used in the cells shown here

# Global matplotlib defaults: larger fonts, thicker lines, bigger markers, grid on.
mpl.rcParams.update({
    'font.size': 18,
    'legend.fontsize': 18,
    'lines.linewidth': 3,
    'lines.markersize': 9,
    'axes.grid': True,
})

# Matplotlib marker symbols and single-letter color codes.
markers = ['o', '^', 's', 'p', 'd']
colors = list('bgrcmy')

# Dataset

In [2]:
from dataloader.UTKFace import UTKFaceDataset, get_utkface_train_test_loader

# Build the UTKFace dataset and its train/test loaders.
root_dir = r'../../dataset/UTKFace'
dataset = UTKFaceDataset(root_dir)
train_loader, test_loader = get_utkface_train_test_loader(
    dataset,
    batch_size=32,
    test_size=0.2,
    seed=42,
)

# Sanity check: look at the first training batch only
# (prints nothing when the loader yields no batches).
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    batch_images, batch_ages = first_batch
    print("Image batch shape:", batch_images.shape)  # recorded run: [32, 3, 224, 224]
    print("Age batch shape:", batch_ages.shape)      # recorded run: [32]

  from .autonotebook import tqdm as notebook_tqdm


Image batch shape: torch.Size([32, 3, 224, 224])
Age batch shape: torch.Size([32])


# Models

In [3]:
import torch
import torch.nn as nn
from torchvision import models
import torchvision.transforms as T
from transformers import CLIPModel, CLIPProcessor
from torch.utils.data import DataLoader

def get_intrinsic_dim(features, ratio=0.95, return_sval=False):
    """Count the leading singular values needed to reach a share of the spectrum.

    Args:
        features: Tensor whose singular values are computed with
            ``torch.linalg.svdvals`` after detaching it and moving it to CPU.
        ratio: Fraction of the total singular-value sum that the leading
            singular values must reach.
        return_sval: When True, also return the singular values.

    Returns:
        A 0-dim integer tensor holding the smallest ``k`` such that the first
        ``k`` singular values sum to at least ``ratio`` times the total, or the
        tuple ``(k, singular_values)`` when ``return_sval`` is True.

    Raises:
        IndexError: If no cumulative sum reaches the threshold (for example
            when ``ratio`` exceeds 1) or there are no singular values at all.
    """
    singular_values = torch.linalg.svdvals(features.detach().cpu())
    running_total = singular_values.cumsum(dim=0)
    threshold = ratio * running_total[-1]
    # nonzero() lists qualifying positions in ascending order, so row 0 is the first hit.
    first_hit = torch.nonzero(running_total >= threshold)[0, 0]
    intrinsic_dim = first_hit + 1
    return (intrinsic_dim, singular_values) if return_sval else intrinsic_dim

# NOTE(review): not referenced in the cells visible here; presumably the `ratio`
# passed to get_intrinsic_dim later on -- confirm (that function defaults to 0.95).
cutoff = 0.9

## ResNet

In [5]:
# Extract pre-classifier ResNet features for every sample in `dataset`
# (which yields (image, age) pairs) and cache them under ./precomputed.
batch_size = 128
# shuffle=False keeps the feature rows in dataset order.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

model_tag = 'resnet18'
# NOTE(review): every entry is constructed eagerly, so all five networks are
# built (and their weights fetched if not already cached) although only
# `model_tag` is used below. Left as a dict of ready models in case other
# cells index `weak_dict` directly.
weak_dict = {
    'resnet18': models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1),
    'resnet34': models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1),
    'resnet50': models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2),
    'resnet101': models.resnet101(weights=models.ResNet101_Weights.IMAGENET1K_V2),
    'resnet152': models.resnet152(weights=models.ResNet152_Weights.IMAGENET1K_V2),
}
resnet = weak_dict[model_tag]

# 1) Drop the final fully connected layer -> feature extractor made of all
#    remaining child modules, in order.
feature_extractor = nn.Sequential(*list(resnet.children())[:-1])
feature_extractor.eval()

# 2) Put the model on the GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feature_extractor.to(device)

# 3) Freeze the weights (the loop below additionally runs under no_grad)
for param in feature_extractor.parameters():
    param.requires_grad = False

all_features = []

# 4) Inference loop
with torch.no_grad():
    for images, _ in data_loader:
        images = images.to(device)  # shape: (B, 3, H, W)

        # Output is (B, C, 1, 1); C depends on the architecture
        # (512 for resnet18 in the recorded run, not 2048).
        feats = feature_extractor(images)

        # Flatten the trailing dims -> (B, C)
        feats = feats.flatten(start_dim=1)

        # Move to CPU (if on GPU) and collect
        all_features.append(feats.cpu())

# 5) Concatenate -> shape (N, C)
resnet_features = torch.cat(all_features, dim=0)
print("ResNet feature matrix shape =", resnet_features.shape)
# recorded run: [23708, 512] for resnet18

# torch.save does not create missing directories, so make sure the target
# exists; otherwise the whole inference pass is lost on a fresh checkout.
os.makedirs("./precomputed", exist_ok=True)
torch.save(resnet_features, f"./precomputed/utkface_{model_tag}.pt")

ResNet feature matrix shape = torch.Size([23708, 512])


## CLIP ViT

In [6]:
# 1) Load the CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Only the vision encoder is needed for image features
vision_encoder = clip_model.vision_model

# 2) Freeze
vision_encoder.eval()
for param in vision_encoder.parameters():
    param.requires_grad = False

# 3) Put the model on the GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vision_encoder.to(device)

# The DataLoader's default collation produces tensor batches, not PIL images.
# shuffle=False keeps the feature rows in dataset order.
data_loader = DataLoader(dataset, batch_size=32, shuffle=False)

all_features_clip = []
with torch.no_grad():
    for image_batch, _ in data_loader:
        # 4) Run the CLIP processor to obtain pixel_values.
        #    do_rescale=False skips the processor's pixel rescaling.
        #    NOTE(review): this assumes the dataset already returns pixel
        #    values in [0, 1] -- confirm against UTKFaceDataset's transforms.
        inputs = processor(images=image_batch, return_tensors="pt", do_rescale=False)
        pixel_values = inputs["pixel_values"].to(device)  # (B, 3, 224, 224)

        # 5) Forward through CLIP's vision encoder
        outputs = vision_encoder(pixel_values=pixel_values)

        # 6) Use the pooled output, shape (B, hidden_size);
        #    hidden_size was 768 in the recorded run.
        feats = outputs.pooler_output

        all_features_clip.append(feats.cpu())

clip_features = torch.cat(all_features_clip, dim=0)  # (N, 768)
print("CLIP feature matrix shape =", clip_features.shape)
# recorded run: [23708, 768]

# torch.save does not create missing directories, so make sure the target
# exists before writing.
os.makedirs("./precomputed", exist_ok=True)
torch.save(clip_features, "./precomputed/utkface_clipb32.pt")

CLIP feature matrix shape = torch.Size([23708, 768])


# Labels

In [37]:
# Collect the label of every sample in dataset order (shuffle=False) and cache
# them under ./precomputed.
# Iterating the loader also loads each image batch, which is discarded here.
data_loader = DataLoader(dataset, batch_size=32, shuffle=False)
label_batches = []
for _, labels in data_loader:
    label_batches.append(labels)
# shape: (N,) -- the feature matrices recorded above have N = 23708 rows
all_labels = torch.cat(label_batches, dim=0)

# torch.save does not create missing directories, so make sure the target
# exists before writing.
os.makedirs("./precomputed", exist_ok=True)
torch.save(all_labels, "./precomputed/utkface_labels.pt")