In [8]:
import torch
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torchvision.transforms import Compose, ToTensor, Normalize, Resize
from torchvision.models import resnet50, vit_b_16, vit_b_32
# from tqdm.autonotebook import tqdm
from copy import deepcopy
# from cka import CKACalculator
import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = (7, 7)
import sys
import os

# get the root project folder (one level up from the notebook)
project_root = os.path.abspath("..")
sys.path.append(project_root)

from CNN import CNN
from VisionTransormer import VisionTransformer

# config 
import yaml

config = yaml.safe_load(open('../config.yaml'))
cnn_config = config['cnn']
vt_config = config['vision_transformer']



In [9]:
device = torch.device(
    'cuda' if torch.cuda.is_available() else
    'mps' if torch.backends.mps.is_available() else
    'cpu'
)
device


device(type='cpu')

In [22]:
transform = Compose([
    Resize(224),
    ToTensor(),
    Normalize(mean=(0.485, 0.456, 0.406),
              std=(0.229, 0.224, 0.225))
])

train_dataset = CIFAR10(root='../data', train=True, download=True, transform=transform)
test_dataset  = CIFAR10(root='../data', train=False, download=True, transform=transform)

batch_size = config['batch_size']

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

len(train_dataset), len(test_dataset)
from torch.utils.data import Subset
import numpy as np

train_percent = 0.002
test_percent = 0.002

# compute subset sizes
train_size = int(len(train_dataset) * train_percent)
test_size = int(len(test_dataset) * test_percent)

# create random subset indices
train_indices = np.random.choice(len(train_dataset), train_size, replace=False)
test_indices = np.random.choice(len(test_dataset), test_size, replace=False)

# create subsets
training_data_small = Subset(train_dataset, train_indices)
test_data_small = Subset(test_dataset, test_indices)

# new loaders (keeping same batch size)
train_loader = DataLoader(training_data_small, batch_size=config['batch_size'], shuffle=True)
test_loader = DataLoader(test_data_small, batch_size=config['batch_size'], shuffle=False)


Files already downloaded and verified
Files already downloaded and verified


In [14]:
def count_trainable_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# quick sanity check later:
model = CNN(cnn_config, device)
count_trainable_params(model)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/markgardner/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100.0%


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  



ValueError: Attempted to use an uninitialized parameter in <method 'numel' of 'torch._C.TensorBase' objects>. This error happens when you are using a `LazyModule` or explicitly manipulating `torch.nn.parameter.UninitializedParameter` objects. When using LazyModules Call `forward` with a dummy batch to initialize the parameters before calling torch functions

In [12]:
def make_cnn(num_layers_to_drop: int):
    cfg = dict(cnn_config)           # shallow copy
    cfg['num_layers_to_drop'] = num_layers_to_drop
    return CNN(cfg, device)

def make_vit(num_encoder_layers_to_drop: int):
    cfg = dict(vt_config)
    cfg['num_encoder_layers_to_drop'] = num_encoder_layers_to_drop
    return VisionTransformer(cfg, device)


In [None]:
import time 

def run_training(model, num_epochs, train_loader, test_loader):
    # history = {
    #     "epoch": [],
    #     "test_accuracy": [],
    #     "test_loss": [],
    #     "epoch_time": [],
    # }
    history = {
        "epoch": [],
        "train_loss": [],
        "test_loss": [],
        "test_accuracy": [],
        "epoch_time": [],
    }


    # for epoch in range(num_epochs):
    #     start_time = time.time()

    #     # train for one epoch
    #     model.train(train_loader)
    #     # evaluate on test
    #     test_ret = model.test(test_loader)
    #     # append results into history
    #     history.append(test_ret)

    # return history


    for epoch in range(num_epochs):
        for batch_idx, batch in enumerate(training_loader):
            loss = model.training_step(batch)
            if batch_idx % config['print_batch_frequency'] == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")
        test_loss = 0
        correct = 0
        for batch_idx, batch in enumerate(test_loader):
            test_result = model.test_step(batch)
            test_loss += test_result[0]
            correct += test_result[1]
        test_loss /= len(test_loader)
        correct /= len(test_data)
        print(f"Test Error for Epoch {epoch}: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    save_weights = config['save_weights_path']
    if save_weights != '':
        torch.save(model.state_dict(), save_weights)
        print("Weights saved to {save_weights}")

#########################
#########################
#########################
    for epoch in range(num_epochs):
        start_time = time.time()
        
        # TRAIN LOOP
        total_train_loss = 0
        for batch in train_loader:
            batch_loss = model.training_step(batch)
            total_train_loss += batch_loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # TEST LOOP
        total_test_loss = 0
        total_correct = 0
        for batch in test_loader:
            test_loss, correct = model.test_step(batch)
            total_test_loss += test_loss
            total_correct += correct

        avg_test_loss = total_test_loss / len(test_loader)
        accuracy = total_correct / len(test_dataset)

        # STORE HISTORY
        history["epoch"].append(epoch)
        history["train_loss"].append(avg_train_loss)
        history["test_loss"].append(avg_test_loss)
        history["test_accuracy"].append(accuracy)
        history["epoch_time"].append(time.time() - start_time)


IndentationError: expected an indented block after 'for' statement on line 9 (2449710923.py, line 14)

In [None]:
import time 

def run_training(model, num_epochs, train_loader, test_loader):

    history = {
        "epoch": [],
        "train_loss": [],
        "test_loss": [],
        "test_accuracy": [],
        "epoch_time": [],
    }

    for epoch in range(num_epochs):
        for batch_idx, batch in enumerate(train_loader):
            loss = model.training_step(batch)
            if batch_idx % config['print_batch_frequency'] == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")
        test_loss = 0
        correct = 0
        for batch_idx, batch in enumerate(test_loader):
            test_result = model.test_step(batch)
            test_loss += test_result[0]
            correct += test_result[1]
        test_loss /= len(test_loader)
        correct /= len(test_data)
        print(f"Test Error for Epoch {epoch}: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    save_weights = config['save_weights_path']
    if save_weights != '':
        torch.save(model.state_dict(), save_weights)
        print("Weights saved to {save_weights}")


In [None]:
# import time 

def run_training(model, num_epochs, training_loader, test_loader, test_data, print_freq):

    history = {
        "epoch": [],
        "train_loss": [],
        "test_loss": [],
        "test_accuracy": [],
        "epoch_time": [],
    }
    # start_time = time.time()

    for epoch in range(num_epochs):

        # ---- TRAIN EPOCH ----
        train_loss_running = 0
        for batch_idx, batch in enumerate(training_loader):
            loss = model.training_step(batch)
            train_loss_running += loss.item()

            if batch_idx % print_freq == 0:
                print(f"[Epoch {epoch}] Batch {batch_idx} Loss: {loss.item():.4f}")

        avg_train_loss = train_loss_running / len(training_loader)

        # ---- TEST EPOCH ----
        test_loss = 0
        correct = 0
        for batch_idx, batch in enumerate(test_loader):
            test_l, corr = model.test_step(batch)
            test_loss += test_l
            correct += corr

        test_loss /= len(test_loader)
        accuracy = correct / len(test_data)

        print(f"Epoch {epoch}: Test Acc: {accuracy*100:.2f}%, Test Loss: {test_loss:.4f}")

        # ---- SAVE TO HISTORY ----
        history["train_loss_per_epoch"].append(avg_train_loss)
        history["test_loss_per_epoch"].append(test_loss)
        history["test_accuracy_per_epoch"].append(accuracy)
    end = time.time()
    total_time = end - start
    return history


In [18]:
import time

def run_training(model, num_epochs, train_loader, test_loader, print_freq):
    history = {
        "epoch": [],
        "test_accuracy": [],
        "test_loss": [],
        "total_time": None,
    }

    start_time = time.time()

    for epoch in range(num_epochs):
        # ---- train loop (same as main.py) ----
        for batch_idx, batch in enumerate(train_loader):
            loss = model.training_step(batch)
            if batch_idx % print_freq == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")

        # ---- test loop (same as main.py, but we compute total manually) ----
        test_loss = 0.0
        correct = 0.0
        total = 0

        for batch_idx, batch in enumerate(test_loader):
            batch_loss, batch_correct = model.test_step(batch)
            test_loss += batch_loss
            correct += batch_correct
            # batch[0] are the images
            total += batch[0].size(0)

        test_loss /= len(test_loader)
        accuracy = correct / total

        print(
            f"Test Error for Epoch {epoch}: "
            f"Accuracy: {100 * accuracy:>0.1f}%, Avg loss: {test_loss:>8f}\n"
        )

        # ---- log history ----
        history["epoch"].append(epoch)
        history["test_accuracy"].append(float(accuracy))
        history["test_loss"].append(float(test_loss))

    history["total_time"] = time.time() - start_time
    return history


In [None]:
import time 

num_epochs = config['num_epochs']
print_freq = config['print_batch_frequency']

# cnn_drop_list = [0, 1, 2]
# vit_drop_list = [0, 4, 8]
cnn_drop_list = [0, 2]
vit_drop_list = [0, 8]

results = []

for num_drop in cnn_drop_list:
    model = make_cnn(num_drop)

    # initialize LazyLinear
    dummy = next(iter(train_loader))[0].to(device)
    with torch.no_grad():
        _ = model(dummy)

    n_params = count_trainable_params(model)
    start_time = time.time()
    history = run_training(model, num_epochs, train_loader, test_loader, print_freq)
    total_time = time.time() - start_time

    results.append({
        "model": "cnn",
        "layers_dropped": num_drop,
        "params": n_params,
        "history": history,
        "run_time": total_time
    })

for num_drop in vit_drop_list:
    model = make_vit(num_drop)

    dummy = next(iter(train_loader))[0].to(device)
    with torch.no_grad():
        _ = model(dummy)

    n_params = count_trainable_params(model)
    start_time = time.time()

    history = run_training(model, num_epochs, train_loader, test_loader, print_freq)
    total_time = time.time() - start_time

    results.append({
        "model": "vit",
        "layers_dropped": num_drop,
        "params": n_params,
        "history": history,
        "run_time": total_time
    })


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  



Epoch 0, Batch 0, Loss: 2.6519
Test Error for Epoch 0: Accuracy: 20.0%, Avg loss: 3.809594

Epoch 1, Batch 0, Loss: 3.7837
Test Error for Epoch 1: Accuracy: 35.0%, Avg loss: 2.559745

Epoch 2, Batch 0, Loss: 2.2152
Test Error for Epoch 2: Accuracy: 25.0%, Avg loss: 2.035981

Epoch 3, Batch 0, Loss: 1.2523
Test Error for Epoch 3: Accuracy: 40.0%, Avg loss: 2.103549

Epoch 4, Batch 0, Loss: 1.4942
Test Error for Epoch 4: Accuracy: 40.0%, Avg loss: 1.836549

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (

In [27]:
results

[{'model': 'cnn',
  'layers_dropped': 0,
  'params': 5130,
  'history': {'epoch': [0, 1, 2, 3, 4],
   'test_accuracy': [0.3, 0.2, 0.5, 0.35, 0.4],
   'test_loss': [2.746870517730713,
    3.0941123962402344,
    2.067286252975464,
    2.0940909385681152,
    1.7339544296264648],
   'total_time': 20.036497116088867}},
 {'model': 'cnn',
  'layers_dropped': 2,
  'params': 1290,
  'history': {'epoch': [0, 1, 2, 3, 4],
   'test_accuracy': [0.25, 0.2, 0.25, 0.2, 0.15],
   'test_loss': [2.2805933952331543,
    2.2670156955718994,
    2.253814220428467,
    2.229841709136963,
    2.199183940887451],
   'total_time': 14.324692964553833}},
 {'model': 'vit',
  'layers_dropped': 0,
  'params': 7690,
  'history': {'epoch': [0, 1, 2, 3, 4],
   'test_accuracy': [0.1, 0.1, 0.1, 0.1, 0.1],
   'test_loss': [2.3359689712524414,
    2.3329410552978516,
    2.3299031257629395,
    2.3268628120422363,
    2.3238558769226074],
   'total_time': 135.3274688720703}},
 {'model': 'vit',
  'layers_dropped': 8,
  'p

In [17]:
num_epochs = config['num_epochs']

cnn_drop_list = [0, 1, 2]   # ðŸ‘ˆ you can adjust
vit_drop_list = [0, 4, 8]   # ðŸ‘ˆ not symmetric; thatâ€™s okay

results = []

for num_drop in cnn_drop_list:
    model = make_cnn(num_drop)
    # ðŸ‘‡ initialize LazyLinear by running one fake batch through the model
    dummy = next(iter(train_loader))[0].to(device)  # just the images
    with torch.no_grad():
        _ = model(dummy)

    n_params = count_trainable_params(model)
    # n_params = count_trainable_params(model)
    history = run_training(model, num_epochs, train_loader, test_loader)
    results.append({
        "model": "cnn",
        "layers_dropped": num_drop,
        "params": n_params,
        "history": history,
    })

for num_drop in vit_drop_list:
    model = make_vit(num_drop)
        # ðŸ‘‡ initialize LazyLinear by running one fake batch through the model
    dummy = next(iter(train_loader))[0].to(device)  # just the images
    with torch.no_grad():
        _ = model(dummy)

    n_params = count_trainable_params(model)
    n_params = count_trainable_params(model)
    history = run_training(model, num_epochs, train_loader, test_loader)
    results.append({
        "model": "vit",
        "layers_dropped": num_drop,
        "params": n_params,
        "history": history,
    })


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

TypeError: run_training() missing 2 required positional arguments: 'test_data' and 'print_freq'