In [1]:
import os
import time
import subprocess
import torch
import torch.nn.functional as F
from torch.profiler import profile, ProfilerActivity
from torchprofile import profile_macs

try:
    from pyJoules.energy_meter import measure_energy
    from pyJoules.device.rapl_device import RaplPackageDomain, RaplDramDomain
    from pyJoules.device.nvidia_device import NvidiaGPUDomain
    PYJOULES_AVAILABLE = True
except ImportError:
    PYJOULES_AVAILABLE = False


def get_model_size_mb(model, temp_path="temp_model.pt"):
    torch.save(model.state_dict(), temp_path)
    size_mb = os.path.getsize(temp_path) / (1024 ** 2)
    os.remove(temp_path)
    return round(size_mb, 3)

def get_file_size_mb(path):
    return round(os.path.getsize(path) / (1024 ** 2), 3)

def get_macs(model, batch_size=1, img_size=None, device="cuda", dataset_name=None):
    if img_size is None:
        if dataset_name in ["cifar10", "cifar100", "rmnist"]:
            img_size = 32
        else:
            img_size = 224
    dummy_input = torch.randn(batch_size, 3, img_size, img_size).to(device)
    macs = profile_macs(model, dummy_input)
    return round(macs / 1e6, 3)

def measure_inference_latency(model, dataloader, device, num_batches=10):
    model.eval()
    latencies, peak_mem, avg_mem = [], [], []

    with torch.no_grad():
        for i, (inputs, _) in enumerate(dataloader):
            if i >= num_batches:
                break
            inputs = inputs.to(device)
            torch.cuda.reset_peak_memory_stats(device)
            torch.cuda.synchronize()
            start = time.perf_counter()
            with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
                         record_shapes=False, profile_memory=True) as prof:
                outputs = model(inputs)
            torch.cuda.synchronize()
            end = time.perf_counter()
            latency = (end - start) * 1000
            latencies.append(latency)
            peak_mem.append(torch.cuda.max_memory_allocated(device) / (1024 ** 2))
            avg_mem.append(torch.cuda.memory_allocated(device) / (1024 ** 2))
    avg_latency = round(sum(latencies) / len(latencies), 3)
    avg_peak_mem = round(sum(peak_mem) / len(peak_mem), 3)
    avg_used_mem = round(sum(avg_mem) / len(avg_mem), 3)
    return avg_latency, avg_peak_mem, avg_used_mem

def evaluate_accuracy(model, dataloader, device, topk=(1, 5)):
    model.eval()
    correct_top1, correct_top5, total = 0, 0, 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, pred_top1 = outputs.max(1)
            correct_top1 += pred_top1.eq(labels).sum().item()
            _, pred_top5 = outputs.topk(5, 1, True, True)
            correct_top5 += sum([labels[i] in pred_top5[i] for i in range(len(labels))])
            total += labels.size(0)
    top1 = 100.0 * correct_top1 / total
    top5 = 100.0 * correct_top5 / total
    return round(top1, 3), round(top5, 3)

def get_gpu_power(device_id=0):
    try:
        output = subprocess.check_output(
            f"nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -i {device_id}",
            shell=True
        ).decode("utf-8").strip()
        return float(output)
    except Exception:
        return None

def measure_avg_gpu_power(model, dataloader, device, num_batches=10, device_id=0):
    model.eval()
    powers = []
    with torch.no_grad():
        for i, (inputs, _) in enumerate(dataloader):
            if i >= num_batches:
                break
            inputs = inputs.to(device)
            _ = model(inputs)
            p = get_gpu_power(device_id)
            if p:
                powers.append(p)
    if not powers:
        return None
    avg_power = sum(powers) / len(powers)
    avg_energy_mJ = avg_power * (num_batches * 0.1) * 1000
    return round(avg_energy_mJ, 3)

def measure_energy_model(model, dataloader, device, num_batches=10):
    if not PYJOULES_AVAILABLE:
        return None
    energy_records = []
    try:
        @measure_energy(domains=[RaplPackageDomain(0), RaplDramDomain(0), NvidiaGPUDomain(0)])
        def inference_batch(inputs):
            with torch.no_grad():
                _ = model(inputs)
        model.eval()
        with torch.no_grad():
            for i, (inputs, _) in enumerate(dataloader):
                if i >= num_batches:
                    break
                inputs = inputs.to(device)
                inference_batch(inputs)
                energy_records.append(getattr(inference_batch, "energy_consumed", None))
        energy_vals = [e for e in energy_records if e is not None]
        if len(energy_vals) == 0:
            return None
        avg_energy = sum(energy_vals) / len(energy_vals)
        return round(avg_energy, 3)
    except Exception:
        return None

def profile_model(model, dataloader, dataset_name, device="cuda", num_batches=10,
                  model_path_for_size=None, precomputed_macs_m=None):
    print(f"\nProfiling {model.__class__.__name__} on {dataset_name}...")
    model = model.to(device)
    model.eval()
    if model_path_for_size:
        size_mb = get_file_size_mb(model_path_for_size)
    else:
        size_mb = get_model_size_mb(model)
    if precomputed_macs_m:
        macs_million = precomputed_macs_m
    else:
        macs_million = get_macs(model, batch_size=1, device=device, dataset_name=dataset_name)
    latency_ms, peak_mem_mb, avg_mem_mb = measure_inference_latency(
        model, dataloader, device, num_batches
    )
    energy_mj = measure_energy_model(model, dataloader, device, num_batches)
    if energy_mj is None:
        energy_mj = measure_avg_gpu_power(model, dataloader, device, num_batches)
    top1, top5 = evaluate_accuracy(model, dataloader, device)
    results = {
        "Model": model.__class__.__name__,
        "Dataset": dataset_name,
        "Size (MB)": size_mb,
        "MACs (M)": macs_million,
        "Peak Mem (MB)": peak_mem_mb,
        "Avg Mem (MB)": avg_mem_mb,
        "Latency (ms)": latency_ms,
        "Energy (mJ)": energy_mj,
        "Top-1 (%)": top1,
        "Top-5 (%)": top5,
    }
    print("\nProfiling Results:")
    for k, v in results.items():
        print(f"{k:<15}: {v}")
    return results

  import pynvml  # type: ignore[import]


In [2]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]

Transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

def cifar10_trainloader(batch_size=64, shuffle=True):
    train = datasets.CIFAR10(root="./data", train=True, transform=Transform, download=True)
    return DataLoader(train, batch_size=batch_size, shuffle=shuffle)

def ciaf10_testloader(batch_size=64, shuffle=False):
    test = datasets.CIFAR10(root="./data", train=False, transform=Transform, download=True)
    return DataLoader(test, batch_size=batch_size, shuffle=shuffle)

def cifar100_trainloader(batch_size=64, shuffle=True):
    train = datasets.CIFAR100(root="./data", train=True, transform=Transform, download=True)
    return DataLoader(train, batch_size=batch_size, shuffle=shuffle)

def ciaf100_testloader(batch_size=64, shuffle=False):
    test = datasets.CIFAR100(root="./data", train=False, transform=Transform, download=True)
    return DataLoader(test, batch_size=batch_size, shuffle=shuffle)

In [3]:
datasets = {
    "cifar10": {
        "trainloader": cifar10_trainloader(batch_size=512),
        "testloader": ciaf10_testloader(batch_size=512)
    },
    "cifar100": {
        "trainloader": cifar100_trainloader(batch_size=512),
        "testloader": ciaf100_testloader(batch_size=512)
    }
}
testloader = datasets["cifar10"]["testloader"]

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [None]:
model10 = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar10_vgg16_bn", pretrained=True)
model100 = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar100_vgg16_bn", pretrained=True)

Using cache found in C:\Users\Fatim_Sproj/.cache\torch\hub\chenyaofo_pytorch-cifar-models_master


In [4]:
def load_data_and_model(dataset_name, datasets):
    print(f"\Loading dataset and model for {dataset_name}\n")

    trainloader = datasets[dataset_name]["trainloader"]
    testloader = datasets[dataset_name]["testloader"]
    return trainloader, testloader

In [5]:
dataset_name = "cifar10"
trainloader10, testloader10 = load_data_and_model(dataset_name, datasets)

\Loading dataset and model for cifar10



### Cifar 10base

In [7]:
profile_results = profile_model(model10, testloader, dataset_name)


Profiling VGG on cifar10...

Profiling Results:
Model          : VGG
Dataset        : cifar10
Size (MB)      : 58.251
MACs (M)       : 314.002
Peak Mem (MB)  : 457.388
Avg Mem (MB)   : 73.249
Latency (ms)   : 15.461
Energy (mJ)    : 77151.0
Top-1 (%)      : 82.92
Top-5 (%)      : 98.27


### cifar100 base

In [6]:
dataset_name = "cifar100"
trainloader100, testloader100 = load_data_and_model(dataset_name, datasets)

\Loading dataset and model for cifar100



In [9]:
profile_results = profile_model(model100, testloader100, dataset_name)


Profiling VGG on cifar100...

Profiling Results:
Model          : VGG
Dataset        : cifar100
Size (MB)      : 58.427
MACs (M)       : 314.049
Peak Mem (MB)  : 457.722
Avg Mem (MB)   : 73.601
Latency (ms)   : 15.079
Energy (mJ)    : 75541.0
Top-1 (%)      : 63.57
Top-5 (%)      : 83.71


In [6]:
import torch
import torch.nn as nn
import pandas as pd

def create_sparse_weights_from_finetuned_model(finetuned_model, mask_path, output_sparse_path):
    print(f"Creating updated sparse weights from fine-tuned model...")
    masks = torch.load(mask_path, map_location='cpu', weights_only=True)
    sparse_weights = {}
    
    for name, param in finetuned_model.named_parameters():
        if name in masks:
            mask = masks[name].to(param.device)
            W = param.data * mask

            if W.dim() == 2:
                sparse_weights[name] = W.to_sparse_csr().cpu()
            else: 
                original_shape = W.shape
                W_flat = W.reshape(original_shape[0], -1)
                sparse_csr_tensor = W_flat.to_sparse_csr().cpu()
                sparse_weights[name] = (sparse_csr_tensor, original_shape)

    torch.save(sparse_weights, output_sparse_path)
    print(f"Saved updated fine-tuned sparse weights to: {output_sparse_path}")

class SparseVGG(nn.Module):
    def __init__(self, vgg_model, sparse_weights_path, device='cpu'):
        super(SparseVGG, self).__init__()
        self.features = vgg_model.features
        self.classifier = vgg_model.classifier

        sparse_weights = torch.load(sparse_weights_path, map_location=device, weights_only=True)
        self._inject_sparse_weights(sparse_weights, device)

    def _reconstruct_dense_conv_weight(self, sparse_object):
        sparse_csr, original_shape = sparse_object
        return sparse_csr.to_dense().reshape(original_shape)

    def _inject_sparse_weights(self, sparse_weights, device):
        for name, module in self.named_modules():
            weight_name = f"{name}.weight"
            if weight_name in sparse_weights:
                sparse_object = sparse_weights[weight_name]
                if isinstance(module, nn.Conv2d):
                    dense_weight = self._reconstruct_dense_conv_weight(sparse_object)
                    module.weight.data = dense_weight.to(device)
                elif isinstance(module, nn.Linear):
                    module.sparse_weight = sparse_object.to(device)
                    del module.weight
    
    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        for module in self.classifier:
            if isinstance(module, nn.Linear) and hasattr(module, 'sparse_weight'):
                x = torch.sparse.mm(x, module.sparse_weight.t()) + module.bias
            else:
                x = module(x)
        return x


### Cifar10

In [9]:
import os
import gc
import torch
import numpy as np
import random
import pandas as pd

torch.cuda.empty_cache()
gc.collect()

import multiprocessing as mp
mp.set_start_method("spawn", force=True)

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATASET_NAME = "cifar10"

fine_tuned_model_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\models\cifar10_vgg16_unstructured_finetuned_80.pt"
mask_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\pruning_masks\cifar10_unstructured_mask.pt"
finetuned_sparse_weights_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\sparse_weights\cifar10_sparse_weights_finetuned.pt"

if DEVICE == 'cuda':
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

print("Loading base model and fine-tuned state dictionary...")
base_model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar10_vgg16_bn", pretrained=False)
base_model.load_state_dict(torch.load(fine_tuned_model_path, map_location='cpu', weights_only=True))
base_model.to(DEVICE).eval()

print("Calculating MACs on the original dense model...")
macs_million = get_macs(base_model, batch_size=1, device=DEVICE, dataset_name=DATASET_NAME)

create_sparse_weights_from_finetuned_model(base_model, mask_path, finetuned_sparse_weights_path)

print("\nInstantiating custom SparseVGG model for profiling...")
sparse_vgg_model = SparseVGG(base_model, finetuned_sparse_weights_path, device=DEVICE)
sparse_vgg_model.eval()

print("Loading data...")
test_loader = testloader10 

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

profiling_results = profile_model(
    model=sparse_vgg_model,
    dataloader=test_loader,
    dataset_name=DATASET_NAME,
    device=DEVICE,
    num_batches=10,
    model_path_for_size=finetuned_sparse_weights_path, 
    precomputed_macs_m=macs_million
)

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    peak_mem = torch.cuda.max_memory_allocated() / (1024 ** 2)
    print(f"Peak GPU memory usage: {peak_mem:.3f} MB")

print("\n--- Final Profiling Summary Table ---")
df = pd.DataFrame([profiling_results])
print(df.to_string(index=False))
print("-----------------------------------")

del sparse_vgg_model, base_model
torch.cuda.empty_cache()
gc.collect()

Loading base model and fine-tuned state dictionary...


Using cache found in C:\Users\Fatim_Sproj/.cache\torch\hub\chenyaofo_pytorch-cifar-models_master


Calculating MACs on the original dense model...
Creating updated sparse weights from fine-tuned model...


  sparse_csr_tensor = W_flat.to_sparse_csr().cpu()


Saved updated fine-tuned sparse weights to: C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\sparse_weights\cifar10_sparse_weights_finetuned.pt

Instantiating custom SparseVGG model for profiling...
Loading data...

Profiling SparseVGG on cifar10...

Profiling Results:
Model          : SparseVGG
Dataset        : cifar10
Size (MB)      : 34.331
MACs (M)       : 314.002
Peak Mem (MB)  : 455.333
Avg Mem (MB)   : 71.194
Latency (ms)   : 17.096
Energy (mJ)    : 78365.0
Top-1 (%)      : 92.37
Top-5 (%)      : 98.7
Peak GPU memory usage: 455.373 MB

--- Final Profiling Summary Table ---
    Model Dataset  Size (MB)  MACs (M)  Peak Mem (MB)  Avg Mem (MB)  Latency (ms)  Energy (mJ)  Top-1 (%)  Top-5 (%)
SparseVGG cifar10     34.331   314.002        455.333        71.194        17.096      78365.0      92.37       98.7
-----------------------------------


206

### cifar100

In [10]:
import os
import gc
import torch
import numpy as np
import random
import pandas as pd

torch.cuda.empty_cache()
gc.collect()

import multiprocessing as mp
mp.set_start_method("spawn", force=True)

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATASET_NAME = "cifar100"

fine_tuned_model_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\models\cifar100_vgg16_unstructured_finetuned_80.pt"
mask_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\pruning_masks\cifar100_unstructured_mask.pt"
finetuned_sparse_weights_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\sparse_weights\cifar100_sparse_weights_finetuned.pt"

if DEVICE == 'cuda':
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

print("Loading base model and fine-tuned state dictionary...")
base_model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar100_vgg16_bn", pretrained=False)
base_model.load_state_dict(torch.load(fine_tuned_model_path, map_location='cpu', weights_only=True))
base_model.to(DEVICE).eval()

print("Calculating MACs on the original dense model...")
macs_million = get_macs(base_model, batch_size=1, device=DEVICE, dataset_name=DATASET_NAME)

create_sparse_weights_from_finetuned_model(base_model, mask_path, finetuned_sparse_weights_path)

print("\nInstantiating custom SparseVGG model for profiling...")
sparse_vgg_model = SparseVGG(base_model, finetuned_sparse_weights_path, device=DEVICE)
sparse_vgg_model.eval()

print("Loading data...")
test_loader = testloader100 

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

profiling_results = profile_model(
    model=sparse_vgg_model,
    dataloader=test_loader,
    dataset_name=DATASET_NAME,
    device=DEVICE,
    num_batches=10,
    model_path_for_size=finetuned_sparse_weights_path, 
    precomputed_macs_m=macs_million
)

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    peak_mem = torch.cuda.max_memory_allocated() / (1024 ** 2)
    print(f"Peak GPU memory usage: {peak_mem:.3f} MB")

print("\n--- Final Profiling Summary Table ---")
df = pd.DataFrame([profiling_results])
print(df.to_string(index=False))
print("-----------------------------------")

del sparse_vgg_model, base_model
torch.cuda.empty_cache()
gc.collect()

Loading base model and fine-tuned state dictionary...


Using cache found in C:\Users\Fatim_Sproj/.cache\torch\hub\chenyaofo_pytorch-cifar-models_master


Calculating MACs on the original dense model...
Creating updated sparse weights from fine-tuned model...
Saved updated fine-tuned sparse weights to: C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\results\sparse_weights\cifar100_sparse_weights_finetuned.pt

Instantiating custom SparseVGG model for profiling...
Loading data...

Profiling SparseVGG on cifar100...

Profiling Results:
Model          : SparseVGG
Dataset        : cifar100
Size (MB)      : 34.384
MACs (M)       : 314.049
Peak Mem (MB)  : 455.946
Avg Mem (MB)   : 71.824
Latency (ms)   : 30.656
Energy (mJ)    : 70632.0
Top-1 (%)      : 68.07
Top-5 (%)      : 85.83
Peak GPU memory usage: 456.002 MB

--- Final Profiling Summary Table ---
    Model  Dataset  Size (MB)  MACs (M)  Peak Mem (MB)  Avg Mem (MB)  Latency (ms)  Energy (mJ)  Top-1 (%)  Top-5 (%)
SparseVGG cifar100     34.384   314.049        455.946        71.824        30.656      70632.0      68.07      85.83
-----------------------------------


206

### Part B

In [10]:
import os
import gc
import torch
import numpy as np
import random
import pandas as pd

torch.cuda.empty_cache()
gc.collect()

import multiprocessing as mp
mp.set_start_method("spawn", force=True)

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATASET_NAME = "cifar10"

fine_tuned_model_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\finetuned_model10.pt"
mask_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\stage3_mask.pt"
finetuned_sparse_weights_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\finetuned_sparse10.pt"

if DEVICE == 'cuda':
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

print("Loading base model and fine-tuned state dictionary...")
base_model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar10_vgg16_bn", pretrained=False)
base_model.load_state_dict(torch.load(fine_tuned_model_path, map_location='cpu', weights_only=True))
base_model.to(DEVICE).eval()

print("Calculating MACs on the original dense model...")
macs_million = get_macs(base_model, batch_size=1, device=DEVICE, dataset_name=DATASET_NAME)

create_sparse_weights_from_finetuned_model(base_model, mask_path, finetuned_sparse_weights_path)

print("\nInstantiating custom SparseVGG model for profiling...")
sparse_vgg_model = SparseVGG(base_model, finetuned_sparse_weights_path, device=DEVICE)
sparse_vgg_model.eval()

print("Loading data...")
test_loader = testloader10 

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

profiling_results = profile_model(
    model=sparse_vgg_model,
    dataloader=test_loader,
    dataset_name=DATASET_NAME,
    device=DEVICE,
    num_batches=10,
    model_path_for_size=finetuned_sparse_weights_path, 
    precomputed_macs_m=macs_million
)

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    peak_mem = torch.cuda.max_memory_allocated() / (1024 ** 2)
    print(f"Peak GPU memory usage: {peak_mem:.3f} MB")

print("\n--- Final Profiling Summary Table ---")
df = pd.DataFrame([profiling_results])
print(df.to_string(index=False))
print("-----------------------------------")

del sparse_vgg_model, base_model
torch.cuda.empty_cache()
gc.collect()

Loading base model and fine-tuned state dictionary...


Using cache found in C:\Users\Fatim_Sproj/.cache\torch\hub\chenyaofo_pytorch-cifar-models_master


Calculating MACs on the original dense model...
Creating updated sparse weights from fine-tuned model...


  sparse_csr_tensor = W_flat.to_sparse_csr().cpu()


Saved updated fine-tuned sparse weights to: C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\finetuned_sparse10.pt

Instantiating custom SparseVGG model for profiling...
Loading data...

Profiling SparseVGG on cifar10...

Profiling Results:
Model          : SparseVGG
Dataset        : cifar10
Size (MB)      : 35.105
MACs (M)       : 314.002
Peak Mem (MB)  : 455.873
Avg Mem (MB)   : 71.733
Latency (ms)   : 39.59
Energy (mJ)    : 72881.0
Top-1 (%)      : 82.56
Top-5 (%)      : 97.86
Peak GPU memory usage: 455.912 MB

--- Final Profiling Summary Table ---
    Model Dataset  Size (MB)  MACs (M)  Peak Mem (MB)  Avg Mem (MB)  Latency (ms)  Energy (mJ)  Top-1 (%)  Top-5 (%)
SparseVGG cifar10     35.105   314.002        455.873        71.733         39.59      72881.0      82.56      97.86
-----------------------------------


127

In [9]:
import os
import gc
import torch
import numpy as np
import random
import pandas as pd

torch.cuda.empty_cache()
gc.collect()

import multiprocessing as mp
mp.set_start_method("spawn", force=True)

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATASET_NAME = "cifar100"

fine_tuned_model_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\finetuned_model100.pt"
mask_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\stage3_mask100.pt"
finetuned_sparse_weights_path = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\finetuned_sparse100.pt"

if DEVICE == 'cuda':
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

print("Loading base model and fine-tuned state dictionary...")
base_model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar100_vgg16_bn", pretrained=False)
base_model.load_state_dict(torch.load(fine_tuned_model_path, map_location='cpu', weights_only=True))
base_model.to(DEVICE).eval()

print("Calculating MACs on the original dense model...")
macs_million = get_macs(base_model, batch_size=1, device=DEVICE, dataset_name=DATASET_NAME)

create_sparse_weights_from_finetuned_model(base_model, mask_path, finetuned_sparse_weights_path)

print("\nInstantiating custom SparseVGG model for profiling...")
sparse_vgg_model = SparseVGG(base_model, finetuned_sparse_weights_path, device=DEVICE)
sparse_vgg_model.eval()

print("Loading data...")
test_loader = testloader100

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

profiling_results = profile_model(
    model=sparse_vgg_model,
    dataloader=test_loader,
    dataset_name=DATASET_NAME,
    device=DEVICE,
    num_batches=10,
    model_path_for_size=finetuned_sparse_weights_path, 
    precomputed_macs_m=macs_million
)

if DEVICE == 'cuda':
    torch.cuda.synchronize()
    peak_mem = torch.cuda.max_memory_allocated() / (1024 ** 2)
    print(f"Peak GPU memory usage: {peak_mem:.3f} MB")

print("\n--- Final Profiling Summary Table ---")
df = pd.DataFrame([profiling_results])
print(df.to_string(index=False))
print("-----------------------------------")

del sparse_vgg_model, base_model
torch.cuda.empty_cache()
gc.collect()

Loading base model and fine-tuned state dictionary...


Using cache found in C:\Users\Fatim_Sproj/.cache\torch\hub\chenyaofo_pytorch-cifar-models_master


Calculating MACs on the original dense model...
Creating updated sparse weights from fine-tuned model...
Saved updated fine-tuned sparse weights to: C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\intermediate_models\finetuned_sparse100.pt

Instantiating custom SparseVGG model for profiling...
Loading data...

Profiling SparseVGG on cifar100...

Profiling Results:
Model          : SparseVGG
Dataset        : cifar100
Size (MB)      : 35.214
MACs (M)       : 314.049
Peak Mem (MB)  : 456.007
Avg Mem (MB)   : 71.885
Latency (ms)   : 32.875
Energy (mJ)    : 73002.0
Top-1 (%)      : 42.67
Top-5 (%)      : 61.07
Peak GPU memory usage: 456.063 MB

--- Final Profiling Summary Table ---
    Model  Dataset  Size (MB)  MACs (M)  Peak Mem (MB)  Avg Mem (MB)  Latency (ms)  Energy (mJ)  Top-1 (%)  Top-5 (%)
SparseVGG cifar100     35.214   314.049        456.007        71.885        32.875      73002.0      42.67      61.07
-----------------------------------


127

### Structured Pruning

In [7]:
save_path_full = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\cifar10_vgg16_pruned_80sparsity.pt"
device = "cuda"
loaded_model = torch.load(save_path_full)
loaded_model.to(device)

  loaded_model = torch.load(save_path_full)


VGG(
  (features): Sequential(
    (0): Conv2d(3, 22, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(22, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(22, 22, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(22, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(22, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(45, 90, kerne

In [10]:
profile_results = profile_model(loaded_model, testloader10, dataset_name)


Profiling VGG on cifar10...

Profiling Results:
Model          : VGG
Dataset        : cifar10
Size (MB)      : 11.144
MACs (M)       : 41.782
Peak Mem (MB)  : 118.364
Avg Mem (MB)   : 25.287
Latency (ms)   : 8.342
Energy (mJ)    : 161371.0
Top-1 (%)      : 88.01
Top-5 (%)      : 99.11


In [7]:
save_path_full = r"C:\Users\Fatim_Sproj\Desktop\Fatim\Spring 2025\aiedge\Pruning\cifar100_vgg16_pruned_80sparsity.pt"
device = "cuda"
loaded_model = torch.load(save_path_full)
loaded_model.to(device)

  loaded_model = torch.load(save_path_full)


VGG(
  (features): Sequential(
    (0): Conv2d(3, 22, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(22, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(22, 22, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(22, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(22, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(45, 90, kerne

In [8]:
profile_results = profile_model(loaded_model, testloader100, dataset_name)


Profiling VGG on cifar100...

Profiling Results:
Model          : VGG
Dataset        : cifar100
Size (MB)      : 11.321
MACs (M)       : 41.828
Peak Mem (MB)  : 118.698
Avg Mem (MB)   : 25.638
Latency (ms)   : 11.238
Energy (mJ)    : 159944.0
Top-1 (%)      : 54.46
Top-5 (%)      : 79.61
