Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Change directory to MyDrive (the `smcp` package must be located in MyDrive)

In [None]:
%cd 'gdrive/MyDrive'
!ls

Uninstall problem-causing packages

In [None]:
!pip uninstall torchtext torchaudio torchdata tensorflow

Install correct versions of packages

In [None]:
# Install pinned versions of lightning-bolts, torch, torchvision, torchmetrics and pytorch-lightning.
# NOTE(review): tabulate is the only package in this list without a version pin.
!pip install lightning-bolts==0.5.0 tabulate torch==1.11.0 torchvision==0.12.0 torchmetrics==0.9.1 pytorch-lightning==1.5.10

In [None]:
# !pip install tqdm==4.65.0 tensorboard==2.11.2 torch==1.10.0 lightning-lite==1.8.0 pytorch-lightning==1.8.0 torchmetrics==0.11.4 torchvision==0.11.1 lightning-utilities==0.3.0 lightning-bolts==0.6.0.post1

In [None]:
#!pip install pytorch-lightning==1.4.9 torch==1.8.0 torchmetrics==0.7.0 torchtext==0.9.0 torchvision==0.9.0 lightning-bolts==0.5.0

New data module

In [None]:
import torchvision as tv
from pl_bolts.transforms.dataset_normalizations import cifar10_normalization
from pl_bolts.datamodules import CIFAR10DataModule

def get_default(data_dir, batch_size, num_workers):
    """Build a CIFAR10DataModule with the notebook's default transforms.

    Training images get a random 32x32 crop (padding 4) and a random horizontal
    flip before tensor conversion and CIFAR-10 normalization. Validation and
    test images share one pipeline that only converts and normalizes.

    Args:
        data_dir: directory passed to the data module as ``data_dir``.
        batch_size: batch size passed to the data module.
        num_workers: number of loader workers passed to the data module.

    Returns:
        The configured ``CIFAR10DataModule``.
    """
    # Evaluation pipeline: no augmentation, shared by the val and test splits.
    eval_pipeline = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        cifar10_normalization(),
    ])

    # Training pipeline: augmentation first, then the same conversion steps.
    train_pipeline = tv.transforms.Compose([
        tv.transforms.RandomCrop(32, padding=4),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        cifar10_normalization(),
    ])

    return CIFAR10DataModule(
        data_dir=data_dir,
        batch_size=batch_size,
        num_workers=num_workers,
        train_transforms=train_pipeline,
        test_transforms=eval_pipeline,
        val_transforms=eval_pipeline,
    )

Copy of image_classifier.py (with small modifications for running in interactive env)

In [None]:
from math import ceil
from typing import Dict, Tuple, Union
import warnings

import pytorch_lightning as pl
from pytorch_lightning import LightningModule
from pytorch_lightning.plugins import DDPPlugin
import pytorch_lightning.callbacks as pt_callbacks
from pytorch_lightning.utilities.warnings import LightningDeprecationWarning
import torch
from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import fp16_compress_hook
import torch.nn as nn
import torch.optim as optim
from torchmetrics import MetricCollection, Accuracy

from smcp.core.enum_parse import EnumAction
from smcp.core.lr_scheduler import WarmupMultiStepLR, WarmupCosineLR, WarmupLinearLR, WarmupExponentialLR
from sparse_ops import create_coster, create_importance_accumulator, ChannelBNRescalingType, ChannelPruning, ChannelPruningSchedule, ChannelPruningType, CostingType, DynamicPruning, ImportanceAccumulatorType, ImportanceType, ImportanceGradType, ImportanceHessType, ParameterMaskingType, PruningLogVerbosity, PruningSchedule
from smcp.classification.datasets import UpscaledCIFAR10DataModule, UpscaledCIFAR100DataModule, ImagenetDataModule
from smcp.classification.models import get_classification_model
from smcp.classification.losses import LabelSmoothing

# Silence PyTorch Lightning deprecation warnings for the rest of the session
# (the filter is process-wide once this cell has run).
warnings.simplefilter("ignore", LightningDeprecationWarning)

class ImageClassifierParams:
    """Type-only description of the hyperparameters stored on ``ImageClassifier.hparams``.

    The class declares annotations only; it holds no values and defines no behavior.
    """
    # Type of classifier architecture.
    arch: str
    # Whether to use a pretrained network; a string is a path to pretrained weights.
    pretrained: Union[bool, str]
    # Number of image classes.
    num_classes: int
    # Label-smoothing value in [0, 1).
    label_smoothing: float
    # Learning rate.
    learning_rate: float
    # SGD momentum.
    momentum: float
    # Whether Nesterov momentum is enabled.
    nesterov: bool
    # Weight decay for non-BatchNorm weights.
    weight_decay: float
    # Weight decay for BatchNorm weights.
    bn_weight_decay: float
    # LR scheduler type.
    lr_schedule: str
    # LR scheduler linear warmup time.
    warmup: int
    # Total number of training epochs.
    epochs: int

class ImageClassifier(pl.LightningModule):
    """LightningModule that trains an image-classification network with SGD.

    Wraps the network returned by ``get_classification_model`` and supplies the
    loss, top-1/top-5 accuracy metrics, optimizer and learning-rate schedule.
    """
    hparams: ImageClassifierParams
    model: nn.Module

    def __init__(
        self, arch: str, num_classes: int, label_smoothing: float, pretrained: Union[bool, str],
        learning_rate: float, momentum: float, nesterov: bool, weight_decay: float, bn_weight_decay: float,
        lr_schedule: str, warmup: int, epochs: int, **kwargs
    ):
        """Image Classifier model
        Args:
            arch: type of classifier architecture
            num_classes: number of image classes
            label_smoothing: [0, 1) value for label smoothing
            pretrained: whether to use a pretrained network. If a string, path to the pretrained weights
            learning_rate: learning rate
            momentum: SGD momentum
            nesterov: whether to enable Nesterov momentum
            weight_decay: amount of weight decay for non-BatchNorm weights
            bn_weight_decay: amount of weight decay for BatchNorm weights
            lr_schedule: LR scheduler type; one of "step", "step_prune", "cosine", "linear", "exponential"
            warmup: LR scheduler linear warmup time
            epochs: total number of training epochs
            **kwargs: extra keyword arguments are accepted and not used by this constructor,
                so a full configuration namespace can be unpacked into it
        """
        super().__init__()
        self.save_hyperparameters()

        self.model = get_classification_model(arch, num_classes, pretrained=pretrained).to(memory_format=torch.channels_last)
        # NOTE(review): the example input is fixed at 1x3x224x224 regardless of the
        # dataset resolution actually used for training - confirm this is intended.
        self.example_input_array = torch.ones(1, 3, 224, 224).to(memory_format=torch.channels_last)

        if label_smoothing > 0.0:
            self.criterion = LabelSmoothing(label_smoothing)
        else:
            self.criterion = nn.CrossEntropyLoss()

        # Separate clones so train and validation statistics never mix.
        acc_metrics = MetricCollection({
            "top1": Accuracy(num_classes=num_classes, top_k=1),
            "top5": Accuracy(num_classes=num_classes, top_k=5)
        })
        self.train_acc_metrics = acc_metrics.clone(prefix="train/")
        self.val_acc_metrics = acc_metrics.clone(prefix="val/")

    def configure_optimizers(self) -> Dict[str, object]:
        """Create the SGD optimizer and the LR scheduler selected by ``hparams.lr_schedule``.

        Parameters whose name contains "bn" use ``bn_weight_decay``; all other
        parameters use ``weight_decay``.

        Returns:
            Dict with the keys "optimizer" and "lr_scheduler".

        Raises:
            ValueError: if ``hparams.lr_schedule`` is not one of the supported names.
        """
        named_params = list(self.model.named_parameters())

        # NOTE(review): the split relies on the substring "bn" in the parameter name;
        # BatchNorm layers registered under other names land in the second group.
        bn_params = [param for name, param in named_params if "bn" in name]
        rest_params = [param for name, param in named_params if "bn" not in name]
        optimizer = optim.SGD(
            [
                {"params": bn_params, "weight_decay": self.hparams.bn_weight_decay},
                {"params": rest_params, "weight_decay": self.hparams.weight_decay}
            ],
            self.hparams.learning_rate,
            momentum=self.hparams.momentum,
            weight_decay=self.hparams.weight_decay,
            nesterov=self.hparams.nesterov
        )

        schedule = self.hparams.lr_schedule
        warmup = self.hparams.warmup
        if schedule == "step":
            lr_scheduler = WarmupMultiStepLR(optimizer, warmup, [30,60,80], 0.1)
        elif schedule == "step_prune":
            lr_scheduler = WarmupMultiStepLR(optimizer, warmup, [10,20,30], 0.1)
        elif schedule == "cosine":
            lr_scheduler = WarmupCosineLR(optimizer, warmup, self.hparams.epochs)
        elif schedule == "linear":
            lr_scheduler = WarmupLinearLR(optimizer, warmup, self.hparams.epochs)
        elif schedule == "exponential":
            lr_scheduler = WarmupExponentialLR(optimizer, warmup, gamma=0.98) #ours
        else:
            # Fail here with a clear message instead of handing Lightning a None
            # scheduler, which it rejects later with a far less helpful error.
            raise ValueError(
                f'Unknown lr_schedule "{schedule}"; expected one of: '
                "step, step_prune, cosine, linear, exponential"
            )

        return {
            "optimizer": optimizer,
            "lr_scheduler": lr_scheduler
        }

    def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: optim.Optimizer, optimizer_idx: int) -> None:
        """Reset gradients by setting them to ``None`` instead of zero-filling them."""
        optimizer.zero_grad(set_to_none=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the wrapped network's output (class logits) for input ``x``."""
        return self.model(x)

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> Dict[str, torch.Tensor]:
        """Compute and log the training loss and accuracies for one batch.

        Args:
            batch: ``(inputs, targets)`` pair.
            batch_idx: index of the batch (unused).

        Returns:
            Dict with the single key "loss".
        """
        x, target = batch

        logits = self.forward(x)
        loss = self.criterion(logits, target)

        self.log("train/loss", loss, sync_dist=True)

        preds = nn.functional.softmax(logits, dim=1)
        acc_metrics = self.train_acc_metrics(preds, target)
        self.log_dict(acc_metrics, sync_dist=True)

        return { "loss": loss }

    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Compute and log the validation loss and accuracies for one batch.

        Args:
            batch: ``(inputs, targets)`` pair.
            batch_idx: index of the batch (unused).

        Returns:
            The loss tensor for this batch.
        """
        x, target = batch

        logits = self.forward(x)
        loss = self.criterion(logits, target)

        self.log("val/loss", loss, sync_dist=True)

        preds = nn.functional.softmax(logits, dim=1)
        acc_metrics = self.val_acc_metrics(preds, target)
        self.log_dict(acc_metrics, sync_dist=True)

        return loss


def main(hparams):
    """Train, validate and save an ``ImageClassifier``, optionally with channel pruning.

    The learning rate and the rewiring frequency are rescaled by the effective
    batch size relative to 256. After training and a final validation pass, the
    bare network weights are written into the logger's run directory as
    ``<percent>_pruning.pt`` when pruning is enabled (percent taken from
    ``hparams.channel_ratio``) or ``no_pruning.pt`` otherwise.

    Args:
        hparams: attribute namespace with the run configuration. It is
            shallow-copied first, so the caller's object is left untouched by
            the rescaling done here.

    Raises:
        NotImplementedError: for an unknown dataset name, or when pruning is
            requested with ``channel_type`` set to ``ChannelPruningType.Skip``.
    """
    import copy  # local import: only this function needs it

    # Work on a copy: the in-place rescaling below would otherwise compound each
    # time the same configuration object is passed to main() again.
    hparams = copy.copy(hparams)

    # Interpret/modify the hparams
    # bool() treats both None and 0 as "no GPU"; with 0 the effective batch size
    # below would become 0 and the rescaling would divide by zero.
    using_gpu = bool(hparams.gpus)
    data_dtype = torch.float16 if hparams.fp16 else torch.float32
    precision = 16 if hparams.fp16 else 32
    # Never accumulate fewer than one batch: a simulated batch size that is smaller
    # than batch_size (or a -1 placeholder) would otherwise give 0 or a negative value.
    accum_grad_batches = max(1, hparams.simulated_batch_size // hparams.batch_size)
    accelerator = "ddp" if using_gpu else None
    sync_batchnorm =  using_gpu and hparams.batch_size <= 32

    eff_batch_size = hparams.batch_size * accum_grad_batches * hparams.num_nodes * (hparams.gpus if using_gpu else 1)
    hparams.learning_rate *= eff_batch_size / 256
    hparams.rewiring_freq = ceil(256 * hparams.rewiring_freq / eff_batch_size)

    # Setup datamodule
    known_datamodules = {
        "Imagenet": ImagenetDataModule,
        "CIFAR10": UpscaledCIFAR10DataModule,
        "CIFAR100": UpscaledCIFAR100DataModule,
    }
    dm_cls = known_datamodules.get(hparams.dataset)
    if dm_cls is None:
        raise NotImplementedError(f"Dataset {hparams.dataset} unknown")

    # NOTE(review): dm_cls is validated but not used - get_default() always builds the
    # CIFAR-10 data module, whatever hparams.dataset says. The original construction
    # is kept below for reference.
    # dm = dm_cls(
    #     hparams.data_root, num_workers=hparams.workers, batch_size=hparams.batch_size,
    #     shuffle=True, pin_memory=using_gpu, drop_last=True, dtype=data_dtype
    # )
    dm = get_default(hparams.data_root, batch_size=hparams.batch_size, num_workers=hparams.workers)

    # Setup model
    model = ImageClassifier(num_classes=dm.num_classes, **vars(hparams))

    # Setup trainer
    pl.seed_everything(hparams.seed, workers=True)

    logger = pl.loggers.TensorBoardLogger(
        save_dir=hparams.output_dir,
        name=f"image_classifier-{hparams.dataset}",
        default_hp_metric=False
    )
    callbacks = [
        pl.callbacks.LearningRateMonitor(logging_interval="epoch"),
        # pl.callbacks.ModelCheckpoint(
        #     filename="image_classifier-epoch{epoch}-val_loss{val/loss:.4f}-top1{val/top1:.4f}",
        #     mode="max",
        #     monitor="val/top1",
        #     auto_insert_metric_name=False,
        #     save_last=True,
        #     every_n_val_epochs=hparams.ckpt_freq
        # )
    ]

    if hparams.prune:
        importance_accum = create_importance_accumulator(hparams.importance_accumulator)

        if hparams.channel_type is not ChannelPruningType.Skip:
            coster = create_coster(hparams.costing_type, hparams.costing_latency_table)

            pruning_schedule = ChannelPruningSchedule(
                hparams.channel_ratio, hparams.channel_schedule,
                hparams.epochs, hparams.prune_warmup, hparams.channel_schedule_length, hparams.prune_cooldown, hparams.rewiring_freq
            )

            unpruned_layers = ["model.conv1", "model.conv_bn"]
            pruning_method = ChannelPruning(
                hparams.masking_type, pruning_schedule, importance_accum, coster,
                hparams.channel_type, unpruned_layers, hparams.channel_chunk_size, hparams.channel_allow_layer_prune, hparams.channel_bnrescaling_type,
                hparams.channel_doublesided_weight, track_mask_convergence=True
            )
        else:
            raise NotImplementedError("Pruning is set but an unrecognized configuration was given")

        pruning_callback = DynamicPruning(
            pruning_method, hparams.importance_type, hparams.importance_grad_type,
            hparams.importance_hess_type, hparams.pruned_decay, True,
            log_verbosity=PruningLogVerbosity.Full
        )
        callbacks.append(pruning_callback)

    plugins = []
    if accelerator == "ddp":
        plugins.append(DDPPlugin(
            find_unused_parameters=False,
            gradient_as_bucket_view=True,
            ddp_comm_hook=fp16_compress_hook if hparams.fp16 else None
        ))

    trainer = pl.Trainer(
        accelerator=accelerator, num_nodes=hparams.num_nodes, gpus=hparams.gpus,
        benchmark=using_gpu, sync_batchnorm=sync_batchnorm,
        max_epochs=hparams.epochs, precision=precision, accumulate_grad_batches=accum_grad_batches,
        gradient_clip_val=hparams.clip, log_every_n_steps=hparams.train_log_freq,
        plugins=plugins, callbacks=callbacks, logger=logger, weights_summary="full", enable_checkpointing=False
    )

    # Run experiment
    trainer.fit(model, datamodule=dm)

    # Perform final validation
    trainer.validate(model, datamodule=dm)

    # Save the model (without training state). The file name reflects the run's
    # pruning setting instead of a fixed "80", so runs with other ratios are not
    # mislabelled; round() guards against float artefacts in ratio * 100.
    if hparams.prune:
        checkpoint_name = f"{round(hparams.channel_ratio * 100)}_pruning.pt"
    else:
        checkpoint_name = "no_pruning.pt"
    torch.save(model.model.state_dict(), f"{logger.log_dir}/{checkpoint_name}")


Train classifier

In [None]:
class Args:
    """Plain attribute container holding the configuration for one training run.

    An instance is passed to ``main`` and unpacked into ``ImageClassifier`` as
    keyword arguments. Trailing "alternative" comments record other values the
    author noted next to a setting.
    """

    def __init__(self):
        # --- model and data ---
        self.arch = "resnet18"
        self.pretrained = False
        self.dataset = "CIFAR10"  # alternative: Imagenet
        self.data_root = "."
        self.workers = 0  # alternative: 8
        self.batch_size = 64  # alternative: 256
        self.simulated_batch_size = 64  # alternative: -1

        # --- optimization ---
        self.learning_rate = 0.1  # alternative: 0.256
        self.momentum = 0.875
        self.nesterov = False
        self.weight_decay = 3.0517578125e-05
        self.bn_weight_decay = 0.0
        self.clip = None  # alternative: 0.01
        self.epochs = 21  # alternative: 90
        # Other schedule names handled by ImageClassifier.configure_optimizers:
        # "step", "step_prune", "cosine", "linear".
        self.lr_schedule = "exponential"
        self.warmup = 0
        self.label_smoothing = 0.1

        # --- runtime, logging ---
        self.seed = 42
        self.num_nodes = 1
        self.gpus = 1
        self.fp16 = False
        self.output_dir = "logs/"
        self.train_log_freq = 50
        self.ckpt_freq = 5

        # --- pruning ---
        self.prune = True  # alternative: False
        self.masking_type = ParameterMaskingType.Soft
        self.prune_warmup = 0
        self.prune_cooldown = 5
        self.rewiring_freq = 10
        self.pruned_decay = 2e-4

        # --- channel pruning ---
        self.channel_type = ChannelPruningType.Global
        self.channel_ratio = 0.8  # alternative: 0
        self.channel_chunk_size = 1
        self.channel_allow_layer_prune = False
        self.channel_schedule = "exp"
        self.channel_schedule_length = 16  # alternative: 55
        self.channel_bnrescaling_type = ChannelBNRescalingType.Skip
        self.channel_doublesided_weight = 1

        # --- importance and costing ---
        self.importance_type = ImportanceType.Weight
        self.importance_grad_type = ImportanceGradType.INST
        self.importance_hess_type = ImportanceHessType.GRADSQ
        self.importance_accumulator = ImportanceAccumulatorType.Latest
        self.costing_type = CostingType.Flop
        self.costing_latency_table = "."  # alternative: "./latency_tables/resnet50_titanV_cudnn74.pkl"

# Build the run configuration and launch training, final validation and weight saving.
args = Args()
main(args)

Calculate time (FPS)

In [None]:
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn

def time_inference(
    model: nn.Module, dataloader: DataLoader, num_batches: int = 30, warmup: int = 10
) -> float:
    """Measure GPU inference throughput of ``model`` in frames (samples) per second.

    The first ``warmup`` batches are executed but not timed. The following
    ``num_batches`` batches are timed with CUDA events around the forward pass
    only; the host-to-device copy is excluded.

    Args:
        model: network to benchmark; it is moved to the CUDA device and put in eval mode.
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        num_batches: number of batches to time after the warmup.
        warmup: number of leading batches to run without timing.

    Returns:
        Throughput in samples per second (timed samples / timed seconds). This is
        a rate, not a latency: larger is faster.

    Raises:
        ValueError: if no batch was timed, e.g. the dataloader yields no more
            than ``warmup`` batches or ``num_batches`` is 0.
    """
    cudnn.benchmark = True
    cudnn.deterministic = True

    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)

    device = torch.device("cuda")
    model = model.to(device)

    model.eval()

    total_ms = 0.0
    total_frames = 0
    # no_grad: inference timing should not include autograd bookkeeping.
    with torch.no_grad():
        for batch_idx, (inputs, _target) in enumerate(dataloader):
            # Stop before running an extra batch: exactly num_batches are timed.
            if batch_idx >= warmup + num_batches:
                break

            inputs = inputs.to(device)

            start_evt.record()
            model(inputs)
            end_evt.record()

            # elapsed_time is only valid once both events have completed.
            torch.cuda.synchronize()

            if batch_idx < warmup:
                continue

            total_ms += start_evt.elapsed_time(end_evt)  # milliseconds
            total_frames += inputs.size(0)  # assumes dim 0 is the batch dimension

    if total_frames == 0 or total_ms <= 0.0:
        raise ValueError(
            "No batches were timed: the dataloader must yield more than "
            f"warmup={warmup} batches and num_batches must be positive"
        )

    fps = total_frames / (total_ms / 1000.0)
    print("FPS:", fps)
    return fps

In [None]:
# Build a CIFAR-10 data module for the timing run: data in the current directory,
# batch size 64, one loader worker.
dm = get_default(".", batch_size=64, num_workers=1)

In [None]:
# Prepare the data module by hand and take its validation dataloader for timing.
dm.prepare_data()
dm.setup()
dl = dm.val_dataloader()

In [None]:
# Rebuild the resnet18 architecture (10 classes, no pretrained weights) and load saved weights into it.
# NOTE(review): the path is tied to one logger version directory (version_45) and one
# checkpoint name; it has to be edited by hand for every other run.
m = get_classification_model('resnet18', 10, pretrained=False).to(memory_format=torch.channels_last)
path = "logs/image_classifier-CIFAR10/version_45/70_pruning.pt"
m.load_state_dict(torch.load(path))

In [None]:
# time_inference already prints the measurement, so it is not printed a second time here.
fps = time_inference(m, dl)

In [None]:
# no_pruning.pt top1 = 0.7883999943733215  top5 = 0.9855999946594238 FPS = 8.281613411441926 version_38
# 10_pruning.pt top1 = 0.7860999703407288 top5 = 0.9861000180244446 FPS = 8.389946783742596 version_39
# 20_pruning.pt top1 = 0.7764999866485596 top5 = 0.9865999817848206 FPS = 8.39308895603303 version_40
# 30_pruning.pt top1 = 0.7702216204049691 top5 = 0.9853000044822693 FPS = 9.072773195082142 version_41
# 40_pruning.pt top1 = 0.7700999975204468 top5 = 0.986299991607666 FPS = 8.629408990183185 version_42
# 50_pruning.pt top1 = 0.7663999795913696 top5 = 0.9850000143051147 FPS = 8.281490695091986 version_43
# 60_pruning.pt top1 = 0.7562000155448914 top5 = 0.984000027179718 FPS = 9.224685422835812 version_44
# 70_pruning.pt top1 = 0.6859999895095825 top5 = 0.9761000275611877 FPS = 9.628498600375268 version_45
# 80_pruning.pt top1 =  top5 =  FPS =  
# 90_pruning.pt top1 =  top5 =  FPS =  

# 32_bs.pt
# 64_bs.pt
# 128_bs.pt
# 256_bs.pt

# 10_rewiring_freq.pt
# 20_rewiring_freq.pt
# 30_rewiring_freq.pt

# 256_lr.pt
# 512_lr.pt
# 768_lr.pt

In [None]:
# NOTE(review): unpinned install, unlike the version-pinned setup cell - consider pinning.
!pip install tensorboard

In [None]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic.
%load_ext tensorboard

In [None]:
# NOTE(review): tensorflow is removed by the `pip uninstall` cell near the top of this
# notebook, so this import fails on a fresh "Run all". None of tf, datetime or os is
# used in the visible cells - confirm and drop the imports if they are not needed.
import tensorflow as tf
import datetime, os

In [None]:
# Open TensorBoard on the logs of a single run (version_38).
%tensorboard --logdir logs/image_classifier-CIFAR10/version_38/

Read the excel file with the results for the pruning ratios

In [None]:
import pandas as pd

# Load the recorded results; one row per pruning ratio.
df = pd.read_excel('channel_ratio_results.xlsx')
print(df)

channel_ratio_values = df['Channel Ratio'].tolist()
top_1_values_raw = df['top_1'].tolist()
fps_values = df['FPS'].tolist()

# Some accuracy cells are stored as text with a decimal comma and stray newlines;
# normalize those to float and pass every other cell through unchanged.
top_1_values = [
    float(cell.replace('\n', '').replace(',', '.')) if isinstance(cell, str) else cell
    for cell in top_1_values_raw
]

print(channel_ratio_values)
print(top_1_values)
print(fps_values)

Plot the Top1 / FPS for different pruning ratios 

In [None]:
import matplotlib.pyplot as plt

def plot_results_top1(channel_ratio_values, one_accs, fps):
    """Plot top-1 accuracy against FPS, labelling each point with its pruning ratio.

    Draws on the current matplotlib axes: a line through the points, a red
    marker per point, and a small text label with the ratio as a percentage.

    Args:
        channel_ratio_values: pruning ratios as fractions (e.g. 0.8 for 80%).
        one_accs: top-1 accuracy for each ratio.
        fps: frames-per-second value for each ratio.
    """
    ax = plt.gca()

    # The label gives legend() an artist to show; without one it only emits a warning.
    ax.plot(fps, one_accs, label="Top-1 accuracy")
    ax.scatter(fps, one_accs, s=100, c="r", marker="o")

    ax.set_title("Top-1 Accuracy vs Frames per Second (FPS) for different pruning ratios")
    ax.set_xlabel("Frames per Second (FPS)")
    ax.set_ylabel("Top-1 Accuracy")
    ax.legend()

    # add labels for specific points
    for ratio, acc, speed in zip(channel_ratio_values, one_accs, fps):
        # ":g" drops float artefacts such as 110.00000000000001 from ratio * 100.
        ax.text(speed, acc, f"{ratio * 100:g}%", fontsize=6)
  

# Draw the accuracy/FPS trade-off from the values parsed out of the results spreadsheet.
plot_results_top1(channel_ratio_values, top_1_values, fps_values)
