In [7]:
# NOTE: Inference speed (latency/throughput) was benchmarked on an NVIDIA Tesla T4.
# Results may vary depending on the specific GPU architecture and system load.
# Complexity (GFLOPs) and parameter count do not depend on the hardware; GFLOPs are
# reported for a 256x256 input and change with resolution, the parameter count does not.

In [1]:
# Mount Google Drive at /content/drive. The google.colab module is only
# available inside a Colab runtime, so this cell fails elsewhere.
# NOTE(review): nothing in the visible cells reads from the mounted drive —
# confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch as th
import time

class EncoderModule(nn.Module):
    """Encoder stage: 4x4 stride-2 convolution, then batch norm, then LeakyReLU.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by the convolution.
        leaky_relu_slope: Negative slope handed to ``nn.LeakyReLU``.
    """

    def __init__(self, in_channels, out_channels, leaky_relu_slope=0.2):
        super().__init__()
        # Submodules are registered in the same order forward() applies them.
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=4,
            stride=2,
            padding=1,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.lrelu = nn.LeakyReLU(leaky_relu_slope)

    def forward(self, x):
        """Return ``lrelu(bn(conv(x)))``."""
        return self.lrelu(self.bn(self.conv(x)))

class FeatureMapModule(nn.Module):
    """Encoder stage without normalisation: 4x4 stride-2 convolution + LeakyReLU.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by the convolution.
        leaky_relu_slope: Negative slope handed to ``nn.LeakyReLU``.
    """

    def __init__(self, in_channels, out_channels, leaky_relu_slope=0.2):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=4,
            stride=2,
            padding=1,
        )
        self.lrelu = nn.LeakyReLU(leaky_relu_slope)

    def forward(self, x):
        """Return ``lrelu(conv(x))``."""
        return self.lrelu(self.conv(x))

class DecoderModule(nn.Module):
    """Decoder stage: 4x4 stride-2 transposed convolution, batch norm, dropout, ReLU.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by the transposed convolution.
        dropout_prob: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, in_channels, out_channels, dropout_prob=0.5):
        super().__init__()
        self.deconv = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size=4,
            stride=2,
            padding=1,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(dropout_prob)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(dropout(bn(deconv(x))))``; dropout runs before the ReLU."""
        normalised = self.bn(self.deconv(x))
        return self.relu(self.dropout(normalised))

class OutputModule(nn.Module):
    """Final stage: a single 4x4 stride-2 transposed convolution, no activation.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count of the produced tensor.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.deconv = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size=4,
            stride=2,
            padding=1,
        )

    def forward(self, x):
        """Return ``deconv(x)`` unchanged by any activation."""
        return self.deconv(x)

class Autoencoder(nn.Module):
    """
    Autoencoder model for image generation

    An encoder-decoder network whose decoder stages receive the matching
    encoder activations through channel-wise concatenation (skip connections).
    The final model will be an image-to-image translation model
    that enhances underwater images.

    Args:
        DEBUG (bool): When True, ``forward`` prints the tensor shape after
            every stage.
    """
    def __init__(self, DEBUG=False):
        super().__init__()
        self.DEBUG = DEBUG

        # Eight stride-2 stages; the last one has no batch norm.
        self.EncoderLayers = nn.ModuleList([
            EncoderModule(3, 64),
            EncoderModule(64, 128),
            EncoderModule(128, 256),
            EncoderModule(256, 512),
            EncoderModule(512, 512),
            EncoderModule(512, 512),
            EncoderModule(512, 512),
            FeatureMapModule(512, 512),
        ])

        # Every decoder stage except the first consumes the previous decoder
        # output concatenated with an encoder activation, hence the doubled
        # input channel counts.
        self.DecoderLayers = nn.ModuleList([
            DecoderModule(512, 512),
            DecoderModule(1024, 512),
            DecoderModule(1024, 512),
            DecoderModule(1024, 512, dropout_prob=0.0),
            DecoderModule(1024, 256, dropout_prob=0.0),
            DecoderModule(512, 128, dropout_prob=0.0),
            DecoderModule(256, 64, dropout_prob=0.0),
        ])

        self.OutputLayer = OutputModule(128, 3)
        # NOTE: Not actually in the paper. tanh bounds the output to [-1, 1];
        # rescale to [0, 1] downstream if the consumer expects that range.
        self.tanh = nn.Tanh()


    def forward(self, x):
        """Forward pass for the autoencoder model.

        Args:
            x (th.Tensor): Input image tensor

        Returns:
            th.Tensor: Output image tensor with values in [-1, 1]

        Raises:
            ValueError: If a decoder activation and the encoder activation it
                should be concatenated with have different shapes, which
                happens when the input height/width cannot be halved and
                doubled back consistently by the stride-2 stages.
        """
        # Store the activations of the encoder layers for skip connections
        layer_outputs = []

        if self.DEBUG:
            print("Starting forward pass")
            print(x.shape)

        # Encoder pass: every output except the bottleneck is kept as a skip.
        last_encoder_index = len(self.EncoderLayers) - 1
        for i, encoder in enumerate(self.EncoderLayers):
            x = encoder(x)
            if i < last_encoder_index:
                layer_outputs.append(x)
            if self.DEBUG:
                print(x.shape)

        if self.DEBUG:
            print("Encoding complete")
            print(x.shape)

        # Decoder pass: the first stage sees only the bottleneck; later stages
        # see the previous output joined with the most recent unused skip.
        for i, decoder in enumerate(self.DecoderLayers):

            if i != 0:
                # Get the appropriate encoder activation
                s = layer_outputs.pop()

                # Fail loudly instead of returning an empty tensor, which would
                # only surface as a confusing error further downstream.
                if x.shape != s.shape:
                    raise ValueError(
                        "Skip connection shape mismatch before decoder layer "
                        f"{i}: decoder activation {tuple(x.shape)} vs encoder "
                        f"activation {tuple(s.shape)}"
                    )
                x = th.cat((x, s), 1)

            # Pass the concatenated activations through the decoder layer
            x = decoder(x)
            if self.DEBUG:
                print(x.shape)

        if self.DEBUG:
            print("Decoding complete")

        # Perform the final deconvolution on the last decoder output joined
        # with the first encoder activation.
        x = th.cat((x, layer_outputs.pop()), 1)
        x = self.OutputLayer(x)
        x = self.tanh(x)

        if self.DEBUG:
            print("Is layer_outputs empty:", len(layer_outputs) == 0)
            print(x.shape)
            print("Output complete")

        return x

In [3]:
!pip install ptflops

Collecting ptflops
  Downloading ptflops-0.7.5-py3-none-any.whl.metadata (9.4 kB)
Downloading ptflops-0.7.5-py3-none-any.whl (19 kB)
Installing collected packages: ptflops
Successfully installed ptflops-0.7.5


In [10]:
import torch
import time
import numpy as np
from ptflops import get_model_complexity_info

def benchmark_efficiency(model=None, input_shape=(3, 256, 256), iterations=100,
                         warmup=50, flops_per_mac=3.0):
    """Print and return size, complexity, latency and throughput of a model.

    Runs on CUDA when available and on CPU otherwise. Latency is measured for
    a batch of one, one forward pass at a time, with a device synchronisation
    after each pass when running on CUDA.

    Args:
        model: ``nn.Module`` to benchmark. Defaults to a fresh ``Autoencoder()``.
        input_shape: ``(C, H, W)`` of one input sample.
        iterations: Number of timed forward passes; must be at least 1.
        warmup: Number of untimed forward passes run first; must be >= 0.
        flops_per_mac: Factor applied to the MAC count reported by ptflops to
            obtain the printed FLOPs. The default of 3.0 keeps the numbers
            comparable with earlier runs of this notebook.
            NOTE(review): the usual convention is 2 FLOPs per MAC — confirm
            which convention the reported figures should follow.

    Returns:
        dict: ``params`` (count), ``macs`` (count), ``gflops``,
        ``avg_latency_ms`` and ``fps``.

    Raises:
        ValueError: If ``iterations`` < 1 or ``warmup`` < 0.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1")
    if warmup < 0:
        raise ValueError("warmup must be non-negative")

    # 1. Setup Environment
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if model is None:
        model = Autoencoder()
    model = model.to(device).eval()

    def _sync():
        # CUDA kernels run asynchronously; on CPU there is nothing to wait for
        # and torch.cuda.synchronize() would raise.
        if use_cuda:
            torch.cuda.synchronize()

    device_name = torch.cuda.get_device_name(0) if use_cuda else "CPU"
    print(f"Benchmarking on: {device_name}")

    # 2. Complexity Analysis (Structural Metrics)
    # We use as_strings=False to get precise numerical values
    with torch.no_grad():
        macs, params = get_model_complexity_info(
            model, input_shape, as_strings=False, print_per_layer_stat=False
        )

    # ptflops reports multiply-accumulate operations; scale them to FLOPs.
    calculated_flops = (macs * flops_per_mac) / 1e9

    # 3. Latency & Throughput (Performance Metrics)
    # Prepare input tensor directly on the device
    x = torch.randn(1, *input_shape, device=device)

    latencies = []
    with torch.no_grad():
        # Warm-up phase so one-time initialisation cost is not timed
        for _ in range(warmup):
            model(x)

        # Ensure all warm-up kernels are finished before starting the timer
        _sync()

        for _ in range(iterations):
            # perf_counter is monotonic and has the highest available resolution
            start = time.perf_counter()
            model(x)
            # Synchronize after every pass so the full execution is timed
            _sync()
            end = time.perf_counter()
            latencies.append((end - start) * 1000)

    avg_lat = float(np.mean(latencies))
    avg_fps = 1000.0 / avg_lat

    # 4. Final Output
    print("\n" + "="*45)
    print("       MODEL EFFICIENCY REPORT")
    print("="*45)
    print(f"📌 Parameters:   {params / 1e6:.2f} M")
    print(f"📌 GFLOPs:       {calculated_flops:.2f} G")
    print(f"⏱  Avg Latency:  {avg_lat:.2f} ms")
    print(f"⚡ Throughput:   {avg_fps:.2f} FPS")
    print("="*45)
    print("Measurement: Standard Synchronous GPU Inference")
    print("="*45 + "\n")

    return {
        "params": params,
        "macs": macs,
        "gflops": calculated_flops,
        "avg_latency_ms": avg_lat,
        "fps": avg_fps,
    }

# Entry point: run the benchmark with its default settings when this code is
# executed as the main module.
if __name__ == "__main__":
    benchmark_efficiency()

Benchmarking on: Tesla T4

       MODEL EFFICIENCY REPORT
📌 Parameters:   54.42 M
📌 GFLOPs:       18.20 G
⏱  Avg Latency:  7.15 ms
⚡ Throughput:   139.95 FPS
Measurement: Standard Synchronous GPU Inference

