In [9]:
# System libraries
import os
import sys
import time
from typing import Optional
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..')))

# Pytorch/TIMM libraries
import timm
from timm.models.helpers import model_parameters
import torch.nn as nn
import torch
from torchprofile import profile_macs
from torch.profiler import profile, record_function, ProfilerActivity
from fvcore.nn import FlopCountAnalysis, parameter_count

# Helper libraries
import numpy as np
import json
from PIL import Image
from collections import defaultdict

# Data visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ace_tools_open as tools
from torch.utils.tensorboard import SummaryWriter

# Environment variables
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

# Commands
# tensorboard --logdir=./my_work/v2/logs

In [3]:
# Uncomment to list the ViT variants timm offers (names containing 'vitamin' are filtered out):
# models = [model for model in timm.list_models('vit*')]
# print([x for x in models if 'vitamin' not in str(x)])

# timm model identifier handed to Analyzer in the cells below.
model_name = "vit_base_patch16_224"

# TODO: Learn how to use SummaryWriter.
# TODO: Learn what CUDA AMP is.
# TODO: Fix the terrible-looking plots in TensorBoard.
# TODO: Create better plots for layers.

In [10]:
class Analyzer:
    def __init__(self, model_name, device="cpu", pretrained=True, pretrained_cfg=None, pretrained_cfg_overlay=None,
                checkpoint_path='', scriptable=None, exportable=None, no_jit=True):
        self.model_name = model_name
        self.device = torch.device(device if torch.cuda.is_available() and device == "cuda" else "cpu")
        print(f"Using device: {self.device}")
        self.model = timm.create_model(
            model_name, pretrained=pretrained, pretrained_cfg=pretrained_cfg, pretrained_cfg_overlay=pretrained_cfg_overlay,
            checkpoint_path=checkpoint_path, scriptable=scriptable, exportable=exportable, no_jit=no_jit
        ).to(self.device)
        self.profiler = None  
        
        # print(self.model.default_cfg)
        # params = sum(p.numel() for p in self.model.parameters())
        # print(f"Number of parameters: {params / 1e6:.2f}M")
    
    def inference_one(self, input_tensor: torch.Tensor) -> None:
        """Run inference on a single input tensor."""
        input_tensor = input_tensor.to(self.device)
        self.model.eval()  
        with torch.no_grad():
            output = self.model(input_tensor)
            print(f"Model device: {next(self.model.parameters()).device}")
            print(f"Input tensor device: {input_tensor.device}")

        return output
    
    def start_profiler(self, create_logfile=False) -> None:
        """Initialize and start the profiler."""
        if create_logfile:
            trace_handler = torch.profiler.tensorboard_trace_handler('./logs')
        else:
            trace_handler = None
            
        self.profiler = profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            on_trace_ready=trace_handler,
            record_shapes=True,
            with_stack=True,
            with_flops=True,
            profile_memory=True,
            with_modules=True
        )
        self.profiler.__enter__()  # Start the profiler context manually

    def stop_profiler(self) -> None:
        """Stop the profiler and process the collected data."""
        if self.profiler:
            self.profiler.__exit__(None, None, None) 
            
    def is_cuda_initialized(self, logs=False) -> None:
        for name, param in self.model.named_parameters():
            if param.device.type != "cuda":
                if logs:	
                    print(f"Parameter {name} is not on GPU")
                else:
                    print("Some or all model parameters are not on GPU")
                    return
                # return False
        print("All model parameters are on the GPU")
        # return True

    def list_events(self, show_key_averages=False, list_all=False) -> None:
        """
        List all recorded events from the profiler.	
        name	         - The name of the operation (e.g., aten::add, aten::matmul, aten::conv2d).
		cpu_time_total	 - Total time spent on the CPU for this operation, in microseconds.
		cuda_time_total	 - Total time spent on the GPU for this operation, in microseconds.
		input_shapes	 - Shapes of the tensors used as inputs to this operation.
		output_shapes	 - Shapes of the tensors produced by this operation (if applicable).
		device_type	     - Whether the operation was executed on CPU or CUDA.
		device	         - The device ID on which the operation was executed.
		self_cpu_time	 - Time spent on the CPU for this operation alone (excluding time for child operations).
		self_cuda_time   - Time spent on the GPU for this operation alone (excluding time for child operations).
		"""
        if self.profiler is None:
            print("Profiler has not been initialized or profiling session has ended.")
        else:
            if show_key_averages:	
                print(self.profiler.key_averages().table())
            if list_all:
                for event in self.profiler.events():
                    print(f"Name: {event.name}, CPU Time: {event.cpu_time_total}, CUDA Time: {event.cuda_time_total}")
    
    def event_handler(self, create_csv_file=False, log_to_tensorboard=False, plot_events=False, log_dir='logs') -> Optional[pd.DataFrame]:
        if self.profiler is None:
            print("Profiler has not been initialized or profiling session has ended.")
        else:         
            key_averages = self.profiler.key_averages()
            
            def helper_list_events(key_averages=key_averages):
                # Helper code to print all events and their attributes
                for i, event in enumerate(key_averages):
                    print(f"Event {i + 1} name: {event.key}")
                    attributes = [attr for attr in dir(event) if not attr.startswith("_") and not callable(getattr(event, attr))]
                    for attr in attributes:
                        print(f"  {attr}: {getattr(event, attr)}")
                    print("-" * 50)  # Separator between events
            # helper_list_events()
                
            profiler_data = []
            
            if log_to_tensorboard:
                writer = SummaryWriter(log_dir=log_dir) 
                
            # Usually collapsed 
            for event in key_averages:
                if not event.key == "[memory]":
                    cpu_children_time = event.cpu_time_total - event.self_cpu_time_total
                    cuda_children_time = event.device_time_total - event.self_device_time_total
                    
                    event_data = {
                        "Name": event.key,
                        "Count": event.count,
                        # Timing attributes
                        "CPU time op only": event.self_cpu_time_total,
                        "CPU time total (+children)": event.cpu_time_total,
                        "CPU children time": cpu_children_time,
						"CUDA time op only": event.self_device_time_total,
						"CUDA time total (+children)": event.device_time_total,
						"CUDA children time": cuda_children_time,	
						# Memory attributes
						"CPU memory usage (+children)": event.cpu_memory_usage,
						"CPU memory usage op only": event.self_cpu_memory_usage,
						"CUDA memory usage (+children)": event.device_memory_usage,
						"CUDA memory usage op only": event.self_device_memory_usage,
						# Performance
						"Flops": event.flops,	
						"Is Async": event.is_async,
						"Input Shapes": event.input_shapes,
						"Stack": event.stack
					}
                    
                    profiler_data.append(event_data)
                    
                    # Log metrics to TensorBoard
                    if log_to_tensorboard:
                        writer.add_scalar("Performance/CPU_time_op_only", event.self_cpu_time_total, global_step=event.count)
                        writer.add_scalar("Performance/CPU_time_total", event.cpu_time_total, global_step=event.count)
                        writer.add_scalar("Performance/CPU_children_time", cpu_children_time, global_step=event.count)
                        writer.add_scalar("Performance/CUDA_time_op_only", event.self_device_time_total, global_step=event.count)
                        writer.add_scalar("Performance/CUDA_time_total", event.device_time_total, global_step=event.count)
                        writer.add_scalar("Performance/CUDA_children_time", cuda_children_time, global_step=event.count)
                        writer.add_scalar("Performance/FLOPS", event.flops, global_step=event.count)
                        writer.add_scalar("Memory/CPU_memory_usage_children", event.cpu_memory_usage, global_step=event.count)
                        writer.add_scalar("Memory/CUDA_memory_usage_children", event.device_memory_usage, global_step=event.count)
                        
            df = pd.DataFrame(profiler_data)
	
            if create_csv_file:
                df.to_csv(f"./operations/{self.model_name}_profiler_data.csv", index=False)
                
            if log_to_tensorboard:
                writer.close()
                
            if plot_events:
                numeric_columns = [
					"CPU time op only", "CPU time total (+children)", "CPU children time",
					"CUDA time op only", "CUDA time total (+children)", "CUDA children time",
					"CPU memory usage (+children)", "CPU memory usage op only",
					"CUDA memory usage (+children)", "CUDA memory usage op only", "Flops"
				]
                df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
                df = df.dropna()

				# Usually collapsed 
                def helper_plot_events(df=df):
                    # CPU vs CUDA Time (Operation Only)
                    plt.figure(figsize=(14, 6))
                    df_melted = df.melt(
						id_vars=["Name"], 
						value_vars=["CPU time op only", "CUDA time op only"],
						var_name="Type", 
						value_name="Time (us)"
					)
                    sns.barplot(data=df_melted, x="Name", y="Time (us)", hue="Type")
                    plt.xticks(rotation=90)
                    plt.title("CPU vs CUDA Time (Operation Only)")
                    plt.show()
					
                helper_plot_events()  
            
            return df
        
    def analyze_layers(self, create_csv=False) -> Optional[pd.DataFrame]:
        """Uses a recursive function to extract layer information from the model."""
        layers_data = []
    
        def extract_layer_info(name, module, parent_name="") -> None:
			# Define layer_info here and update it dynamically	
            layer_info = {
				"Name": f"{parent_name}.{name}" if parent_name else name,
				"Type": module.__class__.__name__,
			}
                        
            layers_data.append(layer_info)
            
        def recurse_layers(module, parent_name="") -> None:
            for name, child in module.named_children():
                extract_layer_info(name, child, parent_name)
                recurse_layers(child, parent_name=f"{parent_name}.{name}" if parent_name else name)

        recurse_layers(self.model)

        df = pd.DataFrame(layers_data)
        if create_csv:
            df.to_csv(f"./layers/{self.model_name}_layers.csv", index=False)

        return df
    
    
    def measure_inference_time(self, mode='single', num_warmup_runs=10, num_measurement_runs=10, batch_size=None) -> Optional[float]:
        self.model.eval()
        input_size = self.model.default_cfg.get('input_size', (3, 224, 224))

        if mode == 'single':
            # Single image inference
            batch_size = 1
            print("Measuring inference time for single image...")
        elif mode == 'batch':
            # Batch inference
            if batch_size is None:
                batch_size = 512  # Default batch size from ViT paper
            print(f"Measuring inference time for batch size {batch_size}...")
        else:
            raise ValueError("Mode must be 'single' or 'batch'.")

        # Prepare input tensor
        sample_input = torch.randn(batch_size, *input_size).to(self.device)

        # Warm-up runs
        with torch.no_grad():
            for _ in range(num_warmup_runs):
                _ = self.model(sample_input)
            # Ensure all operations are complete
            if self.device.type == 'cuda':
                torch.cuda.synchronize()

        # Measurement runs
        inference_times = []
        with torch.no_grad():
            for _ in range(num_measurement_runs):
                start_time = time.time()
                _ = self.model(sample_input)
                if self.device.type == 'cuda':
                    torch.cuda.synchronize()
                end_time = time.time()
                inference_time = (end_time - start_time) * 1000  # Convert to milliseconds
                inference_times.append(inference_time)

        avg_inference_time_ms = sum(inference_times) / len(inference_times)
        print(f"Average Inference Time: {avg_inference_time_ms:.2f} ms")

        # return avg_inference_time_ms
    
    def analyze_performance(self, create_csv=False) -> Optional[pd.DataFrame]:
        """
        Analyze the performance of each layer in the model using forward hooks,
        identify bottlenecks, and optionally save the data to a CSV file.
		"""

		# Dictionary to store timing information
        timings = {}
		
        def register_hooks(module, name):
            """
			Registers forward hooks to measure the execution time of each module.
			"""
            def pre_forward_hook(module, input):
                module.start_time = time.perf_counter()
				
            def post_forward_hook(module, input, output):
                elapsed_time = (time.perf_counter() - module.start_time) * 1000  # Convert to milliseconds
                if name in timings:
                    timings[name]["Total Time (ms)"] += elapsed_time
                    timings[name]["Calls"] += 1
                else:
                    timings[name] = {"Total Time (ms)": elapsed_time, "Calls": 1}
					
            module.register_forward_pre_hook(pre_forward_hook)
            module.register_forward_hook(post_forward_hook)
		
		# Register hooks for all submodules
        for name, module in self.model.named_modules():
            register_hooks(module, name)
		
		# Prepare a sample input tensor
        input_size = self.model.default_cfg.get('input_size', (3, 224, 224))
        sample_input = torch.randn(1, *input_size).to(self.device)
		
		# Run inference to trigger the hooks
        self.inference_one(sample_input)
		
		# Remove hooks to avoid side effects
        for module in self.model.modules():
            module._forward_pre_hooks.clear()
            module._forward_hooks.clear()
		
		# Compile timing data into a DataFrame
        data = []
        for name, info in timings.items():
            data.append({
				"Layer Name": name,
				"Total Time (ms)": info["Total Time (ms)"],
				"Calls": info["Calls"],
				"Average Time per Call (ms)": info["Total Time (ms)"] / info["Calls"]
			})
		
        df = pd.DataFrame(data)
        df = df.sort_values(by="Total Time (ms)", ascending=False)
		
        if create_csv:
            df.to_csv(f"./performance/{self.model_name}_layer_timings.csv", index=False)
		
        print("Top bottlenecks:")
        print(df.head(10))
		
        return df
			
        
        
        
        
        # """
        # Analyzes the performance of each layer in the model.

        # Parameters:
        # - input_size (tuple): Input size excluding batch dimension, e.g., (3, 224, 224).
        # - create_csv (bool): Whether to save the results to a CSV file.

        # Returns:
        # - df (pd.DataFrame): DataFrame containing per-layer performance metrics.
        # """
        # print("Analyzing per-layer performance...")

        # # Use default input size if not provided
        # if input_size is None:
        #     input_size = self.model.default_cfg.get('input_size', (3, 224, 224))

        # # Prepare a sample input
        # sample_input = torch.randn(1, *input_size).to(self.device)

        # # Dictionary to store metrics
        # layer_metrics = {}

        # # Function to register hooks
        # def register_hooks(module, name):

        #     def pre_forward_hook(module, input):
        #         module.__start_time = time.time()

        #     def forward_hook(module, input, output):
        #         elapsed_time = (time.time() - module.__start_time) * 1000  # Convert to milliseconds
        #         layer_metrics[name]['Execution Time (ms)'] = elapsed_time

        #     # Only register hooks to leaf modules
        #     if len(list(module.children())) == 0:
        #         layer_metrics[name] = {
        #             'Type': module.__class__.__name__,
        #             'Parameters': sum(p.numel() for p in module.parameters() if p.requires_grad),
        #             'MACs': 0,
        #             'Execution Time (ms)': 0,
        #             'Memory Usage (MB)': 0,
        #         }
        #         module.register_forward_pre_hook(pre_forward_hook)
        #         module.register_forward_hook(forward_hook)

        # # Register hooks
        # for name, module in self.model.named_modules():
        #     register_hooks(module, name)

        # # Use PyTorch Profiler to collect detailed metrics
        # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        #             record_shapes=True,
        #             profile_memory=True,
        #             with_stack=True) as prof:
        #     with torch.no_grad():
        #         self.model(sample_input)

        # # Remove hooks
        # for module in self.model.modules():
        #     module._backward_hooks = {}
        #     module._forward_hooks = {}
        #     module._forward_pre_hooks = {}

        # # Process profiler events
        # prof_data = prof.key_averages(group_by_input_shape=True)

        # for event in prof_data:
        #     module_name = event.key
        #     if module_name in layer_metrics:
        #         layer_metrics[module_name]['MACs'] = event.flops  # Number of FLOPs
        #         layer_metrics[module_name]['Memory Usage (MB)'] = event.self_cpu_memory_usage / (1024 ** 2)  # Convert to MB

        # # Convert the metrics dictionary to a DataFrame
        # df = pd.DataFrame.from_dict(layer_metrics, orient='index')
        # df.reset_index(inplace=True)
        # df.rename(columns={'index': 'Name'}, inplace=True)

        # # Sort DataFrame by Execution Time
        # df.sort_values(by='Execution Time (ms)', ascending=False, inplace=True)

        # if create_csv:
        #     df.to_csv(f"./performance/{self.model_name}_performance.csv", index=False)

        # print("Per-layer performance analysis completed.")
        # return df

In [5]:
def _clear_directory(path):
    """Delete the files located directly inside ``path``.

    A missing directory is ignored, and sub-directories are left untouched
    (``os.remove`` cannot delete them).
    """
    if not os.path.isdir(path):
        return
    for entry_name in os.listdir(path):
        entry_path = os.path.join(path, entry_name)
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.remove(entry_path)


def del_logs():
    """Remove the files in ``./logs``."""
    _clear_directory('./logs')


def del_operations():
    """Remove the files in ``./operations``."""
    _clear_directory('./operations')


def del_layers():
    """Remove the files in ``./layers``."""
    _clear_directory('./layers')


def del_performance():
    """Remove the files in ``./performance``."""
    _clear_directory('./performance')

In [11]:
# Start from a clean slate: delete the files left in ./logs, ./operations,
# ./layers and ./performance by earlier runs. This is destructive.
del_logs()
del_operations()
del_layers()
del_performance()

In [12]:
# Build the model; Analyzer falls back to the CPU when CUDA is unavailable.
analyzer = Analyzer(model_name, device="cuda")
input_tensor = torch.randn(1, 3, 224, 224)

# Profile exactly one forward pass: the profiler is opened, inference runs, the profiler is closed.
analyzer.start_profiler(create_logfile=False)  # Creates logs of each operation
analyzer.inference_one(input_tensor)
analyzer.stop_profiler()
analyzer.is_cuda_initialized(logs=False)
print("Initialization done")
print("------------------------------------------------------------------------")
# With both flags off this call prints nothing unless no profiler exists.
analyzer.list_events(show_key_averages=False, list_all=False)

# Tabulate the profiled operators and the model's layer tree (nothing is written to disk here).
events_df = analyzer.event_handler(create_csv_file=False, log_to_tensorboard=False, plot_events=False)
layers_df = analyzer.analyze_layers(create_csv=False)

# Average latency of a single-image forward pass (10 warm-up runs, 10 timed runs).
analyzer.measure_inference_time(mode='single', num_warmup_runs=10, num_measurement_runs=10)

# Per-layer timings via forward hooks; this one writes a CSV under ./performance.
performance_df = analyzer.analyze_performance(create_csv=True) 

Using device: cuda
Model device: cuda:0
Input tensor device: cuda:0
All model parameters are on the GPU
Initialization done
------------------------------------------------------------------------
Measuring inference time for single image...
Average Inference Time: 21.98 ms
Model device: cuda:0
Input tensor device: cuda:0
Top bottlenecks:
    Layer Name  Total Time (ms)  Calls  Average Time per Call (ms)
251                     10.9134      1                     10.9134
246     blocks          10.1605      1                     10.1605
25    blocks.0           1.4680      1                      1.4680
165   blocks.7           0.9619      1                      0.9619
145   blocks.6           0.8989      1                      0.8989
125   blocks.5           0.8711      1                      0.8711
45    blocks.1           0.8400      1                      0.8400
185   blocks.8           0.7501      1                      0.7501
205   blocks.9           0.7312      1                  