# 📊 Lecture 2: ML Efficiency Basics - Complete Demo

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gaurav-redhat/efficientml_course/blob/main/02_basics/demo.ipynb)

## What You'll Learn
- How to count FLOPs for different layer types
- Memory bandwidth and arithmetic intensity
- Roofline model analysis
- Bottleneck identification (compute vs memory bound)

In [None]:
!pip install torch matplotlib numpy -q
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

print('Ready for efficiency analysis!')

## Part 1: Counting FLOPs (Floating Point Operations)

Understanding computational cost is the first step to optimization.

In [None]:
def count_flops_linear(in_features, out_features, batch_size=1):
    """
    Estimate the FLOPs of a linear layer, y = Wx + b.

    Every multiply-accumulate is counted as two operations, giving
    FLOPs = batch_size × (2 × in_features × out_features).
    """
    # One multiply-accumulate per (sample, input feature, output feature).
    macs = batch_size * in_features * out_features
    # A length-`in_features` dot product needs one addition fewer than it has
    # multiplies; the bias add makes up the difference, so 2 per MAC holds.
    return 2 * macs

def count_flops_conv2d(in_channels, out_channels, kernel_size, height, width,
                       batch_size=1, *, stride=1, padding=0):
    """
    Estimate the FLOPs of a 2D convolution with a square kernel.

    FLOPs = batch × out_h × out_w × out_ch × in_ch × K × K × 2
    (each multiply-accumulate is counted as two operations), where
    out_h = (height + 2 × padding - K) // stride + 1, and likewise for out_w.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels (filters).
        kernel_size: Side length K of the square kernel.
        height: Input height.
        width: Input width.
        batch_size: Number of samples in the batch.
        stride: Step of the kernel along both spatial axes (default 1).
        padding: Zero-padding added to each side of both axes (default 0).

    Returns:
        Total FLOPs as an int; 0 when the kernel does not fit in the
        (padded) input.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
    """
    if stride < 1:
        raise ValueError(f'stride must be >= 1, got {stride}')

    k = kernel_size
    # Clamp at zero: without it, a kernel larger than BOTH spatial dims gives
    # two negative sizes whose product is a bogus positive FLOP count.
    out_h = max(0, (height + 2 * padding - k) // stride + 1)
    out_w = max(0, (width + 2 * padding - k) // stride + 1)

    flops_per_output = 2 * in_channels * k * k
    return batch_size * out_h * out_w * out_channels * flops_per_output

def count_flops_attention(seq_len, d_model, n_heads):
    """
    Estimate the FLOPs of one self-attention layer.

    Terms (multiply-accumulate counted as two operations):
    - Q, K, V projections: 3 × (2 × seq × d × d)
    - Attention scores:    2 × seq × seq × d
    - Scores @ V:          2 × seq × seq × d
    - Output projection:   2 × seq × d × d

    `n_heads` is accepted for interface completeness but does not enter the
    total computed here.
    """
    # Cost of one (seq × d) @ (d × d) projection.
    one_projection = 2 * seq_len * d_model * d_model
    # Cost of one (seq × seq) interaction over d features.
    one_seq_interaction = 2 * seq_len * seq_len * d_model

    # Three input projections + scores + weighted values + output projection.
    return (3 * one_projection
            + one_seq_interaction
            + one_seq_interaction
            + one_projection)

# Example calculations: print the FLOP estimate for one layer of each type.
# Labels are built from the same variables passed to the counters so the
# printed dimensions can never drift from the computed numbers.
print('📊 FLOP CALCULATIONS')
print('=' * 60)

# Linear layer
linear_in, linear_out = 768, 3072
linear_flops = count_flops_linear(linear_in, linear_out, batch_size=1)
print(f'\n1️⃣ Linear Layer ({linear_in} → {linear_out}):')
print(f'   FLOPs = {linear_flops:,} = {linear_flops/1e6:.2f}M')

# Conv2D
conv_in, conv_out, conv_k, conv_hw = 64, 128, 3, 224
conv_flops = count_flops_conv2d(conv_in, conv_out, conv_k, conv_hw, conv_hw)
print(f'\n2️⃣ Conv2D ({conv_in}→{conv_out}, {conv_k}×{conv_k}, {conv_hw}×{conv_hw}):')
print(f'   FLOPs = {conv_flops:,} = {conv_flops/1e9:.2f}G')

# Self-attention
attn_seq, attn_d, attn_heads = 512, 768, 12
attn_flops = count_flops_attention(attn_seq, attn_d, attn_heads)
print(f'\n3️⃣ Self-Attention (seq={attn_seq}, d={attn_d}):')
print(f'   FLOPs = {attn_flops:,} = {attn_flops/1e9:.2f}G')

## Part 2: Memory Bandwidth Analysis

In [None]:
def memory_bytes_linear(in_features, out_features, batch_size=1, dtype_bytes=4):
    """
    Estimate the bytes moved by a linear layer.

    Counts three tensors, each element taking `dtype_bytes` bytes:
    - weights read:   in × out
    - input read:     batch × in
    - output written: batch × out
    """
    element_counts = (
        in_features * out_features,   # weight matrix
        batch_size * in_features,     # input activations
        batch_size * out_features,    # output activations
    )
    return sum(count * dtype_bytes for count in element_counts)

def arithmetic_intensity(flops, bytes_accessed):
    """
    Return the arithmetic intensity of an operation, in FLOPs per byte.

    This is simply `flops / bytes_accessed`; a larger value means more
    compute is done for every byte of memory traffic.
    """
    flops_per_byte = flops / bytes_accessed
    return flops_per_byte

# Compare different batch sizes: the weight matrix is read once per batch, so
# FLOPs grow with the batch while memory traffic barely moves.
print('📈 ARITHMETIC INTENSITY vs BATCH SIZE')
print('=' * 60)
print(f'{"Batch":<10} {"FLOPs":<15} {"Memory":<15} {"AI (FLOPs/Byte)":<15}')
print('-' * 60)

batch_sizes = [1, 8, 32, 128, 512]
ais = []

for bs in batch_sizes:
    flops = count_flops_linear(768, 3072, bs)
    mem = memory_bytes_linear(768, 3072, bs)
    ai = arithmetic_intensity(flops, mem)
    ais.append(ai)
    print(f'{bs:<10} {flops:>12,}   {mem:>12,}   {ai:>12.2f}')

# H100 ridge point = peak compute / memory bandwidth, using the same H100
# figures as the roofline section (989 TFLOPS, 3350 GB/s) so the two plots
# agree. Below this line the layer is memory-bound.
h100_ridge = 989e12 / 3350e9  # ≈ 295 FLOPs/Byte

# Visualize
plt.figure(figsize=(10, 5))
plt.bar([str(b) for b in batch_sizes], ais, color='#3b82f6')
plt.xlabel('Batch Size')
plt.ylabel('Arithmetic Intensity (FLOPs/Byte)')
plt.title('📈 Arithmetic Intensity Increases with Batch Size')
plt.axhline(y=h100_ridge, color='r', linestyle='--',
            label=f'H100 ridge point (≈{h100_ridge:.0f} FLOPs/Byte)')
plt.legend()
plt.show()

print('\n💡 Insight: Larger batches = better hardware utilization!')

## Part 3: The Roofline Model

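The roofline model shows whether a workload is compute-bound or memory-bound by comparing its arithmetic intensity with the hardware's ridge point (peak FLOPS ÷ memory bandwidth).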

In [None]:
# GPU specifications: peak compute in TFLOPS, memory bandwidth in GB/s
gpus = {
    'A100': {'compute_tflops': 312, 'memory_bw': 2039},  # TF32 peak
    'H100': {'compute_tflops': 989, 'memory_bw': 3350},
    'RTX 4090': {'compute_tflops': 82, 'memory_bw': 1008},
    'M2 Pro': {'compute_tflops': 3.6, 'memory_bw': 200},
}

# Plot roofline for H100
gpu = 'H100'
compute_peak = gpus[gpu]['compute_tflops'] * 1e12  # Convert to FLOPS
memory_bw = gpus[gpu]['memory_bw'] * 1e9  # Convert to Bytes/s

fig, ax = plt.subplots(figsize=(12, 6))

# Arithmetic intensity range
ai_range = np.logspace(-2, 4, 1000)

# Roofline: min(peak_compute, ai × memory_bw)
performance = np.minimum(compute_peak, ai_range * memory_bw)

ax.loglog(ai_range, performance / 1e12, 'b-', linewidth=2, label='Roofline')

# Ridge point: the intensity at which the bandwidth roof meets the compute roof
ridge_point = compute_peak / memory_bw
ax.axvline(x=ridge_point, color='r', linestyle='--', alpha=0.5)
ax.text(ridge_point * 1.2, 100, f'Ridge Point\nAI = {ridge_point:.1f}', fontsize=10)

# Plot some operations (illustrative arithmetic intensities, FLOPs/Byte)
operations = {
    'MatMul (small batch)': 0.5,
    'MatMul (large batch)': 200,
    'Attention (short seq)': 10,
    'Attention (long seq)': 150,
    'LayerNorm': 1,
    'Softmax': 2,
}

for name, ai in operations.items():
    perf = min(compute_peak, ai * memory_bw) / 1e12
    marker = 'o' if ai < ridge_point else 's'
    color = 'red' if ai < ridge_point else 'green'
    # Points are drawn at 70% of the roof so they sit visibly below the line.
    ax.scatter(ai, perf * 0.7, marker=marker, s=100, c=color, zorder=5)
    ax.annotate(name, (ai, perf * 0.7), xytext=(5, 10), textcoords='offset points', fontsize=9)

ax.set_xlabel('Arithmetic Intensity (FLOPs/Byte)', fontsize=12)
ax.set_ylabel('Performance (TFLOPS)', fontsize=12)
ax.set_title(f'📊 Roofline Model for {gpu}', fontsize=14)
ax.set_xlim(0.01, 10000)
ax.set_ylim(0.1, 2000)
ax.grid(True, alpha=0.3)

# Add regions
ax.fill_between([0.01, ridge_point], [0.1, 0.1], [2000, 2000], alpha=0.1, color='red', label='Memory Bound')
ax.fill_between([ridge_point, 10000], [0.1, 0.1], [2000, 2000], alpha=0.1, color='green', label='Compute Bound')

# Build the legend only after every labelled artist exists; calling it before
# the shaded regions were added left them out of the legend.
ax.legend()

plt.tight_layout()
plt.show()

print('\n🔴 Red points = Memory-bound (limited by memory bandwidth)')
print('🟢 Green points = Compute-bound (limited by FLOPS)')

## Part 4: Comparing Different Model Architectures

In [None]:
# Real model analysis: parameters in millions, FLOPs in billions (G)
models = {
    'ResNet-50': {'params': 25.6, 'flops': 4.1, 'type': 'CNN'},
    'EfficientNet-B0': {'params': 5.3, 'flops': 0.39, 'type': 'CNN'},
    'ViT-B/16': {'params': 86, 'flops': 17.6, 'type': 'Transformer'},
    'DeiT-S': {'params': 22, 'flops': 4.6, 'type': 'Transformer'},
    'BERT-base': {'params': 110, 'flops': 22, 'type': 'NLP'},
    'DistilBERT': {'params': 66, 'flops': 11, 'type': 'NLP'},
}

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Parameters vs FLOPs
colors = {'CNN': '#3b82f6', 'Transformer': '#ef4444', 'NLP': '#22c55e'}
labelled_types = set()
for name, data in models.items():
    model_type = data['type']
    # Label only the first point of each type so the legend has one entry
    # per colour instead of one per model.
    axes[0].scatter(data['params'], data['flops'],
                    c=colors[model_type], s=100,
                    label=None if model_type in labelled_types else model_type)
    labelled_types.add(model_type)
    axes[0].annotate(name, (data['params'], data['flops']),
                     xytext=(5, 5), textcoords='offset points', fontsize=9)

axes[0].set_xlabel('Parameters (M)', fontsize=12)
axes[0].set_ylabel('FLOPs (G)', fontsize=12)
axes[0].set_title('Parameters vs FLOPs', fontsize=14)
axes[0].grid(True, alpha=0.3)
# The scatter labels were previously never shown; draw the colour key.
axes[0].legend(title='Model type')

# Plot 2: Efficiency (FLOPs per Parameter)
efficiency = {name: data['flops'] / data['params'] for name, data in models.items()}
names = list(efficiency.keys())
values = list(efficiency.values())
bar_colors = [colors[models[n]['type']] for n in names]

axes[1].barh(names, values, color=bar_colors)
axes[1].set_xlabel('FLOPs per Parameter (G/M)', fontsize=12)
axes[1].set_title('Compute Efficiency', fontsize=14)
axes[1].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print('\n💡 Key Insight: Same accuracy can be achieved with 10x fewer FLOPs!')
print('   EfficientNet-B0 vs ResNet-50: 10x fewer FLOPs, similar accuracy')

## Part 5: Memory Layout and Access Patterns

In [None]:
# Memory layout demonstration
def analyze_memory_layout(tensor):
    """Print the shape, strides, contiguity flag and size in KB of a tensor."""
    # Total footprint = bytes per element × number of elements, shown in KB.
    size_kb = tensor.element_size() * tensor.nelement() / 1024
    report = (
        f'Shape: {tensor.shape}',
        f'Stride: {tensor.stride()}',
        f'Is contiguous: {tensor.is_contiguous()}',
        f'Memory size: {size_kb:.2f} KB',
    )
    for line in report:
        print(line)

import time

# Create example tensors
x = torch.randn(32, 768, 512)  # Batch, Features, Sequence

print('📊 MEMORY LAYOUT ANALYSIS')
print('=' * 50)

print('\nOriginal tensor:')
analyze_memory_layout(x)

print('\nTransposed tensor:')
x_t = x.transpose(1, 2)  # a view: same storage, swapped strides
analyze_memory_layout(x_t)

print('\nContiguous copy:')
x_t_contig = x_t.contiguous()  # materialises the transposed layout
analyze_memory_layout(x_t_contig)

# Benchmark
print('\n⏱️ PERFORMANCE IMPACT')

# Warm-up: run each variant once so one-time setup cost is not charged to
# whichever loop happens to be timed first.
_ = x_t.sum()
_ = x_t_contig.sum()

# perf_counter is a monotonic, high-resolution clock meant for timing;
# time.time() is wall-clock and can jump or have coarse resolution.

# Non-contiguous operation
start = time.perf_counter()
for _ in range(100):
    _ = x_t.sum()
non_contig_time = time.perf_counter() - start

# Contiguous operation
start = time.perf_counter()
for _ in range(100):
    _ = x_t_contig.sum()
contig_time = time.perf_counter() - start

# Guard the ratio in case the timed loop is below clock resolution.
speedup = non_contig_time / contig_time if contig_time > 0 else float('inf')

print(f'Non-contiguous sum: {non_contig_time*1000:.2f}ms')
print(f'Contiguous sum: {contig_time*1000:.2f}ms')
print(f'Speedup: {speedup:.2f}x')

In [None]:
# Summary cell: print the five takeaways of the lecture, one per paragraph.
print('🎯 KEY TAKEAWAYS')
print('=' * 60)

takeaways = (
    '1. FLOPs: Count multiply-adds, scales with layer dimensions',
    '2. Memory Bandwidth: Often the bottleneck, not compute',
    '3. Arithmetic Intensity: FLOPs/Byte - higher is better',
    '4. Roofline Model: Identifies compute vs memory bottleneck',
    '5. Memory Layout: Contiguous access is 2-10x faster',
)
for takeaway in takeaways:
    # Leading newline keeps a blank line between items.
    print('\n' + takeaway)

print('\n' + '=' * 60)
print('\n📚 Next: Learn how to optimize with Pruning!')