# 👁️ Lecture 15: Efficient Vision Models - Complete Demo

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gaurav-redhat/efficientml_course/blob/main/15_efficient_vision_models/demo.ipynb)

## What You'll Learn
- Depthwise separable convolutions
- MobileNet architecture design
- EfficientNet compound scaling
- FLOPs vs accuracy trade-offs

In [None]:
!pip install torch torchvision matplotlib numpy -q
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

# Seed every RNG the notebook draws from so reruns give identical output:
# torch for tensors and layer weights, NumPy for the np.random images that
# the visualisation cell passes to imshow.
torch.manual_seed(42)
np.random.seed(42)
print('Ready for Efficient Vision Models!')

## Part 1: Standard vs Depthwise Separable Convolution

In [None]:
def conv_flops(in_ch, out_ch, kernel_size, h, w):
    """Estimate the FLOPs of a standard 2-D convolution.

    The count is ``2 * in_ch * out_ch * kernel_size**2 * h * w``: one
    multiply-accumulate (counted as two operations) per input channel,
    output channel and kernel element, repeated over an ``h`` x ``w`` grid.
    """
    flops = 2
    # Multiply in the same left-to-right order as the closed-form expression.
    for factor in (in_ch, out_ch, kernel_size, kernel_size, h, w):
        flops *= factor
    return flops

def depthwise_separable_flops(in_ch, out_ch, kernel_size, h, w):
    """Estimate the FLOPs of a depthwise separable convolution.

    The total is the sum of two stages, each multiply-accumulate counted as
    two operations over an ``h`` x ``w`` grid:

    * depthwise - one ``kernel_size`` x ``kernel_size`` filter per input
      channel, with no mixing between channels;
    * pointwise - a 1x1 convolution mapping ``in_ch`` to ``out_ch`` channels.
    """
    stage_costs = (
        2 * in_ch * kernel_size * kernel_size * h * w,  # depthwise
        2 * in_ch * out_ch * h * w,                     # pointwise
    )
    return sum(stage_costs)

# Compare both convolution variants on a 256 -> 256 channel, 3x3 layer
# applied to a 56x56 feature map.
in_ch, out_ch = 256, 256
kernel_size = 3
h, w = 56, 56

standard = conv_flops(in_ch, out_ch, kernel_size, h, w)
separable = depthwise_separable_flops(in_ch, out_ch, kernel_size, h, w)

print('📊 CONVOLUTION COMPARISON')
print('=' * 60)
print(f'Input: {in_ch} channels, {h}×{w}, kernel {kernel_size}×{kernel_size}')
print(f'\n{"Type":<25} {"FLOPs":<20} {"Relative":<15}')
print('-' * 60)
# FLOPs are shown in millions; the last column is the cost relative to the
# standard convolution.
print(f'{"Standard Conv":<25} {standard/1e6:>15.1f}M  {1.0:>12.1f}x')
print(f'{"Depthwise Separable":<25} {separable/1e6:>15.1f}M  {separable/standard:>12.2f}x')
print(f'\n💡 Savings: {standard/separable:.1f}x fewer FLOPs!')

In [None]:
# Implement and visualize
class StandardConv(nn.Module):
    """Standard 2-D convolution followed by a ReLU.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        kernel_size: Kernel size, an int or a per-dimension tuple (default 3).

    The padding is ``kernel_size // 2`` per dimension, so odd kernel sizes
    keep the input's height and width unchanged. For the default 3x3 kernel
    this is a padding of 1.
    """
    def __init__(self, in_ch, out_ch, kernel_size=3):
        super().__init__()
        # Derive the padding from the kernel instead of hard-coding 1, which
        # is only "same" padding for a 3x3 kernel.
        if isinstance(kernel_size, int):
            padding = kernel_size // 2
        else:
            padding = tuple(k // 2 for k in kernel_size)
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=padding)

    def forward(self, x):
        """Return ``relu(conv(x))``."""
        return F.relu(self.conv(x))

class DepthwiseSeparableConv(nn.Module):
    """Depthwise separable convolution: depthwise conv, then 1x1 pointwise conv.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        kernel_size: Depthwise kernel size, an int or a per-dimension tuple
            (default 3).

    A ReLU follows each of the two convolutions. The depthwise padding is
    ``kernel_size // 2`` per dimension, so odd kernel sizes keep the input's
    height and width unchanged (padding 1 for the default 3x3 kernel).
    """
    def __init__(self, in_ch, out_ch, kernel_size=3):
        super().__init__()
        # Derive the padding from the kernel instead of hard-coding 1, which
        # is only "same" padding for a 3x3 kernel.
        if isinstance(kernel_size, int):
            padding = kernel_size // 2
        else:
            padding = tuple(k // 2 for k in kernel_size)
        # Depthwise: groups=in_ch gives every input channel its own filter.
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size,
                                   padding=padding, groups=in_ch)
        # Pointwise: 1x1 convolution that mixes channels.
        self.pointwise = nn.Conv2d(in_ch, out_ch, 1)

    def forward(self, x):
        """Return ``relu(pointwise(relu(depthwise(x))))``."""
        x = F.relu(self.depthwise(x))
        return F.relu(self.pointwise(x))

# Compare learnable-parameter counts of the two layer types for a
# 256 -> 256 channel layer with the default 3x3 kernel.
std_conv = StandardConv(256, 256)
ds_conv = DepthwiseSeparableConv(256, 256)

std_params = sum(p.numel() for p in std_conv.parameters())
ds_params = sum(p.numel() for p in ds_conv.parameters())

print('\n📊 PARAMETER COMPARISON')
print(f'Standard Conv: {std_params:,} parameters')
print(f'Depthwise Separable: {ds_params:,} parameters')
print(f'Savings: {std_params/ds_params:.1f}x fewer parameters')

In [None]:
# Visualize depthwise separable convolution.
# The three images are schematic only: random and identity matrices stand in
# for the kernels, no real layer weights are plotted.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Standard convolution: a random 3x3 grid (mean over the trailing 4x4 axes).
ax = axes[0]
ax.imshow(np.random.rand(3, 3, 4, 4).mean(axis=(2,3)), cmap='Blues')
ax.set_title('Standard Conv\n(All channels mixed)', fontsize=12)
ax.axis('off')

# Depthwise: an identity matrix, i.e. only the diagonal cells are set.
ax = axes[1]
ax.imshow(np.eye(3), cmap='Greens')
ax.set_title('Depthwise Conv\n(Per-channel filtering)', fontsize=12)
ax.axis('off')

# Pointwise: a random 3x3 grid.
ax = axes[2]
ax.imshow(np.random.rand(3, 3), cmap='Oranges')
ax.set_title('Pointwise Conv (1×1)\n(Channel mixing)', fontsize=12)
ax.axis('off')

# NOTE: Matplotlib's default font may lack the emoji glyph; it is then drawn
# as a placeholder box and a "glyph missing" warning is emitted.
plt.suptitle('📊 Depthwise Separable = Depthwise + Pointwise', fontsize=14)
plt.tight_layout()
plt.show()

## Part 2: MobileNet Architecture

In [None]:
class InvertedResidual(nn.Module):
    """
    MobileNetV2 Inverted Residual Block.

    Key innovation: Expand → Depthwise → Project.
    The channel width goes narrow → wide → narrow, the opposite of a
    classic bottleneck residual block.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        stride: Stride of the 3×3 depthwise convolution (default 1).
        expand_ratio: Multiplier applied to ``in_ch`` to get the hidden width
            (default 6). Fractional ratios are accepted; the hidden width is
            rounded to the nearest integer. A ratio of 1 omits the expansion
            convolution.

    The skip connection is used only when ``stride == 1`` and
    ``in_ch == out_ch``, i.e. when the block preserves the input shape.
    """
    def __init__(self, in_ch, out_ch, stride=1, expand_ratio=6):
        super().__init__()
        # Conv2d requires integer channel counts; rounding makes fractional
        # ratios (e.g. 1.5) usable instead of failing inside nn.Conv2d.
        hidden_ch = int(round(in_ch * expand_ratio))
        self.use_residual = (stride == 1 and in_ch == out_ch)

        layers = []

        # Expand (1×1 conv); skipped when there is nothing to expand.
        if expand_ratio != 1:
            layers.extend([
                nn.Conv2d(in_ch, hidden_ch, 1, bias=False),
                nn.BatchNorm2d(hidden_ch),
                nn.ReLU6()
            ])

        # Depthwise (3×3 conv, padding 1); groups=hidden_ch filters each
        # channel independently, and this is where the stride is applied.
        layers.extend([
            nn.Conv2d(hidden_ch, hidden_ch, 3, stride, 1,
                      groups=hidden_ch, bias=False),
            nn.BatchNorm2d(hidden_ch),
            nn.ReLU6()
        ])

        # Project (1×1 conv, no activation!)
        layers.extend([
            nn.Conv2d(hidden_ch, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch)
        ])

        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Run the block, adding the input back when the skip path is enabled."""
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)

# Smoke-test the block: 32 -> 64 channels with stride 2 on a 56x56 input.
block = InvertedResidual(32, 64, stride=2, expand_ratio=6)
x = torch.randn(1, 32, 56, 56)
out = block(x)

print('📊 INVERTED RESIDUAL BLOCK')
print('=' * 50)
print(f'Input: {x.shape}')
print(f'Output: {out.shape}')
print(f'Parameters: {sum(p.numel() for p in block.parameters()):,}')

# The channel counts in this summary mirror the constructor arguments above
# (32 * 6 = 192 hidden channels).
print('\n🔹 Architecture:')
print('   Input (32ch) → Expand 6× (192ch) → Depthwise → Project (64ch)')

## Part 3: EfficientNet Compound Scaling

In [None]:
def efficientnet_scaling(phi):
    """
    Return the EfficientNet compound-scaling factors for coefficient ``phi``.

    Starting from the base network (B0), all three dimensions grow together:
    - Depth (d): number of layers,       d = 1.2 ** phi
    - Width (w): number of channels,     w = 1.1 ** phi
    - Resolution (r): input image size,  r = 1.15 ** phi

    Constraint: d × w² × r² ≈ 2^phi (FLOPs increase by about 2^phi).

    Returns:
        Tuple ``(d, w, r)`` of multipliers.
    """
    # Base coefficients in (depth, width, resolution) order.
    bases = (1.2, 1.1, 1.15)
    return tuple(base ** phi for base in bases)

# EfficientNet family.
# Units follow the table header printed below: params in millions, FLOPs in
# billions, top-1 accuracy in percent; resolution is the input image size.
efficientnets = {
    'B0': {'phi': 0, 'resolution': 224, 'params': 5.3, 'flops': 0.39, 'top1': 77.1},
    'B1': {'phi': 0.5, 'resolution': 240, 'params': 7.8, 'flops': 0.70, 'top1': 79.1},
    'B2': {'phi': 1.0, 'resolution': 260, 'params': 9.2, 'flops': 1.0, 'top1': 80.1},
    'B3': {'phi': 1.5, 'resolution': 300, 'params': 12, 'flops': 1.8, 'top1': 81.6},
    'B4': {'phi': 2.0, 'resolution': 380, 'params': 19, 'flops': 4.2, 'top1': 82.9},
    'B5': {'phi': 2.5, 'resolution': 456, 'params': 30, 'flops': 9.9, 'top1': 83.6},
    'B6': {'phi': 3.0, 'resolution': 528, 'params': 43, 'flops': 19, 'top1': 84.0},
    'B7': {'phi': 3.5, 'resolution': 600, 'params': 66, 'flops': 37, 'top1': 84.3},
}

print('📊 EFFICIENTNET FAMILY')
print('=' * 70)
print(f'{"Model":<8} {"Resolution":<12} {"Params (M)":<12} {"FLOPs (B)":<12} {"Top-1 (%)":<12}')
print('-' * 70)

for name, info in efficientnets.items():
    # FLOPs use two decimals: with one decimal 0.39 would be shown as 0.4.
    print(f'{name:<8} {info["resolution"]:<12} {info["params"]:<12.1f} {info["flops"]:<12.2f} {info["top1"]:<12.1f}')

In [None]:
# Visualize scaling: accuracy vs cost for the EfficientNet family, plus the
# compound-scaling curves.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

models = list(efficientnets.keys())
flops = [efficientnets[m]['flops'] for m in models]
params = [efficientnets[m]['params'] for m in models]
top1 = [efficientnets[m]['top1'] for m in models]

# FLOPs vs Accuracy
axes[0].plot(flops, top1, 'o-', color='#3b82f6', linewidth=2, markersize=10)
for m, f, t in zip(models, flops, top1):
    axes[0].annotate(m, (f, t), xytext=(5, 5), textcoords='offset points')
axes[0].set_xlabel('FLOPs (B)')
axes[0].set_ylabel('Top-1 Accuracy (%)')
axes[0].set_title('Accuracy vs FLOPs')
axes[0].grid(True, alpha=0.3)

# Params vs Accuracy
axes[1].plot(params, top1, 'o-', color='#22c55e', linewidth=2, markersize=10)
for m, p, t in zip(models, params, top1):
    axes[1].annotate(m, (p, t), xytext=(5, 5), textcoords='offset points')
axes[1].set_xlabel('Parameters (M)')
axes[1].set_ylabel('Top-1 Accuracy (%)')
axes[1].set_title('Accuracy vs Parameters')
axes[1].grid(True, alpha=0.3)

# Scaling visualization.
# efficientnet_scaling only raises constants to the power phi, so it
# broadcasts over a NumPy array: one call yields all three curves.
phis = np.linspace(0, 3.5, 20)
depths, widths, resolutions = efficientnet_scaling(phis)

axes[2].plot(phis, depths, label='Depth', linewidth=2)
axes[2].plot(phis, widths, label='Width', linewidth=2)
axes[2].plot(phis, resolutions, label='Resolution', linewidth=2)
axes[2].set_xlabel('Compound Coefficient (φ)')
axes[2].set_ylabel('Scaling Factor')
axes[2].set_title('Compound Scaling')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Part 4: Comparing Vision Architectures

In [None]:
# Architecture comparison.
# Units follow the table header printed below: params in millions, FLOPs in
# billions, top-1 accuracy in percent.
architectures = {
    'ResNet-50': {'params': 25.6, 'flops': 4.1, 'top1': 76.1},
    'MobileNetV2': {'params': 3.4, 'flops': 0.30, 'top1': 72.0},
    'MobileNetV3-L': {'params': 5.4, 'flops': 0.22, 'top1': 75.2},
    'EfficientNet-B0': {'params': 5.3, 'flops': 0.39, 'top1': 77.1},
    'EfficientNet-B4': {'params': 19, 'flops': 4.2, 'top1': 82.9},
    'ViT-B/16': {'params': 86, 'flops': 17.6, 'top1': 77.9},
    'DeiT-S': {'params': 22, 'flops': 4.6, 'top1': 79.8},
}

print('📊 VISION ARCHITECTURE COMPARISON')
print('=' * 70)
print(f'{"Model":<20} {"Params (M)":<12} {"FLOPs (B)":<12} {"Top-1 (%)":<12} {"Acc/GFLOP":<12}')
print('-' * 70)

for name, info in architectures.items():
    # Accuracy points per GFLOP: a rough efficiency score for ranking models.
    efficiency = info['top1'] / info['flops']
    # FLOPs use two decimals: with one decimal 0.30 and 0.22 would be shown
    # as 0.3 and 0.2, hiding the difference between the mobile models.
    print(f'{name:<20} {info["params"]:<12.1f} {info["flops"]:<12.2f} {info["top1"]:<12.1f} {efficiency:<12.1f}')

# Visualize: one bubble per model, positioned by FLOPs and accuracy and
# sized by parameter count.
fig, ax = plt.subplots(figsize=(12, 6))

# Colour by model family. The first keyword found in the name wins; names
# matching none of them (the ViT and DeiT entries) use the fallback colour.
family_colors = (
    ('ResNet', '#3b82f6'),
    ('Mobile', '#22c55e'),
    ('Efficient', '#f59e0b'),
)
default_color = '#ef4444'

for name, info in architectures.items():
    color = next((c for key, c in family_colors if key in name), default_color)
    ax.scatter(info['flops'], info['top1'], s=info['params']*10, c=color, alpha=0.7)
    ax.annotate(name, (info['flops'], info['top1']), xytext=(5, 5),
                textcoords='offset points', fontsize=9)

ax.set_xlabel('FLOPs (B)', fontsize=12)
ax.set_ylabel('Top-1 Accuracy (%)', fontsize=12)
# NOTE: Matplotlib's default font may lack the emoji glyph; it is then drawn
# as a placeholder box and a "glyph missing" warning is emitted.
ax.set_title('📊 Vision Models: Accuracy vs Efficiency\n(Circle size = parameters)', fontsize=14)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Closing summary of the lecture's main points.
print('🎯 KEY TAKEAWAYS')
print('=' * 60)
print('\n1. Depthwise Separable Conv: ~9x fewer FLOPs')
print('\n2. Inverted Residual: Expand → Depthwise → Project')
print('\n3. EfficientNet: Compound scaling (depth × width × resolution)')
print('\n4. MobileNet: Best for mobile/edge deployment')
print('\n5. EfficientNet: Best accuracy/efficiency trade-off')
print('\n6. Vision Transformers: Need more data, less efficient')
print('\n' + '=' * 60)
print('\n📚 Next: Efficient LLMs!')