# 🚀 Lecture 1: Introduction to Efficient ML - Complete Demo

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gaurav-redhat/efficientml_course/blob/main/01_introduction/demo.ipynb)

## What You'll Learn
- Why ML efficiency matters (cost, latency, energy)
- Model size comparison across different architectures
- Memory and compute requirements
- The efficiency gap between research and deployment

In [None]:
!pip install torch matplotlib numpy -q
import torch
import matplotlib.pyplot as plt
import numpy as np

# Report the runtime environment so readers can tell which PyTorch build
# they are on and whether a CUDA GPU is visible.
print('🖥️ PyTorch version:', torch.__version__)
print('GPU available:', torch.cuda.is_available())

## Part 1: The Model Size Explosion

ML models have grown by more than **1000x** in parameter count in just a few years!

In [None]:
# Model sizes over the years.
# Values are parameter counts in millions, listed in chronological order.
# NOTE(review): the GPT-4 figure is presumably an unofficial estimate --
# confirm the source before quoting it.
models = {
    'AlexNet (2012)': 61,
    'VGG-16 (2014)': 138,
    'ResNet-152 (2015)': 60,
    'BERT-base (2018)': 110,
    'GPT-2 (2019)': 1500,
    'GPT-3 (2020)': 175000,
    'PaLM (2022)': 540000,
    'GPT-4 (2023)': 1800000,
}

# Visualization: horizontal bars on a log axis, because the sizes span
# more than four orders of magnitude.
fig, ax = plt.subplots(figsize=(12, 6))
names = list(models.keys())
sizes = list(models.values())
# Darker red for later (larger) models.
colors = plt.cm.Reds(np.linspace(0.3, 0.9, len(models)))

bars = ax.barh(names, sizes, color=colors)
ax.set_xlabel('Parameters (Millions)', fontsize=12)
ax.set_title('🚀 Model Size Growth Over Time', fontsize=14)
ax.set_xscale('log')

for bar, size in zip(bars, sizes):
    # ':g' keeps significant decimals, so 1500M is labelled "1.5B" rather
    # than being rounded up to "2B" as ':.0f' would do.
    if size >= 1_000_000:
        label = f'{size / 1_000_000:g}T'
    elif size >= 1000:
        label = f'{size / 1000:g}B'
    else:
        label = f'{size}M'
    # Place the label just past the end of the bar; multiplying (not adding)
    # gives a constant visual gap on the log axis.
    ax.text(bar.get_width() * 1.1, bar.get_y() + bar.get_height() / 2,
            label, va='center', fontsize=10)

plt.tight_layout()
plt.show()

print(f'\n📈 GPT-4 is {models["GPT-4 (2023)"]/models["AlexNet (2012)"]:.0f}x larger than AlexNet!')

## Part 2: Memory Requirements

Let's calculate how much GPU memory different models need.

In [None]:
def calculate_memory(params_millions: float, dtype: str = 'fp32',
                     training: bool = False) -> float:
    """
    Estimate a model's memory footprint in gigabytes (1 GB = 1e9 bytes).

    Inference memory is the weights only. Training memory is
    weights + gradients + optimizer states + activations, where:
    - gradients are stored at the same precision as the weights,
    - Adam keeps momentum and variance in FP32 (8 bytes per param),
    - activations are roughly estimated as 2x the weight memory (the real
      figure depends on batch size and sequence length).

    Args:
        params_millions: Parameter count, in millions.
        dtype: Weight precision, case-insensitive. One of 'fp32' (4 bytes),
            'fp16' or 'bf16' (2 bytes), 'int8' (1 byte), 'int4' (0.5 bytes).
        training: When True, include gradients, optimizer states and
            activations in the total.

    Returns:
        The estimated memory requirement in GB.

    Raises:
        ValueError: If ``dtype`` is not a supported precision or
            ``params_millions`` is negative.
    """
    bytes_per_param_by_dtype = {
        'fp32': 4,
        'fp16': 2,
        'bf16': 2,
        'int8': 1,
        'int4': 0.5,
    }

    # Reject unknown precisions instead of silently costing them as FP16.
    dtype_key = str(dtype).lower()
    if dtype_key not in bytes_per_param_by_dtype:
        supported = ', '.join(sorted(bytes_per_param_by_dtype))
        raise ValueError(
            f'Unsupported dtype {dtype!r}; expected one of: {supported}'
        )
    if params_millions < 0:
        raise ValueError('params_millions must be non-negative')

    bytes_per_param = bytes_per_param_by_dtype[dtype_key]
    model_memory = params_millions * 1e6 * bytes_per_param

    if training:
        # Training: Model + Gradients + Optimizer (Adam) + Activations
        gradient_memory = model_memory
        optimizer_memory = params_millions * 1e6 * 8  # Adam states in FP32
        # Activations depend on batch size and sequence length (estimate)
        activation_memory = model_memory * 2  # Rough estimate
        total = model_memory + gradient_memory + optimizer_memory + activation_memory
    else:
        total = model_memory

    return total / 1e9  # Return in GB

# Print a table of FP16 inference vs. training memory for every model in
# `models` (parameter counts in millions).
print('📊 Memory Requirements Analysis')
print('=' * 60)
print(f'{"Model":<20} {"Params":<12} {"Inference":<12} {"Training":<12}')
print('-' * 60)

for name, params in models.items():
    # Drop the "(year)" suffix from the dictionary key.
    short_name = name.split(' ')[0]
    inf_mem = calculate_memory(params, 'fp16', training=False)
    train_mem = calculate_memory(params, 'fp16', training=True)
    # Attach the unit before padding so each cell lines up with its header.
    inf_cell = f'{inf_mem:.1f} GB'
    train_cell = f'{train_mem:.1f} GB'
    print(f'{short_name:<20} {params:<12,} {inf_cell:<12} {train_cell:<12}')

# Derive the headline ratio from the same estimate used for the table, so
# the stated insight always agrees with the numbers printed above it.
ratio = (calculate_memory(1, 'fp16', training=True)
         / calculate_memory(1, 'fp16', training=False))
print(f'\n💡 Key Insight: Training needs ~{ratio:.0f}x more memory than inference under this estimate!')

## Part 3: The Cost of Training Large Models

In [None]:
# Training costs, in thousands of US dollars (see the y-axis label below).
training_costs = {
    'BERT-base': 0.5,       # ~$500
    'GPT-2': 50,            # ~$50K
    'GPT-3': 4600,          # ~$4.6M
    'PaLM': 8000,           # ~$8M (estimated)
    'GPT-4': 100000,        # ~$100M (estimated)
}

# CO2 emissions (tons)
co2_emissions = {
    'BERT-base': 0.6,
    'GPT-2': 5,
    'GPT-3': 500,
    'PaLM': 600,
    'GPT-4': 5000,  # Estimated
}

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Cost chart
names = list(training_costs.keys())
costs = list(training_costs.values())
axes[0].bar(names, costs, color='#22c55e')
axes[0].set_ylabel('Training Cost ($K)', fontsize=12)
axes[0].set_title('💰 Training Costs', fontsize=14)
axes[0].set_yscale('log')
for i, c in enumerate(costs):
    # ':g' keeps significant decimals: 0.5 -> "$0.5K" and 4600 -> "$4.6M",
    # where ':.0f' would print "$0K" and "$5M".
    label = f'${c / 1000:g}M' if c >= 1000 else f'${c:g}K'
    # Multiply (not add) so the gap above each bar is constant on a log axis.
    axes[0].text(i, c * 1.2, label, ha='center')

# CO2 chart: take the labels from co2_emissions itself rather than reusing
# the cost chart's names, so the bars stay correctly labelled even if the
# two dictionaries ever differ in keys or ordering.
co2_names = list(co2_emissions.keys())
co2 = list(co2_emissions.values())
axes[1].bar(co2_names, co2, color='#ef4444')
axes[1].set_ylabel('CO2 Emissions (tons)', fontsize=12)
axes[1].set_title('🌍 Environmental Impact', fontsize=14)
axes[1].set_yscale('log')

plt.tight_layout()
plt.show()

print('\n🌍 For reference:')
print('   - Average car: 4.6 tons CO2/year')
print('   - GPT-3 training: 500 tons CO2 = 100+ car-years!')

## Part 4: Latency Requirements in Real Applications

In [None]:
# Application latency requirements, in milliseconds (see the y-axis label).
# 'required' is the latency budget of the application; 'typical_llm' is the
# latency plotted as "Typical LLM Latency" for comparison.
apps = {
    'Voice Assistant': {'required': 100, 'typical_llm': 500},
    'Auto-complete': {'required': 50, 'typical_llm': 200},
    'Chatbot': {'required': 500, 'typical_llm': 1000},
    'Image Generation': {'required': 5000, 'typical_llm': 30000},
    'Self-driving Car': {'required': 10, 'typical_llm': 100},
}

fig, ax = plt.subplots(figsize=(12, 6))

# One group per application, with two side-by-side bars in each group.
x = np.arange(len(apps))
width = 0.35

required = [v['required'] for v in apps.values()]
typical = [v['typical_llm'] for v in apps.values()]

bars1 = ax.bar(x - width/2, required, width, label='Required Latency', color='#22c55e')
bars2 = ax.bar(x + width/2, typical, width, label='Typical LLM Latency', color='#ef4444')

ax.set_ylabel('Latency (ms)', fontsize=12)
ax.set_title('⏱️ Latency Gap: Requirements vs Reality', fontsize=14)
ax.set_xticks(x)
# Pass a concrete list of labels rather than a dict view.
ax.set_xticklabels(list(apps), rotation=15)
ax.legend()
# Log scale: the values range from 10 ms to 30,000 ms.
ax.set_yscale('log')

plt.tight_layout()
plt.show()

print('\n⚠️ The Gap: Most LLMs are 2-10x slower than required!')

## Part 5: Efficiency Techniques Overview

In [None]:
# Summary of efficiency techniques.
# Each entry is (technique name, headline benefit, one-line description).
techniques = [
    ('Pruning', '10x smaller', 'Remove unimportant weights'),
    ('Quantization', '4x smaller', 'FP32 → INT8/INT4'),
    ('Knowledge Distillation', '3-10x smaller', 'Train small model from large'),
    ('Neural Architecture Search', '2-5x efficient', 'Auto-design efficient nets'),
    ('FlashAttention', '2-4x faster', 'Memory-efficient attention'),
    ('Speculative Decoding', '2-3x faster', 'Use draft model to speed up'),
]

print('🛠️ EFFICIENCY TECHNIQUES COVERED IN THIS COURSE')
print('=' * 70)
for tech, benefit, description in techniques:
    print(f'\n📌 {tech}')
    print(f'   Benefit: {benefit}')
    print(f'   How: {description}')

# Closing summary of the lecture's main points.
print('\n' + '=' * 70)
print('\n🎯 KEY TAKEAWAYS')
print('-' * 70)
print('1. Model sizes have grown 1000x in 5 years')
print('2. Training costs millions of dollars and tons of CO2')
print('3. Real applications need 10x lower latency than current models')
print('4. Efficiency techniques can reduce size/cost/latency by 10x+')
print('5. This course teaches you how to make ML practical!')

In [None]:
# Final visualization: The efficiency opportunity.
# Values are relative percentages with the unoptimized baseline fixed at 100.
# NOTE(review): the 'after' figures are presumably illustrative rather than
# measured -- confirm before citing them.
fig, ax = plt.subplots(figsize=(10, 6))

categories = ['Model Size', 'Memory', 'Latency', 'Cost', 'Energy']
before = [100, 100, 100, 100, 100]
after = [10, 15, 25, 10, 15]

# One group per category, with before/after bars side by side.
x = np.arange(len(categories))
width = 0.35

bars1 = ax.bar(x - width/2, before, width, label='Before Optimization', color='#ef4444')
bars2 = ax.bar(x + width/2, after, width, label='After Optimization', color='#22c55e')

ax.set_ylabel('Relative Value (%)', fontsize=12)
ax.set_title('🎯 The Efficiency Opportunity', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend()
# Dashed reference line at the 100% baseline.
ax.axhline(y=100, color='gray', linestyle='--', alpha=0.5)

# Annotate each "after" bar with the percentage saved relative to baseline.
for bar in bars2:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 2,
            f'{100-height:.0f}% ↓', ha='center', va='bottom', fontsize=10, color='green')

plt.tight_layout()
plt.show()

print('\n🚀 Ready to make ML efficient? Let\'s go!')