# Lecture 2: Neural Network Basics - Compute Primitives

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gaurav-redhat/transformer_problems/blob/efficientml-course/efficientml_course/02_basics/demo.ipynb)

Understanding FLOPs, memory bandwidth, and the roofline model.


In [None]:
!pip install torch -q
import torch
import torch.nn as nn
import time

# Calculate FLOPs for different operations
def count_flops_linear(in_features, out_features, batch_size=1):
    """Return the FLOP count of a linear layer's matrix multiply.

    Every (input feature, output feature) pair costs one multiply and one
    add per sample, giving ``2 * batch_size * in_features * out_features``.
    Only the terms in that product are counted.
    """
    # Factor of 2: one multiply plus one accumulate for each weight.
    ops_per_output = 2 * batch_size * in_features
    total = ops_per_output * out_features
    return total

def count_flops_conv2d(in_ch, out_ch, kernel, h, w, batch_size=1):
    """Return the FLOP count of a 2D convolution with a square kernel.

    The count is ``2 * batch_size * in_ch * out_ch * kernel**2 * h * w``:
    one multiply and one add for every kernel tap, input channel and output
    channel at each of the ``h * w`` spatial positions.

    NOTE(review): ``h`` and ``w`` are presumably the output feature-map
    size; stride and padding are not modelled here -- confirm with callers.
    """
    total = 2  # multiply + add
    for factor in (batch_size, in_ch, out_ch, kernel, kernel, h, w):
        total = total * factor
    return total

# Print a small table of FLOP counts for representative layer shapes.
print("FLOPs Comparison", "=" * 50, sep="\n")

# Insertion order of the pairs below is the order the rows are printed in.
ops = dict([
    ("Linear(1024, 1024)", count_flops_linear(1024, 1024)),
    ("Linear(4096, 4096)", count_flops_linear(4096, 4096)),
    ("Conv2d(64, 64, 3x3, 56x56)", count_flops_conv2d(64, 64, 3, 56, 56)),
    ("Conv2d(256, 256, 3x3, 14x14)", count_flops_conv2d(256, 256, 3, 14, 14)),
])

# Label padded to 35 columns; value reported in millions of FLOPs.
for name, flops in ops.items():
    print("{:35} | {:>8.2f} MFLOPs".format(name, flops / 1e6))


In [None]:
# Arithmetic Intensity - Are we compute or memory bound?
def arithmetic_intensity(flops, bytes_accessed):
    """Return FLOPs performed per byte of memory traffic.

    A larger ratio means the operation leans compute-bound; a smaller one
    means it leans memory-bound.
    """
    flops_per_byte = flops / bytes_accessed
    return flops_per_byte

# Roofline check for an FP32 matrix multiplication A(M,K) @ B(K,N).
M, K, N = 1024, 1024, 1024
BYTES_PER_ELEMENT = 4  # FP32

# Illustrative accelerator peak figures used for the roofline comparison.
GPU_MEM_BW = 2000e9      # bytes per second (~2000 GB/s)
GPU_PEAK_FLOPS = 300e12  # FLOPs per second (~300 TFLOPs)
# Ridge point of the roofline: above this intensity compute is the limit,
# below it memory bandwidth is. Derived once so the printed crossover and
# the verdict below can never disagree.
crossover = GPU_PEAK_FLOPS / GPU_MEM_BW

flops = 2 * M * K * N  # one multiply + one add per (m, k, n) triple
# Traffic model: A and B are each read once and C is written once.
bytes_read = (M * K + K * N) * BYTES_PER_ELEMENT
bytes_write = M * N * BYTES_PER_ELEMENT
total_bytes = bytes_read + bytes_write

ai = arithmetic_intensity(flops, total_bytes)
print(f"Matrix Multiply ({M}x{K}) @ ({K}x{N})")
print(f"  FLOPs: {flops/1e9:.2f} GFLOPs")
print(f"  Memory: {total_bytes/1e6:.2f} MB")
print(f"  Arithmetic Intensity: {ai:.1f} FLOPs/Byte")
print(f"\n  GPU Memory BW: ~{GPU_MEM_BW / 1e9:.0f} GB/s")
print(f"  GPU Compute: ~{GPU_PEAK_FLOPS / 1e12:.0f} TFLOPs")
print(f"  Crossover point: {crossover:.0f} FLOPs/Byte")
# "\u2713" is the check mark; the escape keeps the source ASCII-only so the
# glyph cannot be corrupted by a wrong file encoding again.
verdict = "Compute-bound \u2713" if ai > crossover else "Memory-bound"
print(f"\n  This operation is: {verdict}")
