In [1]:
!pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html


Looking in links: https://storage.googleapis.com/libtpu-releases/index.html
Collecting libtpu==0.0.21 (from torch_xla[tpu])
  Downloading libtpu-0.0.21-cp312-cp312-manylinux_2_31_x86_64.whl.metadata (1.0 kB)
Downloading libtpu-0.0.21-cp312-cp312-manylinux_2_31_x86_64.whl (149.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.8/149.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: libtpu
  Attempting uninstall: libtpu
    Found existing installation: libtpu 0.0.17
    Uninstalling libtpu-0.0.17:
      Successfully uninstalled libtpu-0.0.17
Successfully installed libtpu-0.0.21


In [3]:
import torch
import torch_xla.core.xla_model as xm
import numpy as np
import time
import csv
from pathlib import Path
import torch_xla

# =========================
# Configuration
# =========================

device = torch_xla.device()

SIZES = {
    "S": 1024,
    "M": 4096,
    "L": 8192,
}

WARMUP = 10
ITERS = 50
DTYPE = torch.float32

POWER_W = {
    "tpu": 75,  # estimation TPU v2/v3
}

RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)
CSV_PATH = RESULTS_DIR / "matmul_tpu.csv"

device = xm.xla_device()

# =========================
# Stats utils
# =========================

def compute_stats(times_ms):
    t = np.array(times_ms)
    return {
        "mean_ms": t.mean(),
        "p50_ms": np.percentile(t, 50),
        "p95_ms": np.percentile(t, 95),
    }

def gflops(n, ms):
    flops = 2 * (n ** 3)
    return flops / (ms / 1000) / 1e9

def energy_j(power, ms):
    return power * (ms / 1000)

# =========================
# Benchmark TPU
# =========================

def bench_tpu(n):
    x = torch.rand((n, n), dtype=DTYPE, device=device)
    y = torch.rand((n, n), dtype=DTYPE, device=device)

    # Warmup
    for _ in range(WARMUP):
        out = torch.mm(x, y)
        torch_xla.sync()
        _ = out.cpu()

    times = []

    for _ in range(ITERS):
        start = time.perf_counter()

        out = torch.mm(x, y)
        torch_xla.sync()
        _ = out.cpu()   # vraie synchronisation

        end = time.perf_counter()
        times.append((end - start) * 1000)

    stats = compute_stats(times)
    stats["gflops"] = gflops(n, stats["mean_ms"])
    stats["energy_j"] = energy_j(POWER_W["tpu"], stats["mean_ms"])

    return stats

# =========================
# CSV save
# =========================

def save_row(size_tag, n, stats):
    exists = CSV_PATH.exists()

    with open(CSV_PATH, "a", newline="") as f:
        w = csv.writer(f)

        if not exists:
            w.writerow([
                "device", "size", "N",
                "mean_ms", "p50_ms", "p95_ms",
                "gflops", "power_w", "energy_j"
            ])

        w.writerow([
            "tpu", size_tag, n,
            stats["mean_ms"], stats["p50_ms"], stats["p95_ms"],
            stats["gflops"], POWER_W["tpu"], stats["energy_j"]
        ])

# =========================
# Main
# =========================

def main():
    print("TPU device:", device)

    for tag, n in SIZES.items():
        print(f"\n=== Size {tag} (N={n}) ===")

        stats = bench_tpu(n)
        print("TPU:", stats)

        save_row(tag, n, stats)

if __name__ == "__main__":
    main()


  device = xm.xla_device()


TPU device: xla:0

=== Size S (N=1024) ===
TPU: {'mean_ms': np.float64(4.672073459992134), 'p50_ms': np.float64(1.6427794999458456), 'p95_ms': np.float64(3.5857449499417244), 'gflops': np.float64(459.6425262550592), 'energy_j': np.float64(0.3504055094994101)}

=== Size M (N=4096) ===
TPU: {'mean_ms': np.float64(47.449615139996695), 'p50_ms': np.float64(47.59982199999513), 'p95_ms': np.float64(57.0689740500427), 'gflops': np.float64(2896.5240933250184), 'energy_j': np.float64(3.558721135499752)}

=== Size L (N=8192) ===
TPU: {'mean_ms': np.float64(153.16001115998915), 'p50_ms': np.float64(148.08910250002327), 'p95_ms': np.float64(177.71413344995608), 'gflops': np.float64(7178.842698225342), 'energy_j': np.float64(11.487000836999187)}


In [4]:
import torch
import torch.nn as nn
import torch_xla
import numpy as np
import time
import csv
from pathlib import Path

# =========================
# Configuration
# =========================

BATCH_SIZE = 32
IMAGE_SIZE = 64
WARMUP = 10
ITERS = 50
DTYPE = torch.float32

POWER_W = {"tpu": 75}

RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)
CSV_PATH = RESULTS_DIR / "cnn_tpu.csv"

device = torch_xla.device()

# =========================
# CNN
# =========================

class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(32 * (IMAGE_SIZE // 4) ** 2, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        return self.classifier(self.features(x))

# =========================
# Stats
# =========================

def compute_stats(times):
    t = np.array(times)
    return {
        "mean_ms": float(t.mean()),
        "p50_ms": float(np.percentile(t, 50)),
        "p95_ms": float(np.percentile(t, 95)),
    }

def energy_j(power, ms):
    return power * (ms / 1000)

# =========================
# Benchmark TPU
# =========================

def bench_tpu():

    model = SimpleCNN().to(device)
    model.eval()

    x = torch.rand(
        (BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE),
        dtype=DTYPE,
        device=device
    )

    # Warmup
    with torch.no_grad():
        for _ in range(WARMUP):
            out = model(x)
            torch_xla.sync()
            _ = out.cpu()

    times = []

    # Benchmark
    with torch.no_grad():
        for _ in range(ITERS):

            start = time.perf_counter()

            out = model(x)
            torch_xla.sync()
            _ = out.cpu()

            end = time.perf_counter()
            times.append((end - start) * 1000)

    stats = compute_stats(times)
    stats["energy_j"] = energy_j(POWER_W["tpu"], stats["mean_ms"])

    return stats

# =========================
# CSV save
# =========================

def save_row(stats):

    exists = CSV_PATH.exists()

    with open(CSV_PATH, "a", newline="") as f:
        w = csv.writer(f)

        if not exists:
            w.writerow([
                "device",
                "mean_ms", "p50_ms", "p95_ms",
                "energy_j"
            ])

        w.writerow([
            "tpu",
            stats["mean_ms"],
            stats["p50_ms"],
            stats["p95_ms"],
            stats["energy_j"],
        ])

# =========================
# Main
# =========================

def main():

    print("=== CNN TPU Benchmark ===")
    print("Device:", device)

    stats = bench_tpu()

    print("TPU:", stats)
    save_row(stats)

if __name__ == "__main__":
    main()


=== CNN TPU Benchmark ===
Device: xla:0
TPU: {'mean_ms': 0.8080623200021364, 'p50_ms': 0.7848949999242905, 'p95_ms': 0.9801534999269278, 'energy_j': 0.06060467400016022}


In [5]:
import torch
import torch.nn as nn
import torchvision.models as models
import numpy as np
import time

# TPU support (optional)
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
    TPU_AVAILABLE = True
except:
    TPU_AVAILABLE = False

# =========================
# CONFIG — tune to stress hardware
# =========================

BATCH_SIZE = 128      # increase to stress more
IMAGE_SIZE = 224
WARMUP = 10
ITERS = 50
DTYPE = torch.float32

# =========================
# Device selection
# =========================

if TPU_AVAILABLE:
    device = torch_xla.device()
    device_type = "TPU"
elif torch.cuda.is_available():
    device = "cuda"
    device_type = "GPU"
else:
    device = "cpu"
    device_type = "CPU"

print("Running on:", device_type)

# =========================
# Model
# =========================

model = models.resnet50(weights=None).to(device)
model.eval()

# heavy input tensor
x_cpu = torch.rand(
    (BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE),
    dtype=DTYPE
)

if device_type == "GPU":
    x = x_cpu.pin_memory().to(device, non_blocking=True)
elif device_type == "TPU":
    x = x_cpu.to(device)
else:
    x = x_cpu

# =========================
# Sync helpers
# =========================

def sync(out=None):
    if device_type == "GPU":
        torch.cuda.synchronize()

    elif device_type == "TPU":
        torch_xla.sync()
        if out is not None:
            _ = out.cpu()

# =========================
# Warmup
# =========================

with torch.no_grad():
    for _ in range(WARMUP):
        out = model(x)
        sync(out)

# =========================
# Benchmark loop
# =========================

times = []

with torch.no_grad():
    for _ in range(ITERS):

        start = time.perf_counter()

        out = model(x)
        sync(out)

        end = time.perf_counter()
        times.append(end - start)

# =========================
# Results
# =========================

times = np.array(times)

latency_ms = times.mean() * 1000
throughput = BATCH_SIZE / times.mean()

print("\n=== STRESS RESULTS ===")
print(f"Mean latency: {latency_ms:.2f} ms / batch")
print(f"Throughput:   {throughput:.0f} images/sec")
print(f"P95 latency:  {np.percentile(times,95)*1000:.2f} ms")


Running on: TPU

=== STRESS RESULTS ===
Mean latency: 23.03 ms / batch
Throughput:   5559 images/sec
P95 latency:  23.36 ms
