# GPU-Accelerated Diffusion Model with hcrot

This notebook demonstrates training and inference of a DDPM (Denoising Diffusion Probabilistic Model) using the `hcrot` library with **CuPy GPU acceleration**.

In [None]:
import os
import warnings
# Silence all warnings for the rest of the session so that library warnings
# do not clutter the notebook output.
warnings.filterwarnings(action='ignore')

# Ensure we are in the root directory to access datasets: if the current
# working directory is a folder named 'notebooks', step up one level so that
# relative paths such as './datasets/...' resolve.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

from typing import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm, trange

import hcrot
from hcrot import layers, optim

In [None]:
class Model(layers.Module):
    """Thin wrapper module that owns a single ``layers.UNetModel``.

    The wrapped network is called with a noisy input, a timestep and class
    labels, and its output is returned unchanged.
    """

    def __init__(self):
        super().__init__()
        # Keyword configuration of the UNet, gathered in one place.
        # NOTE(review): num_class_embeds=10 presumably corresponds to the ten
        # MNIST digit classes, and sample_size=14 to 14x14 inputs - confirm
        # against the data pipeline.
        unet_config = dict(
            sample_size=14,
            in_channels=1,
            out_channels=1,
            block_out_channels=(32, 64, 32),
            num_class_embeds=10,
        )
        self.unet = layers.UNetModel(**unet_config)

    def forward(self, x_noisy, t, labels):
        """Run the UNet on ``x_noisy`` at timestep(s) ``t`` conditioned on ``labels``.

        Returns whatever ``self.unet`` returns for these three arguments.
        """
        prediction = self.unet(x_noisy, t, labels)
        return prediction

In [None]:
def average_pooling(img, pool_size=2):
    """Downsample a 4-D array by averaging non-overlapping square windows.

    Args:
        img: Array whose ``shape`` unpacks into four values; the last two
            axes are the ones being pooled.
        pool_size: Side length of each pooling window.

    Returns:
        Array of shape ``(B, C, H // pool_size, W // pool_size)`` holding the
        mean of every ``pool_size x pool_size`` window. Trailing rows/columns
        that do not fill a complete window are discarded.
    """
    batch, channels, height, width = img.shape
    out_h = height // pool_size
    out_w = width // pool_size

    # Crop so both spatial axes are exact multiples of the window size.
    cropped = img[:, :, :out_h * pool_size, :out_w * pool_size]

    # Expose each window as its own pair of axes (3 and 5), then average them.
    windows = cropped.reshape(batch, channels, out_h, pool_size, out_w, pool_size)
    return windows.mean(axis=(3, 5))

# Training hyperparameters.
batch_size = 512
num_epochs = 10
timesteps = 1000  # number of diffusion steps used for training
lr = 1e-4

device = 'cuda'  # 'cuda' selects GPU acceleration

print("Loading and preprocessing data...")
# The file is read without a header row. Previously the label column was
# addressed as '7', i.e. the first data row (whose label happens to be 7) had
# been consumed as column names, silently dropping one sample and tying the
# code to that row's contents. With header=None column 0 holds the labels and
# the remaining columns hold the pixels.
# NOTE(review): assumes ./datasets/mnist_test.csv has no header line - confirm.
df = pd.read_csv('./datasets/mnist_test.csv', header=None)
label = df[0].to_numpy()
df = df.drop(0, axis=1)
dat = df.to_numpy()

# Keep the first 10 batches' worth of samples and their labels.
mnist = dat[:batch_size * 10]
train_label = label[:batch_size * 10]
mnist = mnist.reshape(-1, 1, 28, 28).astype(np.float32)
# Rescale pixel values from [0, 255] to [-1, 1].
mnist = (mnist / 255.) * 2. - 1.
mnist = average_pooling(mnist, 2) # resize to 14x14

print(f"Moving model and dataloader to {device}...")
# The dataloader is created before the model, as in the original cell, so any
# random state consumed during construction is used in the same order.
dataloader = hcrot.dataset.Dataloader(
    mnist,
    train_label,
    batch_size=batch_size,
    shuffle=True,
).to(device)
model = Model().to(device)

# Optimizer, loss and noise schedule used by the training loop.
optimizer = optim.AdamW(model, lr_rate=lr)
criterion = layers.MSELoss()
noise_scheduler = layers.DDPMScheduler(
    num_train_timesteps=timesteps,
    beta_schedule='squaredcos_cap_v2',
)

print("Starting GPU training...")
pbar = trange(num_epochs)
for epoch in pbar:
    total_loss = 0.0
    num_batches = 0
    for x, labels in dataloader:
        # Draw one random timestep per sample and noise with the same shape
        # as the batch, then let the scheduler produce the noised batch.
        # NOTE(review): t and noise are NumPy arrays even when device is
        # 'cuda' - confirm hcrot accepts them alongside device-resident x.
        t = np.random.randint(0, timesteps, (x.shape[0],))
        noise = np.random.randn(*x.shape)
        noisy_x = noise_scheduler.add_noise(x, noise, t)

        # The model is trained to predict the noise that was added.
        noise_pred = model(noisy_x, t, labels)
        loss = criterion(noise_pred, noise)
        total_loss += loss.item()
        num_batches += 1

        # Backpropagate from the loss and apply one optimizer step.
        dz = criterion.backward()
        optimizer.update(dz)

    # Average over the batches actually seen. Counting them explicitly avoids
    # relying on a leaked loop index, which would be undefined (NameError) if
    # the dataloader yielded no batches.
    pbar.set_postfix(loss=total_loss / max(num_batches, 1))

print("Training finished.")

In [None]:
# Inference on GPU
print("Starting inference...")
model.eval()
# Timesteps at which an intermediate sample is kept for plotting.
record_steps = [0, 200, 400, 600, 800, 999]
latents = np.random.randn(1, 1, 14, 14)

if device == 'cuda':
    # Imported lazily so the cell does not require CuPy on other devices.
    import cupy as cp
    latents = cp.asarray(latents)

target_label = np.array([5]) # Generate digit '5'

noise_scheduler.set_timesteps(num_inference_steps=1000)
reverse_process = []
# Timestep at which each entry of reverse_process was captured.
recorded_steps = []

for t in tqdm(noise_scheduler.timesteps):
    noise_pred = model(latents, t, target_label)
    latents = noise_scheduler.step(noise_pred, t, latents).prev_sample

    if t in record_steps:
        img = latents[0, 0]
        # Arrays exposing .get() (e.g. CuPy) are copied back to host memory.
        if hasattr(img, 'get'):
            img = img.get()
        reverse_process.append(img)
        recorded_steps.append(int(t))

# squeeze=False keeps the axes 2-D even when a single snapshot was captured,
# so indexing below works for any positive number of panels.
fig, ax = plt.subplots(1, len(reverse_process), figsize=(15, 3), squeeze=False)
# Label each panel with the timestep at which it was actually captured.
# Indexing record_steps by panel position mislabels the panels whenever the
# scheduler iterates in a different order than record_steps is written
# (NOTE(review): reverse diffusion normally runs from high t to low t -
# confirm against DDPMScheduler.timesteps).
for axis, step, img in zip(ax[0], recorded_steps, reverse_process):
    axis.imshow(img, cmap='gray')
    axis.axis('off')
    axis.set_title(f"T={step}")
plt.show()