In [2]:
import torch
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, random_split 
from app.model import get_model
from app.trainer import train_diffusion

# Script: train the project's diffusion model on CIFAR-10 with a 90/10
# train/validation split, then report where artifacts were written.

# Configuration
BATCH_SIZE = 64
EPOCHS = 2
LEARNING_RATE = 1e-4
# NOTE(review): despite the ".pth" suffix, the recorded run log shows this
# path being used as a *directory* (checkpoints are written as
# "<MODEL_SAVE_PATH>/diffusion_epoch_NNN.pth"). Kept as-is because other code
# may rely on this location; consider renaming it to a plain directory name.
MODEL_SAVE_PATH = "app/diffusion_model.pth"
SAMPLES_PATH = "diffusion_samples"
# Fixed seed for the train/validation split so that the held-out set is the
# same on every run and validation losses stay comparable across runs.
SPLIT_SEED = 42

# Set device: prefer CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. Load Data
# Only ToTensor() is applied here. The run log shows dataset mean/std being
# computed during training, so normalization presumably happens inside
# train_diffusion -- TODO confirm against app.trainer.
transform = transforms.Compose([
    transforms.ToTensor()
])
# Load the full training dataset (downloaded into ./data if not present).
full_train_dataset = CIFAR10(root="./data", train=True, download=True, transform=transform)

# Split into training (90%) and validation (remaining 10%).
train_size = int(0.9 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
# A seeded generator makes the split deterministic; without it every run
# would validate on a different random subset.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(
    full_train_dataset,
    [train_size, val_size],
    generator=split_generator,
)

# Training batches are shuffled each epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

print(f"Loaded CIFAR-10: {len(train_dataset)} training images, {len(val_dataset)} validation images.")

# 2. Initialize Model
model = get_model('DiffusionModel')
model.to(device)

# 3. Initialize Optimizer
# Only the parameters of model.network are handed to Adam.
optimizer = optim.Adam(model.network.parameters(), lr=LEARNING_RATE)

# 4. Train Model
trained_model = train_diffusion(
    model,
    train_loader,
    val_loader,
    optimizer,
    device=device,
    epochs=EPOCHS,
    save_path=MODEL_SAVE_PATH,
    samples_path=SAMPLES_PATH
)

print(f"Diffusion model training complete. Checkpoints saved in '{MODEL_SAVE_PATH}'.")
print(f"Sample images saved in '{SAMPLES_PATH}'.")


Using device: cpu
Files already downloaded and verified
Loaded CIFAR-10: 45000 training images, 5000 validation images.
Calculating dataset mean and std...


Calculating Stats: 100%|██████████████████████| 704/704 [00:27<00:00, 26.06it/s]


Calculated Stats -> Mean: tensor([0.4914, 0.4822, 0.4466]), Std: tensor([0.2464, 0.2428, 0.2607])
Starting Diffusion training for 2 epochs on cpu...


Epoch 1 Train: 100%|████████████████| 704/704 [26:18<00:00,  2.24s/it, loss=0.3]
Epoch 1 Val: 100%|██████████████████████████████| 79/79 [01:28<00:00,  1.11s/it]


Epoch 1 | Train Loss: 0.4255 | Val Loss: 0.3441
Checkpoint saved to app/diffusion_model.pth/diffusion_epoch_001.pth


Epoch 2 Train: 100%|██████████████| 704/704 [35:33<00:00,  3.03s/it, loss=0.295]
Epoch 2 Val: 100%|████████████████████████████| 79/79 [2:01:46<00:00, 92.48s/it]


Epoch 2 | Train Loss: 0.3347 | Val Loss: 0.3258
Checkpoint saved to app/diffusion_model.pth/diffusion_epoch_002.pth
Diffusion Training Finished.
Diffusion model training complete. Checkpoints saved in 'app/diffusion_model.pth'.
Sample images saved in 'diffusion_samples'.
