In [1]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as tt
from pathlib import Path
import random
import numpy as np
from sklearn.metrics import accuracy_score
import time
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from diffusers import DDPMPipeline

DEVICE = 'cuda'


  from .autonotebook import tqdm as notebook_tqdm


### UTILS

In [2]:
def set_deterministic(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) 
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def readable_number(num):
    num_str = str(num)[::-1]
    res = ''
    i_prev = 0
    for i in range(3, len(num_str), 3):
        res += num_str[i_prev:i] + ','
        i_prev = i
    if i_prev < len(num_str):
        res += num_str[i_prev:]
    return res[::-1]

def log(writer, metrics, epoch):
    writer.add_scalars('loss', {'train': metrics['loss_train'], 'test': metrics['loss_test']}, epoch)
    writer.add_scalars('accuracy', {'train': metrics['accuracy_train'], 'test': metrics['accuracy_test']}, epoch)
    writer.flush()

def save_checkpoint(state, path, epoch, test_loss):
    Path(path).mkdir(parents=True, exist_ok=True)
    torch.save(state, f'{path}/{epoch}_valloss={test_loss:.3f}.pt')

def get_parameters(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return total, trainable

def print_parameters(model):
    total, trainable = get_parameters(model)
    print(f'model initialized with trainable params: {readable_number(trainable)} || total params: {readable_number(total)} || trainable%: {trainable/total * 100:.3f}')

In [3]:
def test_fp16(model, criterion, data_loader, tqdm_desc, device):
    model.eval()
    y_true, y_pred, test_loss = [], [], []
    for imgs, target in data_loader:
    # for imgs, target in tqdm(data_loader, desc=tqdm_desc):
        imgs, target = imgs.to(device), target.to(device)

        with torch.no_grad():
            with torch.amp.autocast(device_type='cuda'):
                logits = model(imgs)
                loss = criterion(logits, target)

        test_loss.append(loss.item())
        y_pred.extend(logits.argmax(dim=1).flatten().tolist())
        y_true.extend(target.flatten().tolist())

    y_true, y_pred, test_loss = np.array(y_true), np.array(y_pred), np.array(test_loss)
    metrics = {}
    metrics['accuracy_test'] = accuracy_score(y_true, y_pred)
    metrics['loss_test'] = np.mean(test_loss)
    return metrics


def train_fp16_epoch(model, optimizer, criterion, scheduler, data_loader, tqdm_desc, scaler, device):
    model.train()
    y_true, y_pred, train_loss = [], [], [] 
    for imgs, target in data_loader:
    # for imgs, target in tqdm(data_loader, desc=tqdm_desc):
        imgs, target = imgs.to(device), target.to(device)

        with torch.amp.autocast(device_type='cuda'):
            logits = model(imgs)
            loss = criterion(logits, target)
        
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss.append(loss.item())
        y_pred.extend(logits.argmax(dim=1).flatten().tolist())
        y_true.extend(target.flatten().tolist())

    y_true, y_pred, train_loss = np.array(y_true), np.array(y_pred), np.array(train_loss)
    metrics = {}
    metrics['accuracy_train'] = accuracy_score(y_true, y_pred)
    metrics['loss_train'] = np.mean(train_loss)
    return metrics


def train_fp16(writer, model, optimizer, scheduler, criterion, train_loader, val_loader, num_epochs, freq_save, save_path, scaler, device):
    for epoch in range(1, num_epochs + 1):
        start = time.time()
        metrics_train = train_fp16_epoch(
            model, optimizer, criterion, scheduler, train_loader,
            tqdm_desc=f'Training {epoch}/{num_epochs}', scaler=scaler, device=device
        )
        metrics_val = test_fp16(
            model, criterion, val_loader,
            tqdm_desc=f'Validating {epoch}/{num_epochs}', device=device
        )

        if scheduler is not None:
            scheduler.step()

        if epoch % freq_save == 0:
            save_checkpoint(model.state_dict(), save_path, epoch, metrics_val["loss_test"])
          
        log(writer, {**metrics_val, **metrics_train}, epoch)
        end = time.time()
        print(f'{epoch=} in {((end - start) / 60):.2f}m, loss_val={metrics_val["loss_test"]:.3f}, loss_train={metrics_train["loss_train"]:.3f}, acc_val={metrics_val["accuracy_test"]:.3f}, acc_train={metrics_train["accuracy_train"]:.3f}')

### MODEL

#### ENCODER

In [9]:
pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")
# pipe = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler

Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00,  7.07it/s]


In [10]:
class DiffusionEncoder(nn.Module):
    def __init__(self, unet):
        super().__init__()
        self.unet = unet
    
    def forward(self, imgs, timestep, class_labels=None, up_last=-1, GAP=True):
        params = 0
        # 0. center input if necessary
        if self.unet.config.center_input_sample:
            imgs = 2 * imgs - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            timesteps = torch.tensor([timesteps], dtype=torch.long, device=imgs.device)
        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(imgs.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps * torch.ones(imgs.shape[0], dtype=timesteps.dtype, device=timesteps.device)

        t_emb = self.unet.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=self.unet.dtype)
        emb = self.unet.time_embedding(t_emb)

        total = get_parameters(self.unet.time_embedding)[0]
        params += total
        # print(f'time_embedding {total}')

        if self.unet.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when doing class conditioning")

            if self.unet.config.class_embed_type == "timestep":
                class_labels = self.unet.time_proj(class_labels)

            class_emb = self.unet.class_embedding(class_labels).to(dtype=self.unet.dtype)
            emb = emb + class_emb

            total = get_parameters(self.unet.class_embedding)[0]
            params += total
            # print(f'time_embedding {total}')
        elif self.unet.class_embedding is None and class_labels is not None:
            raise ValueError("class_embedding needs to be initialized in order to use class conditioning")

        # 2. pre-process
        skip_sample = imgs
        imgs = self.unet.conv_in(imgs)
        
        total = get_parameters(self.unet.conv_in)[0]
        params += total
        # print(f'conv_in {total}')
        

        # 3. down
        down_block_res_samples = (imgs,)
        for downsample_block in self.unet.down_blocks:
            if hasattr(downsample_block, "skip_conv"):
                imgs, res_samples, skip_sample = downsample_block(
                    hidden_states=imgs, temb=emb, skip_sample=skip_sample
                )
            else:
                imgs, res_samples = downsample_block(hidden_states=imgs, temb=emb)

            down_block_res_samples += res_samples

            total = get_parameters(downsample_block)[0]
            params += total
            # print(f'downsample_block {total}')

        # 4. mid
        imgs = self.unet.mid_block(imgs, emb)
        # print(f'midlle, {imgs.shape=}')

        total = get_parameters(self.unet.mid_block)[0]
        params += total
        # print(f'mid_block {total}')

        # 5. up
        skip_sample = None
        for i, upsample_block in enumerate(self.unet.up_blocks):
            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            if hasattr(upsample_block, "skip_conv"):
                imgs, skip_sample = upsample_block(imgs, res_samples, emb, skip_sample)
            else:
                imgs = upsample_block(imgs, res_samples, emb)

            total = get_parameters(upsample_block)[0]
            params += total
            # print(f'upsample_block {total}')

            if up_last == i:
                print(f'params used = {readable_number(params)}')
                return imgs.mean(dim=[2, 3]) if GAP else imgs
        
            
        # 6. post-process
        imgs = self.unet.conv_norm_out(imgs)
        imgs = self.unet.conv_act(imgs)
        imgs = self.unet.conv_out(imgs)

        if skip_sample is not None:
            imgs += skip_sample

        if self.unet.config.time_embedding_type == "fourier":
            timesteps = timesteps.reshape((imgs.shape[0], *([1] * len(imgs.shape[1:]))))
            imgs = imgs / timesteps

        return imgs

In [12]:
# img, t = torch.randn(3, 3, 32, 32).to(DEVICE), torch.tensor([0]).to(DEVICE)
img, t = torch.randn(3, 3, 256, 256).to(DEVICE), torch.tensor([0]).to(DEVICE)
encoder = DiffusionEncoder(hf_unet).to(DEVICE)

out = encoder(img, t, up_last=2, GAP=False)
print_parameters(encoder)
out.shape

params used = 103,741,440
model initialized with trainable params: 113,673,219 || total params: 113,673,219 || trainable%: 100.000


torch.Size([3, 256, 64, 64])

In [7]:
assert(torch.all(hf_unet(img, t, return_dict=False)[0] == encoder(img, t)).item())

#### CLASSIFICATION

In [8]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(128, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=self.scheduler.config.num_train_timesteps, size=[x.shape[0]]).to(x.device)
        noise = torch.randn(x.shape).to(x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)


img, t = torch.randn(3, 3, 32, 32).to(DEVICE), torch.tensor([0]).to(DEVICE)
backbone = DiffusionEncoder(hf_unet)
classifier = ClassifierNoised(backbone, hf_scheduler).to(DEVICE)
out = classifier(img)
out.shape

torch.Size([3, 100])

In [10]:
class Classifier(nn.Module):
    def __init__(self, backbone, up_last=3, t=0):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(128, 100)
        self.up_last = up_last
        self.t = t
        
    def forward(self, x):
        return self.head(self.backbone(x, self.t, up_last=self.up_last))

In [11]:
img, t = torch.randn(3, 3, 32, 32).to(DEVICE), torch.tensor([0]).to(DEVICE)
backbone = DiffusionEncoder(hf_unet)
classifier = Classifier(backbone).to(DEVICE)

out = classifier(img)
out.shape

torch.Size([3, 100])

### DATASET

In [13]:
def get_loaders(batch_size, dataset='cifar100', horizontal_flip=False):
    transform_test = tt.Compose([tt.ToTensor(), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225])])
    transform_train = tt.Compose([tt.ToTensor(), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225]), tt.RandomHorizontalFlip()]) if horizontal_flip else transform_test
    
    if dataset == 'cifar100':
        dataset_train = torchvision.datasets.CIFAR100(root='./datasets/cifar100', train=True, download=True, transform=transform_train)
        dataset_test = torchvision.datasets.CIFAR100(root='./datasets/cifar100', train=False, download=True, transform=transform_test)
    
    if dataset == 'cifar10':
        dataset_train = torchvision.datasets.CIFAR10(root='./datasets/cifar10', train=True, download=True, transform=transform_train)
        dataset_test = torchvision.datasets.CIFAR10(root='./datasets/cifar10', train=False, download=True, transform=transform_test)

    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)
    test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, drop_last=True, num_workers=4)
    return train_loader, test_loader

In [14]:
BATCH_SIZE=512

train_loader, test_loader = get_loaders(BATCH_SIZE)

Files already downloaded and verified
Files already downloaded and verified


### TRAIN

#### cifar100

In [17]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=50, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar100_uplast1'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 15
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar100', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("DanielBairamian/ddpm-cifar10-32-ema")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00, 10.17it/s]


epoch=1 in 0.61m, loss_val=2.067, loss_train=2.756, acc_val=0.454, acc_train=0.343
epoch=2 in 0.61m, loss_val=1.736, loss_train=1.936, acc_val=0.516, acc_train=0.478
epoch=3 in 0.61m, loss_val=1.653, loss_train=1.683, acc_val=0.550, acc_train=0.535
epoch=4 in 0.62m, loss_val=1.474, loss_train=1.513, acc_val=0.588, acc_train=0.574
epoch=5 in 0.60m, loss_val=1.429, loss_train=1.357, acc_val=0.602, acc_train=0.612
epoch=6 in 0.59m, loss_val=1.329, loss_train=1.214, acc_val=0.630, acc_train=0.649
epoch=7 in 0.59m, loss_val=1.275, loss_train=1.060, acc_val=0.656, acc_train=0.689
epoch=8 in 0.62m, loss_val=1.221, loss_train=0.924, acc_val=0.667, acc_train=0.726
epoch=9 in 0.59m, loss_val=1.186, loss_train=0.773, acc_val=0.685, acc_train=0.762
epoch=10 in 0.60m, loss_val=1.102, loss_train=0.633, acc_val=0.695, acc_train=0.801
epoch=11 in 0.60m, loss_val=1.050, loss_train=0.494, acc_val=0.722, acc_train=0.844
epoch=12 in 0.59m, loss_val=1.044, loss_train=0.389, acc_val=0.729, acc_train=0.875
e

In [18]:
pipe = DDPMPipeline.from_pretrained("google/ddpm-ema-cat-256")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler

Fetching 4 files: 100%|██████████| 4/4 [00:10<00:00,  2.66s/it]
Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--google--ddpm-ema-cat-256/snapshots/9517646c0efb301d44709f2e294f1548a6fdc408: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--google--ddpm-ema-cat-256/snapshots/9517646c0efb301d44709f2e294f1548a6fdc408.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:04<00:00,  2.16s/it]


In [9]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(512, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=50, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar100_256'
BATCH_SIZE = 32
LR = 1e-3
TRAIN_EPOCH = 15
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

transform_test = tt.Compose([tt.ToTensor(), tt.Resize(256), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225])])
transform_train = tt.Compose([tt.ToTensor(),  tt.Resize(256), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225]), tt.RandomHorizontalFlip()])

dataset_train = torchvision.datasets.CIFAR100(root='./datasets/cifar100', train=True, download=True, transform=transform_train)
dataset_test = torchvision.datasets.CIFAR100(root='./datasets/cifar100', train=False, download=True, transform=transform_test)
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=4)
test_loader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=4)

pipe = DDPMPipeline.from_pretrained("google/ddpm-ema-cat-256")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--google--ddpm-ema-cat-256/snapshots/9517646c0efb301d44709f2e294f1548a6fdc408: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--google--ddpm-ema-cat-256/snapshots/9517646c0efb301d44709f2e294f1548a6fdc408.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00,  2.59it/s]


KeyboardInterrupt: 

#### cifar10

In [274]:
class Classifier(nn.Module):
    def __init__(self, backbone, up_last=3, t=0):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(128, 100)
        self.up_last = up_last
        self.t = t
        
    def forward(self, x):
        return self.head(self.backbone(x, self.t, up_last=self.up_last))


NAME = 'cls_cifar10_base'
BATCH_SIZE = 512
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

transform_test = tt.Compose([tt.ToTensor(), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225])])
transform_train = tt.Compose([tt.ToTensor(), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225]), tt.RandomHorizontalFlip()])

dataset_train = torchvision.datasets.CIFAR10(root='./datasets/cifar10', train=True, download=True, transform=transform_train)
dataset_test = torchvision.datasets.CIFAR10(root='./datasets/cifar10', train=False, download=True, transform=transform_test)
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=4)
test_loader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=4)


backbone = DiffusionEncoder(hf_unet)
model = Classifier(backbone).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./datasets/cifar10/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:08<00:00, 21097557.49it/s]


Extracting ./datasets/cifar10/cifar-10-python.tar.gz to ./datasets/cifar10
Files already downloaded and verified


Training 1/10: 100%|██████████| 97/97 [00:57<00:00,  1.70it/s]
Validating 1/10: 100%|██████████| 19/19 [00:06<00:00,  3.03it/s]


epoch=1 in 1.06m, loss_val=0.469, loss_train=4.304, acc_val=0.848, acc_train=0.682


Training 2/10: 100%|██████████| 97/97 [00:57<00:00,  1.67it/s]
Validating 2/10: 100%|██████████| 19/19 [00:05<00:00,  3.32it/s]


epoch=2 in 1.06m, loss_val=0.385, loss_train=0.300, acc_val=0.874, acc_train=0.900


Training 3/10: 100%|██████████| 97/97 [00:57<00:00,  1.68it/s]
Validating 3/10: 100%|██████████| 19/19 [00:06<00:00,  3.16it/s]


epoch=3 in 1.06m, loss_val=0.349, loss_train=0.185, acc_val=0.892, acc_train=0.938


Training 4/10: 100%|██████████| 97/97 [00:57<00:00,  1.69it/s]
Validating 4/10: 100%|██████████| 19/19 [00:05<00:00,  3.20it/s]


epoch=4 in 1.06m, loss_val=0.352, loss_train=0.117, acc_val=0.895, acc_train=0.962


Training 5/10: 100%|██████████| 97/97 [00:57<00:00,  1.69it/s]
Validating 5/10: 100%|██████████| 19/19 [00:05<00:00,  3.40it/s]


epoch=5 in 1.05m, loss_val=0.367, loss_train=0.071, acc_val=0.901, acc_train=0.978


Training 6/10: 100%|██████████| 97/97 [00:56<00:00,  1.71it/s]
Validating 6/10: 100%|██████████| 19/19 [00:05<00:00,  3.29it/s]


epoch=6 in 1.04m, loss_val=0.379, loss_train=0.037, acc_val=0.899, acc_train=0.989


Training 7/10: 100%|██████████| 97/97 [00:57<00:00,  1.68it/s]
Validating 7/10: 100%|██████████| 19/19 [00:06<00:00,  3.12it/s]


epoch=7 in 1.06m, loss_val=0.401, loss_train=0.021, acc_val=0.903, acc_train=0.995


Training 8/10: 100%|██████████| 97/97 [00:56<00:00,  1.70it/s]
Validating 8/10: 100%|██████████| 19/19 [00:05<00:00,  3.28it/s]


epoch=8 in 1.05m, loss_val=0.403, loss_train=0.013, acc_val=0.906, acc_train=0.997


Training 9/10: 100%|██████████| 97/97 [00:56<00:00,  1.71it/s]
Validating 9/10: 100%|██████████| 19/19 [00:05<00:00,  3.33it/s]


epoch=9 in 1.04m, loss_val=0.406, loss_train=0.007, acc_val=0.906, acc_train=0.998


Training 10/10: 100%|██████████| 97/97 [00:56<00:00,  1.71it/s]
Validating 10/10: 100%|██████████| 19/19 [00:05<00:00,  3.52it/s]

epoch=10 in 1.04m, loss_val=0.407, loss_train=0.005, acc_val=0.908, acc_train=0.999





In [279]:
class Classifier(nn.Module):
    def __init__(self, backbone, up_last=3, t=0):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(256, 10)
        self.up_last = up_last
        self.t = t
        
    def forward(self, x):
        return self.head(self.backbone(x, self.t, up_last=self.up_last))


NAME = 'cls_cifar10_uplast1'
BATCH_SIZE = 512
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

backbone = DiffusionEncoder(hf_unet)
model = Classifier(backbone, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Training 1/10: 100%|██████████| 97/97 [00:24<00:00,  3.98it/s]
Validating 1/10: 100%|██████████| 19/19 [00:03<00:00,  4.92it/s]


epoch=1 in 0.47m, loss_val=0.543, loss_train=1.119, acc_val=0.888, acc_train=0.858


Training 2/10: 100%|██████████| 97/97 [00:25<00:00,  3.86it/s]
Validating 2/10: 100%|██████████| 19/19 [00:03<00:00,  5.34it/s]


epoch=2 in 0.48m, loss_val=0.511, loss_train=0.068, acc_val=0.895, acc_train=0.979


Training 3/10: 100%|██████████| 97/97 [00:24<00:00,  3.88it/s]
Validating 3/10: 100%|██████████| 19/19 [00:03<00:00,  5.64it/s]


epoch=3 in 0.47m, loss_val=0.609, loss_train=0.043, acc_val=0.889, acc_train=0.986


Training 4/10: 100%|██████████| 97/97 [00:25<00:00,  3.82it/s]
Validating 4/10: 100%|██████████| 19/19 [00:03<00:00,  5.30it/s]


epoch=4 in 0.48m, loss_val=0.575, loss_train=0.028, acc_val=0.897, acc_train=0.990


Training 5/10: 100%|██████████| 97/97 [00:25<00:00,  3.82it/s]
Validating 5/10: 100%|██████████| 19/19 [00:03<00:00,  5.24it/s]


epoch=5 in 0.48m, loss_val=0.573, loss_train=0.019, acc_val=0.903, acc_train=0.995


Training 6/10: 100%|██████████| 97/97 [00:25<00:00,  3.84it/s]
Validating 6/10: 100%|██████████| 19/19 [00:03<00:00,  5.21it/s]


epoch=6 in 0.48m, loss_val=0.610, loss_train=0.011, acc_val=0.902, acc_train=0.997


Training 7/10: 100%|██████████| 97/97 [00:25<00:00,  3.81it/s]
Validating 7/10: 100%|██████████| 19/19 [00:03<00:00,  5.98it/s]


epoch=7 in 0.48m, loss_val=0.591, loss_train=0.005, acc_val=0.907, acc_train=0.998


Training 8/10: 100%|██████████| 97/97 [00:25<00:00,  3.84it/s]
Validating 8/10: 100%|██████████| 19/19 [00:03<00:00,  5.29it/s]


epoch=8 in 0.48m, loss_val=0.603, loss_train=0.003, acc_val=0.904, acc_train=0.999


Training 9/10: 100%|██████████| 97/97 [00:24<00:00,  3.89it/s]
Validating 9/10: 100%|██████████| 19/19 [00:03<00:00,  5.83it/s]


epoch=9 in 0.47m, loss_val=0.602, loss_train=0.002, acc_val=0.905, acc_train=1.000


Training 10/10: 100%|██████████| 97/97 [00:25<00:00,  3.78it/s]
Validating 10/10: 100%|██████████| 19/19 [00:03<00:00,  5.97it/s]

epoch=10 in 0.48m, loss_val=0.600, loss_train=0.001, acc_val=0.906, acc_train=1.000





In [280]:
class Classifier(nn.Module):
    def __init__(self, backbone, up_last=3, t=0, GAP=True):
        super().__init__()
        self.backbone = backbone
        self.proj = nn.Conv2d(256, 256, 16)
        self.head = nn.Linear(256, 10)

        self.up_last = up_last
        self.t = t
        self.GAP = GAP
        
    def forward(self, x):
        out = self.backbone(x, self.t, up_last=self.up_last, GAP=self.GAP)
        out = self.proj(out)
        return self.head(out.squeeze())


NAME = 'cls_cifar10_gap'
BATCH_SIZE = 512
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

backbone = DiffusionEncoder(hf_unet)
model = Classifier(backbone, up_last=1, GAP=False).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Training 1/10: 100%|██████████| 97/97 [00:28<00:00,  3.37it/s]
Validating 1/10: 100%|██████████| 19/19 [00:03<00:00,  5.80it/s]


epoch=1 in 0.54m, loss_val=37.036, loss_train=32.765, acc_val=0.868, acc_train=0.883


Training 2/10: 100%|██████████| 97/97 [00:28<00:00,  3.44it/s]
Validating 2/10: 100%|██████████| 19/19 [00:03<00:00,  5.65it/s]


epoch=2 in 0.53m, loss_val=19.870, loss_train=5.051, acc_val=0.882, acc_train=0.959


Training 3/10: 100%|██████████| 97/97 [00:28<00:00,  3.45it/s]
Validating 3/10: 100%|██████████| 19/19 [00:03<00:00,  5.25it/s]


epoch=3 in 0.53m, loss_val=14.239, loss_train=2.042, acc_val=0.892, acc_train=0.974


Training 4/10: 100%|██████████| 97/97 [00:28<00:00,  3.40it/s]
Validating 4/10: 100%|██████████| 19/19 [00:03<00:00,  5.46it/s]


epoch=4 in 0.53m, loss_val=11.739, loss_train=1.082, acc_val=0.899, acc_train=0.983


Training 5/10: 100%|██████████| 97/97 [00:27<00:00,  3.46it/s]
Validating 5/10: 100%|██████████| 19/19 [00:03<00:00,  5.81it/s]


epoch=5 in 0.52m, loss_val=11.051, loss_train=0.502, acc_val=0.900, acc_train=0.989


Training 6/10: 100%|██████████| 97/97 [00:27<00:00,  3.47it/s]
Validating 6/10: 100%|██████████| 19/19 [00:03<00:00,  5.47it/s]


epoch=6 in 0.52m, loss_val=11.450, loss_train=0.269, acc_val=0.900, acc_train=0.994


Training 7/10: 100%|██████████| 97/97 [00:28<00:00,  3.44it/s]
Validating 7/10: 100%|██████████| 19/19 [00:03<00:00,  5.95it/s]


epoch=7 in 0.52m, loss_val=10.706, loss_train=0.153, acc_val=0.901, acc_train=0.996


Training 8/10: 100%|██████████| 97/97 [00:27<00:00,  3.50it/s]
Validating 8/10: 100%|██████████| 19/19 [00:03<00:00,  4.98it/s]


epoch=8 in 0.53m, loss_val=10.546, loss_train=0.068, acc_val=0.905, acc_train=0.997


Training 9/10: 100%|██████████| 97/97 [00:28<00:00,  3.46it/s]
Validating 9/10: 100%|██████████| 19/19 [00:03<00:00,  5.59it/s]


epoch=9 in 0.53m, loss_val=10.540, loss_train=0.024, acc_val=0.906, acc_train=0.999


Training 10/10: 100%|██████████| 97/97 [00:28<00:00,  3.41it/s]
Validating 10/10: 100%|██████████| 19/19 [00:03<00:00,  5.10it/s]

epoch=10 in 0.54m, loss_val=10.520, loss_train=0.017, acc_val=0.907, acc_train=0.999





In [281]:
class Classifier(nn.Module):
    def __init__(self, backbone, up_last=3, t=0):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(256, 10)
        self.up_last = up_last
        self.t = t
        
    def forward(self, x):
        return self.head(self.backbone(x, self.t, up_last=self.up_last))


NAME = 'cls_cifar10_uplast0'
BATCH_SIZE = 512
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

backbone = DiffusionEncoder(hf_unet)
model = Classifier(backbone, up_last=0).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Training 1/10: 100%|██████████| 97/97 [00:21<00:00,  4.55it/s]
Validating 1/10: 100%|██████████| 19/19 [00:03<00:00,  5.63it/s]


epoch=1 in 0.41m, loss_val=1.415, loss_train=1.430, acc_val=0.864, acc_train=0.880


Training 2/10: 100%|██████████| 97/97 [00:21<00:00,  4.52it/s]
Validating 2/10: 100%|██████████| 19/19 [00:02<00:00,  6.74it/s]


epoch=2 in 0.41m, loss_val=1.017, loss_train=0.178, acc_val=0.881, acc_train=0.964


Training 3/10: 100%|██████████| 97/97 [00:21<00:00,  4.48it/s]
Validating 3/10: 100%|██████████| 19/19 [00:02<00:00,  7.11it/s]


epoch=3 in 0.41m, loss_val=0.888, loss_train=0.102, acc_val=0.880, acc_train=0.975


Training 4/10: 100%|██████████| 97/97 [00:21<00:00,  4.58it/s]
Validating 4/10: 100%|██████████| 19/19 [00:02<00:00,  6.42it/s]


epoch=4 in 0.40m, loss_val=0.864, loss_train=0.059, acc_val=0.892, acc_train=0.985


Training 5/10: 100%|██████████| 97/97 [00:21<00:00,  4.50it/s]
Validating 5/10: 100%|██████████| 19/19 [00:03<00:00,  6.09it/s]


epoch=5 in 0.41m, loss_val=0.880, loss_train=0.034, acc_val=0.887, acc_train=0.991


Training 6/10: 100%|██████████| 97/97 [00:21<00:00,  4.51it/s]
Validating 6/10: 100%|██████████| 19/19 [00:02<00:00,  6.97it/s]


epoch=6 in 0.40m, loss_val=0.798, loss_train=0.021, acc_val=0.897, acc_train=0.994


Training 7/10: 100%|██████████| 97/97 [00:20<00:00,  4.63it/s]
Validating 7/10: 100%|██████████| 19/19 [00:02<00:00,  6.76it/s]


epoch=7 in 0.40m, loss_val=0.796, loss_train=0.008, acc_val=0.902, acc_train=0.998


Training 8/10: 100%|██████████| 97/97 [00:21<00:00,  4.56it/s]
Validating 8/10: 100%|██████████| 19/19 [00:03<00:00,  5.35it/s]


epoch=8 in 0.41m, loss_val=0.803, loss_train=0.003, acc_val=0.901, acc_train=0.999


Training 9/10: 100%|██████████| 97/97 [00:21<00:00,  4.49it/s]
Validating 9/10: 100%|██████████| 19/19 [00:03<00:00,  6.00it/s]


epoch=9 in 0.41m, loss_val=0.800, loss_train=0.001, acc_val=0.902, acc_train=1.000


Training 10/10: 100%|██████████| 97/97 [00:21<00:00,  4.45it/s]
Validating 10/10: 100%|██████████| 19/19 [00:02<00:00,  6.70it/s]

epoch=10 in 0.41m, loss_val=0.800, loss_train=0.001, acc_val=0.903, acc_train=1.000





#### CIFAR10 noised

In [373]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=20, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar10_uplast1'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Training 1/10: 100%|██████████| 390/390 [00:33<00:00, 11.59it/s]
Validating 1/10: 100%|██████████| 78/78 [00:04<00:00, 18.29it/s]


epoch=1 in 0.63m, loss_val=0.599, loss_train=0.194, acc_val=0.869, acc_train=0.960


Training 2/10: 100%|██████████| 390/390 [00:32<00:00, 11.87it/s]
Validating 2/10: 100%|██████████| 78/78 [00:04<00:00, 18.17it/s]


epoch=2 in 0.62m, loss_val=0.616, loss_train=0.111, acc_val=0.855, acc_train=0.968


Training 3/10: 100%|██████████| 390/390 [00:34<00:00, 11.33it/s]
Validating 3/10: 100%|██████████| 78/78 [00:03<00:00, 23.21it/s]


epoch=3 in 0.63m, loss_val=0.562, loss_train=0.100, acc_val=0.865, acc_train=0.971


Training 4/10: 100%|██████████| 390/390 [00:32<00:00, 12.11it/s]
Validating 4/10: 100%|██████████| 78/78 [00:04<00:00, 19.48it/s]


epoch=4 in 0.60m, loss_val=0.614, loss_train=0.092, acc_val=0.863, acc_train=0.972


Training 5/10: 100%|██████████| 390/390 [00:32<00:00, 12.10it/s]
Validating 5/10: 100%|██████████| 78/78 [00:03<00:00, 20.12it/s]


epoch=5 in 0.60m, loss_val=0.562, loss_train=0.073, acc_val=0.874, acc_train=0.978


Training 6/10: 100%|██████████| 390/390 [00:32<00:00, 11.84it/s]
Validating 6/10: 100%|██████████| 78/78 [00:02<00:00, 26.88it/s]


epoch=6 in 0.60m, loss_val=0.465, loss_train=0.039, acc_val=0.883, acc_train=0.988


Training 7/10: 100%|██████████| 390/390 [00:33<00:00, 11.70it/s]
Validating 7/10: 100%|██████████| 78/78 [00:03<00:00, 23.25it/s]


epoch=7 in 0.61m, loss_val=0.502, loss_train=0.021, acc_val=0.886, acc_train=0.993


Training 8/10: 100%|██████████| 390/390 [00:31<00:00, 12.21it/s]
Validating 8/10: 100%|██████████| 78/78 [00:03<00:00, 25.98it/s]


epoch=8 in 0.58m, loss_val=0.461, loss_train=0.010, acc_val=0.898, acc_train=0.997


Training 9/10: 100%|██████████| 390/390 [00:33<00:00, 11.65it/s]
Validating 9/10: 100%|██████████| 78/78 [00:03<00:00, 25.22it/s]


epoch=9 in 0.61m, loss_val=0.475, loss_train=0.005, acc_val=0.899, acc_train=0.999


Training 10/10: 100%|██████████| 390/390 [00:33<00:00, 11.72it/s]
Validating 10/10: 100%|██████████| 78/78 [00:03<00:00, 21.27it/s]

epoch=10 in 0.62m, loss_val=0.464, loss_train=0.003, acc_val=0.903, acc_train=0.999





In [None]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=50, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar10_ts50' #need to rerun 
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("DanielBairamian/ddpm-cifar10-32-ema")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00, 11.10it/s]
Training 1/10: 100%|██████████| 390/390 [00:32<00:00, 11.91it/s]
Validating 1/10: 100%|██████████| 78/78 [00:04<00:00, 18.99it/s]


epoch=1 in 0.62m, loss_val=0.539, loss_train=0.533, acc_val=0.842, acc_train=0.886


Training 2/10: 100%|██████████| 390/390 [00:32<00:00, 12.03it/s]
Validating 2/10: 100%|██████████| 78/78 [00:03<00:00, 21.80it/s]


epoch=2 in 0.60m, loss_val=0.537, loss_train=0.254, acc_val=0.843, acc_train=0.922


Training 3/10: 100%|██████████| 390/390 [00:32<00:00, 12.09it/s]
Validating 3/10: 100%|██████████| 78/78 [00:03<00:00, 22.92it/s]


epoch=3 in 0.59m, loss_val=0.530, loss_train=0.209, acc_val=0.851, acc_train=0.934


Training 4/10: 100%|██████████| 390/390 [00:32<00:00, 11.93it/s]
Validating 4/10: 100%|██████████| 78/78 [00:03<00:00, 23.32it/s]


epoch=4 in 0.60m, loss_val=0.449, loss_train=0.176, acc_val=0.872, acc_train=0.945


Training 5/10: 100%|██████████| 390/390 [00:32<00:00, 11.96it/s]
Validating 5/10: 100%|██████████| 78/78 [00:03<00:00, 20.52it/s]


epoch=5 in 0.61m, loss_val=0.452, loss_train=0.120, acc_val=0.871, acc_train=0.961


Training 6/10: 100%|██████████| 390/390 [00:33<00:00, 11.78it/s]
Validating 6/10: 100%|██████████| 78/78 [00:03<00:00, 21.01it/s]


epoch=6 in 0.61m, loss_val=0.384, loss_train=0.072, acc_val=0.899, acc_train=0.977


Training 7/10: 100%|██████████| 390/390 [00:32<00:00, 11.97it/s]
Validating 7/10: 100%|██████████| 78/78 [00:03<00:00, 21.65it/s]


epoch=7 in 0.60m, loss_val=0.385, loss_train=0.043, acc_val=0.896, acc_train=0.986


Training 8/10: 100%|██████████| 390/390 [00:34<00:00, 11.19it/s]
Validating 8/10: 100%|██████████| 78/78 [00:03<00:00, 20.89it/s]


epoch=8 in 0.64m, loss_val=0.377, loss_train=0.021, acc_val=0.906, acc_train=0.993


Training 9/10: 100%|██████████| 390/390 [00:33<00:00, 11.76it/s]
Validating 9/10: 100%|██████████| 78/78 [00:03<00:00, 25.40it/s]


epoch=9 in 0.60m, loss_val=0.378, loss_train=0.009, acc_val=0.910, acc_train=0.997


Training 10/10: 100%|██████████| 390/390 [00:32<00:00, 12.10it/s]
Validating 10/10: 100%|██████████| 78/78 [00:03<00:00, 22.68it/s]

epoch=10 in 0.60m, loss_val=0.370, loss_train=0.005, acc_val=0.912, acc_train=0.999





In [None]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=20, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar10_ts20'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)


pipe = DDPMPipeline.from_pretrained("DanielBairamian/ddpm-cifar10-32-ema")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00,  9.42it/s]
Training 1/10: 100%|██████████| 390/390 [00:32<00:00, 11.92it/s]
Validating 1/10: 100%|██████████| 78/78 [00:03<00:00, 19.61it/s]


epoch=1 in 0.61m, loss_val=0.539, loss_train=0.533, acc_val=0.842, acc_train=0.886


Training 2/10: 100%|██████████| 390/390 [00:33<00:00, 11.52it/s]
Validating 2/10: 100%|██████████| 78/78 [00:04<00:00, 17.83it/s]


epoch=2 in 0.64m, loss_val=0.537, loss_train=0.254, acc_val=0.843, acc_train=0.922


Training 3/10: 100%|██████████| 390/390 [00:32<00:00, 11.84it/s]
Validating 3/10: 100%|██████████| 78/78 [00:03<00:00, 22.93it/s]


epoch=3 in 0.61m, loss_val=0.530, loss_train=0.209, acc_val=0.851, acc_train=0.934


Training 4/10: 100%|██████████| 390/390 [00:31<00:00, 12.19it/s]
Validating 4/10: 100%|██████████| 78/78 [00:03<00:00, 21.21it/s]


epoch=4 in 0.60m, loss_val=0.449, loss_train=0.176, acc_val=0.872, acc_train=0.945


Training 5/10: 100%|██████████| 390/390 [00:33<00:00, 11.76it/s]
Validating 5/10: 100%|██████████| 78/78 [00:03<00:00, 23.53it/s]


epoch=5 in 0.61m, loss_val=0.452, loss_train=0.120, acc_val=0.871, acc_train=0.961


Training 6/10: 100%|██████████| 390/390 [00:32<00:00, 11.96it/s]
Validating 6/10: 100%|██████████| 78/78 [00:03<00:00, 23.64it/s]


epoch=6 in 0.60m, loss_val=0.384, loss_train=0.072, acc_val=0.899, acc_train=0.977


Training 7/10: 100%|██████████| 390/390 [00:32<00:00, 12.00it/s]
Validating 7/10: 100%|██████████| 78/78 [00:03<00:00, 21.86it/s]


epoch=7 in 0.60m, loss_val=0.385, loss_train=0.043, acc_val=0.896, acc_train=0.986


Training 8/10: 100%|██████████| 390/390 [00:32<00:00, 11.90it/s]
Validating 8/10: 100%|██████████| 78/78 [00:03<00:00, 23.72it/s]


epoch=8 in 0.60m, loss_val=0.377, loss_train=0.021, acc_val=0.906, acc_train=0.993


Training 9/10: 100%|██████████| 390/390 [00:32<00:00, 11.91it/s]
Validating 9/10: 100%|██████████| 78/78 [00:03<00:00, 20.97it/s]


epoch=9 in 0.61m, loss_val=0.378, loss_train=0.009, acc_val=0.910, acc_train=0.997


Training 10/10: 100%|██████████| 390/390 [00:32<00:00, 12.14it/s]
Validating 10/10: 100%|██████████| 78/78 [00:04<00:00, 16.77it/s]

epoch=10 in 0.61m, loss_val=0.370, loss_train=0.005, acc_val=0.912, acc_train=0.999





In [386]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=20, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar10_ts20_g'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00, 21.63it/s]
Training 1/10: 100%|██████████| 390/390 [00:33<00:00, 11.68it/s]
Validating 1/10: 100%|██████████| 78/78 [00:04<00:00, 18.56it/s]


epoch=1 in 0.63m, loss_val=0.533, loss_train=0.606, acc_val=0.851, acc_train=0.881


Training 2/10: 100%|██████████| 390/390 [00:33<00:00, 11.64it/s]
Validating 2/10: 100%|██████████| 78/78 [00:03<00:00, 22.67it/s]


epoch=2 in 0.62m, loss_val=0.501, loss_train=0.231, acc_val=0.857, acc_train=0.929


Training 3/10: 100%|██████████| 390/390 [00:32<00:00, 11.82it/s]
Validating 3/10: 100%|██████████| 78/78 [00:03<00:00, 21.48it/s]


epoch=3 in 0.61m, loss_val=0.525, loss_train=0.199, acc_val=0.849, acc_train=0.938


Training 4/10: 100%|██████████| 390/390 [00:33<00:00, 11.51it/s]
Validating 4/10: 100%|██████████| 78/78 [00:03<00:00, 20.12it/s]


epoch=4 in 0.63m, loss_val=0.459, loss_train=0.169, acc_val=0.869, acc_train=0.947


Training 5/10: 100%|██████████| 390/390 [00:33<00:00, 11.69it/s]
Validating 5/10: 100%|██████████| 78/78 [00:03<00:00, 21.16it/s]


epoch=5 in 0.62m, loss_val=0.436, loss_train=0.116, acc_val=0.879, acc_train=0.962


Training 6/10: 100%|██████████| 390/390 [00:32<00:00, 11.84it/s]
Validating 6/10: 100%|██████████| 78/78 [00:03<00:00, 22.42it/s]


epoch=6 in 0.61m, loss_val=0.412, loss_train=0.072, acc_val=0.891, acc_train=0.977


Training 7/10: 100%|██████████| 390/390 [00:32<00:00, 11.85it/s]
Validating 7/10: 100%|██████████| 78/78 [00:03<00:00, 23.14it/s]


epoch=7 in 0.61m, loss_val=0.438, loss_train=0.040, acc_val=0.892, acc_train=0.987


Training 8/10: 100%|██████████| 390/390 [00:31<00:00, 12.21it/s]
Validating 8/10: 100%|██████████| 78/78 [00:03<00:00, 20.98it/s]


epoch=8 in 0.59m, loss_val=0.392, loss_train=0.019, acc_val=0.904, acc_train=0.994


Training 9/10: 100%|██████████| 390/390 [00:32<00:00, 12.14it/s]
Validating 9/10: 100%|██████████| 78/78 [00:03<00:00, 21.14it/s]


epoch=9 in 0.60m, loss_val=0.404, loss_train=0.008, acc_val=0.905, acc_train=0.997


Training 10/10: 100%|██████████| 390/390 [00:32<00:00, 11.97it/s]
Validating 10/10: 100%|██████████| 78/78 [00:03<00:00, 23.74it/s]

epoch=10 in 0.60m, loss_val=0.398, loss_train=0.004, acc_val=0.907, acc_train=0.999





In [387]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=50, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar10_ts50_g'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00, 22.43it/s]
Training 1/10: 100%|██████████| 390/390 [00:32<00:00, 12.01it/s]
Validating 1/10: 100%|██████████| 78/78 [00:03<00:00, 22.28it/s]


epoch=1 in 0.60m, loss_val=0.583, loss_train=0.594, acc_val=0.840, acc_train=0.884


Training 2/10: 100%|██████████| 390/390 [00:32<00:00, 12.05it/s]
Validating 2/10: 100%|██████████| 78/78 [00:03<00:00, 22.31it/s]


epoch=2 in 0.60m, loss_val=0.537, loss_train=0.253, acc_val=0.842, acc_train=0.923


Training 3/10: 100%|██████████| 390/390 [00:32<00:00, 11.84it/s]
Validating 3/10: 100%|██████████| 78/78 [00:03<00:00, 23.01it/s]


epoch=3 in 0.61m, loss_val=0.492, loss_train=0.221, acc_val=0.859, acc_train=0.931


Training 4/10: 100%|██████████| 390/390 [00:32<00:00, 12.04it/s]
Validating 4/10: 100%|██████████| 78/78 [00:03<00:00, 21.39it/s]


epoch=4 in 0.60m, loss_val=0.526, loss_train=0.177, acc_val=0.859, acc_train=0.944


Training 5/10: 100%|██████████| 390/390 [00:32<00:00, 11.85it/s]
Validating 5/10: 100%|██████████| 78/78 [00:03<00:00, 21.14it/s]


epoch=5 in 0.61m, loss_val=0.412, loss_train=0.131, acc_val=0.876, acc_train=0.959


Training 6/10: 100%|██████████| 390/390 [00:32<00:00, 12.00it/s]
Validating 6/10: 100%|██████████| 78/78 [00:03<00:00, 23.81it/s]


epoch=6 in 0.60m, loss_val=0.418, loss_train=0.080, acc_val=0.883, acc_train=0.974


Training 7/10: 100%|██████████| 390/390 [00:32<00:00, 11.95it/s]
Validating 7/10: 100%|██████████| 78/78 [00:03<00:00, 21.39it/s]


epoch=7 in 0.61m, loss_val=0.438, loss_train=0.043, acc_val=0.885, acc_train=0.987


Training 8/10: 100%|██████████| 390/390 [00:33<00:00, 11.72it/s]
Validating 8/10: 100%|██████████| 78/78 [00:04<00:00, 18.54it/s]


epoch=8 in 0.63m, loss_val=0.439, loss_train=0.019, acc_val=0.899, acc_train=0.994


Training 9/10: 100%|██████████| 390/390 [00:32<00:00, 12.12it/s]
Validating 9/10: 100%|██████████| 78/78 [00:03<00:00, 23.03it/s]


epoch=9 in 0.59m, loss_val=0.419, loss_train=0.010, acc_val=0.903, acc_train=0.997


Training 10/10: 100%|██████████| 390/390 [00:32<00:00, 11.87it/s]
Validating 10/10: 100%|██████████| 78/78 [00:03<00:00, 21.95it/s]

epoch=10 in 0.61m, loss_val=0.423, loss_train=0.004, acc_val=0.906, acc_train=0.999





In [388]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=20, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'noised_cls_cifar10_ts20_g_adam'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00, 22.81it/s]
Training 1/10: 100%|██████████| 390/390 [00:31<00:00, 12.28it/s]
Validating 1/10: 100%|██████████| 78/78 [00:03<00:00, 24.21it/s]


epoch=1 in 0.58m, loss_val=0.522, loss_train=0.609, acc_val=0.850, acc_train=0.881


Training 2/10: 100%|██████████| 390/390 [00:32<00:00, 11.96it/s]
Validating 2/10: 100%|██████████| 78/78 [00:03<00:00, 23.92it/s]


epoch=2 in 0.60m, loss_val=0.603, loss_train=0.237, acc_val=0.846, acc_train=0.927


Training 3/10: 100%|██████████| 390/390 [00:33<00:00, 11.77it/s]
Validating 3/10: 100%|██████████| 78/78 [00:03<00:00, 25.49it/s]


epoch=3 in 0.60m, loss_val=0.491, loss_train=0.210, acc_val=0.862, acc_train=0.934


Training 4/10: 100%|██████████| 390/390 [00:33<00:00, 11.79it/s]
Validating 4/10: 100%|██████████| 78/78 [00:04<00:00, 18.56it/s]


epoch=4 in 0.62m, loss_val=0.477, loss_train=0.161, acc_val=0.866, acc_train=0.948


Training 5/10: 100%|██████████| 390/390 [00:31<00:00, 12.19it/s]
Validating 5/10: 100%|██████████| 78/78 [00:03<00:00, 22.54it/s]


epoch=5 in 0.59m, loss_val=0.456, loss_train=0.118, acc_val=0.880, acc_train=0.962


Training 6/10: 100%|██████████| 390/390 [00:33<00:00, 11.73it/s]
Validating 6/10: 100%|██████████| 78/78 [00:03<00:00, 22.60it/s]


epoch=6 in 0.61m, loss_val=0.425, loss_train=0.073, acc_val=0.885, acc_train=0.977


Training 7/10: 100%|██████████| 390/390 [00:32<00:00, 11.95it/s]
Validating 7/10: 100%|██████████| 78/78 [00:03<00:00, 23.22it/s]


epoch=7 in 0.60m, loss_val=0.408, loss_train=0.040, acc_val=0.898, acc_train=0.987


Training 8/10: 100%|██████████| 390/390 [00:31<00:00, 12.27it/s]
Validating 8/10: 100%|██████████| 78/78 [00:03<00:00, 24.57it/s]


epoch=8 in 0.58m, loss_val=0.378, loss_train=0.021, acc_val=0.906, acc_train=0.994


Training 9/10: 100%|██████████| 390/390 [00:32<00:00, 12.15it/s]
Validating 9/10: 100%|██████████| 78/78 [00:03<00:00, 20.49it/s]


epoch=9 in 0.60m, loss_val=0.391, loss_train=0.008, acc_val=0.909, acc_train=0.997


Training 10/10: 100%|██████████| 390/390 [00:32<00:00, 12.07it/s]
Validating 10/10: 100%|██████████| 78/78 [00:03<00:00, 25.23it/s]

epoch=10 in 0.59m, loss_val=0.373, loss_train=0.005, acc_val=0.913, acc_train=0.999





In [390]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=20, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'ema_noised_cls_cifar10_ts20_adam'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("DanielBairamian/ddpm-cifar10-32-ema")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00,  4.13it/s]
Training 1/10: 100%|██████████| 390/390 [00:32<00:00, 11.89it/s]
Validating 1/10: 100%|██████████| 78/78 [00:03<00:00, 20.70it/s]


epoch=1 in 0.61m, loss_val=0.526, loss_train=0.527, acc_val=0.847, acc_train=0.888


Training 2/10: 100%|██████████| 390/390 [00:32<00:00, 11.85it/s]
Validating 2/10: 100%|██████████| 78/78 [00:03<00:00, 23.23it/s]


epoch=2 in 0.61m, loss_val=0.550, loss_train=0.253, acc_val=0.835, acc_train=0.922


Training 3/10: 100%|██████████| 390/390 [00:32<00:00, 12.09it/s]
Validating 3/10: 100%|██████████| 78/78 [00:02<00:00, 26.90it/s]


epoch=3 in 0.59m, loss_val=0.526, loss_train=0.228, acc_val=0.850, acc_train=0.929


Training 4/10: 100%|██████████| 390/390 [00:31<00:00, 12.22it/s]
Validating 4/10: 100%|██████████| 78/78 [00:03<00:00, 20.34it/s]


epoch=4 in 0.60m, loss_val=0.472, loss_train=0.165, acc_val=0.863, acc_train=0.947


Training 5/10: 100%|██████████| 390/390 [00:32<00:00, 11.87it/s]
Validating 5/10: 100%|██████████| 78/78 [00:03<00:00, 24.32it/s]


epoch=5 in 0.60m, loss_val=0.441, loss_train=0.121, acc_val=0.875, acc_train=0.961


Training 6/10: 100%|██████████| 390/390 [00:32<00:00, 11.97it/s]
Validating 6/10: 100%|██████████| 78/78 [00:03<00:00, 25.24it/s]


epoch=6 in 0.60m, loss_val=0.430, loss_train=0.079, acc_val=0.888, acc_train=0.975


Training 7/10: 100%|██████████| 390/390 [00:31<00:00, 12.22it/s]
Validating 7/10: 100%|██████████| 78/78 [00:03<00:00, 20.90it/s]


epoch=7 in 0.60m, loss_val=0.365, loss_train=0.046, acc_val=0.902, acc_train=0.985


Training 8/10: 100%|██████████| 390/390 [00:33<00:00, 11.78it/s]
Validating 8/10: 100%|██████████| 78/78 [00:03<00:00, 20.79it/s]


epoch=8 in 0.62m, loss_val=0.398, loss_train=0.020, acc_val=0.905, acc_train=0.994


Training 9/10: 100%|██████████| 390/390 [00:32<00:00, 12.05it/s]
Validating 9/10: 100%|██████████| 78/78 [00:03<00:00, 20.10it/s]


epoch=9 in 0.60m, loss_val=0.412, loss_train=0.009, acc_val=0.909, acc_train=0.997


Training 10/10: 100%|██████████| 390/390 [00:33<00:00, 11.69it/s]
Validating 10/10: 100%|██████████| 78/78 [00:04<00:00, 17.91it/s]

epoch=10 in 0.63m, loss_val=0.395, loss_train=0.004, acc_val=0.912, acc_train=0.999





In [391]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=50, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'ema_noised_cls_cifar10_ts50_ep15'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 15
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("DanielBairamian/ddpm-cifar10-32-ema")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00, 11.06it/s]
Training 1/15: 100%|██████████| 390/390 [00:32<00:00, 12.04it/s]
Validating 1/15: 100%|██████████| 78/78 [00:03<00:00, 22.66it/s]


epoch=1 in 0.60m, loss_val=0.605, loss_train=0.507, acc_val=0.826, acc_train=0.889


Training 2/15: 100%|██████████| 390/390 [00:33<00:00, 11.51it/s]
Validating 2/15: 100%|██████████| 78/78 [00:03<00:00, 21.46it/s]


epoch=2 in 0.63m, loss_val=0.673, loss_train=0.275, acc_val=0.810, acc_train=0.916


Training 3/15: 100%|██████████| 390/390 [00:33<00:00, 11.75it/s]
Validating 3/15: 100%|██████████| 78/78 [00:03<00:00, 22.74it/s]


epoch=3 in 0.61m, loss_val=0.526, loss_train=0.246, acc_val=0.850, acc_train=0.924


Training 4/15: 100%|██████████| 390/390 [00:33<00:00, 11.63it/s]
Validating 4/15: 100%|██████████| 78/78 [00:03<00:00, 23.49it/s]


epoch=4 in 0.61m, loss_val=0.508, loss_train=0.205, acc_val=0.852, acc_train=0.937


Training 5/15: 100%|██████████| 390/390 [00:34<00:00, 11.41it/s]
Validating 5/15: 100%|██████████| 78/78 [00:02<00:00, 26.43it/s]


epoch=5 in 0.62m, loss_val=0.499, loss_train=0.172, acc_val=0.861, acc_train=0.945


Training 6/15: 100%|██████████| 390/390 [00:32<00:00, 12.12it/s]
Validating 6/15: 100%|██████████| 78/78 [00:02<00:00, 26.65it/s]


epoch=6 in 0.59m, loss_val=0.420, loss_train=0.146, acc_val=0.873, acc_train=0.954


Training 7/15: 100%|██████████| 390/390 [00:31<00:00, 12.37it/s]
Validating 7/15: 100%|██████████| 78/78 [00:03<00:00, 22.36it/s]


epoch=7 in 0.58m, loss_val=0.446, loss_train=0.120, acc_val=0.878, acc_train=0.961


Training 8/15: 100%|██████████| 390/390 [00:32<00:00, 11.82it/s]
Validating 8/15: 100%|██████████| 78/78 [00:03<00:00, 23.03it/s]


epoch=8 in 0.61m, loss_val=0.440, loss_train=0.085, acc_val=0.888, acc_train=0.973


Training 9/15: 100%|██████████| 390/390 [00:31<00:00, 12.22it/s]
Validating 9/15: 100%|██████████| 78/78 [00:02<00:00, 28.98it/s]


epoch=9 in 0.58m, loss_val=0.431, loss_train=0.057, acc_val=0.888, acc_train=0.981


Training 10/15: 100%|██████████| 390/390 [00:32<00:00, 12.07it/s]
Validating 10/15: 100%|██████████| 78/78 [00:03<00:00, 22.62it/s]


epoch=10 in 0.60m, loss_val=0.423, loss_train=0.034, acc_val=0.895, acc_train=0.989


Training 11/15: 100%|██████████| 390/390 [00:31<00:00, 12.26it/s]
Validating 11/15: 100%|██████████| 78/78 [00:03<00:00, 22.12it/s]


epoch=11 in 0.59m, loss_val=0.428, loss_train=0.019, acc_val=0.905, acc_train=0.994


Training 12/15: 100%|██████████| 390/390 [00:32<00:00, 12.08it/s]
Validating 12/15: 100%|██████████| 78/78 [00:03<00:00, 21.78it/s]


epoch=12 in 0.60m, loss_val=0.449, loss_train=0.010, acc_val=0.906, acc_train=0.997


Training 13/15: 100%|██████████| 390/390 [00:31<00:00, 12.19it/s]
Validating 13/15: 100%|██████████| 78/78 [00:03<00:00, 22.91it/s]


epoch=13 in 0.59m, loss_val=0.440, loss_train=0.005, acc_val=0.909, acc_train=0.999


Training 14/15: 100%|██████████| 390/390 [00:33<00:00, 11.50it/s]
Validating 14/15: 100%|██████████| 78/78 [00:04<00:00, 18.72it/s]


epoch=14 in 0.64m, loss_val=0.457, loss_train=0.002, acc_val=0.911, acc_train=0.999


Training 15/15: 100%|██████████| 390/390 [00:32<00:00, 12.10it/s]
Validating 15/15: 100%|██████████| 78/78 [00:04<00:00, 19.43it/s]

epoch=15 in 0.60m, loss_val=0.456, loss_train=0.001, acc_val=0.911, acc_train=1.000





In [393]:
class ClassifierNoised(nn.Module):
    def __init__(self, backbone, scheduler, up_last=3):
        super().__init__()
        self.backbone = backbone
        self.scheduler = scheduler
        self.head = nn.Linear(256, 100)
        self.up_last = up_last
        
    def forward(self, x):
        t = torch.randint(low=0, high=10, size=(x.shape[0],), device=x.device, dtype=torch.int64)
        noise = torch.randn(x.shape, device=x.device)
        noised = self.scheduler.add_noise(x, noise, t)
        out = self.backbone(noised, t, up_last=self.up_last)
        return self.head(out)

NAME = 'ema_noised_cls_cifar10_ts10'
BATCH_SIZE = 128
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

set_deterministic()

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar10', horizontal_flip=True)

pipe = DDPMPipeline.from_pretrained("DanielBairamian/ddpm-cifar10-32-ema")
hf_unet, hf_scheduler = pipe.unet, pipe.scheduler
backbone = DiffusionEncoder(hf_unet)
model = ClassifierNoised(backbone, hf_scheduler, up_last=1).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]An error occurred while trying to fetch /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet: Error no file named diffusion_pytorch_model.safetensors found in directory /home/jovyan/.cache/huggingface/hub/models--DanielBairamian--ddpm-cifar10-32-ema/snapshots/d9caa7c75cd561fed983fa71979309a8435b20f5/unet.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.
Loading pipeline components...: 100%|██████████| 2/2 [00:00<00:00, 10.29it/s]


epoch=1 in 0.61m, loss_val=0.568, loss_train=0.533, acc_val=0.840, acc_train=0.888
epoch=2 in 0.60m, loss_val=0.543, loss_train=0.267, acc_val=0.842, acc_train=0.920
epoch=3 in 0.58m, loss_val=0.482, loss_train=0.223, acc_val=0.863, acc_train=0.931
epoch=4 in 0.60m, loss_val=0.497, loss_train=0.165, acc_val=0.874, acc_train=0.948
epoch=5 in 0.60m, loss_val=0.421, loss_train=0.125, acc_val=0.882, acc_train=0.961
epoch=6 in 0.60m, loss_val=0.388, loss_train=0.078, acc_val=0.889, acc_train=0.974
epoch=7 in 0.59m, loss_val=0.404, loss_train=0.042, acc_val=0.895, acc_train=0.986
epoch=8 in 0.60m, loss_val=0.375, loss_train=0.020, acc_val=0.910, acc_train=0.994
epoch=9 in 0.59m, loss_val=0.383, loss_train=0.008, acc_val=0.916, acc_train=0.998
epoch=10 in 0.59m, loss_val=0.386, loss_train=0.004, acc_val=0.916, acc_train=0.999


#### resnet50

In [398]:
transform_test = tt.Compose([tt.ToTensor(), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225])])
transform_train = tt.Compose([tt.ToTensor(), tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.228, 0.224, 0.225]), tt.RandomHorizontalFlip()])

dataset_train = torchvision.datasets.CIFAR10(root='./datasets/cifar10', train=True, download=True, transform=transform_train)
dataset_test = torchvision.datasets.CIFAR10(root='./datasets/cifar10', train=False, download=True, transform=transform_test)
train_loader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=4)
test_loader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=4)

Files already downloaded and verified
Files already downloaded and verified


In [438]:
# dataset_train = torchvision.datasets.CelebA(root='./datasets/celeba', split='train', download=True)
# dataset_test = torchvision.datasets.CelebA(root='./datasets/celeba', split='val', download=True)

In [15]:
def load_mocov2(model, pretrained):
    checkpoint = torch.load(pretrained, map_location="cpu")

    state_dict = checkpoint["state_dict"]
    for k in list(state_dict.keys()):
        if k.startswith("module.encoder_q") and not k.startswith(
            "module.encoder_q.fc"
        ):
            state_dict[k[len("module.encoder_q.") :]] = state_dict[k]
        del state_dict[k]

    msg = model.load_state_dict(state_dict, strict=False)
    assert set(msg.missing_keys) == {"fc.weight", "fc.bias"}
    print("=> loaded pre-trained model '{}'".format(pretrained))
    model.fc=nn.Identity()

In [410]:
class Classifier(nn.Module):
    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(2048, 10)
        
    def forward(self, x):
        return self.head(self.backbone(x))


NAME = 'cls_mocov2'
BATCH_SIZE = 512
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

backbone = torchvision.models.resnet50(weights=None)
load_mocov2(backbone, './moco_v2_800ep_pretrain.pth.tar')
model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
model.maxpool = nn.Identity()
model = Classifier(backbone).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

  checkpoint = torch.load(pretrained, map_location="cpu")


=> loaded pre-trained model './moco_v2_800ep_pretrain.pth.tar'
epoch=1 in 0.39m, loss_val=1.162, loss_train=1.282, acc_val=0.599, acc_train=0.533
epoch=2 in 0.38m, loss_val=0.863, loss_train=0.844, acc_val=0.699, acc_train=0.704
epoch=3 in 0.38m, loss_val=0.768, loss_train=0.690, acc_val=0.736, acc_train=0.761
epoch=4 in 0.36m, loss_val=0.638, loss_train=0.586, acc_val=0.782, acc_train=0.797
epoch=5 in 0.35m, loss_val=0.605, loss_train=0.489, acc_val=0.798, acc_train=0.833
epoch=6 in 0.36m, loss_val=0.574, loss_train=0.401, acc_val=0.804, acc_train=0.863
epoch=7 in 0.36m, loss_val=0.523, loss_train=0.317, acc_val=0.828, acc_train=0.893
epoch=8 in 0.36m, loss_val=0.515, loss_train=0.237, acc_val=0.839, acc_train=0.920
epoch=9 in 0.36m, loss_val=0.526, loss_train=0.175, acc_val=0.843, acc_train=0.944
epoch=10 in 0.34m, loss_val=0.531, loss_train=0.135, acc_val=0.846, acc_train=0.958


In [397]:
print_parameters(encoder)

model initialized with trainable params: 23,508,032 || total params: 23,508,032 || trainable%: 100.000


In [19]:
class Classifier(nn.Module):
    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(2048, 10)
        
    def forward(self, x):
        return self.head(self.backbone(x))


NAME = 'cls_mocov2'
BATCH_SIZE = 512
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

backbone = torchvision.models.resnet50(weights=None)
load_mocov2(backbone, './moco_v2_800ep_pretrain.pth.tar')
model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
model.maxpool = nn.Identity()
model = Classifier(backbone).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

  checkpoint = torch.load(pretrained, map_location="cpu")


=> loaded pre-trained model './moco_v2_800ep_pretrain.pth.tar'


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [18]:
class Classifier(nn.Module):
    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(2048, 10)
        
    def forward(self, x):
        return self.head(self.backbone(x))


NAME = 'cls_mocov2_cifar100'
BATCH_SIZE = 512
LR = 1e-3
TRAIN_EPOCH = 10
FREQ_SAVE = 100
DEVICE = 'cuda'

train_loader, test_loader = get_loaders(BATCH_SIZE, dataset='cifar100', horizontal_flip=True)

backbone = torchvision.models.resnet50(weights=None)
load_mocov2(backbone, './moco_v2_800ep_pretrain.pth.tar')
model = Classifier(backbone).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=TRAIN_EPOCH)

scaler = torch.amp.GradScaler()

writer = SummaryWriter(f'./tensorboard/{NAME}')

train_fp16(writer, model, optimizer, scheduler, criterion, 
    train_loader, test_loader, TRAIN_EPOCH, FREQ_SAVE,
    save_path=f'./checkpoints/{NAME}/', scaler=scaler, device=DEVICE)

Files already downloaded and verified
Files already downloaded and verified


  checkpoint = torch.load(pretrained, map_location="cpu")


=> loaded pre-trained model './moco_v2_800ep_pretrain.pth.tar'


../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [6,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_f

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
