In [2]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.nn.functional as F
import torchvision.utils as vutils
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models


import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

%matplotlib inline
%config InlineBackend.figure_format = "retina"

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [3]:
# ---- Training ----
num_epochs = 40
batch_size = 64
lr = 1e-4

# ---- Sample previews ----
# NOTE(review): neither name is used in the visible cells; presumably the number
# of images to sample and the row count of the preview grid -- confirm downstream.
num_samples = 64
num_grid_rows = 8

# ---- Diffusion ----
# NOTE(review): presumably T is the number of diffusion timesteps and
# beta_start / beta_end are the endpoints of the beta (noise) schedule -- confirm.
T = 1000
beta_start, beta_end = 1e-4, 0.02

# ---- Model / image shape ----
image_size = 32  # side length the preprocessing transform resizes and crops to
nc = 3           # number of per-channel statistics passed to Normalize

In [4]:
# Preprocessing pipeline: resize, centre-crop to image_size x image_size,
# convert to a tensor, then normalise each of the nc channels with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(image_size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize([0.5] * nc, [0.5] * nc),
    ]
)

In [7]:
# CelebA images are read from ../CelebA through ImageFolder and passed through
# the shared preprocessing `transform`; the loader reshuffles every epoch.
celebADataset = datasets.ImageFolder(root="../CelebA", transform=transform)
celebALoader = DataLoader(
    celebADataset,
    batch_size=batch_size,
    shuffle=True,
)

# MNIST digits are single-channel, but the shared `transform` normalises with
# `nc` per-channel statistics. With nc = 3, Normalize raises a broadcast-shape
# RuntimeError on a 1-channel tensor as soon as the loader is iterated.
# Expanding the grayscale image to `nc` channels first avoids that.
# torchvision's Grayscale only accepts 1 or 3 output channels; with nc = 1 it
# leaves MNIST images unchanged.
mnist_transform = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=nc),
        transform,
    ]
)

mnist_train = datasets.MNIST(root="../MNIST", train=True, transform=mnist_transform, download=True)
mnist_test = datasets.MNIST(root="../MNIST", train=False, transform=mnist_transform, download=True)

# Adding two datasets concatenates them, so the loader draws shuffled batches
# from the train and test splits together.
mnist_combined_loader = DataLoader(
    dataset=mnist_train + mnist_test,
    batch_size=batch_size,
    shuffle=True,
)

## **Latent Diffusion Models**

### The idea is to train the diffusion model on a low-dimensional latent representation rather than on the full, high-dimensional pixel space. In addition, we train an encoder-decoder model: the encoder converts the original image into the latent representation, and the decoder converts the latent representation back into a reconstructed image.

<p align="center">
<img src="./Media/Latent2.png" style="width:60%;border:0;" alt = "image">
</p>

### The downside is that, although the $L_1$/$L_2$ reconstruction loss might be low, the perceptual features in the reconstructed image may still be **fuzzy**.

## Discretizing the Latent Space using the $\text{CodeBooks}$ from $\text{VQVAEs}$
### $\text{VQVAE}$ as the $\text{AutoEncoder}$

A codebook of $k$ vectors, each of $d$ dimensions (a $k \times d$ matrix), is used to encode the data.

<p align="center">
<img src="./Media/VQVAE1.png" style="width:70%;border:0;" alt = "image">
</p>

The encoder generates a feature map of $H \times W$ features, each of dimension $d$.
<p align="center">
<img src="./Media/VQVAE2.png" style="width:70%;border:0;" alt = "image">
</p>

For each feature, we find the nearest $d$-dimensional codebook vector and replace the feature with it.

$$ z_q(x) = e_k $$
$$ k = \operatorname*{arg\,min}_j \lVert z_e(x) - e_j \rVert_2 $$

<p align="center">
<img src="./Media/VQVAE3.png" style="width:70%;border:0;" alt = "image">
</p>

The decoder then discards the feature map produced by the encoder and uses only the feature map of nearest codebook vectors to reconstruct the output image.
<p align="center">
<img src="./Media/VQVAE4.png" style="width:70%;border:0;" alt = "image">
</p>

The issue is that the $\operatorname{arg\,min}$ step is not differentiable, so we have to define its gradient separately for gradients to flow back to the encoder. We approximate the gradient in the style of the straight-through estimator: the gradients are simply copied from the decoder input $z_q(x)$ to the encoder output $z_e(x)$.
<p align="center">
<img src="./Media/VQVAE5.png" style="width:70%;border:0;" alt = "image">
</p>

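$$ L = \log p(x \mid z_q(x)) + \lVert \text{sg}[z_e(x)] - e \rVert_2^2 + \beta \lVert z_e(x) - \text{sg}[e] \rVert_2^2 $$

Here $\text{sg}[\cdot]$ is the stop-gradient operator: it is the identity in the forward pass and has zero gradient in the backward pass. The three terms are the reconstruction loss, the codebook loss (which moves the codebook vectors towards the encoder outputs), and the commitment loss, weighted by $\beta$ (which keeps the encoder outputs close to their chosen codebook vectors).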

## Perceptual Retention & $\text{LPIPS}$ as the metric

<p align="center">
<img src="./Media/Percept.png" style="width:80%;border:0;" alt = "image">
</p>

<p align="center">
<img src="./Media/Latent3.png" style="width:70%;border:0;" alt = "image">
</p>

<p align="center">
<img src="./Media/Latent4.png" style="width:70%;border:0;" alt = "image">
</p>
