In [1]:
from imutils import paths
import cv2
import os
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

import torch.autograd as autograd

In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import torch
device = torch.device("cuda")

In [4]:
# image_paths = list(paths.list_images('data/Caltech101/001'))
# image_paths = list(paths.list_images('data/cars_side-view'))
image_paths = list(paths.list_images('data/Caltech101/016'))

In [5]:
data = []
labels = []
for img_path in tqdm(image_paths):
    label = img_path.split(os.path.sep)[-2]
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    data.append(img)
    labels.append(label)
    if len(labels) > 5000:
        break
    
data = np.array(data)
labels = np.array(labels)

100%|██████████| 123/123 [00:00<00:00, 2736.52it/s]


In [6]:

lb = LabelEncoder()
labels = lb.fit_transform(labels)
print(f"Total Number of Classes: {len(lb.classes_)}")

Total Number of Classes: 1


In [7]:
Counter(labels)

Counter({0: 123})

In [8]:
from sklearn.model_selection import train_test_split
# divide the data into train and test set
(x_train, x_test, y_train, y_test) = train_test_split(data, labels, test_size=0.1, stratify=labels, random_state=42)
print(f"x_train examples: {x_train.shape}\nx_test examples: {x_test.shape}")

x_train examples: (110, 197, 300, 3)
x_test examples: (13, 197, 300, 3)


In [9]:
dataset_config = {'size': 64, 'channels': 3, 'classes': 1}

In [10]:
train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((dataset_config['size'], dataset_config['size'])),
    transforms.ToTensor(),
    #transforms.Normalize(mean = [0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

val_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((dataset_config['size'],dataset_config['size'])),
    transforms.ToTensor(),
    #transforms.Normalize(mean = [0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])    

In [11]:
BS = 32
train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('../data', train=True, download=True,
                   transform=transforms.ToTensor()),
    batch_size=BS, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('../data', train=False, transform=transforms.ToTensor()),
    batch_size=BS, shuffle=True)


Files already downloaded and verified


In [12]:
BS = 32
# custom dataset class
class CustomDataset(Dataset):
    def __init__(self, images, labels= None, transforms = None):
        self.labels = labels
        self.images = images
        self.transforms = transforms
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, index):
        data = self.images[index][:]
        
        if self.transforms:
            data = self.transforms(data)
            
        
        return (data, self.labels[index])
        
train_data = CustomDataset(x_train, y_train, train_transforms)
test_data = CustomDataset(x_test, y_test, val_transform)       

train_loader = DataLoader(train_data, batch_size=BS, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=BS, shuffle=True, num_workers=4, drop_last=False) 

### --- Main --

In [13]:
from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable

from torchvision.utils import save_image

def gradients(y, x):
    return autograd.grad(
                outputs=y, inputs=x, retain_graph=True,
                create_graph=True, grad_outputs=torch.ones_like(y), only_inputs=True)[0]

In [14]:
class HWReduction(nn.Module):
    def forward(self, x):
        # x -> [B, C, H, W]
        return x.mean(dim=(-1, -2))

class Reshape(nn.Module):
    def __init__(self, shape: list):
        super(Reshape, self).__init__()
        self.shape = shape
    def forward(self, x):
        batch_size = x.shape[0]
        return x.reshape([batch_size] + self.shape)

class VAE_Cifar10(nn.Module):
    def __init__(self, label = 'cifar10', image_size = dataset_config['size'],
                 channel_num = dataset_config['channels'],
                 z_size=128):
        # configurations
        super().__init__()
        self.label = label
        self.image_size = image_size
        self.channel_num = channel_num
        self.z_size = z_size

        # encoder
        self.encoder = nn.Sequential(
            self.capacity_conv(channel_num, 16), # 16 x 64 x 64
            nn.InstanceNorm2d(16),
            self.downsampling_conv(16, 32), # 32 x 32 x 32
            self.capacity_conv(32, 64), # 64 x 32 x 32
            self.downsampling_conv(64, 128), # 128 x 16 x 16
            self.capacity_conv(128, 256), # 256 x 16 x 16
            self.downsampling_conv(256, 512), # 512 x 8 x 8
            HWReduction(),
        )

        # H, W will be reduced


        # q
        self.q_mean = self._linear(512, z_size, relu=False)
        self.q_logvar = self._linear(512, z_size, relu=False)

        # projection
        self.project = nn.Sequential(
            self._linear(z_size, 1024),
            self._linear(1024, 8 * 8 * 128),
            Reshape([128, 8, 8])
        )

        # decoder
        self.decoder = nn.Sequential(
            self.upsampling_conv(128, 64), # 32 x 16 x 16
            self.capacity_conv(64, 64), # 64 x 16 x 16
            self.upsampling_conv(64, 32), # 32 x 32 x 32
            self.capacity_conv(32, 32), # 32 x 32 x 32
            self.upsampling_conv(32, 16), # 16 x 64 x 64
            nn.Conv2d(
                16, channel_num,
                kernel_size=3, stride=1, padding=1,
            ),
            nn.Sigmoid()
        )


    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std


    def forward(self, x):
        encoded = self.encoder(x)

        # sample latent code z from q given x.
        mean, logvar = self.q_mean(encoded), self.q_logvar(encoded)
        z = self.reparameterize(mean, logvar)
        z_projected = self.project(mean)

        # reconstruct x from z
        x_reconstructed = self.decoder(z_projected)
        return x_reconstructed, mean, logvar
    
    # ======
    # Layers
    # ======

    def downsampling_conv(self, channel_size, kernel_num):
        return nn.Sequential(
            nn.Conv2d(
                channel_size, kernel_num,
                kernel_size=4, stride=2, padding=1,
            ),
            nn.InstanceNorm2d(kernel_num),
            nn.LeakyReLU(negative_slope=0.1),
        )
    def capacity_conv(self, channel_num, kernel_num):
        return nn.Sequential(
            nn.Conv2d(
                channel_num, kernel_num,
                kernel_size=3, stride=1, padding=1,
            ),
            nn.LeakyReLU(negative_slope=0.1),
        )

    def upsampling_conv(self, channel_num, kernel_num):
        return nn.Sequential(
            nn.ConvTranspose2d(
                channel_num, kernel_num,
                kernel_size=4, stride=2, padding=1,
            ),
            nn.InstanceNorm2d(kernel_num),
            nn.LeakyReLU(negative_slope=0.1),
        )

    def _linear(self, in_size, out_size, relu=True):
        return nn.Sequential(
            nn.Linear(in_size, out_size),
            nn.ReLU(),
        ) if relu else nn.Linear(in_size, out_size)

model = VAE_Cifar10().to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)

In [15]:
# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(recon_x, x, mu, logvar):
    MSE = (recon_x - x.view(-1,  dataset_config['channels'],dataset_config['size'],dataset_config['size'])) ** 2
    MSE = MSE.sum(dim=(-1,-2,-3))

    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=-1)

    return MSE + 0.1 * KLD

In [16]:
def train(epoch):
    model.train()
    train_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_function(recon_batch, data, mu, logvar).mean(dim=0)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        if batch_idx % 2 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                loss.item() / len(data)))

    print('====> Epoch: {} Average loss: {:.4f}'.format(
          epoch, train_loss / len(train_loader.dataset)))

In [17]:
def test(epoch):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for i, (data, _) in enumerate(test_loader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_function(recon_batch, data, mu, logvar).mean(dim=0).item()
            if i == 0:
                n = min(data.size(0), 8)
                comparison = torch.cat([data[:n],
                                      recon_batch[:n].view(n,  dataset_config['channels'], dataset_config['size'], dataset_config['size'])[:n]])
                if not os.path.exists("results/"):
                    os.mkdir("results")
                save_image(comparison.cpu(),
                         'results/reconstruction_' + str(epoch) + '.png', nrow=n)
        for i, (data, _) in enumerate(train_loader):
            data = data.to(device)
            recon_batch, mu, logvar = model(data)
            test_loss += loss_function(recon_batch, data, mu, logvar).mean(dim=0).item()
            if i == 0:
                n = min(data.size(0), 8)
                comparison = torch.cat([data[:n],
                                      recon_batch[:n].view(n,  dataset_config['channels'], dataset_config['size'], dataset_config['size'])[:n]])
                if not os.path.exists("overfit_results/"):
                    os.mkdir("overfit_results")
                save_image(comparison.cpu(),
                         'overfit_results/reconstruction_' + str(epoch) + '.png', nrow=n)

    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))

In [18]:
for epoch in range(1, 100 + 1):
        train(epoch)
        test(epoch)

====> Epoch: 1 Average loss: 29.9374
====> Test set loss: 274.3618
====> Epoch: 2 Average loss: 24.9253
====> Test set loss: 240.4846
====> Epoch: 3 Average loss: 21.9280
====> Test set loss: 228.0100
====> Epoch: 4 Average loss: 20.7099
====> Test set loss: 213.4932
====> Epoch: 5 Average loss: 20.2377
====> Test set loss: 207.8688
====> Epoch: 6 Average loss: 19.4825
====> Test set loss: 201.8396
====> Epoch: 7 Average loss: 19.2276
====> Test set loss: 205.5978
====> Epoch: 8 Average loss: 19.7443
====> Test set loss: 200.3232
====> Epoch: 9 Average loss: 18.8182
====> Test set loss: 201.6387
====> Epoch: 10 Average loss: 19.0804
====> Test set loss: 197.6224
====> Epoch: 11 Average loss: 18.5795
====> Test set loss: 196.4426
====> Epoch: 12 Average loss: 18.4429
====> Test set loss: 193.9426
====> Epoch: 13 Average loss: 18.8231
====> Test set loss: 189.4636
====> Epoch: 14 Average loss: 18.3424
====> Test set loss: 184.4448
====> Epoch: 15 Average loss: 17.4732
====> Test set loss

====> Test set loss: 81.8814
====> Epoch: 53 Average loss: 5.8887
====> Test set loss: 79.5196
====> Epoch: 54 Average loss: 5.7154
====> Test set loss: 76.2224
====> Epoch: 55 Average loss: 5.4137
====> Test set loss: 77.3674
====> Epoch: 56 Average loss: 5.3630
====> Test set loss: 73.6676
====> Epoch: 57 Average loss: 5.0294
====> Test set loss: 71.8292
====> Epoch: 58 Average loss: 4.8439
====> Test set loss: 70.2018
====> Epoch: 59 Average loss: 4.6174
====> Test set loss: 69.0239
====> Epoch: 60 Average loss: 4.5751
====> Test set loss: 70.5519
====> Epoch: 61 Average loss: 4.2451
====> Test set loss: 67.4040
====> Epoch: 62 Average loss: 4.1680
====> Test set loss: 64.7567
====> Epoch: 63 Average loss: 4.0435
====> Test set loss: 64.1390
====> Epoch: 64 Average loss: 3.9865
====> Test set loss: 64.1597
====> Epoch: 65 Average loss: 3.9010
====> Test set loss: 66.4395
====> Epoch: 66 Average loss: 3.8795
====> Test set loss: 64.6958
====> Epoch: 67 Average loss: 3.8214
====> Test

In [19]:
1048576/128/128

64.0

In [20]:
model.train()
train_loss = 0
for batch_idx,( data, _) in enumerate(train_loader):
    data = data.to(device)
    data = data.reshape([-1,3, 128*128])
    optimizer.zero_grad()
    recon_batch, mu, logvar = model(data)
    loss = loss_function(recon_batch, data, mu, logvar)
    loss.backward()
    train_loss += loss.item()
    optimizer.step()
    if batch_idx % 100 == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader),
            loss.item() / len(data)))

RuntimeError: Given groups=1, weight of size [16, 3, 3, 3], expected input[1, 8, 3, 16384] to have 3 channels, but got 8 channels instead