In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torchvision.utils import save_image
# from torch.utils.tensorboard import SummaryWriter
from tensorboardX import SummaryWriter
print(os.getcwd())
torch.__version__
# "\Colab Notebooks\dataset\X_modify.csv"

C:\Users\DALab\Documents\GitHub\pytorch_learning\Variational AutoEncoder


'1.11.0'

In [2]:
CUDA = torch.cuda.is_available()
# torch.cuda.set_device(1) # set pytorch run
device = torch.device('cuda' if CUDA else 'cpu')
device

device(type='cpu')

### SummaryWriter

In [24]:
import datetime as dt
from tensorboardX import SummaryWriter

# define the path that store log information
LOG_DIR = './logs/' + dt.datetime.now().strftime("%Y%m%d-%H%M%S")

writer = SummaryWriter(LOG_DIR)

In [25]:
# Hyperparameter setting
image_size = 28
# h_dim = 400
# z_dim = 20
num_epochs = 3
batch_size = 128
learning_rate = 1e-3

In [26]:
dataset = torchvision.datasets.MNIST(root='./dataset/minist',
                    train=True,
                    transform=transforms.ToTensor(),
                    download=True)

data_loader = torch.utils.data.DataLoader(dataset=dataset,
                      batch_size=batch_size, 
                      shuffle=True)

In [27]:
inputs, classes = next(iter(data_loader))   
print(inputs.shape)

torch.Size([128, 1, 28, 28])


In [28]:
grid = torchvision.utils.make_grid(inputs)
writer.add_image("images", grid)
writer.close()

### Model Construction

In [29]:
from torch.distributions import Normal


class Encoder(nn.Module):
    def __init__(self, latent_dim):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1) # (in_channels, out_channels, kernel_size)
        self.batchnorm = nn.BatchNorm2d(8)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=2, padding=0) 
        self.batchnorm2 = nn.BatchNorm2d(16)
        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=0)
        self.linear1 = nn.Linear(32*6*6, 128)
        self.linear2 = nn.Linear(128, latent_dim) # for mean 
        self.linear3 = nn.Linear(128, latent_dim) # for variance
    
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.batchnorm(x))
        x = self.conv2(x)
        x = F.relu(self.batchnorm2(x))
        x = F.relu(self.conv3(x))
        # print('final convolution: ', x.shape)
        x = torch.flatten(x, start_dim=1) 
        # print('after flatten: ', x.shape)
        x = F.relu(self.linear1(x))
        # print(x.shape)
        mu = self.linear2(x)
        # print(mu.shape)
        log_var = self.linear3(x)
        # print(mu)
        return mu, log_var


class Decoder(nn.Module):
    def __init__(self, latent_dim):
        super(Decoder, self).__init__()
        self.linear4 = nn.Linear(latent_dim, 128)
        self.linear5 = nn.Linear(128, 32*6*6)

        self.unflatten = nn.Unflatten(dim=1, unflattened_size=torch.Size([32, 6, 6])) # torch.Size(3*3*32)   (32, 6, 6)
        self.deconvolution1 = nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, output_padding=0) # padding versus output_padding
        self.batchnorm3 = nn.BatchNorm2d(16)
        self.deconvolution2 = nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, output_padding=1)
        self.batchnorm4 = nn.BatchNorm2d(8)
        self.deconvolution3 = nn.ConvTranspose2d(8, 1, kernel_size=1, stride=1, output_padding=0)

    def forward(self, x):
        # linear layer
        x = F.relu(self.linear4(x))
        x = F.relu(self.linear5(x))
        # print('before unflatten: ', x.shape)
        # x = self.unflatten(x)
        x = x.view(-1, 32, 6, 6)
        # print('unflatten: ', x.shape)
        # deconvolution layer
        x = self.deconvolution1(x)
        x = F.relu(self.batchnorm3(x))
        x = self.deconvolution2(x)
        x = F.relu(self.batchnorm4(x))
        x = self.deconvolution3(x)
        return x


class VAE(nn.Module): # for image input data
    def __init__(self, latent_dim):
        super(VAE, self).__init__()
        self.encoder = Encoder(latent_dim)
        # check how to use distribution on the GPU device
        self.Normal_dist = torch.distributions.Normal(0, 1)      # for reparameterization     
        # self.Normal_dist.loc = self.Normal_dist.loc.cuda()     # if running on the GPU, uncomment these two lines
        # self.Normal_dist.scale = self.Normal_dist.scale.cuda()   
        self.decoder = Decoder(latent_dim)
        # compute KL divergence or other metrics
        self.loss = 0

    # pathwise derivative estimator is commonly seen in the reparameterization trick (also used a lot in reinforcement learning)
    def reparameterize(self, mu, log_var):
        # compute sample z as the latent vector
        std = torch.exp(log_var/2)
        # z = mu + std * self.Normal_dist(mu.shape)
        z = mu + std * self.Normal_dist.sample(mu.shape)
        # print('z type: ', z.is_cuda)
        return z


    def forward(self, x):
        mu, log_var = self.encoder(x)
        z = self.reparameterize(mu, log_var)

        output = self.decoder(z)
        # (optional) non-linear mapping for the last layer
        output = F.relu(output)
        return output, mu, log_var
        

In [30]:
model = VAE(latent_dim=40)#.to(device)

In [31]:
print(model)

VAE(
  (encoder): Encoder(
    (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (batchnorm): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2))
    (batchnorm2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2))
    (linear1): Linear(in_features=1152, out_features=128, bias=True)
    (linear2): Linear(in_features=128, out_features=40, bias=True)
    (linear3): Linear(in_features=128, out_features=40, bias=True)
  )
  (decoder): Decoder(
    (linear4): Linear(in_features=40, out_features=128, bias=True)
    (linear5): Linear(in_features=128, out_features=1152, bias=True)
    (unflatten): Unflatten(dim=1, unflattened_size=torch.Size([32, 6, 6]))
    (deconvolution1): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(2, 2))
    (batchnorm3): BatchNorm2d(16, eps=1e-05, momentum=0.

In [32]:
from torchsummary import summary
input_size = (1, 28, 28)
if CUDA:
    summary(model.cuda(), input_size, batch_size=1) #, device='cuda'
else:
    summary(model, input_size, batch_size=1)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1             [1, 8, 28, 28]              80
       BatchNorm2d-2             [1, 8, 28, 28]              16
            Conv2d-3            [1, 16, 13, 13]           1,168
       BatchNorm2d-4            [1, 16, 13, 13]              32
            Conv2d-5              [1, 32, 6, 6]           4,640
            Linear-6                   [1, 128]         147,584
            Linear-7                    [1, 40]           5,160
            Linear-8                    [1, 40]           5,160
           Encoder-9       [[-1, 40], [-1, 40]]               0
           Linear-10                   [1, 128]           5,248
           Linear-11                  [1, 1152]         148,608
  ConvTranspose2d-12            [1, 16, 13, 13]           4,624
      BatchNorm2d-13            [1, 16, 13, 13]              32
  ConvTranspose2d-14             [1, 8,

### Training

In [35]:
def train_per_epoch(train_loader=None, model=None, loss_fn=None, optimizer=None, show_pred_result=False):
    def loss_fn(input, output, mu, log_var, criterion):
        # print(f'{mu}, {log_var}')
        mse = criterion(input, output)
        kl_element = mu.pow(2).add_(log_var.exp()).mul_(-1).add_(1).add_(log_var)
        kl = torch.sum(kl_element).mul_(-0.5)
        # kl_divergence = 0.5*torch.sum(-1-log_var+mu**2+log_var.exp())
        # print(kl_divergence)
        # return F.binary_cross_entropy(input, output, size_average=False) + kl_divergence
        return mse + kl

    # for one epoch, multiple batches
    model.train()   # assure Dropout, BatchNormalization... layers open in training step
    
    train_loss = 0
    true_rlt = []
    pred_rlt = []

    for i, (data, label) in enumerate(train_loader):
        if CUDA:
            data, label = data.cuda(), label.cuda()

        # ============= forward ===============
        output, mu, log_var = model(data)
        # loss_fn.cuda()  # if loss function have parameter, we need to add .cuda
        loss = loss_fn(data, output, mu, log_var, criterion=nn.MSELoss())
        train_loss += loss

        # ============= backward ===============
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        writer.add_scalar('Loss/train', loss/100, epoch*len(train_loader)+i)

        if (i+1) % 100 == 0:
            print ("Epoch[{}/{}], Step [{}/{}], Reconst Loss: {:.4f}" 
                .format(epoch+1, num_epochs, i+1, len(data_loader), loss))
            
        if i % 10 == 0:
            for name, param in model.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(), i)
    return train_loss 


optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    train_per_epoch(data_loader, model, loss_fn=None, optimizer=optimizer, show_pred_result=False)

Epoch[1/3], Step [100/469], Reconst Loss: 0.0981
Epoch[1/3], Step [200/469], Reconst Loss: 0.0977
Epoch[1/3], Step [300/469], Reconst Loss: 0.0949
Epoch[1/3], Step [400/469], Reconst Loss: 0.1006
Epoch[2/3], Step [100/469], Reconst Loss: 0.0997
Epoch[2/3], Step [200/469], Reconst Loss: 0.1005
Epoch[2/3], Step [300/469], Reconst Loss: 0.1024
Epoch[2/3], Step [400/469], Reconst Loss: 0.1012
Epoch[3/3], Step [100/469], Reconst Loss: 0.1038
Epoch[3/3], Step [200/469], Reconst Loss: 0.0952
Epoch[3/3], Step [300/469], Reconst Loss: 0.0986
Epoch[3/3], Step [400/469], Reconst Loss: 0.0932


In [36]:
writer.close()

```cd <dir>```

```tensorboard --logdir logs```