In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
import torch_directml 
import torch.optim as optim
from torch.autograd import grad
from torch.utils.data import random_split
from scipy.stats import uniform

In [2]:
# Location of the font dataset: an .npz archive holding 'images' and 'labels' arrays.
# The Google-Colab path stays the default, but it can be overridden without editing
# the notebook by setting the CHARACTER_FONT_NPZ environment variable (useful when the
# notebook runs on a local machine where the Colab drive mount does not exist).
npz_file = os.environ.get('CHARACTER_FONT_NPZ', '/content/gdrive/My Drive/character_font.npz')



class NPZDataset(Dataset):
    """Dataset backed by an ``.npz`` archive containing ``images`` and ``labels`` arrays.

    Args:
        npz_file: Path (or file-like object) accepted by ``np.load``.
        transform: Optional callable applied to each image *after* the channel
            axis has been added. It may return a NumPy array or a tensor.
        filter_label: If given, keep only the samples whose label equals this value.
        num_samples: If given, keep at most this many samples (taken from the
            start, after any label filtering).

    Raises:
        ValueError: If ``images`` and ``labels`` have a different number of entries.
    """

    def __init__(self, npz_file, transform=None, filter_label=None, num_samples=None):
        # Read both arrays eagerly and close the archive right away so no file
        # handle stays open for the lifetime of the dataset.
        with np.load(npz_file) as data:
            images = data['images']
            labels = data['labels']

        if len(images) != len(labels):
            raise ValueError(
                f"images ({len(images)}) and labels ({len(labels)}) differ in length"
            )

        # Optional restriction to a single class.
        if filter_label is not None:
            label_indices = np.where(labels == filter_label)[0]
            images = images[label_indices]
            labels = labels[label_indices]

        # Optional cap on the dataset size; applied whether or not a label
        # filter is active (previously it was silently ignored without one).
        if num_samples is not None:
            images = images[:num_samples]
            labels = labels[:num_samples]

        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples kept after filtering."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` as a float32 tensor and a long tensor."""
        # Prepend a channel axis to the stored image.
        image = self.images[idx][np.newaxis, ...]
        label = self.labels[idx]

        if self.transform is not None:
            image = self.transform(image)

        # A transform may already hand back a tensor; torch.tensor(tensor) would
        # emit a copy-construct warning, so copy it the recommended way instead.
        if isinstance(image, torch.Tensor):
            image = image.detach().clone().to(dtype=torch.float32)
        else:
            image = torch.tensor(image, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.long)

        return image, label

In [3]:
class Generator(nn.Module):
    """Maps a latent vector to a single-channel 32x32 image with values in [0, 1].

    The latent vector is projected by a linear layer onto a 512-channel 2x2
    feature map, which four stride-2 transposed convolutions then upsample
    (2 -> 4 -> 8 -> 16 -> 32) while reducing the channels to 1.

    Args:
        latent_dim: Length of the latent vector fed to ``forward``.
    """

    def __init__(self, latent_dim):
        super().__init__()

        # Side length of the square feature map the latent vector is projected onto.
        self.init_size = 2

        # Per the original author's note, the reference paper does not say how z
        # becomes the first feature map, so a plain linear projection is used
        # (no activation: it only reshapes z into 512 x 2 x 2 values).
        projected_features = 512 * self.init_size * self.init_size
        self.fc = nn.Linear(latent_dim, projected_features)

        # Channel count before/after each transposed convolution
        # (fractionally strided convolution); each one doubles the spatial size.
        channel_plan = (512, 256, 128, 64, 1)
        last_stage = len(channel_plan) - 2

        layers = []
        for stage, (c_in, c_out) in enumerate(zip(channel_plan[:-1], channel_plan[1:])):
            layers.append(nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1))
            # Hidden stages use ReLU; the final stage squashes pixels with a sigmoid.
            layers.append(nn.Sigmoid() if stage == last_stage else nn.ReLU())
        self.deconv_blocks = nn.Sequential(*layers)

    def forward(self, z):
        """Generate images from latent vectors ``z`` of shape (batch, latent_dim)."""
        projected = self.fc(z)

        # Fold the flat projection into (batch, 512, init_size, init_size).
        batch = projected.size(0)
        feature_map = projected.view(batch, 512, self.init_size, self.init_size)

        return self.deconv_blocks(feature_map)


In [4]:
class Discriminator(nn.Module):
    """Critic that maps a 1x32x32 greyscale image to ONE unbounded score.

    Four stride-2 convolutions reduce the image to a 512-channel 2x2 feature
    map; a linear head then collapses it to a single scalar per image.

    Two things differ from a classic GAN discriminator on purpose:

    * There is no sigmoid on the output. The training loop in this notebook
      uses a Wasserstein loss with a gradient penalty, which needs an unbounded
      critic score rather than a probability.
    * ``forward`` returns shape ``(batch, 1)``. Previously the 2x2x512 feature
      map was flattened with ``view(-1, 1)``, which produced ``batch * 2048``
      rows instead of one prediction per image.

    The linear head fixes the expected input size to 32x32.
    """

    def __init__(self):
        super().__init__()

        # NOTE: the attribute keeps its historical name ``deconv_blocks`` so
        # existing references keep working; the layers are ordinary
        # (downsampling) convolutions.
        self.deconv_blocks = nn.Sequential(
            # Block 1: Input (32x32x1) -> Output (16x16x64)
            nn.Conv2d(1, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),

            # Block 2: Input (16x16x64) -> Output (8x8x128)
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),

            # Block 3: Input (8x8x128) -> Output (4x4x256)
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),

            # Block 4: Input (4x4x256) -> Output (2x2x512)
            nn.Conv2d(256, 512, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2),
        )

        # Collapse the flattened 2x2x512 feature map to a single critic score.
        self.head = nn.Linear(512 * 2 * 2, 1)

    def forward(self, img):
        """Score a batch of images.

        Args:
            img: Tensor of shape (batch, 1, 32, 32).

        Returns:
            Tensor of shape (batch, 1) holding one raw (unbounded) score per image.
        """
        features = self.deconv_blocks(img)
        return self.head(features.flatten(start_dim=1))


In [5]:
def displayGeneratedImage(class_index, generator, z_dim, num_classes, device=None,
                          value_range=(0.0, 1.0)):
    """
    Generates and displays an image for a given class using the generator.

    Args:
        class_index (int): Index of the character class to generate (0 to num_classes - 1).
        generator (nn.Module): Generator model; called with a (1, z_dim + num_classes) tensor.
        z_dim (int): Dimension of the style vector.
        num_classes (int): Number of character classes.
        device: Device on which the latent vector is created. When None (default)
            the device of the generator's first parameter is used, falling back
            to CPU if the generator exposes no parameters.
        value_range (tuple): (low, high) range of the generator's output that is
            mapped to [0, 255]. Defaults to (0, 1), the range of a sigmoid output
            layer such as the one used by the Generator in this notebook.

    Raises:
        ValueError: If class_index is outside [0, num_classes - 1].
    """
    # Ensure the class index is valid
    if not (0 <= class_index < num_classes):
        raise ValueError(f"Invalid class_index: {class_index}. Must be in range [0, {num_classes - 1}].")

    # Follow the generator's own device instead of assuming CUDA, so the helper
    # also works on CPU or other backends.
    if device is None:
        parameters = getattr(generator, "parameters", None)
        first_param = next(iter(parameters()), None) if callable(parameters) else None
        device = first_param.device if first_param is not None else "cpu"

    # Create the one-hot vector for the class
    z_c = torch.zeros(1, num_classes, device=device)
    z_c[0, class_index] = 1  # Set the desired class

    # Create the random style vector. It is drawn from U[0, 1) because that is
    # the distribution the training loop samples style vectors from; a
    # different distribution here would feed the generator unseen inputs.
    z_s = torch.rand(1, z_dim, device=device)

    # Concatenate the style and class vectors
    z = torch.cat((z_s, z_c), dim=1)

    # Generate the image
    with torch.no_grad():
        generated_img = generator(z).cpu().numpy()[0, 0]  # Extract the first batch and first channel

    # Rotate by 90 degrees clockwise, then rescale from value_range to [0, 255]
    # (np.interp clips anything outside value_range).
    generated_img = np.rot90(generated_img, k=-1)
    low, high = value_range
    generated_img = np.uint8(np.interp(generated_img, (low, high), (0, 255)))

    # Display the image. vmin/vmax pin the grey scale so matplotlib does not
    # stretch the contrast to the min/max of each individual image.
    plt.figure(figsize=(5, 5))
    plt.imshow(generated_img, cmap='gray', vmin=0, vmax=255)
    plt.axis("off")
    plt.title(f"Generated Image for Class {class_index}")
    plt.show()

In [10]:
z_dim = 100        # length of the random style vector z_s
num_classes = 26   # number of character classes (length of the one-hot vector z_c)
img_size = 32


# Choose the compute backend: set `amd = True` to use DirectML (AMD GPUs),
# `amd = False` to use CUDA (NVIDIA GPUs). Either choice falls back to the CPU
# when the requested backend is not available.
amd = True
if amd:
    # Ask for the DirectML device only after confirming that one exists.
    if torch_directml.is_available():
        dml = torch_directml.device()
        device = dml
    else:
        dml = None
        device = "cpu"
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


discriminator = Discriminator().to(device)
# The generator consumes the style vector concatenated with the one-hot class
# vector, so its input length is derived from the two sizes rather than hard-coded.
generator = Generator(latent_dim=z_dim + num_classes).to(device)

# Initial weights of the discriminator and generator; background reading: https://stats.stackexchange.com/questions/319323/whats-the-difference-between-variance-scaling-initializer-and-xavier-initialize

# The networks are built mostly from ReLU / LeakyReLU activations, so a
# normally distributed Kaiming initialisation is used for their weights.
def init_weights(m):
    """Initialise one module in place; intended for use with ``nn.Module.apply``.

    Linear, Conv2d and ConvTranspose2d layers get Kaiming-normal weights
    (negative slope ``a=0.2``) and, when a bias exists, a zero bias.
    Every other module type is left untouched.
    """
    learnable_layers = (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)
    if not isinstance(m, learnable_layers):
        return

    nn.init.kaiming_normal_(m.weight, a=0.2)

    if m.bias is None:
        return
    nn.init.constant_(m.bias, 0)

# Run init_weights over every sub-module of the discriminator.
discriminator.apply(init_weights)

# Same for the generator. This is the last expression of the cell, so the
# notebook displays its return value (the Generator repr shown in the output).
generator.apply(init_weights)

Generator(
  (fc): Linear(in_features=126, out_features=2048, bias=True)
  (deconv_blocks): Sequential(
    (0): ConvTranspose2d(512, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): ConvTranspose2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (5): ReLU()
    (6): ConvTranspose2d(64, 1, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (7): Sigmoid()
  )
)

In [None]:
# Training: Wasserstein loss with gradient penalty, one character class at a time.
batch_size = 1024
lr = 0.0002
lambda_gp = 10  # Weight of the gradient penalty term
n_critic = 5  # Number of discriminator updates per generator update
epochs = 2500
dataset = NPZDataset(npz_file)
dlosses = []  # average discriminator loss of every discriminator update (floats)

glosses = []  # generator loss of every generator update (floats)
gloss = 0
dloss = 0.0

train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


optimizer_D = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.99))
optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.99))


def _sample_latent(n, class_one_hot):
    """Return n latent vectors: a uniform style part followed by the class one-hot."""
    z_s = torch.rand(n, z_dim, device=device)
    return torch.cat((z_s, class_one_hot.expand(n, -1)), dim=1)


def _gradient_penalty(real, fake):
    """Penalty pushing the norm of the discriminator's input gradient towards 1.

    The gradient is evaluated at random interpolations between real and fake
    images. The interpolation is detached so it is a leaf tensor (required to
    enable requires_grad) and so the penalty only trains the discriminator.
    """
    epsilon = torch.rand(real.size(0), 1, 1, 1, device=device)
    x_hat = (epsilon * real + (1 - epsilon) * fake).detach().requires_grad_(True)

    scores = discriminator(x_hat)
    gradients = torch.autograd.grad(
        outputs=scores,
        inputs=x_hat,
        grad_outputs=torch.ones_like(scores),
        create_graph=True,  # the penalty itself must be differentiable
        retain_graph=True,
    )[0]

    grad_norm = gradients.reshape(gradients.size(0), -1).norm(2, dim=1)
    return lambda_gp * ((grad_norm - 1) ** 2).mean()


# Build one loader per class ONCE. The labels are read straight from the
# underlying arrays instead of iterating the whole dataset for every class in
# every epoch. Classes without training samples get no loader.
train_labels = np.asarray(dataset.labels)[np.asarray(train_dataset.indices)]
class_loaders = {}
for c in range(num_classes):
    positions = np.flatnonzero(train_labels == c).tolist()
    if positions:
        class_loaders[c] = DataLoader(
            torch.utils.data.Subset(train_dataset, positions),
            batch_size=batch_size,
            shuffle=True,
        )

for epoch in range(epochs):
    for c in range(num_classes):
        class_loader = class_loaders.get(c)
        if class_loader is None:
            continue  # nothing to train on for this class

        one_hot = torch.zeros(num_classes, device=device)
        one_hot[c] = 1
        n_batches = len(class_loader)

        # ---- Discriminator: n_critic updates, each over all data of the class ----
        for i in range(n_critic):
            optimizer_D.zero_grad()
            dloss = 0.0

            for real_data, _ in class_loader:
                real_data = real_data.to(device)
                current_batch = real_data.size(0)  # the last batch may be smaller

                # Fakes are produced without a graph: the discriminator update
                # must not accumulate gradients in the generator.
                with torch.no_grad():
                    fake_data = generator(_sample_latent(current_batch, one_hot))

                batch_loss = (
                    discriminator(fake_data).mean()
                    - discriminator(real_data).mean()
                    + _gradient_penalty(real_data, fake_data)
                ) / n_batches

                # Back-propagate per batch: the gradients add up to those of the
                # averaged loss, but each batch's graph is freed immediately.
                batch_loss.backward()
                dloss += batch_loss.item()

            optimizer_D.step()
            dlosses.append(dloss)

        # ---- Generator: one update ----
        optimizer_G.zero_grad()
        gloss = -discriminator(generator(_sample_latent(batch_size, one_hot))).mean()
        gloss.backward()
        optimizer_G.step()

        # Keep plain floats so the history does not hold on to autograd graphs.
        gloss = gloss.item()
        glosses.append(gloss)

    # Display a generated image of the last class every 20 epochs (Can be modified to what you prefer)
    if epoch % 20 == 0:
        displayGeneratedImage(class_index=c, generator=generator, z_dim=z_dim, num_classes=num_classes, device=device)
        print(f"Epoch [{epoch+1}/{epochs}], D_loss: {dloss}, G_loss: {gloss}")
        
            



FileNotFoundError: [Errno 2] No such file or directory: '/content/gdrive/My Drive/character_font.npz'