# Imports

In [1]:
import os
from os.path import join
import random
import math
import torch
import torch.nn as nn
import time
import numpy as np
import h5py
import imageio
import shutil
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

# Google Drive Setup

In [2]:
# Colab-only: mount Google Drive at /content/drive so the project files are reachable.
# force_remount=True re-mounts even if a mount from an earlier run is still present.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
# Project root on the mounted Drive; later cells build every path
# (checkpoints, caption files, generated images) relative to it.
datadir = "/content/drive/My Drive/CS444/Final_Project"
# Make the project root the working directory and echo it as a sanity check.
os.chdir(datadir)
!pwd

/content/drive/My Drive/CS444/Final_Project


# Generation Parameters

In [4]:
# Options shared by the Generator definition and the generation cells.
opt = {
    'large': 0,           # set to 1 to add the extra residual blocks to the generator
    'checkpoint_dir': datadir + '/checkpoints',  # directory holding the generator checkpoints
    'fine_size': 64,      # side length (pixels) of the generated images
    # NOTE: 'txt_size' (dimension of the raw text embeddings) is deliberately absent here.
    # It depends on the sentence encoder (384 or 768 in this notebook) and is assigned
    # per model right before each Generator is built.
    'nc': 3,              # image channels (3 for RGB)
    'nt': 256,            # dimensions of text features
    'nz': 100,            # dimensions for noise
    'ngf': 128,           # number of generator filters in first conv layer
    'n_images': 4,        # number of images to produce per caption
    'manual_seed': 7,     # manual seed for reproducible results
}

# Setup

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed every RNG the notebook draws from so a fresh "Run All" reproduces its outputs.
# NumPy is seeded as well because the generation noise is sampled with np.random.
random.seed(opt['manual_seed'])
np.random.seed(opt['manual_seed'])
torch.manual_seed(opt['manual_seed'])
if device == 'cuda':
    torch.cuda.manual_seed_all(opt['manual_seed'])

# Set default tensor type if desired (default is already float32 for CPU)
torch.set_default_dtype(torch.float32)

# Generator Definition

In [6]:
# NOTE: removed all inplace=True tags due to runtime errors

# reimplementation of ConcatTable & CAddTable block in original generator code
# applies conv branch and elementwise adds the identity
class ConcatAddBlock(nn.Module):
    """Residual block: the input is added elementwise to a three-conv branch.

    The branch is 1x1 conv -> BN -> ReLU -> 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN,
    all with stride 1 and no bias, so the spatial size is unchanged.
    ``out_channels`` must equal ``in_channels`` for the addition in ``forward``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # (input channels, output channels, kernel size, padding) of each convolution
        conv_specs = [
            (in_channels, hidden_channels, 1, 0),
            (hidden_channels, hidden_channels, 3, 1),
            (hidden_channels, out_channels, 3, 1),
        ]
        last = len(conv_specs) - 1
        layers = []
        for idx, (c_in, c_out, k, pad) in enumerate(conv_specs):
            layers.append(nn.Conv2d(c_in, c_out, kernel_size=k, stride=1, padding=pad, bias=False))
            layers.append(nn.BatchNorm2d(c_out))
            # no activation after the final BN: the sum is activated by the caller
            if idx != last:
                layers.append(nn.ReLU())
        self.conv_branch = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` plus the output of the conv branch applied to ``x``."""
        branch_out = self.conv_branch(x)
        return x + branch_out

# generator definition
# as close to 1:1 reimplementation as possible
class Generator(nn.Module):
    """Text-conditioned generator mapping (noise, text embedding) to an image.

    The text embedding is projected to ``nt`` features, concatenated with the noise
    vector and upsampled 1x1 -> 4x4 -> 8x8 -> 16x16 -> 32x32 -> 64x64 by transposed
    convolutions, with residual blocks at the 4x4 and 8x8 stages.

    Reads ``nz``, ``nt``, ``txt_size``, ``ngf``, ``nc`` and ``large`` from ``opt``;
    ``large == 1`` adds a second residual block at each residual stage.
    """

    def __init__(self, opt):
        super().__init__()
        self.nz = opt['nz']
        self.nt = opt['nt']
        self.txt_size = opt['txt_size']
        self.ngf = opt['ngf']
        self.nc = opt['nc']
        self.large = opt['large']

        ngf = self.ngf
        extra_blocks = self.large == 1

        def upsample(c_in, c_out):
            # transposed conv that doubles the spatial resolution
            return nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1, bias=False)

        # projection of the raw text embedding down to nt features
        self.fcG = nn.Sequential(
            nn.Linear(self.txt_size, self.nt),
            nn.LeakyReLU(0.2),
        )

        # 1x1 -> 4x4, (ngf*8) channels
        self.deconv1 = nn.ConvTranspose2d(
            self.nz + self.nt, ngf * 8, kernel_size=4, stride=1, padding=0, bias=False
        )
        self.bn1 = nn.BatchNorm2d(ngf * 8)
        self.resblock1 = ConcatAddBlock(ngf * 8, ngf * 2, ngf * 8)
        if extra_blocks:
            self.resblock1b = ConcatAddBlock(ngf * 8, ngf * 2, ngf * 8)

        # 4x4 -> 8x8, (ngf*4) channels
        self.deconv2 = upsample(ngf * 8, ngf * 4)
        self.bn2 = nn.BatchNorm2d(ngf * 4)
        self.resblock2 = ConcatAddBlock(ngf * 4, ngf, ngf * 4)
        if extra_blocks:
            self.resblock2b = ConcatAddBlock(ngf * 4, ngf, ngf * 4)

        # 8x8 -> 16x16, (ngf*2) channels
        self.deconv3 = upsample(ngf * 4, ngf * 2)
        self.bn3 = nn.BatchNorm2d(ngf * 2)

        # 16x16 -> 32x32, ngf channels
        self.deconv4 = upsample(ngf * 2, ngf)
        self.bn4 = nn.BatchNorm2d(ngf)

        # 32x32 -> 64x64, nc channels
        self.deconv5 = upsample(ngf, self.nc)
        self.tanh = nn.Tanh()

    def forward(self, noise, txt):
        """Generate a batch of images.

        noise: (batch, nz, 1, 1)
        txt: (batch, txt_size)
        Returns the tanh-activated output of the last transposed convolution.
        """
        n = noise.size(0)

        # project the text and give it the same (batch, C, 1, 1) layout as the noise
        text_feat = self.fcG(txt).view(n, self.nt, 1, 1)
        latent = torch.cat([noise, text_feat], dim=1)  # (batch, nz + nt, 1, 1)

        # 4x4 stage: the ReLU comes after the residual sum, not before it
        h = self.bn1(self.deconv1(latent))
        h = self.resblock1(h)
        if self.large == 1:
            h = self.resblock1b(h)
        h = F.relu(h)

        # 8x8 stage
        h = self.bn2(self.deconv2(h))
        h = self.resblock2(h)
        if self.large == 1:
            h = self.resblock2b(h)
        h = F.relu(h)

        # plain upsampling stages
        h = F.relu(self.bn3(self.deconv3(h)))
        h = F.relu(self.bn4(self.deconv4(h)))

        return self.tanh(self.deconv5(h))

# Test Caption Encoding

In [7]:
def save_caption_vectors(encoder_path, output_path):
    """Encode the captions in ``flower_captions.txt`` and save them to an HDF5 file.

    Args:
        encoder_path: model name or local directory handed to ``SentenceTransformer``.
        output_path: name of the HDF5 file to write, relative to ``datadir``. The
            embeddings are stored in a dataset called ``'vectors'``, one row per
            caption in file order.
    """
    # open file with plain text captions
    caption_file = join(datadir, 'flower_captions.txt')

    # explicit encoding so the captions do not depend on the runtime's locale
    with open(caption_file, encoding='utf-8') as f:
        # splitlines() also handles '\r\n' endings; blank / whitespace-only lines are skipped
        captions = [line for line in f.read().splitlines() if line.strip()]
    print(captions)

    # encode captions
    model = SentenceTransformer(encoder_path)
    caption_vectors = model.encode(captions)

    # save to drive, starting from a clean file
    output_file = join(datadir, output_path)
    if os.path.isfile(output_file):
        os.remove(output_file)
    # the context manager closes the file even if writing the dataset fails
    with h5py.File(output_file, 'w') as h:
        h.create_dataset('vectors', data=caption_vectors)

# one (encoder, output file) pair per text encoder: fine-tuned mpnet, base MiniLM, base mpnet
encodings = [
    (datadir + '/text_encoders/finetuned10_multi-qa-mpnet-base-dot-v1', 'finetuned_captions.hdf5'),
    ('all-MiniLM-L6-v2', 'base_captions.hdf5'),
    ('multi-qa-mpnet-base-dot-v1', 'updated_captions.hdf5'),
]

# write one HDF5 file of caption embeddings per encoder
for encoder_path, output_name in encodings:
    save_caption_vectors(encoder_path, output_name)

['this flower is white and pink in color, with petals that have veins', 'these flowers have petals that start off white in color and end in a dark purple towards the tips', 'bright droopy yellow petals with burgundy streaks, and a yellow stigma', 'a flower with long pink petals and raised orange stamen', 'the flower shown has a blue petals with a white pistil in the center']
['this flower is white and pink in color, with petals that have veins', 'these flowers have petals that start off white in color and end in a dark purple towards the tips', 'bright droopy yellow petals with burgundy streaks, and a yellow stigma', 'a flower with long pink petals and raised orange stamen', 'the flower shown has a blue petals with a white pistil in the center']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

['this flower is white and pink in color, with petals that have veins', 'these flowers have petals that start off white in color and end in a dark purple towards the tips', 'bright droopy yellow petals with burgundy streaks, and a yellow stigma', 'a flower with long pink petals and raised orange stamen', 'the flower shown has a blue petals with a white pistil in the center']


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Generation

In [8]:
# generation code loosely inspired by: https://github.com/paarthneekhara/text-to-image/blob/master/generate_images.py

# generate samples based on captions
def generate(model, captions, out_name, noise_mean=0.0, noise_std=1.0):
    """Generate images for every stored caption embedding and save them as JPEG strips.

    For each caption vector, ``opt['n_images']`` samples are generated, laid out side
    by side with a 5-pixel black spacer between them, and written to
    ``<datadir>/generations/<out_name><caption index>.jpg``.

    Args:
        model: checkpoint file name appended to ``opt['checkpoint_dir']``
            (include the leading '/').
        captions: HDF5 file name relative to ``datadir`` holding a ``'vectors'`` dataset.
        out_name: prefix of the output image file names.
        noise_mean: mean of the Gaussian noise fed to the generator.
        noise_std: standard deviation of the Gaussian noise.

    NOTE(review): this function used to sample ``np.random.normal(-1, 1)``, i.e. noise
    centred on -1, which looks like a mix-up with ``uniform(-1, 1)``. The default is now
    a standard normal; confirm against the noise distribution used in training, and pass
    ``noise_mean=-1`` to reproduce the earlier samples.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # load generator; map_location lets a checkpoint saved on GPU load on a CPU runtime
    netG = Generator(opt).to(device)
    state_dict = torch.load(opt['checkpoint_dir'] + model, map_location=device)
    netG.load_state_dict(state_dict)
    netG.eval()

    # read the caption embeddings; the context manager closes the file on any error
    with h5py.File(join(datadir, captions), 'r') as h:
        caption_vectors = np.array(h['vectors'])

    generations_dir = join(datadir, 'generations')
    # create the output folder on first use instead of failing at imwrite
    os.makedirs(generations_dir, exist_ok=True)

    n_images = opt['n_images']
    spacer_width = 5  # pixels of black between neighbouring samples

    for cn, caption_vector in enumerate(caption_vectors):
        # one noise vector per image
        z_noise = np.random.normal(noise_mean, noise_std, [n_images, opt['nz']]).astype(np.float32)
        # repeat the caption vector for each image
        caption = np.tile(caption_vector[:opt['txt_size']], (n_images, 1)).astype(np.float32)
        # convert to tensors; noise becomes (n_images, nz, 1, 1)
        z_noise_tensor = torch.tensor(z_noise, device=device).unsqueeze(-1).unsqueeze(-1)
        caption_tensor = torch.tensor(caption, device=device)

        with torch.no_grad():
            gen_image = netG(z_noise_tensor, caption_tensor)

        # convert images from [-1, 1] to [0, 255]
        gen_image = gen_image.cpu().numpy()
        gen_image = ((gen_image + 1) * 127.5).astype(np.uint8)
        # (N, C, H, W) -> (N, H, W, C)
        gen_image = np.transpose(gen_image, (0, 2, 3, 1))

        # size the spacer from the images themselves so it always matches them
        height, channels = gen_image.shape[1], gen_image.shape[3]
        spacer = np.zeros((height, spacer_width, channels), dtype=np.uint8)

        tiles = []
        for im in gen_image:
            tiles.extend([im, spacer])
        # drop the trailing spacer, then concatenate images horizontally
        combined_image = np.concatenate(tiles[:-1], axis=1)

        save_path = join(generations_dir, out_name + f'{cn}.jpg')
        imageio.imwrite(save_path, combined_image)

In [9]:
# one entry per trained generator:
# (checkpoint file, caption embeddings file, output file prefix, text embedding size)
models = [
    ('/600_net_G_classic.pth', 'base_captions.hdf5', 'classic_image_', 384),  # basic
    ('/600_net_G_old_updated.pth', 'finetuned_captions.hdf5', 'updated_old_image_', 768),  # training tricks + finetune
    ('/600_net_G_updated.pth', 'finetuned_captions.hdf5', 'updated_image_', 768),  # training tricks + finetune + noise to embeddings
    ('/600_net_G_classic_updated.pth', 'base_captions.hdf5', 'classic_updated_image_', 384),  # basic + training tricks
    ('/600_net_G_updated_nofinetune.pth', 'updated_captions.hdf5', 'updated_nofinetune_image_', 768),  # training tricks + improved encoder no finetune
]

# create samples for all models
for checkpoint_name, captions_file, out_prefix, txt_size in models:
    # the Generator takes its text-embedding width from the shared options dict
    opt['txt_size'] = txt_size
    generate(checkpoint_name, captions_file, out_prefix)
    print("generation done for model")

generation done for model
generation done for model
generation done for model
generation done for model
generation done for model
