# Installations & Imports

In [None]:
!pip install ftfy regex tqdm git+https://github.com/openai/CLIP.git

In [2]:
import torch
import clip
import numpy as np
import random
from PIL import Image
import h5py
from torchvision import transforms
import os
from os.path import join
import math
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import time
import matplotlib.pyplot as plt

# Google Drive Setup

In [3]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [4]:
datadir = "/content/drive/My Drive/CS444/Final_Project"
os.chdir(datadir)
!pwd

/content/drive/My Drive/CS444/Final_Project


# Training Parameter Setup

In [5]:
opt = {
    'large': 0,           # flag for adding extra blocks to generator
    'save_every': 100,    # save models and optimizers during training every X epochs
    'print_every': 15,    # print statistics every X batch per epoch
    'cls_weight': 0.5,    # weight for wrong image/text pairs
    'checkpoint_dir': datadir + '/checkpoints', # where to save models and optimizers
    'captions_file': datadir + '/base_encoded_captions.hdf5', # where dataset captions were stored in DataLoader
    'cache_path': datadir + '/image_cache.pt', # where dataset images were stored in DataLoader
    'fine_size': 64,      # size of cached images saved in DataLoader
    'batch_size': 64,     # number of items per batch
    'txt_size': 384,      # dimensions of text embeddings (based on encoder used)
    'nc': 3,              # image channels (3 for RGB)
    'nt': 256,            # dimensions of text features
    'nz': 100,            # dimensions for noise
    'ngf': 128,           # number of generator filters in first conv layer
    'ndf': 64,            # number of discriminator filters in first conv layer
    'num_workers': 2,     # workers for data loader
    'epochs': 600,        # number of training epochs
    'lr': 0.0002,         # init learning rate for Adam optimizer
    'lr_decay': 0.5,      # learning rate decay factor
    'decay_every': 100,   # learning rate decay frequency
    'beta1': 0.5,         # momentum term of Adam
    'train_amt': 0.75,    # percent of dataset for training (train/test split)
    'display': 1,         # flag whether to display sample every epoch while training (0 = False)
    'noise': 'normal',    # noise type: "uniform" or "normal"
    'init_g': datadir + '/checkpoints/600_net_G_classic.pth',         # path to saved generator
    'init_d': '',         # path to saved discriminator
    'init_g_opt': '',     # path to saved generator optimizer
    'init_d_opt': '',     # path to saved discriminator optimizer
    'resume': 0,          # flag whether to resume training from saved models
    'manual_seed': 7,     # manual seed for reproducible results
}

# Initialization

In [6]:
# set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# set seed
random.seed(opt['manual_seed'])
torch.manual_seed(opt['manual_seed'])
if device == 'cuda':
    torch.cuda.manual_seed_all(opt['manual_seed'])

# set default type
torch.set_default_dtype(torch.float32)

# Generator Definition

In [7]:
# NOTE: removed all inplace=True tags due to runtime errors

# reimplementation of ConcatTable & CAddTable block in original generator code
# applies conv branch and elementwise adds the identity
class ConcatAddBlock(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(ConcatAddBlock, self).__init__()
        self.conv_branch = nn.Sequential(
            nn.Conv2d(in_channels, hidden_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(hidden_channels),
            nn.ReLU(),
            nn.Conv2d(hidden_channels, hidden_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(hidden_channels),
            nn.ReLU(),
            nn.Conv2d(hidden_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels)
        )

    def forward(self, x):
        return x + self.conv_branch(x)

# generator definition
# as close to 1:1 reimplementation as possible
class Generator(nn.Module):
    def __init__(self, opt):
        super(Generator, self).__init__()
        self.nz = opt['nz']
        self.nt = opt['nt']
        self.txt_size = opt['txt_size']
        self.ngf = opt['ngf']
        self.nc = opt['nc']
        self.large = opt['large']

        # transformation for text embedding
        self.fcG = nn.Sequential(
            nn.Linear(self.txt_size, self.nt),
            nn.LeakyReLU(0.2)
        )

        self.deconv1 = nn.ConvTranspose2d(self.nz + self.nt, self.ngf * 8, kernel_size=4, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(self.ngf * 8)

        # state size: (ngf*8) x 4 x 4
        self.resblock1 = ConcatAddBlock(self.ngf * 8, self.ngf * 2, self.ngf * 8)
        if self.large == 1:
            self.resblock1b = ConcatAddBlock(self.ngf * 8, self.ngf * 2, self.ngf * 8)

        # upsample from 4x4 to 8x8
        self.deconv2 = nn.ConvTranspose2d(self.ngf * 8, self.ngf * 4, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(self.ngf * 4)

        # state size: (ngf*4) x 8 x 8
        self.resblock2 = ConcatAddBlock(self.ngf * 4, self.ngf, self.ngf * 4)
        if self.large == 1:
            self.resblock2b = ConcatAddBlock(self.ngf * 4, self.ngf, self.ngf * 4)

        # upsample from 8x8 to 16x16
        self.deconv3 = nn.ConvTranspose2d(self.ngf * 4, self.ngf * 2, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.ngf * 2)

        # upsample from 16x16 to 32x32
        self.deconv4 = nn.ConvTranspose2d(self.ngf * 2, self.ngf, kernel_size=4, stride=2, padding=1, bias=False)
        self.bn4 = nn.BatchNorm2d(self.ngf)

        # upsample from 32x32 to 64x64
        self.deconv5 = nn.ConvTranspose2d(self.ngf, self.nc, kernel_size=4, stride=2, padding=1, bias=False)
        self.tanh = nn.Tanh()

    def forward(self, noise, txt):
        batch_size = noise.size(0)
        # process text through fcG
        txt_out = self.fcG(txt)
        # reshape to (batch, nt, 1, 1)
        txt_out = txt_out.view(batch_size, self.nt, 1, 1)

        # concatenate noise and processed text
        # shape: (batch, nz + nt, 1, 1)
        input_vec = torch.cat([noise, txt_out], dim=1)

        x = self.deconv1(input_vec)
        x = self.bn1(x)

        x = self.resblock1(x)
        if self.large == 1:
            x = self.resblock1b(x)
        x = F.relu(x)

        x = self.deconv2(x)
        x = self.bn2(x)

        x = self.resblock2(x)
        if self.large == 1:
            x = self.resblock2b(x)
        x = F.relu(x)

        x = self.deconv3(x)
        x = self.bn3(x)
        x = F.relu(x)

        x = self.deconv4(x)
        x = self.bn4(x)
        x = F.relu(x)

        x = self.deconv5(x)
        output = self.tanh(x)
        return output

# CLIP Installation

In [8]:
# Load CLIP
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# Transformation for generated images
clip_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                         (0.26862954, 0.26130258, 0.27577711)),
])

100%|███████████████████████████████████████| 338M/338M [01:20<00:00, 4.39MiB/s]


# Test Data Split

In [9]:
# Load your test split
def get_test_split():
    h = h5py.File(opt['captions_file'])
    flower_captions = {}
    for key, ds in h.items():
        flower_captions[key] = np.array(ds)
    images = sorted(flower_captions)
    images_train = int(len(images) * opt['train_amt'])
    images_train += opt['batch_size'] - (images_train % opt['batch_size']) if images_train % opt['batch_size'] != 0 else 0
    training_images = images[0:images_train]
    test_images = images[images_train:]
    image_names = set(test_images)
    print(len(test_images))
    print(test_images[0])
    caption_dir = join(datadir, 'text_c10')
    class_dirs = []
    # 102 class subdirectories (class_00001, ..., class_00102).
    for i in range(1, 103):
        class_dir_name = 'class_%.5d' % (i)
        class_dirs.append(join(caption_dir, class_dir_name))

    image_captions = {}

    # read all .txt caption files for each folder
    for class_dir in class_dirs:
        caption_files = [f for f in os.listdir(class_dir) if 'txt' in f]
        for cap_file in caption_files:
            if cap_file[0:11] + '.jpg' not in image_names:
                continue
            with open(join(class_dir, cap_file)) as f:
                captions = f.read().split('\n')
            # reconstruct image filename from caption file name
            img_file = cap_file[0:11] + '.jpg'
            # add 5 captions for each image
            image_captions[img_file] = [cap for cap in captions if len(cap) > 0][0:5]
    print(len(image_captions))
    return test_images, image_captions

test_images, flower_captions = get_test_split()

2045
image_06145.jpg
2045


# Scoring

In [11]:
# with classic
# Load netG
netG = Generator(opt).to(device)
netG.load_state_dict(torch.load(opt['init_g']))
netG.eval()

clip_scores = []

for img_id in test_images:
    # Randomly select 1 of the 5 captions
    all_captions = flower_captions[img_id]
    selected_idx = random.randint(0, 4)
    selected_caption_text = str(all_captions[selected_idx])

    # Load corresponding sentence embedding
    h = h5py.File(opt['captions_file'])
    caption_vector = np.array(h[img_id])[selected_idx][:opt['txt_size']]
    h.close()

    # Generate noise + embed caption
    z = np.random.normal(-1, 1, (1, opt['nz'], 1, 1)).astype(np.float32)
    z_tensor = torch.tensor(z, device=device)
    caption_tensor = torch.tensor(caption_vector, dtype=torch.float32, device=device).unsqueeze(0)

    with torch.no_grad():
        fake_img = netG(z_tensor, caption_tensor)

    # Post-process image: (1, 3, H, W) → (H, W, 3) in [0, 255]
    fake_img_np = fake_img[0].cpu().numpy()
    fake_img_np = ((fake_img_np + 1) * 127.5).astype(np.uint8)
    fake_img_np = np.transpose(fake_img_np, (1, 2, 0))

    # Convert and preprocess for CLIP
    img_clip = clip_transform(fake_img_np).unsqueeze(0).to(device)
    text_clip = clip.tokenize([selected_caption_text]).to(device)

    # Get CLIP embeddings
    with torch.no_grad():
        img_feat = clip_model.encode_image(img_clip)
        txt_feat = clip_model.encode_text(text_clip)

    # Normalize and compute cosine similarity
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
    similarity = (img_feat @ txt_feat.T).item()
    clip_scores.append(similarity)

# Final result
average_clip_score = np.mean(clip_scores)
print(f"Average CLIP cosine similarity over test set: {average_clip_score:.4f}")

Average CLIP cosine similarity over test set: 0.2874


In [18]:
# with classic_updated
# Load netG
netG = Generator(opt).to(device)
netG.load_state_dict(torch.load(opt['init_g']))
netG.eval()

clip_scores = []

for img_id in test_images:
    # Randomly select 1 of the 5 captions
    all_captions = flower_captions[img_id]
    selected_idx = random.randint(0, 4)
    selected_caption_text = str(all_captions[selected_idx])

    # Load corresponding sentence embedding
    h = h5py.File(opt['captions_file'])
    caption_vector = np.array(h[img_id])[selected_idx][:opt['txt_size']]
    h.close()

    # Generate noise + embed caption
    z = np.random.normal(-1, 1, (1, opt['nz'], 1, 1)).astype(np.float32)
    z_tensor = torch.tensor(z, device=device)
    caption_tensor = torch.tensor(caption_vector, dtype=torch.float32, device=device).unsqueeze(0)

    with torch.no_grad():
        fake_img = netG(z_tensor, caption_tensor)

    # Post-process image: (1, 3, H, W) → (H, W, 3) in [0, 255]
    fake_img_np = fake_img[0].cpu().numpy()
    fake_img_np = ((fake_img_np + 1) * 127.5).astype(np.uint8)
    fake_img_np = np.transpose(fake_img_np, (1, 2, 0))

    # Convert and preprocess for CLIP
    img_clip = clip_transform(fake_img_np).unsqueeze(0).to(device)
    text_clip = clip.tokenize([selected_caption_text]).to(device)

    # Get CLIP embeddings
    with torch.no_grad():
        img_feat = clip_model.encode_image(img_clip)
        txt_feat = clip_model.encode_text(text_clip)

    # Normalize and compute cosine similarity
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
    similarity = (img_feat @ txt_feat.T).item()
    clip_scores.append(similarity)

# Final result
average_clip_score = np.mean(clip_scores)
print(f"Average CLIP cosine similarity over test set: {average_clip_score:.4f}")

Average CLIP cosine similarity over test set: 0.2882


In [15]:
# with updated
# Load netG
netG = Generator(opt).to(device)
netG.load_state_dict(torch.load(opt['init_g']))
netG.eval()

clip_scores = []

for img_id in test_images:
    # Randomly select 1 of the 5 captions
    all_captions = flower_captions[img_id]
    selected_idx = random.randint(0, 4)
    selected_caption_text = str(all_captions[selected_idx])

    # Load corresponding sentence embedding
    h = h5py.File(opt['captions_file'])
    caption_vector = np.array(h[img_id])[selected_idx][:opt['txt_size']]
    h.close()

    # Generate noise + embed caption
    z = np.random.normal(-1, 1, (1, opt['nz'], 1, 1)).astype(np.float32)
    z_tensor = torch.tensor(z, device=device)
    caption_tensor = torch.tensor(caption_vector, dtype=torch.float32, device=device).unsqueeze(0)

    with torch.no_grad():
        fake_img = netG(z_tensor, caption_tensor)

    # Post-process image: (1, 3, H, W) → (H, W, 3) in [0, 255]
    fake_img_np = fake_img[0].cpu().numpy()
    fake_img_np = ((fake_img_np + 1) * 127.5).astype(np.uint8)
    fake_img_np = np.transpose(fake_img_np, (1, 2, 0))

    # Convert and preprocess for CLIP
    img_clip = clip_transform(fake_img_np).unsqueeze(0).to(device)
    text_clip = clip.tokenize([selected_caption_text]).to(device)

    # Get CLIP embeddings
    with torch.no_grad():
        img_feat = clip_model.encode_image(img_clip)
        txt_feat = clip_model.encode_text(text_clip)

    # Normalize and compute cosine similarity
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
    similarity = (img_feat @ txt_feat.T).item()
    clip_scores.append(similarity)

# Final result
average_clip_score = np.mean(clip_scores)
print(f"Average CLIP cosine similarity over test set: {average_clip_score:.4f}")

Average CLIP cosine similarity over test set: 0.2869
