****BASELINE MODEL****

***Mount to Google Drive*** (adapted from CS231n)

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the later cells read data and checkpoints from paths under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Name of the project folder inside Google Drive (formatted into the sys.path entry below).
FOLDERNAME = 'CS230 Project'
# Guard against running the notebook with the folder name left unset.
assert FOLDERNAME is not None, "[!] Enter the foldername."

In [4]:
import sys
# Put the project's Drive folder on the module search path.
project_dir = '/content/drive/My Drive/' + FOLDERNAME
sys.path.append(project_dir)

***Load Video Frames***

In [5]:
import cv2
import os
import json

def _load_frames_and_captions(frames_dir):
    """Read `captions.json` from `frames_dir` and pair every other file in it with a caption.

    The caption for a file is looked up under the part of its name that
    precedes the first "-".

    Args:
        frames_dir: Directory holding the frame files and a `captions.json`.

    Returns:
        (captions, images, frame_captions): the parsed caption mapping, the
        frame file names in sorted order, and the caption for each of those
        files in the same order.

    Raises:
        KeyError: if a file's name prefix has no entry in `captions.json`.
    """
    # A context manager closes the JSON file instead of leaking the handle.
    with open(os.path.join(frames_dir, "captions.json")) as captions_file:
        captions = json.load(captions_file)

    images = []
    frame_captions = []
    # os.listdir returns names in arbitrary order; sorting keeps the later
    # [:150] / [:50] slices identical from run to run.
    for file in sorted(os.listdir(frames_dir)):
        if file == 'captions.json':
            continue
        images.append(file)
        frame_captions.append(captions[file.split("-")[0]])
    return captions, images, frame_captions


#Load train captions and videos
captions, train_images, train_captions = _load_frames_and_captions(
    "/content/drive/MyDrive/CS230 Project/data/train_frames")

#Load test captions and videos (`captions` ends up holding the test mapping, as before)
captions, test_images, test_captions = _load_frames_and_captions(
    "/content/drive/MyDrive/CS230 Project/data/test_frames")

***Original CLIP Model*** 

In [6]:
# Install the pinned torch 1.7.1+cu110 wheel (plus torchvision/torchtext/torchaudio) from the PyTorch cu110 index.
!pip3 install torch==1.7.1+cu110 torchvision  torchtext torchaudio --extra-index-url https://download.pytorch.org/whl/cu110
# Install ftfy, regex and tqdm, then CLIP itself straight from its GitHub repository.
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu110
Collecting torch==1.7.1+cu110
  Downloading https://download.pytorch.org/whl/cu110/torch-1.7.1%2Bcu110-cp37-cp37m-linux_x86_64.whl (1156.8 MB)
[K     |███████████████████████         | 834.1 MB 1.4 MB/s eta 0:03:53tcmalloc: large alloc 1147494400 bytes == 0x3a912000 @  0x7f267d4e5615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |█████████████████████████████▏  | 1055.7 MB 1.4 MB/s eta 0:01:13tcmalloc: large alloc 1434370048 bytes == 0x7ef68000 @  0x7f267d4e5615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 

In [7]:
import torch
import clip
from PIL import Image

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the ViT-B/32 CLIP model onto `device`; `preprocess` is the transform applied to PIL images before they reach the model.
model, preprocess = clip.load("ViT-B/32", device=device)

100%|███████████████████████████████████████| 338M/338M [00:03<00:00, 96.3MiB/s]


In [8]:
# Sanity check of the pretrained CLIP model: score one image against a few candidate captions.

sample_image_path = "/content/drive/MyDrive/CS230 Project/images/catimage.png"
candidate_captions = ["a diagram", "a dog", "a cat"]

image = preprocess(Image.open(sample_image_path)).unsqueeze(0).to(device)
text = clip.tokenize(candidate_captions).to(device)
# To try a dataset frame instead, swap in the two lines below:
#image = preprocess(Image.open("/content/drive/MyDrive/CS230 Project/data/test_frames/" + test_images[0])).unsqueeze(0).to(device)
#text = clip.tokenize(test_captions[0:3]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    # Softmax over the caption axis turns the image-to-text logits into per-caption probabilities.
    caption_probs = torch.softmax(logits_per_image, dim=-1)
    probs = caption_probs.cpu().numpy()

print("Label probs:", probs)

Label probs: [[0.00192544 0.00799205 0.99008256]]


***Finetuning on CLIP Model for Transfer Learning*** (adapted from https://github.com/openai/CLIP/issues/83)

In [13]:
from PIL import Image
import torch.utils.data as tor
import torch
import torch.nn as nn
import requests
from io import BytesIO

#Create custom PyTorch dataset containing images and corresponding captions

#df = {'image': ["https://www.freeiconspng.com/thumbs/cat-png/cat-png-17.png", 'https://w7.pngwing.com/pngs/174/600/png-transparent-cat-animal-lovely-cat.png', 'https://www.pngmart.com/files/1/Dog-PNG-File.png', 'https://cdn.pixabay.com/photo/2020/06/08/22/50/dog-5276317_1280.png'],
#      'caption': ['a cat', 'a cat', 'a dog', 'a dog']}

# Training subset: the first 150 frame file names and their captions, in matching order.
df = {'image': train_images[:150], 'caption': train_captions[:150]}

class internet_image_caption_dataset(tor.Dataset):
    """Dataset of (preprocessed image, caption) pairs whose images are fetched over HTTP.

    Args:
        df: Mapping with an "image" sequence of URLs and a "caption" sequence;
            item `idx` pairs `df["image"][idx]` with `df["caption"][idx]`.
    """

    def __init__(self, df):
        self.images = df["image"]
        self.caption = df["caption"]

    def __len__(self):
        """Number of samples, taken from the caption sequence."""
        return len(self.caption)

    def __getitem__(self, idx):
        """Download image `idx`, preprocess it, and return it with its caption.

        Raises:
            requests.RequestException: on a timeout, connection failure or
                HTTP error status.
        """
        # A timeout keeps a stalled server from hanging the DataLoader forever.
        response = requests.get(self.images[idx], timeout=30)
        # Fail with a clear HTTP error rather than feeding an error page to PIL.
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            images = preprocess(img) #preprocess from clip.load
        caption = self.caption[idx]
        return images,caption

class image_caption_dataset(tor.Dataset):
    """Dataset of (preprocessed image, caption) pairs read from a directory of frame files.

    Args:
        df: Mapping with an "image" sequence of file names and a "caption"
            sequence; item `idx` pairs `df["image"][idx]` with `df["caption"][idx]`.
        image_dir: Directory the file names are resolved against. Defaults to
            the training-frames folder, so existing `image_caption_dataset(df)`
            calls behave as before.
    """

    def __init__(self, df, image_dir="/content/drive/MyDrive/CS230 Project/data/train_frames/"):
        self.images = df["image"]
        self.caption = df["caption"]
        self.image_dir = image_dir

    def __len__(self):
        """Number of samples, taken from the caption sequence."""
        return len(self.caption)

    def __getitem__(self, idx):
        """Open frame `idx`, preprocess it, and return it with its caption."""
        # The context manager closes the file once preprocessing has read it.
        with Image.open(os.path.join(self.image_dir, self.images[idx])) as img:
            images = preprocess(img) #preprocess from clip.load
        caption = self.caption[idx]
        return images,caption

# Wrap the training subset in a dataset and serve it in fixed-size batches.
BATCH_SIZE = 50
dataset = image_caption_dataset(df)
train_dataloader = tor.DataLoader(dataset=dataset, batch_size=BATCH_SIZE)

In [14]:
import torch.utils.data as tor
import torch
import torch.nn as nn

#Train model

'''
BATCH_SIZE must be larger than 1
BATCH_SIZE = 2
train_dataloader = tor.DataLoader(DataSet,batch_size = BATCH_SIZE) #Define your own dataloader
'''

#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model):
    """Cast every parameter of `model`, and its gradient when one exists, to float32 in place.

    Args:
        model: Any object exposing `parameters()` (e.g. an `nn.Module`).
    """
    for p in model.parameters():
        p.data = p.data.float()
        # A parameter that received no gradient in the last backward pass has
        # `grad is None`; skip it instead of raising AttributeError.
        if p.grad is not None:
            p.grad.data = p.grad.data.float()

# Fine-tune CLIP with a symmetric image<->text cross-entropy loss.
device = "cuda:0" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training
if device == "cpu":
  model.float()
else :
  clip.model.convert_weights(model)

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) #Params used from paper

EPOCH = 5
for epoch in range(EPOCH):
  print("Starting epoch ", epoch, "..." )
  # enumerate keeps the 1-based batch counter without manual bookkeeping.
  for b, batch in enumerate(train_dataloader, start=1):
      print("Starting batch ", b)
      optimizer.zero_grad()

      # The dataset already applies `preprocess`, so the batch is a stacked image tensor plus its captions.
      list_image,list_txt = batch
      # The batch is never moved by the DataLoader while the model lives on
      # `device`; move it explicitly so the forward pass works on a GPU.
      images = list_image.to(device)
      texts = clip.tokenize(list_txt).to(device)

      logits_per_image, logits_per_text = model(images, texts)

      # Image i matches caption i. Size the targets from the actual batch so a
      # final batch shorter than BATCH_SIZE does not break the loss.
      ground_truth = torch.arange(len(images),dtype=torch.long,device=device)

      total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
      total_loss.backward()
      if device == "cpu":
         optimizer.step()
      else :
        # Step the optimizer on float32 weights, then convert back with convert_weights.
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)

Starting epoch  0 ...
Starting batch  1
Starting batch  2
Starting batch  3
Starting epoch  1 ...
Starting batch  1
Starting batch  2
Starting batch  3
Starting epoch  2 ...
Starting batch  1
Starting batch  2
Starting batch  3
Starting epoch  3 ...
Starting batch  1
Starting batch  2
Starting batch  3
Starting epoch  4 ...
Starting batch  1
Starting batch  2
Starting batch  3


In [15]:
# Persist the fine-tuned weights together with the optimizer state, last epoch index and last loss.
checkpoint_path = "/content/drive/MyDrive/CS230 Project/model_ckpts/model.pt" #just change to your preferred folder/filename
training_state = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': total_loss,
}
torch.save(training_state, checkpoint_path)

***Evaluation Metrics***

In [9]:
#Load trained model
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #jit=False, same as when the model was built for training
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
checkpoint = torch.load("/content/drive/MyDrive/CS230 Project/model_ckpts/model.pt", map_location=device)

# Use these 3 lines if you use default model setting(not training setting) of the clip. For example, if you set context_length to 100 since your string is very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"] 
#checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224
#checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77
#checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 

model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [21]:
import numpy as np

#Calculate train accuracy

NUM_TRAIN_EVAL = 150  # evaluate on the same first-150 slice used to build the training subset
TRAIN_FRAMES_DIR = "/content/drive/MyDrive/CS230 Project/data/train_frames/"

# The candidate captions are the same for every image, so tokenize them once
# instead of on every loop iteration.
text = clip.tokenize(train_captions[:NUM_TRAIN_EVAL]).to(device)

correct = 0

for i in range(NUM_TRAIN_EVAL):
  print("Done ", i + 1, " out of {}".format(NUM_TRAIN_EVAL))
  # The context manager closes the image file once it has been preprocessed.
  with Image.open(TRAIN_FRAMES_DIR + train_images[i]) as frame:
    image = preprocess(frame).unsqueeze(0).to(device)

  with torch.no_grad():
      # One forward pass is enough; the separate encode_image/encode_text
      # calls only repeated this work and their results were never used.
      logits_per_image, logits_per_text = model(image, text)
      probs = logits_per_image.softmax(dim=-1).cpu().numpy()
      # Count a hit when the image's own caption reaches the top probability.
      # Several frames can carry the same caption text, so the comparison is
      # on the value rather than on the argmax index.
      correct += np.max(probs, axis=1)[0] == probs[0,i]

print("Train accuracy:", correct/NUM_TRAIN_EVAL)

Done  1  out of 150
Done  2  out of 150
Done  3  out of 150
Done  4  out of 150
Done  5  out of 150
Done  6  out of 150
Done  7  out of 150
Done  8  out of 150
Done  9  out of 150
Done  10  out of 150
Done  11  out of 150
Done  12  out of 150
Done  13  out of 150
Done  14  out of 150
Done  15  out of 150
Done  16  out of 150
Done  17  out of 150
Done  18  out of 150
Done  19  out of 150
Done  20  out of 150
Done  21  out of 150
Done  22  out of 150
Done  23  out of 150
Done  24  out of 150
Done  25  out of 150
Done  26  out of 150
Done  27  out of 150
Done  28  out of 150
Done  29  out of 150
Done  30  out of 150
Done  31  out of 150
Done  32  out of 150
Done  33  out of 150
Done  34  out of 150
Done  35  out of 150
Done  36  out of 150
Done  37  out of 150
Done  38  out of 150
Done  39  out of 150
Done  40  out of 150
Done  41  out of 150
Done  42  out of 150
Done  43  out of 150
Done  44  out of 150
Done  45  out of 150
Done  46  out of 150
Done  47  out of 150
Done  48  out of 150
D

In [22]:
#Calculate test accuracy

NUM_TEST_EVAL = 50  # number of test frames (and candidate captions) to evaluate
TEST_FRAMES_DIR = "/content/drive/MyDrive/CS230 Project/data/test_frames/"

# The candidate captions are the same for every image, so tokenize them once
# instead of on every loop iteration.
text = clip.tokenize(test_captions[:NUM_TEST_EVAL]).to(device)

correct = 0

for i in range(NUM_TEST_EVAL):
  print("Done ", i + 1, " out of {}".format(NUM_TEST_EVAL))
  # The context manager closes the image file once it has been preprocessed.
  with Image.open(TEST_FRAMES_DIR + test_images[i]) as frame:
    image = preprocess(frame).unsqueeze(0).to(device)

  with torch.no_grad():
      # One forward pass is enough; the separate encode_image/encode_text
      # calls only repeated this work and their results were never used.
      logits_per_image, logits_per_text = model(image, text)
      probs = logits_per_image.softmax(dim=-1).cpu().numpy()
      # Count a hit when the image's own caption reaches the top probability.
      # Several frames can carry the same caption text, so the comparison is
      # on the value rather than on the argmax index.
      correct += np.max(probs, axis=1)[0] == probs[0,i]

print("Test accuracy:", correct/NUM_TEST_EVAL)

Done  1  out of 50
Done  2  out of 50
Done  3  out of 50
Done  4  out of 50
Done  5  out of 50
Done  6  out of 50
Done  7  out of 50
Done  8  out of 50
Done  9  out of 50
Done  10  out of 50
Done  11  out of 50
Done  12  out of 50
Done  13  out of 50
Done  14  out of 50
Done  15  out of 50
Done  16  out of 50
Done  17  out of 50
Done  18  out of 50
Done  19  out of 50
Done  20  out of 50
Done  21  out of 50
Done  22  out of 50
Done  23  out of 50
Done  24  out of 50
Done  25  out of 50
Done  26  out of 50
Done  27  out of 50
Done  28  out of 50
Done  29  out of 50
Done  30  out of 50
Done  31  out of 50
Done  32  out of 50
Done  33  out of 50
Done  34  out of 50
Done  35  out of 50
Done  36  out of 50
Done  37  out of 50
Done  38  out of 50
Done  39  out of 50
Done  40  out of 50
Done  41  out of 50
Done  42  out of 50
Done  43  out of 50
Done  44  out of 50
Done  45  out of 50
Done  46  out of 50
Done  47  out of 50
Done  48  out of 50
Done  49  out of 50
Done  50  out of 50
Test accu