In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
import torch
import torch.nn as nn
from torchvision import transforms

from transformers import  AutoTokenizer
from transformers import AutoModelForCausalLM, AutoConfig

device = 'cuda:1'

model_name = "microsoft/phi-2"
phi2_model_pretrained = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Create dataset

In [3]:
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd 
import json
import os 
import h5py
import numpy as np 

In [4]:
captions_info_df = pd.read_csv('captions_images_map_COCO_train2017.csv')

  captions_info_df = pd.read_csv('captions_images_map_COCO_train2017.csv')


In [5]:
class COCO_CLIP_Dataset(Dataset):

    def __init__(
        self, caption_file, embedding_path, tokenizer, max_token_len_data, phi2_model_pretrained, max_seq_len):
        
        self.embedding_path = embedding_path
        self.caption_file = caption_file
        self.tokenizer = tokenizer
        self.max_token_len_data = max_token_len_data
        self.phi2_model = phi2_model_pretrained
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.caption_file)
    
    def __getitem__(self, index):

        row = self.caption_file.iloc[[index]]

        df_img = row['image_id'].values[0]
        img_base_name = '0'*(12-len(str(df_img))) + str(df_img)
        img_base_name = img_base_name.replace(' ', '0')
        img_clip_embedding_path = os.path.join(self.embedding_path, f'{img_base_name}.h5')

        np_array_embed_img = h5py.File(img_clip_embedding_path,'r+')['image_features'][()]
        
        img_caption = row['caption'].values[0] ## Tokenize this 
        img_caption_tokenized = self.tokenizer(img_caption, return_tensors="pt", 
                                               return_attention_mask=False).input_ids
        pad_len = self.max_seq_len - img_caption_tokenized.shape[1]
        if pad_len != 0: 
            pad_tokens = torch.tensor([self.tokenizer.eos_token_id]*pad_len).unsqueeze(0)
            img_caption_tokenized = torch.cat((img_caption_tokenized, pad_tokens), dim=-1)
            
        img_caption_embedding = self.phi2_model.get_input_embeddings()(img_caption_tokenized)
        
        return torch.tensor(np_array_embed_img).squeeze(0), img_caption_embedding.squeeze(0).detach()

In [6]:
def file_exists(image_id, fpath = '/media/App/amaranth/lavanya/Capstone_data/clip_features_base_patch32/'): 

    n = '0'*(12-len(str(image_id))) + str(image_id) + '.h5'
    fp = os.path.join(fpath, n)

    if os.path.exists(fp): 
        return True
    else: 
        return False

In [7]:
### captions_info_df contains for 1 image multiple entries, lets reduce keeping one image, one entry. 
captions_info_df_subset = captions_info_df.drop_duplicates(subset='image_id', keep='first')

In [8]:
max_token_len_data = 75
phi2_embed_dim = 2560
clip_embed_patch = 768
clip_embed_token= 49 

dataset = COCO_CLIP_Dataset(captions_info_df_subset, 
                            '/media/App/amaranth/lavanya/Capstone_data/clip_features_base_patch32/', 
                            tokenizer, max_token_len_data, phi2_model_pretrained, max_token_len_data)

In [9]:
# class MyModel(nn.Module):
#     def __init__(self, clip_embed_patch=clip_embed_patch, 
#                  max_seq_len=max_token_len_data, 
#                  phi2_embed_dim=phi2_embed_dim): 
        
#         super(MyModel, self).__init__()
        
#         self.clip_embed_patch = clip_embed_patch
#         self.max_seq_len = max_seq_len
#         self.phi2_embed_dim = phi2_embed_dim
        
#         # Global Average Pooling layer
#         self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)

#         # Linear layer
#         self.fc1 = nn.Linear(self.clip_embed_patch, 3000)
#         self.fc2 = nn.Linear(3000, 3000)
#         self.fc3 = nn.Linear(3000, self.max_seq_len*self.phi2_embed_dim)

#         # Optional activation functions
#         self.relu = nn.ReLU()

#     def forward(self, x):        
#         # Global Average Pooling
#         x = self.global_avg_pooling(x.transpose(1, 2)).squeeze(dim=2)

#         # Linear layer
#         x = self.relu(self.fc1(x))
#         x = self.relu(self.fc2(x))
#         x = self.relu(self.fc2(x))
#         x = self.fc3(x)

#         x = x.view(-1, self.max_seq_len, self.phi2_embed_dim)
        
#         return x
        

# model = MyModel()

In [10]:
class SimpleResBlock(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_size)
        self.proj = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.ReLU(),
            nn.Linear(input_size, input_size)
        )
    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)
    
class MyModel(nn.Module):
    def __init__(self, clip_embed_patch=clip_embed_patch, clip_embed_token = clip_embed_token,
                 max_seq_len=max_token_len_data, 
                 phi2_embed_dim=phi2_embed_dim): 
        
        super(MyModel, self).__init__()
        
        self.clip_embed_patch = clip_embed_patch
        self.max_seq_len = max_seq_len
        self.phi2_embed_dim = phi2_embed_dim
        self.clip_embed_token = clip_embed_token
        
        self.linear_1 = nn.Linear(self.clip_embed_patch, 1500) 
        self.linear_2 = nn.Linear(1500, 1500)
        self.linear_3 = nn.Linear(1500, self.phi2_embed_dim)
        
        self.projection_1 = SimpleResBlock(self.phi2_embed_dim)   
        
        self.fc4 = nn.Linear(self.clip_embed_token, self.max_seq_len)

        # Optional activation functions
        self.relu = nn.ReLU()

    def forward(self, x):        
        # -1, 49, 768, --> -1, 49, 2560 
        x = self.relu(self.linear_3(self.relu(self.linear_2(self.relu(self.linear_1(x))))))    
        
        x = self.projection_1(x)  # -1, 49, 2560, --> -1, 49, 2560
        
        x = x.swapaxes(-2, -1)    # -1, 2560, 49
        x = self.fc4(x)           # -1, 2560, 49 --> -1, 2560, 75
        
        x = x.swapaxes(-2, -1)    # -1, 75, 2560
        x = self.projection_1(x)
        
        return x
        

model = MyModel()

In [11]:
batch_size_train = 32
train_dataloader = DataLoader(dataset, batch_size=batch_size_train, shuffle=True, num_workers=8)

optimizer = torch.optim.Adam(model.parameters(),lr=1e-5, eps=1e-9) 
normalize = transforms.Normalize(mean = 0, std = 1)

In [12]:
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime 

writer = SummaryWriter(log_dir=f"BiggerRemoveGAP{datetime.now().strftime('%b%d_%H-%M-%S')}")

In [None]:
num_epochs = 50
count = 0 

model.train()
for epoch in range(num_epochs): 
    
    print(f"Working on epoch {epoch}")
    for iteration, batch in enumerate(train_dataloader):
        
        optimizer.zero_grad()
        input_ = batch[0]
        gt_ = batch[1]
          
        output_ = model(input_)
                
        cosine_sim = F.cosine_similarity(output_, gt_).mean()
        loss = 1 - cosine_sim
                
        loss.backward()
        optimizer.step()
        
        print(f"Loss:", loss)
        writer.add_scalar('Loss/train', loss, count)
        count += 1
    

Working on epoch 0
Loss: tensor(0.9995, grad_fn=<RsubBackward1>)
Loss: tensor(0.9962, grad_fn=<RsubBackward1>)
Loss: tensor(0.9928, grad_fn=<RsubBackward1>)
Loss: tensor(0.9888, grad_fn=<RsubBackward1>)
Loss: tensor(0.9859, grad_fn=<RsubBackward1>)
Loss: tensor(0.9816, grad_fn=<RsubBackward1>)
Loss: tensor(0.9790, grad_fn=<RsubBackward1>)
Loss: tensor(0.9760, grad_fn=<RsubBackward1>)
Loss: tensor(0.9719, grad_fn=<RsubBackward1>)
Loss: tensor(0.9682, grad_fn=<RsubBackward1>)
Loss: tensor(0.9676, grad_fn=<RsubBackward1>)
Loss: tensor(0.9606, grad_fn=<RsubBackward1>)
Loss: tensor(0.9577, grad_fn=<RsubBackward1>)
Loss: tensor(0.9531, grad_fn=<RsubBackward1>)
Loss: tensor(0.9521, grad_fn=<RsubBackward1>)
Loss: tensor(0.9496, grad_fn=<RsubBackward1>)
Loss: tensor(0.9451, grad_fn=<RsubBackward1>)
Loss: tensor(0.9426, grad_fn=<RsubBackward1>)
Loss: tensor(0.9379, grad_fn=<RsubBackward1>)
Loss: tensor(0.9319, grad_fn=<RsubBackward1>)
Loss: tensor(0.9306, grad_fn=<RsubBackward1>)
Loss: tensor(0.

In [None]:
tokenizer.batch_decode(phi2_model_pretrained.generate(inputs_embeds=output_[0, :, :].unsqueeze(0), bos_token_id=tokenizer.bos_token_id))

In [None]:
tokenizer.batch_decode(phi2_model_pretrained.generate(inputs_embeds=gt_[0, :, :].unsqueeze(0), bos_token_id=tokenizer.bos_token_id))