In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
model_name = "microsoft/phi-2"

# trust_remote_code=True is forwarded to from_pretrained unchanged; it allows
# modelling code shipped with the Hub repository to be executed on load.
phi2_model_pretrained = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [140]:
# Display the module tree of the loaded checkpoint (sub-module names and layer sizes).
phi2_model_pretrained

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2560,),

### Create model 

In [180]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimpleResBlock(nn.Module):
    """Residual MLP block operating on the last dimension of its input.

    The input is layer-normalised first; the block returns the normalised
    tensor plus a Linear -> GELU -> Linear transform of that same normalised
    tensor. Input and output feature sizes are both ``input_size``.
    """

    def __init__(self, input_size):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_size)
        # Built in the same order as the Sequential indices (0, 1, 2) so that
        # parameter names and initialisation order stay stable.
        mlp_layers = [
            nn.Linear(input_size, input_size),
            nn.GELU(),
            nn.Linear(input_size, input_size),
        ]
        self.proj = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Return ``pre_norm(x) + proj(pre_norm(x))``."""
        normed = self.pre_norm(x)
        transformed = self.proj(normed)
        # The skip connection carries the *normalised* input, not the raw one.
        return normed + transformed
    
class Phi2wrapper(nn.Module):
    """Feed projected image features through the decoder stack of a phi-2 model.

    The input features are mapped from ``input_dim_CLIP`` to ``input_dim_phi2``
    by a bias-free linear layer, refined by a ``SimpleResBlock``, passed through
    every decoder layer of ``phi2_model.model.layers`` in order, normalised by
    ``phi2_model.model.final_layernorm`` and finally projected by
    ``phi2_model.lm_head``.

    Args:
        input_dim_CLIP: size of the last dimension of the incoming features.
        input_dim_phi2: hidden size expected by the wrapped model's layers.
        phi2_model: model exposing ``.model.layers``, ``.model.final_layernorm``
            and ``.lm_head``. When ``None`` (the default) the notebook-level
            ``phi2_model_pretrained`` is used.
    """

    def __init__(self, input_dim_CLIP=768, input_dim_phi2=2560, phi2_model=None):
        super().__init__()
        if phi2_model is None:
            # Looked up at construction time rather than at class-definition
            # time, so re-running the model-loading cell is picked up and the
            # class can be defined before the checkpoint is loaded.
            phi2_model = phi2_model_pretrained
        # Honour the constructor arguments instead of hard-coding 768 / 2560.
        self.input_dim_CLIP = input_dim_CLIP
        self.input_dim_phi2 = input_dim_phi2
        self.projection_img = nn.Linear(self.input_dim_CLIP, self.input_dim_phi2,
                                        bias=False)
        self.resblock = SimpleResBlock(self.input_dim_phi2)
        self.phi2_model = phi2_model

    def forward(self, x):
        """Return the ``lm_head`` output for features ``x``.

        ``x`` must have ``input_dim_CLIP`` as its last dimension.
        """
        x = self.projection_img(x)
        x = self.resblock(x)

        # Each decoder layer is called with the hidden states only (no attention
        # mask or position ids are passed) and returns a tuple whose first
        # element is the new hidden states. Iterating the ModuleList avoids
        # hard-coding the layer count.
        for decoder_layer in self.phi2_model.model.layers:
            x = decoder_layer(x)[0]

        x = self.phi2_model.model.final_layernorm(x)
        return self.phi2_model.lm_head(x)

In [181]:
# Same values as the constructor defaults, spelled out so the cell shows which
# dimensions and which base model the wrapper is built from.
phi2_projection_model = Phi2wrapper(
    input_dim_CLIP=768,
    input_dim_phi2=2560,
    phi2_model=phi2_model_pretrained,
)

In [165]:
## Freeze every parameter that belongs to the wrapped phi-2 model so that only
## the remaining (projection) parameters keep requires_grad=True.

FROZEN_SUBMODULE_TAG = "phi2_model"

for param_name, weight in phi2_projection_model.named_parameters():
    if FROZEN_SUBMODULE_TAG not in param_name:
        continue
    weight.requires_grad = False

### Create dataset

In [166]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd 
import json
import os 
import h5py


In [167]:
def get_image_name(image_id_from_caption, list_image_info):
    """Return the file-name stem of the image whose ``'id'`` matches.

    Args:
        image_id_from_caption: image id to look up (compared with ``==``).
        list_image_info: iterable of dicts, each with ``'id'`` and
            ``'file_name'`` keys.

    Returns:
        The part of ``'file_name'`` before its first ``'.'`` for the first
        matching entry, or the sentinel string ``'NoImgNameFound'`` when no
        entry matches.

    Note:
        This is a linear scan per call; when it is applied to many ids, build
        an ``{id: file_name}`` dict once and look ids up in that instead.
    """
    for img in list_image_info:
        if img['id'] == image_id_from_caption:
            # Only the text before the first dot is needed, so split once.
            return img['file_name'].split('.', 1)[0]
    return 'NoImgNameFound'

In [168]:
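# One-off preprocessing, kept for provenance: it writes captions_images_map_COCO_train2017.csv, which the next cell loads.
# file_path_captions_coco = '/media/App/amaranth/lavanya/Capstone_data/annotations_trainval2017/annotations/captions_train2017.json'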

# with open(file_path_captions_coco) as f:
#    data = json.load(f)

# captions_info = []
# for a in data['annotations']: 
#     captions_info.append([a['image_id'], a['caption'], a['id']])

# captions_info_df = pd.DataFrame(data=captions_info, columns=['image_id', 'caption', 'caption_id'])
# captions_info_df['image_name'] = captions_info_df['image_id'].apply(lambda x: get_image_name(x, data['images']))
# captions_info_df['image_name'] = captions_info_df['image_name'].apply(lambda x: '0'*(12-len(str(x))) + str(x))
# captions_info_df.to_csv('captions_images_map_COCO_train2017.csv')

In [169]:
# Read image_name as text: it holds file-name stems (zero-padded when the CSV
# was generated), so letting pandas infer a numeric dtype would strip the
# padding and can yield mixed types within the column.
captions_info_df = pd.read_csv(
    'captions_images_map_COCO_train2017.csv',
    dtype={'image_name': str},
)

  captions_info_df = pd.read_csv('captions_images_map_COCO_train2017.csv')


In [170]:
import h5py    
import numpy as np    

In [171]:
class COCO_CLIP_Dataset(Dataset):
    """Dataset that returns pre-computed image embeddings stored as HDF5 files.

    Args:
        caption_file: pandas DataFrame with at least an ``'image_name'`` column
            (one row per caption); its length is the dataset length.
        embedding_path: directory holding one ``<image_name>.h5`` file per
            image, each with an ``'image_features'`` dataset.
    """

    def __init__(
        self, caption_file, embedding_path):
        self.embedding_path = embedding_path
        self.caption_file = caption_file

    def __len__(self):
        """Number of rows in the caption table."""
        return len(self.caption_file)

    def __getitem__(self, index):
        """Return the stored ``'image_features'`` array for row ``index`` as a tensor."""
        image_name = str(self.caption_file['image_name'].iloc[index])
        # File stems are 12 characters wide: left-pad with zeros and turn any
        # blanks into zeros as well.
        img_base_name = image_name.rjust(12, '0').replace(' ', '0')
        img_clip_embedding_path = os.path.join(self.embedding_path, f'{img_base_name}.h5')

        # Open read-only and close the handle right after reading; the array is
        # fully loaded into memory by the [()] read, so nothing refers to the
        # file afterwards.
        with h5py.File(img_clip_embedding_path, 'r') as embedding_file:
            np_array_embed_img = embedding_file['image_features'][()]

        # TODO: tokenize the row's 'caption' and return it alongside the embedding.

        return torch.tensor(np_array_embed_img)

In [172]:
# Directory holding the per-image CLIP feature files read by the dataset.
CLIP_EMBEDDING_DIR = '/media/App/amaranth/lavanya/Capstone_data/clip_features_base_patch32/'

dataset = COCO_CLIP_Dataset(captions_info_df, CLIP_EMBEDDING_DIR)

In [183]:
# Shape-only sanity check: run the forward pass without autograd so no
# activations are retained for a backward pass that never happens.
with torch.no_grad():
    sanity_logits = phi2_projection_model(dataset[0])
sanity_logits.shape

torch.Size([1, 49, 51200])