In [1]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
import pandas as pd
import lmdb
import io
import pickle
import os
import json
import urllib
from transformers import CLIPTokenizer
from transformers import CLIPFeatureExtractor
from transformers import CLIPProcessor
from transformers import CLIPModel, CLIPConfig
from transformers import get_scheduler
from PIL import Image
from tqdm import tqdm
import math

In [2]:
class MyDataset(torch.utils.data.Dataset):
    """Image/hypothesis/label triples read from a CSV file and encoded with a CLIP processor.

    Each item is the processor output for one (hypothesis, image) pair with the
    leading batch dimension removed, plus an integer ``label`` tensor.

    Args:
        dataset_path: path or URL of a CSV with at least the columns
            ``hypothesis``, ``Flickr30kID`` and ``gold_label``.
        image_path: directory that contains the image files named in ``Flickr30kID``.
        max_length: stored on the instance. NOTE(review): it is not applied to
            tokenisation, which pads/truncates to ``TEXT_MAX_LENGTH``; confirm the
            text encoder's position limit before wiring it in.
    """

    # Length every hypothesis is padded/truncated to by the processor.
    TEXT_MAX_LENGTH = 68

    def __init__(self,dataset_path, image_path,max_length = 90):
        self.max_length = max_length
        self.dataset = self.read_dataset(dataset_path)
        self.image_path = image_path
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def read_dataset(self, url,sort = False):
        """Load the CSV and return a frame with integer-encoded labels.

        Args:
            url: anything ``pandas.read_csv`` accepts.
            sort: when true, order rows by hypothesis length (shortest first).

        Returns:
            DataFrame with columns ``hypothesis``, ``Flickr30kID``, ``gold_label``
            and a fresh 0..n-1 index, so positional access follows the row order.

        Raises:
            KeyError: if a ``gold_label`` value is not one of the three known labels.
        """
        labels_encoding = {'contradiction':0,'neutral': 1,
                           'entailment':2}
        raw = pd.read_csv(url)
        # .copy() so the label assignment below does not write into a slice of `raw`.
        dataset = raw[['hypothesis','Flickr30kID','gold_label']].copy()
        dataset['gold_label'] = dataset['gold_label'].map(lambda label: labels_encoding[label])
        if sort:
            dataset = dataset.sort_values(by="hypothesis", key=lambda col: col.str.len())
        # Without the reset, index-label lookups would still follow the file order.
        return dataset.reset_index(drop=True)

    def get_visual_features(self,img):
        """Run only the image half of the processor on ``img``."""
        return self.processor(images=img)

    def get_text_features(self,text):
        """Run only the tokenizer half of the processor on ``text``."""
        return self.processor(text=text)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        The result has the keys ``input_ids``, ``attention_mask``, ``pixel_values``
        (each without a batch dimension) and ``label``.
        """
        row = self.dataset.iloc[idx]
        image_file = os.path.join(self.image_path, str(row['Flickr30kID']))
        # Decode inside a context manager so the file handle is released right away.
        with Image.open(image_file) as handle:
            image = handle.convert("RGB")
        item = self.processor(
            text=row['hypothesis'],
            images=image,
            return_tensors="pt",
            padding="max_length",
            max_length=self.TEXT_MAX_LENGTH,
            truncation=True,
        )
        # The processor returns batched tensors of size 1; strip that dimension.
        for key in ('input_ids', 'attention_mask', 'pixel_values'):
            item[key] = item[key][0]
        item['label'] = torch.tensor(int(row['gold_label']))
        return item

    def __len__(self):
        """Number of examples."""
        return len(self.dataset.index)

    def __enter__(self):
        """Allow ``with MyDataset(...) as ds:``; returns the dataset itself."""
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """Nothing to release: the dataset keeps no open handles between items."""
        return None

In [3]:
class MyTrainer():
    """Fine-tuning loop: AdamW, linear LR decay, a checkpoint and an evaluation per epoch.

    Args:
        model: module whose forward accepts the batch keys as keyword arguments
            and returns an object with a ``logits`` attribute; it must also
            provide ``save_model(path)``.
        train: training dataset handed to a ``DataLoader``.
        eval_test: object with ``evaluate(batch_size=...)`` returning an accuracy.
        device: device batches are moved to. When omitted, the device of the
            model's first parameter is used so batches and weights agree.
        num_labels: number of classes, used to reshape the logits for the loss.
    """

    def __init__(self,model,train,eval_test, device = None, num_labels = 3):
        self.model = model
        if device is None:
            # Follow the model instead of leaving batches wherever the loader put them.
            first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
            device = first_param.device if first_param is not None else None
        self.device = device
        self.train = train
        self.eval_test = eval_test
        self.test_acc_list = []  # one accuracy per finished epoch
        self.model_path = "./models/my_model_epoch_"
        self.num_labels = num_labels
        self.config_problem_type = "single_label_classification"
        if self.config_problem_type == "single_label_classification":
            self.loss_fct = torch.nn.CrossEntropyLoss()
            self.output_loss = lambda output,labels : self.loss_fct(output.logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config_problem_type == "regression":
            self.loss_fct = torch.nn.MSELoss()
            if self.num_labels == 1:
                self.output_loss = lambda output,labels : self.loss_fct(output.logits.squeeze(), labels.squeeze())
            else:
                self.output_loss = lambda output,labels : self.loss_fct(output.logits, labels)
        elif self.config_problem_type == "multi_label_classification":
            self.loss_fct = torch.nn.BCEWithLogitsLoss()
            self.output_loss = lambda output,labels : self.loss_fct(output.logits, labels)

    def train_model(self,batch_size = None, lr= None, epochs=None, num_workers = 4):
        """Train ``self.model`` and evaluate it after every epoch.

        Args:
            batch_size: mini-batch size for training and for the per-epoch evaluation.
            lr: AdamW learning rate.
            epochs: number of passes over the training set.
            num_workers: ``DataLoader`` worker processes (0 loads in the main process).

        Side effects:
            Writes ``<model_path><epoch>`` after each epoch, appends the accuracy
            to ``self.test_acc_list`` and prints progress.
        """
        optimizer = AdamW(self.model.parameters(), lr=lr)
        train_loader = DataLoader(self.train, batch_size=batch_size, shuffle=True, num_workers = num_workers)
        lr_scheduler = get_scheduler(
            name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps= epochs * len(train_loader)
        )
        # Create the checkpoint directory up front rather than failing after a full epoch.
        checkpoint_dir = os.path.dirname(self.model_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        for epoch in range(epochs):
            self.model.train()
            train_losses = []
            for item in tqdm(train_loader):
                for key in ('input_ids', 'attention_mask', 'pixel_values', 'label'):
                    item[key] = item[key].to(self.device)
                optimizer.zero_grad()
                outputs = self.model(**item)
                loss = self.output_loss(outputs, item['label'])
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                # Keep a plain float so the autograd graph is not held for the whole epoch.
                train_losses.append(loss.item())
            print("Saving model ....")
            self.model.save_model(self.model_path+str(epoch))
            print("Model Saved!")
            test_acc = self.eval_test.evaluate(batch_size = batch_size)
            self.test_acc_list.append(test_acc)
            print('--- Epoch ',epoch,' Acc: ',test_acc)
            mean_loss = torch.tensor(train_losses).mean().item()
            print('Training loss: %.4f' % (mean_loss))
        return

In [4]:
class MyEvaluator():
    """Computes classification accuracy of a model over a fixed dataset.

    Args:
        model: object exposing ``eval()``, ``train(mode)`` and ``predict(batch)``;
            ``predict`` must return one predicted label per example.
        test: dataset handed to a ``DataLoader``; batches must contain the keys
            ``input_ids``, ``attention_mask``, ``pixel_values`` and ``label``.
        device: device batches are moved to. When omitted, the device of the
            model's first parameter is used so batches and weights agree.
    """

    def __init__(self,model,test, device = None):
        self.test_dataset = test
        self.model = model
        if device is None:
            first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
            device = first_param.device if first_param is not None else None
        self.device = device

    def evaluate(self, batch_size = 64, num_workers = 4):
        """Return the fraction of correctly predicted labels.

        Args:
            batch_size: evaluation mini-batch size.
            num_workers: ``DataLoader`` worker processes (0 loads in the main process).

        Returns:
            Accuracy in [0, 1]; 0.0 when the dataset yields no examples.
        """
        # Remember the current mode so evaluation does not silently flip it.
        was_training = getattr(self.model, "training", True)
        self.model.eval()
        loader = DataLoader(self.test_dataset, batch_size=batch_size, shuffle = False, num_workers = num_workers)
        n_correct = 0
        n_possible = 0
        try:
            # No gradients are needed for accuracy; skipping them saves memory and time.
            with torch.no_grad():
                for item in loader:
                    for key in ('input_ids', 'attention_mask', 'pixel_values', 'label'):
                        item[key] = item[key].to(self.device)
                    y_hat = self.model.predict(item)
                    y = item['label']
                    n_correct += (y == y_hat).sum().item()
                    n_possible += y.shape[0]
        finally:
            self.model.train(was_training)
        if n_possible == 0:
            return 0.0
        return n_correct / n_possible

In [None]:
class CLIP(CLIPModel):
    """CLIP backbone with a small fusion transformer and a linear classifier on top.

    Projected image tokens and projected text tokens are concatenated, passed
    through a 3-layer transformer encoder, mean-pooled over the non-padded
    positions and classified into ``num_labels`` classes.

    NOTE: later cells in this notebook define ``CLIP`` again; when the cells are
    run top to bottom the last definition is the one bound to the name.

    NOTE(review): the backbone is built from a ``CLIPConfig`` only. According to
    the Transformers documentation that creates the architecture without loading
    pretrained weights -- confirm this is intended.
    """

    def __init__(self, num_labels=3):
        #super().__init__(CLIPConfig.from_pretrained("openai/clip-vit-base-patch32"))
        super().__init__(CLIPConfig.from_pretrained("flax-community/clip-rsicd-v2"))
        # Kept as an attribute so state_dict keys stay compatible with saved checkpoints;
        # TransformerEncoder works on its own copies of this layer.
        self.new_encoder_layer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)
        self.new_transformer_encoder = torch.nn.TransformerEncoder(self.new_encoder_layer, num_layers=3)
        self.classification = torch.nn.Linear(512, num_labels, bias=True)
        self.num_labels = num_labels

        self.config.problem_type = "single_label_classification"

    def forward(self, input_ids=None, pixel_values=None, attention_mask=None, position_ids=None, return_loss=None, output_attentions=None, output_hidden_states=None, label=None):
        """Run the backbone and the fusion head.

        Args:
            input_ids, pixel_values, attention_mask, position_ids, return_loss,
            output_attentions, output_hidden_states: forwarded to ``CLIPModel.forward``.
            label: accepted so a whole batch dict can be splatted in; not used here.

        Returns:
            The backbone output object with an extra ``logits`` attribute holding
            one row of ``num_labels`` scores per example.
        """
        # Keywords instead of positions so the call does not depend on parameter order.
        output = super().forward(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_loss=return_loss,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        aux_vision = self.visual_projection(output.vision_model_output[0])
        aux_text = self.text_projection(output.text_model_output[0])
        aux = torch.cat((aux_vision, aux_text), dim=1)  # (batch, vision + text tokens, features)

        # 1 marks a real token, 0 marks padding; image tokens are never padding.
        if attention_mask is None:
            attention_mask = torch.ones(aux_text.shape[:2], dtype=torch.long, device=aux.device)
        attention_mask = attention_mask.to(aux.device)
        vision_mask = torch.ones(aux_vision.shape[:2], dtype=attention_mask.dtype, device=aux.device)
        multi_modal_mask = torch.cat((vision_mask, attention_mask), dim=1)

        # The encoder is sequence-first: swap axes with transpose (reshape would
        # reinterpret memory and mix tokens of different examples).
        # src_key_padding_mask flags the positions to IGNORE, i.e. the inverse of the mask above.
        aux = self.new_transformer_encoder(aux.transpose(0, 1), src_key_padding_mask=(multi_modal_mask == 0))
        aux = aux.transpose(0, 1)  # back to (batch, sequence length, features)

        # Mean over the non-padded positions only.
        pool_weights = multi_modal_mask.unsqueeze(-1).to(aux.dtype)
        aux = torch.sum(aux * pool_weights, 1) / torch.clamp(pool_weights.sum(1), min=1e-9)

        output.logits = self.classification(aux)
        return output

    def predict(self,item):
        """Return the predicted class index for every example in the batch dict ``item``."""
        with torch.no_grad():
            scores = self(**item)  # logits: (n_examples x n_classes)
        predicted_labels = scores.logits.argmax(dim=-1)  # (n_examples)
        return predicted_labels

    def save_model(self,path):
        """Write the state dict to ``path``, creating the parent directory if needed."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        torch.save(self.state_dict(), path)

    def load_model(self,path):
        """Load a state dict from ``path`` onto the model's current device and switch to eval mode."""
        # torch.load unpickles the file: only load checkpoints from a trusted source.
        target = next(self.parameters()).device
        self.load_state_dict(torch.load(path, map_location=target))
        self.eval()

In [5]:
class CLIP(CLIPModel):
    """Shape-tracing variant of the CLIP fusion classifier.

    Same architecture as the other ``CLIP`` definitions in this notebook; the
    forward pass additionally prints the shape of every intermediate tensor.

    NOTE: a later cell in this notebook defines ``CLIP`` again; when the cells
    are run top to bottom the last definition is the one bound to the name.

    NOTE(review): the backbone is built from a ``CLIPConfig`` only. According to
    the Transformers documentation that creates the architecture without loading
    pretrained weights -- confirm this is intended.
    """

    def __init__(self, num_labels=3):
        #super().__init__(CLIPConfig.from_pretrained("openai/clip-vit-base-patch32"))
        super().__init__(CLIPConfig.from_pretrained("flax-community/clip-rsicd-v2"))
        # Kept as an attribute so state_dict keys stay compatible with saved checkpoints;
        # TransformerEncoder works on its own copies of this layer.
        self.new_encoder_layer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)
        self.new_transformer_encoder = torch.nn.TransformerEncoder(self.new_encoder_layer, num_layers=3)
        self.classification = torch.nn.Linear(512, num_labels, bias=True)
        self.num_labels = num_labels

        self.config.problem_type = "single_label_classification"

    def forward(self, input_ids=None, pixel_values=None, attention_mask=None, position_ids=None, return_loss=None, output_attentions=None, output_hidden_states=None, label=None):
        """Run the backbone and the fusion head, printing tensor shapes along the way.

        Args:
            input_ids, pixel_values, attention_mask, position_ids, return_loss,
            output_attentions, output_hidden_states: forwarded to ``CLIPModel.forward``.
            label: accepted so a whole batch dict can be splatted in; not used here.

        Returns:
            The backbone output object with an extra ``logits`` attribute holding
            one row of ``num_labels`` scores per example.
        """
        # Keywords instead of positions so the call does not depend on parameter order.
        output = super().forward(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_loss=return_loss,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        print(output.vision_model_output[0].shape)
        print(output.text_model_output[0].shape)

        aux_vision = self.visual_projection(output.vision_model_output[0])
        aux_text = self.text_projection(output.text_model_output[0])

        print(aux_text.shape)
        print(aux_vision.shape)

        aux = torch.cat((aux_vision, aux_text), dim=1)  # (batch, vision + text tokens, features)

        print(aux.shape)

        # The encoder is sequence-first: swap axes with transpose (reshape would
        # reinterpret memory and mix tokens of different examples).
        aux = aux.transpose(0, 1)

        print(aux.shape)

        # 1 marks a real token, 0 marks padding; image tokens are never padding.
        if attention_mask is None:
            attention_mask = torch.ones(aux_text.shape[:2], dtype=torch.long, device=aux.device)
        attention_mask = attention_mask.to(aux.device)
        vision_mask = torch.ones(aux_vision.shape[:2], dtype=attention_mask.dtype, device=aux.device)

        print(vision_mask.shape)

        multi_modal_mask = torch.cat((vision_mask, attention_mask), dim=1)

        print(multi_modal_mask.shape)

        # src_key_padding_mask flags the positions to IGNORE, i.e. the inverse of the mask above.
        aux = self.new_transformer_encoder(aux, src_key_padding_mask=(multi_modal_mask == 0))
        aux = aux.transpose(0, 1)  # back to (batch, sequence length, features)

        print(aux.shape)
        # Trailing singleton axis so the mask broadcasts over the feature dimension.
        pool_weights = multi_modal_mask.unsqueeze(-1).to(aux.dtype)
        print(pool_weights.shape)
        clamp = torch.clamp(pool_weights.sum(1), min=1e-9)
        multiplication = aux * pool_weights
        aux = torch.sum(multiplication, 1) / clamp

        output.logits = self.classification(aux)

        return output

    def predict(self,item):
        """Return the predicted class index for every example in the batch dict ``item``."""
        with torch.no_grad():
            scores = self(**item)  # logits: (n_examples x n_classes)
        predicted_labels = scores.logits.argmax(dim=-1)  # (n_examples)
        return predicted_labels

    def save_model(self,path):
        """Write the state dict to ``path``, creating the parent directory if needed."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        torch.save(self.state_dict(), path)

    def load_model(self,path):
        """Load a state dict from ``path`` onto the model's current device and switch to eval mode."""
        # torch.load unpickles the file: only load checkpoints from a trusted source.
        target = next(self.parameters()).device
        self.load_state_dict(torch.load(path, map_location=target))
        self.eval()

In [6]:
class CLIP(CLIPModel):
    """CLIP backbone with a small fusion transformer and a linear classifier on top.

    Projected image tokens and projected text tokens are concatenated, passed
    through a 3-layer transformer encoder, mean-pooled over the non-padded
    positions and classified into ``num_labels`` classes.

    NOTE(review): the backbone is built from a ``CLIPConfig`` only. According to
    the Transformers documentation that creates the architecture without loading
    pretrained weights -- confirm this is intended.
    """

    def __init__(self, num_labels=3):
        super().__init__(CLIPConfig.from_pretrained("openai/clip-vit-base-patch32"))
        # Kept as an attribute so state_dict keys stay compatible with saved checkpoints;
        # TransformerEncoder works on its own copies of this layer.
        self.new_encoder_layer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)
        self.new_transformer_encoder = torch.nn.TransformerEncoder(self.new_encoder_layer, num_layers=3)
        self.classification = torch.nn.Linear(512, num_labels, bias=True)
        self.num_labels = num_labels

    def forward(self, input_ids=None, pixel_values=None, attention_mask=None, position_ids=None, return_loss=None, output_attentions=None, output_hidden_states=None, label=None):
        """Run the backbone and the fusion head.

        Args:
            input_ids, pixel_values, attention_mask, position_ids, return_loss,
            output_attentions, output_hidden_states: forwarded to ``CLIPModel.forward``.
            label: accepted so a whole batch dict can be splatted in; not used here.

        Returns:
            The backbone output object with an extra ``logits`` attribute holding
            one row of ``num_labels`` scores per example.
        """
        # Keywords instead of positions so the call does not depend on parameter order.
        output = super().forward(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_loss=return_loss,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        aux_vision = self.visual_projection(output.vision_model_output[0])
        aux_text = self.text_projection(output.text_model_output[0])
        aux = torch.cat((aux_vision,aux_text),dim=1)  # (batch, vision + text tokens, features)

        # 1 marks a real token, 0 marks padding; image tokens are never padding.
        if attention_mask is None:
            attention_mask = torch.ones(aux_text.shape[:2], dtype=torch.long, device=aux.device)
        attention_mask = attention_mask.to(aux.device)
        ones = torch.ones(aux_vision.shape[:2], dtype=attention_mask.dtype, device=aux.device)
        aux_mask = torch.cat((ones,attention_mask), dim=1)

        # The encoder is sequence-first, so hand it (sequence, batch, features);
        # feeding the batch-first tensor would make examples attend to each other.
        # src_key_padding_mask flags the positions to IGNORE, i.e. the inverse of aux_mask.
        aux = self.new_transformer_encoder(aux.transpose(0, 1), src_key_padding_mask=(aux_mask == 0))
        aux = aux.transpose(0, 1)  # back to (batch, sequence length, features)

        # Mean over the non-padded positions only.
        input_mask_expanded = aux_mask.unsqueeze(-1).expand(aux.size()).to(aux.dtype)
        aux = torch.sum(aux * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        output.logits = self.classification(aux)
        return output

    def predict(self,item):
        """Return the predicted class index for every example in the batch dict ``item``."""
        with torch.no_grad():
            scores = self(**item)  # logits: (n_examples x n_classes)
        predicted_labels = scores.logits.argmax(dim=-1)  # (n_examples)
        return predicted_labels

    def save_model(self,path):
        """Write the state dict to ``path``, creating the parent directory if needed."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        torch.save(self.state_dict(), path)

    def load_model(self,path):
        """Load a state dict from ``path`` onto the model's current device and switch to eval mode."""
        # torch.load unpickles the file: only load checkpoints from a trusted source.
        target = next(self.parameters()).device
        self.load_state_dict(torch.load(path, map_location=target))
        self.eval()

In [7]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Only query the CUDA device name when a CUDA device was actually selected.
torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"

    Found GPU%d %s which is of cuda capability %d.%d.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is %d.%d.
    


'GeForce GTX 1080 Ti'

In [9]:
# All three splits read their images from the same Flickr30k folder.
image_dir = '../e-ViL/data/flickr30k_images/flickr30k_images/'
train, test, dev = (
    MyDataset(csv_file, image_dir, max_length=90)
    for csv_file in (
        '../e-ViL/data/esnlive_train.csv',
        '../e-ViL/data/esnlive_test.csv',
        '../e-ViL/data/esnlive_dev.csv',
    )
)

In [8]:
# Build the classifier and place its weights on the selected device.
model = CLIP().to(device)

In [None]:
# Run configuration: 'train' fine-tunes and evaluates, 'test' loads a checkpoint and evaluates.
task = 'train'
batch_size = 64
epochs = 10
lr = 1e-5
if task =='train':
    # Pass the device explicitly so batches are moved to where the model lives.
    # NOTE(review): accuracy on `test` is reported after every epoch and `dev` only
    # once at the end; confirm the two splits are not swapped.
    test_evaluator = MyEvaluator(model,test, device=device)
    dev_evaluator = MyEvaluator(model,dev, device=device)
    trainer = MyTrainer(model,train,test_evaluator, device=device)
    print("-----Training Model-----")
    trainer.train_model(epochs=epochs ,batch_size = batch_size, lr = lr)
    print('----Training finished-----')
    dev_acc = dev_evaluator.evaluate(batch_size = batch_size)
    print("---- Dev Acc: ",dev_acc)
elif task =='test':
    # NOTE(review): absolute path that differs from the location the trainer
    # saves checkpoints to; confirm which file should be loaded.
    model.load_model("/content/drive/MyDrive/teses/tese_MECD/implementation/my_model")
    evaluator = MyEvaluator(model,dev, device=device)
    acc = evaluator.evaluate(batch_size = batch_size)
    print(acc)
    #output = run_example(model,train)

-----Training Model-----


  1%|          | 51/6277 [02:14<4:28:44,  2.59s/it]