In [None]:
!pip install ekphrasis

In [None]:
import argparse

parser = argparse.ArgumentParser(description='Train Multimodal MLP Models for Sentiment')
parser.add_argument('--vtype', type=str, default='clip',
                    help='imagenet | places | emotion | clip')
parser.add_argument('--ttype', type=str, default='clip',
                    help='bertbase | robertabase | clip')
parser.add_argument('--mvsa', type=str, default='single',
                    help='single | multiple')
parser.add_argument('--ht', type=bool, default=True,
                    help='True | False')
parser.add_argument('--bs', type=int, default=32,
                    help='32, 64, 128')
parser.add_argument('--epochs', type=int, default=100,
                    help='50, 75, 100')
parser.add_argument('--lr', type=str, default='2e-5',
                    help='1e-4, 5e-5, 2e-5')
parser.add_argument('--ftype', type=str, default='feats',
                    help='feats | logits')
parser.add_argument('--layer', type=str, default='sumavg',
                    help='sumavg, 2last, last')
parser.add_argument('--norm', type=int, default=1,
                    help='0 | 1')
parser.add_argument('--split', type=int, default=1,
                    help='1-10')
parser.add_argument('--smooth', type=bool, default=False,
                    help='False | True')
parser.add_argument('-f')

args = parser.parse_args()

mvsa = args.mvsa
batch_size = args.bs
normalize = args.norm
init_lr = float(args.lr)
epochs = args.epochs
ftype = args.ftype
vtype = args.vtype
ttype = args.ttype
layer = args.layer
split = args.split
smooth = args.smooth
htag = args.ht

In [None]:
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from torch import nn
from torch.nn import functional as F
import torch

import numpy as np
import json
from sklearn import metrics, preprocessing
class MultiMLP_2Mod(nn.Module):
    """Two-branch MLP that fuses a visual and a textual feature vector.

    Each modality goes through Linear -> BatchNorm1d -> ReLU -> Dropout.
    The two hidden vectors are concatenated and classified by ``cf``; in
    addition a single shared head ``cf1`` is applied to each branch on its own.

    Args:
        vdim: size of the visual input vector.
        tdim: size of the textual input vector.
        hidden_dim: width of each branch's hidden layer (default 128).
        num_classes: number of output logits per head (default 3).
        dropout: dropout probability used in both branches (default 0.5).

    Attribute names are unchanged, so state dicts saved with the default
    sizes still load.
    """

    def __init__(self, vdim, tdim, hidden_dim=128, num_classes=3, dropout=0.5):
        super().__init__()

        self.vfc1 = nn.Linear(vdim, hidden_dim)
        self.tfc1 = nn.Linear(tdim, hidden_dim)
        self.vbn1 = nn.BatchNorm1d(hidden_dim)
        self.tbn1 = nn.BatchNorm1d(hidden_dim)
        # Fusion head: consumes the concatenation of both hidden vectors.
        self.cf = nn.Linear(2 * hidden_dim, num_classes)

        self.vdp1 = nn.Dropout(dropout)
        self.tdp1 = nn.Dropout(dropout)
        self.relu = nn.ReLU()

        # Per-branch head; the same weights are used for both modalities.
        self.cf1 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x1, x2):
        """Return ``(fused_logits, visual_logits, text_logits)``.

        Args:
            x1: visual features, one row per sample with ``vdim`` columns.
            x2: textual features, one row per sample with ``tdim`` columns.
        """
        x1 = self.vdp1(self.relu(self.vbn1(self.vfc1(x1))))
        x2 = self.tdp1(self.relu(self.tbn1(self.tfc1(x2))))

        x = torch.cat((x1, x2), dim=1)

        return self.cf(x), self.cf1(x1), self.cf1(x2)

In [None]:
class MultiDataset2(Dataset):
    """Dataset of pre-extracted (visual feature, text feature, label) triples.

    Args:
        vfeats: indexable collection of visual feature vectors.
        tfeats: indexable collection of text feature vectors, aligned with ``vfeats``.
        labels: sequence of integer class labels, aligned with the features.
        normalize: when truthy, each feature vector is passed through
            ``preprocessing.normalize`` before being returned.
    """

    def __init__(self, vfeats, tfeats, labels, normalize=1):
        self.vfeats = vfeats
        self.tfeats = tfeats
        # `np.int` was removed from NumPy (1.24); use an explicit 64-bit integer
        # so labels become int64 tensors on every platform.
        self.labels = np.array(labels).astype(np.int64)
        self.normalize = normalize

    def __len__(self):
        """Number of samples, taken from the label array."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(visual FloatTensor, text FloatTensor, label tensor)`` for ``idx``."""
        vfeat = self.vfeats[idx]
        tfeat = self.tfeats[idx]
        label = self.labels[idx]

        if self.normalize:
            # normalize() works on 2-D input, so treat the vector as one row
            # and flatten the result back to 1-D.
            vfeat = preprocessing.normalize(vfeat.reshape(1, -1), axis=1).flatten()
            tfeat = preprocessing.normalize(tfeat.reshape(1, -1), axis=1).flatten()

        return torch.FloatTensor(vfeat), torch.FloatTensor(tfeat), torch.tensor(label)



def get_visual_feats(mvsa, vtype, ftype, htag):
    """Load pre-extracted visual features from a JSON file.

    Args:
        mvsa: dataset variant, interpolated into the feature file name.
        vtype: 'places', 'emotion', 'imagenet' or 'clip'; any other value
            falls through to the faces feature file.
        ftype: key to read from the JSON object ('feats' or 'logits').
            Ignored for 'clip', which always reads 'feats'.
        htag: accepted for call compatibility; not used here.

    Returns:
        ``(features, vdim)`` where ``features`` is a NumPy array built from the
        stored list and ``vdim`` is the feature width this function associates
        with the chosen ``vtype``/``ftype``.
    """
    def _load(path, key):
        # Context manager so the file handle is closed as soon as parsing ends.
        with open(path, 'r') as fh:
            return json.load(fh)[key]

    if vtype == 'places':
        feats_img = _load('features/places_%s.json' % (mvsa), ftype)
        vdim = 2048 if ftype == 'feats' else 365
    elif vtype == 'emotion':
        feats_img = _load('features/emotion_%s.json' % (mvsa), ftype)
        vdim = 2048 if ftype == 'feats' else 8
    elif vtype == 'imagenet':
        feats_img = _load('features/imagenet_%s.json' % (mvsa), ftype)
        vdim = 2048 if ftype == 'feats' else 1000
    elif vtype == 'clip':
        # NOTE(review): this branch reads the file named imagenet.json but
        # reports vdim = 512 -- confirm the stored vectors really are 512-wide.
        feats_img = _load('/content/imagenet.json', 'feats')
        vdim = 512
    else:
        feats_img = _load('features/faces_%s.json' % (mvsa), ftype)
        vdim = 512 if ftype == 'feats' else 7

    return np.array(feats_img), vdim

In [None]:
from torch.utils.data import DataLoader, Dataset, SequentialSampler
import torch
from torchvision import transforms

import os, re
import numpy as np
import pandas as pd
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


class MMDataset(Dataset):
    """Image/text pair dataset read from a folder on disk.

    The folder ``dloc`` is expected to contain ``valid_pairlist.txt`` (read as
    a header-less CSV whose first column is the pair's file stem), an
    ``images/`` folder with ``<stem>.jpg`` files and a ``texts/`` folder with
    ``<stem>.txt`` files.

    Args:
        dloc: dataset root folder.
        img_transform: optional callable applied to the RGB PIL image; when
            omitted the image is converted with ``transforms.ToTensor()``.
        txt_transform: optional callable ``(text, txt_processor) -> text``.
        txt_processor: object handed to ``txt_transform`` as its second argument.
    """

    def __init__(self, dloc, img_transform=None, txt_transform=None, txt_processor=None):
        self.file_names = pd.read_csv(os.path.join(dloc, 'valid_pairlist.txt'), header=None)
        self.dloc = dloc
        self.img_transform = img_transform
        self.txt_transform = txt_transform
        self.txt_processor = txt_processor

    def __len__(self):
        """Number of rows in the pair list."""
        return len(self.file_names)

    def __getitem__(self, idx):
        """Return ``(image, text)`` for row ``idx`` of the pair list."""
        fname = str(self.file_names.iloc[idx, 0])

        # Close both files deterministically; convert() yields an image that
        # no longer depends on the open file.
        with Image.open(os.path.join(self.dloc, 'images', fname + '.jpg')) as raw_img:
            img = raw_img.convert('RGB')
        with open(os.path.join(self.dloc, 'texts', fname + '.txt'), 'r',
                  encoding='utf-8', errors='ignore') as fh:
            text = fh.read().strip().lower()

        if self.img_transform:
            img = self.img_transform(img)
        else:
            img = transforms.ToTensor()(img)

        if self.txt_transform:
            text = self.txt_transform(text, self.txt_processor)

        return img, text


def get_text_processor(word_stats='twitter', htag=True):
    """Build the ekphrasis ``TextPreProcessor`` used to clean tweets.

    Args:
        word_stats: corpus name whose word statistics drive both word
            segmentation and spell correction.
        htag: whether hashtags are split into words (``unpack_hashtags``).

    Returns:
        A configured ``TextPreProcessor`` instance.
    """
    options = dict(
        # Terms replaced by a normalised token. 'number', 'money', 'time',
        # 'date' and 'percent' are deliberately left out of this list.
        normalize=['url', 'email', 'phone', 'user'],
        # Terms that get annotated.
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens
        segmenter=word_stats,  # word statistics for word segmentation
        corrector=word_stats,  # word statistics for spell correction
        unpack_hashtags=htag,  # perform word segmentation on hashtags
        unpack_contractions=True,  # can't -> can not
        spell_correct_elong=True,  # spell correction for elongated words
        # Tokenizer: takes a string, returns a list of tokens.
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # Dictionaries used to replace extracted tokens with other expressions.
        dicts=[emoticons],
    )
    return TextPreProcessor(**options)



def process_tweet(tweet, text_processor):
    """Clean a tweet and return it as a single space-joined string.

    The tweet is tokenised with ``text_processor.pre_process_doc``. A token is
    discarded when it contains any character other than lowercase letters,
    digits, '.', ',' or whitespace; surviving tokens are stripped and then
    dropped if they equal 'rt', 'http', 'https' or 'htt'.
    """
    disallowed = re.compile(r"[^a-z0-9.,\s]+")
    dropped = ('rt', 'http', 'https', 'htt')

    kept = []
    for token in text_processor.pre_process_doc(tweet):
        if disallowed.search(token):
            continue
        token = token.strip()
        if token in dropped:
            continue
        kept.append(token)

    return " ".join(kept)




def get_bert_embeddings(tweet, model, tokenizer, device):
    """Compute four sentence embeddings for ``tweet`` from a BERT-style model.

    Args:
        tweet: text handed unchanged to ``tokenizer.encode``.
            NOTE(review): callers in this notebook pass ``batch[1]`` from a
            DataLoader, which is presumably a one-element list rather than a
            str -- confirm the tokenizer treats that as intended.
        model: callable invoked as ``model(input_ids, return_dict=False)``. It
            must return ``(last_hidden, hidden_states)`` or
            ``(last_hidden, pooled, hidden_states)``.
        tokenizer: object with ``encode(text, add_special_tokens=True)``.
        device: torch device the input ids are moved to.

    Returns:
        Tuple of NumPy vectors
        ``(sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last)``:
        token-average of the concatenated last four layers, token-average of
        the summed last four layers, token-average of the second-to-last layer
        and token-average of the last hidden state.

    Raises:
        ValueError: if the model returns neither 2 nor 3 outputs.
    """
    input_ids = torch.tensor([tokenizer.encode(tweet, add_special_tokens=True)]).to(device)

    # One forward pass; the output arity is inspected instead of retrying the
    # call inside a bare `except`, which hid real errors and ran the model twice.
    with torch.no_grad():
        outputs = model(input_ids, return_dict=False)

    if len(outputs) == 3:
        last_out, _pooled_out, encoded_layers = outputs
    elif len(outputs) == 2:
        last_out, encoded_layers = outputs
    else:
        raise ValueError(
            'expected 2 or 3 model outputs (last hidden state, [pooled output], '
            'hidden states), got %d' % len(outputs))

    # Mean over the tokens of the last hidden state.
    sent_emb_last = torch.mean(last_out[0], dim=0).cpu().numpy()

    # Stack the per-layer outputs, drop the batch dimension (dim 1) and move
    # tokens first: [layers, 1, tokens, hidden] -> [tokens, layers, hidden].
    token_embeddings = torch.stack(encoded_layers, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1, 0, 2)

    # Per token: concatenate the last four layers (last layer first), then
    # average over tokens.
    cat_vecs = torch.cat([token_embeddings[:, -k] for k in (1, 2, 3, 4)], dim=1)
    sent_word_catavg = np.mean(cat_vecs.cpu().numpy(), axis=0)

    # Per token: sum the last four layers, then average over tokens.
    sum_vecs = torch.sum(token_embeddings[:, -4:], dim=1)
    sent_word_sumavg = np.mean(sum_vecs.cpu().numpy(), axis=0)

    # Mean over the tokens of the second-to-last layer.
    token_vecs = encoded_layers[-2][0]
    sent_emb_2_last = torch.mean(token_vecs, dim=0).cpu().numpy()

    return sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last

In [None]:
from torchvision import models
import torch

import pickle
import numpy as np
import json



import argparse

parser = argparse.ArgumentParser(description='Extract Image and CLIP Features')
parser.add_argument('--vtype', type=str, default='imagenet',
                    help='imagenet | places | emotion | clip')
parser.add_argument('--mvsa', type=str, default='single',
                    help='single | multiple')
parser.add_argument('--ht', type=bool, default=True,
                    help='True | False')
parser.add_argument('-f')

args = parser.parse_args()

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Image pipeline: resize to 256, centre-crop to 224, convert to a tensor and
# normalise each channel (the constants look like the usual ImageNet
# statistics -- confirm if that matters).
img_transforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Text pipeline: process_tweet is called with the ekphrasis processor built here.
txt_transform = process_tweet
txt_processor = get_text_processor(htag=args.ht)

def get_resnet_feats(vtype=None, data_dir=None):
    """Extract ResNet pooled features and logits for every image in the dataset.

    Args:
        vtype: 'imagenet', 'places' or 'emotion'. When omitted it is read from
            the module-level ``args``; if ``args`` has no ``vtype`` attribute
            (a later argparse cell in this notebook rebinds ``args`` without
            that option) it falls back to 'imagenet'.
        data_dir: dataset root passed to ``MMDataset``. Defaults to the
            module-level ``dloc``.

    Returns:
        ``(feats, logits)``: two lists with one entry per image, holding the
        flattened output of the model's ``avgpool`` module and the model's
        final output respectively.

    Raises:
        ValueError: if ``vtype`` is not one of the supported values.
    """
    if vtype is None:
        vtype = getattr(args, 'vtype', 'imagenet')
    if data_dir is None:
        data_dir = dloc

    feats, logits = [], []

    def feature_hook(module, input, output):
        # Collect the avgpool activations. Must return None: a forward hook's
        # non-None return value would replace the module output.
        feats.extend(output.view(-1, output.shape[1]).data.cpu().numpy().tolist())

    if vtype == 'imagenet':
        print('imgnet')
        model = models.__dict__['resnet50'](pretrained=True)
    elif vtype == 'places':
        print('places')
        model_file = 'pre_trained/resnet101_places_best.pth.tar'
        model = models.__dict__['resnet101'](pretrained=False, num_classes=365)
        checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
        # Keys carry a 'module.' prefix (as written by a DataParallel wrapper); strip it.
        state_dict = {str.replace(k, 'module.', ''): v for k, v in checkpoint['state_dict'].items()}
        model.load_state_dict(state_dict)
    elif vtype == 'emotion':
        print('emotion')
        model_file = 'pre_trained/best_emo_resnet50.pt'
        model = models.__dict__['resnet50'](pretrained=False, num_classes=8)
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        model.load_state_dict(torch.load(model_file, map_location=device))
    else:
        # Previously this fell through and failed later with an unbound `model`.
        raise ValueError(
            "unsupported vtype %r; expected 'imagenet', 'places' or 'emotion'" % (vtype,))

    model.eval().to(device)

    hook_handle = model._modules.get('avgpool').register_forward_hook(feature_hook)

    dataset = MMDataset(data_dir, img_transforms, txt_transform, txt_processor)
    dt_loader = DataLoader(dataset, batch_size=128, sampler=SequentialSampler(dataset))

    try:
        for i, batch in enumerate(dt_loader):
            print(i)

            img_inputs = batch[0].to(device)

            with torch.no_grad():
                outputs = model(img_inputs)

            logits.extend(outputs.view(-1, outputs.shape[1]).data.cpu().numpy().tolist())
    finally:
        # Detach the hook so the closure over `feats` is released.
        hook_handle.remove()

    return feats, logits



def get_clip_feats():
    """Encode every image/text pair of the dataset with CLIP 'ViT-B/32'.

    Reads the module-level ``dloc``, ``device``, ``txt_transform`` and
    ``txt_processor``.

    NOTE(review): ``clip`` is not imported anywhere in the visible part of
    this notebook -- confirm it is imported before this function is called.

    Returns:
        ``(img_feats, txt_feats)``: two lists with one feature list per pair.
    """
    clip_model, img_preprocess = clip.load('ViT-B/32', device=device)
    clip_model.eval()

    pairs = MMDataset(
        dloc,
        img_transform=img_preprocess,
        txt_transform=txt_transform,
        txt_processor=txt_processor,
    )
    loader = DataLoader(pairs, batch_size=128, sampler=SequentialSampler(pairs))

    img_feats = []
    txt_feats = []
    for step, batch in enumerate(loader):
        print(step)
        images = batch[0].to(device)
        tokens = clip.tokenize(batch[1]).to(device)

        with torch.no_grad():
            encoded_images = clip_model.encode_image(images)
            encoded_texts = clip_model.encode_text(tokens)

            img_feats.extend(encoded_images.cpu().numpy().tolist())
            txt_feats.extend(encoded_texts.cpu().numpy().tolist())

    return img_feats, txt_feats

In [None]:
!pip install transformers

In [None]:
# Dataset root folder. Code in this notebook reads valid_pairlist.txt,
# images/<name>.jpg and texts/<name>.txt beneath it.
dloc='/content/drive/MyDrive/data/mvsa_single'

In [None]:
import json
import string

from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizer


import argparse

parser = argparse.ArgumentParser(description='Extract BERT Features')
parser.add_argument('--btype', type=str, default='robertabase',
                    help='bertbase | robertabase')
parser.add_argument('--mvsa', type=str, default='single',
                    help='single | multiple')
parser.add_argument('--ht', type=bool, default=True,
                    help='True | False')
parser.add_argument('-f')

args = parser.parse_args()

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Text pipeline: process_tweet is called with the ekphrasis processor built here.
txt_transform = process_tweet
txt_processor = get_text_processor(htag=args.ht)

# (model class, tokenizer class, checkpoint name) for the requested --btype;
# an unknown btype raises KeyError.
bert_type = {
    'robertabase': (RobertaModel, RobertaTokenizer, 'roberta-base'),
    'bertbase': (BertModel, BertTokenizer, 'bert-base-uncased'),
}[args.btype]

# output_hidden_states=True makes the model return every layer's hidden
# states, which get_bert_embeddings consumes.
model = bert_type[0].from_pretrained(bert_type[2], output_hidden_states=True)
tokenizer = bert_type[1].from_pretrained(bert_type[2])
model.to(device).eval()

# One list of sentence vectors per pooling strategy.
embed_dict = {key: [] for key in ('catavg', 'sumavg', '2last', 'last')}

# No img_transform is given, so images fall back to the dataset's default
# conversion; only the text element of each pair is used downstream.
ph_data = MMDataset(dloc, txt_transform=txt_transform, txt_processor=txt_processor)
ph_loader = DataLoader(ph_data, batch_size=1, sampler=SequentialSampler(ph_data))

In [None]:
# Embed every text in the dataset, one sample per batch.
for i, batch in enumerate(ph_loader):

    # NOTE(review): with the default DataLoader collation batch[1] is
    # presumably a one-element list of str, not a bare str -- confirm that
    # tokenizer.encode handles it as intended.
    txt_inps = batch[1]

    sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last \
        = get_bert_embeddings(txt_inps, model, tokenizer, device)

    # 'catavg' is intentionally left empty to keep the JSON file small.
    # embed_dict['catavg'].append(sent_word_catavg.tolist())
    embed_dict['sumavg'].append(sent_word_sumavg.tolist())
    embed_dict['2last'].append(sent_emb_2_last.tolist())
    embed_dict['last'].append(sent_emb_last.tolist())

# Context manager guarantees the file is flushed and closed before later
# cells read it back.
with open('roberta.json', 'w') as fh:
    json.dump(embed_dict, fh)

In [None]:
# Extract ResNet features/logits for every image and persist them as JSON.
feats, logits = get_resnet_feats()
print(np.array(feats).shape, np.array(logits).shape)
# Context manager guarantees the file is flushed and closed before later
# cells read it back.
with open('imagenet.json', 'w') as fh:
    json.dump({'feats': feats, 'logits': logits}, fh)


In [None]:
# Input widths the checkpoint is loaded with: 2048 visual, 768 textual.
vdim=2048
tdim=768
model_vi = MultiMLP_2Mod(vdim, tdim)

model_vi.to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only host.
model_vi.load_state_dict(torch.load("/content/mmm.pt", map_location=device))

In [None]:
# Row indices of the pair list to evaluate; only the first pair here.
te_ids = [0]

# The pair list is a header-less CSV; column 1 is used as the label column.
pair_df = pd.read_csv(dloc + '/valid_pairlist.txt', header=None)
all_labels = pair_df[1].to_numpy().flatten()
lab_test = all_labels[te_ids]

In [None]:
# Load the stored text embeddings and keep the pooling variant named by `layer`.
with open('/content/roberta.json', 'r') as fh:
    feats_text = json.load(fh)
feats_text = feats_text[layer]
# 'catavg' vectors concatenate four layers (4 x 768); the others are 768 wide.
tdim = 3072 if 'catavg' in layer else 768
feats_text = np.array(feats_text)
ft_te_txt = feats_text[te_ids]

In [None]:
# Load the stored visual features (this also rebinds the module-level `vdim`)
# and keep only the rows selected by te_ids.
feats_img, vdim = get_visual_feats(mvsa=mvsa, vtype=vtype, ftype=ftype, htag=htag)
ft_te_img = feats_img[te_ids]

In [None]:
# Wrap the selected features and labels as a dataset of (visual, text, label) triples.
te_data = MultiDataset2(vfeats=ft_te_img, tfeats=ft_te_txt, labels=lab_test, normalize=normalize)

In [None]:
# One sample per batch, loaded by two worker processes.
te_loader = DataLoader(te_data, batch_size=1, num_workers=2)

In [None]:
# Evaluate the fused model and print per-sample predictions.
model_vi.eval()
count=0   # number of batches seen
true=0    # samples where the fused prediction matches the label
false=0   # samples where it does not
# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
  for inputs1, inputs2, labels in te_loader:
    inputs1, inputs2, labels = inputs1.to(device), inputs2.to(device), labels.to(device)

    outputs,outputs1,outputs2 = model_vi(inputs1, inputs2)

    preds = torch.argmax(outputs, 1)
    preds1=torch.argmax(outputs1,1)
    preds2=torch.argmax(outputs2,1)

    print("ground truth:  ",labels.cpu().numpy())

    #print(te_ids[count])
    print("MM output:     ",preds.cpu().numpy())
    print("Image Feature: ",preds1.cpu().numpy())
    print("Text Feature:  ",preds2.cpu().numpy())

    # Count element-wise so this also works when a batch holds more than one
    # sample (`if labels==preds` is ambiguous for multi-element tensors).
    n_correct = int((preds == labels).sum().item())
    true = true + n_correct
    false = false + (labels.numel() - n_correct)

    print(" ")
    count=count+1

In [None]:
def get_extractions(dloc):
  """Extract text and visual features for the dataset and write them as JSON.

  Text embeddings go to 'roberta1.json' (keys 'catavg', 'sumavg', '2last',
  'last'; 'catavg' stays empty) and ResNet features/logits go to
  'imagenet.json', both relative to the current working directory.

  Args:
      dloc: dataset root used for the text extraction. Note that
          get_resnet_feats() is called without arguments and therefore uses
          the module-level data location, not this parameter.

  Reads the module-level ``args`` (``btype``, ``ht``) and ``device``.
  """
  bert_type = {'bertbase': (BertModel,    BertTokenizer, 'bert-base-uncased'),
            'robertabase': (RobertaModel,    RobertaTokenizer, 'roberta-base')}[args.btype]

  tokenizer = bert_type[1].from_pretrained(bert_type[2])
  model = bert_type[0].from_pretrained(bert_type[2], output_hidden_states=True)
  model.to(device).eval()


  embed_dict = {'catavg':[], 'sumavg': [], '2last': [], 'last': []}
  txt_processor = get_text_processor(htag=args.ht)
  txt_transform = process_tweet
  ph_data = MMDataset(dloc, txt_transform=txt_transform, txt_processor=txt_processor)
  ph_loader = DataLoader(ph_data, batch_size=1, sampler=SequentialSampler(ph_data))
  for i, batch in enumerate(ph_loader):
    print(i)
    # NOTE(review): batch[1] is presumably a one-element list of str rather
    # than a bare str -- confirm tokenizer.encode handles it as intended.
    txt_inps = batch[1]

    sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last \
        = get_bert_embeddings(txt_inps, model, tokenizer, device)

    # embed_dict['catavg'].append(sent_word_catavg.tolist())
    embed_dict['sumavg'].append(sent_word_sumavg.tolist())
    embed_dict['2last'].append(sent_emb_2_last.tolist())
    embed_dict['last'].append(sent_emb_last.tolist())

  # Write once, after all samples are embedded. These steps used to sit inside
  # the loop, re-dumping the JSON and re-running the whole visual extraction
  # for every single sample.
  with open('roberta1.json', 'w') as fh:
    json.dump(embed_dict, fh)

  #visual extract
  feats, logits = get_resnet_feats()
  print(np.array(feats).shape, np.array(logits).shape)
  with open('imagenet.json', 'w') as fh:
    json.dump({'feats': feats, 'logits': logits}, fh)


In [None]:
# Run the text and visual feature extraction; writes roberta1.json and imagenet.json.
get_extractions(dloc)

In [None]:
def get_loaders(dloc):
  """Build the test DataLoader from the stored feature files.

  Labels come from column 1 of ``<dloc>/valid_pairlist.txt``, text features
  from '/content/roberta1.json' (variant chosen by the module-level ``layer``)
  and visual features from get_visual_feats(). Only the first pair
  (``te_ids = [0]``) is selected.

  Args:
      dloc: dataset root folder containing valid_pairlist.txt.

  Returns:
      A DataLoader over a MultiDataset2 with batch_size=1 and two workers.
  """
  pair_df = pd.read_csv(dloc+'/valid_pairlist.txt', header=None)
  all_labels = pair_df[1].to_numpy().flatten()
  te_ids=[0]
  lab_test = all_labels[te_ids]

  # Context manager so the feature file is closed once it has been parsed.
  with open('/content/roberta1.json','r') as fh:
    feats_text = json.load(fh)
  feats_text = np.array(feats_text[layer])
  ft_te_txt = feats_text[te_ids]

  # The reported feature width is not needed here.
  feats_img, _ = get_visual_feats(mvsa, vtype, ftype, htag)
  ft_te_img = feats_img[te_ids]

  te_data = MultiDataset2(ft_te_img, ft_te_txt, lab_test, normalize)
  te_loader = DataLoader(dataset=te_data, batch_size=1, num_workers=2)

  return te_loader

In [None]:
# Dataset root folder; valid_pairlist.txt is read from directly beneath it.
dloc='/content/drive/MyDrive/data/mvsa_single'  #folder path for data

In [None]:
# Build the single-sample test loader from the stored feature files.
te_loader=get_loaders(dloc)

In [None]:
# Checkpoint whose state dict is loaded into MultiMLP_2Mod by get_model().
model_path="/content/mmm.pt" #Pretrained model file

In [None]:
def get_model(model_path):
  """Create a MultiMLP_2Mod (2048-d visual, 768-d text) and load its weights.

  Args:
      model_path: path to a file containing the model's state dict.

  Returns:
      The model, moved to the module-level ``device`` with weights loaded.
      The model is not switched to eval mode here.
  """
  vdim=2048
  tdim=768
  model_vi = MultiMLP_2Mod(vdim, tdim)

  model_vi.to(device)
  # map_location lets a checkpoint saved on GPU load on a CPU-only host.
  model_vi.load_state_dict(torch.load(model_path, map_location=device))

  return model_vi


In [None]:
# Load the pretrained fusion model; get_results() reads this module-level name.
model_vi=get_model(model_path)

In [None]:
def get_results(te_loader):
  """Run the fused model over ``te_loader`` and print predictions per batch.

  For each batch this prints fields 2 and 3 of the first line of
  ``<dloc>/valid_pairlist.txt`` (labelled as text / image ground truth), the
  batch labels and the argmax of the fused, image-only and text-only heads.

  Args:
      te_loader: iterable yielding ``(visual, text, labels)`` tensor batches.

  Reads the module-level ``model_vi``, ``device`` and ``dloc``.
  """
  model_vi.eval()

  file_path=dloc+"/valid_pairlist.txt"
  num_array = None  # parsed lazily so an empty loader never touches the file

  # Inference only: no_grad avoids building the autograd graph.
  with torch.no_grad():
    for inputs1, inputs2, labels in te_loader:
      inputs1, inputs2, labels = inputs1.to(device), inputs2.to(device), labels.to(device)

      outputs,outputs1,outputs2 = model_vi(inputs1, inputs2)

      preds = torch.argmax(outputs, 1)
      preds1=torch.argmax(outputs1,1)
      preds2=torch.argmax(outputs2,1)

      if num_array is None:
        # Read the pair list once, not on every batch, and parse only its
        # first line so a multi-line file does not break the int conversion.
        with open(file_path, 'r') as file:
          data = file.readline().strip().split(',')
        num_array = np.array(data, dtype=int)

      print("ground truth text feature  : ",num_array[2])
      print("ground truth image feature : ",num_array[3])
      print("ground truth:  ",labels.cpu().numpy())
      print("MM output:     ",preds.cpu().numpy())
      print("Image Feature: ",preds1.cpu().numpy())
      print("Text Feature:  ",preds2.cpu().numpy())



In [None]:
# Print ground truth and the three head predictions for every test batch.
get_results(te_loader)