In [None]:
!pip install ekphrasis
!pip install transformers

In [None]:
from torch.utils.data import DataLoader, Dataset, SequentialSampler
import torch
from torchvision import transforms

import os, re
import numpy as np
import pandas as pd
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons


class MMDataset(Dataset):
    """Paired image/text dataset backed by a directory on disk.

    ``dloc`` must contain ``valid_pairlist.txt`` (a header-less CSV whose
    first column is the sample id), an ``images`` folder holding
    ``<id>.jpg`` files and a ``texts`` folder holding ``<id>.txt`` files.

    Args:
        dloc: Root directory of the dataset.
        img_transform: Optional callable applied to the RGB PIL image. When
            omitted the image is converted with ``transforms.ToTensor()``.
        txt_transform: Optional callable invoked as
            ``txt_transform(text, txt_processor)``.
        txt_processor: Object forwarded as second argument to ``txt_transform``.
    """

    def __init__(self, dloc, img_transform=None, txt_transform=None, txt_processor=None):
        self.file_names = pd.read_csv(os.path.join(dloc, 'valid_pairlist.txt'), header=None)
        self.dloc = dloc
        self.img_transform = img_transform
        self.txt_transform = txt_transform
        self.txt_processor = txt_processor

    def __len__(self):
        """Number of rows in the pair list."""
        return len(self.file_names)

    def __getitem__(self, idx):
        """Return ``(image, text)`` for the ``idx``-th row of the pair list."""
        fname = str(self.file_names.iloc[idx, 0])

        # Context managers release the underlying file handles right away
        # instead of leaving them to the garbage collector.
        with Image.open(os.path.join(self.dloc, 'images', fname + '.jpg')) as raw_img:
            img = raw_img.convert('RGB')

        txt_path = os.path.join(self.dloc, 'texts', fname + '.txt')
        with open(txt_path, 'r', encoding='utf-8', errors='ignore') as fh:
            text = fh.read().strip().lower()

        if self.img_transform:
            img = self.img_transform(img)
        else:
            img = transforms.ToTensor()(img)

        if self.txt_transform:
            text = self.txt_transform(text, self.txt_processor)

        return img, text


def get_text_processor(word_stats='twitter', htag=True):
    """Create the ekphrasis ``TextPreProcessor`` used for tweet clean-up.

    Args:
        word_stats: Corpus name whose word statistics drive both word
            segmentation and spell correction.
        htag: Whether hashtags are segmented into words.

    Returns:
        A configured ``TextPreProcessor`` instance.
    """
    # Terms that get normalised ('number', 'money', 'time', 'date' and
    # 'percent' were deliberately removed from this list).
    normalized_terms = ['url', 'email', 'phone', 'user']

    # Terms that get annotated.
    annotated_terms = {"hashtag", "allcaps", "elongated", "repeated",
                       'emphasis', 'censored'}

    # Any callable mapping a string to a list of tokens can be used here.
    social_tokenize = SocialTokenizer(lowercase=True).tokenize

    options = dict(
        normalize=normalized_terms,
        annotate=annotated_terms,
        fix_html=True,                # fix HTML tokens
        segmenter=word_stats,         # statistics used for word segmentation
        corrector=word_stats,         # statistics used for spell correction
        unpack_hashtags=htag,         # perform word segmentation on hashtags
        unpack_contractions=True,     # can't -> can not
        spell_correct_elong=True,     # spell correction for elongated words
        tokenizer=social_tokenize,
        # Dictionaries used to replace extracted tokens with other
        # expressions; more than one may be supplied.
        dicts=[emoticons],
    )
    return TextPreProcessor(**options)



def process_tweet(tweet, text_processor):
    """Clean a tweet and return it as a single space-joined string.

    The tweet is tokenised with ``text_processor.pre_process_doc``. Tokens
    containing any character other than lowercase letters, digits, ``.``,
    ``,`` or whitespace are discarded, the survivors are stripped, and the
    leftovers ``rt``/``http``/``https``/``htt`` are removed.
    """
    unwanted = ['rt', 'http', 'https', 'htt']
    kept = []

    for token in text_processor.pre_process_doc(tweet):
        if re.search(r"[^a-z0-9.,\s]+", token):
            continue

        token = token.strip()
        if token in unwanted:
            continue

        kept.append(token)

    return " ".join(kept)




def get_bert_embeddings(tweet, model, tokenizer, device):
    """Compute four sentence embeddings for one tweet.

    Args:
        tweet: The text to embed. A one-element list/tuple holding a string
            (what a ``DataLoader`` with ``batch_size=1`` yields for a text
            field) is unwrapped first, so the tokenizer always receives the
            raw string rather than a list it could mistake for ready-made
            tokens.
        model: Callable invoked as ``model(input_ids, return_dict=False)``.
            It must return ``(last_hidden, pooled, hidden_states)`` or
            ``(last_hidden, hidden_states)``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``.
        device: Torch device the input ids are moved to.

    Returns:
        Tuple of NumPy arrays
        ``(sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last)``:
        the token-average of the concatenated last four layers (ordered last,
        second-to-last, ...), the token-average of the summed last four
        layers, the token-average of the second-to-last layer, and the
        token-average of the final hidden state.

    Raises:
        ValueError: If the model returns neither two nor three outputs.
    """
    if isinstance(tweet, (list, tuple)) and len(tweet) == 1 and isinstance(tweet[0], str):
        tweet = tweet[0]

    # Split the sentence into token ids (batch of one).
    input_ids = torch.tensor([tokenizer.encode(tweet, add_special_tokens=True)]).to(device)

    # Single forward pass; the output arity tells us whether a pooled output
    # is present, so no exception-driven second pass is needed.
    with torch.no_grad():
        outputs = model(input_ids, return_dict=False)

    if len(outputs) == 3:
        last_out, _pooled_out, encoded_layers = outputs
    elif len(outputs) == 2:
        last_out, encoded_layers = outputs
    else:
        raise ValueError('expected 2 or 3 model outputs, got %d' % len(outputs))

    # Average of all token vectors of the final hidden state.
    sent_emb_last = torch.mean(last_out[0], dim=0).cpu().numpy()

    # [layers, 1, tokens, hidden] -> [layers, tokens, hidden] -> [tokens, layers, hidden]
    token_embeddings = torch.stack(encoded_layers, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1).permute(1, 0, 2)

    # Last four layers for every token: [tokens, 4, hidden].
    last_four = token_embeddings[:, -4:, :]

    # Concatenate per token in the order (-1, -2, -3, -4), then average over tokens.
    cat_vecs = torch.flip(last_four, dims=[1]).reshape(last_four.shape[0], -1)
    sent_word_catavg = cat_vecs.mean(dim=0).cpu().numpy()

    # Sum the last four layers per token, then average over tokens.
    sent_word_sumavg = last_four.sum(dim=1).mean(dim=0).cpu().numpy()

    # Average of the token vectors of the second-to-last layer.
    sent_emb_2_last = torch.mean(encoded_layers[-2][0], dim=0).cpu().numpy()

    return sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last

In [None]:
import argparse
import json
import string

from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizer

import argparse
parser = argparse.ArgumentParser(description='Train Multimodal MLP Models for Sentiment')
parser.add_argument('--vtype', type=str, default='imagenet',
                    help='imagenet | places | emotion | clip')
parser.add_argument('--btype', type=str, default='robertabase',
                    help='bertbase | robertabase')
parser.add_argument('--ttype', type=str, default='clip',
                    help='bertbase | robertabase | clip')
parser.add_argument('--mvsa', type=str, default='single',
                    help='single | multiple')
parser.add_argument('--ht', type=bool, default=True,
                    help='True | False')
parser.add_argument('--bs', type=int, default=32,
                    help='32, 64, 128')
parser.add_argument('--epochs', type=int, default=100,
                    help='50, 75, 100')
parser.add_argument('--lr', type=str, default='2e-5',
                    help='1e-4, 5e-5, 2e-5')
parser.add_argument('--ftype', type=str, default='feats',
                    help='feats | logits')
parser.add_argument('--layer', type=str, default='sumavg',
                    help='sumavg, 2last, last')
parser.add_argument('--norm', type=int, default=1,
                    help='0 | 1')
parser.add_argument('--split', type=int, default=1,
                    help='1-10')
parser.add_argument('--smooth', type=bool, default=False,
                    help='False | True')
parser.add_argument('-f')

args = parser.parse_args()

mvsa = args.mvsa
batch_size = args.bs
normalize = args.norm
init_lr = float(args.lr)
epochs = args.epochs
ftype = args.ftype
btype = args.btype
vtype = args.vtype
ttype = args.ttype
layer = args.layer
split = args.split
smooth = args.smooth
htag = args.ht

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device("cpu")


# Text clean-up: `txt_transform(text, txt_processor)` is applied per sample.
txt_transform = process_tweet
txt_processor = get_text_processor(htag=args.ht)

In [None]:
# Root folder of the dataset (pair list, images/ and texts/).
dloc = '/content/drive/MyDrive/mvsa_single'

# (model class, tokenizer class, checkpoint name) for each supported --btype.
_bert_variants = {
    'bertbase': (BertModel, BertTokenizer, 'bert-base-uncased'),
    'robertabase': (RobertaModel, RobertaTokenizer, 'roberta-base'),
}
bert_type = _bert_variants[args.btype]
_model_cls, _tokenizer_cls, _checkpoint = bert_type

tokenizer = _tokenizer_cls.from_pretrained(_checkpoint)
# output_hidden_states=True makes the model return every layer's hidden state.
model = _model_cls.from_pretrained(_checkpoint, output_hidden_states=True)
model.to(device).eval()


# One list of sentence vectors per pooling strategy.
embed_dict = {'catavg': [], 'sumavg': [], '2last': [], 'last': []}

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Text-embedding pass: one sample per batch, visited in pair-list order.
ph_data = MMDataset(
    dloc,
    txt_transform=txt_transform,
    txt_processor=txt_processor,
)
ph_loader = DataLoader(
    ph_data,
    batch_size=1,
    sampler=SequentialSampler(ph_data),
)

In [None]:
for i, batch in enumerate(ph_loader):
    print(i)
    # `batch[1]` is the collated text field; with batch_size=1 it is a
    # one-element list, so take the string itself. Handing the list to the
    # tokenizer would make it treat the whole tweet as a single token.
    txt_inps = batch[1][0]

    sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last \
        = get_bert_embeddings(txt_inps, model, tokenizer, device)

    # 'catavg' is intentionally not stored, so embed_dict['catavg'] stays empty.
    # embed_dict['catavg'].append(sent_word_catavg.tolist())
    embed_dict['sumavg'].append(sent_word_sumavg.tolist())
    embed_dict['2last'].append(sent_emb_2_last.tolist())
    embed_dict['last'].append(sent_emb_last.tolist())

# Close the file deterministically so the JSON is fully flushed to disk.
with open('roberta1.json', 'w') as fh:
    json.dump(embed_dict, fh)

In [None]:
!pip install openai-clip

Collecting openai-clip
  Downloading openai-clip-1.0.1.tar.gz (1.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.4/1.4 MB[0m [31m20.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: openai-clip
  Building wheel for openai-clip (setup.py) ... [?25l[?25hdone
  Created wheel for openai-clip: filename=openai_clip-1.0.1-py3-none-any.whl size=1368606 sha256=5fc1b25e80d7b11e255dfaa8340cb76427ccba82175264cd677c14e48c9dc476
  Stored in directory: /root/.cache/pip/wheels/08/77/8e/8d2f862df6bf7fb4e2007062d2cbaeae49862ec

In [None]:
from torchvision import models
import torch

import pickle
import numpy as np
import json

import clip

# Image pipeline for the ResNet extractors: resize, centre crop to 224,
# convert to a tensor, then normalise each channel.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

_img_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
img_transforms = transforms.Compose(_img_steps)

# Text clean-up: `txt_transform(text, txt_processor)` is applied per sample.
txt_transform = process_tweet
txt_processor = get_text_processor(htag=args.ht)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [None]:
def get_clip_feats():
    """Encode every image/text pair of the dataset with CLIP ``ViT-B/32``.

    Reads the module-level ``device``, ``dloc``, ``txt_transform`` and
    ``txt_processor``. Prints the batch index as progress.

    Returns:
        ``(img_feats, txt_feats)``: two lists of per-sample feature lists, in
        dataset order.
    """
    clip_model, clip_preprocess = clip.load('ViT-B/32', device=device)
    clip_model.eval()

    pairs = MMDataset(
        dloc,
        img_transform=clip_preprocess,
        txt_transform=txt_transform,
        txt_processor=txt_processor,
    )
    loader = DataLoader(pairs, batch_size=128, sampler=SequentialSampler(pairs))

    img_feats = []
    txt_feats = []
    step = 0
    for batch in loader:
        print(step)
        step += 1

        images = batch[0].to(device)
        tokens = clip.tokenize(batch[1]).to(device)

        with torch.no_grad():
            image_features = clip_model.encode_image(images)
            text_features = clip_model.encode_text(tokens)

        img_feats.extend(image_features.cpu().numpy().tolist())
        txt_feats.extend(text_features.cpu().numpy().tolist())

    return img_feats, txt_feats

def get_resnet_feats():
    """Extract pooled features and logits from a ResNet for every image.

    The backbone is chosen by the module-level ``args.vtype``:
    ``'imagenet'`` (pretrained ResNet-50), ``'places'`` (ResNet-101 with 365
    classes, loaded from ``pre_trained/resnet101_places_best.pth.tar``) or
    ``'emotion'`` (ResNet-50 with 8 classes, loaded from
    ``pre_trained/best_emo_resnet50.pt``). Also reads the module-level
    ``device``, ``dloc``, ``img_transforms``, ``txt_transform`` and
    ``txt_processor``. Prints the batch index as progress.

    Returns:
        ``(feats, logits)``: per-sample lists of the flattened ``avgpool``
        output and of the model output, both in dataset order.

    Raises:
        ValueError: If ``args.vtype`` is not one of the supported backbones.
    """
    feats, logits = [], []

    def feature_hook(module, input, output):
        # Collect the avgpool activations as rows of [batch, channels];
        # returning nothing leaves the layer output untouched.
        feats.extend(output.view(-1, output.shape[1]).detach().cpu().numpy().tolist())

    vtype = args.vtype
    if vtype == 'imagenet':
        print('imgnet')
        model = models.__dict__['resnet50'](pretrained=True)
    elif vtype == 'places':
        print('places')
        model_file = 'pre_trained/resnet101_places_best.pth.tar'
        model = models.__dict__['resnet101'](pretrained=False, num_classes=365)
        checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
        # Strip the 'module.' prefix from the checkpoint's parameter names.
        state_dict = {str.replace(k, 'module.', ''): v for k, v in checkpoint['state_dict'].items()}
        model.load_state_dict(state_dict)
    elif vtype == 'emotion':
        print('emotion')
        model_file = 'pre_trained/best_emo_resnet50.pt'
        model = models.__dict__['resnet50'](pretrained=False, num_classes=8)
        # Load on the CPU first so a GPU-saved checkpoint also works on a
        # CPU-only machine; the model is moved to `device` below.
        model.load_state_dict(torch.load(model_file, map_location='cpu'))
    else:
        # Previously this fell through to an UnboundLocalError on `model`.
        raise ValueError(
            "unsupported vtype %r for ResNet features; expected "
            "'imagenet', 'places' or 'emotion'" % (vtype,))

    model.eval().to(device)

    model._modules.get('avgpool').register_forward_hook(feature_hook)

    dataset = MMDataset(dloc, img_transforms, txt_transform, txt_processor)
    dt_loader = DataLoader(dataset, batch_size=128, sampler=SequentialSampler(dataset))

    for i, batch in enumerate(dt_loader):
        print(i)

        img_inputs = batch[0].to(device)

        with torch.no_grad():
            outputs = model(img_inputs)

        logits.extend(outputs.view(-1, outputs.shape[1]).detach().cpu().numpy().tolist())

    return feats, logits

In [None]:
feats, logits = get_resnet_feats()
print(np.array(feats).shape, np.array(logits).shape)

# NOTE: despite its name, 'clip.json' holds the ResNet features here; the
# 'imagenet' branch of get_visual_feats reads them back from this file.
with open('clip.json', 'w') as fh:
    json.dump({'feats': feats, 'logits': logits}, fh)

In [None]:
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from torch import nn
from torch.nn import functional as F
import torch

import numpy as np
import json
from sklearn import metrics, preprocessing

class MultiDataset2(Dataset):
    """Dataset of (visual feature, text feature, label) triples.

    Args:
        vfeats: Indexable collection of visual feature vectors (NumPy arrays).
        tfeats: Indexable collection of text feature vectors (NumPy arrays).
        labels: Sequence of class labels; converted to ``int64``.
        normalize: When truthy, each feature vector is normalised with
            ``sklearn.preprocessing.normalize`` before being returned.
    """

    def __init__(self, vfeats, tfeats, labels, normalize=1):
        self.vfeats = vfeats
        self.tfeats = tfeats
        # `np.int` was removed in NumPy 1.24; use an explicit integer width.
        self.labels = np.array(labels).astype(np.int64)
        self.normalize = normalize

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(vfeat, tfeat, label)`` as two float tensors and one label tensor."""
        vfeat = self.vfeats[idx]
        tfeat = self.tfeats[idx]
        label = self.labels[idx]

        if self.normalize:
            # normalize() works on 2-D input, so go through a single-row matrix.
            vfeat = preprocessing.normalize(vfeat.reshape(1, -1), axis=1).flatten()
            tfeat = preprocessing.normalize(tfeat.reshape(1, -1), axis=1).flatten()

        return torch.FloatTensor(vfeat), torch.FloatTensor(tfeat), torch.tensor(label)


class MultiMLP_2Mod(nn.Module):
    """Two-branch MLP that fuses a visual and a text feature vector.

    Each modality goes through Linear(->128) -> BatchNorm -> ReLU -> Dropout.
    The two 128-d outputs are concatenated and classified into 3 classes by
    ``cf``; the single head ``cf1`` additionally classifies each branch on
    its own.
    """

    def __init__(self, vdim, tdim):
        super().__init__()

        # Layers are created in a fixed order so that seeded initialisation
        # and state-dict layout stay stable.
        self.vfc1 = nn.Linear(vdim, 128)
        self.tfc1 = nn.Linear(tdim, 128)
        self.vbn1 = nn.BatchNorm1d(128)
        self.tbn1 = nn.BatchNorm1d(128)
        self.cf = nn.Linear(256, 3)

        self.vdp1 = nn.Dropout(0.5)
        self.tdp1 = nn.Dropout(0.5)
        self.relu = nn.ReLU()

        self.cf1 = nn.Linear(128, 3)

    def forward(self, x1, x2):
        """Return ``(fused_logits, visual_logits, text_logits)``."""
        visual = self.vfc1(x1)
        visual = self.vbn1(visual)
        visual = self.relu(visual)
        visual = self.vdp1(visual)

        textual = self.tfc1(x2)
        textual = self.tbn1(textual)
        textual = self.relu(textual)
        textual = self.tdp1(textual)

        fused = torch.cat((visual, textual), dim=1)

        return self.cf(fused), self.cf1(visual), self.cf1(textual)



def get_visual_feats(mvsa, vtype, ftype, htag):
    """Load pre-computed visual features from JSON.

    Args:
        mvsa: Dataset variant, substituted into the ``features/*_<mvsa>.json``
            file names.
        vtype: Feature family: ``'places'``, ``'emotion'``, ``'imagenet'`` or
            ``'clip'``; any other value selects the faces features.
        ftype: Key read from the JSON file (``'feats'`` or the logits key).
            Ignored for ``'clip'``, which always reads ``'img_feats'``.
        htag: Unused; kept for interface compatibility.

    Returns:
        ``(features, vdim)``: the features as a NumPy array and the feature
        dimensionality associated with the chosen ``vtype``/``ftype``.
    """
    if vtype == 'places':
        path, key = 'features/places_%s.json' % (mvsa), ftype
        vdim = 2048 if ftype == 'feats' else 365
    elif vtype == 'emotion':
        path, key = 'features/emotion_%s.json' % (mvsa), ftype
        vdim = 2048 if ftype == 'feats' else 8
    elif vtype == 'imagenet':
        path, key = 'clip.json', ftype
        vdim = 2048 if ftype == 'feats' else 1000
    elif vtype == 'clip':
        path, key = 'clip.json', 'img_feats'
        vdim = 512
    else:
        path, key = 'features/faces_%s.json' % (mvsa), ftype
        vdim = 512 if ftype == 'feats' else 7

    # Close the file as soon as it has been parsed.
    with open(path, 'r') as fh:
        feats_img = json.load(fh)[key]

    return np.array(feats_img), vdim


def cal_loss(pred, gold, smoothing=False):
    """Mean cross-entropy between ``pred`` logits and ``gold`` class indices.

    With ``smoothing`` enabled, label smoothing with ``eps = 0.1`` is used:
    the true class gets weight ``1 - eps`` and the remaining ``eps`` is
    shared equally among the other classes.
    """
    target = gold.contiguous().view(-1)

    if not smoothing:
        return F.cross_entropy(pred, target, reduction='mean')

    eps = 0.1
    n_class = pred.size(1)

    # Hard one-hot targets, softened afterwards.
    hard = torch.zeros_like(pred).scatter(1, target.view(-1, 1), 1)
    soft = hard * (1 - eps) + (1 - hard) * eps / (n_class - 1)

    log_prb = F.log_softmax(pred, dim=1)
    per_sample = -(soft * log_prb).sum(dim=1)

    return per_sample.mean()

In [None]:
import torch.optim as optim

import random, copy
import pandas as pd
import time
# Seed every random number generator used by the notebook with one value.
seed = 42

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

def train(model, optimizer, lr_scheduler, num_epochs):
    """Train ``model`` and keep the weights with the lowest validation loss.

    Uses the module-level ``tr_loader`` (training batches), ``vl_loader``
    (validation batches), ``device``, ``smooth`` (label-smoothing switch
    passed to ``cal_loss``) and ``evaluate``.

    Args:
        model: Network called as ``model(inputs1, inputs2)`` returning three
            outputs; only the first one contributes to the loss.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Optional scheduler; when given it is stepped once per
            epoch with the validation loss.
        num_epochs: Number of epochs to run.

    Returns:
        ``(best_model, best_epoch)``: a deep copy of the model taken at the
        epoch with the lowest validation loss, and that epoch's 1-based
        index. If no epoch runs, the input model and ``0`` are returned.
    """
    since = time.time()

    best_model = model
    # Infinity instead of an arbitrary constant: the first epoch always
    # becomes the initial best, whatever the scale of the loss.
    best_val_loss = float('inf')
    best_epoch = 0

    for epoch in range(1, num_epochs + 1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)

        since2 = time.time()

        model.train()  # Set model to training mode

        running_loss = 0.0
        running_corrects = 0

        tot = 0.0
        cnt = 0
        # Iterate over data.
        for inputs1, inputs2, labels in tr_loader:

            inputs1, inputs2, labels = inputs1.to(device), inputs2.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()
            # forward; the two per-modality outputs are not used for training
            outputs, _, _ = model(inputs1, inputs2)
            _, preds = torch.max(outputs, 1)

            loss = cal_loss(outputs, labels, smoothing=smooth)

            # backward + optimize
            loss.backward()
            optimizer.step()

            # statistics
            running_loss += loss.item()
            running_corrects += torch.sum(preds == labels).item()
            tot += len(labels)

            if cnt % 50 == 0:
                print('[%d, %5d] loss: %.5f, Acc: %.2f' %
                      (epoch, cnt + 1, loss.item(), (100.0 * running_corrects) / tot))

            cnt = cnt + 1

        train_loss = running_loss / len(tr_loader)
        train_acc = running_corrects * 1.0 / (len(tr_loader.dataset))

        print('Training Loss: {:.6f} Acc: {:.2f}'.format(train_loss, 100.0 * train_acc))

        test_loss, test_acc, test_f1, _ = evaluate(model, vl_loader)

        print('Epoch: {:d}, Val Loss: {:.4f}, Val Acc: {:.4f}, Val F1: {:.4f}'.format(epoch, test_loss, test_acc, test_f1))

        if lr_scheduler:
            lr_scheduler.step(test_loss)

        # deep copy the model whenever the validation loss does not get worse
        if test_loss <= best_val_loss:
            best_val_loss = test_loss
            best_model = copy.deepcopy(model)
            best_epoch = epoch

        # Reported inside the loop so every epoch is timed, and so `since2`
        # is never read when no epoch ran.
        time_elapsed2 = time.time() - since2
        print('Epoch complete in {:.0f}m {:.0f}s'.format(
            time_elapsed2 // 60, time_elapsed2 % 60))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    return best_model, best_epoch


def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Uses the module-level ``device`` and ``smooth``. Only the first of the
    model's three outputs is scored.

    Returns:
        ``(mean_loss, accuracy, weighted_f1, predictions)`` where
        ``mean_loss`` is the summed batch loss divided by the number of
        batches and ``predictions`` is the flat list of predicted classes.
    """
    model.eval()

    loss_total = 0
    predicted = []
    expected = []

    with torch.no_grad():
        for vis, txt, target in loader:
            vis = vis.to(device)
            txt = txt.to(device)
            target = target.to(device)

            logits, _, _ = model(vis, txt)

            loss_total += cal_loss(logits, target, smoothing=smooth).item()

            batch_preds = torch.argmax(logits.data, 1)
            predicted.extend(batch_preds.cpu().numpy().flatten())
            expected.extend(target.cpu().numpy().flatten())

    acc = metrics.accuracy_score(expected, predicted)
    f1 = metrics.f1_score(expected, predicted, average='weighted')

    return loss_total / len(loader), acc, f1, predicted

In [None]:
# The second column of the header-less pair list holds the labels.
_pairlist_path = dloc + '/valid_pairlist.txt'
pair_df = pd.read_csv(_pairlist_path, header=None)
_label_column = pair_df[1]
all_labels = _label_column.to_numpy().flatten()

In [None]:
import random

# Indices of the first 1000 samples, in random order. Starting the range at
# 0 keeps the first sample in the pool; `range(1, 1000)` dropped it and
# produced only 999 indices.
lst = list(range(1000))
random.shuffle(lst)
np.array(lst, dtype=np.int64)

In [None]:
# Contiguous, non-overlapping slices of the shuffled index list. Slice ends
# are exclusive, so each split starts where the previous one stopped; the
# former 901/951 starts silently dropped lst[900] and lst[950].
tr_ids = lst[0:900]
vl_ids = lst[900:950]
te_ids = lst[950:1000]

In [None]:
# Labels for the train / validation / test index lists.
lab_train, lab_val, lab_test = (
    all_labels[ids] for ids in (tr_ids, vl_ids, te_ids)
)

In [None]:
# Text embeddings keyed by pooling strategy; the file is closed once parsed.
with open('/content/roberta1.json', 'r') as fh:
    feats_text = json.load(fh)

In [None]:
# Keep only the embeddings of the selected pooling strategy.
feats_text = feats_text[layer]

# Text feature width: 3072 for the concatenated variant, 768 otherwise.
if 'catavg' in layer:
    tdim = 3072
else:
    tdim = 768

In [None]:
feats_text = np.array(feats_text)

# Text features for the train / validation / test index lists.
ft_tr_txt, ft_vl_txt, ft_te_txt = (
    feats_text[ids] for ids in (tr_ids, vl_ids, te_ids)
)

In [None]:
# Visual features and their dimensionality for the configured feature family.
feats_img, vdim = get_visual_feats(
    mvsa=mvsa,
    vtype=vtype,
    ftype=ftype,
    htag=htag,
)

In [None]:
# Visual features for the train / validation / test index lists.
ft_tr_img, ft_vl_img, ft_te_img = (
    feats_img[ids] for ids in (tr_ids, vl_ids, te_ids)
)

In [None]:
# One dataset per split, each pairing visual features, text features and labels.
te_data = MultiDataset2(vfeats=ft_te_img, tfeats=ft_te_txt, labels=lab_test, normalize=normalize)
tr_data = MultiDataset2(vfeats=ft_tr_img, tfeats=ft_tr_txt, labels=lab_train, normalize=normalize)
vl_data = MultiDataset2(vfeats=ft_vl_img, tfeats=ft_vl_txt, labels=lab_val, normalize=normalize)

# Only the training loader shuffles; evaluation uses fixed batches of 16.
tr_loader = DataLoader(
    dataset=tr_data,
    batch_size=batch_size,
    num_workers=2,
    shuffle=True,
)
vl_loader = DataLoader(dataset=vl_data, batch_size=16, num_workers=2)
te_loader = DataLoader(dataset=te_data, batch_size=16, num_workers=2)

In [None]:
# Plain cross-entropy criterion; train/evaluate compute their loss via cal_loss.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Fusion network sized to the visual and text feature widths.
model_ft = MultiMLP_2Mod(vdim=vdim, tdim=tdim)

In [None]:
# `.to()` returns the module itself, so it can be moved and printed in one go.
print(model_ft.to(device))

optimizer_ft = optim.Adam(model_ft.parameters(), lr=init_lr)
# Cut the learning rate tenfold after 5 epochs without a lower monitored value.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_ft,
    mode='min',
    factor=0.1,
    patience=5,
    verbose=True,
)

In [None]:
# `model_ft` is rebound to the best checkpoint returned by train().
model_ft, best_epoch = train(
    model=model_ft,
    optimizer=optimizer_ft,
    lr_scheduler=scheduler,
    num_epochs=epochs,
)

In [None]:
# Persist only the weights of the selected model.
_best_state = model_ft.state_dict()
torch.save(_best_state, 'MMImagenet+Roberta.pt')

In [None]:
# Final evaluation on the held-out test split.
te_loss, te_acc, te_f1, all_preds = evaluate(model_ft, te_loader)

# Each number is labelled with what it is; the old message printed loss,
# accuracy and F1 all under a single "Test Acc" label.
print('Best Epoch: %d, Test Loss: %.4f, Test Acc: %.4f, Test F1: %.4f'
      % (best_epoch, te_loss, te_acc, te_f1))