In [None]:
from torch.utils.data import DataLoader, Dataset, SequentialSampler
import torch
from torchvision import transforms

import os, re
import numpy as np
import pandas as pd
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
class MMDataset(Dataset):
    """Paired image/text dataset stored in a directory on disk.

    The sample ids are read from the first column of ``valid_pairlist.txt``
    inside ``dloc`` (parsed with pandas, no header row). Sample ``<id>`` is
    loaded from ``images/<id>.jpg`` and ``texts/<id>.txt``.

    Args:
        dloc: Root directory of the dataset.
        img_transform: Optional callable applied to the RGB PIL image. When it
            is falsy the image is converted with ``transforms.ToTensor()``.
        txt_transform: Optional callable invoked as
            ``txt_transform(text, txt_processor)``.
        txt_processor: Object forwarded as second argument to ``txt_transform``.
    """

    def __init__(self, dloc, img_transform=None, txt_transform=None, txt_processor=None):
        self.file_names = pd.read_csv(os.path.join(dloc, 'valid_pairlist.txt'), header=None)
        self.dloc = dloc
        self.img_transform = img_transform
        self.txt_transform = txt_transform
        self.txt_processor = txt_processor

    def __len__(self):
        """Return the number of rows in the pair list."""
        return len(self.file_names)

    def __getitem__(self, idx):
        """Return ``(image, text)`` for the sample at row ``idx``."""
        fname = str(self.file_names.iloc[idx, 0])

        img_path = os.path.join(self.dloc, 'images', fname + '.jpg')
        txt_path = os.path.join(self.dloc, 'texts', fname + '.txt')

        # `convert` yields an independent image, so the handle on the source
        # file can be released as soon as the conversion is done.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert('RGB')

        # Close the text file deterministically instead of leaking one file
        # handle per sample; undecodable bytes are dropped as before.
        with open(txt_path, 'r', encoding='utf-8', errors='ignore') as txt_file:
            text = txt_file.read().strip().lower()

        if self.img_transform:
            img = self.img_transform(img)
        else:
            img = transforms.ToTensor()(img)

        if self.txt_transform:
            text = self.txt_transform(text, self.txt_processor)

        return img, text


def get_text_processor(word_stats='twitter', htag=True):
    """Build the ekphrasis ``TextPreProcessor`` used to clean tweets.

    Args:
        word_stats: Name of the corpus statistics, passed as both the
            ``segmenter`` and the ``corrector`` option.
        htag: Passed as ``unpack_hashtags`` (word segmentation on hashtags).

    Returns:
        The configured ``TextPreProcessor`` instance.
    """
    # Tokenizer callable: takes a string and returns a list of tokens.
    social_tokenize = SocialTokenizer(lowercase=True).tokenize

    options = dict(
        # Terms replaced by a normalized placeholder ('number', 'money',
        # 'time', 'date' and 'percent' are deliberately not in this list).
        normalize=['url', 'email', 'phone', 'user'],
        # Terms that get annotated.
        annotate={"hashtag", "allcaps", "elongated", "repeated",
                  'emphasis', 'censored'},
        # Fix HTML tokens.
        fix_html=True,
        # Corpus whose word statistics drive word segmentation.
        segmenter=word_stats,
        # Corpus whose word statistics drive spell correction.
        corrector=word_stats,
        # Word segmentation on hashtags.
        unpack_hashtags=htag,
        # Unpack contractions (can't -> can not).
        unpack_contractions=True,
        # Spell correction for elongated words.
        spell_correct_elong=True,
        tokenizer=social_tokenize,
        # Dictionaries used to replace extracted tokens with other
        # expressions; more than one dictionary may be supplied.
        dicts=[emoticons],
    )

    return TextPreProcessor(**options)



def process_tweet(tweet, text_processor):
    """Clean a tweet and return it as a single space-joined string.

    The tweet is tokenized with ``text_processor.pre_process_doc``. Tokens
    containing any character other than ``a-z``, ``0-9``, ``.``, ``,`` or
    whitespace are dropped, the rest are stripped, and the tokens ``rt``,
    ``http``, ``https`` and ``htt`` are removed.

    Args:
        tweet: Raw tweet text.
        text_processor: Object exposing ``pre_process_doc(text)`` that returns
            an iterable of string tokens.

    Returns:
        The surviving tokens joined by single spaces.
    """
    disallowed = re.compile(r"[^a-z0-9.,\s]+")
    noise_tokens = ['rt', 'http', 'https', 'htt']

    kept = []
    for token in text_processor.pre_process_doc(tweet):
        # Skip tokens that contain at least one disallowed character.
        if disallowed.search(token):
            continue
        token = token.strip()
        # The noise filter runs on the stripped token.
        if token in noise_tokens:
            continue
        kept.append(token)

    return " ".join(kept)




def get_bert_embeddings(tweet, model, tokenizer, device):
    """Embed one tweet with a BERT-style encoder in four different ways.

    Args:
        tweet: Text to encode; forwarded unchanged to ``tokenizer.encode``.
            NOTE(review): pass a plain ``str``. Hugging Face tokenizers
            document a list of strings as an already tokenized sequence, so a
            DataLoader text batch (a list) should be unpacked by the caller.
        model: Callable invoked as ``model(input_ids, return_dict=False)``. It
            must return ``(last_hidden, pooled, hidden_states)`` or
            ``(last_hidden, hidden_states)``, where ``hidden_states`` is a
            sequence of per-layer tensors with a leading batch dimension of 1.
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            that returns a list of token ids.
        device: Device the input ids are moved to.

    Returns:
        Tuple of numpy arrays
        ``(sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last)``:
        token-average of the concatenated last four layers (ordered last,
        second-to-last, ...), token-average of the summed last four layers,
        token-average of the second-to-last layer, and token-average of the
        model's last hidden state.

    Raises:
        ValueError: If the model does not return two or three outputs.
    """
    # Encode the sentence into token ids (batch of one).
    input_ids = torch.tensor([tokenizer.encode(tweet, add_special_tokens=True)]).to(device)

    # Run the model exactly once. The former bare `except:` retried the whole
    # forward pass on *any* error, hiding real failures and doubling the cost.
    with torch.no_grad():
        outputs = model(input_ids, return_dict=False)

    if len(outputs) == 3:
        last_out, _, encoded_layers = outputs
    elif len(outputs) == 2:
        last_out, encoded_layers = outputs
    else:
        raise ValueError(
            'expected 2 or 3 model outputs (is output_hidden_states enabled?), got %d'
            % len(outputs)
        )

    # Average of all token vectors of the last hidden state.
    sent_emb_last = torch.mean(last_out[0], dim=0).cpu().numpy()

    # Stack the per-layer outputs and drop the batch dimension:
    # [layers, 1, tokens, hidden] -> [layers, tokens, hidden].
    layer_stack = torch.squeeze(torch.stack(tuple(encoded_layers), dim=0), dim=1)

    # Per token, concatenate the last four layers along the hidden dimension
    # (same order as before: -1, -2, -3, -4), then average over the tokens.
    cat_last_four = torch.cat(
        (layer_stack[-1], layer_stack[-2], layer_stack[-3], layer_stack[-4]), dim=1
    )
    sent_word_catavg = torch.mean(cat_last_four, dim=0).cpu().numpy()

    # Per token, sum the last four layers, then average over the tokens.
    sum_last_four = torch.sum(layer_stack[-4:], dim=0)
    sent_word_sumavg = torch.mean(sum_last_four, dim=0).cpu().numpy()

    # Average of all token vectors of the second-to-last layer.
    sent_emb_2_last = torch.mean(layer_stack[-2], dim=0).cpu().numpy()

    return sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last

In [None]:
from torchvision import models
import torch
import pickle
import numpy as np
import json
import clip
import argparse

In [None]:
parser = argparse.ArgumentParser(description='Extract Image and CLIP Features')
parser.add_argument('--vtype', type=str, default='imagenet',
                    help='imagenet | places | emotion | clip')
parser.add_argument('--mvsa', type=str, default='single',
                    help='single | multiple')
parser.add_argument('--ht', type=bool, default=True,
                    help='True | False')
parser.add_argument('-f')

args = parser.parse_args()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Image preprocessing for the ResNet models: resize, centre-crop to 224,
# convert to a tensor and normalize per channel.
# NOTE(review): mean/std look like the usual ImageNet statistics; confirm.
img_transforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Text cleaning: `txt_transform(text, txt_processor)` is applied by MMDataset.
txt_transform = process_tweet
txt_processor = get_text_processor(htag=args.ht)

In [None]:
def get_resnet_feats():
    """Extract pooled ResNet features and classifier outputs for every image.

    The backbone is selected by the module-level ``args.vtype``
    (``imagenet``, ``places`` or ``emotion``). The dataset is read from the
    module-level ``dloc`` using ``img_transforms``, and inference runs on
    ``device``.

    Returns:
        ``(feats, logits)``: two lists with one entry per image. ``feats``
        holds the flattened output of the model's ``avgpool`` module and
        ``logits`` the flattened model output, both as plain Python lists.

    Raises:
        ValueError: If ``args.vtype`` is not one of the supported backbones
            (previously this surfaced later as an unbound ``model`` variable).
    """
    if args.vtype == 'imagenet':
        print('imgnet')
        model = models.__dict__['resnet50'](pretrained=True)
    elif args.vtype == 'places':
        print('places')
        model_file = 'pre_trained/resnet101_places_best.pth.tar'
        model = models.__dict__['resnet101'](pretrained=False, num_classes=365)
        checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
        # Strip the 'module.' prefix from the checkpoint keys
        # (presumably left over from a DataParallel-wrapped model).
        state_dict = {str.replace(k, 'module.', ''): v for k, v in checkpoint['state_dict'].items()}
        model.load_state_dict(state_dict)
    elif args.vtype == 'emotion':
        print('emotion')
        model_file = 'pre_trained/best_emo_resnet50.pt'
        model = models.__dict__['resnet50'](pretrained=False, num_classes=8)
        # Load onto the CPU first so a checkpoint saved from a GPU also works
        # on CPU-only machines; the model is moved to `device` below.
        model.load_state_dict(torch.load(model_file, map_location='cpu'))
    else:
        raise ValueError(
            "unsupported vtype %r for ResNet features; expected 'imagenet', 'places' or 'emotion'"
            % (args.vtype,)
        )

    feats, logits = [], []

    def feature_hook(module, input, output):
        # Flatten the pooled activations to (batch, channels) and collect them.
        # Returning None leaves the module output untouched.
        feats.extend(output.view(-1, output.shape[1]).detach().cpu().numpy().tolist())

    model.eval().to(device)

    hook_handle = model._modules.get('avgpool').register_forward_hook(feature_hook)

    # Only the images are used here, so the (slow) tweet preprocessing is not
    # requested from the dataset; the returned text is simply ignored.
    dataset = MMDataset(dloc, img_transforms)
    dt_loader = DataLoader(dataset, batch_size=128, sampler=SequentialSampler(dataset))

    try:
        for i, batch in enumerate(dt_loader):
            print(i)

            img_inputs = batch[0].to(device)

            with torch.no_grad():
                outputs = model(img_inputs)

            logits.extend(outputs.view(-1, outputs.shape[1]).detach().cpu().numpy().tolist())
    finally:
        # Detach the hook so it cannot keep appending to `feats` afterwards.
        hook_handle.remove()

    return feats, logits



def get_clip_feats():
    """Encode every image/text pair of the dataset with CLIP ``ViT-B/32``.

    Reads the module-level ``device``, ``dloc``, ``txt_transform`` and
    ``txt_processor``. Images are preprocessed with the transform returned by
    ``clip.load``; texts are cleaned by the dataset and tokenized with
    ``clip.tokenize``.

    Returns:
        ``(img_feats, txt_feats)``: two lists with one feature vector (a plain
        Python list) per sample, in dataset order.
    """
    model, img_preprocess = clip.load('ViT-B/32', device=device)
    model.eval()

    dataset = MMDataset(
        dloc,
        img_transform=img_preprocess,
        txt_transform=txt_transform,
        txt_processor=txt_processor,
    )
    # Sequential sampling keeps the features aligned with the pair list.
    dt_loader = DataLoader(dataset, batch_size=128, sampler=SequentialSampler(dataset))

    img_feats = []
    txt_feats = []

    for step, batch in enumerate(dt_loader):
        print(step)

        images = batch[0].to(device)
        tokens = clip.tokenize(batch[1]).to(device)

        with torch.no_grad():
            encoded_images = model.encode_image(images)
            encoded_texts = model.encode_text(tokens)

            img_feats.extend(encoded_images.cpu().numpy().tolist())
            txt_feats.extend(encoded_texts.cpu().numpy().tolist())

    return img_feats, txt_feats

In [None]:
# Dataset root of the MVSA variant selected with --mvsa.
dloc = 'data1/mvsa_%s/' % args.mvsa

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs('features', exist_ok=True)

if args.vtype != 'clip':
    # ResNet backbones: pooled features plus classifier outputs.
    feats, logits = get_resnet_feats()
    print(np.array(feats).shape, np.array(logits).shape)
    out_path = 'features/%s_%s.json' % (args.vtype, args.mvsa)
    # The context manager flushes and closes the file deterministically.
    with open(out_path, 'w') as out_file:
        json.dump({'feats': feats, 'logits': logits}, out_file)
else:
    # CLIP: image and text features; the file name also records --ht (1/0).
    img_feats, text_feats = get_clip_feats()
    print(np.array(img_feats).shape, np.array(text_feats).shape)
    out_path = 'features/%s_%s_ht%d.json' % (args.vtype, args.mvsa, args.ht)
    with open(out_path, 'w') as out_file:
        json.dump({'img_feats': img_feats, 'text_feats': text_feats}, out_file)

In [None]:
!pip install transformers

In [None]:
import json
import string

from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizer


import argparse

parser = argparse.ArgumentParser(description='Extract BERT Features')
parser.add_argument('--btype', type=str, default='robertabase',
                    help='bertbase | robertabase')
parser.add_argument('--mvsa', type=str, default='single',
                    help='single | multiple')
parser.add_argument('--ht', type=bool, default=True,
                    help='True | False')
parser.add_argument('-f')

args = parser.parse_args()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device("cpu")


# Text cleaning: `txt_transform(text, txt_processor)` is applied by MMDataset.
txt_transform = process_tweet
txt_processor = get_text_processor(htag=args.ht)


In [None]:
# Dataset root of the MVSA variant selected with --mvsa.
dloc = 'data1/mvsa_%s/' % args.mvsa

# (model class, tokenizer class, pretrained checkpoint name) for --btype.
bert_type = {
    'bertbase': (BertModel, BertTokenizer, 'bert-base-uncased'),
    'robertabase': (RobertaModel, RobertaTokenizer, 'roberta-base'),
}[args.btype]
model_cls, tokenizer_cls, checkpoint_name = bert_type

tokenizer = tokenizer_cls.from_pretrained(checkpoint_name)
# Hidden states of every layer are requested because the embedding helper
# combines several of them.
model = model_cls.from_pretrained(checkpoint_name, output_hidden_states=True)
model.to(device).eval()


In [None]:
# One list of sentence embeddings per pooling strategy.
embed_dict = {name: [] for name in ('catavg', 'sumavg', '2last', 'last')}

# One sample per batch, visited in pair-list order.
ph_data = MMDataset(
    dloc,
    txt_transform=txt_transform,
    txt_processor=txt_processor,
)
ph_loader = DataLoader(
    ph_data,
    batch_size=1,
    sampler=SequentialSampler(ph_data),
)

In [None]:
# Make sure the output directory exists before the extraction starts.
os.makedirs('features', exist_ok=True)

for i, batch in enumerate(ph_loader):

    # The DataLoader collates the text field into a list of strings (one per
    # sample), even with batch_size=1. Handing that list to the tokenizer would
    # make it treat the whole tweet as a single, already tokenized token, so
    # every tweet is passed on as a plain string instead.
    txt_inps = batch[1]
    if isinstance(txt_inps, str):
        txt_inps = [txt_inps]

    for tweet_text in txt_inps:
        sent_word_catavg, sent_word_sumavg, sent_emb_2_last, sent_emb_last \
            = get_bert_embeddings(tweet_text, model, tokenizer, device)

        # NOTE(review): the 'catavg' append is disabled, so that key stays an
        # empty list in the output file; confirm this is intended.
        # embed_dict['catavg'].append(sent_word_catavg.tolist())
        embed_dict['sumavg'].append(sent_word_sumavg.tolist())
        embed_dict['2last'].append(sent_emb_2_last.tolist())
        embed_dict['last'].append(sent_emb_last.tolist())

out_path = 'features/%s_%s_ht%d.json' % (args.btype, args.mvsa, args.ht)
# The context manager flushes and closes the file deterministically.
with open(out_path, 'w') as out_file:
    json.dump(embed_dict, out_file)