In [15]:
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess

from torch import nn
import pandas as pd
import os

from tqdm import tqdm

from sklearn.metrics import classification_report
import json

from torch.utils.data import Dataset
import torch.utils.data as data

# Load the model

In [3]:
# setup device to use
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

In [4]:
model, vis_processors, txt_processors = load_model_and_preprocess(
    name = "blip_feature_extractor", model_type="base", is_eval=True, device=device
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.97G/1.97G [00:40<00:00, 51.6MB/s]


# Define the Dataset class

In [72]:
class TwitterCOMMsDataset(Dataset):
    def __init__(self, csv_path, img_dir):
        """
        Args:
            csv_path (string): Path to the {train_completed|val_completed}.csv file.
            image_folder_dir (string): Directory containing the images
        """
        self.df = pd.read_csv(csv_path, index_col=0)
        self.img_dir = img_dir
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = self.df.iloc[idx]
        caption = item['full_text']
        img_filename = item['filename']
        topic = item['topic']
        label = item['falsified']
        domain = topic.split('_')[0]
        diff = topic.split('_')[1]
        
        try:
            raw_image = Image.open(os.path.join(self.img_dir, img_filename)).convert('RGB')
            image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
            text_input = txt_processors["eval"](caption)
            sample = {"image": image, "text_input": [text_input]}   # image shape: [1, 3, 224, 224]
        
            features_multimodal = model.extract_features(sample, mode="multimodal")
#             features_image = model.extract_features(sample, mode="image")
#             features_text = model.extract_features(sample, mode="text")
            print(features_multimodal)
        
            return {"multimodal_emb": features_multimodal,
                    "topic": topic, 
                    "label": label, 
                    "domain": domain, 
                    "difficulty": diff}
        
        except IOError as e:
            print(e)
        
        
        
    

In [73]:
val_data = TwitterCOMMsDataset(csv_path='../data/val_completed.csv', 
                               img_dir='/import/network-temp/yimengg/data/twitter-comms/images/val_images/val_tweet_image_ids')

In [74]:
BATCH_SIZE = 64

In [75]:
val_iterator = data.DataLoader(val_data, 
                                 batch_size = BATCH_SIZE)

In [76]:
for i, data in tqdm(enumerate(val_iterator, 0), desc='iterations'):
    print(data)

iterations: 0it [00:00, ?it/s]

BlipOutputFeatures(image_embeds=tensor([[[-0.2095,  0.6907,  0.3040,  ...,  1.0300, -0.4427,  0.2093],
         [ 0.6913, -0.1128,  0.0958,  ...,  0.1492, -0.0857,  0.3687],
         [-0.2953,  1.6621,  0.3503,  ..., -0.3806,  0.2155,  0.5887],
         ...,
         [-0.0841,  1.0309,  0.3304,  ...,  0.2649, -0.0267,  0.4178],
         [ 0.8295,  0.1299,  0.9677,  ...,  0.8383,  0.2832, -0.1758],
         [ 0.1076,  0.6454,  0.6220,  ...,  0.5702, -0.2760,  0.0723]]],
       device='cuda:0'), image_embeds_proj=None, text_embeds=None, text_embeds_proj=None, multimodal_embeds=tensor([[[ 0.0268,  0.0440,  0.0601,  ...,  0.0398, -0.1383, -0.1102],
         [-0.0078,  0.0147,  0.0840,  ..., -0.0148, -0.1590, -0.0342],
         [ 0.0419,  0.0078,  0.1162,  ..., -0.0067, -0.2070, -0.0887],
         ...,
         [-0.0337,  0.0409,  0.0916,  ..., -0.0239, -0.1572, -0.0452],
         [ 0.0476, -0.0248,  0.1070,  ...,  0.0084, -0.1904, -0.0735],
         [-0.0207,  0.0789,  0.0831,  ...,  0.060

BlipOutputFeatures(image_embeds=tensor([[[-0.6623,  0.3648,  0.5726,  ...,  0.8307,  0.6266,  0.4717],
         [-0.5920,  0.3083, -0.0416,  ..., -1.0928,  0.5041,  0.6289],
         [ 0.0129,  0.5988,  0.4387,  ...,  0.3863, -0.6196,  0.7256],
         ...,
         [ 1.0664,  0.1471,  0.2749,  ..., -0.0876,  0.1102,  0.8244],
         [-0.7252,  0.6937,  0.3005,  ..., -0.2492,  0.1010, -0.0918],
         [-0.2615,  0.1816,  0.4444,  ...,  0.1587,  0.2730,  0.6006]]],
       device='cuda:0'), image_embeds_proj=None, text_embeds=None, text_embeds_proj=None, multimodal_embeds=tensor([[[ 0.0007,  0.1247,  0.0693,  ...,  0.1175, -0.0550, -0.1388],
         [-0.1108,  0.0328,  0.1323,  ...,  0.0945, -0.1019, -0.0720],
         [-0.0747,  0.0660,  0.0988,  ...,  0.0840, -0.0956, -0.1009],
         ...,
         [-0.0467,  0.0424,  0.0946,  ...,  0.0706, -0.0936, -0.1576],
         [-0.0560,  0.0234,  0.1195,  ...,  0.0864, -0.1167, -0.1361],
         [-0.0684,  0.1668,  0.1014,  ...,  0.101

BlipOutputFeatures(image_embeds=tensor([[[ 0.3457,  0.0152, -0.5108,  ...,  0.2665,  0.5326, -0.5446],
         [-1.0237, -0.4673, -0.9338,  ..., -0.5014,  0.2816,  0.3392],
         [-0.2785, -0.0778, -0.2156,  ...,  0.5931,  0.4817,  0.1763],
         ...,
         [-1.0570,  0.3561, -0.6515,  ...,  0.0525,  0.8404, -0.0221],
         [-0.2016, -0.7314,  0.1194,  ...,  0.2823, -0.0279,  0.2936],
         [-0.5885, -0.5138, -0.3749,  ..., -0.1031, -0.0169,  0.4169]]],
       device='cuda:0'), image_embeds_proj=None, text_embeds=None, text_embeds_proj=None, multimodal_embeds=tensor([[[ 0.0506, -0.0281,  0.0222,  ..., -0.0336,  0.0331, -0.0427],
         [ 0.0884, -0.0559, -0.0185,  ..., -0.0730,  0.0517, -0.0675],
         [ 0.2262, -0.1051,  0.0775,  ..., -0.1221,  0.0142,  0.0032],
         ...,
         [ 0.0628, -0.0683, -0.0383,  ..., -0.0784,  0.0627, -0.0177],
         [ 0.0233, -0.0583, -0.0263,  ..., -0.0809,  0.0475,  0.0438],
         [ 0.0104, -0.0481,  0.0219,  ...,  0.035

BlipOutputFeatures(image_embeds=tensor([[[-0.2956,  0.0317, -0.5388,  ...,  0.0931,  0.4483,  0.0550],
         [-0.2484,  0.5883, -1.0987,  ..., -0.1447,  0.5852,  1.5961],
         [-0.2506,  0.5656, -0.3839,  ..., -0.4417,  1.1258,  0.2965],
         ...,
         [ 0.4357,  0.5890, -0.2283,  ..., -0.9141,  0.4017,  0.2305],
         [ 0.0989,  0.7154, -0.5593,  ..., -0.8948,  0.3037, -0.2137],
         [-0.1453,  1.5432, -0.4022,  ..., -1.5651, -0.0317, -0.0054]]],
       device='cuda:0'), image_embeds_proj=None, text_embeds=None, text_embeds_proj=None, multimodal_embeds=tensor([[[ 0.0685, -0.0466, -0.0587,  ..., -0.0799,  0.1446,  0.1060],
         [ 0.0503,  0.0539, -0.0495,  ..., -0.0744,  0.1631,  0.0696],
         [ 0.0276, -0.0710, -0.0431,  ..., -0.1070,  0.2108,  0.0893],
         ...,
         [ 0.0192, -0.0500, -0.0350,  ..., -0.1011,  0.1922,  0.1298],
         [ 0.0170, -0.0263, -0.0694,  ..., -0.1263,  0.1936,  0.1240],
         [ 0.0241, -0.1312, -0.1047,  ..., -0.003

BlipOutputFeatures(image_embeds=tensor([[[-0.3965,  0.0174, -0.1696,  ...,  1.4853, -0.1815, -0.7512],
         [ 0.3097,  0.0911,  0.3856,  ...,  0.3252,  0.6141,  0.5907],
         [-0.9447,  0.9553,  0.6208,  ..., -0.0786,  0.8357,  0.5290],
         ...,
         [ 0.3725,  0.5922, -0.3141,  ...,  0.0114,  1.0379, -0.3665],
         [-0.0072, -0.0803,  0.5515,  ...,  0.1683,  0.2638,  0.1879],
         [-0.2888,  0.2189, -0.1354,  ...,  0.4801,  1.1365,  0.2581]]],
       device='cuda:0'), image_embeds_proj=None, text_embeds=None, text_embeds_proj=None, multimodal_embeds=tensor([[[-0.0181,  0.1034,  0.0984,  ..., -0.0027, -0.0803, -0.1693],
         [-0.0349,  0.0943,  0.1571,  ..., -0.0511, -0.0854, -0.1019],
         [-0.0704,  0.1135,  0.0854,  ...,  0.0317, -0.0997, -0.2335],
         ...,
         [-0.0717,  0.0622,  0.1404,  ..., -0.0391, -0.0911, -0.0322],
         [-0.0242,  0.1103,  0.1492,  ..., -0.0705, -0.0914, -0.1222],
         [-0.0616,  0.1500,  0.1359,  ..., -0.004

BlipOutputFeatures(image_embeds=tensor([[[-0.5843, -0.0701, -0.2040,  ...,  0.1964,  0.3986,  0.2012],
         [ 0.3278,  0.5262, -0.5619,  ..., -0.7583,  1.0434, -0.0762],
         [ 0.2387,  0.7748, -0.5502,  ..., -0.5143,  0.7901, -0.0872],
         ...,
         [-1.2702,  1.3781, -0.0355,  ..., -0.1710, -0.0148,  0.1814],
         [-0.9840,  1.4841,  0.4052,  ..., -0.2234, -0.4041,  0.2975],
         [-0.8071,  0.9097,  0.2413,  ..., -0.0206, -0.2966, -0.1122]]],
       device='cuda:0'), image_embeds_proj=None, text_embeds=None, text_embeds_proj=None, multimodal_embeds=tensor([[[ 0.0590,  0.1008,  0.1289,  ..., -0.0069,  0.0646,  0.0803],
         [-0.0298,  0.0622,  0.1538,  ..., -0.0054,  0.0122,  0.1241],
         [ 0.0320,  0.0750,  0.0237,  ..., -0.1155, -0.0436,  0.1870],
         ...,
         [-0.0308,  0.1094,  0.1135,  ..., -0.0513,  0.0740,  0.1511],
         [ 0.0067,  0.1246,  0.0949,  ..., -0.0355, -0.0248,  0.1063],
         [ 0.0932,  0.0804,  0.0907,  ...,  0.013

iterations: 0it [00:03, ?it/s]

BlipOutputFeatures(image_embeds=tensor([[[ 0.0233,  0.6448,  0.0387,  ...,  1.0194, -0.0960, -0.3207],
         [-0.6008, -0.6601, -0.2609,  ..., -0.2275,  0.7215,  0.1762],
         [-0.4044, -0.5474, -0.2280,  ..., -0.3222,  0.7804,  0.7440],
         ...,
         [ 0.5971,  0.3800,  0.3283,  ..., -0.0646,  1.0048,  0.1773],
         [-0.4858, -0.0563, -0.2069,  ...,  0.0584,  0.9172,  0.5910],
         [-0.5736, -0.1096, -0.4614,  ...,  0.3082,  1.0978,  0.3217]]],
       device='cuda:0'), image_embeds_proj=None, text_embeds=None, text_embeds_proj=None, multimodal_embeds=tensor([[[ 0.0487,  0.0414,  0.0152,  ..., -0.0653,  0.0690,  0.0282],
         [ 0.0933, -0.0547, -0.0404,  ..., -0.0171,  0.0368,  0.0346],
         [ 0.0328,  0.0637, -0.0080,  ..., -0.0853,  0.0539, -0.0424],
         ...,
         [ 0.0979,  0.0351,  0.0609,  ..., -0.1112,  0.0769,  0.0440],
         [-0.0003,  0.0316,  0.0798,  ..., -0.0439,  0.0782, -0.0472],
         [-0.0137,  0.0684,  0.0107,  ..., -0.005




RuntimeError: stack expects each tensor to be equal size, but got [1, 80, 768] at entry 0 and [1, 36, 768] at entry 2