# Reto 3

Adaptado de https://github.com/ckoutlis/memefier/tree/master

In [2]:
!ls -lha1 ./drive/MyDrive/datos.zip

-rw------- 1 root root 3.4G Feb 29 01:36 ./drive/MyDrive/datos.zip


In [3]:
!unzip ./drive/MyDrive/datos.zip -d . > /dev/null

Una vez descomprimidos, agregue los siguientes archivos a la carpeta `data/` (relativa al directorio de trabajo del notebook, que es donde el código los busca):
* https://raw.githubusercontent.com/ckoutlis/memefier/master/data/protected_attributes_fbhm.csv
* https://github.com/ckoutlis/memefier/raw/master/data/captions_fbhm.pickle


In [14]:
## utils.py

import string
import json
import pandas as pd
import numpy as np
import random, os

import torch
from sklearn.metrics import accuracy_score, f1_score


def seed_everything(seed: int):
    """Seed all random number generators used by this notebook.

    Stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and passes
    it to the seeding functions of ``random``, NumPy and PyTorch (both
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``).

    Args:
        seed: Integer seed shared by every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)


def text_clean(text):
    """Normalise a batch of raw texts.

    Every string entry is lower-cased, stripped of ASCII punctuation
    (``string.punctuation``) and of digit characters, and has its runs of
    whitespace collapsed to single spaces. Entries that are not strings
    (for example ``None`` or a float ``NaN``) are replaced by the
    placeholder ``"notext"``.

    Args:
        text: Iterable of raw entries.

    Returns:
        A list with one cleaned string per input entry, in input order.
    """
    # Build the punctuation-deletion table once rather than once per entry.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    clean = []
    for entry in text:
        # isinstance (not an exact type check) so that str subclasses such as
        # NumPy string scalars are cleaned instead of being mapped to "notext".
        if not isinstance(entry, str):
            clean.append("notext")
            continue
        entry = entry.lower().translate(strip_punctuation)
        entry = "".join(ch for ch in entry if not ch.isdigit())  # remove numbers
        clean.append(" ".join(entry.split()))  # collapse repeated whitespace
    return clean


def _read_fbhm_split(path):
    """Read one ``.jsonl`` split file into a list of ``[img, text, label]`` rows."""
    rows = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            sample = json.loads(line)
            rows.append([sample["img"], sample["text"], sample["label"]])
    return rows


def split_fbhm(directory):
    """Load the train and dev splits stored in ``directory``.

    Reads ``train.jsonl`` and ``dev.jsonl``; every non-blank line must be a
    JSON object with the keys ``"img"``, ``"text"`` and ``"label"``.

    Args:
        directory: Folder holding the two ``.jsonl`` files. A trailing path
            separator is accepted but not required.

    Returns:
        A ``(train, dev)`` tuple of ``pandas.DataFrame`` objects with the
        columns ``img``, ``text`` and ``label``.

    Raises:
        FileNotFoundError: If either split file is missing.
        KeyError: If a record lacks one of the three expected keys.
    """
    columns = ["img", "text", "label"]
    train, dev = (
        pd.DataFrame(
            _read_fbhm_split(os.path.join(directory, f"{split}.jsonl")),
            columns=columns,
        )
        for split in ("train", "dev")
    )
    return train, dev


def loss_function(loss_object, prediction, target, device):
    """Average a masked per-step loss over a predicted sequence.

    For each step ``w`` in ``range(prediction.shape[1] - 1)`` the prediction
    at step ``w`` is scored against ``target[:, w + 1]``; samples whose target
    at that step equals 0 are multiplied by zero before the mean is taken.
    The per-step means are summed and divided by the number of steps.

    Args:
        loss_object: Callable ``(prediction_step, target_step) -> loss``.
            Presumably an unreduced (per-sample) loss so that the mask applies
            element-wise; confirm against the caller.
        prediction: Tensor indexed as ``prediction[:, w, :]``.
        target: Tensor indexed as ``target[:, w + 1]``; moved to ``device``
            before use. Column 0 is never used as a label.
        device: Device the target slices are moved to.

    Returns:
        The averaged masked loss.

    Raises:
        ZeroDivisionError: If ``prediction.shape[1]`` is 1.
    """
    steps = prediction.shape[1] - 1
    total = 0
    for w in range(steps):
        shifted_target = target[:, w + 1].to(device)
        keep = shifted_target != 0  # zero-valued targets do not contribute
        step_loss = loss_object(prediction[:, w, :], shifted_target)
        total = total + (step_loss * keep).mean()
    return total / steps

In [16]:
# data.py
import pandas as pd
import os
import pickle
import json
from PIL import Image, ImageFile
from collections import Counter, OrderedDict

import torch
from torch.utils.data import Dataset
from torchtext.vocab import vocab


#from src.utils import split_fbhm #text_clean, split_fbhm

ImageFile.LOAD_TRUNCATED_IMAGES = True

def text_clean(text):
    """Clean a batch of raw texts for vocabulary building and indexing.

    String entries are lower-cased, have ASCII punctuation and digit
    characters removed, and have consecutive whitespace collapsed into one
    space. Any entry that is not a string is replaced by ``"notext"``.

    Args:
        text: Iterable of raw entries.

    Returns:
        A list of cleaned strings, one per input entry, in the same order.
    """
    # Imported here because this cell's own import list does not bring in
    # ``string``; without it the function only works if another cell did.
    import string

    punctuation_table = str.maketrans("", "", string.punctuation)

    def _clean_one(raw):
        # isinstance also accepts str subclasses (e.g. NumPy string scalars).
        if not isinstance(raw, str):
            return "notext"
        lowered = raw.lower().translate(punctuation_table)
        without_digits = "".join(ch for ch in lowered if not ch.isdigit())
        return " ".join(without_digits.split())

    return [_clean_one(raw) for raw in text]


class FBHM(Dataset):
    """Meme dataset built from the ``train.jsonl`` / ``dev.jsonl`` files of a folder.

    Each item is a dict with the keys ``image``, ``text``, ``text_index``,
    ``label``, ``caption``, ``caption_index`` and ``external_index``.

    Besides the split files, the constructor reads two auxiliary files from
    fixed relative paths: ``data/captions_fbhm.pickle`` and
    ``data/protected_attributes_fbhm.csv``.

    Args:
        directory: Folder with the ``.jsonl`` splits; image paths found in the
            splits are resolved relative to it.
        train: If truthy the dataset serves the train split, otherwise dev.
        vocab_size: Number of most frequent training-text words kept in the
            text vocabulary (``<pad>`` and ``<unk>`` are added on top).
        seq_len: Fixed length of ``text_index`` (truncated or padded).
        transform: Optional callable applied to the PIL image of each item.
    """

    def __init__(
        self,
        directory,
        train,
        vocab_size,
        seq_len,
        transform=None,
    ):
        self.directory = directory
        self.train, self.dev = split_fbhm(self.directory)

        # Create the training text corpus vocabulary (always from the train
        # split, even when this instance serves the dev split).
        sentences = text_clean(self.train["text"].tolist())
        words = [w for s in sentences for w in s.split()]
        counts = sorted(Counter(words).items(), key=lambda x: x[1], reverse=True)[
            :vocab_size
        ]
        self.vocabulary = vocab(OrderedDict(counts))
        self.vocabulary.insert_token("<pad>", 0)
        self.vocabulary.insert_token("<unk>", 1)
        self.vocabulary.set_default_index(self.vocabulary["<unk>"])

        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        # file. This pickle is downloaded from an external repository; only
        # load it from a source you trust.
        with open("data/captions_fbhm.pickle", "rb") as handle:
            self.captions = pickle.load(handle)
        caps_sentences = text_clean([self.captions[x] for x in self.captions])
        # Longest cleaned caption (in words) fixes the padded caption length.
        self.caps_seq_len = max([len(s.split()) for s in caps_sentences])
        caps_words = [w for s in caps_sentences for w in s.split()]
        caps_counts = sorted(
            Counter(caps_words).items(), key=lambda x: x[1], reverse=True
        )
        self.caps_vocabulary = vocab(OrderedDict(caps_counts))
        self.caps_vocabulary.insert_token("<pad>", 0)
        self.caps_vocabulary.insert_token("<unk>", 1)
        self.caps_vocabulary.insert_token("<sos>", 2)
        self.caps_vocabulary.insert_token("<eos>", 3)
        self.caps_vocabulary.set_default_index(self.caps_vocabulary["<unk>"])

        # External knowledge
        self.attributes = pd.read_csv("data/protected_attributes_fbhm.csv")[
            ["face_name_align", "race", "gender", "age"]
        ]
        # Fixed length of ``external_index``; presumably up to 7 faces with
        # 3 attributes each - TODO confirm.
        self.max_attr = 7 * 3
        self.attribute_map = {
            "<pad>": 0,
            "White": 1,
            "Indian": 2,
            "Southeast Asian": 3,
            "Middle Eastern": 4,
            "Latino_Hispanic": 5,
            "Black": 6,
            "East Asian": 7,
            "Male": 8,
            "Female": 9,
            "0-2": 10,
            "3-9": 11,
            "10-19": 12,
            "20-29": 13,
            "30-39": 14,
            "40-49": 15,
            "50-59": 16,
            "60-69": 17,
            "70+": 18,
        }

        # Pick the split's examples to make the dataset
        self.memes = self.train if train else self.dev

        self.seq_len = seq_len
        self.transform = transform

    @staticmethod
    def _fit_length(indices, length, pad_index):
        """Return ``indices`` truncated or right-padded to exactly ``length`` items."""
        if len(indices) >= length:
            return indices[:length]
        return indices + [pad_index] * (length - len(indices))

    def __len__(self):
        """Return the number of memes in the selected split."""
        return len(self.memes)

    def __getitem__(self, idx):
        """Build the sample dict for the meme at position ``idx``.

        Returns:
            A dict with the PIL image (or the output of ``transform``), the
            cleaned text and caption strings, the label, and three index
            tensors: ``text_index`` (length ``seq_len``), ``caption_index``
            (``<sos>`` + caption + ``<eos>`` + padding) and
            ``external_index`` (length ``max_attr``).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_name = self.memes["img"][idx]
        # Open inside a context manager so the file handle is released right
        # away instead of lingering until garbage collection; convert()
        # returns an independent, fully loaded copy of the image.
        with Image.open(os.path.join(self.directory, image_name)) as raw_image:
            image = raw_image.convert("RGB")

        text = text_clean([self.memes["text"][idx]])[0]
        text_index = self.vocabulary(text.split())
        label = self.memes["label"][idx]

        caption = text_clean([self.captions[image_name.replace("img/", "")]])[0]
        caption_index = self.caps_vocabulary(caption.split())

        # image_name[4:-4] drops the first 4 and last 4 characters (presumably
        # the "img/" prefix and the file extension - TODO confirm); rows whose
        # face_name_align contains that id supply race, gender and age.
        image_attributes = self.attributes[
            self.attributes["face_name_align"].str.contains(image_name[4:-4])
        ].values[:, 1:]
        external_index = [self.attribute_map[y] for x in image_attributes for y in x]

        sample = {
            "image": image,
            "text": text,
            "text_index": self._fit_length(
                text_index, self.seq_len, self.vocabulary["<pad>"]
            ),
            "label": label,
            "caption": caption,
            "caption_index": [self.caps_vocabulary["<sos>"]]
            + caption_index
            + [self.caps_vocabulary["<eos>"]]
            + [self.caps_vocabulary["<pad>"]]
            * (self.caps_seq_len - len(caption_index)),
            "external_index": self._fit_length(
                external_index, self.max_attr, self.attribute_map["<pad>"]
            ),
        }
        sample["text_index"] = torch.tensor(sample["text_index"])
        sample["caption_index"] = torch.tensor(sample["caption_index"])
        sample["external_index"] = torch.tensor(sample["external_index"])
        if self.transform:
            sample["image"] = self.transform(sample["image"])
        return sample


In [17]:
# models.py

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import resnet18
#import clip

import math


def clip_vis_hook(module, input, output):
    """Store ``output`` in the module-level variable ``region_features``.

    The ``(module, input, output)`` signature matches a PyTorch forward hook;
    ``module`` and ``input`` are not used.
    """
    globals()["region_features"] = output


def clip_text_hook(module, input, output):
    """Store ``output`` in the module-level variable ``word_features``.

    The ``(module, input, output)`` signature matches a PyTorch forward hook;
    ``module`` and ``input`` are not used.
    """
    globals()["word_features"] = output


class ImageOnly(torch.nn.Module):
    """Image classifier: ResNet-18 features, a tanh projection, then linear head(s).

    Args:
        pretrained: Forwarded to ``resnet18(pretrained=...)``.
        hidden_dim: The projection layer outputs ``hidden_dim * 2`` features.
        num_classes: Either an ``int`` (one head, ``self.fc``) or a ``dict``
            with the keys ``"humour"``, ``"sarcasm"``, ``"offensive"`` and
            ``"motivational"`` giving the output size of each of four heads.
        device: Device the input images are moved to in ``forward``.

    Raises:
        Exception: If ``num_classes`` is neither a ``dict`` nor an ``int``.
    """

    def __init__(self, pretrained, hidden_dim, num_classes, device):
        super().__init__()
        self.device = device
        self.num_classes = num_classes
        feature_dim = hidden_dim * 2

        # ResNet-18 with its classification layer replaced by a pass-through.
        backbone = resnet18(pretrained=pretrained)
        backbone.fc = torch.nn.Identity()
        self.img_fe = backbone
        self.fc_img1 = torch.nn.Linear(512, feature_dim)

        if isinstance(num_classes, dict):
            # Heads are created in this fixed order: hum, sar, off, mot.
            for attr, task in (
                ("fc_hum", "humour"),
                ("fc_sar", "sarcasm"),
                ("fc_off", "offensive"),
                ("fc_mot", "motivational"),
            ):
                setattr(self, attr, torch.nn.Linear(feature_dim, num_classes[task]))
        elif isinstance(num_classes, int):
            self.fc = torch.nn.Linear(feature_dim, num_classes)
        else:
            raise Exception(
                f"num_classes can only be either a dict or an int, {num_classes} is found"
            )

    def forward(self, x):
        """Return the logits for ``x["image"]``.

        Returns a list of four tensors (humour, sarcasm, offensive,
        motivational) when ``num_classes`` is a dict, otherwise the output of
        ``self.fc``.
        """
        image = x["image"].to(self.device)
        y = torch.tanh(self.fc_img1(self.img_fe(image)))
        if isinstance(self.num_classes, dict):
            heads = (self.fc_hum, self.fc_sar, self.fc_off, self.fc_mot)
            return [head(y) for head in heads]
        if isinstance(self.num_classes, int):
            return self.fc(y)
        return None


class TextOnly(torch.nn.ModuleList):
    """Text classifier: embedding, LSTM, a ReLU projection, then linear head(s).

    Args:
        max_words: Number of rows of the embedding table (index 0 is the
            padding index).
        hidden_dim: Embedding size and LSTM hidden size; the projection layer
            outputs ``hidden_dim * 2`` features.
        lstm_layers: Number of stacked LSTM layers.
        num_classes: Either an ``int`` (one head, ``self.fc2``) or a ``dict``
            with the keys ``"humour"``, ``"sarcasm"``, ``"offensive"`` and
            ``"motivational"`` giving the output size of each of four heads.
        device: Device the token indices and LSTM states are moved to.

    Raises:
        Exception: If ``num_classes`` is neither a ``dict`` nor an ``int``.
    """

    def __init__(self, max_words, hidden_dim, lstm_layers, num_classes, device):
        super().__init__()
        self.device = device
        self.num_classes = num_classes

        # Hyperparameters
        self.input_size = max_words
        self.hidden_dim = hidden_dim
        self.LSTM_layers = lstm_layers
        feature_dim = self.hidden_dim * 2

        self.dropout = torch.nn.Dropout(0.5)
        self.embedding = torch.nn.Embedding(
            self.input_size, self.hidden_dim, padding_idx=0
        )
        self.lstm = torch.nn.LSTM(
            input_size=self.hidden_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.LSTM_layers,
            batch_first=True,
        )
        self.fc1 = torch.nn.Linear(
            in_features=self.hidden_dim, out_features=feature_dim
        )

        if isinstance(num_classes, dict):
            # Heads are created in this fixed order: hum, sar, off, mot.
            for attr, task in (
                ("fc_hum", "humour"),
                ("fc_sar", "sarcasm"),
                ("fc_off", "offensive"),
                ("fc_mot", "motivational"),
            ):
                setattr(self, attr, torch.nn.Linear(feature_dim, num_classes[task]))
        elif isinstance(num_classes, int):
            self.fc2 = torch.nn.Linear(feature_dim, num_classes)
        else:
            raise Exception(
                f"num_classes can only be either a dict or an int, {num_classes} is found"
            )

    def forward(self, x):
        """Return the logits for the token indices in ``x["text_index"]``.

        Returns a list of four tensors (humour, sarcasm, offensive,
        motivational) when ``num_classes`` is a dict, otherwise the output of
        ``self.fc2``.
        """
        tokens = x["text_index"].to(self.device)

        # Initial hidden and cell states, one slice per LSTM layer.
        # NOTE(review): they are re-drawn from a Xavier-normal distribution on
        # every call, so outputs are stochastic even in eval mode - confirm
        # this is intended.
        state_shape = (self.LSTM_layers, tokens.size(0), self.hidden_dim)
        h = torch.zeros(state_shape).to(self.device)
        c = torch.zeros(state_shape).to(self.device)
        torch.nn.init.xavier_normal_(h)
        torch.nn.init.xavier_normal_(c)

        out = self.embedding(tokens)
        out, _ = self.lstm(out, (h, c))
        out = self.dropout(out)
        # Only the LSTM output at the last time step feeds the classifier.
        out = torch.relu_(self.fc1(out[:, -1, :]))
        out = self.dropout(out)

        if isinstance(self.num_classes, dict):
            heads = (self.fc_hum, self.fc_sar, self.fc_off, self.fc_mot)
            return [head(out) for head in heads]
        if isinstance(self.num_classes, int):
            return self.fc2(out)
        return None


class MultiModal(torch.nn.Module):
    """Late-fusion classifier combining an ``ImageOnly`` and a ``TextOnly`` encoder.

    Both encoders are built with a single-output head that is then replaced by
    ``torch.nn.Identity``, so each yields ``hidden_dim * 2`` features. Every
    modality goes through its own two-layer projection down to ``hidden_dim``;
    the two results are concatenated, fused by one linear layer and fed to
    the classification head(s).

    Args:
        pretrained: Forwarded to ``ImageOnly``.
        max_words: Forwarded to ``TextOnly``.
        hidden_dim: Base width of all projection layers.
        lstm_layers: Forwarded to ``TextOnly``.
        num_classes: Either an ``int`` (one head, ``self.fc_multi2``) or a
            ``dict`` with the keys ``"humour"``, ``"sarcasm"``,
            ``"offensive"`` and ``"motivational"``.
        device: Device handed to both encoders.

    Raises:
        Exception: If ``num_classes`` is neither a ``dict`` nor an ``int``.
    """

    def __init__(
        self, pretrained, max_words, hidden_dim, lstm_layers, num_classes, device
    ):
        super().__init__()
        self.device = device
        self.num_classes = num_classes
        wide = hidden_dim * 2

        # Image encoder with its final head turned into a pass-through.
        image_encoder = ImageOnly(pretrained, hidden_dim, 1, device)
        image_encoder.fc = torch.nn.Identity()
        self.img_fe = image_encoder

        # Text encoder with its final head turned into a pass-through.
        text_encoder = TextOnly(max_words, hidden_dim, lstm_layers, 1, device)
        text_encoder.fc2 = torch.nn.Identity()
        self.txt_fe = text_encoder

        self.fc_img1 = torch.nn.Linear(wide, wide)
        self.fc_img2 = torch.nn.Linear(wide, hidden_dim)
        self.fc_txt1 = torch.nn.Linear(wide, wide)
        self.fc_txt2 = torch.nn.Linear(wide, hidden_dim)
        self.fc_multi1 = torch.nn.Linear(wide, hidden_dim)

        if isinstance(num_classes, dict):
            # Heads are created in this fixed order: hum, sar, off, mot.
            for attr, task in (
                ("fc_hum", "humour"),
                ("fc_sar", "sarcasm"),
                ("fc_off", "offensive"),
                ("fc_mot", "motivational"),
            ):
                setattr(self, attr, torch.nn.Linear(hidden_dim, num_classes[task]))
        elif isinstance(num_classes, int):
            self.fc_multi2 = torch.nn.Linear(hidden_dim, num_classes)
        else:
            raise Exception(
                f"num_classes can only be either a dict or an int, {num_classes} is found"
            )

    def forward(self, x):
        """Return the logits for the batch dict ``x`` (passed to both encoders).

        Returns a list of four tensors (humour, sarcasm, offensive,
        motivational) when ``num_classes`` is a dict, otherwise the output of
        ``self.fc_multi2``.
        """
        image_features = self.img_fe(x)
        y_img = F.relu(self.fc_img2(torch.tanh(self.fc_img1(image_features))))

        text_features = self.txt_fe(x)
        y_txt = F.relu(self.fc_txt2(torch.tanh(self.fc_txt1(text_features))))

        fused = torch.cat([y_img, y_txt], dim=-1)
        y = F.relu(self.fc_multi1(fused))

        if not isinstance(self.num_classes, dict):
            return self.fc_multi2(y)
        heads = (self.fc_hum, self.fc_sar, self.fc_off, self.fc_mot)
        return [head(y) for head in heads]

In [18]:
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from sklearn.metrics import roc_auc_score, accuracy_score

import time
import random
import numpy as np
import pickle
import json

In [19]:
# Seed PyTorch, Python's random module and NumPy with the same fixed value.
for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    seed_fn(42)

# The experiments below require a GPU; stop early when none is present.
if not torch.cuda.is_available():
    raise Exception("No GPUs available!")
device = torch.device("cuda:0")

In [21]:
# Create the output folder; exist_ok makes the cell safe to re-run
# (os.mkdir raises FileExistsError when the folder already exists).
os.makedirs("results", exist_ok=True)

In [27]:
# --- Paths ---------------------------------------------------------------
datdir = "data/"
savpath = "results/baselines_fbhm.pickle"

# --- Image preprocessing -------------------------------------------------
INIT_IMG_SIZE = 256  # side length images are resized to before cropping
IMG_SIZE = 224  # side length of the images fed to the model
imnet_mean = [0.485, 0.456, 0.406]
imnet_std = [0.229, 0.224, 0.225]

# --- Text preprocessing --------------------------------------------------
VOCAB_SIZE = 2222  # 2222 words have at least 5 occurrences
SEQ_LEN = 20  # 90% quantile is 20, 95% is 24 and 99% is 34

# --- Training ------------------------------------------------------------
batch_size = 128
epochs = 10

# --- Hyperparameter grid -------------------------------------------------
# The full set of modalities is ["image", "text", "multi"]; only the
# multimodal model is run here.
modalities = ["multi"]
learning_rates = [1e-2, 1e-3, 1e-4, 1e-5]
pretrained = [True]
hidden_dims = [64, 128, 256]
lstm_layers = [1, 3]

# One result dict per trained configuration is appended to this list.
results = []

In [30]:
# Training images: resize, random crop, random horizontal flip, then normalise.
train_transform = transforms.Compose(
    [
        transforms.Resize((INIT_IMG_SIZE, INIT_IMG_SIZE)),
        transforms.RandomCrop(IMG_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(imnet_mean, imnet_std),
    ]
)

# Validation images: deterministic resize to the model input size, then normalise.
val_transform = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(imnet_mean, imnet_std),
    ]
)

# Arguments shared by both splits.
dataset_kwargs = dict(directory=datdir, vocab_size=VOCAB_SIZE, seq_len=SEQ_LEN)
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=2,
    pin_memory=True,
    drop_last=False,
)

train_ds = FBHM(train=True, transform=train_transform, **dataset_kwargs)
train_dl = DataLoader(train_ds, shuffle=True, **loader_kwargs)

val_ds = FBHM(train=False, transform=val_transform, **dataset_kwargs)
val_dl = DataLoader(val_ds, shuffle=False, **loader_kwargs)

In [31]:
# Create the checkpoint folder; exist_ok makes the cell safe to re-run
# (os.mkdir raises FileExistsError when the folder already exists).
os.makedirs("ckpt", exist_ok=True)

In [None]:

# Grid search: for every modality, train one model per combination of
# learning rate, pretrained flag, hidden size and number of LSTM layers,
# evaluating on the validation loader after each epoch.
for modality in modalities:
    ckpt = f"ckpt/baseline_fbhm_{modality}.pt"
    # Best validation AUC seen so far for this modality, across ALL
    # hyperparameter combinations (it is not reset per configuration).
    max_auc = 0
    for learning_rate in learning_rates:
        for pretrained_ in pretrained:
            for hidden_dim in hidden_dims:
                for lstm_layers_ in lstm_layers:

                    # Skip combinations that do not apply to the modality:
                    # "text" with pretrained=True, and "image" with any LSTM
                    # depth other than the first one in the list.
                    if (modality == "text" and pretrained_ is True) or (
                        modality == "image" and lstm_layers_ in lstm_layers[1:]
                    ):
                        continue

                    print(
                        f"m={modality} - lr={learning_rate} - pt={pretrained_} - hd={hidden_dim} - l={lstm_layers_}"
                    )

                    # NOTE(review): the model is always MultiModal, whatever
                    # `modality` is; this is only consistent while
                    # modalities == ["multi"].
                    # max_words is VOCAB_SIZE + 2 because the dataset adds
                    # "<pad>" and "<unk>" to the text vocabulary.
                    model = MultiModal(
                            pretrained=pretrained_,
                            max_words=VOCAB_SIZE + 2,
                            hidden_dim=hidden_dim,
                            lstm_layers=lstm_layers_,
                            num_classes=1,
                            device=device,
                        ).to(device)

                    criterion = torch.nn.BCEWithLogitsLoss()
                    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

                    start = time.time()
                    # Per-epoch validation metrics of this configuration.
                    val_loss = []
                    val_acc = []
                    val_auc = []
                    for epoch in range(epochs):
                        # ---- training pass ----
                        model.train()
                        for data in train_dl:
                            labels = data["label"].to(device)
                            optimizer.zero_grad()
                            outputs = model(data)
                            # Labels are cast to float and reshaped to a
                            # column so they line up with the model output.
                            loss_ = criterion(outputs, labels.float().view(-1, 1))
                            loss_.backward()
                            optimizer.step()

                        # ---- validation pass ----
                        model.eval()
                        val_loss_ = 0
                        y_score = []
                        y_true = []
                        y_pred = []
                        with torch.no_grad():
                            for data in val_dl:
                                labels = data["label"].to(device)
                                outputs = model(data)
                                val_loss_ += criterion(
                                    outputs, labels.float().view(-1, 1)
                                ).item()
                                # Sigmoid turns logits into scores; a score
                                # above 0.5 counts as a positive prediction.
                                score = torch.sigmoid(outputs.data)
                                y_score.extend(score.cpu().numpy().tolist())
                                y_true.extend(labels.cpu().numpy().tolist())
                                y_pred.extend((score > 0.5).cpu().numpy().tolist())

                        # Mean of the per-batch losses.
                        val_loss_ /= len(val_dl)
                        val_acc_ = accuracy_score(y_true, y_pred)
                        val_auc_ = roc_auc_score(y_true, y_score)

                        # New best AUC: save the whole model object and write
                        # its hyperparameters and metrics as JSON next to it.
                        if val_auc_ > max_auc:
                            max_auc = val_auc_
                            torch.save(model, ckpt)
                            with open(f"{ckpt[:-3]}.txt", "w") as file:
                                file.write(
                                    json.dumps(
                                        {
                                            "modality": modality,
                                            "lr": learning_rate,
                                            "pretrained": pretrained_,
                                            "hidden_dim": hidden_dim,
                                            "lstm_layers": lstm_layers_,
                                            "val_loss": val_loss_,
                                            "val_acc": val_acc_,
                                            "val_auc": val_auc_,
                                            "epoch": epoch,
                                        }
                                    )
                                )

                        val_loss.append(val_loss_)
                        val_acc.append(val_acc_)
                        val_auc.append(val_auc_)

                        # Single learning-rate step: once half of the epochs
                        # are done, continue with a tenth of the initial rate.
                        if epoch == int(epochs / 2) - 1:
                            for g in optimizer.param_groups:
                                g["lr"] = learning_rate / 10

                    # Summary of this configuration ("time" is wall-clock
                    # seconds since training of this configuration started).
                    result = {
                        "modality": modality,
                        "lr": learning_rate,
                        "pretrained": pretrained_,
                        "hidden_dim": hidden_dim,
                        "lstm_layers": lstm_layers_,
                        "val_loss": val_loss,
                        "val_acc": val_acc,
                        "val_auc": val_auc,
                        "time": time.time() - start,
                    }
                    print(json.dumps(result, indent=4))
                    results.append(result)

                    # Rewrite the results file after every configuration so
                    # finished runs survive an interrupted sweep.
                    with open(savpath, "wb") as h:
                        pickle.dump(results, h, protocol=pickle.HIGHEST_PROTOCOL)

m=multi - lr=0.01 - pt=True - hd=64 - l=1


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 123MB/s]


{
    "modality": "multi",
    "lr": 0.01,
    "pretrained": true,
    "hidden_dim": 64,
    "lstm_layers": 1,
    "val_loss": [
        0.7392352968454361,
        0.8362051397562027,
        0.7179590910673141,
        0.7493745982646942,
        0.7803064584732056,
        0.789846882224083,
        0.794966533780098,
        0.7880399823188782,
        0.8234898746013641,
        0.8300914764404297
    ],
    "val_acc": [
        0.5,
        0.5,
        0.502,
        0.512,
        0.506,
        0.512,
        0.51,
        0.512,
        0.514,
        0.506
    ],
    "val_auc": [
        0.49968,
        0.49886400000000003,
        0.494664,
        0.5244960000000001,
        0.540848,
        0.539648,
        0.5436639999999999,
        0.54804,
        0.543792,
        0.5472239999999999
    ],
    "time": 1619.5324532985687
}
m=multi - lr=0.01 - pt=True - hd=64 - l=3




{
    "modality": "multi",
    "lr": 0.01,
    "pretrained": true,
    "hidden_dim": 64,
    "lstm_layers": 3,
    "val_loss": [
        0.7158435434103012,
        0.7312909811735153,
        0.7257838845252991,
        0.7380177527666092,
        0.7393548041582108,
        0.7630848586559296,
        0.7855041772127151,
        0.7545247226953506,
        0.7799421548843384,
        0.7759740650653839
    ],
    "val_acc": [
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.512,
        0.522,
        0.524,
        0.518
    ],
    "val_auc": [
        0.49168,
        0.47541600000000006,
        0.5338320000000001,
        0.534216,
        0.531736,
        0.5178959999999999,
        0.5267040000000001,
        0.530736,
        0.522736,
        0.532952
    ],
    "time": 1620.3137316703796
}
m=multi - lr=0.01 - pt=True - hd=128 - l=1




{
    "modality": "multi",
    "lr": 0.01,
    "pretrained": true,
    "hidden_dim": 128,
    "lstm_layers": 1,
    "val_loss": [
        0.7180063873529434,
        0.7061160504817963,
        0.7171470373868942,
        0.7582655102014542,
        0.7517638355493546,
        0.7717629969120026,
        0.7584666460752487,
        0.7616005688905716,
        0.7679149508476257,
        0.7735352367162704
    ],
    "val_acc": [
        0.5,
        0.5,
        0.5,
        0.504,
        0.5,
        0.5,
        0.5,
        0.5,
        0.5,
        0.5
    ],
    "val_auc": [
        0.493456,
        0.493024,
        0.51224,
        0.524432,
        0.4938,
        0.522568,
        0.529848,
        0.525904,
        0.51552,
        0.501568
    ],
    "time": 1594.0964946746826
}
m=multi - lr=0.01 - pt=True - hd=128 - l=3


