In [1]:
from PIL import Image
import requests
from transformers import AutoProcessor, FlavaForPreTraining

model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
processor = AutoProcessor.from_pretrained("facebook/flava-full")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True, return_codebook_pixels=True, return_image_mask=True)

# outputs = model(**inputs)
# logits_per_image = outputs.contrastive_logits_per_image  # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The value `text_config["id2label"]` will be overriden.
`multimodal_config_dict` is provided which will be used to initialize `FlavaMultimodalConfig`. The value `multimodal_config["id2label"]` will be overriden.
`image_codebook_config_dict` is provided which will be used to initialize `FlavaImageCodebookConfig`. The value `image_codebook_config["id2label"]` will be overriden.


In [3]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'codebook_pixel_values', 'bool_masked_pos'])

In [4]:
for key, value in inputs.items():
    print(f'Type of {key}: {value.shape}')

Type of input_ids: torch.Size([1, 7])
Type of token_type_ids: torch.Size([1, 7])
Type of attention_mask: torch.Size([1, 7])
Type of pixel_values: torch.Size([1, 3, 224, 224])
Type of codebook_pixel_values: torch.Size([1, 3, 112, 112])
Type of bool_masked_pos: torch.Size([1, 14, 14])


In [10]:
inputs["pixel_values"].shape

torch.Size([1, 3, 224, 224])

In [6]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'codebook_pixel_values', 'bool_masked_pos'])

In [3]:
outputs["loss_info"]

FlavaLosses(mim=None, mlm=None, itm=None, global_contrastive=tensor(0., grad_fn=<MulBackward0>), mmm_image=tensor(7.1579, grad_fn=<MulBackward0>), mmm_text=None)

## Tokenize two descriptions at the same time

In Flava, there is a contrastive loss that aims to connect images with the appropriate description.

However, the FLAVA model expects a single string per item. For that reason, both descriptions, the correct and the incorrect have to be easily distinguishable. The way that we can distinguish between them is via[ the token type ids](https://huggingface.co/docs/transformers/glossary#token-type-ids). Which basically assign a 0 to the tokens of the first sentence and a 1 to the tokens of the second sentence:

```python
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]

# encoded_dict["token_type_ids"]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
```

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"

encoded_dict = tokenizer(sequence_a, sequence_b)
decoded = tokenizer.decode(encoded_dict["input_ids"])

print(decoded)
print(encoded_dict["token_type_ids"])

[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


# Prepare data for MLM with in FLAVA DataCollator

In [None]:
from transformers import DataCollatorForLanguageModeling, AutoTokenizer

model_name = "facebook/flava-full"
tokenizer = AutoTokenizer.from_pretrained(model_name)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.2, return_tensors="pt")

data_collator.torch_mask_tokens(inputs=inputs['input_ids'], special_tokens_mask=inputs['special_tokens_mask'])

del inputs['special_tokens_mask']

In [4]:
import datasets

pmd = datasets.load_dataset("facebook/pmd", "wit", use_auth_token=True, streaming=True)
pmd_train_head = pmd['train'].take(2)
pmd_train_head_with_images = pmd_train_head.map(fetch_images, batched=True, batch_size=100, fn_kwargs={"num_threads": 20})
datapoint = next(iter(pmd_train_head_with_images))

Downloading builder script:   0%|          | 0.00/42.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/19.8k [00:00<?, ?B/s]

NameError: name 'fetch_images' is not defined

In [2]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Explore how MLM is done with BERT

FLAVA is special because it does multiple pretraining objectives at the same time. One of them is Masked Language Modelin (MLM). In order to prepare the data for MLM in FLAVA we are going to start understanding how it is done in a simpler model such as BERT:

In [12]:
from transformers import AutoTokenizer, BertForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
# mask labels of non-[MASK] tokens
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

outputs = model(**inputs, labels=labels)
round(outputs.loss.item(), 2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.88

In [14]:
inputs

{'input_ids': tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
labels

tensor([[-100, -100, -100, -100, -100, -100, 3000, -100, -100]])