In [37]:
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, FlavaModel, BertTokenizer, FlavaProcessor, AutoModelForMultipleChoice, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
import torchvision
from torchvision.io import read_image
import pandas as pd
import json

import data_util

In [38]:
# BERT tokenizer, used by the sentence-pair example in the next cell.
# Note: the name `tokenizer` is rebound to the FLAVA tokenizer further down.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [39]:
# Sentence-pair encoding demo: a context and a candidate continuation are
# passed as (text, text_pair) and come back as one encoded sequence.
s1, s2 = 'i have cake and i am hungry, so', 'i will eat a slice of cake'
tokens = tokenizer(text=s1, text_pair=s2)
tokens

{'input_ids': [101, 1045, 2031, 9850, 1998, 1045, 2572, 7501, 1010, 2061, 102, 1045, 2097, 4521, 1037, 14704, 1997, 9850, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [40]:
# Default FLAVA model plus the tokenizer published with the same checkpoint.
# Note: this rebinds `tokenizer`, replacing the bert-base-uncased one loaded above.
flava_checkpoint = 'facebook/flava-full'
model = FlavaModel.from_pretrained(flava_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(flava_checkpoint)

`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The value `text_config["id2label"]` will be overriden.
`multimodal_config_dict` is provided which will be used to initialize `FlavaMultimodalConfig`. The value `multimodal_config["id2label"]` will be overriden.
`image_codebook_config_dict` is provided which will be used to initialize `FlavaImageCodebookConfig`. The value `image_codebook_config["id2label"]` will be overriden.


In [41]:
# How the tokenizer combines `text` and `text_pair` when both are lists of
# strings: decoding the first encoded row shows which items were joined.
# Every item is labelled uniquely ("sentN partM") so the pairing is unambiguous.
tokens = tokenizer(['sent1 part1', 'sent2 part1'], ['sent1 part2', 'sent2 part2'])
tokenizer.decode(tokens['input_ids'][0])

'[CLS] sent1 part1 [SEP] sent1 part2 [SEP]'

In [42]:
# text only input
text = 'a photo of a dog'
inputs = tokenizer(text=[text], return_tensors='pt', padding='max_length', max_length=77)

# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    text_embedding = model.get_text_features(**inputs)

print(f'{inputs=}')
# Show the shape instead of dumping the whole tensor; the point of interest is
# that the sequence dimension follows max_length.
print(f'{text_embedding.shape=}')



inputs={'input_ids': tensor([[ 101, 1037, 6302, 1997, 1037, 3899,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [43]:
# Forward pass on the text-only inputs; the keys listed below show which
# outputs the model populates when no image is supplied.
with torch.no_grad():  # inference only, no autograd graph needed
    out = model(**inputs)
out.keys()

odict_keys(['text_embeddings', 'text_output'])

In [8]:
# multimodal input
processor = FlavaProcessor.from_pretrained("facebook/flava-full")

source_id = '-1IBHYS3L-Y'
text = 'Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then, the man adds wax to the windshield and cuts it.' # first training example context + ending 0
image = read_image(f'../data/hellaswag_images/{source_id}.png') # shape = torch.Size([3, 144, 192])

# One text and one image are passed, so the batch dimension is 1; the text
# side is padded to max_length=128.
inputs = processor(
    text=[text],
    images=[image],
    return_tensors='pt',
    padding='max_length',
    max_length=128,
)
print(f'{inputs.keys()=}')

# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# NOTE(review): the concrete sizes were never printed in this notebook — confirm
# each with `.shape`. The batch dimension should be 1 and the text length 128 here.
image_embeddings = outputs.image_embeddings # batch x (number of image patches + 1) x hidden size
text_embeddings = outputs.text_embeddings # batch x text sequence length x hidden size
multimodal_embeddings = outputs.multimodal_embeddings # batch x (image tokens + text tokens + extra special tokens) x hidden size

inputs.keys()=dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values'])


In [9]:
# List which outputs the model populated for the combined text + image input.
outputs.keys()

odict_keys(['image_embeddings', 'image_output', 'text_embeddings', 'text_output', 'multimodal_embeddings', 'multimodal_output'])

In [44]:
# generate encoded ds from jsonl

# Alternative loader kept for reference (reads a small CSV file instead):
# ds = load_dataset('csv', data_files='../data/csv/tiny_hellaswag_train.csv', split='train')
# data_util is a project-local module; jsonl2ds presumably returns a
# datasets.Dataset (the result is used with .map() below) — TODO confirm.
ds = data_util.jsonl2ds('../data/raw_data/hellaswag_train.jsonl')

def preprocess_data(batch, text_tokenizer=None, max_length=128):
    """Tokenize a batch of rows for multiple-choice training.

    Every row contributes one (first segment, second segment) pair per
    candidate ending: the first segment is the row's ``ctx_a`` and the second
    is its ``ctx_b`` joined to the ending with a single space. All pairs of
    the batch are tokenized in a single call and the result is regrouped so
    that each row owns the encodings of its own endings.

    Args:
        batch: Column-oriented batch, as supplied by
            ``Dataset.map(..., batched=True)``, with the keys ``'ctx_a'``,
            ``'ctx_b'``, ``'endings'`` (a list of endings per row) and
            ``'label'``.
        text_tokenizer: Callable used to tokenize the pairs. When omitted,
            the notebook-level ``tokenizer`` bound at call time is used.
        max_length: Length every pair is truncated / padded to.

    Returns:
        dict: one entry per field returned by the tokenizer, each a list with
        one element per row (that row's slice of the tokenizer output, one
        entry per ending), plus ``'label'`` copied unchanged from ``batch``.
    """
    tokenize = tokenizer if text_tokenizer is None else text_tokenizer

    first_sentences = []
    second_sentences = []
    # (start, stop) positions of each row's pairs inside the flat lists, so
    # rows are regrouped correctly whatever their number of endings.
    row_bounds = []
    for ctx_a, ctx_b, endings in zip(batch['ctx_a'], batch['ctx_b'], batch['endings']):
        start = len(first_sentences)
        for ending in endings:
            first_sentences.append(ctx_a)
            second_sentences.append(f'{ctx_b} {ending}')
        row_bounds.append((start, len(first_sentences)))

    encoding = tokenize(first_sentences,
                        second_sentences,
                        return_tensors='pt',
                        truncation=True,
                        max_length=max_length,
                        padding='max_length',)

    # Un-flatten: give each row back the encodings of its own endings.
    ret = {k: [v[start:stop] for start, stop in row_bounds] for k, v in encoding.items()}
    ret['label'] = batch['label']

    return ret

# Apply preprocess_data batch-wise; the tokenizer columns (input_ids,
# token_type_ids, attention_mask) are added alongside the original ones.
encoded_ds = ds.map(preprocess_data, batched=True)
print(encoded_ds)
# Optional sanity check: decode the candidate sequences of the first two rows
# and print each row's label.
# for i in range(2):
#     for j in range(4):
#         print(tokenizer.decode(encoded_ds['input_ids'][i][j]))
#     print(encoded_ds['label'][i])

Map:   0%|          | 0/14740 [00:00<?, ? examples/s]

Dataset({
    features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'split', 'split_type', 'label', 'endings', 'source_id', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 14740
})
[CLS] then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. [SEP] then, the man adds wax to the windshield and cuts it. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] then, the man writes over the snow covering the window 

# Trainer

In [45]:
# Hyperparameters consumed by the TrainingArguments cell below.
output_dir = './checkpoint'  # passed as TrainingArguments.output_dir
batch_size = 16  # used for both the per-device train and eval batch size
learning_rate = 2e-5
num_train_epochs = 5

In [46]:
# Shared training configuration, built from the hyperparameters defined above.
# NOTE(review): evaluation_strategy="epoch" is requested, but no eval_dataset is
# passed to either Trainer in this notebook — TODO confirm this is intended.
training_args = TrainingArguments(
    # where to write
    output_dir=output_dir,
    # schedule and optimisation
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    weight_decay=0.01,
    optim="adamw_torch",
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation
    evaluation_strategy="epoch",
)

# FLAVA

In [None]:
# Placeholder for FLAVA fine-tuning: a toy network stands in for the real model.
# TODO: replace with a FLAVA-based multiple-choice model; the recorded run of
# this section ended in errors (see the outputs below).
#
# Section-specific names are used on purpose: binding these objects to `model`
# and `training_args` would overwrite the pretrained FLAVA model loaded earlier
# and the shared training configuration that the BERT section passes to its
# own Trainer.
flava_placeholder_model = nn.Sequential(
    nn.Linear(10, 32),
    nn.ReLU(),
    nn.Linear(32, 1)
)

# Same settings as the shared configuration except for a batch size of 1.
flava_training_args = TrainingArguments(
    output_dir='./checkpoint',
    evaluation_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    optim="adamw_torch",
)

trainer = Trainer(model=flava_placeholder_model, args=flava_training_args,
                  train_dataset=encoded_ds
                  )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

TypeError: can only concatenate list (not "str") to list

In [None]:
# Run training with the FLAVA-section trainer built in the previous cell.
# The recorded run below stopped with a KeyError.
trainer.train()

  0%|          | 0/5 [00:00<?, ?it/s]

KeyError: 0

# BERT

In [48]:
# BERT with a multiple-choice head. The classifier weights are newly
# initialised (see the warning below), so the model needs fine-tuning.
bert_model = AutoModelForMultipleChoice.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Trainer for the BERT multiple-choice baseline on the encoded training set.
# NOTE(review): no eval_dataset is supplied although training_args requests
# per-epoch evaluation — TODO add one together with the other pending arguments.
bert_trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=encoded_ds,
    # !!! more args later
)

In [52]:
# Start fine-tuning the BERT baseline; the recorded run below was interrupted manually.
bert_trainer.train()

  0%|          | 0/4610 [00:00<?, ?it/s]

KeyboardInterrupt: 