In [6]:
import logging
from pathlib import Path
from typing import Any, Dict


import torch
from omegaconf import DictConfig
from rich import print
from torch.utils.data import DataLoader

from generalist.generalist_datasets.coco.coco import (
    CocoCaption,
    CocoCaptionTargetTranform,
    CocoFilepaths,
    CocoImageTransforms,
)
from generalist.eval import preliminary_eval
from generalist.generalist_datasets.hf.summary import BillSum, XSum, SummaryTransforms
from generalist.generalist_datasets.utils.data_collate import collate_func_helper
from generalist.generalist_datasets.utils.multiple_datasets import ChainedDataset
from generalist.generalist_tokenizers import image_tokenizers, text_tokenizers
from generalist.models.embedding_model import EmbeddingModel
from generalist.models.model import GeneralistModel
from generalist.models.output_model import GeneralOutput
from generalist.predict import ImageCaptionPrediction
from generalist.utils.display.display import GeneralistDisplay
from generalist.utils.utils import get_hostname, save_checkpoint

import hydra
from hydra import initialize, compose

In [7]:
with initialize(version_base=None, config_path="../config"):
    cfg = compose(config_name=get_hostname())

In [8]:
model_save_dir = Path(cfg.model_save_dir)
display_flag = cfg.display.display_flag
device = cfg.device
context_length = cfg.context_length

learning_rate = cfg.training.learning_rate
batch_size = cfg.training.batch_size
n_epochs = cfg.training.n_epochs

model_dim = cfg.model.model_dim

image_tokenizer = image_tokenizers.ImageTokenizer(device=device)
text_tokenizer = text_tokenizers.TextTokenizerBert.from_pretrained("bert-base-uncased")

embedding_model = EmbeddingModel(model_dim=model_dim)
# output_model = GeneralClassificationOutput(model_dim=model_dim, num_classes=10, reduce_type="cls")
output_model = GeneralOutput(model_dim=model_dim, output_dim=text_tokenizer.vocab_size)
model = GeneralistModel(output_model=output_model, **cfg.model).to(device)

start_tokens = torch.Tensor([text_tokenizer.cls_token_id]).to(device).to(int)

embedding_model.to(device)
model.to(device)

loss_fn = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(
    [
        {"params": embedding_model.parameters()},
        {"params": model.parameters()},
    ],
    lr=learning_rate,
)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

tokenizers = [image_tokenizer, text_tokenizer]

text_tokenizer_kwargs = cfg.text_tokenizer

coco_filepaths = CocoFilepaths(base_dir=cfg.coco_dir, split="train")

coco_caption = CocoCaption(
    root=coco_filepaths.images_root,
    annFile=coco_filepaths.captions_filepath,
    transform=CocoImageTransforms.train,
    target_transform=CocoCaptionTargetTranform.get(text_tokenizer=text_tokenizer, text_tokenizer_kwargs=text_tokenizer_kwargs).train,
)

summary_dataset = XSum(
    text_transform=SummaryTransforms.make_transforms(text_tokenizer=text_tokenizer, text_tokenizer_kwargs=text_tokenizer_kwargs).train,
)
from evaluate import evaluator
task_evaluator = evaluator("summarization")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'TextTokenizerBert'.


loading annotations into memory...
Done (t=0.72s)
creating index...
index created!


Using custom data configuration default
Reusing dataset xsum (/data/graham/datasets/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


In [11]:
task_evaluator

<evaluate.evaluator.text2text_generation.SummarizationEvaluator at 0x7f0c00a45840>

In [None]:
class MyPipeline:
    def __init__(self):
        self.model = model
        self.embedding_model = embedding_model
    def __call__(self)

In [19]:
sample = summary_dataset[0]
embedded_data = embedding_model(sample.data.to(device))
embedded_tgt = embedding_model(sample.target.to(device))

tgt_mask = model.get_tgt_mask_tri(embedded_tgt=embedded_tgt)
logits = model(embedded_data, embedded_tgt=embedded_tgt, tgt_key_padding_mask=None, tgt_mask=tgt_mask)


In [20]:
logits

tensor([[[ 0.0952,  0.4816,  0.5421,  ...,  0.0427, -1.1204, -0.8917],
         [ 0.2325,  0.5230,  0.6460,  ...,  0.4650, -0.9121, -0.9191],
         [ 0.4001,  0.7019,  0.7704,  ...,  0.6375, -1.5120, -0.7964],
         ...,
         [ 0.7627,  0.5170,  0.5903,  ...,  0.2748, -1.2592, -0.8494],
         [ 0.6674,  0.7275,  0.8456,  ...,  0.3820, -1.1086, -1.1657],
         [ 0.8795,  0.6612,  0.7759,  ...,  0.6136, -1.0662, -0.9765]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)

In [27]:
task_evaluator.__class__

evaluate.evaluator.text2text_generation.SummarizationEvaluator

In [32]:
text_tokenizer.batch_decode(sample.data)



In [34]:
from evaluate import load
perplexity = load("perplexity", module_type="metric")


Downloading builder script: 100%|████████████████████████████████████████████████████████████████████████████████████████| 8.41k/8.41k [00:00<00:00, 3.77MB/s]


In [35]:
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [37]:
!pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting joblib
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=9889440783bb88370444dd9a1caefb14919c8c7144da2f81ffbf918dee637217
  Stored in directory: /home/graham/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: joblib, nltk, rouge_score
Successfully installed joblib-1.2.0 nltk-3.7 rouge_score-0.1.2


In [38]:
import evaluate

rouge_score = evaluate.load("rouge")

In [39]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

In [41]:
scores = rouge_score.compute(
    predictions=[1, 3, 5, 10], references=[13, 5, 10]
)
scores

ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: 1,
Input references: 13