# Load datasets
---

In [1]:
import datasets
from xsum_dataset import XsumDataset

In [2]:
xsum_data_raw = datasets.load_dataset("xsum")

Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# train/val/test data
# xsum_train_data = XsumDataset(xsum_data_raw["train"])
# xsum_val_data = XsumDataset(xsum_data_raw["validation"])
# xsum_test_data = XsumDataset(xsum_data_raw["test"])

In [3]:
xsum_test_data = XsumDataset(xsum_data_raw["test"])

# concat data
xsum_data_raw_cc = datasets.concatenate_datasets(
    [xsum_data_raw["train"], xsum_data_raw["validation"], xsum_data_raw["test"]]
    )
xsum_concat_data = XsumDataset(xsum_data_raw_cc)

## select a sample

In [4]:
import random

random.seed(0)

### * one to be perturbed

In [5]:
# sample one bbcid
bbc_id = random.choice(list(xsum_test_data.data_by_id.keys()))
# or fix one: bbc_ids = ["33858956"]
bbc_id

'35616768'

In [6]:
# selected_data - dict with keys: (id, document, true_summary, (factuality_data, faithfulness_data))
selected_data = xsum_test_data.data_by_id[bbc_id]

# original_docs - documents to sumamrize
original_doc = selected_data["document"]
true_summary = selected_data["true_summary"]
print("original doc to summarize:\n", original_doc)

original doc to summarize:
 The agreement, reached late on Friday after two days of talks in Brussels, gives the UK power to limit some EU migrants' benefits.
It also includes a treaty change so the UK is not bound to "ever closer union" with other EU member states, he said.
EU exit campaigners said the "hollow" deal offered only "very minor changes".
Mr Cameron is set to the announce the date of a referendum on whether Britain should remain in the EU after a cabinet meeting which is happening at 10:00 GMT - the referendum is widely expected to be on Thursday, 23 June.
Once the date is announced, ministers will be allowed to campaign for whichever side they want - one of Mr Cameron's closest political allies Michael Gove has already been named as supporting the Leave camp. Others, such as Iain Duncan Smith are expected to follow - but a question mark remains over which way London Mayor Boris Johnson will jump.
The key points of the deal are:
The prime minster had to make concessions to

In [7]:
# sample one bbcid
ood_id = random.choice(list(xsum_test_data.data_by_id.keys()))
ood_selected_data = xsum_test_data.data_by_id[ood_id]

# original_docs - documents to sumamrize
ood_doc = ood_selected_data["document"]
print("ood doc to summarize:\n", ood_doc)

ood doc to summarize:
 The team is processing satellite images to show how rocks in a belt that stretches from Europe's Alps to China are slowly accumulating strain.
Movements on the scale of just millimetres per year are being sought.
The new maps are being made available to help researchers produce more robust assessments of seismic hazard.
The kind of change they are trying to chart is not noticeable in the everyday human sense, but over time will put faults under such pressure that they eventually rupture - often with catastrophic consequences.
"We may well discover regions that have very small strain rates that we have not been able to detect before," said Dr Richard Walters.
"And that may well tell us that earthquakes are more likely in some areas that traditionally have been thought of as being completely stable and not at risk of having earthquakes at all."
Dr Walters is affiliated to the UK Centre for Observation and Modelling of Earthquakes, Volcanoes and Tectonics (COMET).
H

## analyze beam search output

In [7]:
from generate_xsum_summary import load_summarization_model_and_tokenizer, generate_summaries, generate_token_entropy_metadata

In [8]:
import time
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
model_name = "facebook/bart-large-xsum"
model, tokenizer = load_summarization_model_and_tokenizer(model_name)

In [10]:
# input
inputs = tokenizer(
    original_doc,
    # max_length=1024,  # default is 1024 for 'facebook/bart-large-xsum'
    truncation=True,
    return_tensors="pt",
    padding=True,
)
input_token_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

In [135]:
# input
ood_inputs = tokenizer(
    ood_doc,
    # max_length=1024,  # default is 1024 for 'facebook/bart-large-xsum'
    truncation=True,
    return_tensors="pt",
    padding=True,
)
ood_input_token_ids = ood_inputs.input_ids.to(device)
ood_attention_mask = ood_inputs.attention_mask.to(device)

## Generate beam-search output
---
(https://huggingface.co/docs/transformers/main/en/internal/generation_utils#transformers.generation_utils.BeamSearchEncoderDecoderOutput)

* **sequences** (torch.LongTensor of shape (batch_size*num_return_sequences, sequence_length))
    * The generated sequences. The second dimension (sequence_length) is either equal to max_length or shorter if all batches finished early due to the eos_token_id.

* **sequences_scores** (torch.FloatTensor of shape (batch_size*num_return_sequences), optional, returned when output_scores=True is passed or when config.output_scores=True)
    * Final beam scores of the generated sequences.

* **scores** (tuple(torch.FloatTensor) optional, returned when output_scores=True is passed or when config.output_scores=True)
    * Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. (max_length-1,)-shaped tuple of torch.FloatTensor with each tensor of shape (batch_size*num_beams, config.vocab_size)).

* **beam_indices** (tuple(tuple(torch.LongTensor)), optional, returned when output_scores=True is passed or when config.output_scores=True)
    * Beam indices of generated token id at each generation step. (batch_size*num_return_sequences)-shaped tuple of (max_length-1,)-shaped tuples of scalar torch.LongTensor tensors.

* \+ attentions, hidden states

In [12]:
num_beams = 10

In [25]:
import torch
torch.random.seed = 0

In [12]:
torch.cuda.empty_cache()

In [16]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 3            |        cudaMalloc retries: 3         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   22957 MB |   22957 MB |   49934 MB |   26977 MB |
|       from large pool |   22952 MB |   22952 MB |   49906 MB |   26953 MB |
|       from small pool |       5 MB |       6 MB |      28 MB |      23 MB |
|---------------------------------------------------------------------------|
| Active memory         |   22957 MB |   22957 MB |   49934 MB |   26977 MB |
|       from large pool |   22952 MB |   22952 MB |   49906 MB |   26953 MB |
|       from small pool |       5 MB |       6 MB |      28 MB |      23 MB |
|---------------------------------------------------------------

In [17]:
beam_multi_output = model.generate(
    input_token_ids,
    do_sample=True, 
    max_length=100, 
    top_p=0.92, 
    top_k=0,
    num_return_sequences=100,
    
#     num_beams=num_beams,
#     max_length=150,
#     early_stopping=False,  # check
#     return_dict_in_generate=True,
#     output_scores=True,
)

RuntimeError: CUDA out of memory. Tried to allocate 240.00 MiB (GPU 0; 23.65 GiB total capacity; 22.42 GiB already allocated; 181.44 MiB free; 22.45 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [14]:
beam_es_multi_output = model.generate(
    input_token_ids,
    num_beams=num_beams,
    num_return_sequences=num_beams,
    max_length=150,
    early_stopping=True,  # check
    return_dict_in_generate=True,
    output_scores=True,
)

In [15]:
[tokenizer.decode(
    seq, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
for seq in beam_es_multi_output.sequences]

['David Cameron has said the UK will never be part of a "European superstate" after reaching a deal with other EU leaders to renegotiate its membership.',
 'David Cameron has said the UK will never be part of a "European superstate" after reaching a deal with EU leaders to renegotiate the country\'s membership.',
 'David Cameron has said the UK will never be part of a "European superstate" after agreeing a deal with other EU leaders to stay in the bloc.',
 'David Cameron has said the UK will never be part of a " European superstate" after reaching a deal with other EU leaders to renegotiate its membership.',
 'David Cameron has said the UK will never be part of a "European superstate" after reaching a deal with other EU leaders on renegotiating its membership.',
 'David Cameron has said the UK will never be part of a "European superstate" after reaching a deal with EU leaders to renegotiate its membership.',
 'David Cameron has said the UK will never be part of a "European superstate" 

In [31]:
[tokenizer.decode(
    seq, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
for seq in beam_multi_output] #.sequences]

["David Cameron has said he has achieved the reforms he wanted in the UK's renegotiations with the European Union.",
 'David Cameron has said the UK will "never be part of a European superstate" after securing a deal to stay in the EU.',
 'David Cameron has said the UK\'s "special status" in the European Union has been secured in a deal with other EU leaders.',
 'David Cameron has said he has secured a "historic deal" on the UK\'s membership of the European Union.',
 "David Cameron has said he has achieved the reforms he wanted in the UK's renegotiations with the European Union.",
 'David Cameron has said the UK\'s deal to renegotiate its membership of the European Union is a "once-in-a-generation moment" for Britain.',
 'David Cameron has said the UK will "never be part of a European superstate" after reaching a deal with EU leaders to renegotiate the country\'s membership.',
 'David Cameron has said the UK will never be part of a "European superstate" after reaching a deal with other

In [34]:
true_summary

'David Cameron says a deal struck with EU leaders will give the UK "special status" and he will campaign with his "heart and soul" to stay in the union.'

In [None]:
tmp_selected_logit = torch.index_select(logits, 0, beam_indices)[0][None, :] # -> 1, 33, 50264
tmp_selected_logit.shape

In [157]:
beam_multi_output.sequences[0]

tensor([    2,  8773,  5628,    34,    26,     5,   987,    40,   393,    28,
          233,     9,    10,    22, 17108,  2422,  4897,   113,    71,  3970,
           10,   432,    19,    97,  1281,   917,     7, 20663,   877,     5,
          247,    18,  6332,     4,     2], device='cuda:0')

In [57]:
beam_multi_output.sequences_scores

tensor([-0.6115, -0.6121, -0.6239, -0.6271, -0.6271, -0.6273, -0.6303, -0.6460,
        -0.6462, -0.6574], device='cuda:0')

In [58]:
# remove sos token
gen_sequences = beam_multi_output.sequences[:, 1:]
gen_sequences.shape

torch.Size([10, 33])

In [64]:
gen_labels = gen_sequences.masked_fill(gen_sequences==tokenizer.pad_token_id, -100)
gen_labels

tensor([[ 8773,  5628,    34,    26,     5,   987,    40,   393,    28,   233,
             9,    10,    22, 17108,  2422,  4897,   113,    71,  3970,    10,
           432,    19,    97,  1281,   917,     7, 20663,   877,    63,  6332,
             4,     2,  -100],
        [ 8773,  5628,    34,    26,     5,   987,    40,   393,    28,   233,
             9,    10,    22, 17108,  2422,  4897,   113,    71,  3970,    10,
           432,    19,  1281,   917,     7, 20663,   877,     5,   247,    18,
          6332,     4,     2],
        [ 8773,  5628,    34,    26,     5,   987,    40,   393,    28,   233,
             9,    10,    22, 17108,  2422,  4897,   113,    71, 14176,    10,
           432,    19,    97,  1281,   917,     7,  1095,    11,     5,  7667,
             4,     2,  -100],
        [ 8773,  5628,    34,    26,     5,   987,    40,   393,    28,   233,
             9,    10,    22,   796,  2422,  4897,   113,    71,  3970,    10,
           432,    19,    97,  1281,  

In [14]:
not_pad = (gen_sequences != tokenizer.pad_token_id)  # False if padding

In [15]:
# let's stack the logits generated at each step to a tensor and transform
# logits to probs
probs = torch.stack(gen_output.scores, dim=1).softmax(-1)  # -> shape [3, 15, vocab_size]
probs.shape

torch.Size([10, 33, 50264])

In [16]:
# now we need to collect the probability of the generated token
# we need to add a dummy dim in the end to make gather work
gen_probs = torch.gather(probs, 2, gen_sequences[:, :, None]).squeeze(-1)
gen_probs.shape

torch.Size([10, 33])

In [17]:
# mask pad tokens
gen_probs = gen_probs.masked_fill(not_pad==0, -torch.inf)

In [18]:
gen_probs.log().nansum(1)

tensor([-123.5888, -195.8047, -255.2489, -250.1562, -185.2542, -261.7501,
        -291.4669, -256.7234, -215.1470, -266.0401], device='cuda:0')

## test with just one sequence
---
### 1. Beam search

In [146]:
num_beams=10

In [147]:
beam_output = model.generate(
    input_token_ids,
    num_beams=1,
    num_return_sequences=1,
    max_length=150,
    early_stopping=True,  # check
    return_dict_in_generate=True,
    output_scores=True,
)

In [148]:
# this is the generated summary
gen_sum = tokenizer.decode(
        beam_output.sequences[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
gen_sum

'David Cameron has said he has secured a "significant" deal with EU leaders on the UK\'s membership.'

In [21]:
# generated sequence - remove sos token
gen_sequences = beam_output.sequences[:, 1:]
gen_sequences.shape

torch.Size([1, 22])

* stack **logits** and collect

In [22]:
beam_scores = beam_output.scores

In [23]:
beam_logits_all = torch.stack(beam_scores, dim=1)
beam_logits_all.shape

torch.Size([1, 22, 50264])

In [24]:
beam_logit = torch.gather(beam_logits_all, 2, gen_sequences[:, :, None]).squeeze(-1)
beam_logit.shape

torch.Size([1, 22])

* stack **probs** and collect

In [25]:
beam_probs_all = torch.stack(beam_scores, dim=1).softmax(-1)  # -> shape [3, 15, vocab_size]
beam_probs_all.shape

torch.Size([1, 22, 50264])

In [26]:
beam_probs = torch.gather(beam_probs_all, 2, gen_sequences[:, :, None]).squeeze(-1)
beam_probs.shape

torch.Size([1, 22])

In [27]:
beam_probs.log().nansum(1)

tensor([-17.2895], device='cuda:0')

* maybe I should include beam indices?

In [41]:
beam_indices = torch.stack(beam_output.beam_indices[0], dim=0)
beam_indices

tensor([0, 0, 0, 0, 0, 3, 1, 8, 4, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 3, 7, 5, 1], device='cuda:0')

In [45]:
stacked = torch.zeros((len(gen_sequences[0]) + 1, tokenizer.vocab_size - 1)).to(device)

for token_index, (beam_idx, beam_score) in enumerate(zip(beam_indices, beam_scores)):
    stacked[token_index,:] = beam_score[beam_idx]

stacked = stacked[None, :]
stacked.shape

torch.Size([1, 33, 50264])

In [176]:
tmp_gen_logits = torch.gather(stacked, 2, gen_sequences[:, :, None]).squeeze(-1)
tmp_gen_logits.shape

torch.Size([1, 32])

In [178]:
tmp_gen_logits

tensor([[ -0.7450,  -7.7351, -10.7854, -11.3415, -10.4671, -12.7027,  -1.6221,
          -7.2993, -10.0530,  -0.6768,  -0.0911, -14.9387,  -2.5317, -11.8655,
          -0.0671,  -0.0310,  -0.2152,  -0.3412,  -1.9294,  -0.2500,  -0.9693,
          -0.6250,  -1.4357,  -7.9370,  -0.4833,  -8.0090,  -1.6379,  -0.0937,
          -0.9335,  -0.2856,  -9.3447,  -0.1209]], device='cuda:0')

* model output

In [31]:
with torch.no_grad():
    model_output = model(input_ids=input_token_ids, labels=gen_sequences)

In [32]:
gen_sequences

tensor([[ 8773,  5628,    34,    26,    37,    34,  5288,    10,    22, 18880,
           113,   432,    19,  1281,   917,    15,     5,   987,    18,  6332,
             4,     2]], device='cuda:0')

In [37]:
model_probs_all = model_output.logits.softmax(-1)

In [38]:
model_probs_all.max(-1).indices == gen_sequences  # max is not always label sequences with beam search

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True]],
       device='cuda:0')

In [39]:
model_probs_all.shape

torch.Size([1, 22, 50264])

In [41]:
model_probs = torch.gather(model_probs, 2, gen_sequences[:, :, None]).squeeze(-1)

In [48]:
model_probs.shape

torch.Size([1, 22])

In [47]:
# log prob of the sequence = - loss
-model_probs.log().mean() == model_output.loss

tensor(True, device='cuda:0')

In [54]:
log_prob = -model_output.loss*len(gen_sequences[0])  # do we need to take product of sequence length?

* get batched input

In [97]:
with torch.no_grad():
    model_multi_output = model(input_ids=input_token_ids.repeat(num_beams, 1), labels=gen_labels)

In [109]:
with torch.no_grad():
    model_first_output = model(input_ids=input_token_ids, labels=gen_labels[0, :-1][None, :])

In [98]:
with torch.no_grad():
    model_second_output = model(input_ids=input_token_ids, labels=gen_labels[1, :][None, :])

In [113]:
from torch import nn
criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='none')

In [125]:
loss_multi = criterion(model_multi_output.logits.permute(0,2,1), gen_labels)

In [177]:
loss_multi.shape

torch.Size([10, 33])

In [132]:
seq_losses = -loss_multi.masked_fill(loss_multi==0., torch.nan).nansum(1)
seq_losses

tensor([-19.5676, -20.1987, -19.9657, -20.0667, -20.0671, -19.4462, -20.8011,
        -21.3187, -20.6768, -19.0633], device='cuda:0')

* ood

In [165]:
with torch.no_grad():
    ood_model_multi_output = model(input_ids=ood_input_token_ids.repeat(num_beams, 1), labels=gen_labels)

In [140]:
ood_loss_multi = criterion(ood_model_multi_output.logits.permute(0,2,1), gen_labels)

In [141]:
ood_seq_losses = -ood_loss_multi.masked_fill(ood_loss_multi==0., torch.nan).nansum(1)
ood_seq_losses

tensor([-144.3786, -141.6270, -145.0091, -146.9162, -150.7780, -139.9524,
        -140.1727, -143.6599, -143.9860, -143.6413], device='cuda:0')

In [144]:
seq_losses - ood_seq_losses

tensor([124.8110, 121.4283, 125.0434, 126.8494, 130.7109, 120.5062, 119.3715,
        122.3412, 123.3092, 124.5780], device='cuda:0')

In [145]:
torch.sum(seq_losses-ood_seq_losses) / num_beams

tensor(123.8949, device='cuda:0')

In [175]:
beam_multi_output.sequences_scores

tensor([-0.6034, -0.6115, -0.6121, -0.6239, -0.6271, -0.6271, -0.6273, -0.6303,
        -0.6365, -0.6401], device='cuda:0')

In [None]:
log_prob = -model_output.loss*len(gen_sequences[0])

In [65]:
gen_labels.shape

torch.Size([10, 33])

* how the loss is computed

In [185]:
model_output.logits.shape

torch.Size([1, 32, 50264])

In [186]:
gen_sequences

tensor([[ 8773,  5628,    34,    26,     5,   987,    40,   393,    28,   233,
             9,    10,    22, 17108,  2422,  4897,   113,    71,  3970,    10,
           432,    19,    97,  1281,   917,     7, 20663,   877,    63,  6332,
             4,     2]], device='cuda:0')

In [193]:
model_output.logits[0][1][gen_sequences[0][1]]

tensor(13.5911, device='cuda:0')

In [181]:
model_logits = torch.gather(model_output.logits, 2, gen_sequences[:, :, None]).squeeze(-1)
model_logits

tensor([[13.4391, 13.5911, 12.9642, 12.7980, 11.7870, 13.7139, 12.1136, 12.4357,
         13.0785, 13.0724, 13.3151, 13.0819, 13.5901, 13.9457, 14.9705, 16.1668,
         13.1780, 12.9528, 11.8507, 13.5252, 13.6520, 12.9519, 12.4470, 14.7332,
         13.7435, 12.3594, 12.0541, 14.5040, 11.8986, 14.0034, 13.2752, 13.1118]],
       device='cuda:0')

In [182]:
gen_logits

tensor([[ -0.7450,  -0.1185,  -0.4535,  -0.9146,  -1.3093, -12.5437,  -9.6778,
         -11.7671, -11.8425, -16.4339, -16.9507,  -0.2015,  -0.3638,  -0.8337,
          -0.0625,  -0.0311,  -0.2149,  -0.3307,  -1.9473,  -0.2904,  -0.5547,
          -0.6038,  -1.4306,  -0.2682, -11.4617,  -1.0268,  -2.3290, -11.0826,
          -0.9402,  -6.5864,  -0.1513,  -0.1212]], device='cuda:0')

In [184]:
tmp_gen_logits

tensor([[ -0.7450,  -7.7351, -10.7854, -11.3415, -10.4671, -12.7027,  -1.6221,
          -7.2993, -10.0530,  -0.6768,  -0.0911, -14.9387,  -2.5317, -11.8655,
          -0.0671,  -0.0310,  -0.2152,  -0.3412,  -1.9294,  -0.2500,  -0.9693,
          -0.6250,  -1.4357,  -7.9370,  -0.4833,  -8.0090,  -1.6379,  -0.0937,
          -0.9335,  -0.2856,  -9.3447,  -0.1209]], device='cuda:0')

In [57]:
torch.isclose(model_logits, gen_logits)  # logits are the same

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True]],
       device='cuda:0')

In [28]:
# loss is - negative log probability
-model_probs.log().mean() == model_output.loss

NameError: name 'model_probs' is not defined

tensor(0.7859, device='cuda:0')

In [30]:
logits.shape

torch.Size([10, 33, 50264])

In [None]:
gen_probs = torch.gather(logits, 2, gen_sequences[:, :, None]).squeeze(-1)

In [61]:
with torch.no_grad():
    model_output = model(input_ids=input_token_ids, labels=gen_output.sequences[:, :])

In [197]:
model_output.logits.shape

torch.Size([1, 32, 50264])

In [63]:
model_probs = model_output.logits.softmax(-1)

tensor(1.3801, device='cuda:0')

In [12]:
model_output.logits

NameError: name 'model_output' is not defined

In [65]:
gen_output.scores[0].shape

torch.Size([10, 50264])

In [None]:
with torch.no_grad():
    model_output = model(input_ids=input_token_ids.repeat(num_beams, 1), labels=model_output.sequences)

In [28]:
tmp_probs = torch.tensor([[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]])

In [29]:
tmp_probs.prod(-1)

tensor([0.0140, 0.0300])

In [32]:
tmp_probs.log().sum(1)

tensor([-4.2687, -3.5066])

In [33]:
tmp_probs.logsumexp(1).exp()

tensor([4.3403, 4.2200])

In [81]:
model_output.sequences.shape

torch.Size([10, 34])

In [None]:
eval_batch[“targets”][eval_batch[“targets”]==tokenizer.convert_tokens_to_ids(tokenizer.pad_token)] = -100

In [92]:
input_token_ids.repeat(num_beams, 1).shape

torch.Size([10, 1024])

In [93]:
model_output.sequences.shape

torch.Size([10, 34])

In [94]:
with torch.no_grad():
    outputs = model(input_ids=input_token_ids.repeat(num_beams, 1), labels=model_output.sequences)

In [101]:
tokenizer.pad_token_id

1

In [102]:
PAD_IDX = 1

In [100]:
from torch import nn

In [103]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, reduction='none')

In [112]:
logits = outputs.logits

TypeError: 'Tensor' object is not callable

In [121]:
criterion(logits.permute(0,2,1), model_output.sequences).mean(dim=1).log()

tensor([0.2923, 0.3051, 0.3011, 0.3032, 0.3042, 0.2882, 0.3183, 0.3297, 0.3172,
        0.2812], device='cuda:0')

AttributeError: 'Seq2SeqLMOutput' object has no attribute 'sequences'

In [None]:
criterion(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

In [None]:
criterion(outputs.logits, )