In [2]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

tokenizer = T5Tokenizer.from_pretrained("t5-base")
data_frame = pd.read_csv('news_summary.csv')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
data_frame["text"] = "summarize: " + data_frame["text"]

In [12]:
kwargs = {"d_ff": 512,
          "d_kv": 64,
          "d_model": 384,
          "decoder_start_token_id": 0,
          "dropout_rate": 0.1,
          "eos_token_id": 1,
          "feed_forward_proj": "gated-gelu",
          "initializer_factor": 1.0,
          "is_encoder_decoder": True,
          "layer_norm_epsilon": 1e-06,
          "model_type": "t5",
          "n_positions": 512,
          "num_decoder_layers": 12,
          "num_heads": 12,
          "num_layers": 2,
          "output_past": True,
          "pad_token_id": 0,
          "relative_attention_max_distance": 128,
          "relative_attention_num_buckets": 32,
          "tie_word_embeddings": False,
          "transformers_version": "4.23.1",
          "use_cache": True,
          "vocab_size": 32128}
config = T5Config(max_length=512, **kwargs)

In [13]:
config

T5Config {
  "d_ff": 512,
  "d_kv": 64,
  "d_model": 384,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "max_length": 512,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 2,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.25.1",
  "use_cache": true,
  "vocab_size": 32128
}

In [14]:
model = T5ForConditionalGeneration(config=config)

In [15]:
sum(m.numel() for m in model.parameters()) / 1e6

63.619584

In [27]:
from torch.utils.data import Dataset
import torch


class DatasetC(Dataset):
    def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        return len(self.target_text)

    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        #cleaning data so as to ensure data is in string type
        source_text = ' '.join(source_text.split())
        target_text = ' '.join(target_text.split())

        source = self.tokenizer.batch_encode_plus([source_text], max_length=self.source_len, pad_to_max_length=True,
                                                  truncation=True, padding="max_length", return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target_text], max_length=self.summ_len, pad_to_max_length=True,
                                                  truncation=True, padding="max_length", return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [41]:
source_length = config.max_length
target_length = config.max_length
device = 'cpu'

In [42]:
dataset = DatasetC(dataframe=data_frame, tokenizer=tokenizer, source_len=source_length, target_len=target_length,
                   source_text='text', target_text='headlines')

In [43]:
data = dataset.__getitem__(1)

In [44]:
data.keys()

dict_keys(['source_ids', 'source_mask', 'target_ids', 'target_ids_y'])

In [45]:

y = data['target_ids'].to(device, dtype=torch.long)
decoder_input = y[:, :-1].contiguous()
lm_labels = y[:, 1:].clone().detach()
lm_labels[y[:, 1:] == 0] = -100

input_id = data['source_ids'].to(device, dtype=torch.long)
mask = data['source_mask'].to(device, dtype=torch.long)

In [46]:
l = model(
    input_ids=input_id,
    attention_mask=mask,
    decoder_input_ids=decoder_input,
    labels=lm_labels
)

In [47]:
l

Seq2SeqLMOutput(loss=tensor(10.8785, grad_fn=<NllLossBackward0>), logits=tensor([[[-1.4653,  0.5827,  0.7939,  ...,  0.3943, -1.0565,  0.1616],
         [-0.9908, -0.6016,  0.7667,  ...,  1.4062, -0.5698,  0.4423],
         [-0.4742, -0.2349, -0.9143,  ...,  1.6782, -1.6007,  0.9491],
         ...,
         [ 7.4453, -0.0394,  0.3866,  ...,  0.2320, -0.6576,  0.9147],
         [ 5.3801,  0.3985,  0.5571,  ...,  0.2532, -0.8793,  0.6429],
         [ 6.5283, -0.2686,  0.7978,  ..., -0.2158, -0.4460,  1.5036]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.5582, -0.8425, -2.6876,  ...,  2.0932,  0.5175,  0.7685],
          [ 2.2800,  0.4031,  0.1079,  ..., -0.0919,  0.2806,  1.2037],
          [-1.6509,  0.0820, -1.7438,  ..., -0.8824,  0.3296,  1.3354],
          ...,
          [ 1.5762,  0.1912,  0.3511,  ...,  1.1633,  1.5136, -0.5548],
          [ 1.3656, -0.0196,  0.6012,  ...,  0.8258,  1.2681,  0.1557],
          [ 1.4685, -0.0143, -0.1048,  ...,  1.3831, 