In [1]:
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, get_scheduler
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import gc
from tqdm.auto import tqdm

In [2]:
# Load the tokenizer and the seq2seq model from the same 't5-base' checkpoint
# (tokenizer first, then the model weights).
tokenizer, model = (
    T5Tokenizer.from_pretrained('t5-base'),
    T5ForConditionalGeneration.from_pretrained('t5-base'),
)

In [3]:
# Pick the GPU only when CUDA is actually usable, otherwise fall back to CPU.
# `torch.cuda.is_available` must be *called*: the bare function object is
# always truthy, which would select 'cuda' even on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# Fetch the 'labeled_final' configuration of the PAWS dataset, then keep only
# the rows whose label is 1 (the pairs this notebook uses as paraphrases).
raw = load_dataset('paws', 'labeled_final')
paraphrases = raw.filter(lambda row: row['label'] == 1)
paraphrases

Reusing dataset paws (C:\Users\Pranav\.cache\huggingface\datasets\paws\labeled_final\1.1.0\09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Loading cached processed dataset at C:\Users\Pranav\.cache\huggingface\datasets\paws\labeled_final\1.1.0\09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34\cache-f8f4b05c63816056.arrow
Loading cached processed dataset at C:\Users\Pranav\.cache\huggingface\datasets\paws\labeled_final\1.1.0\09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34\cache-9bfc9aaee2aa1e33.arrow
Loading cached processed dataset at C:\Users\Pranav\.cache\huggingface\datasets\paws\labeled_final\1.1.0\09d8fae989bb569009a8f5b879ccf2924d3e5cd55bfe2e89e6dab1c0b50ecd34\cache-274702c64a43f78c.arrow





DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 21829
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 3536
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 3539
    })
})

In [5]:
# Batch the filtered splits in groups of 8; only the training split is shuffled.
train_loader = DataLoader(dataset=paraphrases['train'], batch_size=8, shuffle=True)
eval_loader = DataLoader(dataset=paraphrases['validation'], batch_size=8)

In [6]:
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [7]:
# Training length: one optimizer step per batch, for three passes over the
# training loader.
num_epochs = 3
num_training_steps = len(train_loader) * num_epochs

# Linear schedule with no warm-up, spanning the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [8]:
# Move the model onto the selected device; the returned module is the cell's
# displayed output.
model.to(device=device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [10]:
# Fine-tune the model to produce `sentence2` from `sentence1`, taking one
# optimizer step per batch for `num_epochs` passes over `train_loader`.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        # Tokenize the source sentences. Batches are padded to a common length,
        # so the attention mask must be kept and passed to the model; otherwise
        # the encoder attends to the pad tokens as if they were real input.
        encoded = tokenizer(batch['sentence1'], return_tensors="pt", padding=True)
        input_ids = encoded.input_ids.to(device)
        attention_mask = encoded.attention_mask.to(device)

        # Tokenize the targets and replace pad ids with -100, the index the
        # loss ignores, so padding positions do not contribute to the loss.
        labels = tokenizer(batch['sentence2'], return_tensors="pt", padding=True).input_ids.to(device)
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()

        optimizer.step()
        # NOTE(review): the scheduler step is disabled, so the learning rate
        # stays at the optimizer's initial value for the whole run - confirm
        # this is intended before re-enabling.
        #lr_scheduler.step()
        optimizer.zero_grad()

        # Drop references to this batch's tensors before the next iteration.
        del encoded
        del input_ids
        del attention_mask
        del labels
        del outputs
        progress_bar.update(1)

HBox(children=(FloatProgress(value=0.0, max=8187.0), HTML(value='')))

In [30]:
# Sample paraphrase candidates for one sentence from the fine-tuned model and
# print the distinct ones that differ from the input.
model.eval()
sentence = "From the merger of the Four Rivers Council and the Audubon Council , the Shawnee Trails Council was born."

# A single sentence needs no padding. The deprecated `pad_to_max_length=True`
# padded it out to the tokenizer's maximum length, making generation process
# mostly pad tokens.
encoding = tokenizer(sentence, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

with torch.no_grad():
    # Top-k / nucleus sampling. `early_stopping` is a beam-search option and
    # has no effect on pure sampling, so it is not passed. The variable name
    # is kept as `beam_outputs` although these are sampled sequences.
    beam_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=256,
        top_k=120,
        top_p=0.98,
        num_return_sequences=10
    )

print ("\nOriginal:")
print (sentence)
print ("\n")
print ("Paraphrases:")

# Keep each decoded candidate once, in generation order, skipping any that is
# the input sentence up to letter case.
final_outputs = []
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))


Original:
From the merger of the Four Rivers Council and the Audubon Council , the Shawnee Trails Council was born.


Paraphrases:
0: The Shawnee Trails Council was created from the merger of the Four Rivers Council and Audubon Council.
1: The Shawnee Trails Council was born from the merger of the Four Rivers Council and the Audubon Council.
2: From the merger of the four rivers council and the Audubon council, the Shawnee Trails Council was born.
3: The Shawnee Trails Council was born out of the merger of the Four Rivers Council and Audubon Council.
4: From the merger of the Four Rivers Council with the Audubon Council, the Shawnee Trails Council was born.
5: From the merger of Four Rivers Council and Audubon Council, the Shawnee Trails Council was born.
6: Through the merger of the Four Rivers Council and the Audubon Council, the Shawnee Trails Council was born.


In [None]:
# Report the GPU memory currently occupied by tensors on `device`.
torch.cuda.memory_allocated(device)

In [None]:
# Run a full garbage-collection pass; the call's return value is shown as the
# cell output.
gc.collect()