In [4]:
from datasets import load_dataset
dataset = load_dataset('glue', 'qqp')

Downloading:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/qqp (download: 39.76 MiB, generated: 106.55 MiB, post-processed: Unknown size, total: 146.32 MiB) to /root/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


In [5]:
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

In [6]:
import time
import random
import math
import spacy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from torchtext import data, vocab

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import seaborn as sns

In [7]:

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

2021-09-17 15:06:59.956169: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [8]:
train_data = train_data.filter(lambda example: example['label']==1)
train_data = train_data.remove_columns( ['idx', 'label'])
train_data = train_data.to_csv('train_data.csv')
train_data = pd.read_csv('train_data.csv')
train_data = train_data[['question1', 'question2']]
train_data= train_data[:20000]

  0%|          | 0/364 [00:00<?, ?ba/s]

In [9]:
class paraDataset(Dataset):
    def __init__(self, df, tokenizer, source_len):
        super().__init__()

        self.tokenizer = tokenizer
        self.data = df
        self.source_len = source_len
        self.summary_len = source_len
        self.text = df.question1
        self.para = df.question2
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, item):
        text = str(self.text[item])
        text = " ".join(text.split())

        para = str(self.para[item])
        para = " ".join(para.split())

        source = self.tokenizer.encode_plus(
            text,
            max_length=self.source_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt')
        
        target = self.tokenizer.encode_plus(
            para,
            max_length=self.summary_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt')

        return {
            "source_ids": source["input_ids"].flatten(),
            "source_mask": source["attention_mask"].flatten(),
            "target_ids": target["input_ids"].flatten(),
            "target_mask": target["attention_mask"].flatten()
        }

In [10]:
from transformers import AutoTokenizer,  T5ForConditionalGeneration
MODEL_NAME = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [11]:
train_dataset = paraDataset(train_data, tokenizer, 512)
train_data_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
sample = next(iter(train_data_loader))
sample['source_ids'].shape, sample['source_mask'].shape, sample['target_ids'].shape



(torch.Size([2, 512]), torch.Size([2, 512]), torch.Size([2, 512]))

In [12]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [13]:
def train(data_loader, model, tokenizer, optimizer, device):
    model.train()

    total_steps = len(data_loader)
    epoch_loss = 0

    for idx, batch in enumerate(data_loader):
        optimizer.zero_grad()

        ids = batch["source_ids"].to(device)
        mask = batch["source_mask"].to(device)

        target_ids = batch["target_ids"].to(device)
        
        y_ids = target_ids[:, :-1].contiguous()
        lm_labels = target_ids[:, 1:].clone().detach()
        lm_labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels
        )

        loss = outputs[0]
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()
        writer.add_scalar("Loss/train", loss, epoch)

        if idx%100 == 0:
            print(f"Step: {idx}/{total_steps} | Loss: {loss.item()}")
    
    return epoch_loss / total_steps

In [14]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [15]:
best_valid_loss = float('inf')

for epoch in range(1):
    start_time = time.time()
    train_loss = train(train_data_loader, model, tokenizer, optimizer, device)
    #val_loss = evaluate(val_data_loader, model, TOKENIZER, device)
    end_time = time.time()
   
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #if val_loss < best_valid_loss:
    #    best_valid_loss = val_loss
    #    torch.save(model.state_dict(), MODEL_PATH)
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\t Train Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):5.4f}")
    #print(f"\t Val Loss: {val_loss:.3f} | Val PPL: {np.exp(val_loss):5.4f}")
writer.flush()

Step: 0/10000 | Loss: 2.282130479812622
Step: 100/10000 | Loss: 0.8982489109039307
Step: 200/10000 | Loss: 4.225487232208252
Step: 300/10000 | Loss: 1.317039132118225
Step: 400/10000 | Loss: 2.7486627101898193
Step: 500/10000 | Loss: 2.058995485305786
Step: 600/10000 | Loss: 2.9621689319610596
Step: 700/10000 | Loss: 1.0606534481048584
Step: 800/10000 | Loss: 1.741492509841919
Step: 900/10000 | Loss: 0.9447327256202698
Step: 1000/10000 | Loss: 2.990053176879883
Step: 1100/10000 | Loss: 1.050997018814087
Step: 1200/10000 | Loss: 1.4592121839523315
Step: 1300/10000 | Loss: 2.662625789642334
Step: 1400/10000 | Loss: 1.0493813753128052
Step: 1500/10000 | Loss: 2.137808322906494
Step: 1600/10000 | Loss: 1.5480986833572388
Step: 1700/10000 | Loss: 2.8785765171051025
Step: 1800/10000 | Loss: 2.0084145069122314
Step: 1900/10000 | Loss: 2.365753650665283
Step: 2000/10000 | Loss: 1.8568999767303467
Step: 2100/10000 | Loss: 1.7412028312683105
Step: 2200/10000 | Loss: 2.6733286380767822
Step: 2300

In [32]:
torch.save(model, './model')

In [24]:
def inference(model, phrase, tokenizer, device):
    model.eval()

    paraphrase = "paraphrase: " + phrase

    source = tokenizer.encode_plus(
        phrase,
        max_length=512,
        pad_to_max_length=True,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt')
    
    with torch.no_grad():
        ids = source["input_ids"].to(device)
        mask = source["attention_mask"].to(device)

        generated_ids = model.generate(
            input_ids=ids,
            attention_mask=mask,
            max_length=512,
            num_beams=2,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True
        )

        summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

    return summary[0]

In [30]:
phrase = "I should have positive attitude"
summary = inference(model, phrase, tokenizer, device)
summary

'I should have a positive attitude towards my life.'