In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import wandb

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
    )

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# connection to wandb
# The W&B API key is a secret and must not be stored in the notebook. It is
# taken from the WANDB_API_KEY environment variable; when that is not set the
# user is prompted for it (the input is not echoed).
# NOTE(review): a key was previously committed in this cell — it should be
# revoked and rotated in the W&B account settings.
YOUR_API_KEY = 'n/a'
if not os.environ.get("WANDB_API_KEY"):
    from getpass import getpass

    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")
wandb.init(project="dyna_T5", entity='cfg2')
run_name = wandb.run.name

[34m[1mwandb[0m: Currently logged in as: [33mcfg2[0m (use `wandb login --relogin` to force relogin)


In [None]:
# Locations of the MSR paraphrase CSV splits: training first, evaluation second.
train_path, val_path = (
    "data/msr_paraphrase_train.csv",
    "data/msr_paraphrase_eval.csv",
)

In [None]:
# Load both splits through the path variables defined in the previous cell so
# that the file locations are declared in exactly one place.
train = pd.read_csv(train_path)
eval_df = pd.read_csv(val_path)


In [None]:
# Preview the first rows of the evaluation split.
eval_df.head()

Unnamed: 0.1,Unnamed: 0,string_1,string_2
0,489,"Peterson, a former fertilizer salesman, is cha...","Peterson, 31, is now charged with murder in th..."
1,609,"Mr Kerkorian said: ""We believe that recent tra...",We believe that recent trading prices of MGM's...
2,717,The camp hosts summer religious retreats for c...,The Saint Sophia Camp hosts religious retreats...
3,216,"In January, Georgia's U.N. envoy Revaz Adamia ...","In January, it accused Russia of annexing the ..."
4,180,The new Mobile AMD Athlon 64 processors are nu...,"The Mobile 3200+, 3000+ and 2800+ cost $293, $..."


In [None]:
# setup model training
def set_seed(seed):
    """Seed Python's ``random``, NumPy and torch (plus every CUDA device, if any).

    Args:
        seed: integer seed handed unchanged to each random number generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)


class T5FineTuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5 conditional-generation model.

    ``hparams`` is an ``argparse.Namespace`` (or a plain ``dict``) whose entries
    are copied into ``self.hparams``.  This class reads ``model_name_or_path``,
    ``tokenizer_name_or_path``, ``weight_decay``, ``learning_rate``,
    ``adam_epsilon``, ``warmup_steps``, ``train_batch_size``,
    ``eval_batch_size``, ``num_train_epochs``, ``gradient_accumulation_steps``
    and ``n_gpu`` from it, and passes ``self.hparams`` on to the module-level
    ``get_dataset`` helper to build the train / validation datasets.
    """

    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()

        # Accept a plain dict as well as a Namespace; vars() only works on the latter.
        hparams_dict = dict(hparams) if isinstance(hparams, dict) else vars(hparams)
        self.hparams.update(hparams_dict)

        self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.tokenizer_name_or_path)

    def is_logger(self):
        """Return True when this process has global rank 0 (or lower)."""
        return self.trainer.global_rank <= 0

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Delegate to the wrapped T5 model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Run one forward pass on ``batch`` and return the loss.

        ``batch`` must provide ``source_ids``, ``source_mask``, ``target_ids``
        and ``target_mask``.
        """
        # Clone before masking so the caller's batch tensor is not modified.
        labels = batch["target_ids"].clone()
        # Padding positions are replaced by -100, the label value the model's
        # loss ignores.
        labels[labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )

        # With labels supplied, the first element of the model output is the loss.
        loss = outputs[0]

        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._step(batch)
        # self.log replaces the legacy {"log": ...} return key.
        self.log("train_loss", loss)
        return {"loss": loss}

    def training_epoch_end(self, outputs):
        """Log the mean of the per-batch training losses of the epoch."""
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("avg_train_loss", avg_train_loss)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch."""
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Log the mean validation loss of the epoch as ``val_loss``."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        # Logged (rather than returned) so that the value is available to
        # callbacks that monitor "val_loss" and to the progress bar.
        self.log("val_loss", avg_loss, prog_bar=True)

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        # Parameters whose name contains one of these substrings get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # Kept on the instance because train_dataloader() builds the LR
        # scheduler around this optimizer.
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None):
        """Apply one optimizer update, then advance the LR scheduler.

        The closure supplied by Lightning performs ``training_step`` and the
        backward pass for the current batch, so it has to be forwarded to
        ``optimizer.step``; otherwise that batch contributes no gradients.
        """
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Return the running loss (3 decimals) and the latest learning rate."""
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the training DataLoader and, as a side effect, ``self.lr_scheduler``.

        The scheduler is created here because its total step count is derived
        from the dataset length; it wraps ``self.opt``, which is set by
        ``configure_optimizers``.
        """
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="msr_paraphrase_train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)
        # Optimizer updates per epoch (after batching across GPUs and gradient
        # accumulation) times the number of epochs.
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader over the ``msr_paraphrase_eval`` split."""
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="msr_paraphrase_eval", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    """Callback that reports ``trainer.callback_metrics`` through ``logger``.

    The entries named "log" and "progress_bar" are always left out, and metrics
    are only reported when ``pl_module.is_logger()`` is true.
    """

    def on_validation_end(self, trainer, pl_module):
        """Log each callback metric, in sorted key order, after validation."""
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        for name in sorted(metrics):
            if name in ["log", "progress_bar"]:
                continue
            logger.info("{} = {}\n".format(name, str(metrics[name])))

    def on_test_end(self, trainer, pl_module):
        """Log each callback metric and write it to ``test_results.txt``.

        The file is created inside ``pl_module.hparams.output_dir``.
        """
        logger.info("***** Test results *****")
        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(results_path, "w") as writer:
            for name in sorted(metrics):
                if name in ["log", "progress_bar"]:
                    continue
                line = "{} = {}\n".format(name, str(metrics[name]))
                logger.info(line)
                writer.write(line)



# Default run configuration; it is later turned into an argparse.Namespace.
args_dict = {
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": 't5-small',
    "tokenizer_name_or_path": 't5-small',
    "max_seq_length": 64,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 64,
    "eval_batch_size": 64,
    "num_train_epochs": 10,
    "gradient_accumulation_steps": 2,
    "n_gpu": 1,
    "early_stop_callback": False,
    # if you want to enable 16-bit training then install apex and set this to true
    "fp_16": False,
    # you can find out more on optimisation levels here
    # https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "max_grad_norm": 1.0,
    "seed": 42,
    "report_to": "wandb",
}



# Take the checkpoint name from the run configuration instead of repeating the
# literal, so this tokenizer cannot drift from the one the model is built with.
tokenizer = T5Tokenizer.from_pretrained(args_dict["tokenizer_name_or_path"])



class ParaphraseDataset(Dataset):
    """Paraphrase pairs read from ``<data_dir>/<type_path>.csv``, pre-tokenized.

    The CSV must contain the columns ``string_1`` (source sentence) and
    ``string_2`` (target paraphrase).  Every row is tokenized once, at
    construction time, and padded / truncated to ``max_len`` tokens.

    Args:
        tokenizer: object exposing ``batch_encode_plus``; it is called with a
            one-element list of text and must return a mapping holding
            ``input_ids`` and ``attention_mask`` tensors.
        data_dir: directory containing the CSV file.
        type_path: file name of the split, without the ``.csv`` extension.
        max_len: sequence length every example is padded / truncated to.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=64):
        self.path = os.path.join(data_dir, type_path + '.csv')

        self.source_column = "string_1"
        self.target_column = "string_2"
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ids and attention masks of source and target for one example."""
        # squeeze(0) removes only the leading batch dimension added by
        # batch_encode_plus; a bare squeeze() would also collapse the sequence
        # dimension when max_len == 1.
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _encode(self, text):
        """Tokenize one string, padded and truncated to ``self.max_len`` tokens."""
        # Explicit padding/truncation replaces the deprecated pad_to_max_length
        # flag and the implicit truncation the tokenizer warns about.
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
        )

    def _build(self):
        """Tokenize every (source, target) row into ``self.inputs`` / ``self.targets``."""
        sources = self.data[self.source_column]
        targets = self.data[self.target_column]
        # Iterating over the column values (rather than .loc[0..n-1]) does not
        # depend on the frame having a default integer index.
        for input_, target in zip(sources, targets):
            # str() also turns missing values into text.
            input_ = "paraphrase: " + str(input_) + ' </s>'
            target = str(target) + " </s>"

            self.inputs.append(self._encode(input_))
            self.targets.append(self._encode(target))



# Smoke-test the dataset on the evaluation split and print one decoded pair.
dataset = ParaphraseDataset(tokenizer, 'data', 'msr_paraphrase_eval', 64)
print("Val dataset: ",len(dataset))

data = dataset[3]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))

# exist_ok=True creates the output directory only when it is missing, without
# the separate exists() check (and the race between check and creation).
os.makedirs('t5_paraphrase', exist_ok=True)

# Fill in the run-specific settings and expose the configuration with
# attribute access.
args_dict.update({'data_dir': 'data', 'output_dir': 't5_paraphrase', 'num_train_epochs':10,'max_seq_length':64})
args = argparse.Namespace(**args_dict)
print(args_dict)



# Checkpoint callback writing to args.output_dir, monitoring "val_loss" in
# "min" mode with save_top_k=5.
# NOTE(review): this callback is constructed but is not included in
# train_params below — confirm whether checkpoints are meant to be saved.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)

# Keyword arguments for the Lightning Trainer, derived from the run configuration.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "callbacks": [LoggingCallback()],
}



def get_dataset(tokenizer, type_path, args):
    """Build the ParaphraseDataset for one split.

    Args:
        tokenizer: tokenizer handed through to the dataset.
        type_path: name of the split (CSV file name without extension).
        args: configuration object providing ``data_dir`` and ``max_seq_length``.

    Returns:
        The constructed ``ParaphraseDataset``.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "data_dir": args.data_dir,
        "type_path": type_path,
        "max_len": args.max_seq_length,
    }
    return ParaphraseDataset(**dataset_kwargs)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val dataset:  507
paraphrase: In January, Georgia's U.N. envoy Revaz Adamia accused Russia of annexing the region and appealed to the Security Council to "assume effective leadership over the peace process."</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
In January, it accused Russia of annexing the region and appealed to the U.N. Security Council to "assume effective leadership over the peace process."</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
{'data_dir': 'data', 'output_dir': 't5_paraphrase', 'model_name_or_path': 't5-small', 'tokenizer_name_or_path': 't5-small', 'max_seq_length': 64, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 64, 'eval_batch_size': 64, 'num_train_epochs': 10, 'gradient_accumulation_steps': 2, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


In [None]:
# Build the LightningModule from the run configuration.
print("Initialize model")
model = T5FineTuner(args)

# train_params supplies the Trainer's keyword arguments.
trainer = pl.Trainer(**train_params)

print(" Training model")
trainer.fit(model)
print("training finished")

# Save the wrapped Hugging Face model (model.model) to 't5_paraphrase_small'.
print("Saving model")
model.model.save_pretrained('t5_paraphrase_small')
print("Saved model")

Initialize model


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


 Training model



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  rank_zero_warn(


Training: -1it [00:00, ?it/s]



Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

training finished
Saving model
Saved model


In [None]:
# NOTE(review): looks like a leftover scratch/debug cell — confirm it can be deleted.
print("t")

In [None]:
# Tokenizer named in the run configuration, and the fine-tuned weights that the
# training cell saved under 't5_paraphrase_small'.
tokenizer = T5Tokenizer.from_pretrained(args.tokenizer_name_or_path)
model_final = T5ForConditionalGeneration.from_pretrained('t5_paraphrase_small')

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print ("device ",device)
# Move the reloaded model to that device; `model` now refers to it.
model = model_final.to(device)

device  cuda


In [None]:
# Alternative probe sentences:
#sentence = "She was one of them, really, blithe and girlish in her manner and her tastes—video games, Harry Potter, the baffling pop music they listened to."
#sentence = "What are the ingredients required to bake a perfect cake?"
sentence = "What is the best possible approach to learn aeronautical engineering?"
# sentence = "Do apples taste better than oranges in general?"


# The prompt must carry exactly the prefix used for fine-tuning
# ("paraphrase: ", see ParaphraseDataset._build); the earlier spelling
# "paraphase:" did not match it.
text = "paraphrase: " + sentence + " </s>"


max_len = 64

# Pad / truncate to the same length as during training instead of padding to
# the tokenizer's maximum length with no truncation.
encoding = tokenizer.encode_plus(
    text, max_length=max_len, padding="max_length", truncation=True, return_tensors="pt"
)
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


# Sample five candidates with top-k / top-p filtering (do_sample=True, so these
# are sampled sequences despite the variable name).
beam_outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    do_sample=True,
    max_length=max_len,
    top_k=120,
    top_p=0.98,
    early_stopping=True,
    num_return_sequences=5
)


print ("\nOriginal Sentence::")
print (sentence)
print ("\n")
print ("Simplified Sentence:: ")
# Keep each distinct candidate once, dropping ones that merely repeat the input.
final_outputs =[]
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))


Original Sentence::
What is the best possible approach to learn aeronautical engineering?


Simplified Sentence:: 
0: How does aviation engineering work?
1: Who is the best approach to learn aeronautical engineering?
2: What is the best way to learn Aeronautical Engineering?
3: What is the best approach to learn Aeronautical engineering?


In [None]:
# Load the held-out test split of the paraphrase data.
df_test = pd.read_csv('data/msr_paraphrase_test.csv')

In [None]:
# Display the loaded test split.
df_test

Unnamed: 0.1,Unnamed: 0,string_1,string_2
0,820,The department's position threatens to alienat...,The department's stance disappointed some abor...
1,172,US District Judge William M. Hoeveler's remova...,U.S. District Judge William M. Hoeveler's remo...
2,817,The charges came after the federal government ...,The charges came after the federal government ...
3,486,Analysts surveyed by Reuters Research had been...,Analysts surveyed by First Call were expecting...
4,616,"News Corp., whose empire spans Hollywood's Twe...","News Corp., whose empire spans Hollywood's Twe..."
...,...,...,...
502,897,"Martin, 58, will be freed today after serving ...",Martin served two thirds of a five-year senten...
503,541,Mr. Rowland attended a party in South Windsor ...,Rowland was making an appearance at a holiday ...
504,597,Captain Robert Ramsey of US 1St Armored Divisi...,"Earlier, Captain Robert Ramsey of the First Ar..."
505,530,"A senior Whitehall official said: ""It devalued...","A senior Whitehall official said recently: ""It..."


In [None]:
def testing(sentence):

    
    text =  "paraphrase:" + sentence + " </s>"

    max_len = 64

    encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    beam_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=64,
        top_k=120,
        top_p=0.98,
        early_stopping=True,
        num_return_sequences=5
    )
    

    final_outputs =[]
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        if sent.lower() != sentence.lower() and sent not in final_outputs:
            final_outputs.append(sent)

    #for i, final_output in enumerate(final_outputs):
    #   print("{}: {}".format(i, final_output))
    #print(final_outputs)

    if not final_outputs:
        final_outputs= [""]
    
    return final_outputs[0]
    

In [None]:
# One generated paraphrase per source sentence, stored next to the reference.
df_test['prediction'] = df_test.string_1.apply(testing)

Token indices sequence length is longer than the specified maximum sequence length for this model (688 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Display the test split again, now including the `prediction` column.
df_test

Unnamed: 0.1,Unnamed: 0,string_1,string_2,prediction
0,820,The department's position threatens to alienat...,The department's stance disappointed some abor...,The department's position threatens to alienat...
1,172,US District Judge William M. Hoeveler's remova...,U.S. District Judge William M. Hoeveler's remo...,US District Judge William Hoeveler's will by h...
2,817,The charges came after the federal government ...,The charges came after the federal government ...,The charges came after the federal government ...
3,486,Analysts surveyed by Reuters Research had been...,Analysts surveyed by First Call were expecting...,News report reveals Reuters Research had avera...
4,616,"News Corp., whose empire spans Hollywood's Twe...","News Corp., whose empire spans Hollywood's Twe...","News Corp., founded after Hollywood's Twentiet..."
...,...,...,...,...
502,897,"Martin, 58, will be freed today after serving ...",Martin served two thirds of a five-year senten...,"Martin, 58, has served two thirds of his five-..."
503,541,Mr. Rowland attended a party in South Windsor ...,Rowland was making an appearance at a holiday ...,Ruth Rowland attended a party for families of ...
504,597,Captain Robert Ramsey of US 1St Armored Divisi...,"Earlier, Captain Robert Ramsey of the First Ar...",Report: A truck was reported to have exploded ...
505,530,"A senior Whitehall official said: ""It devalued...","A senior Whitehall official said recently: ""It...",The whitehall officials had a statement from W...


In [None]:
# index=False keeps the row index out of the file; writing it is what produces
# an extra "Unnamed: 0"-style column each time such a CSV is read back.
df_test.to_csv('data/msr_paraphrase_preds.csv', index=False)