In [1]:
from datetime import datetime
from typing import Optional

import datasets
import torch
import pytorch_lightning as pl

from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from pytorch_lightning import LightningDataModule, LightningModule, Trainer

AVAIL_GPUS = min(1, torch.cuda.device_count())


class GLUEDataModule(LightningDataModule):

    glue_task_num_labels = {
        "adversarial_nli_r3": 3,
        "lexical": 3,
        "boolean": 3,
        "comparative": 3,
        "conditional": 3,
        "counting": 3,
        "negation": 3,
        "quantifier": 3,
        "transitive": 2,
        "hypernymy": 2,
        "hyponymy": 2,
        "ner": 2,
        "verbcorner": 2,
        "verbnet": 2,
        "syntactic_alternation": 2,
        "syntactic_variation": 2,
        "monotonicity_infer": 3,
        "syllogism": 2,
        "coreference": 3,
        "puns": 3,
        "sentiment": 2,
        "kg_relations": 2,
        "context_align": 3,
        "sprl": 2,
        "atomic": 3,
        "social_chem": 3,
        "socialqa": 3,
        "physicalqa": 3,
        "logiqa": 3,
        "ester": 3,
        "cosmoqa": 3,
        "drop": 3,
        "entailment_tree": 3,
        "proof_writer": 2,
        "temporal": 2,
        "spatial": 3,
        "counterfactual": 3
    }

    loader_columns = [
        "datasets_idx",
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_positions",
        "end_positions",
        "labels",
    ]

    def __init__(
        self,
        model_name_or_path: str,
        task_name: str = "atomic",
        max_seq_length: int = 128,
        train_batch_size: int = 8,
        eval_batch_size: int = 16,
        **kwargs,
    ):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size

        self.text_fields = ['premise', 'hypothesis']
        self.num_labels = self.glue_task_num_labels[task_name]
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def setup(self, stage: str):
        self.dataset = datasets.load_dataset("curriculum_load_dataset.py", self.task_name)

        for split in self.dataset.keys():
            self.dataset[split] = self.dataset[split].map(
                self.convert_to_features,
                batched=True,
                remove_columns=["label"],
            )
            self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]
            self.dataset[split].set_format(type="torch", columns=self.columns)

        self.eval_splits = [x for x in self.dataset.keys() if "validation" in x]

    def prepare_data(self):
        datasets.load_dataset("curriculum_load_dataset.py", self.task_name)
        AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def train_dataloader(self):
        return DataLoader(self.dataset["train"], batch_size=self.train_batch_size)

    def val_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["validation"], batch_size=len(self.dataset["validation"]))
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def test_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["test"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def convert_to_features(self, example_batch, indices=None):

        # Either encode single sentence or sentence pairs
        if len(self.text_fields) > 1:
            texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
        else:
            texts_or_text_pairs = example_batch[self.text_fields[0]]

        # Tokenize the text/text pairs
        features = self.tokenizer.batch_encode_plus(
            texts_or_text_pairs, max_length=self.max_seq_length, pad_to_max_length=True, truncation=True
        )

        # Rename label to labels to make it easier to pass to model forward
        features["labels"] = example_batch["label"]

        return features

In [2]:
import logging
import jiant.utils.python.io as py_io
from jiant.utils.zlog import ZLogger

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO
)
logger = logging.getLogger(__name__)

def init_log_writer(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    return ZLogger(output_dir, overwrite=True)

class GLUETransformer(LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        train_loader: DataLoader,
        task_name: str,
        learning_rate: float = 1e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 1000,
        weight_decay: float = 0.0,
        train_batch_size: int = 8,
        eval_batch_size: int = 16,
        eval_splits: Optional[list] = None,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()
        self.log_writer = init_log_writer(f"./train_dynamics/{task_name}/")
        self.train_loader = train_loader
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)
        self.metric = datasets.load_metric(
            "glue", 'mnli', experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
        )

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        difficult_loss = outputs[0]

        #data_with_loss = zip(batch, difficult_loss)
        #data_loss_dict = {}
        #for (data, ls) in data_with_loss:
        #    data_loss_dict[data] = ls
        #data_loss_pdict = pqdict(data_loss_dict)
        #ranked_batch = list(data_loss_pdict.popkeys())

        return difficult_loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]

        if self.hparams.num_labels >= 1:
            preds = torch.argmax(logits, axis=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()

        labels = batch["labels"]

        return {"loss": val_loss, "preds": preds, "labels": labels}

    def validation_epoch_end(self, outputs):
        preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
        labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x["loss"] for x in outputs]).mean()
        eval_metrics = self.metric.compute(
            predictions=preds, references=labels
        )
        self.log("val_loss", loss, prog_bar=True)
        self.log_dict(eval_metrics, prog_bar=True)
        logger.info(f"val_acc: {eval_metrics['accuracy']}")
        return loss

    def setup(self, stage=None) -> None:
        if stage != "fit":
            return

        # Calculate total steps
        tb_size = self.hparams.train_batch_size * max(1, self.trainer.gpus)
        ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs)
        self.total_steps = (len(self.train_loader.dataset) // tb_size) // ab_size

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 1e-5,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

In [5]:
import os
from pytorch_lightning.loggers import TensorBoardLogger

dm = GLUEDataModule(
    model_name_or_path="roberta-large",
    task_name="adversarial_nli_r3",
)
dm.setup("fit")

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=f"./runs/{dm.task_name}/roberta-large/checkpoints/",
    filename="checkpoint_best",
    monitor="val_loss",
    mode="min",
    save_top_k=2
)

train_params = dict(
    gpus=1,
    max_epochs=2,
    progress_bar_refresh_rate=1,
    checkpoint_callback=True,
    callbacks=[checkpoint_callback],
    logger=TensorBoardLogger(
        os.path.join("./runs", 'logs'),
        name=f"{dm.model_name_or_path}-{dm.task_name}",
        version='trial_1'
    ),
    precision=16
)



In [4]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

model = GLUETransformer(
    model_name_or_path="roberta-large",
    num_labels=dm.num_labels,
    eval_splits=dm.eval_splits,
    task_name=dm.task_name,
    train_loader=dm.train_dataloader()
)

trainer = Trainer(**train_params)
trainer.fit(model, dm.train_dataloader(), dm.val_dataloader())
trainer.validate(model, dm.val_dataloader())

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


Validation sanity check: 100%|██████████| 2/2 [00:00<00:00,  3.11it/s]

2022-01-06 04:32:00,212 - INFO - __main__ - val_acc: 0.0


                                                                      

  rank_zero_warn(


Epoch 0:   0%|          | 1/20434 [00:00<2:08:02,  2.66it/s, loss=1.2, v_num=al_1]



Epoch 0: 100%|█████████▉| 20433/20434 [2:13:43<00:00,  2.55it/s, loss=1.23, v_num=al_1]  

2022-01-06 06:45:44,001 - INFO - __main__ - val_acc: 0.335


Epoch 1: 100%|█████████▉| 20433/20434 [2:14:31<00:00,  2.53it/s, loss=1.23, v_num=al_1, val_loss=1.110, accuracy=0.335]  

2022-01-06 09:00:28,970 - INFO - __main__ - val_acc: 0.335


Epoch 1: 100%|██████████| 20434/20434 [2:14:46<00:00,  2.53it/s, loss=1.23, v_num=al_1, val_loss=1.110, accuracy=0.335]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




2022-01-06 09:00:53,407 - INFO - __main__ - val_acc: 0.335


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'accuracy': 0.33500000834465027, 'val_loss': 1.1080350875854492}
--------------------------------------------------------------------------------


[{'val_loss': 1.1080350875854492, 'accuracy': 0.33500000834465027}]

In [6]:
trainer.validate(model, dm.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]




2022-01-06 14:31:38,483 - INFO - __main__ - val_acc: 0.335


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'accuracy': 0.33500000834465027, 'val_loss': 1.1080350875854492}
--------------------------------------------------------------------------------


[{'val_loss': 1.1080350875854492, 'accuracy': 0.33500000834465027}]

In [7]:
len(dm.val_dataloader())

75

In [None]:
validate_model = model.load_from_checkpoint("./runs/adversarial_nli_r3/roberta-large/checkpoints/checkpoint_best.ckpt")
trainer = Trainer(**train_params)
trainer.validate(validate_model, dm.val_dataloader())

In [2]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer


def set_seed(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model.to(device)

sentence = "Which course should I take to get started in data science?"
# sentence = "What are the ingredients required to bake a perfect cake?"
# sentence = "What is the best possible approach to learn aeronautical engineering?"
# sentence = "Do apples taste better than oranges in general?"
sentence = "Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador"


text =  "paraphrase: " + sentence + " </s>"


max_len = 256

encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
beam_outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    do_sample=True,
    max_length=256,
    top_k=120,
    top_p=0.98,
    early_stopping=True,
    num_return_sequences=10
)


print ("\nOriginal Question ::")
print (sentence)
print ("\n")
print ("Paraphrased Questions :: ")
final_outputs =[]
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))

device  cuda





Original Question ::
Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador


Paraphrased Questions :: 
0: Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only been visited by Ecuador. They spent time with Cuban, Brazil, Turkey and Honduras.
1: Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador.
2: Dustin, Milton, Louis, Bill, Roland, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador. Other notable sights include Rome, Spain, Brazil, Austria, Portugal, Thailand, Vietnam, Austria, Germany, France, The Bahamas, Portugal.
3: Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar, and Phillip have 

In [8]:
from ray import tune


def objective(step, alpha, beta):
    return (0.1 + alpha * step / 100)**(-1) + beta * 0.1


def training_function(config):
    # Hyperparameters
    alpha, beta = config["alpha"], config["beta"]
    for step in range(10):
        # Iterative training function - can be any arbitrary training procedure.
        intermediate_score = objective(step, alpha, beta)
        # Feed the score back back to Tune.
        tune.report(mean_loss=intermediate_score)


analysis = tune.run(
    training_function,
    config={
        "alpha": tune.grid_search([0.001, 0.01, 0.1]),
        "beta": tune.choice([1, 2, 3])
    })

print("Best config: ", analysis.get_best_config(
    metric="mean_loss", mode="min"))

# Get a dataframe for analyzing trial results.
df = analysis.results_df



Trial name,status,loc,alpha,beta
training_function_580e9_00000,PENDING,,0.001,1
training_function_580e9_00001,PENDING,,0.01,1
training_function_580e9_00002,PENDING,,0.1,1


2022-01-04 18:37:57,016	ERROR syncer.py:75 -- Log sync requires rsync to be installed.


Result for training_function_580e9_00000:
  date: 2022-01-04_18-37-58
  done: false
  experiment_id: 90066193acd745f6ad8d99d4df405cde
  hostname: DESKTOP-UIUES8U
  iterations_since_restore: 1
  mean_loss: 10.1
  neg_mean_loss: -10.1
  node_ip: 127.0.0.1
  pid: 33648
  time_since_restore: 0.0
  time_this_iter_s: 0.0
  time_total_s: 0.0
  timestamp: 1641339478
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 580e9_00000
  
Result for training_function_580e9_00002:
  date: 2022-01-04_18-37-58
  done: false
  experiment_id: a3d9de9aa3f84369b2eeeb8315810dd0
  hostname: DESKTOP-UIUES8U
  iterations_since_restore: 1
  mean_loss: 10.1
  neg_mean_loss: -10.1
  node_ip: 127.0.0.1
  pid: 48184
  time_since_restore: 0.0
  time_this_iter_s: 0.0
  time_total_s: 0.0
  timestamp: 1641339478
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 580e9_00002
  
Result for training_function_580e9_00001:
  date: 2022-01-04_18-37-58
  done: false
  experiment_id: a4f2ed638c06489ea

Trial name,status,loc,alpha,beta,loss,iter,total time (s),neg_mean_loss
training_function_580e9_00000,TERMINATED,127.0.0.1:33648,0.001,1,10.091,10,0.0479996,-10.091
training_function_580e9_00001,TERMINATED,127.0.0.1:48908,0.01,1,10.0108,10,0.0680001,-10.0108
training_function_580e9_00002,TERMINATED,127.0.0.1:48184,0.1,1,9.27431,10,0.0449984,-9.27431


2022-01-04 18:37:58,566	INFO tune.py:630 -- Total run time: 2.05 seconds (1.77 seconds for the tuning loop).
[2m[36m(pid=33648)[0m Windows fatal exception: access violation
[2m[36m(pid=33648)[0m 


Best config:  {'alpha': 0.1, 'beta': 1}


[2m[36m(pid=48184)[0m Windows fatal exception: access violation
[2m[36m(pid=48184)[0m 
[2m[36m(pid=48908)[0m Windows fatal exception: access violation
[2m[36m(pid=48908)[0m 
