# Generate K-shot Data

In [None]:
import argparse
import os
import numpy as np
import pandas as pd
from pandas import DataFrame
from jiant.utils.python.io import read_jsonl, write_jsonl

def load_curriculum_datasets(data_dir, tasks):
    """Read the ``train`` and ``val`` JSONL splits of every requested task.

    Args:
        data_dir: Root directory containing one sub-directory per task.
        tasks: Iterable of task names; each is used as a sub-directory name
            under ``data_dir``.

    Returns:
        A dict mapping each task name to ``{"train": ..., "val": ...}``,
        where each value is whatever ``read_jsonl`` returns for
        ``<data_dir>/<task>/<split>.jsonl``.
    """
    split_names = ("train", "val")
    return {
        task_name: {
            split_name: read_jsonl(
                os.path.join(data_dir, task_name, f"{split_name}.jsonl")
            )
            for split_name in split_names
        }
        for task_name in tasks
    }

In [None]:
# Notebook stand-in for command-line arguments.
args_dict = {
    "k": 32,  # number of training examples kept per label
    "task": ["lexical_nli", "socialqa_nli"],
    "seed": [100, 13, 21, 42, 87],
    "data_dir": "/content/tasks/data",
    "output_dir": "./few_shot/",
    "mode": "k-shot",  # alternative: k-shot-10x
}

args = argparse.Namespace(**args_dict)
# Each sampling mode writes into its own sub-directory of the output root.
args.output_dir = os.path.join(args.output_dir, args.mode)

In [None]:

# Build balanced k-shot train/dev splits for every (seed, task) pair and write
# them to <output_dir>/<task>/<k>-<seed>/{train,dev,val}.jsonl.
k = args.k
print("K =", k)
datasets = load_curriculum_datasets(args.data_dir, args.task)

# The dev split takes examples [k, k * dev_rate) of each label, i.e. k per
# label normally and 10 * k per label in a "10x" mode. It does not depend on
# the seed or the task, so compute it once.
dev_rate = 11 if '10x' in args.mode else 2

for seed in args.seed:
    print("Seed = %d" % (seed))
    for task, dataset in datasets.items():
        # Set random seed
        np.random.seed(seed)

        # Shuffle a *copy* of the training set. Shuffling dataset['train'] in
        # place would carry the previous seed's ordering into this one, making
        # the sample for a seed depend on which seeds were processed before it.
        print("| Task = %s" % (task))
        train_lines = list(dataset['train'])
        np.random.shuffle(train_lines)

        # Set up dir
        task_dir = os.path.join(args.output_dir, task)
        setting_dir = os.path.join(task_dir, f"{k}-{seed}")
        os.makedirs(setting_dir, exist_ok=True)

        # The original validation split is copied through unchanged.
        write_jsonl(dataset['val'], os.path.join(
            setting_dir, 'val.jsonl'))

        # Group the shuffled examples by gold label for balanced sampling.
        lines_by_label = {}
        for line in train_lines:
            lines_by_label.setdefault(line['gold_label'], []).append(line)

        # First k examples of each label -> train; the following
        # (dev_rate - 1) * k examples of each label -> dev.
        new_train = []
        new_dev = []
        for label_lines in lines_by_label.values():
            new_train += label_lines[:k]
            new_dev += label_lines[k:k * dev_rate]

        write_jsonl(new_train, os.path.join(
            setting_dir, 'train.jsonl'))
        write_jsonl(new_dev, os.path.join(
            setting_dir, 'dev.jsonl'))

In [1]:
from datetime import datetime
from typing import Optional

import datasets
import torch
import pytorch_lightning as pl

from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from pytorch_lightning import LightningDataModule, LightningModule, Trainer

# 1 when torch reports at least one CUDA device, otherwise 0.
AVAIL_GPUS = min(1, torch.cuda.device_count())


class GLUEDataModule(LightningDataModule):
    """Data module that loads one curriculum task and tokenizes it.

    The task is loaded through the local ``curriculum_load_dataset.py``
    loading script, each example's ``premise``/``hypothesis`` pair is
    tokenized to a fixed length, and the result is exposed through the
    usual train/val/test dataloader hooks.
    """

    # Number of target classes for each supported task name.
    glue_task_num_labels = {
        "lexical": 3,
        "boolean": 3,
        "comparative": 3,
        "conditional": 3,
        "counting": 3,
        "negation": 3,
        "quantifier": 3,
        "transitive": 2,
        "hypernymy": 2,
        "hyponymy": 2,
        "ner": 2,
        "verbcorner": 2,
        "verbnet": 2,
        "syntactic_alternation": 2,
        "syntactic_variation": 2,
        "monotonicity_infer": 3,
        "syllogism": 2,
        "coreference": 3,
        "puns": 3,
        "sentiment": 2,
        "kg_relations": 2,
        "context_align": 3,
        "sprl": 2,
        "atomic": 3,
        "social_chem": 3,
        "socialqa": 3,
        "physicalqa": 3,
        "logiqa": 3,
        "ester": 3,
        "cosmoqa": 3,
        "drop": 3,
        "entailment_tree": 3,
        "proof_writer": 2,
        "temporal": 2,
        "spatial": 3,
        "counterfactual": 3
    }

    # Only dataset columns named here are exposed as torch tensors in
    # ``setup``; any other column is hidden from the dataloaders.
    loader_columns = [
        "datasets_idx",
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_positions",
        "end_positions",
        "labels",
    ]

    def __init__(
        self,
        model_name_or_path: str,
        task_name: str = "atomic",
        max_seq_length: int = 128,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        **kwargs,
    ):
        """Store the configuration and load the tokenizer.

        Args:
            model_name_or_path: Identifier passed to
                ``AutoTokenizer.from_pretrained``.
            task_name: Key of ``glue_task_num_labels`` and configuration name
                passed to the dataset loading script.
            max_seq_length: Length every tokenized example is padded or
                truncated to.
            train_batch_size: Batch size of the training dataloader.
            eval_batch_size: Batch size of the validation/test dataloaders.
            **kwargs: Accepted and ignored.

        Raises:
            KeyError: If ``task_name`` is not in ``glue_task_num_labels``.
        """
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size

        self.text_fields = ['premise', 'hypothesis']
        self.num_labels = self.glue_task_num_labels[task_name]
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def setup(self, stage: Optional[str] = None):
        """Load the task, tokenize every split and expose torch tensors.

        ``stage`` is not used; it defaults to ``None`` so the hook can be
        called with or without a stage name.
        """
        self.dataset = datasets.load_dataset("curriculum_load_dataset.py", self.task_name)

        for split in self.dataset.keys():
            # ``convert_to_features`` re-emits the target as "labels", so the
            # original "label" column is dropped.
            self.dataset[split] = self.dataset[split].map(
                self.convert_to_features,
                batched=True,
                remove_columns=["label"],
            )
            self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]
            self.dataset[split].set_format(type="torch", columns=self.columns)

        # Every split whose name contains "validation" is used for evaluation.
        self.eval_splits = [x for x in self.dataset.keys() if "validation" in x]

    def prepare_data(self):
        """Load the dataset and tokenizer once, discarding the results.

        NOTE(review): presumably this only warms the download caches before
        ``setup`` runs - confirm.
        """
        datasets.load_dataset("curriculum_load_dataset.py", self.task_name)
        AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def train_dataloader(self):
        """Return a dataloader over the "train" split (no shuffling)."""
        return DataLoader(self.dataset["train"], batch_size=self.train_batch_size)

    def val_dataloader(self):
        """Return one dataloader, or a list with one per evaluation split.

        Returns ``None`` implicitly when there is no evaluation split.
        """
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["validation"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def test_dataloader(self):
        """Return the "test" dataloader, or one loader per evaluation split.

        With exactly one evaluation split the "test" split is served; with
        several, the evaluation splits themselves are served.
        NOTE(review): confirm the multi-split case is meant to reuse the
        validation splits for testing.
        """
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["test"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def convert_to_features(self, example_batch, indices=None):
        """Tokenize a batch of examples and attach their labels.

        Args:
            example_batch: Column-oriented batch; must contain the columns
                named in ``self.text_fields`` and a ``"label"`` column.
            indices: Unused.

        Returns:
            The tokenizer output with an extra ``"labels"`` entry.
        """

        # Either encode single sentence or sentence pairs
        if len(self.text_fields) > 1:
            texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
        else:
            texts_or_text_pairs = example_batch[self.text_fields[0]]

        # Tokenize the text/text pairs. ``padding="max_length"`` is the
        # replacement for the deprecated ``pad_to_max_length=True``.
        features = self.tokenizer.batch_encode_plus(
            texts_or_text_pairs, max_length=self.max_seq_length, padding="max_length", truncation=True
        )

        # Rename label to labels to make it easier to pass to model forward
        features["labels"] = example_batch["label"]

        return features

In [8]:
from pqdict import pqdict
import jiant.utils.python.io as py_io
from jiant.utils.zlog import ZLogger

def init_log_writer(output_dir):
    """Ensure ``output_dir`` exists and return a ``ZLogger`` built on it.

    The logger is constructed with ``overwrite=True``.
    """
    os.makedirs(output_dir, exist_ok=True)
    log_writer = ZLogger(output_dir, overwrite=True)
    return log_writer

class GLUETransformer(LightningModule):
    """Sequence-classification LightningModule around a pretrained model.

    Wraps ``AutoModelForSequenceClassification`` with training and
    validation steps, metric computation, and an AdamW optimizer with a
    linear warmup/decay schedule.
    """

    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        train_loader: DataLoader,
        task_name: str,
        learning_rate: float = 1e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        train_batch_size: int = 8,
        eval_batch_size: int = 16,
        eval_splits: Optional[list] = None,
        **kwargs,
    ):
        """Load the tokenizer, config, model and metric.

        Args:
            model_name_or_path: Identifier passed to the ``from_pretrained``
                calls.
            num_labels: Number of output classes for the classifier config.
            train_loader: Training dataloader; its dataset length is used in
                ``setup`` to size the learning-rate schedule.
            task_name: Key under which the validation loss is recorded in
                ``validation_epoch_end``.
            learning_rate: AdamW learning rate.
            adam_epsilon: AdamW epsilon.
            warmup_steps: Warmup steps of the linear schedule.
            weight_decay: Weight decay for parameters outside the
                bias/LayerNorm group.
            train_batch_size: Per-device batch size assumed when computing
                the total number of training steps.
            eval_batch_size: Stored as a hyperparameter only.
            eval_splits: Stored as a hyperparameter only.
            **kwargs: Stored as hyperparameters only.
        """
        super().__init__()

        # Makes every constructor argument available as ``self.hparams.<name>``.
        self.save_hyperparameters()
        #self.current_epoch = 0
        #self.log_writer = init_log_writer(f"./train_dynamics/{task_name}/")
        self.train_loader = train_loader
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)
        # The timestamp gives each instance its own metric experiment id.
        self.metric = datasets.load_metric(
            "glue", 'mnli', experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
        )

    def forward(self, **inputs):
        """Forward the keyword inputs to the wrapped model."""
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        """Return the training loss (first element of the model output)."""
        outputs = self(**batch)
        difficult_loss = outputs[0]

        #data_with_loss = zip(batch, difficult_loss)
        #data_loss_dict = {}
        #for (data, ls) in data_with_loss:
        #    data_loss_dict[data] = ls
        #data_loss_pdict = pqdict(data_loss_dict)
        #ranked_batch = list(data_loss_pdict.popkeys())

        return difficult_loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        """Compute loss and predictions for one validation batch.

        Returns:
            Dict with the batch ``"loss"``, ``"preds"`` and ``"labels"``.
        """
        outputs = self(**batch)
        val_loss, logits = outputs[:2]

        # Classification takes the arg-max class; a single output is treated
        # as regression. The first test must be strictly ``> 1``, otherwise
        # the single-output branch can never be reached.
        if self.hparams.num_labels > 1:
            preds = torch.argmax(logits, dim=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()

        labels = batch["labels"]

        return {"loss": val_loss, "preds": preds, "labels": labels}

    #def training_epoch_end(self, outputs):
    #    self.current_epoch += 1

    def validation_epoch_end(self, outputs):
        """Aggregate validation outputs, log metrics and record the loss.

        The mean validation loss is also written under this task's name into
        a JSON file at a fixed path; that file must already exist because it
        is read before being rewritten.
        """
        preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
        labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)

        # Read-modify-write so the entries of other tasks are kept.
        record_path = "./runs/zero-shot/anli_roberta/inoculation_base_loss.json"
        val_loss_record = py_io.read_json(record_path)
        val_loss_record[self.hparams.task_name] = loss.item()
        py_io.write_json(val_loss_record, record_path)
        return loss

    def setup(self, stage=None) -> None:
        """Compute ``self.total_steps`` for the LR schedule when fitting."""
        if stage != "fit":
            return

        # Effective batch size across devices; ``gpus`` may be None on CPU.
        tb_size = self.hparams.train_batch_size * max(1, self.trainer.gpus or 0)
        # Optimizer steps per epoch, taking gradient accumulation into account.
        steps_per_epoch = (len(self.train_loader.dataset) // tb_size) // self.trainer.accumulate_grad_batches
        # Total steps grow with the number of epochs. Dividing by max_epochs
        # would make the linear schedule reach zero long before training ends.
        self.total_steps = steps_per_epoch * self.trainer.max_epochs

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                # NOTE(review): this is the bias/LayerNorm ("no_decay") group,
                # yet it gets a non-zero decay of 1e-5 - confirm this is
                # intended rather than 0.0.
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 1e-5,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps,
        )
        # Step the scheduler after every optimizer step rather than per epoch.
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

In [12]:
from pytorch_lightning.loggers import TensorBoardLogger

# Data module for the "drop" task, tokenized for the ANLI RoBERTa-large model.
dm = GLUEDataModule(
    task_name="drop",
    model_name_or_path="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
)
dm.setup("fit")

# Keep the two checkpoints with the lowest validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=2,
    dirpath=f"./runs/{dm.task_name}/roberta-base/checkpoints/",
    filename="checkpoint_best",
)

# Keyword arguments shared by every Trainer built in this notebook.
train_params = {
    "gpus": 1,
    "max_epochs": 5,
    "progress_bar_refresh_rate": 1,
    "checkpoint_callback": True,
    "callbacks": [checkpoint_callback],
    "logger": TensorBoardLogger(
        os.path.join("./runs", 'logs'),
        name=f"{dm.model_name_or_path}-{dm.task_name}",
        version='trial_1',
    ),
}

Downloading and preparing dataset curriculum/drop (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\Admin\.cache\huggingface\datasets\curriculum\drop\1.0.0\7aab0b873a95ffa65a2bb4ba2c1ccbc5b2e1d1a89a4fd6c0e8613e7ce79f7465...




Dataset curriculum downloaded and prepared to C:\Users\Admin\.cache\huggingface\datasets\curriculum\drop\1.0.0\7aab0b873a95ffa65a2bb4ba2c1ccbc5b2e1d1a89a4fd6c0e8613e7ce79f7465. Subsequent calls will reuse this data.


100%|██████████| 21/21 [00:02<00:00,  7.27ba/s]
100%|██████████| 12/12 [00:01<00:00,  9.42ba/s]


In [13]:
# Set CUDA_LAUNCH_BLOCKING for this process before the model is built.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Build the classifier on the same pretrained checkpoint the data module
# tokenizes for; label count, eval splits and task name come from `dm`.
model = GLUETransformer(
    model_name_or_path="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
    num_labels=dm.num_labels,
    eval_splits=dm.eval_splits,
    task_name=dm.task_name,
    train_loader=dm.train_dataloader()
)

trainer = Trainer(**train_params)
# Training is disabled here: the pretrained model is only validated.
#trainer.fit(model, dm)
trainer.validate(model, dm.val_dataloader())

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validating: 100%|██████████| 352/352 [01:20<00:00,  4.55it/s]--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'accuracy': 0.4816098213195801, 'val_loss': 3.045374870300293}
--------------------------------------------------------------------------------


[{'val_loss': 3.045374870300293, 'accuracy': 0.4816098213195801}]

In [8]:
# Restore a saved model from the context_align run's best checkpoint and
# validate it on the current data module's validation loader.
# NOTE(review): confirm `dm` was built for the same task as the checkpoint
# when this cell is run.
validate_model = GLUETransformer.load_from_checkpoint("./runs/context_align/roberta-base/checkpoints/checkpoint_best.ckpt")
trainer = Trainer(**train_params)
trainer.validate(validate_model, dm.val_dataloader())

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'accuracy': 0.6036446690559387, 'val_loss': 1.1908454895019531}
--------------------------------------------------------------------------------


[{'val_loss': 1.1908454895019531, 'accuracy': 0.6036446690559387}]

In [2]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer


def set_seed(seed):
    """Seed torch's RNG and, when CUDA is available, every CUDA device's RNG."""
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Sample paraphrases of one sentence with a T5 paraphrasing model and print
# the distinct candidates that differ from the input.
set_seed(42)

model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model.to(device)

# Other inputs that were tried:
# sentence = "Which course should I take to get started in data science?"
# sentence = "What are the ingredients required to bake a perfect cake?"
# sentence = "What is the best possible approach to learn aeronautical engineering?"
# sentence = "Do apples taste better than oranges in general?"
sentence = "Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador"


# The model is prompted with a "paraphrase: " prefix and an explicit
# end-of-sequence marker.
text =  "paraphrase: " + sentence + " </s>"


# Maximum length of each generated sequence.
max_len = 256

# ``padding="max_length"`` replaces the deprecated ``pad_to_max_length=True``.
encoding = tokenizer.encode_plus(text, padding="max_length", return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


# Sampling with top_k = 120, top_p = 0.98 and 10 returned sequences.
# NOTE(review): early_stopping is presumably only relevant to beam search and
# has no effect when sampling - confirm before removing it.
beam_outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    do_sample=True,
    max_length=max_len,
    top_k=120,
    top_p=0.98,
    early_stopping=True,
    num_return_sequences=10
)


print ("\nOriginal Question ::")
print (sentence)
print ("\n")
print ("Paraphrased Questions :: ")
# Keep each distinct candidate once, in generation order, and drop candidates
# that only repeat the input (ignoring case).
final_outputs =[]
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    if sent.lower() != sentence.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))

device  cuda





Original Question ::
Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador


Paraphrased Questions :: 
0: Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only been visited by Ecuador. They spent time with Cuban, Brazil, Turkey and Honduras.
1: Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador.
2: Dustin, Milton, Louis, Bill, Roland, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar and Phillip have only visited Ecuador. Other notable sights include Rome, Spain, Brazil, Austria, Portugal, Thailand, Vietnam, Austria, Germany, France, The Bahamas, Portugal.
3: Dustin, Milton, Louis, Bill, Roland, Dean, Tim, Micheal, Philip, Adrian, Eddie, Bradley, Andy, Lawrence, Edgar, and Phillip have 