In [1]:
!pip install transformers==2.11.0
!pip install nlp==0.2.0

Collecting transformers==2.11.0
  Downloading transformers-2.11.0-py3-none-any.whl (674 kB)
[K     |████████████████████████████████| 674 kB 4.4 MB/s eta 0:00:01
Collecting sentencepiece
  Downloading sentencepiece-0.1.94-cp38-cp38-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 16.8 MB/s eta 0:00:01
[?25hCollecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp38-cp38-manylinux1_x86_64.whl (7.5 MB)
[K     |████████████████████████████████| 7.5 MB 14.4 MB/s eta 0:00:01
[?25hInstalling collected packages: tokenizers, sentencepiece, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.9.4
    Uninstalling tokenizers-0.9.4:
      Successfully uninstalled tokenizers-0.9.4
  Attempting uninstall: transformers
    Found existing installation: transformers 4.0.1
    Uninstalling transformers-4.0.1:
      Successfully uninstalled transformers-4.0.1
Successfully installed sentencepiece-0.1.94 tokenizers-0.7.0 

In [1]:
import numpy as np
import torch
import torch.nn as nn
import transformers
import nlp
from transformers import XLNetTokenizer, XLNetModel,AutoTokenizer
import dataclasses
from torch.utils.data.dataloader import DataLoader
from transformers.training_args import is_tpu_available
from transformers.trainer import get_tpu_sampler
from transformers.data.data_collator import DataCollator, InputDataClass
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from typing import List, Union, Dict
from time import time
# import logging
# logging.basicConfig(level=logging.INFO)

In [2]:
# Load each GLUE task used in this notebook, keyed by task name.
dataset_dict = {
    glue_task: nlp.load_dataset('glue', name=glue_task)
    for glue_task in ("cola", "stsb", "wnli")
}

In [3]:
# Show one training example (index 5) from every task to reveal its fields.
# Each task prints its name, the example, then a blank separator line.
for task_name, dataset in dataset_dict.items():
    print(task_name, dataset["train"][5], "", sep="\n")

cola
{'sentence': "I'll fix you a drink.", 'label': 1, 'idx': 5}

stsb
{'sentence1': 'Some men are fighting.', 'sentence2': 'Two men are fighting.', 'label': 4.25, 'idx': 5}

wnli
{'sentence1': 'George got free tickets to the play, but he gave them to Eric, because he was particularly eager to see it.', 'sentence2': 'George was particularly eager to see it.', 'label': 0, 'idx': 5}



In [4]:
class MultitaskModel(transformers.PreTrainedModel):
    """
    A collection of single-task models that all share one encoder.

    Each task model lives in ``taskmodels_dict`` under its task name;
    ``forward`` dispatches to the model selected by ``task_name``.
    """

    def __init__(self, encoder, taskmodels_dict):
        """
        Setting MultitaskModel up as a PretrainedModel allows us
        to take better advantage of Trainer features

        Args:
            encoder: the module shared by every task model.
            taskmodels_dict: mapping of task name -> single-task model;
                stored as an ``nn.ModuleDict``.
        """
        super().__init__(transformers.PretrainedConfig())

        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)

    @classmethod
    def create(cls, model_name, model_type_dict, model_config_dict=None):
        """
        This creates a MultitaskModel using the model class and config objects
        from single-task models.

        We do this by creating each single-task model, and having them share
        the same encoder transformer.

        Args:
            model_name: checkpoint name passed to ``from_pretrained``.
            model_type_dict: mapping of task name -> model class exposing
                ``from_pretrained``.
            model_config_dict: optional mapping of task name -> config object.
                When omitted, ``config=None`` is passed to ``from_pretrained``.
                When given, it must contain every task in ``model_type_dict``
                (a missing task raises ``KeyError``).

        Returns:
            A ``MultitaskModel`` whose task models all reference the encoder
            of the first model that was created.
        """
        shared_encoder = None
        taskmodels_dict = {}
        for task_name, model_type in model_type_dict.items():
            # The parameter defaults to None, so only index the mapping when
            # one was actually supplied (indexing None raised TypeError).
            task_config = (
                None if model_config_dict is None else model_config_dict[task_name]
            )
            model = model_type.from_pretrained(
                model_name,
                config=task_config,
            )
            if shared_encoder is None:
                # The first task model donates its encoder, found under the
                # attribute named by the model's `base_model_prefix`.
                print('*****************')
                shared_encoder = getattr(model, model.base_model_prefix)
                print(shared_encoder)
                print('*****************')
            else:
                # Every later task model has its own encoder replaced by the
                # shared one, so all heads sit on top of the same module.
                setattr(model, model.base_model_prefix, shared_encoder)
            taskmodels_dict[task_name] = model
        return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)

    def forward(self, task_name, **kwargs):
        """Run the task model registered under ``task_name`` with ``kwargs``."""
        return self.taskmodels_dict[task_name](**kwargs)

In [5]:
model_name = 'xlnet-base-cased'

# Every task uses a sequence-classification head; only the number of outputs
# differs (1 for stsb, 2 for cola and wnli). Task order is kept as
# stsb, cola, wnli because the first model created supplies the shared encoder.
multitask_model = MultitaskModel.create(
    model_name=model_name,
    model_type_dict=dict.fromkeys(
        ("stsb", "cola", "wnli"),
        transformers.AutoModelForSequenceClassification,
    ),
    model_config_dict={
        task: transformers.AutoConfig.from_pretrained(model_name, num_labels=label_count)
        for task, label_count in (("stsb", 1), ("cola", 2), ("wnli", 2))
    },
)

*****************
XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, ou

In [6]:
max_length = 340


def _encode_with_labels(inputs, labels):
    """
    Encode ``inputs`` with the notebook-level ``tokenizer`` (looked up when
    called), padding to ``max_length``, and store ``labels`` under "labels".
    """
    features = tokenizer.batch_encode_plus(
        inputs, max_length=max_length, pad_to_max_length=True
    )
    features["labels"] = labels
    return features


def _encode_sentence_pairs(example_batch, first_key, second_key):
    """Encode the columns ``first_key`` / ``second_key`` as (first, second) pairs."""
    pairs = list(zip(example_batch[first_key], example_batch[second_key]))
    return _encode_with_labels(pairs, example_batch["label"])


def convert_to_sst2_features(example_batch):
    """Encode a single-sentence batch (columns 'sentence' and 'label')."""
    # Pass the sentences as plain strings, exactly as convert_to_cola_features
    # does. The previous list(zip(...)) wrapped every sentence in a 1-tuple,
    # so the tokenizer received a sequence instead of raw text.
    return _encode_with_labels(list(example_batch['sentence']), example_batch['label'])


def convert_to_cola_features(example_batch):
    """Encode a single-sentence batch (columns 'sentence' and 'label')."""
    return _encode_with_labels(example_batch['sentence'], example_batch["label"])


def convert_to_qnli_features(example_batch):
    """Encode ('question', 'sentence') pairs with their 'label'."""
    return _encode_sentence_pairs(example_batch, 'question', 'sentence')


def convert_to_wnli_features(example_batch):
    """Encode ('sentence1', 'sentence2') pairs with their 'label'."""
    return _encode_sentence_pairs(example_batch, 'sentence1', 'sentence2')


def convert_to_mnli_features(example_batch):
    """Encode ('premise', 'hypothesis') pairs with their 'label'."""
    return _encode_sentence_pairs(example_batch, 'premise', 'hypothesis')


def convert_to_stsb_features(example_batch):
    """Encode ('sentence1', 'sentence2') pairs with their 'label'."""
    return _encode_sentence_pairs(example_batch, 'sentence1', 'sentence2')


def convert_to_rte_features(example_batch):
    """Encode ('sentence1', 'sentence2') pairs with their 'label'."""
    return _encode_sentence_pairs(example_batch, 'sentence1', 'sentence2')


def convert_to_commonsense_qa_features(example_batch):
    """
    Encode a multiple-choice batch: every question is paired with each of its
    answer choices, so each feature holds one list of encodings per example.

    Reads the columns 'question', 'choices' (each with a 'text' list) and
    'answerKey'. The number of choices is taken from the first example.
    """
    num_examples = len(example_batch["question"])
    num_choices = len(example_batch["choices"][0]["text"])
    features = {}
    for example_i in range(num_examples):
        choices_inputs = tokenizer.batch_encode_plus(
            list(zip(
                [example_batch["question"][example_i]] * num_choices,
                example_batch["choices"][example_i]["text"],
            )),
            max_length=max_length, pad_to_max_length=True,
        )
        for k, v in choices_inputs.items():
            features.setdefault(k, []).append(v)
    labels2id = {char: i for i, char in enumerate("ABCDE")}
    # Dummy answers for test: when the first answer key is empty, every
    # example in the batch gets label 0.
    if example_batch["answerKey"][0]:
        features["labels"] = [labels2id[ans] for ans in example_batch["answerKey"]]
    else:
        features["labels"] = [0] * num_examples
    return features

In [7]:
# Task name -> function that converts a raw example batch into model features.
# Only the three tasks loaded in `dataset_dict` are registered.
convert_func_dict = dict(
    stsb=convert_to_stsb_features,
    cola=convert_to_cola_features,
    wnli=convert_to_wnli_features,
)

In [8]:
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased',do_lower_case=False)

# Columns handed to `set_format` for each task (identical for all three).
columns_dict = {
    task: ['input_ids', 'attention_mask', 'labels']
    for task in ("stsb", "cola", "wnli")
}

# features_dict[task_name][phase] -> mapped dataset for that task and phase.
features_dict = {}
for task_name, dataset in dataset_dict.items():
    task_features = {}
    features_dict[task_name] = task_features
    for phase, phase_dataset in dataset.items():
        # Apply the task's converter batch-wise, bypassing any cached result.
        phase_features = phase_dataset.map(
            convert_func_dict[task_name],
            batched=True,
            load_from_cache_file=False,
        )
        task_features[phase] = phase_features
        # Row counts are printed before and after switching the output format.
        print(task_name, phase, len(phase_dataset), len(phase_features))
        phase_features.set_format(type="torch", columns=columns_dict[task_name])
        print(task_name, phase, len(phase_dataset), len(phase_features))

100%|██████████| 9/9 [00:01<00:00,  8.36it/s]
100%|██████████| 2/2 [00:00<00:00, 14.62it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

cola train 8551 8551
cola train 8551 8551
cola validation 1043 1043
cola validation 1043 1043


100%|██████████| 2/2 [00:00<00:00, 14.29it/s]
 17%|█▋        | 1/6 [00:00<00:00,  6.04it/s]

cola test 1063 1063
cola test 1063 1063


100%|██████████| 6/6 [00:01<00:00,  5.09it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

stsb train 5749 5749
stsb train 5749 5749


100%|██████████| 2/2 [00:00<00:00,  6.40it/s]
 50%|█████     | 1/2 [00:00<00:00,  5.46it/s]

stsb validation 1500 1500
stsb validation 1500 1500


100%|██████████| 2/2 [00:00<00:00,  7.56it/s]
100%|██████████| 1/1 [00:00<00:00,  7.17it/s]
100%|██████████| 1/1 [00:00<00:00, 57.80it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

stsb test 1379 1379
stsb test 1379 1379
wnli train 635 635
wnli train 635 635
wnli validation 71 71
wnli validation 71 71


100%|██████████| 1/1 [00:00<00:00, 23.03it/s]

wnli test 146 146
wnli test 146 146





In [9]:
class NLPDataCollator(DataCollator):
    """
    Extending the existing DataCollator to work with NLP dataset batches
    """
    def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:
        """
        Collate a list of examples into one batch of tensors.

        When the examples are dictionaries, "labels" becomes a long tensor if
        the first example's label has dtype ``torch.int64`` and a float tensor
        otherwise; every other non-``None``, non-string field is stacked with
        ``torch.stack``. Any other example type is delegated to
        ``DefaultDataCollator``.
        """
        first = features[0]
        if not isinstance(first, dict):
            # otherwise, revert to using the default collate_batch.
            # Imported here because the notebook-level imports only bring in
            # DataCollator and InputDataClass from this module.
            from transformers.data.data_collator import DefaultDataCollator
            return DefaultDataCollator().collate_batch(features)

        # NLP datasets present features as a list of dictionaries (one per
        # example), so the collate logic is adapted for that layout.
        # Start from an empty batch so examples without labels (or with
        # labels=None) no longer hit an unbound `batch` below.
        batch = {}
        if "labels" in first and first["labels"] is not None:
            if first["labels"].dtype == torch.int64:
                label_dtype = torch.long
            else:
                label_dtype = torch.float
            batch["labels"] = torch.tensor(
                [f["labels"] for f in features], dtype=label_dtype
            )
        for k, v in first.items():
            if k != "labels" and v is not None and not isinstance(v, str):
                batch[k] = torch.stack([f[k] for f in features])
        return batch


class StrIgnoreDevice(str):
    """
    A plain string that additionally accepts ``.to(device)``.

    This is a hack: per the original author's note, the Trainer calls
    ``.to(device)`` on every input value, while the batches here also carry
    a ``task_name`` string. Giving the string a no-op ``to`` keeps that call
    from raising.
    """

    def to(self, device):
        """Ignore ``device`` and return this same string object."""
        return self


class DataLoaderWithTaskname:
    """
    Thin wrapper over a data loader that stamps every batch with a task name.
    """

    def __init__(self, task_name, data_loader):
        """
        Args:
            task_name: name written into each yielded batch.
            data_loader: wrapped loader; its ``batch_size`` and ``dataset``
                attributes are re-exposed on this object.
        """
        self.data_loader = data_loader
        self.task_name = task_name

        # Mirror the attributes of the wrapped loader.
        self.dataset = data_loader.dataset
        self.batch_size = data_loader.batch_size

    def __len__(self):
        """Number of batches, as reported by the wrapped loader."""
        return len(self.data_loader)

    def __iter__(self):
        """Yield each batch of the wrapped loader with a "task_name" entry added."""
        for task_batch in self.data_loader:
            task_batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield task_batch


class MultitaskDataloader:
    """
    Iterable that interleaves the batches of several single-task data loaders.
    """

    def __init__(self, dataloader_dict):
        """
        Args:
            dataloader_dict: mapping of task name -> data loader. Each loader
                must support ``len()``, iteration and a ``dataset`` attribute.
        """
        self.dataloader_dict = dataloader_dict
        self.task_name_list = list(dataloader_dict)
        self.num_batches_dict = {
            name: len(loader) for name, loader in dataloader_dict.items()
        }
        # Placeholder list with one slot per example across all tasks; only
        # its length carries information.
        # NOTE(review): presumably read via len(loader.dataset) by the caller - confirm.
        total_examples = sum(
            len(loader.dataset) for loader in dataloader_dict.values()
        )
        self.dataset = [None] * total_examples

    def __len__(self):
        """Total number of batches over all tasks."""
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        """
        Yield batches task by task in a random order.

        Each task index appears once per batch of that task (size-proportional
        sampling), the resulting schedule is shuffled with ``np.random``, and
        every slot pulls the next batch from the chosen task's loader. Another
        distribution could be used by building the schedule differently.
        """
        schedule = np.array([
            task_index
            for task_index, name in enumerate(self.task_name_list)
            for _ in range(self.num_batches_dict[name])
        ])
        np.random.shuffle(schedule)
        # One fresh iterator per task for this pass over the data.
        task_iterators = {
            name: iter(loader) for name, loader in self.dataloader_dict.items()
        }
        for task_index in schedule:
            yield next(task_iterators[self.task_name_list[task_index]])

class MultitaskTrainer(transformers.Trainer):
    """
    Trainer that draws its training batches from several task datasets.

    ``self.train_dataset`` is used as a mapping of task name -> dataset
    (see ``get_train_dataloader``).
    """

    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Create a single-task data loader that also yields task names

        Args:
            task_name: name attached to every batch of this loader.
            train_dataset: the dataset for this one task.

        Raises:
            ValueError: if the trainer was built without a ``train_dataset``.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        on_tpu = is_tpu_available()
        if on_tpu:
            train_sampler = get_tpu_sampler(train_dataset)
        elif self.args.local_rank == -1:
            # local_rank == -1: plain random sampling.
            train_sampler = RandomSampler(train_dataset)
        else:
            train_sampler = DistributedSampler(train_dataset)

        data_loader = DataLoaderWithTaskname(
            task_name=task_name,
            data_loader=DataLoader(
                train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=train_sampler,
                collate_fn=self.data_collator.collate_batch,
            ),
        )

        if on_tpu:
            # `pl` was referenced here without ever being imported, so this
            # branch raised NameError. Import it lazily so that machines
            # without torch_xla are unaffected.
            import torch_xla.distributed.parallel_loader as pl

            data_loader = pl.ParallelLoader(
                data_loader, [self.args.device]
            ).per_device_loader(self.args.device)
        return data_loader

    def get_train_dataloader(self):
        """
        Returns a MultitaskDataloader, which is not actually a Dataloader
        but an iterable that returns a generator that samples from each
        task Dataloader
        """
        return MultitaskDataloader({
            task_name: self.get_single_train_dataloader(task_name, task_dataset)
            for task_name, task_dataset in self.train_dataset.items()
        })

In [10]:
# Record the start time; a later cell reuses `start` to report minutes elapsed.
start = time()
print('Training started at ', start)

# Training split of every task, keyed by task name.
train_dataset = {
    task: task_splits["train"] for task, task_splits in features_dict.items()
}

training_args = transformers.TrainingArguments(
    output_dir="./models/xlnet_third_run",
    overwrite_output_dir=True,
    learning_rate=5e-5,
    do_train=True,
    num_train_epochs=3,
    # Adjust batch size if this doesn't fit on the Colab GPU
    per_device_train_batch_size=8,
    save_steps=3000,
    logging_steps=100,
    logging_dir='xlnet_logs_third_run',
)

trainer = MultitaskTrainer(
    model=multitask_model,
    args=training_args,
    data_collator=NLPDataCollator(),
    train_dataset=train_dataset,
)
trainer.train()

# Seconds spent in this cell since `start` was taken.
print(time() - start)


Training started at  1608180744.130317


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…

{"loss": 1.3997068860381843, "learning_rate": 4.9107780157030696e-05, "epoch": 0.05353319057815846, "step": 100}
{"loss": 1.368909692466259, "learning_rate": 4.821556031406139e-05, "epoch": 0.10706638115631692, "step": 200}
{"loss": 1.0613712200522423, "learning_rate": 4.732334047109208e-05, "epoch": 0.16059957173447537, "step": 300}
{"loss": 1.2776457597315312, "learning_rate": 4.643112062812277e-05, "epoch": 0.21413276231263384, "step": 400}
{"loss": 1.1486877569556235, "learning_rate": 4.553890078515346e-05, "epoch": 0.2676659528907923, "step": 500}
{"loss": 1.293742711544037, "learning_rate": 4.4646680942184155e-05, "epoch": 0.32119914346895073, "step": 600}
{"loss": 1.1060688519477844, "learning_rate": 4.375446109921485e-05, "epoch": 0.3747323340471092, "step": 700}
{"loss": 1.159701759070158, "learning_rate": 4.286224125624554e-05, "epoch": 0.4282655246252677, "step": 800}
{"loss": 1.2830065727233886, "learning_rate": 4.1970021413276235e-05, "epoch": 0.4817987152034261, "step": 9

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…

{"loss": 1.2560767555236816, "learning_rate": 3.3047822983583155e-05, "epoch": 1.0171306209850106, "step": 1900}
{"loss": 1.1454512107372283, "learning_rate": 3.215560314061385e-05, "epoch": 1.0706638115631693, "step": 2000}
{"loss": 1.1601292730867863, "learning_rate": 3.126338329764454e-05, "epoch": 1.1241970021413277, "step": 2100}
{"loss": 1.161359928548336, "learning_rate": 3.0371163454675235e-05, "epoch": 1.177730192719486, "step": 2200}
{"loss": 1.1161352533102036, "learning_rate": 2.9478943611705928e-05, "epoch": 1.2312633832976445, "step": 2300}
{"loss": 1.124098533987999, "learning_rate": 2.8586723768736618e-05, "epoch": 1.284796573875803, "step": 2400}
{"loss": 1.1488444618880749, "learning_rate": 2.769450392576731e-05, "epoch": 1.3383297644539613, "step": 2500}
{"loss": 1.1802565118670463, "learning_rate": 2.6802284082798e-05, "epoch": 1.39186295503212, "step": 2600}
{"loss": 1.195280080586672, "learning_rate": 2.5910064239828698e-05, "epoch": 1.4453961456102784, "step": 27



{"loss": 1.4074445095658303, "learning_rate": 2.2341184867951464e-05, "epoch": 1.6595289079229123, "step": 3100}
{"loss": 1.2184402349591255, "learning_rate": 2.1448965024982158e-05, "epoch": 1.7130620985010707, "step": 3200}




KeyboardInterrupt: 

In [13]:
# from time import time
# start=time()
# print('Training started at ',start)
# train_dataset = {
#     task_name: dataset["train"] 
#     for task_name, dataset in features_dict.items()
# }
# trainer = MultitaskTrainer(
#     model=multitask_model,
#     args=transformers.TrainingArguments(
#         output_dir="./models/multitask_model",
#         overwrite_output_dir=True,
#         learning_rate=5e-5,
#         do_train=True,
#         num_train_epochs=3,
#         # Adjust batch size if this doesn't fit on the Colab GPU
#         per_device_train_batch_size=32,  
#         save_steps=3000,
#     ),
#     data_collator=NLPDataCollator(),
#     train_dataset=train_dataset,
# )
# trainer.train(optimizer='adamax')
# print(time()-start)


In [None]:
# Time elapsed since `start` (set in the training cell), converted from seconds to minutes.
print((time() - start) / 60)

In [104]:
# Run the trained model over each task's validation split and keep the raw
# prediction output per task.
preds_dict = {}
for task_name in ("cola", "stsb", "wnli"):
    validation_features = features_dict[task_name]["validation"]
    # Wrap the trainer's eval loader so every batch carries the task name.
    eval_dataloader = DataLoaderWithTaskname(
        task_name,
        trainer.get_eval_dataloader(eval_dataset=validation_features),
    )
    print(eval_dataloader.data_loader.collate_fn)
    preds_dict[task_name] = trainer._prediction_loop(
        eval_dataloader,
        description=f"Validation: {task_name}",
    )

<bound method NLPDataCollator.collate_batch of <__main__.NLPDataCollator object at 0x7f50e8c02c40>>


HBox(children=(FloatProgress(value=0.0, description='Validation: cola', max=131.0, style=ProgressStyle(descrip…


<bound method NLPDataCollator.collate_batch of <__main__.NLPDataCollator object at 0x7f50e8c02c40>>


HBox(children=(FloatProgress(value=0.0, description='Validation: stsb', max=188.0, style=ProgressStyle(descrip…


<bound method NLPDataCollator.collate_batch of <__main__.NLPDataCollator object at 0x7f50e8c02c40>>


HBox(children=(FloatProgress(value=0.0, description='Validation: wnli', max=9.0, style=ProgressStyle(descripti…




In [16]:
! pip install scipy sklearn

Collecting scipy
  Downloading scipy-1.5.4-cp38-cp38-manylinux1_x86_64.whl (25.8 MB)
[K     |████████████████████████████████| 25.8 MB 4.3 MB/s eta 0:00:01
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.23.2-cp38-cp38-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 65.0 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=ffa0a9c3054b6baa6216e6ca0ec1a2f9259a3923e5293dceaed2ca24f7371cdd
  Stored in directory: /home/ecbm4040/.cache/pip/wheels/22/0b/40/fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: threadpoolctl, scipy, scikit-learn, sklearn
Successfully installed scikit-learn-0.2

In [105]:
# GLUE metric per task on the validation predictions.
# stsb predictions are flattened to one value per example; for cola and wnli
# the predicted class is the argmax over axis 1.
scores = {}
for task_name in ("cola", "stsb", "wnli"):
    metric = nlp.load_metric('glue', name=task_name)
    task_output = preds_dict[task_name]
    if task_name == "stsb":
        task_predictions = task_output.predictions.flatten()
    else:
        task_predictions = np.argmax(task_output.predictions, axis=1)
    scores[task_name] = metric.compute(task_predictions, task_output.label_ids)

In [106]:
scores

{'cola': {'matthews_correlation': 0.002973634822921437},
 'stsb': {'pearson': 0.8518357251907471, 'spearmanr': 0.852182635385541},
 'wnli': {'accuracy': 0.5352112676056338}}

In [112]:
import pickle 

# Write the `scores` dictionary to disk as a pickle file.
with open('xlnet_scores_3.pkl', mode='wb') as scores_file:
    pickle.dump(scores, scores_file)