### Install Libraries 

In [0]:
# ! apt install cuda

In [0]:

!pip install pytorch-pretrained-bert
!pip install spacy
!python -m spacy download en

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 9.8MB/s eta 0:00:01[K     |█████▎                          | 20kB 1.5MB/s eta 0:00:01[K     |████████                        | 30kB 2.2MB/s eta 0:00:01[K     |██████████▋                     | 40kB 1.6MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 1.9MB/s eta 0:00:01[K     |███████████████▉                | 61kB 2.3MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 2.7MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 3.0MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 3.4MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 2.6MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 2.6MB/s eta 0:00:01[K     |███████████████████████

### *Open the dataset and store it in a DataFrame*

In [0]:
import os
import glob
import re

In [0]:
# glob's result order depends on the file system; sort so files[0] is the same file on every run.
files = sorted(glob.glob("*.csv"))

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the first CSV found above and discard every row that contains a missing value.
df = pd.read_csv(files[0]).dropna()
df.tail()

Unnamed: 0,Document,Paragraph,Risk Classification,Risk Type,New Definition
6041,ZSIndia_NDA_04.05.2017,No representation or promise relating to and n...,0.0,General,Others
6042,ZSIndia_NDA_04.05.2017,The Recipient acknowledges that due to the uni...,0.0,Injunctive Relief,Others
6043,ZSIndia_NDA_04.05.2017,The Parties intend to share Confidential Infor...,1.0,Purpose,Scope
6044,ZSIndia_NDA_04.05.2017,The confidentiality obligations hereunder shal...,0.0,Survival & Exceptions,Others
6045,ZSIndia_NDA_04.05.2017,The Recipient may also make disclosures as may...,0.0,Survival & Exceptions,Others


## Download the model. 

#### *I am using BERT base uncased*

In [0]:
if os.path.isfile('uncased_L-12_H-768_A-12.zip'):
  print("Weights already downloaded !")
else:
  # Fetch the BERT-base uncased TensorFlow checkpoint archive into the working directory.
  os.system( "wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip")

In [0]:
# Extract the archive only once. The guard must test the extracted directory itself;
# the previous path carried a stray "unzip " prefix, so it never existed and the
# archive was re-extracted on every run.
if not os.path.exists('uncased_L-12_H-768_A-12/'):
  os.system("unzip uncased_L-12_H-768_A-12.zip")

else:
  print("Weights already extracted !")
!ls uncased_L-12_H-768_A-12/

bert_config.json		     bert_model.ckpt.index  vocab.txt
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta


##  Pre-process text

Clean the data. Remove extra spaces.


1.   ''\<cp> CAPITALS\</cp>'' for capital words 
2.   \<no.> 9821 \</no.> 
3.   \<ph> Phone no. \</ph> 
4.   \<ml> e-mail \</ml>



In [0]:
# Keep only the clause text and its two targets, renamed to the column names used by the rest of the notebook.
data_df = (
    df.filter(['Paragraph','Risk Classification','New Definition'], axis=1)
      .rename(columns={'Paragraph':'Text','Risk Classification':'Criticality','New Definition': 'Labels'})
)

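Task 1 is the risk type (the `New Definition` column, renamed to `Labels`).
Task 2 is the risk classification (the `Risk Classification` column, renamed to `Criticality`).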

In [0]:
data_df.head()

Unnamed: 0,Text,Criticality,Labels
0,Either party may disclose (“Disclosing Party”)...,1.0,Confidentiality
1,This Confidentiality and Non-Disclosure Agreem...,1.0,Legal
2,Receiving Party will use the Confidential Info...,1.0,Confidentiality
3,Prior to disclosing any Confidential Informati...,1.0,Confidentiality
4,The obligations contained in this Section 2 wi...,1.0,Confidentiality


In [0]:
def cleanse_text(inp):
  """Tag punctuation, quotes and casing in a clause, then lower-case it.

  Every marker and punctuation mark is padded with spaces so that it becomes
  its own whitespace-delimited token. Casing information, which lower-casing
  would otherwise destroy, is kept by wrapping capitalised words in tags.

  Markers:
    <oaps> / <eaps>     opening / closing curly quote (double or single)
    <aps>               straight double quote
    <oprs> / <eprs>     opening / closing parenthesis
    <ocase> w </ocase>  word starts with a capital but is not all capitals
    <wcase> w </wcase>  word is written entirely in capitals

  Args:
    inp: the text to clean; it is coerced with str() first.

  Returns:
    The lower-cased text with tokens separated by single spaces.
  """
  # Applied in order. None of the inserted markers contains a character that a
  # later rule rewrites, so the rules do not interfere with each other.
  replacements = (
    ('“', " <oaps> "), ('”', " <eaps> "),
    ("(", " <oprs> "), (")", " <eprs> "),
    ("-", " - "), (":", " : "), (",", " , "),
    (";", " , "),  # a semicolon is emitted as a comma token
    (".", " . "),
    # These three were previously inserted without padding, which glued the
    # marker to the neighbouring word instead of producing a separate token.
    ('"', " <aps> "), ("‘", " <oaps> "), ("’", " <eaps> "),
  )

  text = str(inp)
  for old, new in replacements:
    text = text.replace(old, new)

  text = re.sub(' +', ' ', text)  # collapse runs of spaces

  tagged = []
  for word in text.split(" "):
    if not word:
      continue
    if word.isupper():
      tagged.extend(("<wcase>", word, "</wcase>"))
    elif word[0].isupper():
      # Mixed case with a leading capital (fully upper-case words were handled above).
      tagged.extend(("<ocase>", word, "</ocase>"))
    else:
      tagged.append(word)

  return " ".join(tagged).lower()
    

In [0]:
# never_split = ["<oaps>","<eaps>","<oprs>","<eprs>","<oaps>","<aps>","<ocase>","</ocase>","<wcase>","</wcase>",]

In [0]:
# np.vectorize is a convenience wrapper: it still calls cleanse_text once per element
# in a Python loop and does not run anything in parallel.
vec_cleanse = np.vectorize(cleanse_text)
data_df['Text'] = vec_cleanse(data_df['Text']) # Replace every raw paragraph with its cleansed, tagged, lower-cased form.

In [0]:
df['Paragraph'][1]

'This Confidentiality and Non-Disclosure Agreement (“Agreement”) dated 20th March, 2017 (“Effective Date”) is entered into between Gramener Technology Solutions Pvt. Ltd., an Indian corporation with its principal place of business at Plot 9/2, 2nd floor, Survey No.64, HUDA Techno Enclave, Phase 2, Madhapur, Hyderabad – 500081, Telangana, India (“Gramener”) and Aditya Birla Management Corporate Pvt. Ltd., an Indian corporation with its principal places of business at Aditya Birla Centre, S.K. Ahire Marg, Worli, Mumbai 400 030 and Ahura Centre, ‘A’ wing, Ground Floor. Mahakali Caves Road, Andheri East, Mumbai – 400093 (“the Company”).  '

In [0]:
data_df['Text'][1]

'<ocase> this </ocase> <ocase> confidentiality </ocase> and <ocase> non </ocase> - <ocase> disclosure </ocase> <ocase> agreement </ocase> <oprs> <oaps> <ocase> agreement </ocase> <eaps> <eprs> dated 20th <ocase> march </ocase> , 2017 <oprs> <oaps> <ocase> effective </ocase> <ocase> date </ocase> <eaps> <eprs> is entered into between <ocase> gramener </ocase> <ocase> technology </ocase> <ocase> solutions </ocase> <ocase> pvt </ocase> . <ocase> ltd </ocase> . , an <ocase> indian </ocase> corporation with its principal place of business at <ocase> plot </ocase> 9/2 , 2nd floor , <ocase> survey </ocase> <ocase> no </ocase> . 64 , <wcase> huda </wcase> <ocase> techno </ocase> <ocase> enclave </ocase> , <ocase> phase </ocase> 2 , <ocase> madhapur </ocase> , <ocase> hyderabad </ocase> – 500081 , <ocase> telangana </ocase> , <ocase> india </ocase> <oprs> <oaps> <ocase> gramener </ocase> <eaps> <eprs> and <ocase> aditya </ocase> <ocase> birla </ocase> <ocase> management </ocase> <ocase> corpora

In [0]:
set(data_df['Labels'])

{'Confidentiality',
 'Finance',
 'Intelectual Property',
 'Legal',
 'Non Compete',
 'Non Solicitation',
 'Others',
 'Scope',
 'Sub Contract'}

In [0]:
# Create the output folder; exist_ok makes the cell safe to re-run.
os.makedirs("data/", exist_ok=True)

# Shuffle with a fixed seed so the train/validation split is reproducible.
data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)

split_ratio = 0.2  # Fraction of rows held out for validation

# Derive both sizes from a single truncation so that no row is lost to rounding,
# and slice from the front so that n_val == 0 yields an empty validation set
# (a negative-zero slice would have selected the whole frame).
n_val = int(split_ratio * data_df.shape[0])
n_train = data_df.shape[0] - n_val

val_df = data_df.iloc[n_train:, :]
data_df = data_df.iloc[:n_train, :]

# The index is written as the first CSV column on purpose: the readers below
# expect the text in column 1 and the labels in columns 2 and 3.
data_df.to_csv("data/train.csv")
val_df.to_csv("data/val.csv")


## BERT finetuning runner.

*Taken from HuggingFace github repo on BERT-pytorch.*

Reference : https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_classifier.py


BERT is trained using TensorFlow. So, we need to convert [TensorFlow checkpoints](https://github.com/google-research/bert#pre-trained-models) to PyTorch.

```
# Example Code

export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch \
  $BERT_BASE_DIR/bert_model.ckpt \
  $BERT_BASE_DIR/bert_config.json \
  $BERT_BASE_DIR/pytorch_model.bin
  
```



In [0]:
# !export BERT_BASE_DIR='content/uncased_L-12_H-768_A-12'

if os.path.isfile("uncased_L-12_H-768_A-12/pytorch_model.bin"):

  print("Checkpoints already converted to PyTorch !!")
else:
  # Convert the TensorFlow checkpoint to a PyTorch weight file next to it.
  os.system('''pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch \
    uncased_L-12_H-768_A-12/bert_model.ckpt \
    uncased_L-12_H-768_A-12/bert_config.json \
    uncased_L-12_H-768_A-12/pytorch_model.bin ''')

In [0]:


 
from __future__ import absolute_import, division, print_function

import argparse
import csv
import logging
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange, tqdm_notebook

from torch.nn import CrossEntropyLoss, MSELoss
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig,BertPreTrainedModel,BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

logger = logging.getLogger(__name__)

In [0]:
class BertForMultiTaskSequenceClassification(BertPreTrainedModel):
    """BERT encoder shared by two sequence-classification heads.

    The pooled BERT output goes through dropout and is then fed to two
    independent linear layers, one per task.

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `task1_num_labels`: number of classes for the task-1 head. Default = 2.
        `task2_num_labels`: number of classes for the task-2 head. Default = 2.
    Inputs:
        `input_ids`: token ids passed to the BERT encoder.
        `token_type_ids`: optional segment ids passed to the BERT encoder.
        `attention_mask`: optional padding mask passed to the BERT encoder.
        `task1_labels`: optional class indices for task 1.
        `task2_labels`: optional class indices for task 2.
    Outputs:
        if BOTH `task1_labels` and `task2_labels` are given:
            the tuple `(task1_loss, task2_loss)` of cross-entropy losses.
        otherwise:
            the tuple `(task1_logits, task2_logits)`.
    """
    def __init__(self, config, task1_num_labels=2,task2_num_labels=2):
        super(BertForMultiTaskSequenceClassification, self).__init__(config)
        self.task1_num_labels = task1_num_labels
        self.task2_num_labels = task2_num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        # One linear head per task, both reading the same pooled representation.
        self.task1_classifier = torch.nn.Linear(config.hidden_size, task1_num_labels)
        self.task2_classifier = torch.nn.Linear(config.hidden_size, task2_num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, task1_labels=None,task2_labels=None):
        """Return both losses when both label tensors are supplied, otherwise both logits."""
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)

        task1_logits = self.task1_classifier(pooled_output)
        task2_logits = self.task2_classifier(pooled_output)

        # Both label tensors are needed to compute both losses. The previous
        # condition tested task1_labels twice and never looked at task2_labels.
        if task1_labels is not None and task2_labels is not None:
            loss_fct = CrossEntropyLoss()
            task1_loss = loss_fct(task1_logits.view(-1, self.task1_num_labels), task1_labels.view(-1))
            task2_loss = loss_fct(task2_logits.view(-1, self.task2_num_labels), task2_labels.view(-1))
            return task1_loss,task2_loss

        return task1_logits,task2_logits

    def freeze_bert_encoder(self):
        """Stop gradient updates for every parameter of the BERT encoder."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradient updates for every parameter of the BERT encoder."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [0]:
# Folder for model checkpoints and evaluation outputs.
output_dir_models = 'model_checkpoints_and_outputs/'
os.makedirs(output_dir_models, exist_ok=True)

# Cache folder handed to the model loader.
cache_dir = 'cache_from_model/'
os.makedirs(cache_dir, exist_ok=True)


# Run configuration. Every key appears exactly once
# (the literal previously listed "no_cuda" twice).
args = {
    "train_size": -1,
    "val_size": -1,
    "full_data_dir": 'data/',
    "data_dir": 'data/',
    "cache_dir": cache_dir,
    "task_name": "SCRI",
    "no_cuda": False,
    "bert_model": 'uncased_L-12_H-768_A-12/',
    "output_dir": output_dir_models,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "learning_rate": 3e-5,
    "num_train_epochs": 6.0,
    "warmup_proportion": 0.1,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "loss_scale": 128
}



In [0]:

class InputExample(object):
    """One raw (untokenized) example carrying a label for each of the two tasks."""

    def __init__(self, guid, text_a, text_b=None, task1_label=None,task2_label=None):
        """Store the fields of a single example.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence.
            text_b: (Optional) string. The untokenized text of the second
                sequence; leave as None for single-sequence inputs.
            task1_label: (Optional) label of the example for task 1.
            task2_label: (Optional) label of the example for task 2.
        """
        self.guid, self.text_a, self.text_b = guid, text_a, text_b
        self.task1_label, self.task2_label = task1_label, task2_label

In [0]:
class InputFeatures(object):
    """Model-ready features for a single example.

    Args:
         input_ids : Numerical ids of the tokens, padded to a fixed length.
         input_mask : 1 for real tokens, 0 for padding positions.
         segment_ids : 0 for tokens of the first sequence (and padding),
                       1 for tokens of the second sequence when there is one.
         task1_label_id : Integer id of the task-1 label, or None when unlabeled.
         task2_label_id : Integer id of the task-2 label, or None when unlabeled.
    """

    def __init__(self, input_ids, input_mask, segment_ids, task1_label_id,task2_label_id):
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        self.task1_label_id, self.task2_label_id = task1_label_id, task2_label_id


In [0]:


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_csv(cls, input_file, quotechar=None):
        """Reads a comma separated value file into a list of rows.

        The file is parsed with pandas, so the header line is consumed as
        column names and is NOT part of the returned rows.

        Args:
            input_file: path of the CSV file to read.
            quotechar: accepted for interface compatibility; currently unused.

        Returns:
            A list with one list of cell values per data row.
        """
        print("INPUT file is ",input_file)
        # pandas opens and closes the file itself; no separate handle is needed.
        return pd.read_csv(input_file, encoding="utf-8").values.tolist()

In [0]:


class ScriProcessor(DataProcessor):
    """Processor for the SCRI dataset."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_csv(os.path.join(data_dir, "val.csv")), "val")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_csv(os.path.join(data_dir, "test.csv")), "test")

    def get_task1_labels(self):
        """Returns the task-1 label names; a label's position in the list is its id."""
        return ['Confidentiality','Finance','Intelectual Property','Legal','Non Compete','Non Solicitation','Others','Scope','Sub Contract']

    def get_task2_labels(self):
        """Returns the task-2 label names; a label's position in the list is its id."""
        return ['0','1']

    def _create_examples(self, lines, set_type):
        """Creates examples from the rows returned by `_read_csv`.

        Args:
            lines: data rows. Column 1 is read as the text; for labeled sets,
                column 2 is the task-2 label and column 3 the task-1 label.
            set_type: "train", "val" or "test"; rows of the "test" set carry
                no labels.

        Returns:
            A list of `InputExample`, one per row.
        """
        examples = []
        for (i, line) in enumerate(lines):
            # Every row is data: `_read_csv` already consumed the header, so
            # skipping row 0 here would silently drop a real example.
            guid = "%s-%s" % (set_type, i)
            text_a = cleanse_text(line[1])   ## Additionally created. For Test data conversion.
            # Both labels default to None so that unlabeled test rows still
            # build a valid example (task2_label used to be left undefined).
            task1_label = None
            task2_label = None
            if set_type != 'test':
                task2_label = line[2]
                task1_label = line[3]

            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, task1_label=task1_label,task2_label = task2_label))
        return examples

In [0]:
def convert_examples_to_features(examples, task1_label_list,task2_label_list, max_seq_length,
                                 tokenizer, output_mode):
    """Converts `InputExample`s into padded, fixed-length `InputFeatures`.

    Args:
        examples: sequence of examples to convert.
        task1_label_list: task-1 label names; a label's position is its id.
        task2_label_list: task-2 label names; a label's position is its id.
        max_seq_length: length every feature is truncated and zero-padded to.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
        output_mode: only "classification" is supported.

    Returns:
        A list with one `InputFeatures` per example. Both label ids are None
        for examples whose guid contains 'test', because those have no labels.

    Raises:
        NotImplementedError: if `output_mode` is "regression".
        KeyError: if `output_mode` is any other unknown value.
    """
    # Validate the mode once, up front. The former regression branch read a
    # non-existent `example.label` and never set the two task label ids, so it
    # could not succeed; reject it explicitly instead.
    if output_mode == "regression":
        raise NotImplementedError(
            "output_mode 'regression' is not supported for the two-task classification features")
    if output_mode != "classification":
        raise KeyError(output_mode)

    task1_label_map = {label : i for i, label in enumerate(task1_label_list)}
    task2_label_map = {label : i for i, label in enumerate(task2_label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length fits. Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # "type_ids" mark whether a token belongs to the first or the second
        # sequence. For classification, the vector of the first token ([CLS])
        # is used as the "sentence vector".
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        task1_label_id = None
        task2_label_id = None
        if 'test' not in example.guid:   # Test examples carry no labels
            task1_label_id = task1_label_map[example.task1_label]
            # The task-2 label may arrive as a float such as 1.0; normalise it to '1'.
            task2_label_id = task2_label_map[str(int(example.task2_label))]

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("task1_label_id: %s, task2_label_id: %s" % (task1_label_id, task2_label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              task1_label_id=task1_label_id,task2_label_id = task2_label_id))
    return features

In [0]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


In [0]:
def simple_accuracy(preds, labels):
    """Returns the mean of the element-wise comparison `preds == labels`."""
    hits = preds == labels
    return hits.mean()


def acc_and_f1(preds, labels,avg_type = 'weighted'):
    """Returns accuracy, F1 (averaged as `avg_type`) and the mean of the two."""
    metrics = {"acc": simple_accuracy(preds, labels)}
    metrics["f1"] = f1_score(y_true=labels, y_pred=preds,average=avg_type)
    metrics["acc_and_f1"] = (metrics["acc"] + metrics["f1"]) / 2
    return metrics



def pearson_and_spearman(preds, labels):
    """Returns the Pearson and Spearman correlations and the mean of the two."""
    scores = {"pearson": pearsonr(preds, labels)[0]}
    scores["spearmanr"] = spearmanr(preds, labels)[0]
    scores["corr"] = (scores["pearson"] + scores["spearmanr"]) / 2
    return scores


def compute_metrics(task_name, preds, labels):
    """Returns the metric dictionary that belongs to `task_name`.

    Raises:
        AssertionError: if `preds` and `labels` differ in length.
        KeyError: if `task_name` is not one of the known task names.
    """
    assert len(preds) == len(labels)

    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    if task_name == "sts-b":
        return pearson_and_spearman(preds, labels)
    if task_name in ("mrpc", "qqp"):
        return acc_and_f1(preds, labels)
    if task_name in ("sst-2", "mnli", "mnli-mm", "qnli", "rte", "wnli"):
        return {"acc": simple_accuracy(preds, labels)}

    raise KeyError(task_name)

In [0]:
# Registries keyed by the lower-cased task name.
processors = {"scri": ScriProcessor}

output_modes = {"scri": "classification"}




# Setup GPU parameters

if args["local_rank"] == -1 or args["no_cuda"]:
    # Single-process run: use CUDA when it is available and not disabled.
    device = torch.device("cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
    # Count GPUs only when actually running on CUDA. Counting them while
    # "no_cuda" forces the CPU would later wrap a CPU model in DataParallel.
    n_gpu = torch.cuda.device_count() if device.type == "cuda" else 0
else:
    # Distributed run: one process per GPU, selected by local_rank.
    torch.cuda.set_device(args['local_rank'])
    device = torch.device("cuda", args['local_rank'])
    n_gpu = 1
    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args['local_rank'] != -1), args['fp16']))


In [0]:
# Per-step batch size: the configured batch size divided by the number of accumulation steps.
args['train_batch_size'] = int(args['train_batch_size'] / args['gradient_accumulation_steps'])

# Seed the Python, NumPy and PyTorch generators (in that order) with the configured seed.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(args['seed'])
if n_gpu > 0:
    torch.cuda.manual_seed_all(args['seed'])

In [0]:
# Task names are looked up in lower case: the configured "SCRI" becomes "scri".
task_name = args['task_name'].lower()

# Fail early if no processor is registered for the requested task.
if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

NameError: ignored

In [0]:
# Instantiate the processor class registered for this task and look up its output mode.
processor = processors[task_name]()
output_mode = output_modes[task_name]

NameError: ignored

In [0]:
# Label names and number of classes for each of the two tasks, as reported by the processor.
task1_label_list = processor.get_task1_labels()
task1_num_labels = len(task1_label_list)
task2_label_list = processor.get_task2_labels()
task2_num_labels = len(task2_label_list)

NameError: ignored

In [0]:
# Tokens the tokenizer is asked never to split: BERT's special tokens plus the markers emitted by cleanse_text.
# NOTE(review): "<oaps>" is listed twice. Whether the custom markers exist in vocab.txt is not visible here;
# confirm they are not all mapped to [UNK].
never_split = ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]","<oaps>","<eaps>","<oprs>","<eprs>","<oaps>","<aps>","<ocase>","</ocase>","<wcase>","</wcase>",]
tokenizer = BertTokenizer.from_pretrained(args['bert_model'], do_lower_case=args['do_lower_case'],never_split=never_split)



In [0]:
# Both stay None when training is disabled.
train_examples = None
num_train_steps = None
if args['do_train']:
    train_examples = processor.get_train_examples(args['data_dir'])
#     train_examples = processor.get_train_examples(args['data_dir'], size=args['train_size'])
    # Steps per epoch (examples / batch size / accumulation steps, truncated to an int)
    # times the number of epochs. The result is a float because num_train_epochs is 6.0.
    num_train_steps = int(
        len(train_examples) / args['train_batch_size'] / args['gradient_accumulation_steps']) * args['num_train_epochs']



In [0]:

model_state_dict = None      # If you want to retrain a trained model, then model_state_dict stores the torch trained model



# Prepare model
def get_model():
    """Builds the two-head BERT classifier from the configured pretrained weights.

    When the module-level `model_state_dict` is set (truthy), it is passed on
    as `state_dict` so that training resumes from those weights.
    """
    extra_kwargs = {}
    if model_state_dict:
        extra_kwargs['state_dict'] = model_state_dict
    return BertForMultiTaskSequenceClassification.from_pretrained(
        args['bert_model'],
        task1_num_labels=task1_num_labels,
        task2_num_labels=task2_num_labels,
        cache_dir=args['cache_dir'],
        **extra_kwargs)

model = get_model()

# Optionally switch the weights to half precision, then move the model to the selected device.
if args['fp16']:
    model.half()
model.to(device)
# local_rank != -1 selects apex DistributedDataParallel; otherwise, with more than
# one GPU, the model is wrapped in torch DataParallel.
if args['local_rank'] != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [0]:

from torch.optim.lr_scheduler import _LRScheduler, Optimizer

class CyclicLR(object):
    """Cyclical learning-rate (CLR) policy, updated once per training batch.

    The learning rate of every parameter group oscillates between a lower
    bound (``base_lr``) and an upper bound (``max_lr``) with a fixed period,
    following `Cyclical Learning Rates for Training Neural Networks`_. The
    amplitude of the oscillation can additionally be scaled either per cycle
    or per iteration.

    Call :meth:`batch_step` after each batch has been used for training. To
    resume training, keep ``last_batch_iteration`` and pass it back to the
    constructor.

    Built-in policies (``mode``):

    ``"triangular"``
        Plain triangular cycle, no amplitude scaling.
    ``"triangular2"``
        Triangular cycle whose amplitude is halved every cycle.
    ``"exp_range"``
        Amplitude scaled by ``gamma ** iteration`` at every iteration.

    Adapted from the GitHub repository `bckenstler/CLR`_.

    Args:
        optimizer: Wrapped optimizer; only its ``param_groups`` are used.
        base_lr (float or list or tuple): Lower boundary of the cycle, one
            value per parameter group or a single value for all of them.
            Default: 0.001
        max_lr (float or list or tuple): Upper boundary of the cycle per
            parameter group. ``max_lr - base_lr`` is the cycle amplitude;
            depending on the scaling function the upper boundary may never
            actually be reached. Default: 0.006
        step_size (int): Number of training iterations in half a cycle.
            The paper suggests 2-8x the iterations in an epoch.
            Default: 2000
        mode (str): One of ``triangular``, ``triangular2``, ``exp_range``.
            Ignored when ``scale_fn`` is given. Default: 'triangular'
        gamma (float): Base of the ``exp_range`` scaling
            (``gamma ** iteration``). Default: 1.0
        scale_fn (callable): Custom single-argument scaling policy with
            ``0 <= scale_fn(x) <= 1`` for all ``x >= 0``. When given,
            ``mode`` is ignored. Default: None
        scale_mode (str): ``'cycle'`` evaluates ``scale_fn`` on the cycle
            number; any other value evaluates it on the iteration count.
            Only consulted when ``scale_fn`` is given. Default: 'cycle'
        last_batch_iteration (int): Index of the last batch. Default: -1

    Raises:
        ValueError: If a list/tuple ``base_lr`` or ``max_lr`` does not have
            one entry per parameter group, or if ``mode`` is not a built-in
            policy while ``scale_fn`` is None.

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> scheduler = CyclicLR(optimizer)
        >>> data_loader = torch.utils.data.DataLoader(...)
        >>> for epoch in range(10):
        >>>     for batch in data_loader:
        >>>         scheduler.batch_step()
        >>>         train_batch(...)

    .. _Cyclical Learning Rates for Training Neural Networks: https://arxiv.org/abs/1506.01186
    .. _bckenstler/CLR: https://github.com/bckenstler/CLR
    """

    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):
        self.optimizer = optimizer

        self.base_lrs = self._per_group(base_lr, "expected {} base_lr, got {}")
        self.max_lrs = self._per_group(max_lr, "expected {} max_lr, got {}")

        self.step_size = step_size

        # Built-in policy name -> (scaling function, what the function is fed).
        builtin_policies = {
            'triangular': (self._triangular_scale_fn, 'cycle'),
            'triangular2': (self._triangular2_scale_fn, 'cycle'),
            'exp_range': (self._exp_range_scale_fn, 'iterations'),
        }
        if scale_fn is None and mode not in list(builtin_policies):
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is not None:
            self.scale_fn, self.scale_mode = scale_fn, scale_mode
        else:
            self.scale_fn, self.scale_mode = builtin_policies[mode]

        # Push the learning rate for the upcoming iteration into the optimizer,
        # then rewind the counter so the first batch_step() lands on that same
        # iteration.
        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def _per_group(self, value, mismatch_template):
        """Expand ``value`` to one learning rate per optimizer param group.

        A list/tuple must already have one entry per group (otherwise a
        ValueError built from ``mismatch_template`` is raised); a scalar is
        repeated for every group.
        """
        group_count = len(self.optimizer.param_groups)
        if not isinstance(value, (list, tuple)):
            return [value] * group_count
        if len(value) != group_count:
            raise ValueError(mismatch_template.format(group_count, len(value)))
        return list(value)

    def batch_step(self, batch_iteration=None):
        """Advance to ``batch_iteration`` (default: next) and update the lrs."""
        if batch_iteration is None:
            self.last_batch_iteration = self.last_batch_iteration + 1
        else:
            self.last_batch_iteration = batch_iteration
        for group, new_lr in zip(self.optimizer.param_groups, self.get_lr()):
            group['lr'] = new_lr

    def _triangular_scale_fn(self, x):
        """Constant amplitude."""
        return 1.

    def _triangular2_scale_fn(self, x):
        """Amplitude halves with every cycle (``x`` is the cycle number)."""
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        """Amplitude decays as ``gamma ** x`` (``x`` is the iteration count)."""
        return self.gamma**(x)

    def get_lr(self):
        """Return the learning rates for the current iteration, one per group."""
        half_cycle = float(self.step_size)
        iteration = self.last_batch_iteration
        cycle = np.floor(1 + iteration / (2 * half_cycle))
        # Distance from the peak of the current cycle, in half-cycles.
        x = np.abs(iteration / half_cycle - 2 * cycle + 1)
        scale_input = cycle if self.scale_mode == 'cycle' else iteration

        return [
            low + (high - low) * np.maximum(0, (1 - x)) * self.scale_fn(scale_input)
            for _group, low, high in zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        ]



In [0]:
if args['do_train']:
  # Prepare optimizer. Biases and LayerNorm parameters are put in a separate
  # group that is exempt from weight decay.
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
      ]
  # Total optimizer updates seen by one process: in distributed runs the
  # global step count is split across workers.
  t_total = num_train_steps
  if args['local_rank'] != -1:
      t_total = t_total // torch.distributed.get_world_size()
  if args['fp16']:
      try:
          from apex.optimizers import FP16_Optimizer
          from apex.optimizers import FusedAdam
      except ImportError:
          raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

      optimizer = FusedAdam(optimizer_grouped_parameters,
                            lr=args['learning_rate'],
                            bias_correction=False,
                            max_grad_norm=1.0)
      # loss_scale == 0 selects dynamic loss scaling, any other value is used
      # as a static scale.
      if args['loss_scale'] == 0:
          optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
      else:
          optimizer = FP16_Optimizer(optimizer, static_loss_scale=args['loss_scale'])
      # `args` is indexed like a dict everywhere else in this notebook, so the
      # original attribute access (`args.warmup_proportion`) failed here.
      # The schedule uses the per-process total, like the BertAdam branch.
      warmup_linear = WarmupLinearSchedule(warmup=args['warmup_proportion'], t_total=t_total)
  else:

      optimizer = BertAdam(optimizer_grouped_parameters,
                           lr=args['learning_rate'],
                           warmup=args['warmup_proportion'],
                           t_total=t_total)
      print("BERT optimizer is used")

  # NOTE: constructing CyclicLR immediately writes a learning rate into every
  # optimizer param group (its __init__ calls batch_step), overriding the `lr`
  # passed to the optimizer above even if the scheduler is never stepped.
  scheduler = CyclicLR(optimizer, base_lr=2e-5, max_lr=5e-5, step_size=2500, last_batch_iteration=0)



In [0]:
def warmup_linear_self(x, warmup=0.002):
    """Learning-rate multiplier: linear ramp-up, then linear decay.

    Args:
        x: Training progress (the commented-out call site passes
            ``global_step / t_total``).
        warmup: Progress value below which the multiplier ramps up.

    Returns:
        ``x / warmup`` while ``x < warmup``, otherwise ``1.0 - x``.
    """
    return x / warmup if x < warmup else 1.0 - x

## Load Training Data

In [0]:
# Module-level training counters (fit() keeps its own local counters).
global_step = 0
nb_tr_steps = 0
tr_loss = 0

if args['do_train']:
  # The dataset built below always carries one label tensor per task, and only
  # the classification path produces those tensors. Any other mode used to
  # crash further down with a NameError, so fail early with a clear message.
  if output_mode != "classification":
    raise ValueError(
        "multi-task training expects output_mode == 'classification', got {!r}".format(output_mode))

  train_features = convert_examples_to_features(train_examples, task1_label_list, task2_label_list, args['max_seq_length'], tokenizer, output_mode)

  logger.info("***** Running training *****")
  logger.info("  Num examples = %d", len(train_examples))
  logger.info("  Batch size = %d", args['train_batch_size'])
  logger.info("  Num steps = %d", num_train_steps)

  # One tensor per model input, plus one label tensor per task.
  all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
  all_task1_label_ids = torch.tensor([f.task1_label_id for f in train_features], dtype=torch.long)
  all_task2_label_ids = torch.tensor([f.task2_label_id for f in train_features], dtype=torch.long)

  train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_task1_label_ids, all_task2_label_ids)
  # Shuffle locally for single-process runs; shard across workers otherwise.
  if args['local_rank'] == -1:
      train_sampler = RandomSampler(train_data)
  else:
      train_sampler = DistributedSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args['train_batch_size'])

In [0]:
# Sanity check: number of classes for each of the two task heads.
print(task1_num_labels)
task2_num_labels

## Training the model

In [0]:
!pip install livelossplot
from livelossplot import PlotLosses

In [0]:
# Eval Fn

from tqdm import tqdm_notebook,tqdm

# Batch size used by eval(); eval() reads it from `args` at call time, so
# later cells can change it before calling eval() again.
args['eval_batch_size'] = 32

# Dev split, loaded once at module level and reused by every eval() call.
eval_examples = processor.get_dev_examples(args['data_dir'])
def eval():
    """Evaluate the module-level ``model`` on ``eval_examples`` for both tasks.

    Featurizes the dev examples, runs the model batch by batch without
    gradients, and prints per-task loss, accuracy and the ``acc_and_f1``
    summary.

    Side effects:
        Creates ``args['output_dir']`` if it does not exist, and leaves the
        model in evaluation mode (``model.eval()``); callers that continue
        training must switch back with ``model.train()``.

    Returns:
        tuple: ``(task1_loss, task2_loss, task1_accuracy, task2_accuracy)``
        where the losses are averaged over evaluation batches.

    Raises:
        ValueError: If the evaluation set yields no batches.
    """
    # makedirs (unlike mkdir) also creates missing parent directories.
    if not os.path.exists(args['output_dir']):
        os.makedirs(args['output_dir'])

    eval_features = convert_examples_to_features(
        eval_examples, task1_label_list, task2_label_list, args['max_seq_length'], tokenizer, output_mode=output_mode)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_task1_label_ids = torch.tensor([f.task1_label_id for f in eval_features], dtype=torch.long)
    all_task2_label_ids = torch.tensor([f.task2_label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_task1_label_ids, all_task2_label_ids)
    # Run prediction for full data, in dataset order.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # Per-batch results are collected in lists and concatenated once at the
    # end; re-concatenating a growing array every batch is quadratic.
    task1_pred_batches, task2_pred_batches = [], []
    task1_gold_batches, task2_gold_batches = [], []

    model.eval()
    eval_task1_loss, eval_task2_loss = 0, 0
    nb_eval_steps = 0
    for input_ids, input_mask, segment_ids, task1_label_ids, task2_label_ids in tqdm_notebook(eval_dataloader, desc="evaluating:"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        task1_label_ids = task1_label_ids.to(device)
        task2_label_ids = task2_label_ids.to(device)

        with torch.no_grad():
            # Called with labels, the model output is unpacked as the two task
            # losses; called without labels, as the two logit tensors.
            tmp_task1_eval_loss, tmp_task2_eval_loss = model(input_ids, segment_ids, input_mask, task1_label_ids, task2_label_ids)
            task1_logits, task2_logits = model(input_ids, segment_ids, input_mask)

        # Keep only the predicted class index (argmax over the class axis).
        task1_pred_batches.append(torch.max(task1_logits, 1)[1].detach().cpu().numpy())
        task2_pred_batches.append(torch.max(task2_logits, 1)[1].detach().cpu().numpy())
        task1_gold_batches.append(task1_label_ids.detach().cpu().numpy())
        task2_gold_batches.append(task2_label_ids.detach().cpu().numpy())

        eval_task1_loss += tmp_task1_eval_loss.item()
        eval_task2_loss += tmp_task2_eval_loss.item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # Previously surfaced as an opaque ZeroDivisionError below.
        raise ValueError("eval(): the evaluation set is empty, nothing to evaluate")

    all_task1_preds = np.concatenate(task1_pred_batches, axis=0)
    all_task2_preds = np.concatenate(task2_pred_batches, axis=0)
    all_task1_labels = np.concatenate(task1_gold_batches, axis=0)
    all_task2_labels = np.concatenate(task2_gold_batches, axis=0)

    eval_task1_loss = eval_task1_loss / nb_eval_steps
    eval_task2_loss = eval_task2_loss / nb_eval_steps
    eval_task1_accuracy = simple_accuracy(all_task1_preds, all_task1_labels)
    eval_task1_acf1 = acc_and_f1(all_task1_preds, all_task1_labels)

    eval_task2_accuracy = simple_accuracy(all_task2_preds, all_task2_labels)
    eval_task2_acf1 = acc_and_f1(all_task2_preds, all_task2_labels)

    print("From evaluation of Task 1 (Risk type),",eval_task1_loss, eval_task1_accuracy)
    print(" Eval Task 1 acc and f1 and acc+f1/2 is ", eval_task1_acf1)
    print("From evaluation of Task 2 (Criticality type),",eval_task2_loss, eval_task2_accuracy)
    print(" Eval Task 2 acc and f1 and acc+f1/2 is ", eval_task2_acf1)

    return eval_task1_loss, eval_task2_loss, eval_task1_accuracy, eval_task2_accuracy
    
# #     ROC-AUC calculation
#     # Compute ROC curve and ROC area for each class
#     fpr = dict()
#     tpr = dict()
#     roc_auc = dict()
    
#     for i in range(num_labels):
#         fpr[i], tpr[i], _ = roc_curve(all_labels[:, i], all_logits[:, i])
#         roc_auc[i] = auc(fpr[i], tpr[i])
        
#     # Compute micro-average ROC curve and ROC area
#     fpr["micro"], tpr["micro"], _ = roc_curve(all_labels.ravel(), all_logits.ravel())
#     roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

#     result = {'eval_loss': eval_loss,
#               'eval_accuracy': eval_accuracy,
# #               'loss': tr_loss/nb_tr_steps,
#               'roc_auc': roc_auc  }
#     print("From evaluation (Otherwise same as train),",result)
#     output_eval_file = os.path.join(args['output_dir'], "eval_results.txt")
#     with open(output_eval_file, "w") as writer:
#         logger.info("***** Eval results *****")
#         for key in sorted(result.keys()):
#             logger.info("  %s = %s", key, str(result[key]))
#             writer.write("%s = %s\n" % (key, str(result[key])))
            
#     return result

In [0]:
# Evaluate the current model on the dev set (this cell sits before the first
# fit() call in the notebook). Note that eval() leaves the model in eval mode.
eval()

In [0]:


def fit(num_epocs=args['num_train_epochs']):
    """Train the module-level ``model`` and evaluate after every epoch.

    Uses the module-level ``train_dataloader``, ``optimizer``, ``device``,
    ``args`` and, on the fp16 path, ``warmup_linear``. The loss is the sum of
    the two per-task cross-entropy losses. After each epoch ``eval()`` is run
    and training/validation metrics are pushed to a live loss plot.

    Args:
        num_epocs: Number of epochs to run. The default is bound to
            ``args['num_train_epochs']`` when this function is defined, so
            later changes to ``args`` or to a global of the same name have no
            effect unless the value is passed explicitly.

    Raises:
        ValueError: If ``output_mode`` is not ``"classification"``; the
            multi-task loss below is only defined for classification.
    """
    if output_mode != "classification":
        # The former regression branch referenced undefined names and could
        # only fail with a NameError.
        raise ValueError(
            "fit() expects output_mode == 'classification', got {!r}".format(output_mode))

    liveloss = PlotLosses()

    global_step = 0

    for i_ in tqdm_notebook(range(int(num_epocs)), desc="Epoch"):
        # eval() at the end of every epoch switches the model to eval mode, so
        # training mode must be re-enabled per epoch; setting it once before
        # the loop left every epoch after the first training in eval mode.
        model.train()

        logs = {} # For plotting the loss, acc live

        tr_loss = 0
        nb_tr_steps = 0
        for step, batch in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):

            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, task1_label_ids, task2_label_ids = batch

            # Ask the model for logits only and compute the losses here.
            task1_logits, task2_logits = model(input_ids, segment_ids, input_mask, task1_labels=None, task2_labels=None)

            loss_fct = CrossEntropyLoss()
            task1_loss = loss_fct(task1_logits.view(-1, task1_num_labels), task1_label_ids.view(-1))
            task2_loss = loss_fct(task2_logits.view(-1, task2_num_labels), task2_label_ids.view(-1))

            loss = task1_loss + task2_loss
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                optimizer.backward(loss)
            else:
                loss.backward()

            tr_loss += loss.item()
            nb_tr_steps += 1
            # Only update the weights once enough batches have been accumulated.
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                if args['fp16']:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args['learning_rate'] * warmup_linear.get_lr(global_step, args['warmup_proportion'])
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        logger.info('Loss after epoc {}'.format(tr_loss / nb_tr_steps))
        logger.info('Eval after epoc {}'.format(i_+1))
        print("Loss after epoc {}".format(tr_loss/nb_tr_steps))
        print('Eval after epoc {}'.format(i_+1))

        # Training curves show the losses of the last batch of the epoch.
        logs['task1_loss'] = task1_loss.item()
        logs['task2_loss'] = task2_loss.item()

        eval_task1_loss, eval_task2_loss, eval_task1_accuracy, eval_task2_accuracy = eval()
        logs['val_task1_loss'] = eval_task1_loss
        logs['val_task2_loss'] = eval_task2_loss
        logs['val_task2_acc'] = eval_task2_accuracy
        logs['val_task1_acc'] = eval_task1_accuracy
        liveloss.update(logs)
        liveloss.draw()



In [0]:

# Train with fit's default epoch count (args['num_train_epochs'] as it was
# when fit was defined).
fit()


'''
if retraining, set warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,t_total=num_train_steps) 

OR

optimizer = BertAdam(optimizer_grouped_parameters,
                           lr=args['learning_rate'],
                           warmup=args['warmup_proportion'],
                           t_total=t_total) otherwise comment it out '''

In [0]:
# eval()

In [0]:
num_epocs=4
# warmup_linear = WarmupLinearSchedule(warmup=args['warmup_proportion'],t_total=num_train_steps)
# Build a new optimizer for the additional training run. `t_total` was
# computed for args['num_train_epochs'] epochs, so it is rescaled to
# `num_epocs`. Scaling before truncating keeps the total positive when
# num_epocs is smaller than the configured epoch count (the former
# int(num_epocs / epochs) factor collapsed to 0 in that case).
optimizer = BertAdam(optimizer_grouped_parameters,
                           lr=args['learning_rate'],
                           warmup=args['warmup_proportion'],
                           t_total=int(t_total * num_epocs / args['num_train_epochs']))

# fit's default epoch count was bound when fit was defined, so the global
# `num_epocs` above is ignored unless it is passed explicitly.
fit(num_epocs)

In [0]:
# eval()

### Saving the model, configuration and tokenizer

In [0]:
# model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

# # If we save using the predefined names, we can load using `from_pretrained`
# output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
# output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

# torch.save(model_to_save.state_dict(), output_model_file)
# model_to_save.config.to_json_file(output_config_file)
# tokenizer.save_vocabulary(args.output_dir)




# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself, not a parallel wrapper
if not os.path.exists(args['cache_dir']):
    os.makedirs(args['cache_dir'])
output_model_file = os.path.join(args['cache_dir'], "finetuned_pytorch_model.bin")
torch.save(model_to_save.state_dict(), output_model_file)

# Load a trained model that you have fine-tuned.
# The weights are loaded on CPU first and moved to `device` afterwards, so the
# checkpoint can be restored regardless of the device it was saved from.
model_state_dict = torch.load(output_model_file, map_location='cpu')
# get_model() forwards the module-level `model_state_dict` and uses the same
# task1_num_labels / task2_num_labels arguments as the initial construction
# (the former `num_labels=num_labels` did not match that interface).
# NOTE(review): the reloaded model is not re-wrapped for fp16 / multi-GPU here.
model = get_model()
model.to(device)



## Complete Evaluation Mode

In [0]:
# Evaluate the model currently bound to `model` on the dev set.
eval()

In [0]:

'''TEST'''


args['eval_batch_size'] = 1

# Load a trained model that you have fine-tuned (weights are read on CPU and
# then moved to `device`).
model_state_dict = torch.load(os.path.join(args['cache_dir'], "finetuned_pytorch_model.bin"), map_location='cpu')
# get_model() forwards the module-level `model_state_dict` and uses the same
# task1_num_labels / task2_num_labels arguments as at training time (the
# former `num_labels=num_labels` did not match that interface).
model = get_model()
model.to(device)


# Must match the list used for the training tokenizer: passing `never_split`
# replaces the library default, so BERT's own special tokens are listed too.
never_split = [
    "[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]",
    "<oaps>", "<eaps>", "<oprs>", "<eprs>", "<aps>",
    "<ocase>", "</ocase>", "<wcase>", "</wcase>",
]
tokenizer = BertTokenizer.from_pretrained(args['bert_model'], do_lower_case=args['do_lower_case'], never_split=never_split)



def predict(model, path, test_filename='test.csv'):
    """Run ``model`` over the test examples found under ``path``.

    Args:
        model: Model to run; it is switched to eval mode.
        path: Location handed to the processor's ``get_test_examples``.
        test_filename: Not used anywhere in the body.

    Returns:
        A DataFrame joining each input example's ``id`` and ``comment_text``
        with one column of model output per label.

    NOTE(review): this function looks out of sync with the multi-task code
    elsewhere in the notebook. The training and eval cells pass two label
    lists to ``convert_examples_to_features`` and unpack the label-free model
    call into two logit tensors, while this body passes one label list and
    treats the model output as a single tensor. ``ScriProcessor`` and
    ``get_labels`` are not defined in the visible part of the file. Confirm
    before re-enabling the commented-out call.
    """
    predict_processor = ScriProcessor()
    label_list = predict_processor.get_labels()
    # label_map is built but never read below.
    label_map = {label : i for i, label in enumerate(label_list)}
    test_examples = predict_processor.get_test_examples(path)
    
    # Keep the raw inputs so they can be returned next to the predictions.
    input_data = [{ 'id': input_example.guid, 'comment_text': input_example.text_a } for input_example in test_examples]
    
    
    test_features = convert_examples_to_features(
        test_examples, label_list, args['max_seq_length'], tokenizer,output_mode=output_mode)
    
    logger.info("***** Running prediction *****")
    logger.info("  Num examples = %d", len(test_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    
    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)

    # No label tensors: the test set is only used for inference.
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    
    # Run prediction for full data, in dataset order so rows line up with input_data.
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args['eval_batch_size'])
    
    all_logits = None
    
    model.eval()
    # These counters are updated or initialized but never reported.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for step, batch in enumerate(tqdm_notebook(test_dataloader, desc="Prediction Iteration")):
        input_ids, input_mask, segment_ids = batch
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
#             logits = logits.sigmoid()

        # Accumulate the raw outputs of every batch along the example axis.
        if all_logits is None:
            all_logits = logits.detach().cpu().numpy()
        else:
            all_logits = np.concatenate((all_logits, logits.detach().cpu().numpy()), axis=0)
            
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    # Join inputs and outputs row by row on their positional index.
    return pd.merge(pd.DataFrame(input_data), pd.DataFrame(all_logits, columns=label_list), left_index=True, right_index=True)

In [0]:
!cp data/val.csv data/test.csv

In [0]:
# predict(model,'data')