In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import json
import ast
import subprocess
from pprint import pprint
# import spacy
# import nltk
# from nltk.corpus import stopwords

import re
import logging
import os
import random
import shutil
import sys
import time
# from wordcloud import WordCloud
import networkx as nx

from io import StringIO
import string
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from datetime import datetime
from seqeval.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm, trange

# from spacy.tokens import DocBin
from tqdm import tqdm
import spacy
from spacy.util import filter_spans

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForTokenClassification,
    BertTokenizer,
    CamembertConfig,
    CamembertForTokenClassification,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForTokenClassification,
    DistilBertTokenizer,
    RobertaConfig,
    RobertaForTokenClassification,
    RobertaTokenizer,
    XLMRobertaConfig,
    XLMRobertaForTokenClassification,
    XLMRobertaTokenizer,
    get_linear_schedule_with_warmup,
)
from torch.utils.data.distributed import DistributedSampler


from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

logger = logging.getLogger(__name__)



import argparse
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap


In [2]:
logger = logging.getLogger(__name__)

In [3]:
# Defining a parser which is required for the pipeline
def get_bert_parser():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=False,
        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )

    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: ",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=False,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
             "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    return parser


In [4]:
# To make sure that the randomness in the training can be recreated
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


In [5]:
class InputExample(object):
    """A single training/test example for token classification."""

    def __init__(self, guid, words, labels):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.words = words
        self.labels = labels

In [6]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids

In [7]:
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):

    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()


    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don"t use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating

            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(labels)}
    print(3)

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
    print("results : ",results)
    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list


In [8]:
def read_examples_from_file(data_dir, mode):
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            examples.append(InputExample(guid="%s-%d".format(mode, guid_index), words=words, labels=labels))
            
    return examples


In [9]:
def convert_examples_to_features(
    examples,
    label_list,
    max_seq_length,
    tokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            # print('word_tokens : ',word_tokens)
            # print('label : ',label)
            # print('label_map : ',label_map)

            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        features.append(
            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
        )
    return features


In [10]:
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    logger.info("Creating features from dataset file at %s", args.data_dir)
    examples = read_examples_from_file(args.data_dir, mode)
    features = convert_examples_to_features(
        examples,
        labels,
        args.max_seq_length,
        tokenizer,
        cls_token_at_end=bool(args.model_type in ["xlnet"]),
        # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(args.model_type in ["roberta"]),
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(args.model_type in ["xlnet"]),
        # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        pad_token_label_id=pad_token_label_id,
    )

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    return dataset


In [11]:
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path) and "checkpoint" in args.model_name_or_path:
        # if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don"t use segment_ids

            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step


In [12]:

class BERTmodelTrain:
    """
    Class for training educational attributes
    """
    parser = get_bert_parser()

    original_args = sys.argv
    sys.argv = []
    args = parser.parse_args()
    sys.argv = original_args

    args.model_type = 'distilbert'
    args.model_name_or_path = 'distilbert-base-cased'
    args.max_seq_length = 256
    args.num_train_epochs = 6
    args.per_gpu_train_batch_size = 4
    args.save_steps = 750
    args.seed = 1
    # args.labels = 'data/labels.txt'
    args.do_train = True

    # Setup CUDA, GPU & distributed training
    args.device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    # config_class, model_class, tokenizer_class = DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
    config_class, model_class, tokenizer_class = BertConfig, BertForTokenClassification, BertTokenizer

    # # Prepare CONLL-2003 task
    # labels =['O', 'B-LOC', 'B-PER', 'B-ORG', 'I-PER', 'I-ORG', 'B-MISC', 'I-MISC', 'I-LOC']
    # num_labels = len(labels)

    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    def __init__(self, output_dir, labels, load_model_locally=None, epochs=None):
        if epochs:
            self.args.num_train_epochs = epochs
        self.args.output_dir = output_dir
        self.labels = labels
        self.num_labels = len(self.labels)
        if load_model_locally:
            self.load_saved_model()
        else:
            # loading the pretrained model
            self._load_pretrained_model()
            
    def load_saved_model(self):
        """
        To load the model
        :return:
        """

        labels_ = self.labels
        pad_token_label_id_ = CrossEntropyLoss().ignore_index
        # model_class, tokenizer_class = (DistilBertForTokenClassification, DistilBertTokenizer)
        model_class, tokenizer_class = (BertForTokenClassification, BertTokenizer)

        # if args.do_predict and args.local_rank in [-1, 0]:
        print('\n---Loading model...')
        tm_s = time.time()

        self.tokenizer = tokenizer_class.from_pretrained(self.args.output_dir, do_lower_case=self.args.do_lower_case)
        self.model= model_class.from_pretrained(self.args.output_dir)
        self.model.to(self.args.device)

        tm_e = time.time()
        print('---Model loaded. Time taken is %f seconds...', np.round(tm_e - tm_s, 2))


    def _load_pretrained_model(self):
        print('Loading pretrained model...')
        ts = time.time()


        self.config = BertConfig.from_pretrained(
            self.args.config_name if self.args.config_name else self.args.model_name_or_path,
            num_labels=self.num_labels,
        )
        self.tokenizer = BertTokenizer.from_pretrained(
            self.args.tokenizer_name if self.args.tokenizer_name else self.args.model_name_or_path,
            do_lower_case=self.args.do_lower_case,
            cache_dir=self.args.cache_dir if self.args.cache_dir else None,
        )
        self.model = BertForTokenClassification.from_pretrained(
            self.args.model_name_or_path,
            from_tf=bool(".ckpt" in self.args.model_name_or_path),
            config=self.config,
            cache_dir=self.args.cache_dir if self.args.cache_dir else None,
        )

        self.model.to(self.args.device)

        te = time.time()
        print('\nPretrained model loaded.  Time taken is %f seconds...',np.round(te - ts, 2))

    def _train(self, data_dir, dump_model=True):
        self.args.data_dir = data_dir
        train_dataset = load_and_cache_examples(self.args, self.tokenizer, self.labels, self.pad_token_label_id,
                                                mode="train")
        train(self.args, train_dataset, self.model, self.tokenizer, self.labels, self.pad_token_label_id)

        # save and upload the model to minio
        if dump_model:
            self._save_the_model()

    def _save_the_model(self):

        if os.path.exists(self.args.output_dir) and os.listdir(self.args.output_dir):
            shutil.rmtree(self.args.output_dir)
        if not os.path.exists(self.args.output_dir):
            os.makedirs(self.args.output_dir)

        logger.info("Saving model checkpoint to %s", self.args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            self.model.module if hasattr(self.model, "module") else self.model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(self.args.output_dir)
        self.tokenizer.save_pretrained(self.args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(self.args.output_dir, "training_args.bin"))
        
    def _predict(self):
        education_output = {}
        print("""os.path.join(self.args.data_dir, 'test.txt') """,os.path.join(self.args.data_dir, 'test.txt'))
        file = open(os.path.join(self.args.data_dir, 'test.txt'), 'r')
        actual_values_text = file.read()
        file.close()

        tokens_and_labels_list = actual_values_text.split('\n')

        try:
            results, predictions = evaluate(self.args, self.model, self.tokenizer, self.labels, self.pad_token_label_id,
                                      mode='test')
            print("results : ",results)
        except Exception as error_message:
            print("In the evaluation part %s", str(error_message))

        try:
            prediction_text = ""
            sequence_id = 0
            for token_and_label in tokens_and_labels_list:
                if token_and_label.startswith("-DOCSTART-") or token_and_label == "" or token_and_label == "\n":
                    prediction_text += '\n'
                    if not predictions[sequence_id]:
                        sequence_id += 1
                elif predictions[sequence_id]:
                    output_line = token_and_label.split()[0] + " " + predictions[sequence_id].pop(0) + "\n"
                    prediction_text += output_line

                else:
                    # print("Maximum sequence length exceeded: No prediction for '%s'.",
                    #                   token_and_label.split()[0])
                    output_line = token_and_label.split()[0] + " O\n"
                    prediction_text += output_line
            prediction_text = prediction_text[:-1]
        except Exception as error_message:
            print("In prediction section", str(error_message))

        
        return actual_values_text, prediction_text




In [13]:
data_path = '/home/cibin/Desktop/exl/TD/data/DS_v1/annotated/docanno_output/batch1/batch1_247/processed/final'
save_dir = '/home/cibin/Desktop/exl/TD/models/transformers/model_batch1_distilbert'

In [14]:
train_resume_df = pd.read_csv(data_path+"/train.txt", sep=' ', header=None, names=['token','label'])
test_resume_df = pd.read_csv(data_path+"/test.txt", sep=' ', header=None, names=['token','label'])

train_resume_df

Unnamed: 0,token,label
0,Agent:,O
1,Good,O
2,morning!,O
3,This,O
4,is,O
...,...,...
27429,is,O
27430,now,O
27431,active.,O
27432,Customer:,O


In [15]:
label_list = train_resume_df.label.unique().tolist()
label_list

['O', 'Authentication']

In [16]:
train_resume_df[train_resume_df.label.isna()]

Unnamed: 0,token,label


In [17]:
# data_path = '/home/cibin/Desktop/exl/TD/data/DS_v1/annotated/docanno_output/sample_data/processed/final'
# save_dir = '/home/cibin/Desktop/exl/TD/models/transformers/sample/v1'
bert_obj = BERTmodelTrain(save_dir, labels=label_list, epochs=10)
print(bert_obj.args.num_train_epochs)

Loading pretrained model...


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.den


Pretrained model loaded.  Time taken is %f seconds... 1.46
10


In [18]:
bert_obj._train(
        data_dir=data_path)

Epoch:   0%|                                             | 0/10 [00:00<?, ?it/s]

Iteration:   2%|▋                                | 1/50 [00:00<00:32,  1.52it/s][A
Iteration:   4%|█▎                               | 2/50 [00:01<00:25,  1.88it/s][A
Iteration:   6%|█▉                               | 3/50 [00:01<00:23,  1.98it/s][A
Iteration:   8%|██▋                              | 4/50 [00:02<00:22,  2.04it/s][A
Iteration:  10%|███▎                             | 5/50 [00:02<00:21,  2.08it/s][A
Iteration:  12%|███▉                             | 6/50 [00:02<00:20,  2.10it/s][A
Iteration:  14%|████▌                            | 7/50 [00:03<00:20,  2.11it/s][A
Iteration:  16%|█████▎                           | 8/50 [00:03<00:19,  2.12it/s][A
Iteration:  18%|█████▉                           | 9/50 [00:04<00:19,  2.13it/s][A
Iteration:  20%|██████▍                         | 10/50 [00:04<00:18,  2.13it/s][A
Iteration:  22%|███████                         | 11/50 [00:05<00:18,  2.13it/

Iteration:  60%|███████████████████▏            | 30/50 [00:14<00:09,  2.09it/s][A
Iteration:  62%|███████████████████▊            | 31/50 [00:14<00:09,  2.08it/s][A
Iteration:  64%|████████████████████▍           | 32/50 [00:15<00:08,  2.09it/s][A
Iteration:  66%|█████████████████████           | 33/50 [00:15<00:08,  2.09it/s][A
Iteration:  68%|█████████████████████▊          | 34/50 [00:16<00:07,  2.08it/s][A
Iteration:  70%|██████████████████████▍         | 35/50 [00:16<00:07,  2.08it/s][A
Iteration:  72%|███████████████████████         | 36/50 [00:17<00:06,  2.08it/s][A
Iteration:  74%|███████████████████████▋        | 37/50 [00:17<00:06,  2.09it/s][A
Iteration:  76%|████████████████████████▎       | 38/50 [00:18<00:05,  2.08it/s][A
Iteration:  78%|████████████████████████▉       | 39/50 [00:18<00:05,  2.09it/s][A
Iteration:  80%|█████████████████████████▌      | 40/50 [00:19<00:04,  2.09it/s][A
Iteration:  82%|██████████████████████████▏     | 41/50 [00:19<00:04,  2.09i

Iteration:  46%|██████████████▋                 | 23/50 [00:11<00:13,  2.07it/s][A
Iteration:  48%|███████████████▎                | 24/50 [00:11<00:12,  2.07it/s][A
Iteration:  50%|████████████████                | 25/50 [00:12<00:12,  2.06it/s][A
Iteration:  52%|████████████████▋               | 26/50 [00:12<00:11,  2.07it/s][A
Iteration:  54%|█████████████████▎              | 27/50 [00:13<00:11,  2.07it/s][A
Iteration:  56%|█████████████████▉              | 28/50 [00:13<00:10,  2.07it/s][A
Iteration:  58%|██████████████████▌             | 29/50 [00:14<00:10,  2.07it/s][A
Iteration:  60%|███████████████████▏            | 30/50 [00:14<00:09,  2.07it/s][A
Iteration:  62%|███████████████████▊            | 31/50 [00:14<00:09,  2.07it/s][A
Iteration:  64%|████████████████████▍           | 32/50 [00:15<00:08,  2.08it/s][A
Iteration:  66%|█████████████████████           | 33/50 [00:15<00:08,  2.08it/s][A
Iteration:  68%|█████████████████████▊          | 34/50 [00:16<00:07,  2.07i

Iteration:  32%|██████████▏                     | 16/50 [00:07<00:16,  2.06it/s][A
Iteration:  34%|██████████▉                     | 17/50 [00:08<00:15,  2.06it/s][A
Iteration:  36%|███████████▌                    | 18/50 [00:08<00:15,  2.07it/s][A
Iteration:  38%|████████████▏                   | 19/50 [00:09<00:15,  2.06it/s][A
Iteration:  40%|████████████▊                   | 20/50 [00:09<00:14,  2.06it/s][A
Iteration:  42%|█████████████▍                  | 21/50 [00:10<00:14,  2.07it/s][A
Iteration:  44%|██████████████                  | 22/50 [00:10<00:13,  2.07it/s][A
Iteration:  46%|██████████████▋                 | 23/50 [00:11<00:13,  2.07it/s][A
Iteration:  48%|███████████████▎                | 24/50 [00:11<00:12,  2.08it/s][A
Iteration:  50%|████████████████                | 25/50 [00:12<00:12,  2.08it/s][A
Iteration:  52%|████████████████▋               | 26/50 [00:12<00:11,  2.07it/s][A
Iteration:  54%|█████████████████▎              | 27/50 [00:13<00:11,  2.07i

Iteration:  18%|█████▉                           | 9/50 [00:04<00:19,  2.07it/s][A
Iteration:  20%|██████▍                         | 10/50 [00:04<00:19,  2.06it/s][A
Iteration:  22%|███████                         | 11/50 [00:05<00:18,  2.06it/s][A
Iteration:  24%|███████▋                        | 12/50 [00:05<00:18,  2.06it/s][A
Iteration:  26%|████████▎                       | 13/50 [00:06<00:17,  2.06it/s][A
Iteration:  28%|████████▉                       | 14/50 [00:06<00:17,  2.06it/s][A
Iteration:  30%|█████████▌                      | 15/50 [00:07<00:17,  2.06it/s][A
Iteration:  32%|██████████▏                     | 16/50 [00:07<00:16,  2.06it/s][A
Iteration:  34%|██████████▉                     | 17/50 [00:08<00:15,  2.06it/s][A
Iteration:  36%|███████████▌                    | 18/50 [00:08<00:15,  2.07it/s][A
Iteration:  38%|████████████▏                   | 19/50 [00:09<00:15,  2.06it/s][A
Iteration:  40%|████████████▊                   | 20/50 [00:09<00:14,  2.06i

Iteration:   4%|█▎                               | 2/50 [00:00<00:23,  2.07it/s][A
Iteration:   6%|█▉                               | 3/50 [00:01<00:22,  2.07it/s][A
Iteration:   8%|██▋                              | 4/50 [00:01<00:22,  2.06it/s][A
Iteration:  10%|███▎                             | 5/50 [00:02<00:21,  2.06it/s][A
Iteration:  12%|███▉                             | 6/50 [00:02<00:21,  2.06it/s][A
Iteration:  14%|████▌                            | 7/50 [00:03<00:20,  2.06it/s][A
Iteration:  16%|█████▎                           | 8/50 [00:03<00:20,  2.05it/s][A
Iteration:  18%|█████▉                           | 9/50 [00:04<00:19,  2.06it/s][A
Iteration:  20%|██████▍                         | 10/50 [00:04<00:19,  2.06it/s][A
Iteration:  22%|███████                         | 11/50 [00:05<00:18,  2.06it/s][A
Iteration:  24%|███████▋                        | 12/50 [00:05<00:18,  2.05it/s][A
Iteration:  26%|████████▎                       | 13/50 [00:06<00:17,  2.06i

In [19]:
actual_values, predictions = bert_obj._predict()

os.path.join(self.args.data_dir, 'test.txt')  /home/cibin/Desktop/exl/TD/data/DS_v1/annotated/docanno_output/batch1/batch1_247/processed/final/test.txt


Evaluating: 100%|█████████████████████████████████| 7/7 [00:02<00:00,  3.29it/s]

3
results :  {'loss': 0.1978007054754666, 'precision': 0.049019607843137254, 'recall': 0.2127659574468085, 'f1': 0.07968127490039839}
results :  {'loss': 0.1978007054754666, 'precision': 0.049019607843137254, 'recall': 0.2127659574468085, 'f1': 0.07968127490039839}





In [20]:
predictions_df = pd.read_csv(StringIO(predictions), sep=' ', names=['token','label'])
predictions_df

Unnamed: 0,token,label
0,Agent:,O
1,Maple,O
2,Bank's,O
3,Retirement,O
4,"Planning,",O
...,...,...
6679,scam.,O
6680,Customer:,O
6681,Good,O
6682,to,Authentication


In [21]:
predictions_df.to_csv("distil_predictions.csv", index=None)

In [22]:
from IPython.display import HTML

def html_ner_visualization(predictions_df):
    html = []
    current_speaker = None
    
    for _, row in predictions_df.iterrows():
        token = row['token']
        label = row['label']
        
        if token.endswith(':'):
            if current_speaker:
                html.append('</div>')
            current_speaker = token
            html.append(f'<div><strong>{token}</strong> ')
        else:
            if label != 'O':
                html.append(f'<span style="background-color: #ffcccc; border-radius: 3px;" title="{label}">{token}</span> ')
            else:
                html.append(f'{token} ')
    
    if current_speaker:
        html.append('</div>')
    
    display(HTML(''.join(html)))

html_ner_visualization(predictions_df.head(400))