In [1]:
from google.colab import drive
drive.mount('/content/drive')
#!ls "/content/drive/My Drive/collab_sandbox"
%cd drive/MyDrive/collab_sandbox/NER/ner_tenses_recognition/
!ls

Mounted at /content/drive
/content/drive/MyDrive/collab_sandbox/NER/ner_tenses_recognition
advanced_tutorial_crf_lstm.ipynb  NERDA_models
best-val-model-86.pt		  NER_SANDBOX.ipynb
catalyst_ner			  NER_tenses_pytorch_lighting_catalyst
datasets			  ner_tenses_recognition_conll_type.ipynb
index.html			  ner-test.csv
logs				  results
my_nerda			  spacy_ner
my_pure_pytorch			  spacy_recognition
nerda_copy_library		  Token_classification.ipynb
NERDA.ipynb


In [None]:
!pip install 'transformers<=3.5.1' 'torch<=1.7.1' pytorch-crf

In [39]:
# Scratch check: broadcast one vector per sentence over all 128 positions and
# concatenate it with the per-token features along the hidden dimension.
import torch  # imported here so the cell also runs on a fresh kernel

a = torch.randn(4, 128, 768) 
b = torch.randn(4, 768).repeat(128, 1, 1).permute(1, 0, 2)
torch.cat([a, b], dim=2).shape

torch.Size([4, 128, 1536])

In [64]:
import torch
import sklearn
import warnings
import transformers
# import sklearn.preprocessing
import numpy as np
from sklearn import preprocessing
from transformers import AdamW, get_linear_schedule_with_warmup
import random
from tqdm.notebook import tqdm
import torch.nn as nn
from transformers import AutoConfig

import pandas as pd
import os
import sys
# import sklearn.preprocessing
from transformers import AutoModel, AutoTokenizer, AutoConfig
from typing import List
from typing import List
from sklearn.metrics import precision_recall_fscore_support
import warnings
from tqdm import tqdm_notebook
# from catalyst.contrib.nn.optimizers import RAdam, Lookahead, QHAdamW, AdamP
from torchcrf import CRF

class NERDADataSetReader():
    """Map-style dataset that converts word-level sentences and tags into fixed-length tensors.

    Every item is a dict of ``torch.long`` tensors, each of length ``max_len``:
    ``input_ids``, ``masks``, ``token_type_ids``, ``target_tags`` and ``offsets``.
    ``offsets`` is 1 on the first wordpiece of each word and on the two
    special-token positions, and 0 on continuation wordpieces and padding.

    Args:
        sentences: sequence of sentences, each a sequence of word strings.
        tags: sequence of tag sequences aligned word-by-word with ``sentences``.
        transformer_tokenizer: object providing ``tokenize(word)`` and ``encode(tokens)``.
        transformer_config: object providing ``pad_token_id``.
        tag_encoder: fitted encoder providing ``transform``.
        tag_outside: label of the "outside" tag; also used for special tokens and padding.
        max_len: total sequence length, special tokens included (default 128).
    """
    def __init__(self, 
                sentences, 
                tags, 
                transformer_tokenizer, 
                transformer_config, 
                tag_encoder, 
                tag_outside,
                max_len = 128
            ):
        self.sentences = sentences
        self.tags = tags
        self.transformer_tokenizer = transformer_tokenizer
        self.max_len = max_len
        self.tag_encoder = tag_encoder
        self.pad_token_id = transformer_config.pad_token_id
        self.tag_outside_transformed = tag_encoder.transform([tag_outside])[0]
    
    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, item):
        """Tokenize sentence ``item`` and return its padded tensors.

        Raises:
            AssertionError: if the sentence and its tag sequence differ in length.
        """
        sentence = self.sentences[item]
        tags = self.tags[item]
        # encode tags
        tags = self.tag_encoder.transform(tags)
        
        # check inputs for consistency
        assert len(sentence) == len(tags)

        target_tags = []
        tokens = []
        offsets = []

        # Fallback piece for words the tokenizer reduces to nothing (e.g. pure whitespace).
        unk_token = getattr(self.transformer_tokenizer, 'unk_token', None)
        
        for i, word in enumerate(sentence):
            # wordpiece tokenization of a single word
            wordpieces = self.transformer_tokenizer.tokenize(word)
            if not wordpieces:
                # An empty result used to add an offset entry without a token,
                # leaving `offsets` longer than the other tensors. Keep the word
                # aligned by standing in the unknown token, or skip it entirely
                # when the tokenizer has none.
                if unk_token is None:
                    continue
                wordpieces = [unk_token]
            tokens.extend(wordpieces)
            # 1 marks the first piece of the word, 0 its continuation pieces
            offsets.extend([1]+[0]*(len(wordpieces)-1))
            # Extends the ner_tag if the word has been split by the wordpiece tokenizer
            target_tags.extend([tags[i]] * len(wordpieces)) 
        
        # Make room for the two special tokens that are added around the sentence;
        # max_len includes _all_ tokens.
        if len(tokens) > self.max_len - 2:
            msg = f'Sentence #{item} length {len(tokens)} exceeds max_len {self.max_len} and has been truncated'
            warnings.warn(msg)
        tokens = tokens[:self.max_len - 2] 
        target_tags = target_tags[:self.max_len - 2]
        offsets = offsets[:self.max_len - 2]

        # encode tokens for the transformer
        # (presumably `encode` adds one leading and one trailing special token — the
        # tags and offsets below are wrapped on that assumption)
        input_ids = self.transformer_tokenizer.encode(tokens)
        
        # fill out other inputs for model.    
        target_tags = [self.tag_outside_transformed] + target_tags + [self.tag_outside_transformed] 
        masks = [1] * len(input_ids)
        # set to 0, because we are not doing NSP or QA type task (across multiple sentences)
        # token_type_ids distinguishes sentences.
        token_type_ids = [0] * len(input_ids) 
        offsets = [1] + offsets + [1]

        # Padding to max length 
        padding_len = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.pad_token_id] * padding_len)
        masks = masks + ([0] * padding_len)  
        offsets = offsets + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_tags = target_tags + ([self.tag_outside_transformed] * padding_len)  

        return {'input_ids' : torch.tensor(input_ids, dtype = torch.long),
                'masks' : torch.tensor(masks, dtype = torch.long),
                'token_type_ids' : torch.tensor(token_type_ids, dtype = torch.long),
                'target_tags' : torch.tensor(target_tags, dtype = torch.long),
                'offsets': torch.tensor(offsets, dtype = torch.long)} 
      

def flatten(l):
    """Concatenate the elements of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def compute_f1_scores(y_pred, y_true, labels, average=None): 
    """Compute precision / recall / F-score / support over flattened tag sequences.

    Args:
        y_pred: list of predicted tag sequences, one per sentence.
        y_true: list of observed tag sequences, one per sentence.
        labels: labels passed through to ``precision_recall_fscore_support``.
        average: averaging mode passed through to ``precision_recall_fscore_support``.

    Returns:
        The value returned by ``precision_recall_fscore_support``.

    Raises:
        AssertionError: if any predicted sequence is longer than its observed one.
    """
    # check inputs.
    assert sum([len(t) < len(p) for t, p in zip(y_true, y_pred)]) == 0, "Length of predictions must not exceed length of observed values"

    # check, if some lengths of observed values exceed predicted values.
    n_exceeds = sum([len(t) > len(p) for t, p in zip(y_true, y_pred)])
    if n_exceeds > 0:
        # Previously this count was computed and dropped, so scores were silently
        # based on truncated observations.
        warnings.warn(f'Length of observed values exceeded length of predicted values '
                      f'in {n_exceeds} cases; observed values were truncated to match.')

    # truncate observed values dimensions to match predicted values,
    # this is needed if predictions have been truncated earlier in 
    # the flow.
    y_true = [t[:len(p)] for t, p in zip(y_true, y_pred)]
    
    y_pred = flatten(y_pred)
    y_true = flatten(y_true) 

    f1_scores = precision_recall_fscore_support(y_true = y_true,
                                                y_pred = y_pred,
                                                labels = labels,
                                                average=average) 

    return f1_scores



class NERDANetwork(nn.Module):
    """Token-classification head: transformer -> dropout -> linear layer over tags.

    Args:
        transformer: the encoder module; called with ``input_ids`` and
            ``attention_mask`` and indexed with ``[0]`` for the per-token states.
        device: device the inputs are moved to before the transformer is called.
        n_tags: size of the output (tag) dimension.
        dropout: dropout probability applied to the transformer output.
    """
    def __init__(self, transformer, device, n_tags, dropout = 0.1):
        super(NERDANetwork, self).__init__()
        # Read the hidden size from the config the model already carries; only fall
        # back to loading it by name when the model exposes no `config`.
        transformer_config = getattr(transformer, 'config', None)
        if transformer_config is None:
            transformer_config = AutoConfig.from_pretrained(transformer.name_or_path)
        self.transformer = transformer
        self.dropout = nn.Dropout(dropout)
        self.tags = nn.Linear(transformer_config.hidden_size, n_tags)
        self.device = device

    def forward(self, 
                input_ids: torch.Tensor, 
                masks: torch.Tensor, 
                token_type_ids: torch.Tensor, 
                target_tags: torch.Tensor, 
                offsets: torch.Tensor 
                ):
        """Return per-token tag scores.

        ``token_type_ids``, ``target_tags`` and ``offsets`` are accepted so a whole
        batch dict can be splatted into the call (``network(**batch)``); they are
        not used in the computation.
        """
        transformer_inputs = {
            'input_ids': input_ids.to(self.device),
            'attention_mask': masks.to(self.device),
            }
        # first element of the transformer output: the per-token hidden states
        outputs = self.transformer(**transformer_inputs)[0]
        outputs = self.dropout(outputs)
        outputs = self.tags(outputs)
        return outputs




class NERDA:
    """Fine-tunes a transformer token classifier and evaluates / predicts with it.

    Args:
        model_name: name passed to the ``from_pretrained`` loaders.
        tag_scheme: iterable of tag labels, excluding the outside tag.
        tag_outside: label of the "outside" tag.
        dataset_training: dict with ``'sentences'`` and ``'tags'`` for training.
        dataset_validation: dict with ``'sentences'`` and ``'tags'`` for validation.
        max_len: sequence length (special tokens included) used by the data readers.
        network: class used to build the network; called as
            ``network(transformer, device, n_tags, dropout=...)``.
        train_batch_size: batch size of every data loader built by this object.
        dropout: dropout probability handed to the network.
        grad_clip: stored as ``self.clip``.
        optimizer_class: optimizer constructor, called as ``optimizer_class(params, lr=...)``.
    """
    def __init__(self, 
                 model_name = 'roberta-base',
                 tag_scheme = None,
                 tag_outside = 'O',
                 dataset_training = None,
                 dataset_validation = None,
                 max_len = 128,
                 network = NERDANetwork,
                 train_batch_size = 4,
                 dropout = 0.1,
                 grad_clip = 1,
                 optimizer_class = AdamW,
                 ):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Tolerate the default `tag_scheme=None` instead of failing on list concatenation.
        tag_scheme = list(tag_scheme) if tag_scheme is not None else []
        self.tag_scheme = tag_scheme
        self.tag_outside = tag_outside
        self.model_name = model_name  
        self.dataset_training = dataset_training
        self.dataset_validation = dataset_validation
        self.max_len = max_len

        # fit encoder to _all_ possible tags.
        tag_complete = [tag_outside] + tag_scheme
        self.tag_encoder = sklearn.preprocessing.LabelEncoder()
        self.tag_encoder.fit(tag_complete)
        self.n_tags = self.tag_encoder.classes_.shape[0]

        self.transformer_model = AutoModel.from_pretrained(model_name)
        self.transformer_tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                                                   do_lower_case=True
                                                                   )
        self.transformer_config = AutoConfig.from_pretrained(model_name)  
        # Honour the `network` argument (it used to be ignored) and size the output
        # layer from the encoder's classes so it always matches `compute_loss`.
        self.network = network(self.transformer_model, self.device, self.n_tags, dropout = dropout)
        self.network.to(self.device)

        self.train_losses = []
        self.valid_loss = np.nan
        self.train_batch_size = train_batch_size
        self.dl_train = None
        self.dl_validate = None
        self.scheduler = None
        self.optimizer = None
        self.criterion = torch.nn.CrossEntropyLoss()
        # NOTE(review): `clip` is stored but gradient clipping is not applied in `train`.
        self.clip = grad_clip
        self.optimizer_class = optimizer_class
        # CRF layer sized to the tag set (was hard-coded to 51); not used by `compute_loss`.
        self.model_crf = CRF(self.n_tags)
        self.model_crf.to(self.device)

    def experiment(self, 
                    epochs = 10,
                    warmup_steps = 300,
                    learning_rate = 5e-5,
                   ):
        """Train for ``epochs`` epochs, validating after each one.

        The weights of the epoch with the lowest validation loss are restored
        into ``self.network`` before returning.

        Returns:
            The string ``"Model trained successfully"``.
        """
        self.setup()
        self.n_tags = self.tag_encoder.classes_.shape[0]

        optimizer_parameters = self.network.parameters()

        # One scheduler step is taken per batch, so count batches (the last, partial
        # batch included) rather than dividing the sentence count by the batch size.
        num_train_steps = len(self.dl_train) * epochs
        
        self.optimizer = self.optimizer_class(optimizer_parameters, lr = learning_rate)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_train_steps
        )

        train_losses = []
        best_valid_loss = float('inf')
        best_parameters = None

        for epoch in range(epochs):
            
            print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

            train_loss = self.train()
            train_losses.append(train_loss)
            valid_loss = self.validate()

            print(f"Train Loss = {train_loss} Valid Loss = {valid_loss}")

            if valid_loss < best_valid_loss:
                # `state_dict()` returns references to the live tensors, so copy them
                # (to CPU, to spare GPU memory) or later epochs overwrite the snapshot.
                best_parameters = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in self.network.state_dict().items()
                }
                best_valid_loss = valid_loss
        print(f"Best val => {best_valid_loss}")

        if best_parameters is not None:
            self.network.load_state_dict(best_parameters)
        self.train_losses = train_losses
        self.valid_loss = best_valid_loss

        return "Model trained successfully"

    def setup(self):
        """Build the training and validation data loaders from the stored datasets."""
        self.dl_train = self.create_dataloader(sentences = self.dataset_training.get('sentences'),
                                               tags = self.dataset_training.get('tags'))
        self.dl_validate = self.create_dataloader(sentences = self.dataset_validation.get('sentences'), 
                                                  tags = self.dataset_validation.get('tags'))

    def create_dataloader(self, sentences, tags):
        """Wrap ``sentences`` / ``tags`` in a ``NERDADataSetReader`` and a ``DataLoader``."""
        data_reader = NERDADataSetReader(
            sentences = sentences, 
            tags = tags,
            transformer_tokenizer = self.transformer_tokenizer, 
            transformer_config = self.transformer_config,
            tag_encoder = self.tag_encoder,
            tag_outside = self.tag_outside)
        # The reader is built with its own default length; apply the length
        # configured on this object so `max_len` is no longer ignored.
        data_reader.max_len = self.max_len

        data_loader = torch.utils.data.DataLoader(
            data_reader, batch_size = self.train_batch_size, num_workers = 2
        )

        return data_loader

    def train(self,):
        """Run one training epoch and return the mean loss per batch."""
        self.network.train()    
        final_loss = 0.0
        
        for dl in tqdm(self.dl_train, total=len(self.dl_train)):

            self.optimizer.zero_grad()
            outputs = self.network(**dl)
            loss = self.compute_loss(outputs, 
                                     dl.get('target_tags'),
                                     dl.get('masks'), 
                                     )
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
            final_loss += loss.item()

        return final_loss / len(self.dl_train)

    def validate(self):
        """Return the mean loss per batch over the validation loader."""
        self.network.eval()
        final_loss = 0.0

        # No gradients are needed here; skipping them avoids building the graph.
        with torch.no_grad():
            for dl in tqdm(self.dl_validate, total=len(self.dl_validate)):
                outputs = self.network(**dl)
                loss = self.compute_loss(outputs, 
                                         dl.get('target_tags'),
                                         dl.get('masks'), 
                                         )
                final_loss += loss.item()
        
        return final_loss / len(self.dl_validate)   

    def compute_loss(self, preds, target_tags, masks,):
        """Cross-entropy over the positions where ``masks`` equals 1.

        Positions with any other mask value get the criterion's ``ignore_index``
        as label and therefore do not contribute to the loss.
        """
        active_loss = masks.view(-1) == 1

        active_logits = preds.view(-1, self.n_tags)
        active_labels = torch.where(
            active_loss,
            target_tags.view(-1),
            torch.tensor(self.criterion.ignore_index).type_as(target_tags)
        )
        # labels must live on the same device as the logits
        active_labels = torch.as_tensor(active_labels, device = preds.device, dtype = torch.long)
        loss = self.criterion(active_logits, active_labels)
        return loss

    def evaluate_performance(self, dataset):
        """Predict on ``dataset['sentences']`` and tabulate F1 against ``dataset['tags']``.

        Returns:
            DataFrame with columns ``Level`` and ``F1-Score``: one row per tag in
            ``self.tag_scheme`` followed by ``AVG_MICRO`` and ``AVG_MACRO`` rows.
        """
        tags_predicted = self.predict(dataset.get('sentences'))
        tags_observed = dataset.get('tags')
        
        # per-tag scores; index 2 of the result is the F-score
        f1 = compute_f1_scores(y_pred = tags_predicted, 
                               y_true = tags_observed,
                               labels = self.tag_scheme,
                               average = None
                               )
        frames = [pd.DataFrame(list(zip(self.tag_scheme, f1[2])), columns = ['Level', 'F1-Score'])]

        # MICRO- and MACRO-averaged scores are added as extra rows.
        for level, average in (('AVG_MICRO', 'micro'), ('AVG_MACRO', 'macro')):
            f1_avg = compute_f1_scores(y_pred = tags_predicted, 
                                       y_true = tags_observed,
                                       labels = self.tag_scheme,
                                       average = average)
            frames.append(pd.DataFrame({'Level' : [level], 'F1-Score': [f1_avg[2]]}))

        # `DataFrame.append` no longer exists in pandas 2; `concat` keeps the same rows and index.
        return pd.concat(frames)

    def predict(self, sentences):
        """Predict one tag per word for every sentence.

        Args:
            sentences: list of sentences, each a list of word strings.

        Returns:
            List with one list of tag labels per sentence. Only the prediction at the
            first wordpiece of each word is kept, and the two special-token positions
            are dropped.
        """
        self.network.eval()

        # the reader requires tags, so feed it a dummy tag for every word
        tag_fill = [self.tag_encoder.classes_[0]]
        tags_dummy = [tag_fill * len(sent) for sent in sentences]
        
        dl_test = self.create_dataloader(sentences=sentences, tags = tags_dummy)

        predictions = []
        with torch.no_grad():
            for dl in dl_test: 
                outputs = self.network(**dl)   
                batch_offsets = dl.get('offsets')
                for row in range(len(outputs)):
                    preds = self.tag_encoder.inverse_transform(
                        outputs[row].argmax(-1).cpu().numpy()
                    )
                    # keep positions flagged by `offsets` (word starts and special tokens)
                    preds = [prediction
                             for prediction, offset in zip(preds.tolist(), batch_offsets[row].tolist())
                             if offset]
                    # strip the two special-token positions
                    preds = preds[1:-1]
                    predictions.append(preds)

        return predictions

In [None]:
# model.evaluate_performance(get_conll_data('test'))

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, random_split, Dataset
import numpy as np
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

SEED = 1234

# Seed every random number generator the notebook relies on with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN for deterministic algorithms.
torch.backends.cudnn.deterministic = True


full_dataset = pd.read_csv('./datasets/combined_24_02_2021_conll2003.csv', encoding='utf-8') 

# Shuffle once with a fixed seed, then cut at 70 % and 85 % into train / valid / test.
# Plain positional slicing replaces `np.split`, which goes through a deprecated
# pandas code path when handed a DataFrame.
shuffled_dataset = full_dataset.sample(frac=1, random_state=42)
train_end = int(.7*len(full_dataset))
valid_end = int(.85*len(full_dataset))
train = shuffled_dataset.iloc[:train_end]
valid = shuffled_dataset.iloc[train_end:valid_end]
test = shuffled_dataset.iloc[valid_end:]
assert len(train) + len(valid) + len(test) == len(full_dataset)


def split_column(df, field):
    """Return column ``field`` of ``df`` as a list of whitespace-split token lists."""
    return [str(item).split() for item in df[field]]


# Both names are kept for existing callers; they were two identical lambdas.
get_sent_data = split_column
get_label_data = split_column

train_data, valid_data, test_data = (
    {
        'sentences': get_sent_data(split, 'sent'), 
        'tags': get_label_data(split, 'named_entity')
    }
    for split in (train, valid, test)
)

# All tags except the outside tag "O", in the order they are reported.
tag_scheme = [
    # beginning-of-entity tags
    'B-a1_be_have_do_in_the_past',
    'B-a1_can',
    'B-a1_comparative_exept',
    'B-a1_comparative_long',
    'B-a1_comparative_short',
    'B-a1_future_simple',
    'B-a1_have_has_got',
    'B-a1_past_simple_irreg',
    'B-a1_past_simple_reg',
    'B-a1_possesive_s_sing',
    'B-a1_possessive_s_plurar',
    'B-a1_present_continuous_act_rn',
    'B-a1_present_simple_3d_pers',
    'B-a1_present_simple_reg_act',
    'B-a1_special_questions',
    'B-a1_superlative_exept',
    'B-a1_superlative_long',
    'B-a1_superlative_short',
    'B-a1_there_is_am_are',
    'B-a1_there_was_were',
    'B-a1_there_will_be',
    'B-a1_to_be_future_will_be',
    'B-a1_to_be_past_was_were',
    'B-a1_to_be_present_is_am_are',
    'B-a1_want_would_like_to',
    # inside
    'I-a1_can',
    'I-a1_comparative_exept',
    'I-a1_comparative_long',
    'I-a1_comparative_short',
    'I-a1_future_simple',
    'I-a1_have_has_got',
    'I-a1_past_simple_irreg',
    'I-a1_past_simple_reg',
    'I-a1_possesive_s_sing',
    'I-a1_possessive_s_plurar',
    'I-a1_present_continuous_act_rn',
    'I-a1_present_simple_3d_pers',
    'I-a1_present_simple_reg_act',
    'I-a1_special_questions',
    'I-a1_superlative_exept',
    'I-a1_superlative_long',
    'I-a1_superlative_short',
    'I-a1_there_is_am_are',
    'I-a1_there_was_were',
    'I-a1_there_will_be',
    'I-a1_to_be_future_will_be',
    'I-a1_to_be_past_was_were',
    'I-a1_to_be_present_is_am_are',
    'I-a1_want_would_like_to',
    'I-a1_be_have_do_in_the_past',
]


In [8]:
# Number of sentences in the training and validation splits.
tuple(len(split['sentences']) for split in (train_data, valid_data))

(490, 105)

In [65]:
# Build the tagger on top of `roberta-base`; training itself is left commented out below.
model = NERDA(
    model_name = 'roberta-base',
    tag_scheme = tag_scheme,
    dataset_training = train_data,
    dataset_validation = valid_data,
    train_batch_size = 4,
    dropout = 0.2,
    # grad_clip=100,
    optimizer_class = AdamW,
)
# RAdam, Lookahead, QHAdamW, AdamP
# model.experiment(epochs=10)

In [26]:
# Scratch check: unsqueeze(0) prepends a dimension, turning the 1-element vector into a 1x1 tensor.
torch.tensor([1]).unsqueeze(0)

tensor([[1]])

In [67]:
# NOTE: this cell was executed as In [67], i.e. after the checkpoint-loading cell
# (In [66]) that appears further down; run that cell first on a fresh kernel.
model.evaluate_performance(dataset = test_data)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Level,F1-Score
0,B-a1_be_have_do_in_the_past,0.0
1,B-a1_can,0.933333
2,B-a1_comparative_exept,0.6
3,B-a1_comparative_long,0.833333
4,B-a1_comparative_short,0.75
5,B-a1_future_simple,0.666667
6,B-a1_have_has_got,1.0
7,B-a1_past_simple_irreg,0.708333
8,B-a1_past_simple_reg,0.727273
9,B-a1_possesive_s_sing,0.709677


In [66]:
# torch.save(model.network.state_dict(), "./my_nerda/roberta-base-0.764-loss-0.266.pt")
# `map_location` lets a checkpoint saved on GPU load on a CPU-only runtime as well.
model.network.load_state_dict(
    torch.load("./my_nerda/roberta-base-0.77-loss-0.278.pt", map_location=model.device)
)

<All keys matched successfully>

In [31]:
# torch.save(model.network.state_dict(), "./my_nerda/roberta-base-0.77-loss-0.278.pt")

In [68]:
sentences_orig = [
         'DeepSpeed is a deep learning library on top of PyTorch that makes training models at extreme-scale efficient and easy for everyone.', 
         'DeepSpeed offers powerful training features for data scientists training on massive supercomputers as well as those training on low-end clusters or even on a single GPU.',
         'There are other ways that I am using self - care to tend to my mental health.',
         "What's more, essential workers are risking their lives so that we can have our necessities.",
         'While all of this is happening, the luxury travel market is growing and becoming even more exclusive than it already was.',
         "Let's say you live in iowa and you have a bang coming on once a month.",
          "The thing is, I might even like what he's saying and maybe even agree with him.",
         "Sometimes his mouth moves like he's talking to someone, but there's no one else there.",
         "He's just sitting there outside her house in normal civilian clothes, all alone.",
         "He doesn't know where he's going; he just goes.",
         "He's still smiling, looking at Sean with kind, knowing eyes.",
         "He follows Joey into his apartment, past a living area where two other guys are watching TV.",
         "Sean accidentally shoulders the elevator wall as he's walking out, but doesn't acknowledge it.",
         "He feels comfortable giving away his address, whereas Sean would never dare give him his.",
         "It's the same address from the police files, just two blocks away.",
         "He's smiling in the picture, even though you're not supposed to do that for ID photos.",
         'When her friends asked her about it, she told them that she was having trouble with some "horse syndicate" people.',
         "In a letter addressed to her husband, Renee wrote that she was unhappy in her marriage and was contemplating getting a divorce.",
         "Terry was coming toward me, a huge grin plastered on his face - as he pumped his legs on his unicycle.",
         "We met at Piedmont Park, where Terry had suggested we could ride bikes.",
         "What I had feared ended up coming true - my hair smelled like Terry's discount ham for weeks.",
         "Perseverance has a large amount of data in its memory banks which it is gradually offloading to Earth.",
         "Nasa is promising more in the next few days.",
         "It shows the robot heading down to the ground on Thursday to make its landing.",
         "It was acquired by the rocket cradle that placed the vehicle on the surface.",
         "Perseverance has been put in a near-equatorial Martian crater known as Jezero where it will search for signs of past microbial life.",
         "You can see the dust kicked up by the engines.",
         "We're probably about 2m or so above the surface of Mars.",
         "And then the curly electrical umbilical that is taking all of the electrical signals from the descent stage down to the computer inside the belly of the rover, the ones and zeros that represent this image.",
         "Engineers report Perseverance to be in good health, as they gradually commission its systems.",
         "Even now, with just this limited first release of pictures, there were fascinating rocks to discuss, she told reporters.",
         "The $2.7bn (£1.9bn) robot is the fifth rover to be put on Mars by Nasa.",
         "As well as searching for signs of life, Perseverance's other key objective is to select and package rock samples that can be brought back to Earth laboratories by later missions.",
         ]
def _spacy_words(text):
    """Split ``text`` into a list of token strings with the loaded spaCy pipeline."""
    return [str(token) for token in nlp(str(text))]


# The model expects pre-split words, so tokenize every raw sentence first.
sentences = list(map(_spacy_words, sentences_orig))
predicts = model.predict(sentences)
# sentences

In [49]:
def get_entity_from_sent(sent, label, min_len=2):
    """Group per-token tags into character spans for displaCy's manual mode.

    Consecutive tokens whose tags share the same type (the tag with its first two
    characters, e.g. ``B-`` / ``I-``, removed) form one entity. Tags whose type is
    empty, such as ``O``, never form an entity.

    Args:
        sent: iterable of tokens; each is converted with ``str``.
        label: sequence of tag strings aligned with ``sent``.
        min_len: minimum number of tokens an entity must span to be reported.
            Defaults to 2, which matches the previous ``ent_len > 1`` filter.

    Returns:
        dict with ``'text'`` (tokens joined by single spaces), ``'ents'`` (list of
        ``{'start', 'end', 'label'}`` dicts holding character offsets into ``'text'``)
        and ``'title'`` (always ``None``).
    """
    tokens = [str(token) for token in sent]
    full_sent = " ".join(tokens)
    result = {
        'text': full_sent,
        'ents': [],
        'title': None
    }

    # Character offset of every token inside `full_sent`. Offsets are derived from
    # token positions instead of `str.index`, which returned the FIRST occurrence
    # of the entity text and mislabelled repeated phrases.
    token_starts = []
    cursor = 0
    for token in tokens:
        token_starts.append(cursor)
        cursor += len(token) + 1  # +1 for the joining space

    # Never look past the shorter of the two sequences; empty input yields no entities.
    n_tokens = min(len(tokens), len(label))
    ent_types = [tag[2:] for tag in label[:n_tokens]]

    run_start = 0
    # Iterating one step past the end closes a run that reaches the last token;
    # such entities used to be dropped.
    for i in range(1, n_tokens + 1):
        if i < n_tokens and ent_types[i] == ent_types[run_start]:
            continue
        ent_type = ent_types[run_start]
        if ent_type != '' and i - run_start >= min_len:
            last = i - 1
            result['ents'].append(
                {'start': token_starts[run_start],
                 'end': token_starts[last] + len(tokens[last]),
                 'label': ent_type}
            )
        run_start = i
    return result

In [69]:
# Render every sentence with its predicted entities highlighted inline.
for sent, pred in zip(sentences, predicts):
    entities = get_entity_from_sent(sent, pred)
    html = displacy.render(
        entities,
        style="ent",
        manual=True,
        jupyter=True,
    )