In [1]:
import os
# Runtime detection: a defined HOSTNAME environment variable is treated as the
# sign of a hosted Colab runtime; a missing one means a local runtime.
# NOTE(review): HOSTNAME is also defined on many Linux hosts and containers, so
# this heuristic may report "Colab" on a local Linux machine - confirm.
isColab = True
try:
  # running on CoLAB Hosted runtime
  print(os.environ['HOSTNAME'])
  print('Running on CoLAB Hosted runtime.')
except KeyError:
  # HOSTNAME is not defined -> running on Local runtime.
  # Only the missing-key case is handled so unrelated failures are not hidden.
  print('You are running this script on your local machine!')
  isColab = False

You are running this script on your local machine!


In [2]:
'''Installing/Upgrading the Libraries Not Available to CoLAB'''
# The installs only run when isColab is True; nothing is installed on a local runtime.
if isColab:
  # spaCy is the only package pinned to an exact version in this cell.
  !pip install -U 'spacy'==3.6.1
  # # Install this if you want to run on Colab GPU
  # !pip install -U spacy[cuda-autodetect]
  !pip install -U rdflib
  !pip install -U spaczz
  # nervaluate provides the Evaluator class imported in the imports cell.
  !pip install nervaluate
  # print-dict provides the print_dict module imported in the imports cell.
  !pip install -U print-dict
  !pip install -U wandb

In [3]:
'''Downloading Model lg = larger - with word vectors'''
# Only runs when isColab is True. Two pipelines are downloaded:
# en_core_web_lg and en_core_web_trf.
if isColab:
  !python -m spacy download en_core_web_lg
  !python -m spacy download en_core_web_trf

In [4]:
# General Libraries
import pandas as pd
import srsly
import json
import os
import re
import csv
import time
from nervaluate import Evaluator
import warnings
import wandb
import shutil

# Spacy Related Imports
import spacy
from spacy.util import minibatch, compounding, compile_infix_regex, get_words_and_spaces
from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example
from print_dict import pd as pdic

# RDFLib libraries
from rdflib import Graph
import pprint
from rdflib import RDFS
from rdflib import URIRef
from rdflib.namespace import RDF

In [5]:
# wandb.login()

# Data Conversion Part
#### From Doccano-annotated data (NER and RE) to the spaCy NER format

In [6]:
def create_out_dir(OUTPUT_DIR):
  '''
  Recreate OUTPUT_DIR as an empty directory.

  When the path already exists it is deleted together with everything inside
  it; afterwards a fresh directory (including any missing parents) is created.
  This keeps the outputs of an earlier run from mixing with the current one.
  '''
  left_over_from_previous_run = os.path.exists(OUTPUT_DIR)
  if left_over_from_previous_run:
    # wipe whatever an earlier run stored here
    shutil.rmtree(OUTPUT_DIR)

  os.makedirs(OUTPUT_DIR)

In [7]:
# Defining some of the Directories... Please also check the Main function at the bottom
# Root of all working folders: "/content/" on Colab, otherwise the empty string
# (i.e. paths are relative to the current working directory).
COLAB_DIR = ""
if isColab:
  COLAB_DIR = "/content/"

data_path = COLAB_DIR+"dataset"
# Create the working folders under COLAB_DIR rather than always under /content,
# so the layout is also created when running on a local runtime.
# exist_ok=True keeps the cell re-runnable and also creates a single missing
# folder (e.g. `config`) when `dataset` itself is already present.
# Note: nothing is downloaded here - the data and config files still have to be
# placed into these folders.
for _folder in ("dataset/train", "dataset/val", "dataset/test", "config"):
  os.makedirs(COLAB_DIR + _folder, exist_ok=True)

In [8]:
# Input/output locations, all prefixed with COLAB_DIR
# ("/content/" on Colab, an empty prefix - i.e. relative paths - otherwise).
TRAIN_DIR = COLAB_DIR+'dataset/train'
VALID_DIR = COLAB_DIR+'dataset/val'
TEST_DIR = COLAB_DIR+'dataset/test'
# Holds label_config_Entity.json, which is read when ENTITY_LABELS is built.
CONFIG_DIR = COLAB_DIR+'config'
# The dataset root doubles as output folder: the generated .spacy files, the
# prediction dumps and the benchmark spreadsheets are written here.
OUTPUT_DIR = COLAB_DIR+'dataset'

In [9]:
""" This is just to SHOW the Named Entities - No Real Purpose """
# Entity label names, taken from the 'text' field of every entry in the label
# configuration file. Besides being printed for inspection, the list is reused
# later for the displaCy colour map and for the per-entity evaluation.
ENTITY_LABELS = [entry['text'] for entry in srsly.read_json(CONFIG_DIR + '/label_config_Entity.json')]
print(ENTITY_LABELS)

['NAME', 'LOCATION', 'COLLEGE NAME', 'SKILLS', 'LANGUAGE', 'WORKED AS', 'YEARS OF EXPERIENCE', 'DEGREE', 'CERTIFICATION', 'UNIVERSITY', 'COMPANIES WORKED AT', 'AWARDS']


In [10]:
def trim_entity_spans(text, spans):
  '''
  Data Cleaning: Removes leading and trailing white spaces from entity spans.

  Args:
    text: the document text the character offsets refer to.
    spans: iterable of (start, end, label) triples.

  Returns:
    A list of (start, end, label) tuples whose offsets were moved inwards past
    any whitespace. Offsets lying outside the text are clamped to it first.
    Spans that end up empty (whitespace only, or start >= end) are dropped.
  '''
  invalid_span_tokens = re.compile(r'\s')
  text_len = len(text)

  valid_spans = []
  for start, end, label in spans:
    # Clamp to the text: an annotation running past the end of the document
    # would otherwise raise IndexError, and a negative start would silently
    # index from the end of the string.
    valid_start = max(start, 0)
    valid_end = min(end, text_len)

    # Both scans stop as soon as the two offsets meet, so a whitespace-only
    # span collapses to an empty one and is filtered out below.
    while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
      valid_start += 1
    while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
      valid_end -= 1

    if valid_start < valid_end:
      valid_spans.append((valid_start, valid_end, label))

  return valid_spans

In [11]:
def docanno_to_spacy_ner_db(DATA_DIR):
  '''
  This function takes a directory of docanno annotated datasets for NER/RE
  and converts them into SpaCy DocBin Object which is Trainable via commandline

  Every file found under DATA_DIR (sub-folders included) is read as JSONL. Each
  line must provide a "text" entry and an "entities" list whose items start
  with (start_offset, end_offset, label).

  Entities that cannot be aligned to token boundaries are counted as erroneous
  and ignored. A document whose entities cannot be assigned (e.g. overlapping
  spans) is skipped. A file that cannot be read or parsed is reported and the
  remainder of that file is skipped.

  Returns:
    (doc, db): the most recently built Doc (None when no document was read)
    and the DocBin containing all successfully converted documents.
  '''
  # Blank English pipeline: only the tokenizer is needed here
  nlp = spacy.blank("en")

  Doc.set_extension("rel", default={}, force=True)

  word_count = 0
  no_files = 0
  no_doc = 0
  no_entities = 0
  error_cnt = 0

  # Defined up-front so the final `return` cannot hit an unbound name when
  # DATA_DIR contains no readable document.
  doc = None

  # the DocBin will store the example documents
  db = DocBin()

  for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
      file_path = os.path.join(dirname, filename)

      try:
        # Iterate through the JSONL file to fill the serializable DocBin
        for json_line in srsly.read_jsonl(file_path):

          # parsing the docanno JSON data (per-line)
          text = json_line["text"]
          new_spans = [(span[0], span[1], span[2]) for span in json_line["entities"]]

          # cleaning and validating the leading and trailing spaces from the annotated entities
          valid_spans = trim_entity_spans(text, new_spans)

          # Tokenize, then rebuild a clean Doc from the words and spacing
          tokens = nlp(text)
          spaces = [bool(tok.whitespace_) for tok in tokens]
          words = [tok.text for tok in tokens]
          doc = Doc(nlp.vocab, words=words, spaces=spaces)

          entities = []
          for start, end, label in valid_spans:
            # alignment modes are: strict, contract, and expand
            entity = doc.char_span(start, end, label=label, alignment_mode='contract')

            # Not considering the spans which are Erroneous
            if entity is None:
              error_cnt += 1
            else:
              no_entities += 1
              entities.append(entity)

          try:
            doc.ents = entities
            word_count += len(words)
          except ValueError as err:
            # spaCy rejects entity sets in which a token belongs to more than one span
            print()
            print("Cannot Assign Entities")
            print(f"  -> {file_path}: {err}")
            continue

          db.add(doc)
          no_doc += 1

      except Exception as err:
        # Best effort: a stray non-JSONL file must not abort the conversion, but
        # the file and the cause are reported. Unlike a bare `except:` this does
        # not swallow KeyboardInterrupt / SystemExit.
        print('Error While Loading JSON Data From Input Directory. Please check if you have other file type...')
        print(f'  -> {file_path}: {err!r}')

      no_files +=1
  print(f"- Files: {no_files} \n- Processed Documents: {no_doc} \n- Total Entities: {no_entities} \n- Erroneous Entities (Ignored): {error_cnt} \n- Total Words: {word_count}")

  return doc, db

In [13]:
'''Saving Spacy Trainable Object File for NER'''
# Each split folder is converted into a DocBin and serialized into OUTPUT_DIR.
# NOTE(review): the "disease_A-Z_*" file names do not match the resume-style
# entity labels used in this notebook - presumably carried over from an earlier
# project. The same names are used by the training and evaluation cells, so
# rename them together if at all.
# NOTE(review): the recorded output shows identical statistics for the training
# and validation splits - confirm dataset/val is not a copy of dataset/train.
print('Preparing Training Dataset from Structured Data:')
doc_train, db_train = docanno_to_spacy_ner_db(TRAIN_DIR)
db_train.to_disk(OUTPUT_DIR + "/disease_A-Z_train.spacy")

print('\nPreparing Validation Dataset from Ground Truth Validation:')
doc_valid, db_valid = docanno_to_spacy_ner_db(VALID_DIR)
db_valid.to_disk(OUTPUT_DIR + "/disease_A-Z_valid.spacy")

print('\nPreparing Test Dataset from Ground Truth Test:')
doc_test, db_test = docanno_to_spacy_ner_db(TEST_DIR)
db_test.to_disk(OUTPUT_DIR + "/disease_A-Z_test.spacy")

Preparing Training Dataset from Structured Data:
- Files: 12 
- Processed Documents: 3106 
- Total Entities: 3106 
- Erroneous Entities (Ignored): 0 
- Total Words: 8523

Preparing Validation Dataset from Ground Truth Validation:
- Files: 12 
- Processed Documents: 3106 
- Total Entities: 3106 
- Erroneous Entities (Ignored): 0 
- Total Words: 8523

Preparing Test Dataset from Ground Truth Test:
- Files: 1 
- Processed Documents: 20 
- Total Entities: 2140 
- Erroneous Entities (Ignored): 107 
- Total Words: 38459


# Spacy Model Training Part

In [16]:
# This is to debug the data in case they fail to be trained
# !python -m spacy debug data "/content/config/config.cfg"

In [None]:
# Creates the training configuration file from the given base configuration. You can configure it yourself on:
# https://spacy.io/usage/training#quickstart
# We are using a GPU based training setting for Accuracy (RoBERTa model)

#!python -m spacy init fill-config config/base_config.cfg config/config.cfg

In [44]:
'''
WARNING: Training Will Take Time -
Trains a SpaCy NER model on our training data - Please REMOVE
--gpu-id 0 if want to run this in CPU
'''
# This will remove the Models from previous run - comment it out if you want the model to retain
#create_out_dir(COLAB_DIR+"LM-SD_Model")

# NOTE(review): the training command below is commented out, so the elapsed time
# printed at the end only covers the gap between the two time.time() calls.
# Re-enable the command to actually train and to get a meaningful duration.
train_start_time = time.time()

#!python -m spacy train config/config.cfg --gpu-id 0 --output /content/LM-SD_Model --paths.train dataset/disease_A-Z_train.spacy --paths.dev dataset/disease_A-Z_valid.spacy

train_end_time = time.time()
print(f'Total Training Time = {train_end_time - train_start_time} (sec)')

2024-02-26 14:44:00.981121: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-26 14:44:00.981172: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-26 14:44:00.982410: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Saving to output directory: /content/LM-SD_Model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing Roberta

In [45]:
#!zip -r /content/LM-SD_Model/model-best.zip /content/LM-SD_Model/model-best

  adding: content/LM-SD_Model/model-best/ (stored 0%)
  adding: content/LM-SD_Model/model-best/tokenizer (deflated 81%)
  adding: content/LM-SD_Model/model-best/meta.json (deflated 67%)
  adding: content/LM-SD_Model/model-best/transformer/ (stored 0%)
  adding: content/LM-SD_Model/model-best/transformer/model (deflated 15%)
  adding: content/LM-SD_Model/model-best/transformer/cfg (stored 0%)
  adding: content/LM-SD_Model/model-best/ner/ (stored 0%)
  adding: content/LM-SD_Model/model-best/ner/moves (deflated 74%)
  adding: content/LM-SD_Model/model-best/ner/model (deflated 8%)
  adding: content/LM-SD_Model/model-best/ner/cfg (deflated 33%)
  adding: content/LM-SD_Model/model-best/config.cfg (deflated 61%)
  adding: content/LM-SD_Model/model-best/vocab/ (stored 0%)
  adding: content/LM-SD_Model/model-best/vocab/strings.json (deflated 70%)
  adding: content/LM-SD_Model/model-best/vocab/key2row (stored 0%)
  adding: content/LM-SD_Model/model-best/vocab/lookups.bin (stored 0%)
  adding: co

In [15]:
# loading the best model from the directory (saved during the training)
# Please download it from here: https://drive.google.com/file/d/1JlrIfJycQwQ3k9rLlOAIhYEWo9EIQExz/view?usp=sharing
# The path is prefixed with COLAB_DIR, so the "LM-SD_Model/model-best" folder
# has to be present there before this cell is run.

nlp_ner = spacy.load(COLAB_DIR+"LM-SD_Model/model-best")

In [16]:
# Assigns different colors to the Entities during visualization.
# Labels and colours are paired position by position; pairing stops at the
# shorter of the two lists, so surplus colours are simply left unused.
color_list = [
  'yellow', 'white', 'orange', '#008000', '#800000', '#0D9CB4',
  '#5813C7', '#0D350E', '#1AA436', '#1AE0F9', '#BADCA1', '#78A2E5',
  '#D845FB', '#54B69E', '#800080', '#FF00FF', '#000080',
]

colors = {label: shade for label, shade in zip(ENTITY_LABELS, color_list)}
# displaCy expects the colour map under the "colors" key of its options
options = dict(colors=colors)

In [17]:
# Inferencing and visualizing some sample text using the trained model
# NOTE(review): the sample looks like an excerpt of a real profile (personal
# name, employer) - confirm it may be kept in a shared notebook.
text_inf = "Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology"
doc_inf = nlp_ner(text_inf)

# Render the predicted entities inline, coloured according to `options`
spacy.displacy.render(doc_inf, style="ent", options=options, jupyter=True)

In [None]:
'''
Strict Evaluation of the model separately - Using Ground Truth Test Data
This Evaluation Score does not consider Partial Match
'''
# NOTE(review): the model and data paths below are relative to the current
# working directory and are not prefixed with COLAB_DIR, unlike the
# spacy.load() call that loads the same model elsewhere in this notebook.
# As noted for training, remove "--gpu-id 0" to run on CPU.
start_time = time.time()

!python -m spacy evaluate --gpu-id 0 LM-SD_Model/model-best/ dataset/disease_A-Z_test.spacy

end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')

# Evaluation using the Test/Validation Data [without cmd]
## TODO: Implement the Confusion Matrix (scikit-learn)

### Check the following Tutorial:
https://github.com/wjbmattingly/spacy_tutorials_3x/blob/main/02_02_formal_test.ipynb

In [19]:
def load_data(file_path: str, nlp):
  '''
  Read a SpaCy DocBin file and convert every stored document into a plain,
  JSON-compatible dictionary.

  Args:
    file_path: path of the serialized DocBin (.spacy) file.
    nlp: pipeline whose vocab is used to restore the documents.

  Returns:
    (samples, entities_count) where samples is a list of
    {"text": str, "entities": [(start_char, end_char, label), ...]} and
    entities_count is the total number of entities over all samples.
    A warning is issued for every document that has no entities.
  '''
  samples = []
  entities_count = 0
  stored_docs = DocBin().from_disk(file_path).get_docs(nlp.vocab)

  for doc in stored_docs:
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if not spans:
      warnings.warn("Sample without entities!")
    entities_count += len(spans)
    samples.append({"text": doc.text, "entities": spans})

  return samples, entities_count

In [20]:
# Load the validation and test splits back from the .spacy files written earlier;
# the vocab of the trained pipeline is used to restore the documents.
samples_val, entities_count_val = load_data(OUTPUT_DIR + "/disease_A-Z_valid.spacy", nlp_ner)
samples_test, entities_count_test = load_data(OUTPUT_DIR + "/disease_A-Z_test.spacy", nlp_ner)

In [21]:
# Looking into one particular sample from the ground truth of the test set
# (the first entry of samples_test)
ground = samples_test[0]
ground

{'text': "\xa0 \xa0,Contact,www.linkedin.com/in/bikram-,adhikari-61716b240  (LinkedIn) Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology,\xa0Information Technology \xa0·\xa0(December,2011\xa0-\xa0December 2015),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/bipin-,rai-0821a1201  (LinkedIn),Top Skills,JavaScript,Project Management,English Bipin Rai,Full Stack Developer | React JS | Node JS,Kathmandu,Experience,Scrypt Spider,Frontend Web Developer,July 2021\xa0-\xa0Present\xa0 (1 year 1 month),Kathmandu, Bāgmatī, Nepal,Education,Vedas College,Bachelor's,\xa0Computer Science \xa0·\xa0(2016\xa0-\xa02021),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/suresh-,tamang-6636b4134  (LinkedIn),Top Skills,React.js,React Native,laravel Suresh Tamang,React | React Native | Blockchain Enthusiast

In [22]:
# predicting the text of the above single sample with the model
pred = nlp_ner(ground['text'])

print(ground['text'])

# one line per predicted entity: the matched text and its label
print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

   ,Contact,www.linkedin.com/in/bikram-,adhikari-61716b240  (LinkedIn) Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology, Information Technology  · (December,2011 - December 2015),  Page 1 of 1,    ,Contact,www.linkedin.com/in/bipin-,rai-0821a1201  (LinkedIn),Top Skills,JavaScript,Project Management,English Bipin Rai,Full Stack Developer | React JS | Node JS,Kathmandu,Experience,Scrypt Spider,Frontend Web Developer,July 2021 - Present  (1 year 1 month),Kathmandu, Bāgmatī, Nepal,Education,Vedas College,Bachelor's, Computer Science  · (2016 - 2021),  Page 1 of 1,    ,Contact,www.linkedin.com/in/suresh-,tamang-6636b4134  (LinkedIn),Top Skills,React.js,React Native,laravel Suresh Tamang,React | React Native | Blockchain Enthusiast |,Nepal,Summary,I love to accept challenges and learn new technology tha

In [23]:
def evaluate(ner_model, samples):
  '''
  Score a trained spacy ner model against json formatted ground-truth data.

  Args:
    ner_model: the pipeline; it is called on the text of every sample.
    samples: iterable of {'text': ..., 'entities': ...} dictionaries.

  Returns:
    The scores computed by spaCy's Scorer over all samples.
  '''
  scorer = Scorer(ner_model)
  # one Example per sample: the model prediction paired with the gold entities
  examples = [
    Example.from_dict(ner_model(sample['text']), {'entities': sample['entities']})
    for sample in samples
  ]
  return scorer.score(examples)

In [24]:
# results = evaluate(nlp_ner, samples_val, )

In [25]:
# from print_dict import pd as pdic
# pdic(results)

# SemEval Evaluation Scripts

In [26]:
def list_to_spacy_ner_doc(ner_pred, nlp=None):
  '''
  Convert ONE NER sample of the form
  {'text': '...', 'entities': [(start, end, tag), ...]} into a SpaCy Doc object.

  Args:
    ner_pred: dictionary with the text and its character-offset entities.
    nlp: optional pipeline used only for tokenization. When omitted, a blank
      English pipeline is created (as before); passing one avoids rebuilding
      it when many samples are converted.

  Returns:
    The Doc built from the text. Entities that do not align with token
    boundaries are dropped. When the remaining entities cannot be assigned
    (e.g. overlapping spans) a message is printed and the Doc is returned
    without entities.
  '''
  if nlp is None:
    # blank pipeline: tokenizer with just the English vocab
    nlp = spacy.blank("en")

  Doc.set_extension("rel", default={}, force=True)

  text = ner_pred["text"]
  spans = ner_pred["entities"]

  # Tokenize only (no other components), then rebuild a clean Doc from the
  # words and spacing
  tokens = nlp.make_doc(text)
  spaces = [bool(tok.whitespace_) for tok in tokens]
  words = [tok.text for tok in tokens]
  doc = Doc(nlp.vocab, words=words, spaces=spaces)

  entities = []
  for start, end, label in spans:
    # alignment modes are: strict, contract, and expand
    entity = doc.char_span(start, end, label=label, alignment_mode='contract')

    # Not considering the spans which are Erroneous
    if entity is not None:
      entities.append(entity)

  try:
    doc.ents = entities
  except ValueError as err:
    # spaCy rejects entity sets in which a token belongs to more than one span
    print("=>> Error")
    print(err)
    print(text)

  return doc

In [27]:
def render_sample_pred(ner_doc):
  '''Show the entities of a SpaCy Doc inline in the notebook via displaCy, coloured by the notebook-level `options`.'''
  render_settings = {"style": "ent", "options": options, "jupyter": True}
  spacy.displacy.render(ner_doc, **render_settings)

In [28]:
# Looking into one particular sample from the ground truth of the test set
# (same sample as inspected earlier; `ground` and `pred` are simply recomputed)
ground = samples_test[0]
print(ground)
print()

pred = nlp_ner(ground['text'])
print(pred)

# one line per predicted entity: the matched text and its label
print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

{'text': "\xa0 \xa0,Contact,www.linkedin.com/in/bikram-,adhikari-61716b240  (LinkedIn) Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology,\xa0Information Technology \xa0·\xa0(December,2011\xa0-\xa0December 2015),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/bipin-,rai-0821a1201  (LinkedIn),Top Skills,JavaScript,Project Management,English Bipin Rai,Full Stack Developer | React JS | Node JS,Kathmandu,Experience,Scrypt Spider,Frontend Web Developer,July 2021\xa0-\xa0Present\xa0 (1 year 1 month),Kathmandu, Bāgmatī, Nepal,Education,Vedas College,Bachelor's,\xa0Computer Science \xa0·\xa0(2016\xa0-\xa02021),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/suresh-,tamang-6636b4134  (LinkedIn),Top Skills,React.js,React Native,laravel Suresh Tamang,React | React Native | Blockchain Enthusiast

   ,Contact,www.linkedin.com/in/bikram-,adhikari-61716b240  (LinkedIn) Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology, Information Technology  · (December,2011 - December 2015),  Page 1 of 1,    ,Contact,www.linkedin.com/in/bipin-,rai-0821a1201  (LinkedIn),Top Skills,JavaScript,Project Management,English Bipin Rai,Full Stack Developer | React JS | Node JS,Kathmandu,Experience,Scrypt Spider,Frontend Web Developer,July 2021 - Present  (1 year 1 month),Kathmandu, Bāgmatī, Nepal,Education,Vedas College,Bachelor's, Computer Science  · (2016 - 2021),  Page 1 of 1,    ,Contact,www.linkedin.com/in/suresh-,tamang-6636b4134  (LinkedIn),Top Skills,React.js,React Native,laravel Suresh Tamang,React | React Native | Blockchain Enthusiast |,Nepal,Summary,I love to accept challenges and learn new technology tha

In [29]:
# Visualizing the NER Predictions against the Ground Truth 'samples'
# `pred` is already a Doc; the ground-truth dictionary is converted to a Doc
# first so both can be rendered the same way.
print('\n########### Prediction ###########\n')
render_sample_pred(pred)
print('\n########### Ground Truth ###########\n')
render_sample_pred(list_to_spacy_ner_doc(ground))


########### Prediction ###########




########### Ground Truth ###########



In [30]:
def spacy_evaluate(ner_predictions, samples, show_res=False):
  '''
  Spacy Evaluation Function - Not going to use it.

  Scores already-computed predictions against the ground truth with spaCy's Scorer.

  Args:
    ner_predictions: predictions as {'text': ..., 'entities': ...} dictionaries.
    samples: ground-truth dictionaries of the same form, in the same order.
    show_res: when True, each prediction and its ground truth are rendered.

  Returns:
    The scores computed by the Scorer over all prediction/ground-truth pairs.
  '''
  scorer = Scorer()
  examples = []

  for prediction, truth in zip(ner_predictions, samples):
    predicted_doc = list_to_spacy_ner_doc(prediction)

    if show_res:
      print('\n########### Prediction ###########\n')
      render_sample_pred(predicted_doc)
      print('\n########### Ground Truth ###########\n')
      render_sample_pred(list_to_spacy_ner_doc(truth))

    gold = {'entities': truth['entities']}
    examples.append(Example.from_dict(predicted_doc, gold))

  return scorer.score(examples)

In [31]:
def save_predictions(ner_predictions, filename, semeval_format=True, docanno_format=True):
  '''
  Write predictions (or ground truth) to OUTPUT_DIR/filename and return the
  SemEval-style entity lists.

  Args:
    ner_predictions: the items to store. With semeval_format and docanno_format
      both True each item is a dictionary with an 'entities' list of
      (start, end, label); otherwise each item is a SpaCy Doc.
    filename: name of the file created inside OUTPUT_DIR.
    semeval_format: True -> the file holds one JSON array with, per item, a list
      of {"label", "start", "end"} dictionaries (input of the SemEval 2013 - 9.1
      style evaluation). False -> one {"text", "entities"} JSON object per line
      (regular spacy format).
    docanno_format: only used when semeval_format is True; selects dictionary
      items (True) or Doc items (False).

  Returns:
    The list of per-item entity lists in SemEval format; it stays empty when
    semeval_format is False.
  '''
  semeval_ent = []
  target_path = OUTPUT_DIR+'/'+filename

  with open(target_path, 'w', encoding='UTF-8') as json_file:
    for pred in ner_predictions:
      if not semeval_format:
        # regular spacy format, written immediately as one JSON object per line:
        # {"text": "", "entities": [[36, 40, "Complication_E"], ...]}
        rows = [[ent.start_char, ent.end_char, ent.label_] for ent in pred.ents]
        line = {"text": pred.text, "entities": rows}
        json_file.write(json.dumps(line, ensure_ascii=False))
        json_file.write('\n')
        continue

      # SemEval format: [{"label": "PER", "start": 2, "end": 4}, ... ]
      if docanno_format:
        # dictionary items carrying (start, end, label) triples
        converted = [{"label": ent[2], "start": ent[0], "end": ent[1]} for ent in pred['entities']]
      else:
        # Spacy Doc items
        converted = [{"label": ent.label_, "start": ent.start_char, "end": ent.end_char} for ent in pred.ents]
      semeval_ent.append(converted)

    if semeval_format:
      # the whole collection is dumped once, as a single JSON array
      json_file.write(json.dumps(semeval_ent, ensure_ascii=False))

  return semeval_ent

In [32]:
def preprocess_results(results_by_tag):
    """
    Build one rounded DataFrame per entity label from the per-tag results.

    For every label in ENTITY_LABELS except 'Code_E', the entry
    results_by_tag[label] is turned into a DataFrame, rounded to two decimals
    and given a leading 'Entity' column holding the label.

    Returns:
        The list of DataFrames, in ENTITY_LABELS order.
    """
    frames = []
    for entity in ENTITY_LABELS:
        if entity == 'Code_E':
            continue
        frame = pd.DataFrame(results_by_tag[entity]).round(decimals=2)
        # the label goes first so the frames stay identifiable once concatenated
        frame.insert(0, 'Entity', entity)
        frames.append(frame)
    return frames

In [33]:
def semeval_evaluation(true, pred):
    """
    Evaluate predictions against the ground truth with nervaluate and store the
    result tables as Excel files in OUTPUT_DIR.

    Args:
        true: ground-truth entities, one list of {"label", "start", "end"}
            dictionaries per document.
        pred: predicted entities in the same format and order.

    Returns:
        (results, results_by_entity): the overall metrics as a DataFrame and
        the concatenated per-entity DataFrame. Both are also written to
        'overall_benchmark.xlsx' and 'entity_benchmark.xlsx'.
    """
    evaluator = Evaluator(true, pred, tags=ENTITY_LABELS)
    # Take the first two elements by position instead of unpacking into exactly
    # two names: nervaluate is installed unpinned, and releases whose
    # evaluate() returns additional elements after (results, results_by_tag)
    # would otherwise fail with "too many values to unpack".
    # NOTE(review): confirm against the installed nervaluate version.
    outcome = evaluator.evaluate()
    results, results_by_tag = outcome[0], outcome[1]

    results = pd.DataFrame(results)
    results.to_excel(OUTPUT_DIR+'/'+'overall_benchmark.xlsx')

    results_by_entity = pd.concat(preprocess_results(results_by_tag))
    results_by_entity.to_excel(OUTPUT_DIR+'/'+'entity_benchmark.xlsx')

    return results, results_by_entity

### Validation Evaluation

In [None]:
# Timer for the whole validation run; it is read again once the validation
# evaluation has finished.
start_time = time.time()
# Run the trained pipeline over every validation text and keep the resulting Docs
ner_predictions_val = [nlp_ner(item['text']) for item in samples_val]

In [None]:
print(ner_predictions_val[0])

In [None]:
# Saving the ground truth and the predictions as JSON files for later evaluation.
# The ground truth consists of dictionaries (default docanno_format=True), the
# predictions are SpaCy Docs (docanno_format=False); both are stored in SemEval format.
semeval_ground_val = save_predictions(samples_val, filename= 'ground_val_semEval.jsonl')
semeval_pred_val = save_predictions(ner_predictions_val, filename='predition_val_semEval.jsonl', semeval_format=True, docanno_format=False)

In [None]:
# Validation evaluation following SemEval 2013 metrics
# The reported time spans everything since start_time was set in the validation
# prediction cell, i.e. prediction, saving and evaluation together.
results, results_by_entity = semeval_evaluation(true=semeval_ground_val, pred=semeval_pred_val)
end_time = time.time()
print(f'\nTotal Validation Time = {end_time - start_time} (sec)')

### Test Evaluation

In [34]:
# Timer for the whole test run; it is read again once the test evaluation has
# finished.
start_time = time.time()
# Run the trained pipeline over every test text and keep the resulting Docs
ner_predictions_test = [nlp_ner(item['text']) for item in samples_test]

In [35]:
# Saving the ground truth and the predictions as JSON files for later evaluation.
# The ground truth consists of dictionaries (default docanno_format=True), the
# predictions are SpaCy Docs (docanno_format=False); both are stored in SemEval format.
semeval_ground_test = save_predictions(samples_test, filename= 'ground_test.jsonl')
semeval_pred_test = save_predictions(ner_predictions_test, filename='predition_test.jsonl', semeval_format=True, docanno_format=False)

In [36]:
# Test evaluation following SemEval 2013 metrics
# `results` and `results_by_entity` are overwritten here, so any earlier
# validation results are replaced by the test results.
# The reported time spans everything since start_time was set in the test
# prediction cell, i.e. prediction, saving and evaluation together.
results, results_by_entity = semeval_evaluation(true=semeval_ground_test, pred=semeval_pred_test)
end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')


Total Test Time = 77.91499400138855 (sec)


In [37]:
# Reports the overall metrics of the most recent semeval_evaluation() call;
# only the 'partial' entry of the results is printed here.
print('\n########### Overall Results ###########\n')
print(f"Precision: {results['partial']['precision']}\nRecall: {results['partial']['recall']}\nF1: {results['partial']['f1']}\n")


########### Overall Results ###########

Precision: 0.2555023923444976
Recall: 0.1245916938870742
F1: 0.16750313676286074

