<a href="https://colab.research.google.com/github/dtim-upc/THOR/blob/main/LM-Human.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# General Libraries
import pandas as pd
import srsly
import json
import os
import re
import csv
import time
from nervaluate import Evaluator
import warnings
import wandb

# Spacy Related Imports
import spacy
from spacy.util import minibatch, compounding, compile_infix_regex, get_words_and_spaces
from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example
from print_dict import pd as pdic

# RDFLib libraries
from rdflib import Graph
import pprint
from rdflib import RDFS
from rdflib import URIRef
from rdflib.namespace import RDF

In [None]:
# wandb.login()

# Data Conversion Part
#### From Doccano-Annotated NER and RE Data to the spaCy NER Format

In [None]:
# Input directories holding the annotated JSONL files of each data split.
TRAIN_DIR = 'dataset/train'
VALID_DIR = 'dataset/val'
TEST_DIR = 'dataset/test'
# Directory of the label configuration file (label_config_Entity.json).
CONFIG_DIR = 'config'
# Directory that receives the generated .spacy files, prediction dumps and benchmark sheets.
OUTPUT_DIR = 'dataset'
# Path of the Turtle schema file (not referenced by the code visible in this notebook).
SCHEMA_FILE = "dataset/schema/Disease_Schema_Extended.ttl"
# STRUCTURED_DATA_DIR = "dataset/csv"

In [None]:
""" This is just to SHOW the Named Entities - No Real Purpose """
# NOTE: besides being printed here, ENTITY_LABELS is read further down by
# preprocess_results() and semeval_evaluation().
ENTITY_LABELS = [
  entry['text']
  for entry in srsly.read_json(CONFIG_DIR + '/label_config_Entity.json')
]
print(ENTITY_LABELS)

['Disease_E', 'Anatomy_E', 'Cause_E', 'Code_E', 'Diagnosis_E', 'Precaution_E', 'Riskfactor_E', 'Symptom_E', 'Medicine_E', 'Composition_E', 'Complication_E', 'Surgery_E']


In [None]:
def trim_entity_spans(text, spans):
  '''Data cleaning: strip leading and trailing whitespace from entity spans.

  Parameters:
  text  = the document text the character offsets refer to
  spans = iterable of (start, end, label) tuples, `end` being exclusive

  Returns:
  A list of (start, end, label) tuples whose boundaries no longer sit on
  whitespace. Spans that are empty or whitespace-only after trimming are dropped.
  '''
  invalid_span_tokens = re.compile(r'\s')
  text_len = len(text)

  valid_spans = []
  for start, end, label in spans:
    valid_start = start
    valid_end = end

    # Move the start forward past whitespace, never beyond the span end or the text.
    while valid_start < min(valid_end, text_len) and invalid_span_tokens.match(text[valid_start]):
      valid_start += 1

    # Move the end backward past whitespace. `valid_end == text_len` is a legal
    # exclusive offset and is trimmed as well; offsets pointing past the text are
    # left untouched so that the caller can still reject them.
    while valid_start < valid_end <= text_len and invalid_span_tokens.match(text[valid_end - 1]):
      valid_end -= 1

    if valid_start < valid_end:
      valid_spans.append((valid_start, valid_end, label))

  return valid_spans

In [None]:
# def read_json(DATA_DIR):
# '''This function solves the issues with JSON Dumping non-ASCII characters'''
#     i=1
#     for json_line in srsly.read_jsonl(DATA_DIR+'/predition_spacy.json'):

#         with open(DATA_DIR+f'/doc-{i}.json', 'w', encoding='utf8') as json_file:
#             json.dump(json_line, json_file, ensure_ascii=False)
#         i+=1

# read_json(TRAIN_DIR)

In [None]:
def map_to_spacy_ner_db(DATA_DIR, is_spacy=False):
    """
    Convert a directory of Doccano- or spaCy-style annotated JSONL datasets for NER/RE
    into a spaCy DocBin object which is trainable via the command line.

    Parameters:
    DATA_DIR = string containing the directory of the JSONL files (walked recursively)
    is_spacy = 'True' if the "entities" of every record are already (start, end, label)
               triples; otherwise every entity is read as a dictionary with the keys
               "start_offset", "end_offset" and "label"

    Returns:
    A DocBin with one Doc per JSONL record whose entities could be assigned.
    A summary of the conversion is printed once, after all files have been read.
    """
    # Creates a blank pipeline (tokenizer only) with just the English vocab
    nlp = spacy.blank("en")

    Doc.set_extension("rel", default={}, force=True)

    total_files = 0
    word_count = 0
    no_doc = 0
    missing_doc = 0
    no_entities = 0
    error_cnt = 0

    # the DocBin will store the example documents
    db = DocBin()

    for dirname, _, filenames in os.walk(DATA_DIR):
        # accumulate over all walked directories so nested folders are counted too
        total_files += len(filenames)

        for filename in filenames:
            file_path = os.path.join(dirname, filename)

            # every line of the JSONL file becomes one Doc of the DocBin
            for json_line in srsly.read_jsonl(file_path):

                # parsing the annotated JSON data (per-line)
                text = json_line["text"]
                spans = json_line["entities"]

                # if the datasets are not in spaCy JSON format
                if not is_spacy:
                    spans = [
                        (span["start_offset"], span["end_offset"], span["label"])
                        for span in spans
                    ]

                # cleaning and validating the leading and trailing spaces from the annotated entities
                spans = trim_entity_spans(text, spans)

                # parsing tokens from the text and rebuilding a clean Doc from them
                tokens = nlp(text)
                spaces = [bool(tok.whitespace_) for tok in tokens]
                words = [t.text for t in tokens]
                doc = Doc(nlp.vocab, words=words, spaces=spaces)

                entities = []
                for start, end, label in spans:
                    # available alignment modes: strict, contract, and expand
                    entity = doc.char_span(
                        start, end, label=label, alignment_mode="contract")

                    # spans that cannot be aligned to tokens are counted and ignored
                    if entity is None:
                        error_cnt += 1
                    else:
                        no_entities += 1
                        entities.append(entity)

                try:
                    doc.ents = entities
                except ValueError:
                    # spaCy rejects conflicting (e.g. overlapping) entity spans;
                    # the whole document is skipped and counted as missed
                    missing_doc += 1
                    continue

                word_count += len(words)
                db.add(doc)
                no_doc += 1

    print(f"- Total Files: {total_files} \n- Processed Documents: {no_doc} \n- Missed Documents: {missing_doc} \n- Total Entities: {no_entities} \n- Erroneous Entities (Ignored): {error_cnt} \n- Total Words: {word_count}")

    return db


In [None]:
# Build the spaCy-trainable DocBin of every data split and save it for NER training
print('Preparing Training Dataset:')
# db_train = map_to_spacy_ner_db(TRAIN_DIR, is_spacy=True)
db_train = map_to_spacy_ner_db(TRAIN_DIR)
db_train.to_disk(f"{OUTPUT_DIR}/disease_A-Z_train.spacy")

print('\nPreparing Validation Dataset:')
db_valid = map_to_spacy_ner_db(VALID_DIR)
db_valid.to_disk(f"{OUTPUT_DIR}/disease_A-Z_valid.spacy")

print('\nPreparing Test Dataset:')
db_test = map_to_spacy_ner_db(TEST_DIR)
db_test.to_disk(f"{OUTPUT_DIR}/disease_A-Z_test.spacy")

Preparing Training Dataset:
- Total Files: 240 
- Processed Documents: 1438 
- Missed Documents: 0 
- Total Entities: 18539 
- Erroneous Entities (Ignored): 33 
- Total Words: 178882

Preparing Validation Dataset:
- Total Files: 61 
- Processed Documents: 366 
- Missed Documents: 0 
- Total Entities: 3989 
- Erroneous Entities (Ignored): 10 
- Total Words: 41284

Preparing Test Dataset:
- Total Files: 13 
- Processed Documents: 90 
- Missed Documents: 0 
- Total Entities: 2222 
- Erroneous Entities (Ignored): 6 
- Total Words: 19867


# Spacy Model Training Part

In [None]:
# Creates the full training configuration (config/config.cfg) by auto-filling the given
# base configuration (config/base_config.cfg). You can configure a base file yourself on:
# https://spacy.io/usage/training#quickstart
# We are using a GPU based training setting for Accuracy (RoBERTa model)

!python -m spacy init fill-config config/base_config.cfg config/config.cfg

[+] Auto-filled config with all values
[+] Saved config
config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
'''
WARNING: Training Will Take Time -
Trains a SpaCy NER model on our training data - Please REMOVE
--gpu-id 0 if want to run this in CPU
'''

train_start_time = time.time()

# NOTE: the training command below is commented out, so as written this cell only
# measures the time between the two time.time() calls. Uncomment it to actually train.
# !python -m spacy train config/config.cfg --gpu-id 0 --output LM-Human_Model --paths.train dataset/disease_A-Z_train.spacy --paths.dev dataset/disease_A-Z_valid.spacy

train_end_time = time.time()
print(f'Total Training Time = {train_end_time - train_start_time} (sec)')

[i] Saving to output directory: LM-Human_Model
[i] Using GPU: 0
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['transformer', 'ner']
[i] Initial learn rate: 0.0
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0         934.47   1386.49    0.62    0.34    3.31    0.01
  2     200      415848.76  111215.95   22.81   30.47   18.23    0.23
  4     400      449245.87  61863.56   53.76   52.17   55.45    0.54
  6     600       31524.74  34055.74   61.54   62.54   60.57    0.62
  7     800       23000.05  27059.91   60.48   54.53   67.89    0.60
  9    1000       22391.16  22520.11   64.32   63.67   64.98    0.64
 11    1200       18980.25  17460.48   62.58   59.29   66.26    0.63
 13    1400       11982.32  13195.33   62.66   59.02   66.78    0.63
 15    1600       10726.46  10369.00   64.66   69.19   60.69    0.65
 17    1800       10618.04   8937.11   60.50   55.08   67.08    0.60
 19    

[2023-12-10 02:52:55,273] [INFO] Set up nlp object from config
[2023-12-10 02:52:55,943] [INFO] Pipeline: ['transformer', 'ner']
[2023-12-10 02:52:55,945] [INFO] Created vocabulary
[2023-12-10 02:52:55,947] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2023-12-10 02:53:06,537] [INFO] Initialized pipel

In [None]:
# Loading the best model from the training output directory (saved during the training).
# If you did not train it yourself, please download it from here:
# https://drive.google.com/file/d/1JlrIfJycQwQ3k9rLlOAIhYEWo9EIQExz/view?usp=sharing
# NOTE(review): the download presumably has to be placed at LM-Human_Model/model-best — confirm.

nlp_ner = spacy.load("LM-Human_Model/model-best")

In [None]:
# Run the trained pipeline on a sample text and visualize the recognized entities
text_inf = "Tuberculosis generally damages the lungs, but it can also impair other parts of the body such as brain and spine. Typical signs of active Tuberculosis include chronic cough with blood-containing mucus, fever, night sweats, and weight loss. Tuberculosis damages the lungs whereas Malaria could detriment both kidneys by impairing the liver."

doc_inf = nlp_ner(text_inf)

# Highlight color per entity label, handed to displaCy through `options`
colors = dict(
    Disease_E='yellow',
    Anatomy_E='silver',
    Cause_E='#0D9CB4',
    Code_E='#5813C7',
    Diagnosis_E='#0D350E',
    Precaution_E='#1AA436',
    Riskfactor_E='#1AE0F9',
    Symptom_E='orange',
    Medicine_E='#BADCA1',
    Composition_E='#78A2E5',
    Complication_E='#D845FB',
    Surgery_E='#54B69E',
)
options = dict(colors=colors)

spacy.displacy.render(doc_inf, style="ent", options=options, jupyter=True)

In [None]:
# Run the trained pipeline on a second sample text and visualize the recognized entities
text_inf = "Chagas (CHAH-gus) disease is an inflammatory, infectious disease caused by the parasite Trypanosoma cruzi. This parasite is found in the feces of the triatomine (reduviid) bug. This bug is also known as the 'kissing bug'. Chagas disease is common in South America, Central America and Mexico, the primary home of the triatomine bug. Rare cases of Chagas disease have also been found in the southern United States."

doc_inf = nlp_ner(text_inf)

# Highlight color per entity label, handed to displaCy through `options`
colors = dict(
    Disease_E='yellow',
    Anatomy_E='silver',
    Cause_E='#0D9CB4',
    Code_E='#5813C7',
    Diagnosis_E='#0D350E',
    Precaution_E='#1AA436',
    Riskfactor_E='#1AE0F9',
    Symptom_E='orange',
    Medicine_E='#BADCA1',
    Composition_E='#78A2E5',
    Complication_E='#D845FB',
    Surgery_E='#54B69E',
)
options = dict(colors=colors)

spacy.displacy.render(doc_inf, style="ent", options=options, jupyter=True)

In [None]:
'''
Strict Evaluation of the model separately - Using Ground Truth Validation Data
This Evaluation Score does not consider Partial Match
'''
# Wall-clock timing around the CLI evaluation of the best checkpoint on the
# validation DocBin; `--gpu-id 0` runs the evaluation on GPU 0.
start_time = time.time()

!python -m spacy evaluate --gpu-id 0 LM-Human_Model/model-best/ dataset/disease_A-Z_valid.spacy

end_time = time.time()
print(f'\nTotal Validation Time = {end_time - start_time} (sec)')

[i] Using GPU: 0
[1m

TOK     100.00
NER P   68.38 
NER R   60.94 
NER F   64.45 
SPEED   4175  

[1m

                     P       R       F
Disease_E        84.31   85.11   84.71
Anatomy_E        43.01   39.22   41.03
Complication_E   50.72   39.77   44.58
Diagnosis_E      71.63   57.46   63.77
Cause_E          52.24   44.90   48.30
Symptom_E        80.71   76.42   78.51
Medicine_E       66.54   50.14   57.19
Surgery_E        83.33   56.18   67.11
Riskfactor_E     40.54   26.95   32.37
Composition_E     0.00    0.00    0.00
Precaution_E     56.44   51.40   53.80


Total Validation Time = 17.20250177383423 (sec)


In [None]:
'''
Strict Evaluation of the model separately - Using Ground Truth Test Data
This Evaluation Score does not consider Partial Match
'''
# Wall-clock timing around the CLI evaluation of the best checkpoint on the
# test DocBin; `--gpu-id 0` runs the evaluation on GPU 0.
start_time = time.time()

!python -m spacy evaluate --gpu-id 0 LM-Human_Model/model-best/ dataset/disease_A-Z_test.spacy

end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')

[i] Using GPU: 0
[1m

TOK     100.00
NER P   63.08 
NER R   42.44 
NER F   50.74 
SPEED   2791  

[1m

                     P       R       F
Disease_E        78.59   78.78   78.68
Anatomy_E        51.32   31.71   39.20
Cause_E          33.96   38.30   36.00
Complication_E   51.59   16.93   25.49
Symptom_E        50.97   57.66   54.11
Riskfactor_E     64.13   43.38   51.75
Diagnosis_E      80.43   52.48   63.52
Surgery_E        71.88   54.12   61.74
Medicine_E       71.43   30.59   42.83
Precaution_E     42.17   48.61   45.16
Composition_E    40.00   18.46   25.26


Total Test Time = 14.14452862739563 (sec)


# Evaluation using the Test/Validation Data [without cmd]

### Check the following Tutorial:
https://github.com/wjbmattingly/spacy_tutorials_3x/blob/main/02_02_formal_test.ipynb

In [None]:
def load_data(file_path: str, nlp):
  '''Read a spaCy DocBin file and return its documents in a spaCy compatible JSON-like form.

  Every document becomes {"text": ..., "entities": [(start_char, end_char, label), ...]}.
  A warning is issued for each document that carries no entities.

  Returns the list of samples together with the total number of entities.
  '''
  samples = []
  entities_count = 0
  stored_docs = DocBin().from_disk(file_path).get_docs(nlp.vocab)
  for doc in stored_docs:
    found = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if not found:
      warnings.warn("Sample without entities!")
    entities_count += len(found)
    samples.append({"text": doc.text, "entities": found})
  return samples, entities_count

In [None]:
# Read the saved validation and test DocBin files back as lists of
# {"text", "entities"} samples plus their total entity counts.
samples_val, entities_count_val = load_data(OUTPUT_DIR + "/disease_A-Z_valid.spacy", nlp_ner)
samples_test, entities_count_test = load_data(OUTPUT_DIR + "/disease_A-Z_test.spacy", nlp_ner)

In [None]:
# Looking into one particular sample from the ground truth of the test set
ground = samples_test[0]
ground

{'text': "#Causes - Abdominal aortic aneurysm - BD50.4Z\nAneurysms can develop anywhere along the aorta, but most aortic aneurysms occur in the part of the aorta that's in the belly area (abdomen). Several things can play a role in the development of an abdominal aortic aneurysm, including: Hardening of the arteries (atherosclerosis). Atherosclerosis occurs when fat and other substances build up on the lining of a blood vessel. High blood pressure. High blood pressure can damage and weaken the aorta's walls. Blood vessel diseases. These are diseases that cause blood vessels to become inflamed. Infection in the aorta. Rarely, a bacterial or fungal infection might cause an abdominal aortic aneurysms. Trauma. For example, being injured in a car accident can cause an abdominal aortic aneurysms. ",
 'entities': [(10, 35, 'Disease_E'),
  (46, 55, 'Complication_E'),
  (87, 92, 'Anatomy_E'),
  (103, 119, 'Complication_E'),
  (133, 150, 'Anatomy_E'),
  (165, 185, 'Anatomy_E'),
  (243, 268, 'Dis

In [None]:
# Predicting the text of the above single sample with the model
pred = nlp_ner(ground['text'])

print(ground['text'])

# List every predicted entity phrase next to its predicted label
print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

#Causes - Abdominal aortic aneurysm - BD50.4Z
Aneurysms can develop anywhere along the aorta, but most aortic aneurysms occur in the part of the aorta that's in the belly area (abdomen). Several things can play a role in the development of an abdominal aortic aneurysm, including: Hardening of the arteries (atherosclerosis). Atherosclerosis occurs when fat and other substances build up on the lining of a blood vessel. High blood pressure. High blood pressure can damage and weaken the aorta's walls. Blood vessel diseases. These are diseases that cause blood vessels to become inflamed. Infection in the aorta. Rarely, a bacterial or fungal infection might cause an abdominal aortic aneurysms. Trauma. For example, being injured in a car accident can cause an abdominal aortic aneurysms. 

Phrase --> Predicted Entity

Abdominal aortic aneurysm --> Disease_E
Aneurysms --> Disease_E
aorta --> Anatomy_E
aortic aneurysms --> Disease_E
belly area (abdomen) --> Anatomy_E
abdominal aortic aneurysm --

In [None]:
def evaluate(ner_model, samples):
  '''Score a trained spaCy NER model against JSON formatted ground-truth samples.

  Every sample is a dictionary with a 'text' and an 'entities' entry. The text is
  run through `ner_model` and paired with the sample's entities as one Example;
  the dictionary produced by Scorer.score over all examples is returned.
  '''
  scorer = Scorer(ner_model)
  examples = [
    Example.from_dict(ner_model(item['text']), {'entities': item['entities']})
    for item in samples
  ]
  return scorer.score(examples)

In [None]:
# results = evaluate(nlp_ner, samples_val, )

In [None]:
# from print_dict import pd as pdic
# pdic(results)

# SemEval Evaluation Scripts

In [None]:
def list_to_spacy_ner_doc(ner_pred):
  '''
  Convert a single NER record of the form
  {'text': '...', 'entities': [(start, end, tag)]} into a spaCy Doc object.

  Spans that cannot be aligned to token boundaries (alignment mode 'contract')
  are skipped. If spaCy rejects the remaining entities, an error and the text
  are printed and the Doc is returned without entities.
  '''
  # Creates a blank pipeline (tokenizer only) with just the English vocab
  nlp = spacy.blank("en")

  Doc.set_extension("rel", default={}, force=True)

  text = ner_pred["text"]
  spans = ner_pred["entities"]

  # parsing tokens from the text and rebuilding a clean Doc from them
  tokens = nlp(text)
  spaces = [bool(tok.whitespace_) for tok in tokens]
  words = [t.text for t in tokens]
  doc = Doc(nlp.vocab, words=words, spaces=spaces)

  entities = []
  for start, end, label in spans:
    # available alignment modes: strict, contract, and expand
    entity = doc.char_span(start, end, label=label, alignment_mode='contract')

    # not considering the spans which are erroneous
    if entity is not None:
      entities.append(entity)

  try:
    doc.ents = entities
  except ValueError:
    # spaCy rejects conflicting (e.g. overlapping) entity spans
    print("=>> Error")
    print(text)

  return doc

In [None]:
def render_sample_pred(ner_doc):
  '''Render the entities of `ner_doc` inline in the notebook with displaCy.

  The color settings are read from the module-level `options` dictionary, so a
  cell defining `options` has to be executed before this function is called.
  '''
  spacy.displacy.render(ner_doc, style="ent", options=options, jupyter=True)

In [None]:
# Looking into one particular sample from the ground truth of the test set
ground = samples_test[0]
print(ground)
print()

# Predict the same text with the trained model
pred = nlp_ner(ground['text'])
print(pred)

print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

{'text': "#Causes - Abdominal aortic aneurysm - BD50.4Z\nAneurysms can develop anywhere along the aorta, but most aortic aneurysms occur in the part of the aorta that's in the belly area (abdomen). Several things can play a role in the development of an abdominal aortic aneurysm, including: Hardening of the arteries (atherosclerosis). Atherosclerosis occurs when fat and other substances build up on the lining of a blood vessel. High blood pressure. High blood pressure can damage and weaken the aorta's walls. Blood vessel diseases. These are diseases that cause blood vessels to become inflamed. Infection in the aorta. Rarely, a bacterial or fungal infection might cause an abdominal aortic aneurysms. Trauma. For example, being injured in a car accident can cause an abdominal aortic aneurysms. ", 'entities': [(10, 35, 'Disease_E'), (46, 55, 'Complication_E'), (87, 92, 'Anatomy_E'), (103, 119, 'Complication_E'), (133, 150, 'Anatomy_E'), (165, 185, 'Anatomy_E'), (243, 268, 'Disease_E'), (28

In [None]:
# Visualizing the NER prediction next to the ground truth of the same sample
print('\n########### Prediction ###########\n')
render_sample_pred(pred)
print('\n########### Ground Truth ###########\n')
render_sample_pred(list_to_spacy_ner_doc(ground))


########### Prediction ###########




########### Ground Truth ###########



In [None]:
def spacy_evaluate(ner_predictions, samples, show_res=False):
  '''spaCy evaluation function - not going to use it.

  Scores already computed predictions against the ground-truth samples. Both
  inputs hold records of the form {'text': ..., 'entities': [(start, end, tag)]}
  and are paired position by position. With `show_res` set, every prediction is
  rendered next to its ground truth. Returns the dictionary of Scorer.score.
  '''
  scorer = Scorer()
  examples = []
  for prediction, gold in zip(ner_predictions, samples):
    predicted_doc = list_to_spacy_ner_doc(prediction)

    if show_res:
      print('\n########### Prediction ###########\n')
      render_sample_pred(predicted_doc)
      print('\n########### Ground Truth ###########\n')
      render_sample_pred(list_to_spacy_ner_doc(gold))

    examples.append(Example.from_dict(predicted_doc, {'entities': gold['entities']}))

  return scorer.score(examples)

In [None]:
def save_predictions(ner_predictions, filename, semeval_format=True):
  '''Write the entities of all predictions to OUTPUT_DIR/filename and return them.

  The result has one inner list per prediction, each entity in the form
  {"label": "PER", "start": 2, "end": 4}, to work with the nereval library
  (SemEval 2013 - 9.1 task).

  semeval_format = True:  every prediction is a dictionary whose 'entities' are
                          (start, end, label) sequences
  semeval_format = False: every prediction is a spaCy Doc; its `ents` are used

  The whole list is dumped as ONE JSON document (not one record per line).
  '''
  with open(OUTPUT_DIR + '/' + filename, 'w') as json_file:
    if semeval_format:
      semeval_ent = [
        [{"label": ent[2], "start": ent[0], "end": ent[1]} for ent in pred['entities']]
        for pred in ner_predictions
      ]
    else:
      semeval_ent = [
        [{"label": ent.label_, "start": ent.start_char, "end": ent.end_char} for ent in pred.ents]
        for pred in ner_predictions
      ]

    # dumping it into a JSON file
    json_file.write(json.dumps(semeval_ent))

  return semeval_ent

In [None]:
def preprocess_results(results_by_tag):
    """Turn the per-tag evaluation results into one rounded DataFrame per entity label.

    Every label of ENTITY_LABELS except 'Code_E' is looked up in `results_by_tag`;
    its values are rounded to two decimals and an 'Entity' column holding the
    label is placed in front. Returns the frames as a list, in label order.
    """
    frames = []
    for tag in ENTITY_LABELS:
        if tag == 'Code_E':
            continue
        frame = pd.DataFrame(results_by_tag[tag]).round(decimals=2)
        frame.insert(0, 'Entity', tag)
        frames.append(frame)
    return frames

In [None]:
def semeval_evaluation(true, pred):
    """Evaluate predictions against the ground truth with nervaluate (SemEval 2013 metrics).

    Parameters:
    true = ground-truth entities, one list of {"label", "start", "end"} dicts per document
    pred = predicted entities in the same layout

    Returns:
    (results, results_by_entity) as pandas DataFrames. Both are also written to
    OUTPUT_DIR as 'overall_benchmark.xlsx' and 'entity_benchmark.xlsx'; existing
    files of a previous evaluation are overwritten.
    """
    evaluator = Evaluator(true, pred, tags=ENTITY_LABELS)
    # Some nervaluate releases return only (results, results_by_tag), others append
    # index information after them; just the first two items are needed here.
    results, results_by_tag = evaluator.evaluate()[:2]

    results = pd.DataFrame(results)
    results.to_excel(OUTPUT_DIR+'/'+'overall_benchmark.xlsx')

    results_by_entity = pd.concat(preprocess_results(results_by_tag))
    results_by_entity.to_excel(OUTPUT_DIR+'/'+'entity_benchmark.xlsx')

    return results, results_by_entity

### Validation Evaluation

In [None]:
start_time = time.time()
# Run the trained pipeline over every validation text and keep the resulting Doc objects
ner_predictions_val = [nlp_ner(entry['text']) for entry in samples_val]

In [None]:
# Entities predicted for the first validation document
print(ner_predictions_val[0].ents)

(Quinsy, Peri Tonsillar Abscess, acute tonsillitis)


In [None]:
# Saving the ground truth and the predictions into JSON files for later evaluation.
# The ground truth holds (start, end, label) tuples (default format); the predictions
# are spaCy Doc objects, hence semeval_format=False.
semeval_ground_val = save_predictions(samples_val, filename= 'ground_val.jsonl')
semeval_pred_val = save_predictions(ner_predictions_val, filename='predition_val.jsonl', semeval_format=False)

In [None]:
# Validation evaluation following SemEval 2013 metrics.
# `start_time` comes from the cell that runs the validation predictions, so the
# reported duration covers prediction, saving and evaluation.
results, results_by_entity = semeval_evaluation(true=semeval_ground_val, pred=semeval_pred_val)
end_time = time.time()
# Label fixed: this cell measures the validation run, not the test run
print(f'\nTotal Validation Time = {end_time - start_time} (sec)')


Total Test Time = 50.84383749961853 (sec)


### Test Evaluation

In [None]:
start_time = time.time()
# Run the trained pipeline over every test text and keep the resulting Doc objects
ner_predictions_test = [nlp_ner(entry['text']) for entry in samples_test]

In [None]:
# Saving the ground truth and the predictions into JSON files for later evaluation.
# The ground truth holds (start, end, label) tuples (default format); the predictions
# are spaCy Doc objects, hence semeval_format=False.
semeval_ground_test = save_predictions(samples_test, filename= 'ground_test.jsonl')
semeval_pred_test = save_predictions(ner_predictions_test, filename='predition_test.jsonl', semeval_format=False)

In [None]:
# Test evaluation following SemEval 2013 metrics.
# `start_time` comes from the cell that runs the test predictions, so the reported
# duration covers prediction, saving and evaluation.
results, results_by_entity = semeval_evaluation(true=semeval_ground_test, pred=semeval_pred_test)

end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')


Total Test Time = 25.958266496658325 (sec)


In [None]:
# `results` holds whatever the most recently executed semeval_evaluation() call
# returned (the test evaluation when the cells are run top to bottom); only its
# 'partial' column is reported here.
print('\n########### Overall Results ###########\n')
print(f"Precision: {results['partial']['precision']}\nRecall: {results['partial']['recall']}\nF1: {results['partial']['f1']}\n")


########### Overall Results ###########

Precision: 0.8301003344481606
Recall: 0.5585058505850585
F1: 0.6677428033360236



## Push to HuggingFace Hub

In [None]:
# Installs the package that provides the `push` helper used below
!pip install spacy-huggingface-hub

In [None]:
# Packages the trained pipeline into ./hf-output and builds a wheel from it.
# NOTE(review): the other cells train into and load from LM-Human_Model/model-best,
# while this command reads ./model/model-best — confirm the input path.
!python -m spacy package ./model/model-best ./hf-output --build wheel

In [None]:
from spacy_huggingface_hub import push

# Uploads the wheel built by `spacy package`; the path must match the file that
# command actually produced under ./hf-output.
result = push("./hf-output/EN_Disease_A_Z_SpaCy-0.0.0/dist/EN_Disease_A_Z_SpaCy-0.0.0-py3-none-any.whl")