In [None]:
import os
# Runtime detection: the notebook treats the presence of a HOSTNAME environment
# variable as "running on a Colab hosted runtime".
# NOTE(review): HOSTNAME can also be exported on non-Colab hosts (e.g. containers) - confirm.
isColab = True
try:
  # running on CoLAB Hosted runtime
  print(os.environ['HOSTNAME'])
  print('Running on CoLAB Hosted runtime.')
except KeyError:
  # HOSTNAME is not set -> running on Local runtime.
  # Only the missing-key case is handled so that KeyboardInterrupt and
  # unrelated errors are no longer silently swallowed.
  print('You are running this script on your local machine!')
  isColab = False

e027fc11067c
Running on CoLAB Hosted runtime.


In [None]:
'''Installing/Upgrading the Libraries Not Available to CoLAB'''
if isColab:
  !pip install -U 'spacy'==3.6.1
  # # Install this if you want to run on Colab GPU
  # !pip install -U spacy[cuda-autodetect]
  !pip install -U rdflib
  !pip install -U spaczz
  !pip install nervaluate
  !pip install -U print-dict
  !pip install -U wandb

Collecting spacy==3.6.1
  Downloading spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<8.2.0,>=8.1.8 (from spacy==3.6.1)
  Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (919 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.6/919.6 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting pathy>=0.10.0 (from spacy==3.6.1)
  Downloading pathy-0.11.0-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.3/47.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting pathlib-abc==0.1.1 (from pathy>=0.10.0->spacy==3.6.1)
  Downloading pathlib_abc-0.1.1-py3-none-any.whl (23 kB)
Installing collected packages: pathlib-abc, pathy, thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.2.3
    Uninstalling

In [None]:
'''Downloading Model lg = larger - with word vectors'''
if isColab:
  !python -m spacy download en_core_web_lg
  !python -m spacy download en_core_web_trf

2024-02-26 02:43:05.465262: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-26 02:43:05.465319: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-26 02:43:05.466653: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-26 02:43:05.474213: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-26 02:43:08.009235: I external/local_

In [None]:
# General Libraries
import pandas as pd
import srsly
import json
import os
import re
import csv
import time
from nervaluate import Evaluator
import warnings
import wandb

# Spacy Related Imports
import spacy
from spacy.util import minibatch, compounding, compile_infix_regex, get_words_and_spaces
from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example
from print_dict import pd as pdic
import shutil

# RDFLib libraries
from rdflib import Graph
import pprint
from rdflib import RDFS
from rdflib import URIRef
from rdflib.namespace import RDF
warnings.filterwarnings("ignore", message=r"\\[W007\\]", category=UserWarning)

In [None]:
# wandb.login()/

# Data Conversion Part
#### From Doccano Annotated Data for NER and RE into spaCy NER

In [None]:
def create_out_dir(OUTPUT_DIR):
  '''Guarantee that OUTPUT_DIR exists and is empty.

  Any directory tree already present at OUTPUT_DIR (e.g. the output of a
  previous run) is deleted first, then the directory is created again.
  '''
  left_from_previous_run = os.path.exists(OUTPUT_DIR)
  if left_from_previous_run:
    # wipe whatever an earlier run stored here
    shutil.rmtree(OUTPUT_DIR)

  os.makedirs(OUTPUT_DIR)

In [None]:
# Defining some of the Directories... Please also check the Main function at the bottom
# Prefix for every path used below: "/content/" on a Colab hosted runtime,
# "" (i.e. relative to the current working directory) on a local runtime.
COLAB_DIR = "/content/" if isColab else ""

data_path = COLAB_DIR+"dataset"

# Create the empty folder skeleton under COLAB_DIR (previously the folders were
# always created under /content, even on a local runtime). Nothing is downloaded
# here: this cell only makes the directories. exist_ok keeps the cell re-runnable.
for sub_dir in ("dataset/train", "dataset/val", "dataset/test", "config"):
  os.makedirs(COLAB_DIR+sub_dir, exist_ok=True)

In [None]:
TRAIN_DIR = COLAB_DIR+'dataset/train'
VALID_DIR = COLAB_DIR+'dataset/val'
TEST_DIR = COLAB_DIR+'dataset/test'
CONFIG_DIR = COLAB_DIR+'config'
OUTPUT_DIR = COLAB_DIR+'dataset'
# SCHEMA_FILE = "dataset/schema/Disease_Schema_Extended.ttl"
# STRUCTURED_DATA_DIR = "dataset/csv"

In [None]:
# Entity label names taken from the 'text' field of every entry in the label
# config. They are printed here and reused further down (displaCy colour map,
# per-tag evaluation), so this cell has to run before those.
label_entries = srsly.read_json(CONFIG_DIR + '/label_config_Entity.json')
ENTITY_LABELS = [entry['text'] for entry in label_entries]
print(ENTITY_LABELS)

['NAME', 'LOCATION', 'COLLEGE NAME', 'SKILLS', 'LANGUAGE', 'WORKED AS', 'YEARS OF EXPERIENCE', 'DEGREE', 'CERTIFICATION', 'UNIVERSITY', 'COMPANIES WORKED AT', 'AWARDS']


In [None]:
import string

def trim_entity_spans(text, spans):
    '''Data cleaning: shrink every span so it neither starts nor ends on whitespace/punctuation.

    Parameters:
    text  = the string the spans index into
    spans = iterable of (start, end, label) with `end` exclusive

    Returns:
    list of (start, end, label) tuples; spans that consist only of whitespace
    and punctuation are dropped.
    '''
    def _is_trimmable(char):
        # characters that may not sit on a span boundary
        return char.isspace() or char in string.punctuation

    text_length = len(text)
    trimmed = []
    for start, end, label in spans:
        left, right = start, end

        # move the left edge forward past leading whitespace/punctuation
        while left < text_length and _is_trimmable(text[left]):
            left += 1

        # move the right edge backward past trailing whitespace/punctuation
        while right > left and _is_trimmable(text[right - 1]):
            right -= 1

        # keep only spans that still cover at least one character
        if left < right:
            trimmed.append((left, right, label))

    return trimmed


In [None]:
def clean_entities(entities):
  '''Remove overlapping spans, keeping the longest span of every overlap.

  Parameters:
  entities = list of spans; item[0] is the start offset and item[1] the
             (exclusive) end offset, any further items (e.g. the label) are
             carried along untouched

  Returns:
  The same list object, filtered in place, with the surviving spans in their
  original relative order.

  Spans are considered longest first; a span is kept only if it does not overlap
  a span that was already kept. Among spans of equal length the one that appears
  first in the list wins, so exact duplicates collapse to a single span instead
  of being deleted altogether. Spans that merely touch (one ends where the next
  starts) do not overlap.
  '''
  # Longest first. sorted() is stable, so equal-length spans keep list order.
  by_length = sorted(range(len(entities)),
                     key=lambda idx: entities[idx][0] - entities[idx][1])

  kept = []
  for idx in by_length:
      start, end = entities[idx][0], entities[idx][1]
      # half-open intervals: disjoint when one ends at/before the other starts
      if all(end <= entities[k][0] or start >= entities[k][1] for k in kept):
          kept.append(idx)

  # Restore the original order and update the caller's list in place, because
  # the previous implementation also mutated (and returned) its argument.
  kept.sort()
  entities[:] = [entities[k] for k in kept]
  return entities

In [None]:
# def read_json(DATA_DIR):
# '''This function solves the issues with JSON Dumping non-ASCII characters'''
#     i=1
#     for json_line in srsly.read_jsonl(DATA_DIR+'/predition_spacy.json'):

#         with open(DATA_DIR+f'/doc-{i}.json', 'w', encoding='utf8') as json_file:
#             json.dump(json_line, json_file, ensure_ascii=False)
#         i+=1

# read_json(TRAIN_DIR)

In [None]:
def map_to_spacy_ner_db(DATA_DIR, is_spacy=False):
    """
    This function takes a directory of Doccano or Spacy annotated JSON/JSONL datasets for NER/RE
    and converts them into SpaCy DocBin Object which is Trainable via commandline

    Every file found under DATA_DIR (recursively, via os.walk) is read line by
    line with srsly.read_jsonl. Each line must provide a "text" string and an
    "entities" list of (start, end, label) character-offset triples.

    Parameters:
    DATA_DIR = string containing the directory of the JSON/JSONL files
    is_spacy = 'True' if the files are already in 'Spacy' JSON format; when False
               every span is first normalised to a (start, end, label) tuple

    Returns:
    DocBin holding one Doc per successfully converted line.

    Side effects:
    Registers the "rel" Doc extension and prints a summary of the conversion
    once all files have been processed.
    """
    # Creates a blank Tokenizer with just the English vocab
    nlp = spacy.blank("en")

    Doc.set_extension("rel", default={}, force=True)

    word_count = 0     # tokens of the documents that were stored
    no_doc = 0         # documents stored in the DocBin
    missing_doc = 0    # documents skipped because their entities could not be set
    no_entities = 0    # spans that could be aligned to tokens
    error_cnt = 0      # spans that could not be aligned (ignored)
    total_files = 0    # files visited below DATA_DIR

    # the DocBin will store the example documents
    db = DocBin()

    for dirname, _, filenames in os.walk(DATA_DIR):
        total_files += len(filenames)
        for filename in filenames:
            file_path = os.path.join(dirname, filename)
            # Iterate through the JSONL file; each line becomes one Doc
            for json_line in srsly.read_jsonl(file_path):
                text = json_line["text"]
                spans = json_line["entities"]

                # if the datasets are not in Spacy JSON format
                if not is_spacy:
                    spans = [(span[0], span[1], span[2]) for span in spans]

                # Tokenize, then rebuild the Doc from its words and spacing flags
                tokens = nlp(text)
                spaces = [bool(tok.whitespace_) for tok in tokens]
                words = [tok.text for tok in tokens]
                doc = Doc(nlp.vocab, words=words, spaces=spaces)

                entities = []
                for start, end, label in spans:
                    # alignment_mode may be: strict, contract, or expand
                    entity = doc.char_span(
                        start, end, label=label, alignment_mode="contract")

                    # Not considering the spans which are Erroneous
                    if entity is None:
                        error_cnt += 1
                    else:
                        no_entities += 1
                        entities.append(entity)

                try:
                    doc.ents = entities
                except ValueError:
                    # spaCy rejects the entity list (e.g. overlapping spans):
                    # skip this document but keep converting the others
                    missing_doc += 1
                    continue

                word_count += len(words)
                db.add(doc)
                no_doc += 1

    # Printed once after the walk so the file count matches the cumulative
    # totals, and so a summary is shown even when DATA_DIR yields no files.
    print(f"- Total Files: {total_files} \n- Processed Documents: {no_doc} \n- Missed Documents: {missing_doc} \n- Total Entities: {no_entities} \n- Erroneous Entities (Ignored): {error_cnt} \n- Total Words: {word_count}")

    return db


In [None]:
'''Saving Spacy Trainable Object File for NER'''
print('Preparing Training Dataset:')
db_train = map_to_spacy_ner_db(TRAIN_DIR, is_spacy=True)
# db_train = map_to_spacy_ner_db(TRAIN_DIR)
db_train.to_disk(OUTPUT_DIR + "/disease_A-Z_train.spacy")

print('\nPreparing Validation Dataset:')
db_valid = map_to_spacy_ner_db(VALID_DIR, is_spacy=True)
db_valid.to_disk(OUTPUT_DIR + "/disease_A-Z_valid.spacy")

print('\nPreparing Test Dataset:')
db_test = map_to_spacy_ner_db(TEST_DIR, is_spacy=True)
db_test.to_disk(OUTPUT_DIR + "/disease_A-Z_test.spacy")

Preparing Training Dataset:
- Total Files: 1 
- Processed Documents: 20 
- Missed Documents: 0 
- Total Entities: 1656 
- Erroneous Entities (Ignored): 113 
- Total Words: 41675

Preparing Validation Dataset:
- Total Files: 1 
- Processed Documents: 20 
- Missed Documents: 0 
- Total Entities: 1656 
- Erroneous Entities (Ignored): 113 
- Total Words: 41675

Preparing Test Dataset:
- Total Files: 1 
- Processed Documents: 20 
- Missed Documents: 0 
- Total Entities: 2140 
- Erroneous Entities (Ignored): 107 
- Total Words: 38459


# Spacy Model Training Part

In [None]:
# This is to debug the data in case they fail to be trained
# !python -m spacy debug data "/content/config/config.cfg"

In [None]:
# Creates the training configuration file from the given base configuration. You can configure it yourself on:
# https://spacy.io/usage/training#quickstart
# We are using a GPU based training setting for Accuracy (RoBERTa model)

!python -m spacy init fill-config config/base_config.cfg config/config.cfg

[+] Auto-filled config with all values
[+] Saved config
config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
'''
WARNING: Training Will Take Time -
Trains a SpaCy NER model on our training data - Please REMOVE
--gpu-id 0 if want to run this in CPU
'''

# This will remove the Models from previous run - comment it out if you want the model to retain
create_out_dir(COLAB_DIR+"LM-Human_Model")

train_start_time = time.time()

!python -m spacy train config/config.cfg --gpu-id 0 --output /content/LM-Human_Model --paths.train /content/dataset/disease_A-Z_train.spacy --paths.dev /content/dataset/disease_A-Z_valid.spacy

train_end_time = time.time()
print(f'Total Training Time = {train_end_time - train_start_time} (sec)')

2024-02-26 03:00:48.854292: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-26 03:00:48.854344: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-26 03:00:48.855590: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Saving to output directory: /content/LM-Human_Model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing Robe

In [None]:
!zip -r /content/LM-Human_Model/model-best.zip /content/LM-Human_Model/model-best

  adding: content/LM-Human_Model/model-best/ (stored 0%)
  adding: content/LM-Human_Model/model-best/tokenizer (deflated 81%)
  adding: content/LM-Human_Model/model-best/meta.json (deflated 66%)
  adding: content/LM-Human_Model/model-best/transformer/ (stored 0%)
  adding: content/LM-Human_Model/model-best/transformer/model (deflated 15%)
  adding: content/LM-Human_Model/model-best/transformer/cfg (stored 0%)
  adding: content/LM-Human_Model/model-best/ner/ (stored 0%)
  adding: content/LM-Human_Model/model-best/ner/moves (deflated 74%)
  adding: content/LM-Human_Model/model-best/ner/model (deflated 8%)
  adding: content/LM-Human_Model/model-best/ner/cfg (deflated 33%)
  adding: content/LM-Human_Model/model-best/config.cfg (deflated 61%)
  adding: content/LM-Human_Model/model-best/vocab/ (stored 0%)
  adding: content/LM-Human_Model/model-best/vocab/strings.json (deflated 73%)
  adding: content/LM-Human_Model/model-best/vocab/key2row (stored 0%)
  adding: content/LM-Human_Model/model-be

In [None]:
# loading the best model from the directory (saved during the training)
# Please download it from here: https://drive.google.com/file/d/1JlrIfJycQwQ3k9rLlOAIhYEWo9EIQExz/view?usp=sharing
from spacy.util import load_model_from_path

nlp_ner = spacy.load(COLAB_DIR+"LM-Human_Model/model-best")

In [None]:
'''Assigns different colors to the Entities during visualization.'''

color_list = ['yellow', 'white', 'orange', '#008000', '#800000', '#0D9CB4', '#5813C7', '#0D350E', '#1AA436',
          '#1AE0F9', '#BADCA1', '#78A2E5', '#D845FB', '#54B69E', '#800080', '#FF00FF', '#000080']

colors = dict(zip(ENTITY_LABELS, color_list))
options = {"colors": colors}

In [None]:
# Inferencing and visualizing some sample text using the trained model
text_inf = "Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology"
doc_inf = nlp_ner(text_inf)

spacy.displacy.render(doc_inf, style="ent", options=options, jupyter=True)

In [None]:
'''
Strict Evaluation of the model separately - Using Ground Truth Test Data
This Evaluation Score does not consider Partial Match
'''
start_time = time.time()

!python -m spacy evaluate --gpu-id 0 LM-Human_Model/model-best/ dataset/disease_A-Z_test.spacy

end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')

2024-02-26 03:15:03.207165: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-26 03:15:03.207223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-26 03:15:03.208527: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Using GPU: 0[0m
Token indices sequence length is longer than the specified maximum sequence length for this model (3739 > 512). Running this sequence through the model will result in indexing errors
[1m

TOK     100.00
NER P   53.75 
NER R   12.71 
NER F   20.56 
SPEED   7998  

[1m

                          P       R       F
NAME                  

# Evaluation using the Test/Validation Data [without cmd]

### Check the following Tutorial:
https://github.com/wjbmattingly/spacy_tutorials_3x/blob/main/02_02_formal_test.ipynb

In [None]:
def load_data(file_path: str, nlp):
  '''Read a serialized DocBin file and return its documents as plain dictionaries.

  Parameters:
  file_path = path of the DocBin (.spacy) file
  nlp       = object whose `vocab` is used to rebuild the stored docs

  Returns:
  (samples, entities_count) where samples is a list of
  {"text": str, "entities": [(start_char, end_char, label), ...]} and
  entities_count is the total number of entities over all samples.
  A warning is issued for every document that has no entities.
  '''
  samples = []
  entities_count = 0
  stored_docs = DocBin().from_disk(file_path).get_docs(nlp.vocab)
  for doc in stored_docs:
    annotated = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if not annotated:
      warnings.warn("Sample without entities!")
    entities_count += len(annotated)
    samples.append({"text": doc.text, "entities": annotated})
  return samples, entities_count

In [None]:
samples_val, entities_count_val = load_data(OUTPUT_DIR + "/disease_A-Z_valid.spacy", nlp_ner)
samples_test, entities_count_test = load_data(OUTPUT_DIR + "/disease_A-Z_test.spacy", nlp_ner)

In [None]:
# Looking into one particular sample from the ground truth of the test set
ground = samples_test[0]
ground

{'text': "\xa0 \xa0,Contact,www.linkedin.com/in/bikram-,adhikari-61716b240  (LinkedIn) Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology,\xa0Information Technology \xa0·\xa0(December,2011\xa0-\xa0December 2015),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/bipin-,rai-0821a1201  (LinkedIn),Top Skills,JavaScript,Project Management,English Bipin Rai,Full Stack Developer | React JS | Node JS,Kathmandu,Experience,Scrypt Spider,Frontend Web Developer,July 2021\xa0-\xa0Present\xa0 (1 year 1 month),Kathmandu, Bāgmatī, Nepal,Education,Vedas College,Bachelor's,\xa0Computer Science \xa0·\xa0(2016\xa0-\xa02021),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/suresh-,tamang-6636b4134  (LinkedIn),Top Skills,React.js,React Native,laravel Suresh Tamang,React | React Native | Blockchain Enthusiast

In [None]:
# predicting the text of the above single sample with the model
pred = nlp_ner(ground['text'])

print(ground['text'])

print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

Token indices sequence length is longer than the specified maximum sequence length for this model (1384 > 512). Running this sequence through the model will result in indexing errors


   ,Contact,www.linkedin.com/in/bikram-,adhikari-61716b240  (LinkedIn) Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology, Information Technology  · (December,2011 - December 2015),  Page 1 of 1,    ,Contact,www.linkedin.com/in/bipin-,rai-0821a1201  (LinkedIn),Top Skills,JavaScript,Project Management,English Bipin Rai,Full Stack Developer | React JS | Node JS,Kathmandu,Experience,Scrypt Spider,Frontend Web Developer,July 2021 - Present  (1 year 1 month),Kathmandu, Bāgmatī, Nepal,Education,Vedas College,Bachelor's, Computer Science  · (2016 - 2021),  Page 1 of 1,    ,Contact,www.linkedin.com/in/suresh-,tamang-6636b4134  (LinkedIn),Top Skills,React.js,React Native,laravel Suresh Tamang,React | React Native | Blockchain Enthusiast |,Nepal,Summary,I love to accept challenges and learn new technology tha

In [None]:
def evaluate(ner_model, samples):
  '''Score a trained spaCy NER model against annotated samples.

  Parameters:
  ner_model = callable pipeline; it is applied to the text of every sample
  samples   = iterable of {'text': ..., 'entities': ...} dictionaries

  Returns:
  whatever Scorer.score returns for the collected examples
  '''
  scorer = Scorer(ner_model)
  # one Example per sample: the model prediction paired with the gold entities
  examples = [
    Example.from_dict(ner_model(sample['text']), {'entities': sample['entities']})
    for sample in samples
  ]
  return scorer.score(examples)

In [None]:
# results = evaluate(nlp_ner, samples_val, )

In [None]:
# from print_dict import pd as pdic
# pdic(results)

# SemEval Evaluation Scripts

In [None]:
def list_to_spacy_ner_doc(ner_pred):
  '''
  This function takes a single NER sample (a dictionary) of the form
  {'text': '...', 'entities':[(start, end, tag)]} and converts it into a SpaCy Doc Object

  Parameters:
  ner_pred = dictionary with the raw 'text' and its 'entities'; start/end are
             character offsets into the text

  Returns:
  Doc built from the text. Its ents are the spans that could be aligned to
  token boundaries; spans that cannot be aligned are silently skipped. If spaCy
  rejects the entity list, an error and the text are printed and the Doc is
  returned without those entities.

  Side effects:
  Registers the "rel" Doc extension.
  '''
  # Creates a blank Tokenizer with just the English vocab
  nlp = spacy.blank("en")

  Doc.set_extension("rel", default={},force=True)

  text = ner_pred["text"]
  spans = ner_pred["entities"]

  # Tokenize, then rebuild the Doc from its words and spacing flags
  tokens = nlp(text)
  spaces = [bool(tok.whitespace_) for tok in tokens]
  words = [tok.text for tok in tokens]
  doc = Doc(nlp.vocab, words=words, spaces=spaces)

  entities = []
  for start, end, label in spans:
    # alignment_mode may be: strict, contract, or expand
    entity = doc.char_span(start, end, label=label, alignment_mode='contract')

    # Not considering the spans which are Erroneous
    if entity is not None:
      entities.append(entity)

  try:
    doc.ents = entities
  except ValueError:
    # e.g. overlapping spans; report it but still hand back the Doc
    print("=>> Error")
    print(text)

  return doc

In [None]:
def render_sample_pred(ner_doc):
  '''Display the entities of `ner_doc` inline in the notebook with displaCy,
  using the notebook-level `options` (entity colour map).'''
  render_settings = {"style": "ent", "options": options, "jupyter": True}
  spacy.displacy.render(ner_doc, **render_settings)

In [None]:
# Looking into one particular sample from the ground truth of the test set
ground = samples_test[0]
print(ground)
print()

pred = nlp_ner(ground['text'])
print(pred)

print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

{'text': "\xa0 \xa0,Contact,www.linkedin.com/in/bikram-,adhikari-61716b240  (LinkedIn) Bikram Adhikari,Network Operations Center Engineer at Vianet Communications,Kathmandu, Bāgmatī, Nepal,Experience,Vianet Communications,Network Operations Center Engineer,Education,Purbanchal Vishwavidalaya,Bachelor in Information Technology,\xa0Information Technology \xa0·\xa0(December,2011\xa0-\xa0December 2015),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/bipin-,rai-0821a1201  (LinkedIn),Top Skills,JavaScript,Project Management,English Bipin Rai,Full Stack Developer | React JS | Node JS,Kathmandu,Experience,Scrypt Spider,Frontend Web Developer,July 2021\xa0-\xa0Present\xa0 (1 year 1 month),Kathmandu, Bāgmatī, Nepal,Education,Vedas College,Bachelor's,\xa0Computer Science \xa0·\xa0(2016\xa0-\xa02021),\xa0 Page 1 of 1, \xa0 \xa0,Contact,www.linkedin.com/in/suresh-,tamang-6636b4134  (LinkedIn),Top Skills,React.js,React Native,laravel Suresh Tamang,React | React Native | Blockchain Enthusiast

In [None]:
# Visualizing the NER Predictions against the Ground Truth 'samples'
print('\n########### Prediction ###########\n')
render_sample_pred(pred)
print('\n########### Ground Truth ###########\n')
render_sample_pred(list_to_spacy_ner_doc(ground))


########### Prediction ###########




########### Ground Truth ###########



In [None]:
def spacy_evaluate(ner_predictions, samples, show_res=False):
  '''Spacy Evaluation Function - Not going to use it.

  Scores already computed predictions against the ground truth.

  Parameters:
  ner_predictions = predictions, each in the dictionary form accepted by list_to_spacy_ner_doc
  samples         = ground-truth dictionaries, paired with the predictions by position
  show_res        = when True, render prediction and ground truth of every pair

  Returns:
  the result of Scorer.score over all (prediction, ground truth) examples
  '''
  scorer = Scorer()
  scored_pairs = []
  for prediction, gold in zip(ner_predictions, samples):
    predicted_doc = list_to_spacy_ner_doc(prediction)

    if show_res:
      print('\n########### Prediction ###########\n')
      render_sample_pred(predicted_doc)
      print('\n########### Ground Truth ###########\n')
      render_sample_pred(list_to_spacy_ner_doc(gold))

    gold_annotations = {'entities': gold['entities']}
    scored_pairs.append(Example.from_dict(predicted_doc, gold_annotations))

  return scorer.score(scored_pairs)

In [None]:
def save_predictions(ner_predictions, filename, semeval_format=True):
  '''Convert entity annotations to label/start/end records, write them to
  OUTPUT_DIR/filename as one JSON array and return them.

  Parameters:
  ner_predictions = the documents to convert, see semeval_format
  filename        = name of the file created inside OUTPUT_DIR
  semeval_format  = True:  every item is a dictionary whose 'entities' entry
                           holds (start, end, label) triples
                    False: every item is an object exposing `.ents` with
                           label_, start_char and end_char (e.g. a spaCy Doc)

  Returns:
  list with one entry per document, each a list of
  {"label": ..., "start": ..., "end": ...} dictionaries
  '''
  def _to_records(pred):
    # records look like: [{"label": "PER", "start": 2, "end": 4}, ... ]
    if semeval_format:
      return [{"label": ent[2], "start": ent[0], "end": ent[1]}
              for ent in pred['entities']]
    return [{"label": ent.label_, "start": ent.start_char, "end": ent.end_char}
            for ent in pred.ents]

  with open(OUTPUT_DIR+'/'+filename, 'w') as json_file:
    semeval_ent = [_to_records(pred) for pred in ner_predictions]
    # dumping it into a JSON file
    json_file.write(json.dumps(semeval_ent))

  return semeval_ent
  # # This is single line JSON Dump of the entile list of dictionary - parser cannot parse it directly
  # with open(OUTPUT_DIR+'/predition.jsonl', 'w') as fout:
  #     json.dump(ner_predictions, fout)

In [None]:
def preprocess_results(results_by_tag):
    '''Build one result table per entity label.

    Parameters:
    results_by_tag = mapping from entity label to that label's evaluation results
                     (anything pandas.DataFrame accepts)

    Returns:
    list of DataFrames, one per label in ENTITY_LABELS (the label 'Code_E' is
    skipped), rounded to two decimals and with the label repeated in a leading
    'Entity' column.
    '''
    tables = []
    for entity in (label for label in ENTITY_LABELS if label != 'Code_E'):
        table = pd.DataFrame(results_by_tag[entity]).round(decimals=2)
        # first column tells which entity the rows belong to
        table.insert(0, 'Entity', entity)
        tables.append(table)
    return tables

In [None]:
def semeval_evaluation(true, pred):
    '''Evaluate predictions against the ground truth with nervaluate and export the scores.

    Parameters:
    true = ground-truth entities, one list of {"label", "start", "end"} dictionaries per document
    pred = predicted entities in the same layout

    Returns:
    (results, results_by_entity): the overall scores as a DataFrame and the
    per-entity tables concatenated into a single DataFrame.

    Side effects:
    Writes overall_benchmark.xlsx and entity_benchmark.xlsx into OUTPUT_DIR.
    '''
    evaluator = Evaluator(true, pred, tags=ENTITY_LABELS)
    # Only the first two return values are used. The star target keeps the
    # unpacking working if evaluate() returns additional items.
    # NOTE(review): nervaluate is installed unpinned and newer releases reportedly
    # return extra index structures - confirm against the installed version.
    results, results_by_tag, *_ = evaluator.evaluate()

    results = pd.DataFrame(results)
    results.to_excel(OUTPUT_DIR+'/'+'overall_benchmark.xlsx')

    results_by_entity = pd.concat(preprocess_results(results_by_tag))
    results_by_entity.to_excel(OUTPUT_DIR+'/'+'entity_benchmark.xlsx')

    return results, results_by_entity

### Validation Evaluation

In [None]:
start_time = time.time()
# Saving the predictions in a list for Validation Set
ner_predictions_val = []

for sample in samples_val:
    ner_predictions_val.append(nlp_ner(sample['text']))

In [None]:
print(ner_predictions_val[0].ents)

In [None]:
# Saving the ground truth and the predictions to disk (one JSON array per file) for later evaluation.
semeval_ground_val = save_predictions(samples_val, filename= 'ground_val.jsonl')
semeval_pred_val = save_predictions(ner_predictions_val, filename='predition_val.jsonl', semeval_format=False)

In [None]:
# Validation evaluation following SemEval 2013 metrics
results, results_by_entity = semeval_evaluation(true=semeval_ground_val, pred=semeval_pred_val)
end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')

### Test Evaluation

In [None]:
start_time = time.time()
# Saving the predictions in a list for Test Set
ner_predictions_test = []

for sample in samples_test:
    ner_predictions_test.append(nlp_ner(sample['text']))

In [None]:
# Saving the ground truth and the predictions to disk (one JSON array per file) for later evaluation.
semeval_ground_test = save_predictions(samples_test, filename= 'ground_test.jsonl')
semeval_pred_test = save_predictions(ner_predictions_test, filename='predition_test.jsonl', semeval_format=False)

In [None]:
# Test evaluation following SemEval 2013 metrics
results, results_by_entity = semeval_evaluation(true=semeval_ground_test, pred=semeval_pred_test)

end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')


Total Test Time = 57.62936568260193 (sec)


In [None]:
print('\n########### Overall Results ###########\n')
print(f"Precision: {results['partial']['precision']}\nRecall: {results['partial']['recall']}\nF1: {results['partial']['f1']}\n")


########### Overall Results ###########

Precision: 0.7144268774703557
Recall: 0.1689252336448598
F1: 0.2732426303854875



## Push to HuggingFace Hub

In [None]:
!pip install spacy-huggingface-hub

In [None]:
!python -m spacy package ./model/model-best ./hf-output --build wheel

In [None]:
from spacy_huggingface_hub import push

result = push("./hf-output/EN_Disease_A_Z_SpaCy-0.0.0/dist/EN_Disease_A_Z_SpaCy-0.0.0-py3-none-any.whl")