<a href="https://colab.research.google.com/github/dtim-upc/THOR/blob/main/LM-SD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# General Libraries
import pandas as pd
import srsly
import json
import os
import re
import csv
import time
from nervaluate import Evaluator
import warnings
import wandb

# Spacy Related Imports
import spacy
from spacy.util import minibatch, compounding, compile_infix_regex, get_words_and_spaces
from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.training import Example
from print_dict import pd as pdic

# RDFLib libraries
from rdflib import Graph
import pprint
from rdflib import RDFS
from rdflib import URIRef
from rdflib.namespace import RDF

In [None]:
# wandb.login()

# Data Conversion Part
#### From Doccano Annotated Data for NER and RE into spaCy NER Format

In [None]:
# Directory the JSONL annotations generated from the structured data are written to
# (default out_dir of sd_to_docanno) and from which the training DocBin is built.
TRAIN_DIR = 'dataset/train'
# Directory of annotated JSONL files converted into the validation DocBin.
VALID_DIR = 'dataset/val'
# Directory of annotated JSONL files converted into the "test on train" DocBin.
TEST_ON_TRAIN_GT_DIR = 'dataset/test'
# Directory of annotated JSONL files converted into the Mayoclinic test DocBin.
TEST_MAYO_DIR = 'dataset/test_mayoclinic'
# Directory holding label_config_Entity.json (read when listing the entity labels).
CONFIG_DIR = 'config'
# Directory the serialized .spacy files are written to and later loaded from.
OUTPUT_DIR = 'dataset'
# RDF schema file passed to get_rdf_graph to derive the (S, P, O) templates.
SCHEMA_FILE = "dataset/schema/Disease_Schema_Extended.ttl"
# Directory walked by structured_data_to_triples to find the structured CSV files.
STRUCTURED_DATA_DIR = "dataset/csv"

In [None]:
def get_data_from_csv(file_name=""):
    '''
    Read a two-column CSV (e.g. Disease, Anatomy) into an instance -> values mapping.

    Both columns may hold several comma-separated names inside one cell. Every
    name found in the first column becomes a key whose value is the list of
    names found in the second column of the same row, e.g.
    {"Tuberculosis": ['lungs', 'brain', 'kidneys', 'spine'], ...}.
    If an instance appears in several rows, the last row wins.

    Rows with fewer than two columns (including blank lines) are skipped, and
    empty names produced by stray commas are dropped.

    Args:
        file_name: path of the CSV file; its first row is treated as the header.

    Returns:
        (header, structured_data): the header row as a list of strings and the
        instance -> list-of-values dictionary.

    Raises:
        StopIteration: if the file does not even contain a header row.
    '''
    structured_data = {}

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted cells are handled by the reader and not by the file object
    with open(file_name, 'r', newline='') as file:
        csvreader = csv.reader(file)
        # the first row only names the columns; it is returned but not parsed as data
        header = next(csvreader)

        for row in csvreader:
            # csv.reader yields [] for blank lines; such rows (or rows missing
            # the second column) carry no instance/value pair
            if len(row) < 2:
                continue

            # the value domain is identical for every instance of this row,
            # so it is parsed once per row instead of once per instance
            value_domain = [val.strip() for val in row[1].split(',')]
            value_domain = [val for val in value_domain if val]

            for instance in row[0].split(','):
                instance = instance.strip()
                if not instance:
                    continue
                # each instance gets its own list so later mutation of one
                # entry cannot leak into its siblings
                structured_data[instance] = list(value_domain)

    return header, structured_data

In [None]:
def sd_to_docanno(sd_data, is_reverse, S_P_O, TEXT_ID, ENT_ID, REL_ID, out_dir=TRAIN_DIR):
    '''Write one synthetic annotated sentence per instance/value pair to a JSONL file.

    Each pair (e.g. 'Tuberculosis', 'lungs') is rendered as
    "<first phrase> <relation words> <second phrase>" with both phrases
    annotated as entities and linked by one relation. The output file is
    "<out_dir>/<relation name>.jsonl" and is overwritten on every call.

    Args:
        sd_data: mapping of instance -> list of values.
        is_reverse: when truthy the value is placed first and the instance
            last; otherwise the instance comes first.
        S_P_O: (subject label, relation name, object label). The subject label
            is attached to the first phrase and the object label to the last.
        TEXT_ID, ENT_ID, REL_ID: first free ids for texts, entities, relations.
        out_dir: directory of the JSONL file.

    Returns:
        (TEXT_ID, ENT_ID, REL_ID) advanced past everything written here.
    '''
    subject_label, relation_name, object_label = S_P_O[0], S_P_O[1], S_P_O[2]
    file_name = out_dir + '/' + relation_name + '.jsonl'

    # 'has_side_effect_R' -> ' has side effect ': the last '_'-separated chunk
    # is dropped and the rest is padded with one space on each side
    rel_words = relation_name.split('_')[:-1]
    rel_name_spaced = ' '.join(['', *rel_words, ''])

    with open(file_name, 'w') as json_file:
        for instance, val_domains in sd_data.items():
            for val in val_domains:
                # the only difference between the two directions is which
                # phrase opens the sentence and which one closes it
                if is_reverse:
                    head, tail = val, instance
                else:
                    head, tail = instance, val

                text = head + rel_name_spaced + tail
                text_len = len(text)

                ent_1 = {'id': ENT_ID, "label": subject_label,
                         "start_offset": 0, "end_offset": len(head)}
                ent_2 = {'id': ENT_ID + 1, "label": object_label,
                         "start_offset": text_len - len(tail), "end_offset": text_len}
                rel = {"id": REL_ID, "from_id": ENT_ID, "to_id": ENT_ID + 1, "type": relation_name}

                # NOTE(review): "relations" holds a single dict while "entities"
                # is a list - confirm the relation consumer expects this shape
                tem_docc_json = {"id": TEXT_ID, "text": text, "entities": [ent_1, ent_2],
                                 "relations": rel, "Comments": []}

                # one JSON document per line
                json_file.write(json.dumps(tem_docc_json))
                json_file.write('\n')

                TEXT_ID += 1
                REL_ID += 1
                ENT_ID += 2

    return TEXT_ID, ENT_ID, REL_ID

In [None]:
def structured_data_to_triples(templates, STRUCTURED_DATA_DIR=STRUCTURED_DATA_DIR):
    '''Convert every structured-data CSV under a directory into annotated JSONL files.

    Each file found by walking STRUCTURED_DATA_DIR is read with
    get_data_from_csv and its two header names are matched against the
    (S, P, O) templates. A header equal to (S, O) is written in the given
    order, a header equal to (O, S) is written reversed, and the first
    matching template wins. Files whose header matches no template (or has
    fewer than two columns) are reported and skipped.

    Args:
        templates: iterable of (subject, predicate, object) name triples.
        STRUCTURED_DATA_DIR: root directory that is walked recursively.

    Returns:
        None. The annotations are written by sd_to_docanno into TRAIN_DIR.
    '''
    # running ids shared by all files so they stay unique across the output
    TEXT_ID = 0
    ENT_ID = 0
    REL_ID = 0

    for dirname, _, filenames in os.walk(STRUCTURED_DATA_DIR):
        for filename in filenames:
            file_path = os.path.join(dirname, filename)
            sd_header, sd_data = get_data_from_csv(file_path)
            print(sd_header)

            is_reverse = None
            S_P_O = None

            # a header with fewer than two columns cannot describe a relationship
            if len(sd_header) >= 2:
                # surrounding blanks (e.g. "Disease, Anatomy") must not defeat the match
                first_col = sd_header[0].strip()
                second_col = sd_header[1].strip()

                for S, P, O in templates:
                    if first_col == S and second_col == O:
                        # columns are in the same order as the template
                        S_P_O = (S + '_E', P + '_R', O + '_E')
                        is_reverse = False
                        break
                    elif first_col == O and second_col == S:
                        # columns are swapped with respect to the template
                        S_P_O = (S + '_E', P + '_R', O + '_E')
                        is_reverse = True
                        break

            if is_reverse is not None:
                # convert and save this file, carrying the ids over to the next one
                TEXT_ID, ENT_ID, REL_ID = sd_to_docanno(sd_data, is_reverse, S_P_O, TEXT_ID, ENT_ID, REL_ID, out_dir=TRAIN_DIR)
            else:
                print(f'Mismatch Between the Schema of the Graph and CSV Files in {sd_header}')

In [None]:
def get_rdf_graph(file_name=""):
    '''Parse an RDF file (.nt/.ttl/.xml etc) and return the resulting Graph.

    When file_name is empty the path is obtained from input_file() instead
    (per the original note this is the Colab hosted-runtime case; input_file
    is defined outside this block).
    '''
    source = file_name or input_file()

    rdf_graph = Graph()
    rdf_graph.parse(source)

    print('RDF File Reading Complete...')
    return rdf_graph

In [None]:
def get_templates(g):
    '''Return the (S, P, O) name triples described by an RDF schema graph.

    A predicate contributes a template when the graph declares both an
    rdfs:domain (subject) and an rdfs:range (object) for it. If the subject
    class has direct subclasses (rdfs:subClassOf), the template is repeated
    for each of them with the same predicate and object.

    Only the local part of each URI is kept: the text after '#', or the last
    path segment when the URI has no '#'.

    Predicates that declare only a domain or only a range cannot form a
    template; they are skipped with a warning instead of raising.
    If a predicate declares several domains the last one seen is used; if it
    declares several ranges the first one seen is used.

    Args:
        g: graph object exposing subject_objects(predicate=...).

    Returns:
        list of (subject, predicate, object) tuples of names.
    '''
    def local_name(uri):
        # URIs with a fragment keep the exact piece the '#'-split selects;
        # slash-style namespaces fall back to the last path segment
        text = str(uri)
        if '#' in text:
            return text.split('#')[1]
        return text.rstrip('/').rsplit('/', 1)[-1]

    # superclass -> direct subclasses, e.g. {'Treatment': ['Medicine', 'Precaution', 'Surgery']}
    sup_sub_dic = {}
    for sub_uri, sup_uri in g.subject_objects(predicate=RDFS.subClassOf):
        sup_sub_dic.setdefault(local_name(sup_uri), []).append(local_name(sub_uri))

    # predicate -> subject name (a later declaration replaces an earlier one)
    domains = {}
    for pred_uri, subj_uri in g.subject_objects(predicate=RDFS.domain):
        domains[local_name(pred_uri)] = local_name(subj_uri)

    # predicate -> object name (the first declaration is kept)
    ranges = {}
    for pred_uri, obj_uri in g.subject_objects(predicate=RDFS.range):
        ranges.setdefault(local_name(pred_uri), local_name(obj_uri))

    for pred in ranges:
        if pred not in domains:
            warnings.warn(f"Predicate '{pred}' has a range but no domain; skipped.")

    triples_name = []
    for pred, subj in domains.items():
        if pred not in ranges:
            warnings.warn(f"Predicate '{pred}' has a domain but no range; skipped.")
            continue

        obj = ranges[pred]
        triples_name.append((subj, pred, obj))

        # a predicate declared on a superclass also applies to its direct subclasses
        for sub_cls in sup_sub_dic.get(subj, []):
            triples_name.append((sub_cls, pred, obj))

    return triples_name

In [None]:
# for local runtime - upload the file (first time) in the file upload option (left)
graph = get_rdf_graph(file_name = SCHEMA_FILE)

RDF File Reading Complete...


In [None]:
templates = get_templates(graph)
pdic(templates)

[('Disease', 'affects', 'Anatomy'), ('Disease', 'caused_by', 'Cause'),
 ('Disease', 'has_code', 'Code'), ('Disease', 'has_complication', 'Complication'),
 ('Disease', 'has_diagnosis', 'Diagnosis'), ('Disease', 'has_precaution', 'Precaution'),
 ('Disease', 'has_risk_factor', 'Riskfactor'), ('Disease', 'has_symptom', 'Symptom'),
 ('Diagnosis', 'diagnosis_on', 'Anatomy'), ('Diagnosis', 'needs', 'Surgery'),
 ('Treatment', 'has_side_effect', 'Complication'),
 ('Medicine', 'has_side_effect', 'Complication'),
 ('Precaution', 'has_side_effect', 'Complication'),
 ('Surgery', 'has_side_effect', 'Complication'), ('Complication', 'influence', 'Anatomy'),
 ('Medicine', 'made_with', 'Composition'), ('Medicine', 'prescribed_for', 'Disease'),
 ('Surgery', 'surgery_for', 'Disease'), ('Surgery', 'surgery_on', 'Anatomy')]


In [None]:
structured_data_to_triples(templates)

['Anatomy', 'Surgery']
['Disease', 'Anatomy']
['Disease', 'Cause']
['Disease', 'Complication']
['Diagnosis', 'Disease']
['Disease', 'Medicine']
['Disease', 'Precaution']
['Disease', 'Riskfactor']
['Disease', 'Symptom']
['Medicine', 'Composition']


In [None]:
""" This is just to SHOW the Named Entities - No Real Purpose """
# collect the 'text' field of every entry in the entity label config
ENTITY_LABELS = [entry['text'] for entry in srsly.read_json(CONFIG_DIR + '/label_config_Entity.json')]
print(ENTITY_LABELS)

['Disease_E', 'Anatomy_E', 'Cause_E', 'Code_E', 'Diagnosis_E', 'Precaution_E', 'Riskfactor_E', 'Symptom_E', 'Medicine_E', 'Composition_E', 'Complication_E', 'Surgery_E']


In [None]:
def trim_entity_spans(text, spans):
    '''Data Cleaning: Removes leading and trailing white spaces from entity spans.

    Trimming never leaves the span: the start only moves right up to the end
    and the end only moves left down to the start, so a span consisting solely
    of white space collapses to an empty span (start == end) instead of an
    inverted one. Offsets lying beyond the text are clamped to len(text).

    Args:
        text: the annotated text the offsets refer to.
        spans: iterable of (start, end, label) with character offsets.

    Returns:
        list of (start, end, label) tuples, one per input span, in input order.
    '''
    invalid_span_tokens = re.compile(r'\s')

    valid_spans = []
    for start, end, label in spans:
        # an end offset past the text would make text[valid_end - 1] fail
        valid_end = min(end, len(text))
        valid_start = min(start, valid_end)

        while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
            valid_start += 1
        while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
            valid_end -= 1

        valid_spans.append((valid_start, valid_end, label))

    return valid_spans

In [None]:
def docanno_to_spacy_ner_db(DATA_DIR):
    '''
    Convert a directory of annotated JSONL files (NER/RE) into a spaCy DocBin.

    Every file found by walking DATA_DIR is read line by line. Each line must
    provide "text" and "entities" (items with "start_offset", "end_offset" and
    "label"). Entity offsets are white-space trimmed, aligned to token
    boundaries with alignment_mode='contract', and spans that cannot be
    aligned are counted as erroneous and ignored. A document whose entities
    cannot be assigned (doc.ents rejects them) is skipped entirely.

    A file that cannot be read or parsed is reported together with the
    underlying error; documents already converted from it are kept and the
    remaining files are still processed.

    Args:
        DATA_DIR: root directory that is walked recursively.

    Returns:
        DocBin holding one Doc per successfully converted JSONL line.
    '''
    # blank English pipeline: tokenizer only
    nlp = spacy.blank("en")

    Doc.set_extension("rel", default={}, force=True)

    word_count = 0
    no_disease = 0   # number of files visited
    no_doc = 0
    no_entities = 0
    error_cnt = 0

    # the DocBin will store the example documents
    db = DocBin()

    for dirname, _, filenames in os.walk(DATA_DIR):
        for filename in filenames:
            file_path = os.path.join(dirname, filename)

            try:
                for json_line in srsly.read_jsonl(file_path):
                    text = json_line["text"]
                    new_spans = [
                        (span["start_offset"], span["end_offset"], span["label"])
                        for span in json_line["entities"]
                    ]

                    # drop leading/trailing white space from the annotated offsets
                    valid_spans = trim_entity_spans(text, new_spans)

                    # tokenize, then rebuild a fresh Doc from the words and spacing
                    tokens = nlp(text)
                    spaces = [bool(tok.whitespace_) for tok in tokens]
                    words = [t.text for t in tokens]
                    doc = Doc(nlp.vocab, words=words, spaces=spaces)

                    entities = []
                    for start, end, label in valid_spans:
                        # 'contract' shrinks the span to the tokens lying fully inside it
                        entity = doc.char_span(start, end, label=label, alignment_mode='contract')

                        if entity is None:
                            # span could not be aligned to any token: ignore it
                            error_cnt += 1
                        else:
                            no_entities += 1
                            entities.append(entity)

                    try:
                        doc.ents = entities
                    except ValueError:
                        # spaCy rejected the entity set (e.g. overlapping spans): skip this document
                        continue

                    word_count += len(words)
                    db.add(doc)
                    no_doc += 1

            except Exception as err:
                # best effort per file, but say which file failed and why
                print('Error While Loading JSON Data From Input Directory. Please check if you have other file type...')
                print(f'  -> {file_path}: {err!r}')

            no_disease += 1
    print(f"- Diseases: {no_disease} \n- Processed Documents: {no_doc} \n- Total Entities: {no_entities} \n- Erroneous Entities (Ignored): {error_cnt} \n- Total Words: {word_count}")

    return db

In [None]:
'''Saving Spacy Trainable Object File for NER'''
# Each step converts one directory of JSONL annotations into a DocBin and
# serializes it under OUTPUT_DIR as a .spacy file.

# training set: built from TRAIN_DIR
print('Preparing Training Dataset from Structured Data:')
db_train = docanno_to_spacy_ner_db(TRAIN_DIR)
db_train.to_disk(OUTPUT_DIR + "/disease_A-Z_train.spacy")

# validation set: built from VALID_DIR
print('\nPreparing Validation Dataset from Ground Truth Validation:')
db_valid = docanno_to_spacy_ner_db(VALID_DIR)
db_valid.to_disk(OUTPUT_DIR + "/disease_A-Z_valid.spacy")

# first test set: built from TEST_ON_TRAIN_GT_DIR
print('\nPreparing Test Dataset from Ground Truth Training:')
db_test = docanno_to_spacy_ner_db(TEST_ON_TRAIN_GT_DIR)
db_test.to_disk(OUTPUT_DIR + "/disease_A-Z_test_on_train.spacy")

# second test set: built from TEST_MAYO_DIR (db_test is rebound; the previous one is already on disk)
print('\nPreparing Test Dataset from Ground Truth Mayoclinic:')
db_test = docanno_to_spacy_ner_db(TEST_MAYO_DIR)
db_test.to_disk(OUTPUT_DIR + "/disease_A-Z_test_mayo.spacy")

Preparing Training Dataset from Structured Data:
- Diseases: 10 
- Processed Documents: 2353 
- Total Entities: 4706 
- Erroneous Entities (Ignored): 0 
- Total Words: 14010

Preparing Validation Dataset from Ground Truth Validation:
- Diseases: 61 
- Processed Documents: 366 
- Total Entities: 3989 
- Erroneous Entities (Ignored): 10 
- Total Words: 41284

Preparing Test Dataset from Ground Truth Training:
- Diseases: 240 
- Processed Documents: 1438 
- Total Entities: 18539 
- Erroneous Entities (Ignored): 34 
- Total Words: 178882

Preparing Test Dataset from Ground Truth Mayoclinic:
- Diseases: 1 
- Processed Documents: 90 
- Total Entities: 2222 
- Erroneous Entities (Ignored): 6 
- Total Words: 20588


# Spacy Model Training Part

In [None]:
# Creates the training configuration file from the given base configuration. You can configure it yourself on:
# https://spacy.io/usage/training#quickstart
# We are using a GPU based training setting for Accuracy (RoBERTa model)

#!python -m spacy init fill-config config/base_config.cfg config/config.cfg

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
'''
WARNING: Training Will Take Time -
Trains a SpaCy NER model on our training data - Please REMOVE
--gpu-id 0 if want to run this in CPU
'''

# NOTE: the training command below is commented out, so as written the two
# timestamps bracket nothing and the printed duration is close to zero.
# Uncomment the command to actually train and time the run.
train_start_time = time.time()

# !python -m spacy train config/config.cfg --gpu-id 0 --output model --paths.train dataset/disease_A-Z_train.spacy --paths.dev dataset/disease_A-Z_valid.spacy

train_end_time = time.time()
print(f'Total Training Time = {train_end_time - train_start_time} (sec)')

[i] Saving to output directory: model
[i] Using GPU: 0
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['transformer', 'ner']
[i] Initial learn rate: 0.0
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0         167.39    301.06    2.38    1.32   12.66    0.02
 10     200       26518.41  45397.28   14.32   12.39   16.95    0.14
 21     400         154.33    133.78   10.85   11.91    9.95    0.11
 31     600          54.45     48.30    8.28    9.42    7.40    0.08
 42     800          11.82     11.48   10.06   10.17    9.95    0.10
 52    1000          37.00     28.12    5.80    6.97    4.96    0.06
 63    1200          52.99     45.81    9.84   10.82    9.02    0.10
 73    1400          21.71     17.07    9.42   11.51    7.97    0.09
 84    1600           0.00      0.00   10.54   12.21    9.28    0.11
 94    1800          27.72     18.84    5.60    5.80    5.41    0.06
105    2000      

[2023-08-18 17:04:39,177] [INFO] Set up nlp object from config
[2023-08-18 17:04:39,984] [INFO] Pipeline: ['transformer', 'ner']
[2023-08-18 17:04:39,987] [INFO] Created vocabulary
[2023-08-18 17:04:39,988] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2023-08-18 17:04:51,166] [INFO] Initialized pipel

In [None]:
# loading the best model from the directory (saved during the training)
# Please download it from here: https://drive.google.com/file/d/1JlrIfJycQwQ3k9rLlOAIhYEWo9EIQExz/view?usp=sharing

nlp_ner = spacy.load("model/model-best")

In [None]:
# Inferencing and visualizing some sample text using the trained model
text_inf = "Tuberculosis generally damages the lungs, but it can also impair other parts of the body such as brain and spine. Typical signs of active Tuberculosis include chronic cough with blood-containing mucus, fever, night sweats, and weight loss. Tuberculosis damages the lungs whereas Malaria could detriment both kidneys by impairing the liver."

doc_inf = nlp_ner(text_inf)

colors = {'Disease_E': 'yellow', 'Anatomy_E': 'silver', 'Cause_E': '#0D9CB4',
          'Code_E': '#5813C7', 'Diagnosis_E': '#0D350E', 'Precaution_E': '#1AA436',
          'Riskfactor_E': '#1AE0F9', 'Symptom_E': 'orange', 'Medicine_E': '#BADCA1',
          'Composition_E': '#78A2E5', 'Complication_E': '#D845FB', 'Surgery_E': '#54B69E'}
options = {"colors": colors}

spacy.displacy.render(doc_inf, style="ent", options=options, jupyter=True)

In [None]:
# Inferencing and visualizing some sample text using the trained model
text_inf = "Chagas (CHAH-gus) disease is an inflammatory, infectious disease caused by the parasite Trypanosoma cruzi. This parasite is found in the feces of the triatomine (reduviid) bug. This bug is also known as the 'kissing bug'. Chagas disease is common in South America, Central America and Mexico, the primary home of the triatomine bug. Rare cases of Chagas disease have also been found in the southern United States."

doc_inf = nlp_ner(text_inf)

colors = {'Disease_E': 'yellow', 'Anatomy_E': 'silver', 'Cause_E': '#0D9CB4',
          'Code_E': '#5813C7', 'Diagnosis_E': '#0D350E', 'Precaution_E': '#1AA436',
          'Riskfactor_E': '#1AE0F9', 'Symptom_E': 'orange', 'Medicine_E': '#BADCA1',
          'Composition_E': '#78A2E5', 'Complication_E': '#D845FB', 'Surgery_E': '#54B69E'}
options = {"colors": colors}

spacy.displacy.render(doc_inf, style="ent", options=options, jupyter=True)

In [None]:
'''Evaluating the model separately - Using Validation Data'''
start_time = time.time()

!python -m spacy evaluate --gpu-id 0 model/model-best/ dataset/disease_A-Z_valid.spacy

end_time = time.time()
print(f'\nTotal Validation Time = {end_time - start_time} (sec)')

[i] Using GPU: 0
[1m

TOK     100.00
NER P   12.39 
NER R   16.95 
NER F   14.32 
SPEED   3814  

[1m

                     P       R       F
Disease_E        18.89   42.86   26.22
Complication_E    4.35    0.19    0.36
Anatomy_E         3.89   10.78    5.72
Cause_E           5.85   14.60    8.35
Riskfactor_E      1.43    0.60    0.84
Precaution_E     15.13   10.06   12.08
Diagnosis_E       0.00    0.00    0.00
Medicine_E       14.06   20.63   16.72
Symptom_E         0.00    0.00    0.00
Surgery_E         0.00    0.00    0.00
Composition_E     0.00    0.00    0.00


Total Validation Time = 21.23520541191101 (sec)


In [None]:
'''Evaluating the model separately - Using Ground Truth Training Data'''
start_time = time.time()

!python -m spacy evaluate --gpu-id 0 model/model-best/ dataset/disease_A-Z_test_on_train.spacy

end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')

[i] Using GPU: 0
[1m

TOK     100.00
NER P   13.24 
NER R   17.09 
NER F   14.92 
SPEED   6370  

[1m

                     P       R       F
Disease_E        19.65   42.30   26.84
Cause_E           7.08   13.89    9.38
Symptom_E         4.20    0.26    0.50
Complication_E    6.15    0.16    0.31
Anatomy_E         4.80   11.75    6.81
Medicine_E       13.96   20.22   16.52
Precaution_E     18.23   17.54   17.87
Surgery_E        26.32    1.08    2.07
Composition_E     1.17   27.87    2.25
Riskfactor_E      4.07    1.77    2.47
Diagnosis_E       0.00    0.00    0.00


Total Test Time = 38.44014859199524 (sec)


In [None]:
'''Evaluating the model separately - Using Mayoclinic Test Data'''
start_time = time.time()

!python -m spacy evaluate --gpu-id 0 model/model-best/ dataset/disease_A-Z_test_mayo.spacy

end_time = time.time()
print(f'\nTotal Test Time = {end_time - start_time} (sec)')

[i] Using GPU: 0
[1m

TOK     100.00
NER P   11.24 
NER R   12.24 
NER F   11.72 
SPEED   2665  

[1m

                     P       R       F
Disease_E        15.65   40.73   22.61
Anatomy_E         6.32    8.67    7.31
Cause_E           3.72   23.40    6.41
Complication_E   33.33    0.26    0.52
Riskfactor_E      2.56    0.74    1.14
Symptom_E         0.00    0.00    0.00
Composition_E     1.50    4.62    2.26
Medicine_E       29.10   10.37   15.29
Precaution_E     10.43   23.61   14.47
Diagnosis_E       0.00    0.00    0.00
Surgery_E        20.00    1.18    2.22


Total Test Time = 15.904410362243652 (sec)


# Evaluation using the Test/Validation Data [without cmd]
## Need to Implement the Confusion Matrix (Sklearn)

### Check the following Tutorial:
https://github.com/wjbmattingly/spacy_tutorials_3x/blob/main/02_02_formal_test.ipynb

In [None]:
def load_data(file_path: str, nlp):
    '''Load a serialized DocBin file into a list of {"text", "entities"} samples.

    Each sample holds the document text and its entities as
    (start_char, end_char, label) tuples. A warning is issued for every
    document without entities; such documents are still returned with an
    empty entity list.

    Returns:
        (samples, entities_count): the sample list and the total number of
        entities across all samples.
    '''
    stored_docs = DocBin().from_disk(file_path)

    samples = []
    entities_count = 0
    for doc in stored_docs.get_docs(nlp.vocab):
        found = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        if not found:
            warnings.warn("Sample without entities!")
        entities_count += len(found)
        samples.append({"text": doc.text, "entities": found})

    return samples, entities_count

In [None]:
samples_val, entities_count_val = load_data(OUTPUT_DIR + "/disease_A-Z_valid.spacy", nlp_ner)
samples_test, entities_count_test = load_data(OUTPUT_DIR + "/disease_A-Z_test_on_train.spacy", nlp_ner)
samples_test_mayo, entities_count_test_mayo = load_data(OUTPUT_DIR + "/disease_A-Z_test_mayo.spacy", nlp_ner)

In [None]:
# Look at one ground-truth sample; samples_test is loaded from
# disease_A-Z_test_on_train.spacy (the test set, not the validation set)
ground = samples_test[0]
ground

{'text': '#Diagnosis - Abdominal Pain - MD81.4\nGenerally abdominal pain goes with time, but if the pain does not subside, then one should seek medical help: Abdominal discomfort that lasts 1 week or longer, Abdominal pain that does not improve in 24 - 48 hours, or becomes more severe and frequent and occurs with nausea and vomiting, Bloating that persists for more than 2 days, Burning sensation or increase in frequency on urination, Diarrhoea for more than 5 daysFever (over 100°F for adults or 100.4°F for children) with pain, Prolonged poor appetite, Prolonged vaginal bleeding, Unexplained weight loss.',
 'entities': [(13, 27, 'Disease_E'),
  (47, 61, 'Disease_E'),
  (89, 93, 'Complication_E'),
  (147, 195, 'Symptom_E'),
  (197, 323, 'Symptom_E'),
  (325, 368, 'Symptom_E'),
  (370, 425, 'Symptom_E'),
  (427, 520, 'Symptom_E'),
  (522, 545, 'Symptom_E'),
  (547, 573, 'Symptom_E'),
  (575, 598, 'Symptom_E')]}

In [None]:
# predicting the text of the above single sample with the model
pred = nlp_ner(ground['text'])

print(ground['text'])

print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

#Diagnosis - Abdominal Pain - MD81.4
Generally abdominal pain goes with time, but if the pain does not subside, then one should seek medical help: Abdominal discomfort that lasts 1 week or longer, Abdominal pain that does not improve in 24 - 48 hours, or becomes more severe and frequent and occurs with nausea and vomiting, Bloating that persists for more than 2 days, Burning sensation or increase in frequency on urination, Diarrhoea for more than 5 daysFever (over 100°F for adults or 100.4°F for children) with pain, Prolonged poor appetite, Prolonged vaginal bleeding, Unexplained weight loss.

Phrase --> Predicted Entity

#Diagnosis - Abdominal Pain --> Disease_E
Generally abdominal pain --> Disease_E
Abdominal discomfort --> Disease_E
Abdominal pain --> Disease_E
24 - 48 hours --> Cause_E
Bloating --> Disease_E
Burning sensation or increase in frequency on urination --> Cause_E
Diarrhoea for --> Disease_E
daysFever ( --> Disease_E
over 100°F for adults or 100.4°F for children --> Caus

In [None]:
def evaluate(ner_model, samples):
    '''Score a trained spaCy NER model against {"text", "entities"} samples.

    The model is run on every sample text and the prediction is paired with
    the sample's reference entities; the scores computed by spaCy's Scorer
    over all pairs are returned.
    '''
    scorer = Scorer(ner_model)
    examples = [
        Example.from_dict(ner_model(sample['text']), {'entities': sample['entities']})
        for sample in samples
    ]
    return scorer.score(examples)

In [None]:
results = evaluate(nlp_ner, samples_val, )

In [None]:
from print_dict import pd as pdic
pdic(results)

{
    'token_acc': 1.0,
    'token_p': 1.0,
    'token_r': 1.0,
    'token_f': 1.0,
    'ents_p': 0.12394572790612395,
    'ents_r': 0.16946603158686388,
    'ents_f': 0.1431748385047125,
    'ents_per_type': {
        'Disease_E': {
            'p': 0.18885096700796358,
            'r': 0.42857142857142855,
            'f': 0.26217425638325875
        },
        'Complication_E': {
            'p': 0.043478260869565216,
            'r': 0.001876172607879925,
            'f': 0.003597122302158273
        },
        'Anatomy_E': {
            'p': 0.03891509433962264,
            'r': 0.10784313725490197,
            'f': 0.05719237435008665
        },
        'Cause_E': {
            'p': 0.05849889624724062,
            'r': 0.14600550964187328,
            'f': 0.0835303388494878
        },
        'Riskfactor_E': {
            'p': 0.014285714285714285,
            'r': 0.005988023952095809,
            'f': 0.008438818565400845
        },
        'Precaution_E': {
            'p': 

# SemEval Evaluation Scripts

In [None]:
def list_to_spacy_ner_doc(ner_pred):
    '''
    Convert one NER prediction/sample dictionary into a spaCy Doc.

    Args:
        ner_pred: dictionary of the form
            {'text': '...', 'entities': [(start, end, tag), ...]}
            with character offsets into the text.

    Returns:
        Doc built from the tokenized text. Spans that cannot be aligned to
        token boundaries (alignment_mode='contract') are left out. If spaCy
        rejects the entity set as a whole, the problem is printed and the Doc
        is returned without entities.
    '''
    # blank English pipeline: tokenizer only
    nlp = spacy.blank("en")

    Doc.set_extension("rel", default={}, force=True)

    text = ner_pred["text"]
    spans = ner_pred["entities"]

    # tokenize, then rebuild a fresh Doc from the words and spacing
    tokens = nlp(text)
    spaces = [bool(tok.whitespace_) for tok in tokens]
    words = [t.text for t in tokens]
    doc = Doc(nlp.vocab, words=words, spaces=spaces)

    entities = []
    for start, end, label in spans:
        # 'contract' shrinks the span to the tokens lying fully inside it
        entity = doc.char_span(start, end, label=label, alignment_mode='contract')

        # spans that match no token are ignored
        if entity is not None:
            entities.append(entity)

    try:
        doc.ents = entities
    except ValueError:
        # e.g. overlapping spans: report it and keep the Doc without entities
        print("=>> Error")
        print(text)

    return doc

In [None]:
def render_sample_pred(ner_doc):
    '''Render the entities of ner_doc inline in the notebook with displaCy.

    Uses the module-level `options` dictionary (entity colours) defined in an
    earlier cell.
    '''
    render_kwargs = {"style": "ent", "options": options, "jupyter": True}
    spacy.displacy.render(ner_doc, **render_kwargs)

In [None]:
# Look at one ground-truth sample; samples_test is loaded from
# disease_A-Z_test_on_train.spacy (the test set, not the validation set)
ground = samples_test[0]
print(ground)
print()

# run the trained model on the same text to compare against the ground truth
pred = nlp_ner(ground['text'])
print(pred)

print('\nPhrase --> Predicted Entity\n')
for ent in pred.ents:
  print(ent.text, '-->', ent.label_)

{'text': '#Diagnosis - Abdominal Pain - MD81.4\nGenerally abdominal pain goes with time, but if the pain does not subside, then one should seek medical help: Abdominal discomfort that lasts 1 week or longer, Abdominal pain that does not improve in 24 - 48 hours, or becomes more severe and frequent and occurs with nausea and vomiting, Bloating that persists for more than 2 days, Burning sensation or increase in frequency on urination, Diarrhoea for more than 5 daysFever (over 100°F for adults or 100.4°F for children) with pain, Prolonged poor appetite, Prolonged vaginal bleeding, Unexplained weight loss.', 'entities': [(13, 27, 'Disease_E'), (47, 61, 'Disease_E'), (89, 93, 'Complication_E'), (147, 195, 'Symptom_E'), (197, 323, 'Symptom_E'), (325, 368, 'Symptom_E'), (370, 425, 'Symptom_E'), (427, 520, 'Symptom_E'), (522, 545, 'Symptom_E'), (547, 573, 'Symptom_E'), (575, 598, 'Symptom_E')]}

#Diagnosis - Abdominal Pain - MD81.4
Generally abdominal pain goes with time, but if the pain does

In [None]:
# Visualize the NER prediction next to the ground truth for the sample inspected above.
# `pred` is the pipeline output for `ground['text']`; `ground` is the raw annotated sample.
print('\n########### Prediction ###########\n')
render_sample_pred(pred)
print('\n########### Ground Truth ###########\n')
# The annotated sample is converted with list_to_spacy_ner_doc first so that it can be
# rendered through the same displaCy helper as the prediction.
render_sample_pred(list_to_spacy_ner_doc(ground))


########### Prediction ###########




########### Ground Truth ###########



In [None]:
def spacy_evaluate(ner_predictions, samples, show_res=False):
    """Score predictions against gold samples with spaCy's `Scorer`.

    Kept for reference only; the notebook relies on the SemEval-style
    evaluation instead.

    Args:
        ner_predictions: predictions, each in the form accepted by
            `list_to_spacy_ner_doc`.
        samples: gold samples, paired positionally with the predictions; each
            one is indexed with `['entities']`.
        show_res: when true, render every prediction and its ground truth.

    Returns:
        The result of `Scorer.score` over all collected examples.
    """
    scorer = Scorer()
    examples = []

    for prediction, gold in zip(ner_predictions, samples):
        predicted_doc = list_to_spacy_ner_doc(prediction)

        if show_res:
            print('\n########### Prediction ###########\n')
            render_sample_pred(predicted_doc)
            print('\n########### Ground Truth ###########\n')
            render_sample_pred(list_to_spacy_ner_doc(gold))

        # Pair the predicted doc with the gold entity annotations.
        gold_annotations = {'entities': gold['entities']}
        examples.append(Example.from_dict(predicted_doc, gold_annotations))

    return scorer.score(examples)

In [None]:
def save_predictions(ner_predictions, filename, semeval_format=True, docanno_format=True):
    """Write predictions (or ground truth) to ``OUTPUT_DIR/filename`` as JSON.

    Two on-disk layouts are supported:

    * ``semeval_format=True``: the file holds one JSON array with one inner
      list per sample, each entity saved as
      ``{"label": ..., "start": ..., "end": ...}`` (the layout used for the
      SemEval 2013 - 9.1 style evaluation). ``docanno_format`` selects how the
      entities are read: ``True`` for annotated dict samples whose
      ``'entities'`` entries are ``(start, end, label)``, ``False`` for
      spaCy ``Doc`` objects (read through ``.ents``).
    * ``semeval_format=False``: regular spaCy layout, one JSON object per
      line: ``{"text": "", "entities": [[36, 40, "Complication_E"], ...]}``.
      Both ``Doc`` objects and annotated dict samples are accepted here;
      ``docanno_format`` is not consulted.

    Args:
        ner_predictions: iterable of spaCy ``Doc`` objects or annotated dicts.
        filename: file name created (or overwritten) inside ``OUTPUT_DIR``.
        semeval_format: choose the SemEval layout instead of the spaCy one.
        docanno_format: in SemEval layout, read samples as annotated dicts.

    Returns:
        The list of per-sample entity lists in SemEval layout; it stays empty
        when ``semeval_format`` is false.
    """
    semeval_ent = []
    with open(OUTPUT_DIR + '/' + filename, 'w', encoding='UTF-8') as json_file:
        for pred in ner_predictions:
            if semeval_format:
                if docanno_format:
                    # Prodigy/Doccano formatted ground truth: (start, end, label) tuples.
                    tmp_ent = [
                        {"label": ent[2], "start": ent[0], "end": ent[1]}
                        for ent in pred['entities']
                    ]
                else:
                    # spaCy Doc object: character offsets come from the entity spans.
                    tmp_ent = [
                        {"label": ent.label_, "start": ent.start_char, "end": ent.end_char}
                        for ent in pred.ents
                    ]
                semeval_ent.append(tmp_ent)
                continue

            # Regular spaCy layout, written as one JSON object per line.
            if isinstance(pred, dict):
                # Annotated dict sample; previously this path raised AttributeError.
                text = pred['text']
                tmp_ent = [[ent[0], ent[1], ent[2]] for ent in pred['entities']]
            else:
                text = pred.text
                tmp_ent = [[ent.start_char, ent.end_char, ent.label_] for ent in pred.ents]

            json_file.write(json.dumps({"text": text, "entities": tmp_ent}, ensure_ascii=False))
            json_file.write('\n')

        if semeval_format:
            # The SemEval layout is dumped once, as a single JSON array.
            json_file.write(json.dumps(semeval_ent, ensure_ascii=False))

    return semeval_ent
    # NOTE: dumping `ner_predictions` itself with a single json.dump was tried earlier;
    # that single-line dump of the entire list could not be parsed directly.

In [None]:
def preprocess_results(results_by_tag):
    """Turn per-tag evaluation results into a list of per-entity DataFrames.

    For every label in `ENTITY_LABELS` except 'Code_E', the entry of
    `results_by_tag` is converted to a DataFrame, rounded to two decimals and
    given a leading 'Entity' column holding the label.
    """
    frames = []
    for label in ENTITY_LABELS:
        if label == 'Code_E':
            continue
        frame = pd.DataFrame(results_by_tag[label]).round(decimals=2)
        # The label goes in front so the frames stay identifiable once concatenated.
        frame.insert(0, 'Entity', label)
        frames.append(frame)
    return frames

In [None]:
def semeval_evaluation(true, pred):
    """Evaluate predictions with nervaluate and export the results to Excel.

    Args:
        true: ground-truth entity lists, one list per sample.
        pred: predicted entity lists, aligned with `true`.

    Returns:
        A pair `(results, results_by_entity)` of DataFrames: the overall
        results and the concatenated per-entity results. They are also written
        to 'overall_benchmark.xlsx' and 'entity_benchmark.xlsx' in OUTPUT_DIR.
    """
    # NOTE(review): unpacking exactly two values assumes a nervaluate release whose
    # `evaluate()` returns (results, results_by_tag) - confirm against the installed version.
    overall, per_tag = Evaluator(true, pred, tags=ENTITY_LABELS).evaluate()

    overall_df = pd.DataFrame(overall)
    overall_df.to_excel(f'{OUTPUT_DIR}/overall_benchmark.xlsx')

    per_entity_df = pd.concat(preprocess_results(per_tag))
    per_entity_df.to_excel(f'{OUTPUT_DIR}/entity_benchmark.xlsx')

    return overall_df, per_entity_df

### Validation Evaluation

In [None]:
# Start the clock here; the validation evaluation cell reads `start_time` to report the total time.
start_time = time.time()

# Run the NER pipeline on the text of every validation sample and keep the resulting docs.
ner_predictions_val = [nlp_ner(sample['text']) for sample in samples_val]

In [None]:
# Print the first validation prediction as a quick sanity check.
print(ner_predictions_val[0])

#Causes - Quinsy - CA0K.1
Peri Tonsillar Abscess is usually a complication of an untreated or partially treated acute tonsillitis. The infection, in these cases, spreads to the peritonsillar area (peritonsillitis). This region comprises loose connective tissue and is hence susceptible to formation of abscess.


In [None]:
# Save the ground truth and the predictions to disk for later evaluation, and keep the
# returned SemEval-style entity lists in memory for the evaluation cell.
# Ground truth: annotated samples, read through their 'entities' lists (default flags).
semeval_ground_val = save_predictions(samples_val, filename= 'ground_val_semEval.jsonl')
# Predictions: pipeline outputs, read through `.ents` (docanno_format=False).
semeval_pred_val = save_predictions(ner_predictions_val, filename='predition_val_semEval.jsonl', semeval_format=True, docanno_format=False)

In [None]:
# Validation evaluation following SemEval 2013 metrics.
# semeval_evaluation also writes overall_benchmark.xlsx and entity_benchmark.xlsx into OUTPUT_DIR.
results, results_by_entity = semeval_evaluation(true=semeval_ground_val, pred=semeval_pred_val)
end_time = time.time()
# `start_time` was taken at the top of the validation prediction cell, so the reported time
# spans everything since then (prediction, saving and evaluation).
print(f'\nTotal Validation Time = {end_time - start_time} (sec)')


Total Validation Time = 43.31132888793945 (sec)


### Ground Truth Training Evaluation (For Ex-2)

In [None]:
# Restart the clock; the matching evaluation cell reads `start_time` to report the total time.
start_time = time.time()

# Run the NER pipeline on the text of every sample in `samples_test` and keep the resulting docs.
ner_predictions_test = [nlp_ner(sample['text']) for sample in samples_test]

In [None]:
# Save the ground truth and the predictions to disk for later evaluation; the returned
# SemEval-style entity lists are kept for the evaluation cell.
semeval_ground_test = save_predictions(samples_test, filename= 'ground_test.jsonl')
semeval_pred_test = save_predictions(ner_predictions_test, filename='predition_test.jsonl', semeval_format=True, docanno_format=False)

# Saving this for Experiment 2 in spaCy format: one {"text": ..., "entities": [...]} object per line.
# The return value is discarded because it stays empty when semeval_format=False.
_ = save_predictions(ner_predictions_test, filename='predition_LM_spacy.jsonl', semeval_format=False, docanno_format=False)

In [None]:
# Evaluation on `samples_test` following SemEval 2013 metrics.
# NOTE: semeval_evaluation always writes to overall_benchmark.xlsx / entity_benchmark.xlsx in
# OUTPUT_DIR, so this run replaces the spreadsheets produced by any earlier evaluation.
results, results_by_entity = semeval_evaluation(true=semeval_ground_test, pred=semeval_pred_test)
end_time = time.time()
# The elapsed time is measured from the `start_time` set in the corresponding prediction cell.
print(f'\nTotal Time for Prediction on Training Data = {end_time - start_time} (sec)')


Total Time for Prediction on Training Data = 198.69852328300476 (sec)


### Test Evaluation


In [None]:
# Restart the clock; the matching evaluation cell reads `start_time` to report the total time.
start_time = time.time()

# Run the NER pipeline on the text of every sample in `samples_test_mayo` and keep the resulting docs.
ner_predictions_test_mayo = [nlp_ner(sample['text']) for sample in samples_test_mayo]

In [None]:
# Save the ground truth and the predictions to disk for later evaluation; the returned
# SemEval-style entity lists are kept for the evaluation cell.
# NOTE(review): 'ground_test.jsonl' and 'predition_test.jsonl' are the same file names used
# when saving the `samples_test` results earlier, so this cell overwrites those files -
# confirm this is intended or give the Mayo Clinic outputs their own names.
semeval_ground_test_mayo = save_predictions(samples_test_mayo, filename= 'ground_test.jsonl')
semeval_pred_test_mayo = save_predictions(ner_predictions_test_mayo, filename='predition_test.jsonl', semeval_format=True, docanno_format=False)


In [None]:
# Evaluation on `samples_test_mayo` following SemEval 2013 metrics.
# NOTE: semeval_evaluation always writes to overall_benchmark.xlsx / entity_benchmark.xlsx in
# OUTPUT_DIR, so this run replaces the spreadsheets produced by any earlier evaluation.
results, results_by_entity = semeval_evaluation(true=semeval_ground_test_mayo, pred=semeval_pred_test_mayo)
end_time = time.time()
# The elapsed time is measured from the `start_time` set in the corresponding prediction cell.
print(f'\nTotal Test Time = {end_time - start_time} (sec)')


Total Test Time = 19.321892023086548 (sec)
