In [1]:
import json
import spacy
from spacy import displacy

In [2]:
def find_entity_positions_GPT(text, entities, entity_types):
    
    entity_positions = []
    start_index = 0
    end_index = 0
    
    for i, entity in enumerate(entities):            
        # only start searching from the last found entities inside the text
        #print(entity)
        found_index = text[end_index:].find(entity)
        if found_index == -1:  # if entity not found in text
            del entity_types[i]
        else:
            start_index = end_index + found_index
            end_index = start_index + len(entity)
            entity_positions.append([start_index, end_index])
        
    
    return entity_positions, entity_types

In [4]:
def find_entity_positions_UniNER(text, entities, entity_types):
    
    # Initializing a dictionary to store the first occurrence of each entity type for each entity
    entity_dict = {}
    for entity, entity_type in zip(entities, entity_types):
        if entity not in entity_dict:
            entity_dict[entity] = entity_type

    # List to store the entity indices along with their types
    entity_indices = []
    new_entity_types = []

    for entity, entity_type in entity_dict.items():
        start_index = 0
        while True:
            start_index = text.find(entity, start_index)
            if start_index == -1:
                break
            end_index = start_index + len(entity)
            entity_indices.append([start_index, end_index])
            new_entity_types.append(entity_type)
            start_index = end_index  # Move to the next position for the next search

    # Displaying the result
    # print(len(entity_indices))
    # print(entity_indices)
    return entity_indices, new_entity_types

In [5]:
def construct_json(entity_types, entity_positions):
    
    ENTITY_CLASS = ['Disease_E', 'Anatomy_E', 'Cause_E', 'Diagnosis_E', 'Precaution_E', 'Riskfactor_E', 'Symptom_E', 'Medicine_E', 'Composition_E', 'Complication_E', 'Surgery_E']

    # Mapping entities to their types
    entities_with_types = [[*entity_positions[i], entity_types[i]] for i in range(len(entity_positions))]
    
    for start, end, entity_type in entities_with_types:
        # checking for the halucination problem of GPT [-_-]
        if entity_type not in ENTITY_CLASS:
            print(f'Fault in Predicted Entity Types = {entity_type}')
            entities_with_types = []
            break
        
    # Constructing the JSON structure
    output = {
        "text": text,
        "entities": entities_with_types
    }

    json_output = json.dumps(output, ensure_ascii=False)
    print(json_output)
    
    return output

In [6]:
def visualize_entities(data):
    # Create a SpaCy Doc object from the text and entities
    nlp = spacy.blank('en')
    doc = nlp(data['text'])
    entities = [(start, end, label) for start, end, label in data['entities']]
    
    # Create spans and check for None
    spans = []
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is None:
            print(f"None span: start={start}, end={end}, label={label}, text={doc.text[start:end]}")
        else:
            spans.append(span)
    doc.set_ents(spans)

    # Define a color scheme for the entities
    colors = {"Disease_E": "aqua", "Anatomy_E": "coral", "Cause_E": "lightgreen", "Diagnosis_E": "yellow",
              "Precaution_E": "pink", "Riskfactor_E": "orange", "Symptom_E": "lightblue", "Medicine_E": "red",
              "Composition_E": "green", "Complication_E": "blue", "Surgery_E": "purple"}

    options = {"ents": ["Disease_E", "Anatomy_E", "Cause_E", "Diagnosis_E", "Precaution_E", "Riskfactor_E",
                        "Symptom_E", "Medicine_E", "Composition_E", "Complication_E", "Surgery_E"], "colors": colors}

    # Visualize the entities
    displacy.render(doc, style='ent', options=options, jupyter=True)

In [7]:
text = "??????? - Abdominal aortic aneurysm - BD50.4Z\nAneurysms can develop anywhere along the aorta, but most aortic aneurysms occur in the part of the aorta that's in the belly area (abdomen). Several things can play a role in the development of an abdominal aortic aneurysm, including: Hardening of the arteries (atherosclerosis). Atherosclerosis occurs when fat and other substances build up on the lining of a blood vessel. High blood pressure. High blood pressure can damage and weaken the aorta's walls. Blood vessel diseases. These are diseases that cause blood vessels to become inflamed. Infection in the aorta. Rarely, a bacterial or fungal infection might cause an abdominal aortic aneurysms. Trauma. For example, being injured in a car accident can cause an abdominal aortic aneurysms. "

entities = ['atherosclerosis', 'fat and other substances build up on the lining of a blood vessel', 'High blood pressure', 'bacterial or fungal infection', 'Trauma', 'aorta', 'belly area', 'blood vessel', 'Abdominal aortic aneurysm', 'Abdominal aortic aneurysm', 'Aneurysms', 'atherosclerosis', 'High blood pressure', 'Blood vessel diseases', 'Infection in the aorta', 'Trauma']
entity_types = ['Cause_E', 'Cause_E', 'Cause_E', 'Cause_E', 'Cause_E', 'Anatomy_E', 'Anatomy_E', 'Anatomy_E', 'Diagnosis_E', 'Disease_E', 'Disease_E', 'Disease_E', 'Disease_E', 'Disease_E', 'Disease_E', 'Disease_E']

print(len(entities))
print(len(entity_types))
print()

# entity_positions, entity_types = find_entity_positionsGPT(text, entities, entity_types)
entity_positions, entity_types = find_entity_positions_UniNER(text, entities, entity_types)

print(entity_positions)
print()

# Constructing the JSON - this will be saved and used to compare the predictions later
data = construct_json(entity_types, entity_positions)
print()

# Visualizng entities in order to check if the output is correct of not.
visualize_entities(data)

16
16

[[308, 323], [354, 419], [421, 440], [442, 461], [624, 653], [697, 703], [87, 92], [145, 150], [488, 493], [607, 612], [165, 175], [407, 419], [556, 568], [10, 35], [46, 55], [503, 524], [590, 612]]

{"text": "??????? - Abdominal aortic aneurysm - BD50.4Z\nAneurysms can develop anywhere along the aorta, but most aortic aneurysms occur in the part of the aorta that's in the belly area (abdomen). Several things can play a role in the development of an abdominal aortic aneurysm, including: Hardening of the arteries (atherosclerosis). Atherosclerosis occurs when fat and other substances build up on the lining of a blood vessel. High blood pressure. High blood pressure can damage and weaken the aorta's walls. Blood vessel diseases. These are diseases that cause blood vessels to become inflamed. Infection in the aorta. Rarely, a bacterial or fungal infection might cause an abdominal aortic aneurysms. Trauma. For example, being injured in a car accident can cause an abdominal aortic an

ValueError: [E1010] Unable to set entity information for token 82 which is included in more than one span in entities, blocked, missing or outside.