In [1]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
import json
import numpy as np
from drug_utils import DataHandler
from tqdm import tqdm
import re

In [2]:
# Load the `en_core_sci_sm` pipeline that entity extraction runs on.
nlp = spacy.load("en_core_sci_sm")

# Register the scispaCy entity linker against the UMLS knowledge base.
# `add_pipe` hands back the component it just created, so the linker is
# captured directly from the call.
linker = nlp.add_pipe(
    "scispacy_linker",
    config={"linker_name": "umls", "resolve_abbreviations": True},
)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [3]:
def process_drug(drug, nlp, linker):
    """Collect the linked concept names mentioned in a drug's 'uses' text.

    Every paragraph in ``drug['uses']`` is passed through ``nlp``. For each
    entity that has at least one knowledge-base candidate, the CUI of the
    first candidate is looked up in ``linker.kb.cui_to_entity`` and the
    entity's canonical name is kept when at least one of its semantic types
    is in the relevant set.

    Args:
        drug: Mapping with a 'uses' key holding an iterable of text paragraphs.
        nlp: Callable that turns a paragraph into a document exposing ``.ents``;
            each entity must expose ``._.kb_ents``, a sequence whose items
            carry the CUI at index 0.
        linker: Object exposing ``kb.cui_to_entity``, a mapping from CUI to an
            entity with ``types`` and ``canonical_name`` attributes.

    Returns:
        A list of unique canonical names in order of first appearance, so the
        result is identical from one run to the next.

    Raises:
        KeyError: If ``drug`` has no 'uses' key.
    """
    # UMLS semantic type IDs accepted as a "use". Per the UMLS Semantic Network
    # these appear to be: T047 Disease or Syndrome, T048 Mental or Behavioral
    # Dysfunction, T184 Sign or Symptom, T037 Injury or Poisoning, T046
    # Pathologic Function, T061 Therapeutic or Preventive Procedure, T060
    # Diagnostic Procedure -- TODO confirm against the UMLS reference.
    # Built once here instead of once per entity.
    relevant_types = frozenset(
        {"T047", "T048", "T184", "T037", "T046", "T061", "T060"}
    )

    # A dict keeps insertion order, which de-duplicates while preserving the
    # order in which names were first seen (a set would scramble the order
    # differently on every kernel start).
    seen_names = {}

    # Process each paragraph in the 'uses' list for this drug
    for uses_text in drug['uses']:
        doc = nlp(uses_text)

        for ent in doc.ents:
            kb_ents = ent._.kb_ents
            if not kb_ents:
                # The linker found no candidate for this entity.
                continue

            cui = kb_ents[0][0]  # CUI of the first (top) candidate
            umls_entity = linker.kb.cui_to_entity[cui]

            if not relevant_types.isdisjoint(umls_entity.types):
                seen_names.setdefault(umls_entity.canonical_name, None)

    return list(seen_names)

In [4]:
def process_uses(drugs_data, nlp, linker):
    """Replace every drug's 'uses' entry with the names found by process_drug.

    The records are modified in place: each drug's original 'uses' value is
    overwritten with the list returned by ``process_drug``. Progress is shown
    with a tqdm bar labelled "Processing Drugs".

    Args:
        drugs_data: Iterable of drug records (mutable mappings).
        nlp: Passed through to ``process_drug``.
        linker: Passed through to ``process_drug``.

    Returns:
        The same ``drugs_data`` object that was passed in.
    """
    progress = tqdm(drugs_data, desc="Processing Drugs")
    for record in progress:
        # Overwrites the original 'uses' value for this record.
        record["uses"] = process_drug(record, nlp, linker)

    return drugs_data

In [5]:
# Read the drug records from the JSON file one directory up.
with open("../testing.json", mode="r", encoding="utf-8") as file:
    drugs_data = json.load(file)

In [6]:
# Presumably UMLS semantic type IDs for drug concepts (going by the name);
# not referenced by any of the cells shown here.
drug_semantic_types = {"T109", "T195", "T121"}

# Rewrite each record's 'uses' with linked concept names. This mutates the
# records in place, so re-running the cell processes already-processed data.
drugs_data = process_uses(drugs_data, nlp, linker)

# Keep only the records whose status is something other than 'Unknown'.
filtered_drugs_data = list(
    filter(lambda drug: drug['status'] != 'Unknown', drugs_data)
)

# Persist the filtered records as JSON.
with open('data.json', mode='w') as json_file:
    json.dump(filtered_drugs_data, json_file)

Processing Drugs: 100%|██████████| 7878/7878 [27:02<00:00,  4.86it/s]  


In [7]:
# Hand the filtered records to the project's DataHandler, constructed with
# "processed_data2.json" (presumably the output path -- confirm in drug_utils).
dh = DataHandler("processed_data2.json")
dh.save_data(filtered_drugs_data)

In [8]:
def remove_text_in_parentheses(drug_data):
    """Strip parenthesised segments from the 'name' of every entry.

    Each ``( ... )`` segment is removed together with any whitespace directly
    before it, and the remaining name is trimmed of surrounding whitespace.
    A segment ends at the first closing parenthesis, so nested parentheses are
    only partly removed. Entries are updated in place.

    Args:
        drug_data: Iterable of mutable mappings with a string 'name' entry.

    Returns:
        The same ``drug_data`` object that was passed in.
    """
    # Optional leading whitespace, "(", anything up to the first ")", ")".
    parenthesised = re.compile(r'\s*\([^)]*\)')

    for record in drug_data:
        without_parens = parenthesised.sub('', record['name'])
        record['name'] = without_parens.strip()

    return drug_data

In [9]:
# Strip parenthesised text from the drug names, then save again through `dh`.
# The filtered list is cleaned here rather than `drugs_data`: `dh` was last
# used to save `filtered_drugs_data`, and saving the unfiltered list through
# the same handler would put the 'Unknown'-status records back into that
# output (assuming save_data rewrites the handler's file -- confirm in
# drug_utils).
cleaned_data = remove_text_in_parentheses(filtered_drugs_data)
dh.save_data(cleaned_data)