In [1]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
import json
import numpy as np
from drug_utils import DataHandler
from tqdm import tqdm
import re

In [2]:
# Load the small scispaCy biomedical model as the base pipeline.
nlp = spacy.load("en_core_sci_sm")

# Append the UMLS entity linker. add_pipe returns the component it just
# created, so the handle can be kept directly instead of looking it up again.
linker = nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "umls"},
)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [4]:
# Read the drug records from processed_data3.json into drugs_data.
with open("./jsons/processed_data3.json", mode="r", encoding="utf-8") as src:
    drugs_data = json.loads(src.read())

In [None]:
def process_drug(drug, nlp, linker, semantic_type_to_category):
    """Group the linked entities found in a drug's product data by category.

    NOTE: a later cell in this notebook defines another ``process_drug`` with
    the same signature that returns the inverse mapping (ingredient ->
    categories); whichever definition is executed last is the one in effect.

    Args:
        drug: Mapping describing one drug. Only its
            ``"spl_product_data_elements"`` entry (an iterable of text
            strings) is read.
        nlp: Callable that turns a text into a document exposing ``.ents``;
            every entity must expose its linker candidates as
            ``ent._.kb_ents``.
        linker: Object whose ``kb.cui_to_entity`` maps a CUI to an entity with
            ``types`` and ``canonical_name`` attributes.
        semantic_type_to_category: Mapping from semantic type id to the
            category label to file the entity under.

    Returns:
        dict mapping each category label to the list of distinct canonical
        names filed under it, in first-seen order. An empty dict is returned
        when the drug has no ``"spl_product_data_elements"`` entry.
    """
    if "spl_product_data_elements" not in drug:
        # Same (empty) type as the normal return value, so callers never have
        # to special-case a list.
        return {}

    categorized_ingredients = {}
    for uses_text in drug["spl_product_data_elements"]:
        doc = nlp(uses_text)
        for ent in doc.ents:
            kb_ents = ent._.kb_ents
            if not kb_ents:
                # Entity could not be linked to the knowledge base.
                continue

            # Only the first candidate is considered; its first element is the CUI.
            cui = kb_ents[0][0]
            umls_entity = linker.kb.cui_to_entity[cui]

            for st in umls_entity.types:
                if st in semantic_type_to_category:
                    category = semantic_type_to_category[st]
                    categorized_ingredients.setdefault(category, []).append(
                        umls_entity.canonical_name
                    )

    # Deduplicate while keeping first-seen order so the result is reproducible
    # from run to run (a plain set() would shuffle the names).
    return {
        category: list(dict.fromkeys(names))
        for category, names in categorized_ingredients.items()
    }

In [30]:
def process_drug(drug, nlp, linker, semantic_type_to_category):
    """Map every linked ingredient of a drug to the categories it belongs to.

    Args:
        drug: Mapping describing one drug. Only its
            ``"spl_product_data_elements"`` entry (an iterable of text
            strings) is read.
        nlp: Callable that turns a text into a document exposing ``.ents``;
            every entity must expose its linker candidates as
            ``ent._.kb_ents``.
        linker: Object whose ``kb.cui_to_entity`` maps a CUI to an entity with
            ``types`` and ``canonical_name`` attributes.
        semantic_type_to_category: Mapping from semantic type id to a
            category label.

    Returns:
        dict mapping a canonical entity name to the list of distinct category
        labels matched by its semantic types, in first-seen order. An empty
        dict is returned when the drug has no ``"spl_product_data_elements"``
        entry.
    """
    if "spl_product_data_elements" not in drug:
        return {}

    categories_by_name = {}
    for text in drug["spl_product_data_elements"]:
        for entity in nlp(text).ents:
            candidates = entity._.kb_ents
            if not candidates:
                # Nothing in the knowledge base matched this entity.
                continue

            # Only the first candidate is used; its first element is the CUI.
            top_cui = candidates[0][0]
            concept = linker.kb.cui_to_entity[top_cui]

            for type_id in concept.types:
                if type_id not in semantic_type_to_category:
                    continue
                label = semantic_type_to_category[type_id]
                # The name is registered only once a mapped type is found,
                # and each label is recorded at most once per name.
                labels = categories_by_name.setdefault(concept.canonical_name, [])
                if label not in labels:
                    labels.append(label)

    return categories_by_name


In [28]:
def process_ingredients(drugs_data, nlp, linker, semantic_type_to_category):
    """Attach an ``"ingredients"`` entry to every drug record.

    Each record is handed to ``process_drug`` together with ``nlp``,
    ``linker`` and ``semantic_type_to_category``; the result is stored on the
    record under the ``"ingredients"`` key. Records are modified in place and
    the same ``drugs_data`` object is returned. Progress is reported through a
    tqdm bar labelled "Processing Drugs".
    """
    progress = tqdm(drugs_data, desc="Processing Drugs")
    for record in progress:
        record["ingredients"] = process_drug(record, nlp, linker, semantic_type_to_category)
    return drugs_data

In [29]:
# UMLS semantic type identifiers (TUIs) that count as an ingredient, mapped to
# the category label stored on each drug record.
#
# The labels for T197 and T110 follow the UMLS Semantic Network names for
# those identifiers (Inorganic Chemical, Steroid); they were previously
# labelled "Immunologic Factors" and "Eicosanoids".
# NOTE(review): if immunologic factors or eicosanoids were meant to be
# captured as well, their TUIs are T129 and T111 - confirm intent before
# adding them, since that widens which entities are selected.
semantic_type_to_category = {
    "T197": "Inorganic Chemicals",
    "T121": "Pharmacologic Substances",
    "T109": "Organic Chemicals",
    "T123": "Biologically Active Substances",
    "T195": "Antibiotics",
    "T168": "Food",
    "T125": "Hormone",
    "T110": "Steroids",
    "T127": "Vitamins",
    "T114": "Nucleic Acids",
    "T103": "Chemical",
    "T115": "Organophosphorus Compounds",
    "T116": "Amino Acid, Peptide, or Protein",
    "T088": "Carbohydrate Sequence",
    "T002": "Plant",
}

In [31]:
# Annotate every record with its linked ingredients. process_ingredients
# writes the "ingredients" key onto each record and returns the same list.
drugs_data = process_ingredients(drugs_data, nlp, linker, semantic_type_to_category)

# Keep only the records whose status is something other than 'Unknown'.
# NOTE(review): the save cell visible later in this notebook dumps drugs_data,
# not filtered_drugs_data - confirm which of the two is meant to be persisted.
filtered_drugs_data = list(
    filter(lambda record: record['status'] != 'Unknown', drugs_data)
)

Processing Drugs:   0%|          | 0/7878 [00:00<?, ?it/s]

Processing Drugs: 100%|██████████| 7878/7878 [12:54<00:00, 10.17it/s]  


In [12]:
# Preview only the first two records; echoing the whole list (7878 records in
# the run above) floods the notebook output.
drugs_data[:2]

[{'name': 'Azulfidine',
  'alternative_names': ['azulfidine', 'sulfasalazine'],
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['Rheumatoid Arthritis', 'Arthritis', 'Ulcerative Colitis'],
  'status': 'Prescription only',
  'generic_name': 'sulfasalazine',
  'spl_product_data_elements': ['Azulfidine EN-tabs Sulfasalazine SULFASALAZINE SULFASALAZINE WHITE WAX CARNAUBA WAX CELLACEFATE MAGNESIUM STEARATE POLYETHYLENE GLYCOL 20000 POVIDONE, UNSPECIFIED PROPYLENE GLYCOL GLYCERYL STEARATE SE SILICON DIOXIDE STARCH, CORN TALC Gold elliptical convex 102;KPh'],
  'ingredients': ['Azulfidine',
   'starch',
   'gold',
   'Stearates',
   'povidone',
   'sulfasalazine']},
 {'name': 'Abstral',
  'alternative_names': ['fentanyl', 'abstral'],
  'drug_classes': ['Opioids (narcotic analgesics)'],
  'uses': ['Cancer Pain'],
  'status': 'Discontinued',
  'generic_name': 'fentanyl',
  'spl_product_data_elements': ['Fentanyl Citrate Fentanyl Citrate Fentanyl Citrate Fentanyl Hydrochlor

In [32]:
# Persist the annotated records as JSON indented by four spaces.
# NOTE(review): this writes the full drugs_data list; the filtered_drugs_data
# list built earlier is not used here - confirm that is intended.
with open('./jsons/processed_data4.json', mode='w') as out_file:
    json.dump(drugs_data, fp=out_file, indent=4)