In [41]:
import json
with open('./jsons/sideeffects.json', 'r', encoding='utf-8') as f:
    side_effects_data = json.load(f)
with open('./jsons/processed_data4.json', 'r', encoding='utf-8') as f:
    drugs_data = json.load(f)


In [42]:
consolidated_data = {}

for item in side_effects_data:
    generic_name = item["generic_name"]
    side_effects = item["side_effect"]

    if generic_name not in consolidated_data:
        consolidated_data[generic_name] = set(side_effects)
    else:
        consolidated_data[generic_name].update(side_effects)

# Convert sets back to lists
for generic_name in consolidated_data:
    consolidated_data[generic_name] = list(consolidated_data[generic_name])
side_effects_data = consolidated_data

In [43]:

# Adding side effects to drugs data
for drug in drugs_data:
    generic_name = drug["generic_name"]
    if generic_name in side_effects_data:
        drug["side_effects"] = side_effects_data[generic_name]


# Initialize a counter
count_with_side_effects = 0

# Iterate through the drugs data
for drug in drugs_data:
    if "side_effects" in drug:
        count_with_side_effects += 1

print(f"Number of drugs with side effects: {count_with_side_effects}")

Number of drugs with side effects: 4775


In [44]:
with open('./jsons/processed_data5.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)

In [45]:
with open('./jsons/sideeffects2.json', 'r', encoding='utf-8') as f:
    side_effects2_data = json.load(f)

In [46]:
# Convert side_effects2_data to a dictionary for easier access
side_effects2_dict = {item['generic_name']: item for item in side_effects2_data}

# Updating drugs_data with side effects and serious effects
for drug in drugs_data:
    generic_name = drug["generic_name"]
    if generic_name in side_effects2_dict:
        side_effect_info = side_effects2_dict[generic_name]

        # Update general side effects
        if 'side_effects' in drug.keys():
            drug['side_effects'].extend(side_effect_info.get('side_effects', []))
        else:
            drug['side_effects'] = side_effect_info.get('side_effects', [])

        # Update serious side effects
        if 'serious_effects_list' in drug:
            drug['serious_effects_list'].extend(side_effect_info.get('serious_effects_list', []))
        else:
            drug['serious_effects_list'] = side_effect_info.get('serious_effects_list', [])


In [47]:
with open('./jsons/processed_data5.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)

In [48]:
count_with_side_effects = 0

# Iterate through the drugs data
for drug in drugs_data:
    if "side_effects" in drug:
        count_with_side_effects += 1

print(f"Number of drugs with side effects: {count_with_side_effects}/{len(drugs_data)}")

Number of drugs with side effects: 7524/7878


In [49]:
drugs_data

[{'name': 'Azulfidine',
  'alternative_names': ['azulfidine', 'sulfasalazine'],
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['Rheumatoid Arthritis', 'Arthritis', 'Ulcerative Colitis'],
  'status': 'Prescription only',
  'generic_name': 'sulfasalazine',
  'spl_product_data_elements': ['Azulfidine EN-tabs Sulfasalazine SULFASALAZINE SULFASALAZINE WHITE WAX CARNAUBA WAX CELLACEFATE MAGNESIUM STEARATE POLYETHYLENE GLYCOL 20000 POVIDONE, UNSPECIFIED PROPYLENE GLYCOL GLYCERYL STEARATE SE SILICON DIOXIDE STARCH, CORN TALC Gold elliptical convex 102;KPh'],
  'ingredients': {'Azulfidine': ['Organic Chemicals',
    'Pharmacologic Substances'],
   'sulfasalazine': ['Organic Chemicals', 'Pharmacologic Substances'],
   'Stearates': ['Organic Chemicals'],
   'povidone': ['Organic Chemicals', 'Pharmacologic Substances'],
   'starch': ['Organic Chemicals',
    'Pharmacologic Substances',
    'Biologically Active Substances'],
   'Zea mays': ['Plant'],
   'gold': ['Pharmacolog

In [50]:
with open('./jsons/processed_data5.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)

In [51]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
import json
import numpy as np
from drug_utils import DataHandler
from tqdm import tqdm
import re
nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

# Obtain the linker from the pipeline
linker = nlp.get_pipe("scispacy_linker")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [52]:
from spacy.tokens import Span

# Set the custom extension if not already set
if not Span.has_extension("umls_ents"):
    Span.set_extension("umls_ents", default=[])


In [53]:
def process_drug(drug, nlp, linker, key):
    conditions_and_diseases = []
    
    # Process each paragraph in the 'side_effects' list for this drug
    for uses_text in drug[key]:
        # Process the uses text with the NLP model
        doc = nlp(uses_text)

        # Iterate over the entities and link them to UMLS
        for ent in doc.ents:
            kb_ents = ent._.kb_ents
            if kb_ents:
                cui = kb_ents[0][0]  # Get the top CUI
                umls_entity = linker.kb.cui_to_entity[cui]

                # Check if the UMLS entity has a relevant semantic type
                relevant_types = {"T184", "T033", "T046", "T047", "T037", "T048", "T053"}
                if any(st in relevant_types for st in umls_entity.types):
                    conditions_and_diseases.append(umls_entity.canonical_name)

    # Remove duplicates
    return list(set(conditions_and_diseases))

In [54]:
def process_uses(drugs_data, nlp, linker, key):
    # Initialize dictionary for drug uses
    
    # Iterate over each drug in the data
    for drug in tqdm(drugs_data, desc="Processing Drugs"):
        if key in drug:
            conditions_and_diseases = process_drug(drug, nlp, linker, key)
            drug[key] = conditions_and_diseases

    return drugs_data

In [56]:
drugs_data = process_uses(drugs_data, nlp, linker, 'side_effects')


Processing Drugs: 100%|██████████| 7878/7878 [2:58:15<00:00,  1.36s/it]  


In [57]:
with open('./jsons/processed_data6.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)

In [58]:
drugs_data = process_uses(drugs_data, nlp, linker, 'serious_effects_list')

Processing Drugs:   0%|          | 0/7878 [00:00<?, ?it/s]

Processing Drugs: 100%|██████████| 7878/7878 [26:41<00:00,  4.92it/s]  


In [60]:
drugs_data

[{'name': 'Azulfidine',
  'alternative_names': ['azulfidine', 'sulfasalazine'],
  'drug_classes': ['5-aminosalicylates', 'Antirheumatics'],
  'uses': ['Rheumatoid Arthritis', 'Arthritis', 'Ulcerative Colitis'],
  'status': 'Prescription only',
  'generic_name': 'sulfasalazine',
  'spl_product_data_elements': ['Azulfidine EN-tabs Sulfasalazine SULFASALAZINE SULFASALAZINE WHITE WAX CARNAUBA WAX CELLACEFATE MAGNESIUM STEARATE POLYETHYLENE GLYCOL 20000 POVIDONE, UNSPECIFIED PROPYLENE GLYCOL GLYCERYL STEARATE SE SILICON DIOXIDE STARCH, CORN TALC Gold elliptical convex 102;KPh'],
  'ingredients': {'Azulfidine': ['Organic Chemicals',
    'Pharmacologic Substances'],
   'sulfasalazine': ['Organic Chemicals', 'Pharmacologic Substances'],
   'Stearates': ['Organic Chemicals'],
   'povidone': ['Organic Chemicals', 'Pharmacologic Substances'],
   'starch': ['Organic Chemicals',
    'Pharmacologic Substances',
    'Biologically Active Substances'],
   'Zea mays': ['Plant'],
   'gold': ['Pharmacolog

In [62]:
for drug in drugs_data:
    if "side_effects" in drug:
        drug['side_effects'].sort()
    if "serious_effects_list" in drug:
        drug['serious_effects_list'].sort()

In [63]:
with open('./jsons/processed_data6.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)

In [77]:
serious_lst = []
for drug in drugs_data:
    if "serious_effects_list" in drug.keys():
        #print(drug['serious_effects_list'])
        serious_lst.extend(drug['serious_effects_list'])
print(len(serious_lst))
serious_lst = list(set(serious_lst))
serious_lst.sort()
print(len(serious_lst))

100991
485


In [86]:
serious_set = set(serious_lst)
for drug in drugs_data:
    if "side_effects" in drug:
        for effect in drug['side_effects']:
            if "serious_effects_list" in drug.keys():
                if effect in serious_set and effect not in drug['serious_effects_list']:
                    if isinstance(drug['serious_effects_list'], str):
                        drug['serious_effects_list'] = [drug['serious_effects_list']]
                    drug['serious_effects_list'].append(effect)
            else:
                if effect in serious_set:
                    drug['serious_effects_list'] = effect

In [88]:
with open('./jsons/processed_data7.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)

In [97]:
prop = []
for drug in drugs_data:
    if "side_effects" in drug:
        for effect in drug['side_effects']:
            if "serious_effects_list" in drug.keys():
                if isinstance(drug['serious_effects_list'], str):
                    drug['serious_effects_list'] = [drug['serious_effects_list']]
                proportion = len(drug['serious_effects_list']) / len(drug['side_effects'])
                
                prop.append((proportion, drug['name']))
prop.sort()

In [98]:
prop.reverse()
prop

[(1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.2142857142857142, 'Aquatab D'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.1764705882352942, 'Aquatab DM'),
 (1.17647058823

In [100]:

for drug in drugs_data:
    if "side_effects" in drug and "serious_effects_list" in drug:
        drug['side_effects'] = [item for item in drug['side_effects'] if item not in drug['serious_effects_list']]

In [102]:
prop = []
for drug in drugs_data:
    if "side_effects" in drug:
        for effect in drug['side_effects']:
            if "serious_effects_list" in drug.keys():
                if isinstance(drug['serious_effects_list'], str):
                    drug['serious_effects_list'] = [drug['serious_effects_list']]
                proportion = len(drug['serious_effects_list']) / len(drug['side_effects'])
                
                prop.append((proportion, drug['name']))
prop.sort()
prop.reverse()
prop

[(105.0, 'Sodium ferric gluconate complex'),
 (105.0, 'Ferrlecit'),
 (104.0, 'Elavil'),
 (104.0, 'Amitriptyline'),
 (103.0, 'Uloric'),
 (103.0, 'Febuxostat'),
 (101.0, 'Sirolimus'),
 (101.0, 'Rapamune'),
 (95.0, 'Trimipramine'),
 (88.0, 'Zorbtive'),
 (88.0, 'Zomacton'),
 (88.0, 'Somatropin'),
 (88.0, 'Serostim'),
 (88.0, 'Saizen'),
 (88.0, 'Omnitrope'),
 (88.0, 'Nutropin AQ NuSpin 10'),
 (88.0, 'Norditropin FlexPro Pen'),
 (88.0, 'Humatrope'),
 (88.0, 'Genotropin'),
 (88.0, 'Accretropin'),
 (73.0, 'Signifor LAR'),
 (73.0, 'Signifor'),
 (73.0, 'Pasireotide'),
 (71.0, 'Olaparib'),
 (71.0, 'Lynparza'),
 (70.0, 'Veletri'),
 (70.0, 'Sprycel'),
 (70.0, 'Flolan'),
 (70.0, 'Epoprostenol'),
 (70.0, 'Dasatinib'),
 (68.0, 'Prohance'),
 (68.0, 'Pamidronate'),
 (68.0, 'Gadoteridol'),
 (68.0, 'Aredia'),
 (67.0, 'Infugem'),
 (67.0, 'Gemzar'),
 (67.0, 'Gemcitabine'),
 (67.0, 'Diamox Sequels'),
 (67.0, 'Diamox'),
 (67.0, 'Acetazolamide'),
 (63.0, 'Tarceva'),
 (63.0, 'Ropeginterferon alfa-2b'),
 (63.0, 

In [103]:
with open('./jsons/processed_data7.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)

In [108]:
for drug in drugs_data:
    if "side_effects" in drug:
        drug['other_effects'] = drug.pop('side_effects')
    

In [109]:
for drug in drugs_data:
    if "other_effects" not in drug:
        drug['other_effects'] = []
    if "serious_effects_list" not in drug:
        drug['serious_effects_list'] = []
    
    

In [110]:
with open('./jsons/processed_data8.json', 'w', encoding='utf-8') as f:
    json.dump(drugs_data, f, indent=4)