In [1]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
import json
import numpy as np
from drug_utils import DataHandler
from tqdm import tqdm
import re

In [8]:
nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

# Obtain the linker from the pipeline
linker = nlp.get_pipe("scispacy_linker")

In [3]:
with open('./jsons/processed_data8.json', 'r', encoding='utf-8') as f:
    drugs_data = json.load(f)

In [4]:
serious_lst = []
for drug in drugs_data:
    if "serious_effects_list" in drug.keys():
        #print(drug['serious_effects_list'])
        serious_lst.extend(drug['serious_effects_list'])
    if "other_effects" in drug.keys():
        #print(drug['serious_effects_list'])
        serious_lst.extend(drug['serious_effects_list'])

serious_lst = list(set(serious_lst))
serious_lst.sort()
serious_lst[10:]

['Abnormally hard consistency',
 'Absent pulse',
 'Accumulation',
 'Ache',
 'Acholic stool',
 'Acidosis',
 'Acidosis, Lactic',
 'Acne Vulgaris',
 'Actual Negative Coping',
 'Agitation',
 'Alcoholic Intoxication',
 'Alopecia',
 'Amnesia',
 'Anorexia',
 'Anxiety',
 'Apathy',
 'Aphasia',
 'Aphthous Stomatitis',
 'Arthralgia',
 'As If Personality',
 'Asthenia',
 'BODY ACHE',
 'Back Pain',
 'Behavior',
 'Black Discoloration',
 'Bladder dysfunction',
 'Bladder pain',
 'Blindness',
 'Blister of skin',
 'Blistering eruption',
 'Blood in stool',
 'Bloodshot eye',
 'Bloody discharge finding',
 'Blue lips',
 'Blurred vision',
 'Body Weight Changes',
 'Bone pain',
 'Bowel problem',
 'Bradycardia',
 'Bradykinesia',
 'Breast size',
 'Breast tenderness',
 'Breath Holding',
 'Breathing abnormally deep',
 'Broad fingertip',
 'Bulla',
 'Burn injury',
 'Burning sensation',
 'Burning sensation of skin',
 'Burns and Wounds',
 'Cachexia',
 'Cardiac Arrhythmia',
 'Cardiomegaly',
 'Cellulitis',
 'Chapping of 

In [5]:
import pandas as pd

file_path = 'CTCAE_v5.0.xlsx'

# Load the Excel file
df = pd.read_excel(file_path)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,MedDRA Code,MedDRA SOC,CTCAE Term,Grade 1,Grade 2,Grade 3,Grade 4,Grade 5,Definition,Navigational Note,CTCAE v5.0 Change
0,10002272,Blood and lymphatic system disorders,Anemia,Hemoglobin (Hgb) <LLN - 10.0 g/dL; <LLN - 6.2 ...,Hgb <10.0 - 8.0 g/dL; <6.2 - 4.9 mmol/L; <100 ...,Hgb <8.0 g/dL; <4.9 mmol/L; <80 g/L; transfusi...,Life-threatening consequences; urgent interven...,Death,A disorder characterized by a reduction in the...,,Clarification: Definition
1,10005329,Blood and lymphatic system disorders,"Blood and lymphatic system disorders - Other, ...",Asymptomatic or mild symptoms; clinical or dia...,"Moderate; minimal, local or noninvasive interv...",Severe or medically significant but not immedi...,Life-threatening consequences; urgent interven...,Death,-,,Clarification: Grade 3
2,10048580,Blood and lymphatic system disorders,Bone marrow hypocellular,Mildly hypocellular or <=25% reduction from no...,Moderately hypocellular or >25 - <50% reductio...,Severely hypocellular or >50 - <=75% reduction...,Aplastic persistent for longer than 2 weeks,Death,A disorder characterized by the inability of t...,,
3,10013442,Blood and lymphatic system disorders,Disseminated intravascular coagulation,-,Laboratory findings with no bleeding,Laboratory findings and bleeding,Life-threatening consequences; urgent interven...,Death,A disorder characterized by systemic pathologi...,,
4,10014950,Blood and lymphatic system disorders,Eosinophilia,>ULN and >Baseline,-,Steroids initiated,-,-,A disorder characterized by laboratory test re...,,Addition: Term


In [6]:
unique = pd.unique(df['CTCAE Term'])
unique = list(unique)

In [33]:
import string
def preprocess(text_list):
    """
    Preprocess a list of text items: lowercase, remove punctuation, and lemmatize.
    """
    preprocessed_list = []
    for text in text_list:
        # Lowercase
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        preprocessed_list.append(text)
    return preprocessed_list


In [None]:
def get_umls_terms_with_mappings(terms, nlp):
    umls_to_original = {}
    umls_terms_list = []
    for term in terms:
        doc = nlp(term)
        for entity in doc.ents:
            for cui, _ in entity._.umls_ents:
                # Append to the list as tuples
                umls_terms_list.append((cui, term))
                # Add to the dictionary
                umls_to_original.setdefault(cui, []).append(term)

    return umls_terms_list, umls_to_original

# Get UMLS terms and their mappings
umls_terms_list, umls_mappings = get_umls_terms_with_mappings(unique, nlp)

In [None]:
import spacy
import scispacy
from sklearn.metrics.pairwise import cosine_similarity
# ... other necessary imports ...

umls_terms, umls_mappings = get_umls_terms_with_mappings(unique, nlp)
side_effects = serious_lst

# Convert terms to vectors
ctcae_vectors = [nlp(term).vector for term in tqdm(umls_terms)]
side_effect_vectors = [nlp(term).vector for term in tqdm(side_effects)]

# Initialize a dictionary to store mappings
# Initialize a dictionary to store the reverse mappings
side_effect_to_ctcae_mappings = {}

# Calculate similarities and map terms
for i, side_effect_vec in tqdm(enumerate(side_effect_vectors), desc="Processing Side Effects"):
    similarities = cosine_similarity([side_effect_vec], ctcae_vectors)
    
    # Find the index of the highest similarity score
    best_match_index = similarities.argmax()
    best_match_score = similarities[0][best_match_index]

    # Map the side effect to the best matching CTCAE term
    side_effect = side_effects[i]
    best_matching_ctcae_term = umls_terms[best_match_index]
    
    side_effect_to_ctcae_mappings[side_effect] = {
        'ctcae_term': best_matching_ctcae_term,
        'similarity_score': best_match_score
    }

# Evaluate and refine mappings
# Here you can print the mappings or perform further analysis
for side_effect, mapping_info in side_effect_to_ctcae_mappings.items():
    print(f"{side_effect} -> {mapping_info['ctcae_term']} (Score: {mapping_info['similarity_score']})")


100%|██████████| 837/837 [01:36<00:00,  8.64it/s]
100%|██████████| 485/485 [00:52<00:00,  9.24it/s]
Processing Side Effects: 485it [00:01, 394.29it/s]

1/3 meter (distance vision finding) -> Blurred vision (Score: 0.4482669234275818)
Abdomen distended -> Abdominal distension (Score: 0.5908830165863037)
Abdominal Pain -> Abdominal pain (Score: 0.8832767009735107)
Abdominal bloating -> Abdominal pain (Score: 0.8805053234100342)
Abnormal -> Hair texture abnormal (Score: 0.5068145394325256)
Abnormal behavior -> Hair texture abnormal (Score: 0.48100802302360535)
Abnormal breathing -> Movements involuntary (Score: 0.5380785465240479)
Abnormal color -> Hair color changes (Score: 0.7408782839775085)
Abnormal coordination -> Movements involuntary (Score: 0.41915708780288696)
Abnormal sternal ossification -> Nipple deformity (Score: 0.5942028760910034)
Abnormally hard consistency -> Hair texture abnormal (Score: 0.367364764213562)
Absent pulse -> Electrocardiogram T wave abnormal (Score: 0.5007792711257935)
Accumulation -> Concentration impairment (Score: 0.5282284617424011)
Ache -> Thrush (Score: 0.2810570299625397)
Acholic stool -> Fecal inco




In [27]:
with open('./jsons/processed_data8.json', 'r', encoding='utf-8') as f:
    drugs_data = json.load(f)

In [35]:
# Preprocess 'serious_effects_list' and 'other_effects' in drugs_data
for drug in tqdm(drugs_data):
    drug['serious_effects_list'] = [preprocess(effect) for effect in drug['serious_effects_list']]
    drug['other_effects'] = [preprocess(effect) for effect in drug['other_effects']]

# Use the mapping to find corresponding CTCAE terms
for drug in tqdm(drugs_data):
    drug['serious_effects_list'] = [side_effect_to_ctcae_mappings.get(effect, {'ctcae_term': None})['ctcae_term'] 
                                      for effect in drug['serious_effects_list']]
    drug['other_effects'] = [side_effect_to_ctcae_mappings.get(effect, {'ctcae_term': None})['ctcae_term'] 
                                    for effect in drug['other_effects']]

100%|██████████| 7878/7878 [00:04<00:00, 1904.32it/s]
  0%|          | 0/7878 [00:00<?, ?it/s]


TypeError: unhashable type: 'list'