## Data Exploration

In [11]:
import os
import re

In [12]:
if "pyproject.toml" not in os.listdir():
    os.chdir("..")

In [13]:
PATH_TO_HTLM = "data/NYPHKnowldegeBase.html"

In [14]:
with open(PATH_TO_HTLM, "r") as f:
    html_content = f.read()

In [15]:
from bs4 import BeautifulSoup
import pandas as pd

soup = BeautifulSoup(html_content, "html.parser")
table = soup.find("table", {"class": "MsoTableWeb3"})
rows = table.find_all("tr")
data = []

for row in rows[1:]:
    cells = row.find_all("td")
    row_data = [cell.get_text(strip=True) for cell in cells]
    data.append(row_data)

df = pd.DataFrame(data)

In [16]:
df.columns = ("Disease", "Disease Occurrence", "Symptom")

In [17]:
def parse_umls_string(umls_string):
    pattern = r"(UMLS):(C\d{7})_([\w\s]+)"
    matches = re.findall(pattern, umls_string)
    return matches

In [18]:
def get_disease_representation(diseases):
    code = diseases[0][1]
    print(diseases)
    name = " or ".join([" ".join(d[2].replace("\n", " ").split()) for d in diseases])
    return [("UMLS", code, name)]

In [19]:
transformed_rows = []
prev_diseases = None
prev_symptoms = None
for i, row in df.iterrows():
    diseases = parse_umls_string(row.Disease)
    if not diseases:
        diseases = prev_diseases
    prev_diseases = diseases
    symptoms = parse_umls_string(row.Symptom)
    if not symptoms:
        symptoms = prev_symptoms
    prev_symptoms = symptoms
    print(get_disease_representation(diseases))
    for _, d_code, d_name in get_disease_representation(diseases):
        for _, s_code, s_name in symptoms:
            transformed_rows.append((d_code, " ".join(d_name.replace("\n", " ").split()), s_code, " ".join(s_name.replace("\n", " ").split())))

[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive\n  disease')]
[('UMLS', 'C0020538', 'hypertensive disease')]
[('UMLS', 'C0020538',

In [20]:
new_df = pd.DataFrame(transformed_rows, columns=["Disease Code", "Disease", "Symptom Code", "Symptom"])

In [21]:
new_df.head()

Unnamed: 0,Disease Code,Disease,Symptom Code,Symptom
0,C0020538,hypertensive disease,C0008031,pain chest
1,C0020538,hypertensive disease,C0392680,shortness of breath
2,C0020538,hypertensive disease,C0012833,dizziness
3,C0020538,hypertensive disease,C0004093,asthenia
4,C0020538,hypertensive disease,C0085639,fall


In [22]:
from dr_claude import datamodels

In [24]:
import collections

In [28]:
condition_to_symptoms = collections.defaultdict(list)
curr_disease = None
for _, row in new_df.iterrows():
    if row.Disease != curr_disease:
        curr_disease = row.Disease
    condition = datamodels.Condition(name=curr_disease, umls_code=row["Disease Code"])
    condition_to_symptoms[condition].append(datamodels.WeightedSymptom(name=row.Symptom, umls_code=row["Symptom Code"], weight=0.5))

In [None]:
datamodels.DiseaseSymptomKnowledgeBase(pairs=condition_to_symptoms)

In [30]:
datamodels.DiseaseSymptomKnowledgeBase(pairs=condition_to_symptoms)

DiseaseSymptomKnowledgeBase(pairs={Condition(name='hypertensive disease', umls_code='C0020538'): [WeightedSymptom(name='pain chest', umls_code='C0008031', weight=0.5), WeightedSymptom(name='shortness of breath', umls_code='C0392680', weight=0.5), WeightedSymptom(name='dizziness', umls_code='C0012833', weight=0.5), WeightedSymptom(name='asthenia', umls_code='C0004093', weight=0.5), WeightedSymptom(name='fall', umls_code='C0085639', weight=0.5), WeightedSymptom(name='syncope', umls_code='C0039070', weight=0.5), WeightedSymptom(name='vertigo', umls_code='C0042571', weight=0.5), WeightedSymptom(name='sweat', umls_code='C0038990', weight=0.5), WeightedSymptom(name='sweating increased', umls_code='C0700590', weight=0.5), WeightedSymptom(name='palpitation', umls_code='C0030252', weight=0.5), WeightedSymptom(name='nausea', umls_code='C0027497', weight=0.5), WeightedSymptom(name='angina pectoris', umls_code='C0002962', weight=0.5), WeightedSymptom(name='pressure chest', umls_code='C0438716', we