In [None]:
import utils
import pandas as pd


In [None]:
# Input file locations, relative to the notebook's working directory.
HPO_PATH = "data/hp.obo"                            # passed to utils.read_hpo_from_obo
LABEVENTS_HPO_PATH = "data/OUT_LABEVENTS_HPO.csv"   # read into labevents_df
DIAGNOSES_HPO_PATH = "data/DIAGNOSE_ICD_hpo.csv"    # read into diagnoses_df


In [None]:
# Load the ontology through the project helper (its return type is not
# visible from this notebook).
hpo = utils.read_hpo_from_obo(HPO_PATH)

# Read both annotation tables. Every missing value is replaced by '' so that
# later cells can detect "no annotation" by comparing against the empty string.
labevents_df, diagnoses_df = (
    pd.read_csv(csv_path).fillna('')
    for csv_path in (LABEVENTS_HPO_PATH, DIAGNOSES_HPO_PATH)
)


In [None]:
class Subject:
    """One subject and the feature strings attributed to it.

    Both feature sets start out empty and are meant to be filled in by the
    caller after construction.
    """

    def __init__(self, id):
        # NOTE: `id` shadows the builtin inside this method; the name is kept
        # because it is part of the constructor's signature.
        self.id = id
        # Features collected from lab events.
        self.labevents: set[str] = set()
        # Features collected from diagnoses.
        self.diagnoses: set[str] = set()

    def labevent_vector(self, feature_list: list[str]):
        """Return a 0/1 list aligned with ``feature_list``.

        Position ``k`` is 1 when ``feature_list[k]`` is in ``self.labevents``
        and 0 otherwise.
        """
        present = self.labevents
        return [1 if name in present else 0 for name in feature_list]

    def diagnoses_vector(self, feature_list: list[str]):
        """Return a 0/1 list aligned with ``feature_list``.

        Position ``k`` is 1 when ``feature_list[k]`` is in ``self.diagnoses``
        and 0 otherwise.
        """
        present = self.diagnoses
        return [1 if name in present else 0 for name in feature_list]

    def __repr__(self) -> str:
        """Short debugging representation that shows only the subject id."""
        return f'<Subject {self.id}>'


In [None]:
# Every subject that has at least one feature so far, keyed by subject_id.
subjects: dict[int, Subject] = dict()

# Union of all feature strings seen in any input row.
all_present_hpo_features: set[str] = set()


In [None]:
# Attach the features found in the lab-event table to each subject.
#
# Only the two columns that are actually used are walked; zip() over the
# columns avoids materialising a Series for every row the way
# DataFrame.iterrows() does.
for subject_id, raw_features in zip(
    labevents_df['subject_id'], labevents_df['selected_hpo_features']
):
    # Rows without annotations were turned into '' by fillna('') at load time.
    if raw_features == '':
        continue

    # The cell holds a ';'-separated list. Surrounding whitespace is trimmed
    # and empty tokens (e.g. from a trailing ';') are dropped so that '' is
    # never registered as a feature.
    hpo_features = [
        token.strip() for token in raw_features.split(';') if token.strip()
    ]
    if not hpo_features:
        continue

    all_present_hpo_features.update(hpo_features)

    # Look the subject up first: dict.setdefault(key, Subject(key)) would
    # construct a throwaway Subject on every row, even for known subjects.
    subject = subjects.get(subject_id)
    if subject is None:
        subject = subjects[subject_id] = Subject(subject_id)
    subject.labevents.update(hpo_features)


In [None]:
# Attach the features found in the diagnoses table to each subject.
#
# Only the two columns that are actually used are walked; zip() over the
# columns avoids materialising a Series for every row the way
# DataFrame.iterrows() does.
for subject_id, raw_features in zip(
    diagnoses_df['subject_id'], diagnoses_df['hpo_features']
):
    # Rows without annotations were turned into '' by fillna('') at load time.
    if raw_features == '':
        continue

    # The cell holds a ';'-separated list. Surrounding whitespace is trimmed
    # and empty tokens (e.g. from a trailing ';') are dropped so that '' is
    # never registered as a feature.
    hpo_features = [
        token.strip() for token in raw_features.split(';') if token.strip()
    ]
    if not hpo_features:
        continue

    all_present_hpo_features.update(hpo_features)

    # Look the subject up first: dict.setdefault(key, Subject(key)) would
    # construct a throwaway Subject on every row, even for known subjects.
    subject = subjects.get(subject_id)
    if subject is None:
        subject = subjects[subject_id] = Subject(subject_id)
    subject.diagnoses.update(hpo_features)


In [None]:
# Fix one canonical column order for the feature vectors. Iterating a set of
# strings directly yields an order that can differ between interpreter
# sessions (string hashing is randomised), which would make vector positions
# irreproducible across kernel restarts; sorting removes that dependence.
feature_list = sorted(all_present_hpo_features)

# Reverse lookup: feature string -> its position in feature_list.
hpo_to_id = {feature: i for i, feature in enumerate(feature_list)}


In [None]:
# Spot-check the vectors of one subject. A notebook cell only displays the
# value of its last expression, so the two vectors are returned together as a
# (labevents, diagnoses) tuple instead of as two separate statements, of which
# the first would be computed and silently discarded.
EXAMPLE_SUBJECT_ID = 10006  # raises KeyError if this subject has no features
example_subject = subjects[EXAMPLE_SUBJECT_ID]
(
    example_subject.labevent_vector(feature_list),
    example_subject.diagnoses_vector(feature_list),
)