In [2]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np


import torch
# from torch_geometric.data import HeteroData
import torch.nn as nn
import torch.nn.functional as F

from collections import defaultdict

import gc
import os
from tqdm import tqdm


from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel

dir = '/content/drive/MyDrive/Thesis/Codes/Data/Graph'

# Fuctions

In [4]:
def load_data(dir, file):
  return pd.read_parquet(f'{dir}/{file}')

In [5]:
def get_id(df, value, df_id = 'stay_id'):
  df_map_infunction = build_id_map(df[df_id].unique())
  for key in df_map_infunction:
    if df_map_infunction[key] == value:
      return key
  return None

In [6]:
def make_hours_since_icu(df, charttime = 'charttime', intime = 'intime'):
    df[charttime] = pd.to_datetime(df[charttime])
    df[intime] = pd.to_datetime(df[intime])

    df['hours_since_icu'] = (df[charttime] - df[intime]).dt.total_seconds() / 3600
    df = df[df['hours_since_icu'] >= 0]
    df = df[df['hours_since_icu'] <= 24]
    del df[charttime], df[intime]
    return df

In [7]:
def check_columns(cols_to_check, df, ind):
  node_candidates = []
  edge_candidates = []

  for col in cols_to_check:
      uniq_counts = df.groupby(ind)[col].nunique()

      if (uniq_counts <= 1).all():
          node_candidates.append(col)
      else:
          edge_candidates.append(col)

  print(f"Likely NODE features (constant for each {ind}):", node_candidates)
  print("Likely EDGE features (vary across stays):", edge_candidates)

  return node_candidates, edge_candidates

In [8]:
def build_id_map(unique_vals):
    return {val: i for i, val in enumerate(unique_vals)}

In [9]:
def bool_to_int(df):
  for col in df.columns:
      if df[col].dtype == bool:
        df[col] = df[col].astype(int)

  return df

In [10]:
def find_bool(df, id = None):
  bool_cols = []
  not_bool = []
  if id:
      cols = df.drop(columns = [id]).columns
  else:
    cols = df.columns
  for col in cols:
    count = df[col].nunique()
    if count == 2:
      bool_cols.append(col)
    else:
      print(col, count)
      not_bool.append(col)
  return bool_cols, not_bool

In [11]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel


class FeatureBuilder(nn.Module):
    def __init__(self,
                 cat_cols=None,
                 one_hot_cols=None,
                 bool_cols=None,
                 cont_cols=None,
                 raw_cont_cols=None,
                 emb_dims=None,
                 pretrained_cols=None,
                 pretrained_model="emilyalsentzer/Bio_ClinicalBERT",
                 ontology_embeddings=None):
        """
        Flexible feature builder for heterogeneous graphs.
        Supports numeric, categorical, boolean, textual (via LLM),
        and ontology-based features.
        """

        super().__init__()

        self.cat_cols = cat_cols or []
        self.one_hot_cols = one_hot_cols or []
        self.bool_cols = bool_cols or []
        self.cont_cols = cont_cols or []
        self.raw_cont_cols = raw_cont_cols or []
        self.emb_dims = emb_dims or {}
        self.pretrained_cols = pretrained_cols or []
        self.ontology_embeddings = ontology_embeddings or {}

        if self.pretrained_cols:
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
            self.bert = AutoModel.from_pretrained(pretrained_model)
            self.bert.eval()
        else:
            self.tokenizer, self.bert = None, None

        self.embeddings = nn.ModuleDict()
        self.cat_maps = {}
        self.scaler = StandardScaler() if self.cont_cols else None


    def _embed_categorical(self, df, col):
        """Embed categorical columns"""
        unique_vals = df[col].dropna().unique()
        self.cat_maps[col] = {val: i + 1 for i, val in enumerate(unique_vals)}
        n_unique = len(unique_vals)
        if col not in self.emb_dims:
            self.emb_dims[col] = min(50, int(6 * (n_unique ** 0.25)))
        num_embeddings = n_unique + 2
        self.embeddings[col] = nn.Embedding(num_embeddings, self.emb_dims[col])
        indices = df[col].map(self.cat_maps[col]).fillna(0).astype(int).values
        out = self.embeddings[col](torch.tensor(indices, dtype=torch.long))
        names = [f"{col}_emb{i}" for i in range(out.shape[1])]
        return out, names

    def _embed_pretrained(self, df, col):
        """Use pretrained LLM embeddings (like ClinicalBERT) for text columns"""
        texts = df[col].astype(str).fillna("[UNK]").tolist()
        with torch.no_grad():
            tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
            outputs = self.bert(**tokens)
            embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        names = [f"{col}_bert{i}" for i in range(embeddings.shape[1])]
        return embeddings, names

    def _embed_ontology(self, df, col):
        """Use predefined ontology embeddings for structured codes like ICD"""
        mapping = self.ontology_embeddings.get(col, {})
        dim = len(next(iter(mapping.values()))) if mapping else 50
        out = []
        for val in df[col]:
            vec = mapping.get(val, np.zeros(dim))
            out.append(vec)
        out = torch.tensor(out, dtype=torch.float32)
        names = [f"{col}_ont{i}" for i in range(dim)]
        return out, names


    def build(self, df, id_col=None, target_col=None, log_target=False):
        df = df.copy()
        ids, target = None, None
        feature_names = []

        if id_col:
            ids = df[id_col]
            df = df.drop(columns=[id_col])

        if target_col:
            target = df[target_col].astype(float)
            if log_target:
                target = torch.tensor(np.log1p(target.values), dtype=torch.float32)
            else:
                target = torch.tensor(target.values, dtype=torch.float32)
            df = df.drop(columns=[target_col])

        tensors = []

        for col in self.cat_cols:
            if col in df.columns:
                out, names = self._embed_categorical(df, col)
                # print(f"Embedding {col} with {len(names)} features and {out[0]} ")
                tensors.append(out)
                feature_names.extend(names)

        for col in self.pretrained_cols:
            if col in df.columns:
                out, names = self._embed_pretrained(df, col)
                tensors.append(out)
                feature_names.extend(names)
        for col in self.ontology_embeddings:
            if col in df.columns:
                out, names = self._embed_ontology(df, col)
                # print(f"Embedding {col} with {len(names)} features and {out[0]} ")
                tensors.append(out)
                feature_names.extend(names)

        for col in self.one_hot_cols:
            if col in df.columns:
                dummies = pd.get_dummies(df[col], dummy_na=True)
                tensors.append(torch.tensor(dummies.values, dtype=torch.float32))
                feature_names.extend(list(dummies.columns))

        if self.cont_cols:
            cont_cols_exist = [c for c in self.cont_cols if c in df.columns]
            if cont_cols_exist:
                cont_vals = self.scaler.fit_transform(df[cont_cols_exist].values)
                tensors.append(torch.tensor(cont_vals, dtype=torch.float32))
                feature_names.extend(cont_cols_exist)

        raw_exist = [c for c in self.raw_cont_cols if c in df.columns]
        if raw_exist:
            raw_vals = torch.tensor(df[raw_exist].values, dtype=torch.float32)
            tensors.append(raw_vals)
            feature_names.extend(raw_exist)


        bool_cols_exist = [c for c in self.bool_cols if c in df.columns]
        if bool_cols_exist:
            tensors.append(torch.tensor(df[bool_cols_exist].astype(float).values,
                                        dtype=torch.float32))
            feature_names.extend(bool_cols_exist)

        if not tensors:
            print("No features to embed.")
            tensors.append(torch.ones((len(df), 1), dtype=torch.float32))
            feature_names = ["constant"]

        features = torch.cat(tensors, dim=1)

        return features, feature_names, self.embeddings, self.cat_maps, ids, target


In [12]:
def build_pretrained_embeddings(
    df: pd.DataFrame,
    text_col: str = "Building",
    node_id_col: str = "diag_node_id",
    model_name: str = "emilyalsentzer/Bio_ClinicalBERT",
    batch_size: int = 32,
    max_length: int = 128,
    device: str = None,
    projection_dim: int = 64):

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    projection_layer = nn.Linear(model.config.hidden_size, projection_dim).to(device)


    unique_df = df[[node_id_col, text_col]].drop_duplicates().reset_index(drop=True)

    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(unique_df), batch_size), desc=f"Embedding {text_col}"):
            batch_texts = unique_df[text_col].iloc[i:i+batch_size].astype(str).tolist()

            inputs = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length
            ).to(device)

            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            projected_embeddings = projection_layer(cls_embeddings).cpu().numpy()
            embeddings.append(projected_embeddings)

    embeddings = np.vstack(embeddings)

    final_embeddings = dict(zip(unique_df[node_id_col], embeddings))

    return final_embeddings

In [13]:
def build_text_embeddings(
    text: str = "Building",
    node_id_col: str = "diag_node_id",
    model_name: str = "emilyalsentzer/Bio_ClinicalBERT",
    batch_size: int = 32,
    max_length: int = 128,
    device: str = None,
    projection_dim: int = 64):

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    projection_layer = nn.Linear(model.config.hidden_size, projection_dim).to(device)



    embeddings = []

    with torch.no_grad():
      inputs = tokenizer(
          text,
          return_tensors="pt",
          padding=True,
          truncation=True,
          max_length=max_length
      ).to(device)

      outputs = model(**inputs)
      cls_embeddings = outputs.last_hidden_state[:, 0, :]
      projected_embeddings = projection_layer(cls_embeddings).cpu().numpy()

    return projected_embeddings

In [14]:
import re
from collections import defaultdict

def make_feature_name(feature_names):
  group_indices = defaultdict(list)

  for idx, feat in enumerate(feature_names):
      base = re.split(r"_emb\d+|_ont\d+|_\d+$", feat)[0]
      group_indices[base].append(idx)

  feature_ranges = {k: [v[0], v[-1]] for k, v in group_indices.items()}

  # print(feature_ranges)
  return feature_ranges

# Make Nodes and Edges

In [15]:
from torch_geometric.data import HeteroData
data = HeteroData()
# dir_save = '/content/drive/MyDrive/Thesis/Codes/'
# dir = '/content/drive/MyDrive/Thesis/Codes/Data/Graph'
# data = torch.load(f'{dir}/graph_data_embedded_v1.pt', weights_only=False)

## ICU stay

In [16]:
icu_stay_details = pd.read_parquet('/content/drive/MyDrive/Thesis/Codes/Data/Graph/icu_stay_details.parquet')

icu_stay_details = icu_stay_details.rename(columns={'last_careunit': 'first_careunit'})
# icu_stay_details['weight_diff'] = icu_stay_details['weight_max'] -      icu_stay_details['weight_min']
# icu_stay_details['weight_loss'] = icu_stay_details['weight_admit'] - icu_stay_details['weight_min']


icu_stay_details = icu_stay_details.loc[:, ['stay_id','los' , 'first_icu_stay','first_careunit', 'admission_type', 'admission_location','admission_age']]


icu_stay_details = bool_to_int(icu_stay_details)

In [17]:
icu_stay_details = icu_stay_details[icu_stay_details['los'] <= 30]

In [18]:
stay_id_map = build_id_map(icu_stay_details['stay_id'].unique())

In [19]:
builder = FeatureBuilder(
    cat_cols=["first_careunit", "admission_type", "admission_location"],
    bool_cols=["first_icu_stay"],
    cont_cols=['admission_age','weight_diff', 'weight_loss']
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(icu_stay_details,
                                                                           id_col='stay_id',
                                                                           target_col='los')

data["stay"].x = features
data["stay"].feature_names = make_feature_name(feature_names)
data["stay"].y = torch.log1p(target)
data["stay"].ids = torch.tensor(ids.values, dtype=torch.long)

In [20]:
import json

stay_id_map_serializable = {str(int(k)): v for k, v in stay_id_map.items()}

with open(f'{dir}/stay_id_map_filter_30.json', 'w') as f:
    json.dump(stay_id_map_serializable, f)

## Patient

In [21]:
patient = load_data(dir, 'patient.parquet')
charlson = load_data(f'{dir}/Score', 'charlson.parquet')

charlson_patient = pd.merge(patient, charlson.drop(columns=['hadm_id']), on='subject_id', how='left')

charlson_patient = charlson_patient.groupby('subject_id').max()
charlson_patient = bool_to_int(charlson_patient.drop(columns = ['anchor_age'])).reset_index()

patient_id_map = build_id_map(patient['subject_id'].unique())

In [22]:
builder = FeatureBuilder(
    cat_cols=["gender"],
    bool_cols=[
        "myocardial_infarct", "congestive_heart_failure",
        "peripheral_vascular_disease", "cerebrovascular_disease",
        "dementia", "chronic_pulmonary_disease", "rheumatic_disease",
        "peptic_ulcer_disease", "mild_liver_disease", "diabetes_without_cc",
        "diabetes_with_cc", "paraplegia", "renal_disease",
        "malignant_cancer", "severe_liver_disease",
        "metastatic_solid_tumor", "aids"
    ],
    cont_cols=["age_score", "charlson_comorbidity_index"]
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    charlson_patient,
    id_col="subject_id",
    target_col=None
)

data["patient"].x = features
data["patient"].feature_names = make_feature_name(feature_names)

### Edges

In [23]:
link_df = pd.read_csv(f'{dir}/Link.csv')

link_df = link_df[link_df['subject_id'].isin(patient['subject_id'].unique())]
link_df = link_df[link_df['stay_id'].isin(icu_stay_details['stay_id'].unique())]

src =  torch.tensor([patient_id_map[pid] for pid in link_df['subject_id']], dtype=torch.long)
dst =  torch.tensor([stay_id_map[sid] for sid in link_df['stay_id']], dtype=torch.long)

edge_index = torch.stack([src, dst])

data['patient', 'HAS_STAY', 'stay'].edge_index = edge_index
data['stay', 'REV_STAY', 'patient'].edge_index = edge_index.flip(0)

print(data['patient', 'HAS_STAY', 'stay'].edge_index.shape)

torch.Size([2, 93492])


## Procedure

In [24]:
procedure = load_data(dir, 'procedure.parquet')

procedure = procedure[procedure['stay_id'].isin(stay_id_map.keys())].copy()
procedure = make_hours_since_icu(procedure,'storetime', 'icu_intime')

cols_to_check = ['procedure_label', 'procedure_category',
                 'ordercategoryname', 'location', 'locationcategory','statusdescription']

procedure = procedure.sort_values(by=['stay_id', 'procedure_label', 'hours_since_icu']).reset_index(drop=True)

node_candidates, edge_candidates = check_columns(cols_to_check, procedure, 'itemid')
procedure[node_candidates] = procedure[node_candidates].fillna("UNK")

procedure = procedure.drop_duplicates(subset = node_candidates + ['itemid', 'stay_id', 'hours_since_icu'],keep = 'first')

Likely NODE features (constant for each itemid): ['procedure_label', 'procedure_category', 'ordercategoryname']
Likely EDGE features (vary across stays): ['location', 'locationcategory', 'statusdescription']


In [25]:
procedure['procedure_category'] = procedure['procedure_category'].str.replace(r'^\d+-', '', regex=True).str.strip()
print(procedure['procedure_category'].unique())

['Imaging' 'Procedures' 'Intubation/Extubation' 'Ventilation'
 'Significant Events' 'Cultures' 'Access Lines - Peripheral'
 'Communication' 'Access Lines - Invasive' 'GI/GU' 'Dialysis'
 'Medications']


In [26]:
builder = FeatureBuilder(
    cat_cols=['procedure_label', 'procedure_category', 'ordercategoryname'],
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    procedure,
    id_col="itemid",
)


unique_features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)

data["procedure"].x = unique_features
data["procedure"].feature_names = make_feature_name(feature_names)

In [27]:
builder = FeatureBuilder(
    cat_cols=['location', 'locationcategory', 'statusdescription'],
    cont_cols=['procedure_duration'],
    raw_cont_cols= ['hours_since_icu']
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    procedure,
    id_col="stay_id",
    target_col=None
)

proc_id_map = build_id_map(procedure['itemid'].unique())


src = torch.tensor(procedure['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(procedure['itemid'].map(proc_id_map).values, dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_PROC', 'procedure'].edge_index = edge_index
data['stay', 'HAS_PROC', 'procedure'].edge_attr = features.float()
data['stay', 'HAS_PROC', 'procedure'].edge_attr_name = make_feature_name(feature_names)

## Diagnoses

In [28]:
diagnoses = load_data(dir, 'diagnoses.parquet')
diagnoses = diagnoses.dropna(subset=['icd_code','icd_version'])

diagnoses = diagnoses[diagnoses['stay_id'].isin(stay_id_map.keys())]

diagnoses['diag_node_id'] = diagnoses.apply(lambda x: f"{x['icd_code']}_{int(x['icd_version'])}", axis=1)

In [None]:
diag_map_id = build_id_map(diagnoses['diag_node_id'].unique())
diagnoses['diag_node_id'] = diagnoses['diag_node_id'].map(diag_map_id)

In [None]:
def categorize_icd_code(icd_code, icd_version):

    categories = {}
    categories['diag_id'] = (icd_code, icd_version)


    if icd_code is None or pd.isna(icd_code):
        icd_code = "UNK"

    if icd_version is None or pd.isna(icd_version):
        icd_version = 9
    icd_code = str(icd_code).strip()

    if icd_version == 9:
        if len(icd_code) >= 3:
            major_cat = icd_code[:3]
        else:
            major_cat = icd_code

        if major_cat.startswith(('001', '002', '003', '004', '005', '006', '007', '008', '009')):
            categories['major_category'] = 'infectious_diseases'
        elif major_cat.startswith(('140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199')):
            categories['major_category'] = 'neoplasms'
        elif major_cat.startswith(('240', '241', '242', '243', '244', '245', '246', '247', '248', '249', '250', '251', '252', '253', '254', '255', '256', '257', '258', '259')):
            categories['major_category'] = 'endocrine_nutritional_metabolic'
        elif major_cat.startswith(('280', '281', '282', '283', '284', '285', '286', '287', '288', '289')):
            categories['major_category'] = 'blood_diseases'
        elif major_cat.startswith(('290', '291', '292', '293', '294', '295', '296', '297', '298', '299', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '313', '314', '315', '316', '317', '318', '319')):
            categories['major_category'] = 'mental_disorders'
        elif major_cat.startswith(('320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '339', '340', '341', '342', '343', '344', '345', '346', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '358', '359')):
            categories['major_category'] = 'nervous_system'
        elif major_cat.startswith(('360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '370', '371', '372', '373', '374', '375', '376', '377', '378', '379')):
            categories['major_category'] = 'eye_diseases'
        elif major_cat.startswith(('380', '381', '382', '383', '384', '385', '386', '387', '388', '389', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399')):
            categories['major_category'] = 'ear_diseases'
        elif major_cat.startswith(('401', '402', '403', '404', '405', '406', '407', '408', '409', '410', '411', '412', '413', '414', '415', '416', '417', '418', '419', '420', '421', '422', '423', '424', '425', '426', '427', '428', '429', '430', '431', '432', '433', '434', '435', '436', '437', '438', '439', '440', '441', '442', '443', '444', '445', '446', '447', '448', '449', '450', '451', '452', '453', '454', '455', '456', '457', '458', '459')):
            categories['major_category'] = 'circulatory_system'
        elif major_cat.startswith(('460', '461', '462', '463', '464', '465', '466', '467', '468', '469', '470', '471', '472', '473', '474', '475', '476', '477', '478', '479', '480', '481', '482', '483', '484', '485', '486', '487', '488', '489', '490', '491', '492', '493', '494', '495', '496', '497', '498', '499', '500', '501', '502', '503', '504', '505', '506', '507', '508', '509', '510', '511', '512', '513', '514', '515', '516', '517', '518', '519')):
            categories['major_category'] = 'respiratory_system'
        elif major_cat.startswith(('520', '521', '522', '523', '524', '525', '526', '527', '528', '529', '530', '531', '532', '533', '534', '535', '536', '537', '538', '539', '540', '541', '542', '543', '544', '545', '546', '547', '548', '549', '550', '551', '552', '553', '554', '555', '556', '557', '558', '559', '560', '561', '562', '563', '564', '565', '566', '567', '568', '569', '570', '571', '572', '573', '574', '575', '576', '577', '578', '579')):
            categories['major_category'] = 'digestive_system'
        elif major_cat.startswith(('580', '581', '582', '583', '584', '585', '586', '587', '588', '589', '590', '591', '592', '593', '594', '595', '596', '597', '598', '599')):
            categories['major_category'] = 'genitourinary_system'
        elif major_cat.startswith(('600', '601', '602', '603', '604', '605', '606', '607', '608', '609', '610', '611', '612', '613', '614', '615', '616', '617', '618', '619', '620', '621', '622', '623', '624', '625', '626', '627', '628', '629')):
            categories['major_category'] = 'pregnancy_childbirth_puerperium'
        elif major_cat.startswith(('630', '631', '632', '633', '634', '635', '636', '637', '638', '639', '640', '641', '642', '643', '644', '645', '646', '647', '648', '649', '650', '651', '652', '653', '654', '655', '656', '657', '658', '659', '660', '661', '662', '663', '664', '665', '666', '667', '668', '669', '670', '671', '672', '673', '674', '675', '676', '677', '678', '679')):
            categories['major_category'] = 'perinatal_conditions'
        elif major_cat.startswith(('680', '681', '682', '683', '684', '685', '686', '687', '688', '689', '690', '691', '692', '693', '694', '695', '696', '697', '698', '699', '700', '701', '702', '703', '704', '705', '706', '707', '708', '709', '710', '711', '712', '713', '714', '715', '716', '717', '718', '719', '720', '721', '722', '723', '724', '725', '726', '727', '728', '729', '730', '731', '732', '733', '734', '735', '736', '737', '738', '739')):
            categories['major_category'] = 'skin_subcutaneous_tissue'
        elif major_cat.startswith(('740', '741', '742', '743', '744', '745', '746', '747', '748', '749', '750', '751', '752', '753', '754', '755', '756', '757', '758', '759')):
            categories['major_category'] = 'congenital_anomalies'
        elif major_cat.startswith(('760', '761', '762', '763', '764', '765', '766', '767', '768', '769', '770', '771', '772', '773', '774', '775', '776', '777', '778', '779', '780', '781', '782', '783', '784', '785', '786', '787', '788', '789', '790', '791', '792', '793', '794', '795', '796', '797', '798', '799')):
            categories['major_category'] = 'symptoms_signs_ill_defined'
        elif major_cat.startswith(('800', '801', '802', '803', '804', '805', '806', '807', '808', '809', '810', '811', '812', '813', '814', '815', '816', '817', '818', '819', '820', '821', '822', '823', '824', '825', '826', '827', '828', '829', '830', '831', '832', '833', '834', '835', '836', '837', '838', '839', '840', '841', '842', '843', '844', '845', '846', '847', '848', '849', '850', '851', '852', '853', '854', '855', '856', '857', '858', '859', '860', '861', '862', '863', '864', '865', '866', '867', '868', '869', '870', '871', '872', '873', '874', '875', '876', '877', '878', '879', '880', '881', '882', '883', '884', '885', '886', '887', '888', '889', '890', '891', '892', '893', '894', '895', '896', '897', '898', '899', '900', '901', '902', '903', '904', '905', '906', '907', '908', '909', '910', '911', '912', '913', '914', '915', '916', '917', '918', '919', '920', '921', '922', '923', '924', '925', '926', '927', '928', '929', '930', '931', '932', '933', '934', '935', '936', '937', '938', '939', '940', '941', '942', '943', '944', '945', '946', '947', '948', '949', '950', '951', '952', '953', '954', '955', '956', '957', '958', '959', '960', '961', '962', '963', '964', '965', '966', '967', '968', '969', '970', '971', '972', '973', '974', '975', '976', '977', '978', '979', '980', '981', '982', '983', '984', '985', '986', '987', '988', '989', '990', '991', '992', '993', '994', '995', '996', '997', '998', '999')):
            categories['major_category'] = 'injury_poisoning'
        elif icd_code.startswith(('V', 'E')):
            categories['major_category'] = 'supplementary_classification'
        else:
            categories['major_category'] = 'other'

        if len(icd_code) >= 4:
            sub_cat = icd_code[:4]
        else:
            sub_cat = icd_code
        categories['subcategory'] = sub_cat

    elif icd_version == 10:
        if len(icd_code) >= 1:
            major_cat = icd_code[0]
        else:
            major_cat = icd_code

        if major_cat in ['A', 'B']:
            categories['major_category'] = 'infectious_diseases'
        elif major_cat in ['C', 'D']:
            categories['major_category'] = 'neoplasms'
        elif major_cat in ['E']:
            categories['major_category'] = 'endocrine_nutritional_metabolic'
        elif major_cat in ['F']:
            categories['major_category'] = 'mental_disorders'
        elif major_cat in ['G']:
            categories['major_category'] = 'nervous_system'
        elif major_cat in ['H']:
            categories['major_category'] = 'eye_ear_diseases'
        elif major_cat in ['I']:
            categories['major_category'] = 'circulatory_system'
        elif major_cat in ['J']:
            categories['major_category'] = 'respiratory_system'
        elif major_cat in ['K']:
            categories['major_category'] = 'digestive_system'
        elif major_cat in ['L']:
            categories['major_category'] = 'skin_subcutaneous_tissue'
        elif major_cat in ['M']:
            categories['major_category'] = 'musculoskeletal_connective_tissue'
        elif major_cat in ['N']:
            categories['major_category'] = 'genitourinary_system'
        elif major_cat in ['O']:
            categories['major_category'] = 'pregnancy_childbirth_puerperium'
        elif major_cat in ['P']:
            categories['major_category'] = 'perinatal_conditions'
        elif major_cat in ['Q']:
            categories['major_category'] = 'congenital_anomalies'
        elif major_cat in ['R']:
            categories['major_category'] = 'symptoms_signs_ill_defined'
        elif major_cat in ['S', 'T']:
            categories['major_category'] = 'injury_poisoning'
        elif major_cat in ['Z']:
            categories['major_category'] = 'health_services_contact'
        elif major_cat in ['V', 'W', 'X', 'Y']:
            categories['major_category'] = 'external_causes'
        else:
            categories['major_category'] = 'other'

        if len(icd_code) >= 3:
            sub_cat = icd_code[:3]
        else:
            sub_cat = icd_code
        categories['subcategory'] = sub_cat

    categories['icd_version'] = icd_version
    categories['code_length'] = len(icd_code)

    return categories

In [None]:
diagnoses['major_category'] = diagnoses.apply(lambda x: categorize_icd_code(x['icd_code'], x['icd_version'])['major_category'], axis=1)
diagnoses['subcategory'] = diagnoses.apply(lambda x: categorize_icd_code(x['icd_code'], x['icd_version'])['subcategory'], axis=1)

In [None]:
description_id_map = build_id_map(diagnoses['diagnosis_description'].unique())
diagnoses['description_id'] = diagnoses['diagnosis_description'].map(description_id_map)

icd_map = build_id_map(diagnoses['icd_code'].unique())
diagnoses['icd']= diagnoses['icd_code'].map(icd_map)


In [33]:
embedding_path = f'{dir}/Embeddings New'

diag_embeddings = build_pretrained_embeddings(
    diagnoses,
    text_col="diagnosis_description",
    node_id_col="description_id",
    batch_size=128,
    projection_dim=8,
    max_length = 128
)
torch.save(diag_embeddings, f'{embedding_path}/diag_embeddings.pt')

embedding_path = f'{dir}/Embeddings New'
diag_embeddings = torch.load(f'{embedding_path}/diag_embeddings.pt', weights_only=False)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.

Embedding diagnosis_description:   0%|          | 0/110 [00:00<?, ?it/s][A
Embedding diagnosis_description:   1%|          | 1/110 [00:16<30:16, 16.66s/it][A
Embedding diagnosis_description:   2%|

In [34]:
builder = FeatureBuilder(
    cont_cols = ['icd',	'icd_version'],
    cat_cols=['major_category', 'subcategory'],
    emb_dims = {'major_category' : 4, 'subcategory': 2},
    ontology_embeddings={'description_id': diag_embeddings},
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    diagnoses,
    id_col="diag_node_id"
    )

features_unique, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)

diagnoses['node_id'] = diagnoses['diag_node_id']

data['diagnosis'].x = features_unique
data['diagnosis'].feature_names = make_feature_name(feature_names)

builder = FeatureBuilder(
    cont_cols=['seq_num']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    diagnoses,
    id_col="stay_id",
)

diag_id_map = build_id_map(diagnoses['diag_node_id'].unique())


src = torch.tensor(diagnoses['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(diagnoses['diag_node_id'].values, dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_DIAG', 'diagnosis'].edge_index = edge_index
data['stay', 'HAS_DIAG', 'diagnosis'].edge_attr = features.float()
data['stay', 'HAS_DIAG', 'diagnosis'].edge_attr_name = make_feature_name(feature_names)

  out = torch.tensor(out, dtype=torch.float32)


## Measurment

### icp

In [35]:
icp = load_data(dir, 'Measurment/icp.parquet')
icp = icp[icp['stay_id'].isin(stay_id_map.keys())]

icp = make_hours_since_icu(icp)
icp = icp.dropna(subset=['icp', 'stay_id'])

In [36]:
data['icp'].x = torch.tensor([[1.0]], dtype=torch.float)



builder = FeatureBuilder(
    cont_cols=['icp'],
    raw_cont_cols = ['hours_since_icu']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    icp,
    id_col="stay_id",
    target_col=None
)

src =  torch.tensor(icp['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(icp), dtype=torch.long)
edge_index = torch.stack([src, dst])


data['stay', 'HAS_ICP', 'icp'].edge_index = edge_index
data['stay', 'HAS_ICP', 'icp'].edge_attr = features.float()
data['stay', 'HAS_ICP', 'icp'].edge_attr_name = make_feature_name(feature_names)

### code status

In [37]:
code_status = load_data(dir, 'Measurment/code_status.parquet')
code_status = code_status[code_status['stay_id'].isin(stay_id_map.keys())]

code_status = make_hours_since_icu(code_status)

In [38]:
code_status['id'] = 0

data['code_status'].x = torch.tensor([[1.0]], dtype=torch.float)


builder = FeatureBuilder(
    bool_cols =  ['fullcode', 'cmo', 'dni','dnr'],
    raw_cont_cols=['hours_since_icu']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    code_status,
    id_col="stay_id",
    target_col=None
)

src =  torch.tensor(code_status['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(code_status), dtype=torch.long)
edge_index = torch.stack([src, dst])


data['stay', 'HAS_CODE_STATUS', 'code_status'].edge_index = edge_index
data['stay', 'HAS_CODE_STATUS', 'code_status'].edge_attr = features.float()
data['stay', 'HAS_CODE_STATUS', 'code_status'].edge_attr_name =  make_feature_name(feature_names)


### Inflammation

In [39]:
inflammation = load_data(dir, 'Measurment/inflammation.parquet')
inflammation = inflammation[inflammation['stay_id'].isin(stay_id_map.keys())]
inflammation = make_hours_since_icu(inflammation)

inflammation = inflammation.rename(columns={'specimen_id': 'inflammation_specimen_id'})

In [40]:
data['inflammation'].x = torch.tensor([[1.0]], dtype=torch.float)


builder = FeatureBuilder(
    cont_cols =  ['inflammation_specimen_id', 'crp'],
    raw_cont_cols = ['hours_since_icu']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    inflammation,
    id_col="stay_id"
)


src =  torch.tensor(inflammation['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(inflammation), dtype=torch.long)
edge_index = torch.stack([src, dst])


data['stay', 'HAS_INFLAMMATION', 'inflammation'].edge_index = edge_index
data['stay', 'HAS_INFLAMMATION', 'inflammation'].edge_attr = features.float()
data['stay', 'HAS_INFLAMMATION', 'inflammation'].edge_attr_name =  make_feature_name(feature_names)

### Oxygen Delivery

In [41]:
dir = '/content/drive/MyDrive/Thesis/Codes/Data/Graph'
oxygen_delivery = load_data(dir, 'Measurment/oxygen_delivery_v2.parquet')
oxygen_delivery = oxygen_delivery[oxygen_delivery['stay_id'].isin(stay_id_map.keys())]
oxygen_delivery = make_hours_since_icu(oxygen_delivery)

od_id_map = build_id_map(oxygen_delivery['o2_delivery_device_1'].unique())

oxygen_delivery['oid'] = oxygen_delivery['o2_delivery_device_1'].map(od_id_map)

In [42]:
oxygen_delivery['o2_flow'] = oxygen_delivery['o2_flow'].fillna(oxygen_delivery['o2_flow_additional'])

In [43]:
oxygen_delivery = oxygen_delivery.drop(columns=['o2_flow_additional'])

In [44]:
oxygen_delivery['o2_delivery_device_1'] = oxygen_delivery['o2_delivery_device_1'].fillna("UNK").astype(str).str.strip()

In [45]:
builder = FeatureBuilder(
    cat_cols=['o2_delivery_device_1'],
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    oxygen_delivery,
    id_col="oid"
)

print(len(features))
unique_features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)
print(len(unique_features))

data["oxygen_event"].x = unique_features
data["oxygen_event"].feature_names =  make_feature_name(feature_names)

builder = FeatureBuilder(
    cont_cols=['o2_flow', 'o2_flow_additional'],
    raw_cont_cols = ['hours_since_icu']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    oxygen_delivery,
    id_col="stay_id"
)

src = torch.tensor(oxygen_delivery['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(oxygen_delivery['oid'].values, dtype=torch.long)
edge_index = torch.stack([src, dst])


data['stay', 'HAS_OXYGEN', 'oxygen_event'].edge_index = edge_index
data['stay', 'HAS_OXYGEN', 'oxygen_event'].edge_attr = features.float()
data['stay', 'HAS_OXYGEN', 'oxygen_event'].edge_attr_name =  make_feature_name(feature_names)

253291
19


## Drugs

### Type 1 (Prescriptions)

In [46]:
dir = '/content/drive/MyDrive/Thesis/Codes/Data/Graph'


prescriptions = pd.read_parquet(f'{dir}/prescriptions_v2.parquet')
# prescriptions = prescriptions[prescriptions['stay_id'].isin(stay_id_map.keys())]

In [47]:
prescriptions['route'] = prescriptions['route'].fillna("UNK").astype(str).str.strip()

In [48]:
drug_id_map = build_id_map(prescriptions['drug'].unique())
prescriptions['drug_id'] = prescriptions['drug'].map(drug_id_map)


node_id_map = build_id_map(prescriptions['prec_id'].unique())
prescriptions['node_id'] = prescriptions['prec_id'].map(node_id_map)

In [49]:
# embedding_path = f'{dir}/Embeddings'

drug_embeddings = build_pretrained_embeddings(
    prescriptions,
    text_col="drug",
    node_id_col="drug_id",
    batch_size=128,
    projection_dim=4,
    max_length = 128
)
torch.save(drug_embeddings, f'{embedding_path}/drug_embeddings.pt')

drug_category_embeddings = build_pretrained_embeddings(
    prescriptions,
    text_col="drug_category",
    node_id_col="prec_id",
    batch_size=128,
    projection_dim=2
)
torch.save(drug_category_embeddings, f'{embedding_path}/drug_category_embeddings.pt')



drug_embeddings = torch.load(f'{embedding_path}/drug_embeddings.pt', weights_only=False)
drug_category_embeddings = torch.load(f'{embedding_path}/drug_category_embeddings.pt', weights_only=False)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding drug: 100%|██████████| 21/21 [03:30<00:00, 10.01s/it]


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding drug_category: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]


In [50]:
builder = FeatureBuilder(
    ontology_embeddings={'prec_id': drug_category_embeddings},
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    prescriptions,
    id_col="node_id"
)

print(len(features))
uniqe_features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)
print(len(uniqe_features))

data["prescriptions"].x = uniqe_features
data["prescriptions"].feature_names = make_feature_name(feature_names)

2162785
33


In [51]:
builder = FeatureBuilder(
    cat_cols=['route'],
    raw_cont_cols=['hours_since_icu'],
    ontology_embeddings = {'drug_id': drug_embeddings}
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    prescriptions,
    id_col="stay_id"
)

src = torch.tensor(prescriptions['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(prescriptions['node_id'].values, dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_PRES', 'prescriptions'].edge_index = edge_index
data['stay', 'HAS_PRES', 'prescriptions'].edge_attr = features.float()
data['stay', 'HAS_PRES', 'prescriptions'].edge_attr_name =  make_feature_name(feature_names)

## Input Event

In [52]:
import re

In [53]:
inputevent = pd.read_parquet(f'{dir}/inputevent.parquet')
inputevent = inputevent[inputevent['stay_id'].isin(stay_id_map.keys())]
inputevent = inputevent.drop(columns= ['secondaryordercategoryname'])

for col in ['ordercategoryname', 'ordercomponenttypedescription']:
    inputevent[col] = inputevent[col].apply(
        lambda x: re.sub(r'^\d+-', '', str(x)).strip() if x is not None else x
    )

inputevent = make_hours_since_icu(inputevent)
inputevent = inputevent.drop(columns=['abbreviation'])

inputevent.loc[inputevent['rate'].isna(), 'rate'] = inputevent['amount']
inputevent.loc[inputevent['rateuom'].isna(), 'rateuom'] = inputevent['amountuom']

inputevent['node_id'] = inputevent.apply(lambda x: f"{x['label']}_{x['ordercategoryname']}", axis=1)

inputevent_id_map = build_id_map(inputevent['node_id'].unique())

In [54]:
inputevent['id'] = inputevent['node_id'].map(inputevent_id_map)

inputevent = inputevent.drop(columns =['node_id', 'amountuom', 'rateuom'])
inputevent['node_id'] = inputevent['id']

In [55]:
input_embeddings = build_pretrained_embeddings(
    inputevent,
    text_col="label",
    node_id_col="id",
    batch_size=32,
    projection_dim=2
)
torch.save(input_embeddings, f'{embedding_path}/input_embeddings.pt')

input_embeddings = torch.load(f'{embedding_path}/input_embeddings.pt', weights_only=False)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding label: 100%|██████████| 11/11 [00:16<00:00,  1.50s/it]


In [56]:
builder = FeatureBuilder(
    cat_cols = ['ordercategoryname', 'ordercomponenttypedescription'],
    ontology_embeddings= {'id': input_embeddings}
    )

print(len(features))

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    inputevent,
    id_col="node_id"
)
print(len(uniqe_features))

unique_features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)

data["input_event"].x = unique_features
data["input_event"].feature_names =  make_feature_name(feature_names)

2162785
33


In [57]:
builder = FeatureBuilder(
    cat_cols=['rateuom'],
    cont_cols=['rate'],
    raw_cont_cols =['hours_since_icu'],

    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    inputevent,
    id_col="stay_id"
)


src = torch.tensor(inputevent['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(inputevent['id'].values, dtype=torch.long)
edge_index = torch.stack([src, dst])


data['stay', 'HAS_INPUT', 'input_event'].edge_index = edge_index
data['stay', 'HAS_INPUT', 'input_event'].edge_attr = features.float()
data['stay', 'HAS_INPUT', 'input_event'].edge_attr_name =  make_feature_name(feature_names)

## Output Events

In [58]:
outputevent = pd.read_parquet(f'{dir}/outputevent.parquet')
outputevent = outputevent[outputevent['stay_id'].isin(stay_id_map.keys())]
outputevent = make_hours_since_icu(outputevent)
outputevent = outputevent.drop(columns=['param_type'])
outputevent = outputevent.drop(columns=['valueuom'])
outputevent_id_map = build_id_map(outputevent['label'].unique())
outputevent['id'] = outputevent['label'].map(outputevent_id_map)
outputevent['node_id'] = outputevent['id']

In [59]:
output_embeddings = build_pretrained_embeddings(
    outputevent,
    text_col="label",
    node_id_col="id",
    batch_size=128,
    projection_dim=2
)

torch.save(output_embeddings, f'{embedding_path}/output_embeddings.pt')

# output_embeddings = torch.load(f'{embedding_path}/output_embeddings.pt', weights_only=False)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding label: 100%|██████████| 1/1 [00:02<00:00,  2.43s/it]


In [60]:
builder = FeatureBuilder(
    ontology_embeddings= {'id': output_embeddings})

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    outputevent,
    id_col="node_id"
)

print(len(features))
features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)
print(len(features))

data["output_event"].x = features
data["output_event"].feature_names =  make_feature_name(feature_names)


builder = FeatureBuilder(
    cat_cols=['valueuom'],
    cont_cols=['value'],
    raw_cont_cols = ['hours_since_icu']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    outputevent,
    id_col="stay_id"
)

src = torch.tensor(outputevent['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(outputevent['label'].map(outputevent_id_map).values, dtype=torch.long)
edge_index = torch.stack([src, dst], dim=0)


data['stay', 'HAS_OUTPUT', 'output_event'].edge_index = edge_index
data['stay', 'HAS_OUTPUT', 'output_event'].edge_attr = features.float()
data['stay', 'HAS_OUTPUT', 'output_event'].edge_attr_name =  make_feature_name(feature_names)

662106
70


## Chart Events

In [61]:
dir = '/content/drive/MyDrive/Thesis/Codes/Data/Graph'
chartevents = pd.read_parquet(f'{dir}/chartevent.parquet')
chartevents = chartevents[chartevents['stay_id'].isin(stay_id_map.keys())]
chartevents = make_hours_since_icu(chartevents)

In [62]:
warnings = chartevents[chartevents['param_type'] == 'Checkbox']
chartevents = chartevents[chartevents['param_type'] != 'Checkbox']
text_events = chartevents[chartevents['param_type'] == 'Text']
chartevents = chartevents[chartevents['param_type'] != 'Text']
chartevents = chartevents.drop(columns =['warning', 'valueuom'])

### Warning

In [63]:
warnings = warnings.drop(columns = ['param_type', 'value',	'valueuom',	'warning'])
warnings_id_map = build_id_map(warnings['label'].unique())
warnings['id'] = warnings['label'].map(warnings_id_map)
warnings['node_id'] = warnings['id']

In [64]:
warnings_embeddings = build_pretrained_embeddings(
    warnings,
    text_col="label",
    node_id_col="id",
    batch_size=128,
    projection_dim=4
)
torch.save(warnings_embeddings, f'{embedding_path}/warnings_embeddings.pt')

# warnings_embeddings = torch.load(f'{embedding_path}/warnings_embeddings.pt', weights_only=False)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding label: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


In [65]:
builder = FeatureBuilder(
    ontology_embeddings= {'id': warnings_embeddings}
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    warnings,
    id_col="node_id"
)
print(len(features))
features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)
print(len(features))

data["warning"].x = features
data["warning"].feature_names = make_feature_name(feature_names)

551
29


In [66]:
builder = FeatureBuilder(
    raw_cont_cols=['hours_since_icu']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    warnings,
    id_col="stay_id"
)

src = torch.tensor(warnings['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(warnings['id'].values, dtype=torch.long)
edge_index = torch.stack([src, dst])


data['stay', 'HAS_WARNING', 'warning'].edge_index = edge_index
data['stay', 'HAS_WARNING', 'warning'].edge_attr = features.float()
data['stay', 'HAS_WARNING', 'warning'].edge_attr_name = make_feature_name(feature_names)

### Text Events

In [67]:
text_events_id_map = build_id_map(text_events['label'].unique())
text_events['id'] = text_events['label'].map(text_events_id_map)
text_events['node_id'] = text_events['id']


label_embeddings = build_pretrained_embeddings(
    text_events,
    text_col="label",
    node_id_col="id",
    batch_size=32,
    projection_dim=4
)

torch.save(label_embeddings, f'{embedding_path}/text_label_embeddings.pt')


value_id_map = build_id_map(text_events['value'].unique())
text_events['value_id'] = text_events['value'].map(value_id_map)

value_embeddings = build_pretrained_embeddings(
    text_events,
    text_col="value",
    node_id_col="value_id",
    batch_size=32,
    projection_dim=2
)

torch.save(value_embeddings, f'{embedding_path}/text_value_embeddings.pt')

# label_embeddings = torch.load(f'{embedding_path}/text_label_embeddings.pt', weights_only=False)
# value_embeddings = torch.load(f'{embedding_path}/text_value_embeddings.pt', weights_only=False)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding label: 100%|██████████| 17/17 [00:23<00:00,  1.38s/it]


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding value: 100%|██████████| 43/43 [01:06<00:00,  1.54s/it]


In [68]:
builder = FeatureBuilder(
    ontology_embeddings= {'id': label_embeddings}
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    text_events,
    id_col="node_id"
)

print(len(features))
features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)
print(len(features))

data['text_event'].x = features
data['text_event'].feature_names = make_feature_name(feature_names)

107070
543


In [69]:
builder = FeatureBuilder(
    raw_cont_cols=['hours_since_icu'],
    ontology_embeddings = {'id': value_embeddings}
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    text_events,
    id_col="stay_id"
    )

src = torch.tensor(text_events['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(text_events['node_id'].values, dtype=torch.long)
edge_index = torch.stack([src, dst], dim=0)


data['stay', 'HAS_TEXTED', 'text_event'].edge_index = edge_index
data['stay', 'HAS_TEXTED', 'text_event'].edge_attr = features.float()
data['stay', 'HAS_TEXTED', 'text_event'].edge_attr_name = make_feature_name(feature_names)

In [70]:
data['stay', 'HAS_TEXTED', 'text_event'].edge_attr_name

{'id': [0, 1], 'hours_since_icu': [2, 2]}

In [71]:
def sanitize_hetero_(data):
    """
    Remove invalid edges and keep edge attributes aligned.
    Works in-place on a PyG HeteroData object.
    """

    for key, store in data.edge_items():
        src, rel, dst = key
        edge_index = store.edge_index

        src_num = data[src].num_nodes
        dst_num = data[dst].num_nodes

        src_idx = edge_index[0]
        dst_idx = edge_index[1]

        mask = (src_idx < src_num) & (dst_idx < dst_num)

        n_drop = int((~mask).sum())
        if n_drop == 0:
            continue

        print(f"[sanitize] dropping {n_drop} edges from {key}")

        # ---- apply mask ----
        store.edge_index = edge_index[:, mask]

        # ---- keep ALL edge-level tensors aligned ----
        for attr in [
            "edge_attr",
            # "edge_weight",
            # "edge_time",
            # "edge_label",
            # "edge_type",
        ]:
            if hasattr(store, attr):
                tensor = getattr(store, attr)
                if tensor is not None:
                    setattr(store, attr, tensor[mask])



sanitize_hetero_(data)


[sanitize] dropping 9978 edges from ('stay', 'HAS_TEXTED', 'text_event')


### Other

In [72]:
value_id_map = build_id_map(chartevents['label'].unique())
chartevents['id'] = chartevents['label'].map(value_id_map)
chartevents['node_id'] = chartevents['id']

label_embeddings = build_pretrained_embeddings(
    chartevents,
    text_col="label",
    node_id_col="id",
    batch_size=128,
    projection_dim=4
)
torch.save(label_embeddings, f'{embedding_path}/chart_label_embeddings.pt')

# label_embeddings = torch.load(f'{embedding_path}/chart_label_embeddings.pt', weights_only = False)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Embedding label: 100%|██████████| 3/3 [00:18<00:00,  6.10s/it]


In [73]:
builder = FeatureBuilder(
    ontology_embeddings= {'id': label_embeddings}
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    chartevents,
    id_col="node_id"
)

print(len(features))
features, inverse_indices, counts = torch.unique(features, dim=0, return_inverse=True, return_counts=True)
print(len(features))

data["chart_event"].x = features
data["chart_event"].feature_names = make_feature_name(feature_names)

1546293
370


In [74]:
builder = FeatureBuilder(
    cont_cols=['value'],
    raw_cont_cols = ['hours_since_icu']
    )

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    chartevents,
    id_col="stay_id"
    )

src = torch.tensor(chartevents['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor(chartevents['id'].values, dtype=torch.long)
edge_index = torch.stack([src, dst], dim=0)


data['stay', 'HAS_CHARTED', 'chart_event'].edge_index = edge_index
data['stay', 'HAS_CHARTED', 'chart_event'].edge_attr = features.float()
data['stay', 'HAS_CHARTED', 'chart_event'].edge_attr_name = make_feature_name(feature_names)

## Scores

### SIRS

In [75]:
sirs = load_data(f'{dir}/Score', 'sirs.parquet')
sirs = sirs[sirs['stay_id'].isin(stay_id_map.keys())]

sirs['id'] = 0

describe = """Range 0 to 4. Higher score reflects stronger inflammation, sepsis risk, and physiologic stress affecting ICU length of stay."""


label_embeddings = build_text_embeddings(
    text = describe,
    batch_size = 32,
    device = None,
    projection_dim = 8,
    )

data['sirs'].x = torch.tensor(label_embeddings, dtype=torch.float)
data['sirs'].feature_names = {'sirs_discribe': [0, 7]}

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [76]:
builder = FeatureBuilder(
    cont_cols = ['sirs','temp_score','heart_rate_score','resp_score',   'wbc_score'],
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    sirs,
    id_col='stay_id',
    target_col=None
)

features = torch.nan_to_num(features, nan=0.0)

src = torch.tensor(sirs['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(sirs), dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_SIRS', 'sirs'].edge_index = edge_index
data['stay', 'HAS_SIRS', 'sirs'].edge_attr = features.float()
data['stay', 'HAS_SIRS', 'sirs'].edge_attr_name = make_feature_name(feature_names)

### SOFA

In [77]:
sofa = load_data(f'{dir}/Score', 'sofa.parquet')
sofa = sofa[sofa['stay_id'].isin(stay_id_map.keys())]

In [78]:
sofa = load_data(f'{dir}/Score', 'sofa.parquet')
sofa = sofa[sofa['stay_id'].isin(stay_id_map.keys())]

sofa['id'] = 0

describe = """SOFA score evaluates organ dysfunction in ICU patients. Based on respiratory, cardiovascular, hepatic, coagulation, renal, and neurological function. Higher score indicates greater organ failure and mortality risk."""


label_embeddings = build_text_embeddings(
    text = describe,
    model_name = "emilyalsentzer/Bio_ClinicalBERT",
    batch_size = 128,
    max_length = 64,
    projection_dim = 8,
    # desc = "Embedding diagnoses"
    )

data['sofa'].x = torch.tensor(label_embeddings, dtype=torch.float)
data['sofa'].feature_names = {'sofa_discribe': [0, 7]}

builder = FeatureBuilder(
    cont_cols = ['sofa'],
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    sofa.drop(columns=['stay_id']),
    id_col=None,
    target_col=None
)

src = torch.tensor(sofa['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(sofa), dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_SOFA', 'sofa'].edge_index = edge_index
data['stay', 'HAS_SOFA', 'sofa'].edge_attr = features.float()
data['stay', 'HAS_SOFA', 'sofa'].edge_attr_name = make_feature_name(feature_names)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


### LODS

In [79]:
lods = load_data(f'{dir}/Score', 'lods.parquet')
lods = lods[lods['stay_id'].isin(stay_id_map.keys())]

lods['id'] = 0

describe = """LODS score quantifies severity of organ dysfunction using physiologic and laboratory variables. Includes cardiovascular, neurologic, renal, hematologic, hepatic, and respiratory parameters. Higher score shows increased organ failure and death risk."""


label_embeddings = build_text_embeddings(
    text = describe,
    model_name = "emilyalsentzer/Bio_ClinicalBERT",
    batch_size = 128,
    max_length = 64,
    device = None,
    projection_dim = 8,
    # desc = "Embedding diagnoses"
    )

data['lods'].x = torch.tensor(label_embeddings, dtype=torch.float)
data['lods'].feature_names = {'lods_discribe': [0, 15]}


builder = FeatureBuilder(
    cont_cols = ['lods','neurologic','cardiovascular','renal','pulmonary','hematologic','hepatic'],
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    lods,
    id_col='stay_id',
    target_col=None
)

# Handle NaN values by replacing them with 0 before assigning to edge_attr
features = torch.nan_to_num(features, nan=0.0)

src =  torch.tensor(lods['stay_id'].map(stay_id_map), dtype=torch.long)
dst =  torch.tensor([0] * len(lods), dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_LODS', 'lods'].edge_index = edge_index
data['stay', 'HAS_LODS', 'lods'].edge_attr = features.float()
data['stay', 'HAS_LODS', 'lods'].edge_attr_name = make_feature_name(feature_names)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


### APSIII

In [80]:
apsiii = load_data(f'{dir}/Score', 'apsiii.parquet')

apsiii = apsiii[apsiii['stay_id'].isin(stay_id_map.keys())]

apsiii['id'] = 0

describe = """ Higher score represents more severe illness and higher mortality probability."""


label_embeddings = build_text_embeddings(
    text = describe,
    model_name = "emilyalsentzer/Bio_ClinicalBERT",
    batch_size = 32,
    max_length = 64,
    device = None,
    projection_dim = 8,
    # desc = "Embedding diagnoses"
    )

data['apsiii'].x = torch.tensor(label_embeddings, dtype=torch.float)
data['apsiii'].feature_names = {'apsiii_discribe': [0, 7]}


builder = FeatureBuilder(
    cont_cols = ['apsiii',	'apsiii_prob'],
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    apsiii.drop(columns=['stay_id']),
    id_col=None,
    target_col=None
)

features = torch.nan_to_num(features, nan=0.0)

src = torch.tensor(apsiii['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(apsiii), dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_APSIII', 'apsiii'].edge_index = edge_index
data['stay', 'HAS_APSIII', 'apsiii'].edge_attr = features.float()
data['stay', 'HAS_APSIII', 'apsiii'].edge_attr_name = make_feature_name(feature_names)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


### OASIS

In [81]:
oasis = load_data(f'{dir}/Score', 'oasis.parquet')
oasis = oasis[oasis['stay_id'].isin(stay_id_map.keys())]

oasis['id'] = 0

describe = """Higher score reflects increased illness severity and mortality risk."""


label_embeddings = build_text_embeddings(
    text = describe,
    model_name = "emilyalsentzer/Bio_ClinicalBERT",
    batch_size = 32,
    max_length = 64,
    device = None,
    projection_dim = 2,
    # desc = "Embedding diagnoses"
    )

data['oasis'].x = torch.tensor(label_embeddings, dtype=torch.float)
data['oasis'].feature_names = {'oasis_discribe': [0, 1]}


builder = FeatureBuilder(
    cont_cols = ['oasis', 'oasis_prob', 'age', 'preiculos', 'preiculos_score',
       'gcs', 'gcs_score', 'heartrate', 'heart_rate_score', 'meanbp',
       'mbp_score', 'resprate', 'resp_rate_score', 'temp', 'temp_score',
       'urineoutput', 'urineoutput_score', 'mechvent', 'mechvent_score',
       'electivesurgery', 'electivesurgery_score']
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    oasis.drop(columns=['stay_id']),
    id_col=None,
    target_col=None
)
features = torch.nan_to_num(features, nan=0.0)

src = torch.tensor(oasis['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(oasis), dtype=torch.long)
edge_index = torch.stack([src, dst])

data['stay', 'HAS_OASIS', 'oasis'].edge_index = edge_index
data['stay', 'HAS_OASIS', 'oasis'].edge_attr = features.float()
data['stay', 'HAS_OASIS', 'oasis'].edge_attr_name = make_feature_name(feature_names)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


### MELD

In [82]:
meld = load_data(f'{dir}/Score', 'meld.parquet')
meld = meld[meld['stay_id'].isin(stay_id_map.keys())]

meld['id'] = 0

describe = """MELD score assesses liver disease severity using bilirubin, INR, and creatinine. Predicts survival and guides transplant priority. Higher score indicates worse hepatic dysfunction and prognosis."""


label_embeddings = build_text_embeddings(
    text = describe,
    model_name = "emilyalsentzer/Bio_ClinicalBERT",
    batch_size = 32,
    max_length = 64,
    device = None,
    projection_dim = 4,
    # desc = "Embedding diagnoses"
    )

data['meld'].x = torch.tensor(label_embeddings, dtype=torch.float)
data['meld'].feature_names = {'meld_discribe': [0, 3]}

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.decoder.weight             | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [83]:
builder = FeatureBuilder(
    cont_cols = ['meld_initial', 'meld', 'rrt', 'creatinine_max',
       'bilirubin_total_max', 'inr_max', 'sodium_min']
)

features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    meld,
    id_col='stay_id',
    target_col=None
)


src = torch.tensor(meld['stay_id'].map(stay_id_map).values, dtype=torch.long)
dst = torch.tensor([0] * len(meld), dtype=torch.long)
edge_index = torch.stack([src, dst], dim = 0)

data['stay', 'HAS_MELD', 'meld'].edge_index = edge_index
data['stay', 'HAS_MELD', 'meld'].edge_attr = features.float()
data['stay', 'HAS_MELD', 'meld'].edge_attr_name = make_feature_name(feature_names)

## Save First

In [84]:
dir_save = '/content/drive/MyDrive/Thesis/Codes'

data_dir = f'{dir_save}/graph_data_embedded_v1_lighter.pt'
torch.save(data, f'{data_dir}')


print(f"Data saved successfully to {data_dir}")

Data saved successfully to /content/drive/MyDrive/Thesis/Codes/graph_data_embedded_v1_lighter.pt


# Sofa Lods Merged

In [None]:
import pickle
with open(f'{dir}/Score/sofa_lods_dict.pkl', 'rb') as f:
    data_dict = pickle.load(f)

In [None]:
for node in ['pulmonary', 'neurologic', 'cardiovascular', 'renal', 'hematologic', 'hepatic']:

  data[node].x = torch.tensor([[1.0]], dtype=torch.float)

  temp = data_dict[node].dropna(subset=[f'{node}_lods'])

  builder = FeatureBuilder(
      cont_cols = [f'{node}_lods']
  )

  features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
      temp,
      id_col='stay_id',

  )

  src = temp['stay_id'].map(stay_id_map).values
  dst = [0] * len(temp)

  edge_index = torch.tensor([src, dst], dtype=torch.long)

  data['stay', 'HAS_LODS', node].edge_index = edge_index
  data['stay', 'HAS_LODS', node].edge_attr = features.float()
  data['stay', 'HAS_LODS', node].edge_attr_name = make_feature_name(feature_names)


  temp = data_dict[node].dropna(subset=[f'{node}_sofa'])

  builder = FeatureBuilder(
      cont_cols = [f'{node}_sofa']
  )

  features, feature_names, emb_layers, cat_maps, ids, target = builder.build(
    temp,
    id_col='stay_id',
  )


  src = temp['stay_id'].map(stay_id_map).values
  dst = [0] * len(temp)

  edge_index = torch.tensor([src, dst], dtype=torch.long)

  data['stay', 'HAS_SOFA', node].edge_index = edge_index
  data['stay', 'HAS_SOFA', node].edge_attr = features.float()
  data['stay', 'HAS_SOFA', node].edge_attr_name = make_feature_name(feature_names)

In [None]:
del data['lods']
del data['stay', 'HAS_LODS', 'lods']
del data['sofa']
del data['stay', 'HAS_SOFA', 'sofa']

## Save and Load

In [None]:
data_dir = f'{dir}/graph_data_embedded_v2.pt'
torch.save(data, f'{data_dir}')

print(f"Data saved successfully to {data_dir}")

Data saved successfully to /content/drive/MyDrive/Thesis/Codes/Data/Graph/graph_data_embedded_v2.pt


# Make Temporal Edges

In [None]:
from torch_geometric.data import HeteroData
data = HeteroData()
dir = '/content/drive/MyDrive/Thesis/Codes/S2G'
data = torch.load(f'{dir}/graph_data_embedded_v1.pt', weights_only=False)

In [None]:
def get_edge_col(edge_attr_name, key):
    """
    edge_attr_name = dict like:
      {'o2_flow':[0,0], 'hours_since_icu':[2,2]}
    returns column index
    """
    return edge_attr_name[key][0]

def get_time_and_value_indices(edge_attr_name):
    time_key = None
    value_keys = []

    for k in edge_attr_name.keys():
        if 'hours_since_icu' in k:
            time_key = k
        else:
            value_keys.append(k)

    if time_key is None:
        print("No time key found!")


    return time_key, value_keys


def build_sequences_from_relation(
    data,
    edge_type,
    num_stays,
    T=24,
    device="cpu"
):
    store = data[edge_type]

    edge_index = store.edge_index.to(device)
    edge_attr = store.edge_attr.to(device)
    names = store.edge_attr_name

    time_key, value_keys = get_time_and_value_indices(names)
    # print(time_key, value_keys)

    if time_key is None:
        return None, None, None

    time_idx = names[time_key][0]

    stay_idx = edge_index[0]
    time = edge_attr[:, time_idx]
    hour_bin = time.long().clamp(0, T-1)

    seq_list = []
    mask_list = []

    for v_key in value_keys:
        v_idx = names[v_key][0]
        values = edge_attr[:, v_idx]

        seq = torch.zeros((num_stays, T), device=device)
        cnt = torch.zeros_like(seq)

        flat = stay_idx * T + hour_bin

        seq.view(-1).index_add_(0, flat, values)
        cnt.view(-1).index_add_(0, flat, torch.ones_like(values))

        mask = cnt > 0
        seq[mask] /= cnt[mask]

        seq_list.append(seq)
        mask_list.append(mask)

    seq_cat = torch.stack(seq_list, dim=-1)   # (N,T,D_rel)
    mask_cat = torch.stack(mask_list, dim=-1)

    return seq_cat, mask_cat, value_keys



In [None]:
def build_full_timeseries(data, device="cpu"):
    num_stays = data['stay'].num_nodes
    T = 24

    X_list = []
    M_list = []
    feature_names = []

    for edge_type in data.edge_types:
      if edge_type[2] != 'warning':
        if edge_type[0] == 'stay' and edge_type[1].startswith('HAS_'):
            store = data[edge_type]
            print(edge_type)
            if getattr(store, 'edge_attr', None) is None:
                continue

            seq, mask, keys = build_sequences_from_relation(
                data,
                edge_type,
                num_stays=num_stays,
                T=T,
                device=device
            )

            if seq is None:
                continue

            X_list.append(seq)
            M_list.append(mask)

            for k in keys:
                feature_names.append((edge_type, k))

    X = torch.cat(X_list, dim=-1)   # (N,T,D_total)
    M = torch.cat(M_list, dim=-1)

    return X, M, feature_names

In [None]:
X, M, fnames = build_full_timeseries(data, device="cpu")

('stay', 'HAS_PROC', 'procedure')
('stay', 'HAS_DIAG', 'diagnosis')
No time key found!
('stay', 'HAS_ICP', 'icp')
('stay', 'HAS_CODE_STATUS', 'code_status')
('stay', 'HAS_INFLAMMATION', 'inflammation')
('stay', 'HAS_OXYGEN', 'oxygen_event')
('stay', 'HAS_PRES', 'prescriptions')
('stay', 'HAS_INPUT', 'input_event')
('stay', 'HAS_OUTPUT', 'output_event')
('stay', 'HAS_TEXTED', 'text_event')
('stay', 'HAS_CHARTED', 'chart_event')
('stay', 'HAS_SIRS', 'sirs')
No time key found!
('stay', 'HAS_SOFA', 'sofa')
No time key found!
('stay', 'HAS_LODS', 'lods')
No time key found!
('stay', 'HAS_APSIII', 'apsiii')
No time key found!
('stay', 'HAS_OASIS', 'oasis')
No time key found!
('stay', 'HAS_MELD', 'meld')
No time key found!


In [None]:
def forward_fill(X, M):
    X = X.clone()
    N,T,D = X.shape

    for d in range(D):
        last = torch.zeros(N, device=X.device)
        seen = torch.zeros(N, dtype=torch.bool, device=X.device)

        for t in range(T):
            mask = M[:,t,d]
            last[mask] = X[mask,t,d]
            seen |= mask
            X[~seen,t,d] = 0
            X[seen & ~mask,t,d] = last[seen & ~mask]

    return X

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TemporalEncoder(nn.Module):
    def __init__(self, d_in, d_hidden=128, d_out=128):
        super().__init__()
        self.proj = nn.Linear(d_in, d_hidden)

        self.gru = nn.GRU(
            input_size=d_hidden,
            hidden_size=d_hidden,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )

        self.out = nn.Linear(d_hidden * 2, d_out)

    def forward(self, X):
        """
        X shape: (N , T , D)
        """

        h = F.gelu(self.proj(X))
        out, _ = self.gru(h)
        h_last = out[:, -1, :]

        z = self.out(h_last)

        return z


In [None]:
encoder = TemporalEncoder(
    d_in=X.shape[-1],
    d_hidden=128,
    d_out=128
).to("cpu")

z_ts = encoder(forward_fill(X,M))

In [None]:
data['stay'].x = torch.cat(
    [data['stay'].x.to("cpu"), z_ts],
    dim=-1
)
data['stay'].feature_names['temporal_embedding'] = [34, len(data['stay'].x[0]) - 1]

In [None]:
dir_save = '/content/drive/MyDrive/Thesis/Codes/S2G'

data_dir = f'{dir_save}/graph_data_embedded_v1_temporal_coded.pt'
torch.save(data, f'{data_dir}')


print(f"Data saved successfully to {data_dir}")

Data saved successfully to /content/drive/MyDrive/Thesis/Codes/S2G/graph_data_embedded_v1_temporal_coded.pt
