# Embed Features using Pretrained Models

In [None]:
import numpy as np
import pandas as pd

import torch
from transformers import pipeline, AutoModel, AutoConfig, AutoTokenizer

import os

Load the data and the variable descriptions

In [None]:
# Load the main dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="yale_new_haven.csv")

In [None]:
# Load the table of variable descriptions from the working directory.
df_vars = pd.read_csv(filepath_or_buffer="variable_descriptions.csv")

Get the columns of interest

In [None]:
# Column groups of `df`, each held as a set of column names. Groups built with a
# comprehension pick up every column whose name contains the given marker.
disposition_var = {'disposition'}

demographic_vars = {
    'age', 'gender', 'ethnicity', 'race', 'lang',
    'religion', 'maritalstatus', 'employstatus', 'insurance_status',
}

triage_evaluation_vars = {
    'dep_name', 'esi', 'arrivalmode', 'arrivalmonth', 'arrivalday', 'arrivalhour_bin',
} | {name for name in df.columns if 'triage_vital' in name}

chief_complaint_vars = {name for name in df.columns if "cc_" in name}
medication_vars = {name for name in df.columns if 'meds_' in name}

hospital_usage_stats_vars = {'previousdispo', 'n_edvisits', 'n_admissions', 'n_surgeries'}

imaging_ekg_vars = {
    'cxr_count', 'echo_count', 'ekg_count',
    'otherxr_count', 'otherus_count', 'headct_count',
    'otherct_count', 'mri_count', 'otherimg_count',
}

historical_vital_vars = {
    'dbp_last', 'dbp_max', 'dbp_median', 'dbp_min',
    'o2_device_last', 'o2_device_max', 'o2_device_median', 'o2_device_min',
    'pulse_last', 'pulse_max', 'pulse_median', 'pulse_min',
    'resp_last', 'resp_max', 'resp_median', 'resp_min',
    'sbp_last', 'sbp_max', 'sbp_median', 'sbp_min',
    'spo2_last', 'spo2_max', 'spo2_median', 'spo2_min',
    'temp_last', 'temp_max', 'temp_median', 'temp_min',
}

# Every column that already belongs to one of the groups above.
curr = (
    disposition_var
    | demographic_vars
    | triage_evaluation_vars
    | chief_complaint_vars
    | medication_vars
    | hospital_usage_stats_vars
    | imaging_ekg_vars
    | historical_vital_vars
)

# past medical history: the ungrouped columns with no underscore in their name,
# minus the explicitly excluded ones, in alphabetical order.
pmh_vars = sorted({
    name for name in df.columns
    if name not in curr and "_" not in name and name not in ['ID', 'previousdispo']
})

In [None]:
# Feature group to embed. The selection cell below handles 'cc' (the chief
# complaint columns) and 'pmh' (the past medical history columns).
embed_feats = "pmh"

In [None]:
# Pick the columns to embed for the configured feature group.
if embed_feats == 'cc':
    # Sorted so the column order is reproducible; set iteration order is not.
    embed_cols = sorted(chief_complaint_vars)
elif embed_feats == 'pmh':
    embed_cols = list(pmh_vars)
else:
    # Fail here instead of leaving `embed_cols` undefined (or stale from an
    # earlier run of this cell) when the setting is not recognised.
    raise ValueError(
        f"Unsupported embed_feats {embed_feats!r}; expected 'cc' or 'pmh'"
    )

In [None]:
# Keep only the columns selected for embedding.
df = df.loc[:, embed_cols]

#### Get the collection of values for each patient

Get variable name to condition name mapping

In [None]:
# Restrict the description table to the embedded columns and index it by
# variable name, then map each variable name to its description.
described_rows = df_vars['Variable Name'].isin(embed_cols)
df_vars = df_vars.loc[described_rows, ['Variable Name', 'Description']].set_index('Variable Name')
col_cond_dict = df_vars['Description'].to_dict()

Get the list of conditions for each patient

In [None]:
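# NOTE(review): the line below presumably produced the per-patient condition lists
# that the next cell loads from a precomputed CSV — confirm before deleting it.
# conditions = df.T.apply(lambda x: [col_cond_dict[i] for i in x[x ==1].index])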

In [None]:
# Read the precomputed list file and take its '0' column, where each cell is a
# "!"-separated string of condition names.
raw_conditions = pd.read_csv(f"~/scratch/datasets/yale_new_haven/supplementary_info/{embed_feats}_list.csv")['0']
# Split each cell into a list; any non-string cell becomes an empty list.
conditions = raw_conditions.map(
    lambda cell: [] if not isinstance(cell, str) else cell.split("!")
)

In [None]:
# All condition descriptions, in the order they appear in the mapping.
conditions_text = [*col_cond_dict.values()]

## Embed the conditions

Hugging Face model

In [None]:
# Model identifier handed to the feature-extraction pipeline below.
# The commented-out lines are alternative identifiers; uncomment one to switch.
model = 'dmis-lab/biosyn-sapbert-bc5cdr-disease'
# model = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
# model = 'michiyasunaga/BioLinkBERT-large'

In [None]:
# Build the feature-extraction pipeline for the chosen model on the PyTorch backend.
feature_extractor = pipeline(
    task='feature-extraction',
    model=model,
    framework='pt',
)

In [None]:
def get_embedding(x):
    """Return the mean token embedding of the string ``x``.

    ``x`` is passed to the module-level ``feature_extractor`` pipeline and the
    resulting per-token vectors are averaged into a single vector.

    Args:
        x: Text to embed.

    Returns:
        A 1-D ``numpy.ndarray`` whose length is the size of the last axis of
        the pipeline output.
    """
    # NOTE(review): the pipeline output is expected to be shaped
    # (1, n_tokens, hidden_size) for a single string — confirm against the
    # installed transformers version.
    token_vectors = np.asarray(feature_extractor(x))
    # Flatten only the leading axes. A bare np.squeeze would also drop the
    # token axis when there is exactly one token, and the mean would then be
    # taken over the hidden axis and collapse to a scalar.
    token_vectors = token_vectors.reshape(-1, token_vectors.shape[-1])
    return token_vectors.mean(axis=0)

Get the average embedding of the conditions for each of the patients

In [None]:
# Embed every condition description once so patients can share the cached vectors.
embedding_dict = {text: get_embedding(text) for text in conditions_text}

In [None]:
# One row per patient: the mean of the embeddings of that patient's conditions.
# Patients with no conditions keep an all-zero row.
embedding_dim = get_embedding("hello").shape[0]  # probe call to learn the vector size
E = np.zeros((conditions.shape[0], embedding_dim))
for row, patient_conditions in enumerate(conditions):
    # Progress report every 10000 patients.
    if row and row % 10000 == 0:
        print(row)
    if patient_conditions:
        vectors = [embedding_dict[name] for name in patient_conditions]
        E[row, :] = np.mean(vectors, axis=0)

In [None]:
# Wrap the embedding matrix in a DataFrame (one column per embedding dimension).
df_E = pd.DataFrame(data=E)

In [None]:
# Prefix every column label with the feature-group name, e.g. 0 -> "pmh_0".
df_E = df_E.add_prefix(f"{embed_feats}_")

In [None]:
# Name the output after the embedded feature group so that a run with another
# group (e.g. 'cc') does not overwrite the past-medical-history file.
# The file name for 'pmh' is unchanged.
output_filepath = (
    'past_medical_hist_embeddings.csv'
    if embed_feats == 'pmh'
    else f'{embed_feats}_embeddings.csv'
)

In [None]:
# Write the embeddings to disk without the row index.
df_E.to_csv(path_or_buf=output_filepath, index=False)