In [1]:
from pathlib import Path
import colorcet as cc
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset, Dataset, load_from_disk

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# set some defaults
warnings.filterwarnings("ignore")
np.set_printoptions(precision=4, 
                    floatmode='fixed')
pd.set_option('display.precision', 3)

# set default font size for matplotlib
plt.rcParams.update({'font.size': 12,})
# set default font
plt.rcParams['font.family'] = 'Roboto'

heatmap_parms = {
                # 'linewidths' : 0.5,
                'linecolor' : 'white',
                'cmap' : cc.cm.bkr,
                }


In [2]:
# read in the LCOPT-data-dict

data_dict = pd.read_excel('../lc-project-data/2024-04-09-DATA-DICTIONARY.xlsx')

In [3]:
data_dict

Unnamed: 0,Variable / Field Name,Form Name,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Field Note,Text Validation Type OR Show Slider Number,Text Validation Min,Text Validation Max,...,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50
0,record_id,consent_form,,text,Record ID,,,,,,...,,,,,,,,,,
1,sex,demographics,,radio,Sex,"1, Male | 2, Female",,,,,...,,,,,,,,,,
2,gender,demographics,,text,Gender,,,,,,...,,,,,,,,,,
3,age,demographics,,text,Age,,,number,0,100,...,,,,,,,,,,
4,home_location,demographics,,radio,Living where,"1, Home | 2, Residence with services (RPA) | 3...",,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,label,biospecimen,,descriptive,[dag] - [labelnumber] PLM-A\n[dag] - [labelnum...,,,,,,...,,,,,,,,,,
308,datetime_collection,biospecimen,,text,Date and Time of Collection,,,datetime_ymd,2020-01-01 00:00:00,today,...,,,,,,,,,,
309,datetime_freezing,biospecimen,,text,Date and Time of Freezing,,,datetime_ymd,,,...,,,,,,,,,,
310,aliquots,biospecimen,,dropdown,# of aliquots,"A, 1 | B, 2 | C, 3 | D, 4 | E, 5 | F, 6 | G, 7...",,,,,...,,,,,,,,,,


In [4]:
# read the Medications sheet from the data dict file
medications = pd.read_excel('../lc-project-data/2024-04-09-DATA-DICTIONARY.xlsx', sheet_name='Medications', header = None)

# add a header row

# make a dict: index: column 0

# convert the dataframe to a dict
medications_dict = medications.set_index(0).T.to_dict('records')[0]

# invert the k,v
medications_dict = {v: k for k, v in medications_dict.items()}
medications_dict

{1: 'Analgesics',
 2: 'Anesthetics',
 3: 'Anti-addiction agents',
 4: 'Antibacterials',
 5: 'Anticonvulsants',
 6: 'Antidementia agents',
 7: 'Antidepressants\xa0',
 8: 'Antiemetics',
 9: 'Antifungals',
 10: 'Antigout agents',
 11: 'Antimigraine agents',
 12: 'Antimyasthenic agents',
 13: 'Antimycobacterials',
 14: 'Antineoplastics',
 15: 'Antiparasitics',
 16: 'Antiparkinson agents',
 17: 'Antipsychotics',
 18: 'Antispasticity agents',
 19: 'Antivirals',
 20: 'Anxiolytics',
 21: 'Bipolar agents',
 22: 'Blood glucose regulators',
 23: 'Blood products',
 24: 'Cardiovascular agents',
 25: 'Central nervous system agents',
 26: 'Dental and oral agents',
 27: 'Dermatological agents',
 28: 'Electrolytes, minerals, metals, vitamins',
 29: 'Gastrointestinal agents',
 30: 'Genetic/enzyme/protein disorder agents',
 31: 'Genitourinary agents',
 32: 'Hormonal agents (adrenal)',
 33: 'Hormonal agents (pituitary)',
 34: 'Hormonal agents (prostaglandins)',
 35: 'Hormonal agents (sex hormones)',
 36: 

In [5]:
# save the dict as JSON
import json

with open('./OUTPUT/dict-medications.json', 'w') as f:
    json.dump(medications_dict, f)

    

## history diags


In [6]:
# make a dict to map names for past_hhx column

s = "1, Arterial Hypertension | 2, Pulmonary hypertension | 3, Cardiovascular disease | 4, Diabetes | 5, Cancer | 6, Immunosuppression | 7, Chronic lung disease | 8, Chronic kidney disease | 9, Dialysis | 10, Chronic liver disease | 11, Obesity | 12, Pregnant | 13, Transient ischemic attack (TIA) | 14, Chronic neurological disorder (other than stroke/TIA) | 15, Stroke | 17, Dementia | 18, HIV or AIDS | 19, Atrial fibrillation or flutter | 21, Prior myocardial infarction | 22, Heart failure | 23, Coronary artery disease | 24, Chronic hematologic disease | 25, Malnutrition | 26, COPD (emphysema, chronic bronchitis) | 27, Psychiatric disease | 28, Asthma | 29, Rheumatologic disease | 30, Past tuberculosis infection | 31, Other"

# split s on , and |
s = s.split("| ")

# remove leading/trailing whitespace
s = [x.strip() for x in s]


# convert to a dict
s = {int(x.split(", ")[0]): x.split(", ")[1] for x in s}

past_hhx_dict = s

In [7]:
# save the dict as JSON
import json

with open('./OUTPUT/dict-past-hhx.json', 'w') as f:
    json.dump(past_hhx_dict, f)


## symptoms

In [8]:
symptoms = \
"""joint_pain_arthralgia
confusion_altered_mental_s
red_eye_conjunctivitis
seizure
diarrhea
abdominal_pain
chest_pain
shortness_of_breath_dyspne
dizziness
extremity_weakness_or_numb
fatigue
fever_38_0oc
hemoptysis_bloody_sputum
loss_of_appetite
ear_pain
sore_throat
headache
muscle_aches_myalgia
nausea_vomiting
leg_swelling_edema
loss_of_taste_lost_of_smel
skin_rash
runny_nose_rhinorrhea
wheezing_or_stridor
cough
trouble_speaking_aphasia_d
night_sweats
weight_loss"""

# make this into a list
symptoms = symptoms.split("\n")

# save the symtpoms list as JSON
import json

with open('./OUTPUT/list-symptoms.json', 'w') as f:
    json.dump(symptoms, f)

## other health hx

In [9]:
# read the Medications sheet from the data dict file
other_hx = pd.read_excel('../lc-project-data/2024-04-09-DATA-DICTIONARY.xlsx', sheet_name='PHx', header = None)


# make a dict: index: column 0

# convert the dataframe to a dict
otherhx_dict = other_hx.set_index(0).T.to_dict('records')[0]

# invert the k,v
# otherhx_dict = {v: k for k, v in otherhx_dict.items()}
otherhx_dict

{1: 'autism',
 2: 'concussion',
 3: 'hypo',
 4: 'hyper',
 5: 'MSK',
 6: 'GI',
 7: 'GERD',
 8: 'DVT',
 9: 'IBS',
 10: 'Sleep disorder',
 11: "Crohn's disease",
 12: 'BPH',
 13: 'OB',
 14: 'AAA',
 15: 'chagas',
 16: 'resp',
 17: 'UNSPECIFIED',
 18: 'Celiac',
 19: 'sickle cell anemia',
 20: 'Anemia',
 21: 'diverticulitis',
 22: 'Derm',
 23: 'ulcerative colitis',
 24: 'genetic disorder',
 25: 'fatigue',
 26: 'surgery'}

In [10]:
# save the symtpoms list as JSON
import json

with open('./OUTPUT/dict-other-hx.json', 'w') as f:
    json.dump(otherhx_dict, f)

## clinical details

In [6]:
cd = """pregnant
delivery_date_expected
postpartum
outcome
delivery_date_actual
baby_covid
oxygen
spo2
fio2
gcs
avpu
urine
pao2
sao2
ph
paco2
hco3
aexcess
alactate
ventilatory_support_type
adjunctive
mode_of_diag
covid_test_date_1
covid_test_result_1
covid_test_type_1
covid_test_date_2
covid_test_result_2
covid_test_type_2
covid_test_date_3
covid_test_result_3
covid_test_type_3
covid_test_date_4
covid_test_result_4
covid_test_type_4
covid_test_date_5
covid_test_result_5
covid_test_type_5
severity_level
hospitalized
date_of_hospitalization
date_of_discharge
required_oxygen
type_of_care
medication_received
specify_other_med
temperature
height_m
height_cm
weight_kgs
bmi
heart_rate
bp_systolic
bp_diastolic
respiratory_rate
o2_saturation"""

# make this into a list
cd = cd.split("\n")
cd

['pregnant',
 'delivery_date_expected',
 'postpartum',
 'outcome',
 'delivery_date_actual',
 'baby_covid',
 'oxygen',
 'spo2',
 'fio2',
 'gcs',
 'avpu',
 'urine',
 'pao2',
 'sao2',
 'ph',
 'paco2',
 'hco3',
 'aexcess',
 'alactate',
 'ventilatory_support_type',
 'adjunctive',
 'mode_of_diag',
 'covid_test_date_1',
 'covid_test_result_1',
 'covid_test_type_1',
 'covid_test_date_2',
 'covid_test_result_2',
 'covid_test_type_2',
 'covid_test_date_3',
 'covid_test_result_3',
 'covid_test_type_3',
 'covid_test_date_4',
 'covid_test_result_4',
 'covid_test_type_4',
 'covid_test_date_5',
 'covid_test_result_5',
 'covid_test_type_5',
 'severity_level',
 'hospitalized',
 'date_of_hospitalization',
 'date_of_discharge',
 'required_oxygen',
 'type_of_care',
 'medication_received',
 'specify_other_med',
 'temperature',
 'height_m',
 'height_cm',
 'weight_kgs',
 'bmi',
 'heart_rate',
 'bp_systolic',
 'bp_diastolic',
 'respiratory_rate',
 'o2_saturation']

In [7]:
# save the symtpoms list as JSON
import json

with open('./OUTPUT/list-clin-details.json', 'w') as f:
    json.dump(cd, f)

# COMP

In [4]:
comp = """cardiovascular_complicatio
cardiac_arrest
other_cardiac_arrhythmia
endocarditis
myocarditis
pericarditis
new_atrial_fibrillation_or
ventricular_tachycardia_or
left_ventricular_function
decompensated_heart_failur
non_st_elevation_myocardia
st_elevation
deep_vein_thrombosis_dvt
disseminated_intravascular
myocardial_infarction_stem
respiratory_complications
asthma
bronchiolitis
pulmonary_embolism_pe
pleural_effusion
interstitial_lung_disease
copd
bacterial_viral_or_cryptog
pneumothorax
acute_respiratory_distress
renal_complications
acute_kidney_injury
neurologic_complications
tia
stroke
meningitis
encephalitis
insomnia
difficulty_with_concentrat
memory_problems
brain_fog
psychiatric_complications
depression
anxiety
mood_change
gastrointestinal_complicat
liver_dysfunction
gastrointestinal_haemorrha
pancreatitis
complications_of_the_endoc
hyperglycemia
hypoglycemia
anemia
rhabdomyolysis
myositis
bacteriemia"""

# make this into a list
comp = comp.split("\n")
comp

['cardiovascular_complicatio',
 'cardiac_arrest',
 'other_cardiac_arrhythmia',
 'endocarditis',
 'myocarditis',
 'pericarditis',
 'new_atrial_fibrillation_or',
 'ventricular_tachycardia_or',
 'left_ventricular_function',
 'decompensated_heart_failur',
 'non_st_elevation_myocardia',
 'st_elevation',
 'deep_vein_thrombosis_dvt',
 'disseminated_intravascular',
 'myocardial_infarction_stem',
 'respiratory_complications',
 'asthma',
 'bronchiolitis',
 'pulmonary_embolism_pe',
 'pleural_effusion',
 'interstitial_lung_disease',
 'copd',
 'bacterial_viral_or_cryptog',
 'pneumothorax',
 'acute_respiratory_distress',
 'renal_complications',
 'acute_kidney_injury',
 'neurologic_complications',
 'tia',
 'stroke',
 'meningitis',
 'encephalitis',
 'insomnia',
 'difficulty_with_concentrat',
 'memory_problems',
 'brain_fog',
 'psychiatric_complications',
 'depression',
 'anxiety',
 'mood_change',
 'gastrointestinal_complicat',
 'liver_dysfunction',
 'gastrointestinal_haemorrha',
 'pancreatitis',
 'com

In [5]:
# save the symtpoms list as JSON
import json

with open("./OUTPUT/list-complications-medcon.json", "w") as f:
    json.dump(comp, f)