In [None]:
import pandas as pd
import numpy as np
import pickle

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. The highest available pickle protocol is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object pickled in ``name + '.pkl'``.

    Returns ``False`` (not ``None``, and no exception) when the file does not
    exist, so callers have to check the result before using it as data.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files you wrote yourself.
    """
    source_path = name + '.pkl'
    try:
        with open(source_path, 'rb') as handle:
            loaded = pickle.load(handle)
    except FileNotFoundError:
        # Missing file is reported through the sentinel value rather than an error.
        return False
    return loaded

In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load data
## Note: "Observations" is deliberately not loaded because of memory issues.

def _read_csv_and_announce(csv_path, label):
    """Read ``csv_path`` into a DataFrame, then print ``label`` as a progress marker."""
    frame = pd.read_csv(csv_path)
    print(label)
    return frame

encounters = _read_csv_and_announce('/content/drive/MyDrive/SIAP/data/encounters.csv', "encounters")
conditions = _read_csv_and_announce('/content/drive/MyDrive/SIAP/data/conditions.csv', "conditions")
careplans = _read_csv_and_announce('/content/drive/MyDrive/SIAP/data/careplans.csv', "careplans")
devices = _read_csv_and_announce('/content/drive/MyDrive/SIAP/data/devices.csv', "devices")
procedures = _read_csv_and_announce('/content/drive/MyDrive/SIAP/data/procedures.csv', "procedures")
medications = _read_csv_and_announce('/content/drive/MyDrive/SIAP/data/medications.csv', "medications")

patients = _read_csv_and_announce('/content/drive/MyDrive/SIAP/data/patients.csv', "patients")
# To inspect a table, run e.g. display(patients.head(10)).

encounters
conditions
careplans
devices
procedures
medications
patients


In [None]:
# Cell for various tests in between. 
# display(procedures.head(10))

In [None]:
# Drop encounters whose reason is one of: 'Bullet wound', 'Drug overdose', 'Suicide - firearms', 'Suicide - suffocation'.
## Why? There is no strong/noticeable correlation between medical history and these death reasons.

_reasons_to_drop = ['Bullet wound', 'Drug overdose', 'Suicide - firearms', 'Suicide - suffocation']
_is_dropped_reason = encounters['REASONDESCRIPTION'].isin(_reasons_to_drop)
encounters = encounters[~_is_dropped_reason]

In [None]:
# Prepare data for filtering & processing.

# Death date of every patient, keyed by patient id so it can be joined onto the event tables.
patients_death = patients[['Id', 'DEATHDATE']]
_death_date_by_patient = patients_death.set_index('Id')

def _attach_death_date(events):
    """Left-join the patient's DEATHDATE onto every row of ``events`` (matched on its PATIENT column)."""
    return events.join(_death_date_by_patient, on='PATIENT', how='left', lsuffix='L')

# The death date on every row is used later to compute how far each record lies from death.
encounters = _attach_death_date(encounters)
conditions = _attach_death_date(conditions)
careplans = _attach_death_date(careplans)
devices = _attach_death_date(devices)
procedures = _attach_death_date(procedures)
medications = _attach_death_date(medications)

In [None]:
# Process dates so the distance of each record from the death date can be calculated.
def _as_calendar_date(column):
    """Parse ``column`` with ``pd.to_datetime`` and keep only the date part of every value."""
    return pd.to_datetime(column).dt.date

# These tables keep the event date in START ...
for _events in (encounters, conditions, careplans, devices, medications):
    _events['DEATHDATE'] = _as_calendar_date(_events['DEATHDATE'])
    _events['START'] = _as_calendar_date(_events['START'])

# ... while procedures keeps it in DATE.
procedures['DEATHDATE'] = _as_calendar_date(procedures['DEATHDATE'])
procedures['DATE'] = _as_calendar_date(procedures['DATE'])

In [None]:
# Calculate the distance of every record from death, in years ("yfd").
def _years_before_death(events, event_date_col):
    """Days between ``events[event_date_col]`` and ``events['DEATHDATE']``, divided by 365."""
    elapsed = events['DEATHDATE'] - events[event_date_col]
    return elapsed.dt.days / 365

encounters['yfd'] = _years_before_death(encounters, 'START')
conditions['yfd'] = _years_before_death(conditions, 'START')
careplans['yfd'] = _years_before_death(careplans, 'START')
devices['yfd'] = _years_before_death(devices, 'START')
procedures['yfd'] = _years_before_death(procedures, 'DATE')
medications['yfd'] = _years_before_death(medications, 'START')


In [None]:
# Filter out the most recent data (recorded 0.3 years or less before the death date).
## Why?
## 1) Save RAM.
## 2) Make the problem harder -> data close to death correlates directly with the reason of death.
## The point is to predict the reason of death for a moderately healthy person, not an already dying one.
## Rows whose `yfd` is missing (NaN) also fail the comparison and are dropped.

# Cut-off in years; presumably the "0_3" in the pickle file names refers to this value.
MIN_YEARS_BEFORE_DEATH = 0.3

encounters = encounters[encounters['yfd'] > MIN_YEARS_BEFORE_DEATH]
conditions = conditions[conditions['yfd'] > MIN_YEARS_BEFORE_DEATH]
careplans = careplans[careplans['yfd'] > MIN_YEARS_BEFORE_DEATH]
devices = devices[devices['yfd'] > MIN_YEARS_BEFORE_DEATH]
procedures = procedures[procedures['yfd'] > MIN_YEARS_BEFORE_DEATH]
medications = medications[medications['yfd'] > MIN_YEARS_BEFORE_DEATH]

In [None]:
# Workaround: constant column that the pivoting step uses as the cell value of each new column.
# `.assign` returns a new frame instead of writing the column into the filtered frames produced
# by the boolean indexing above, which is what makes pandas emit SettingWithCopyWarning.

encounters = encounters.assign(FOR_COUNTING=1)
conditions = conditions.assign(FOR_COUNTING=1)
careplans = careplans.assign(FOR_COUNTING=1)
devices = devices.assign(FOR_COUNTING=1)
procedures = procedures.assign(FOR_COUNTING=1)
medications = medications.assign(FOR_COUNTING=1)

In [None]:
# Pivot dataframes.
## Example:
## "Anemia (disorder)" as a value of the "DESCRIPTION" column in the conditions dataframe becomes a column in the pivoted DF.
## Every patient diagnosed with this condition gets a value of 1 (one) in that column, on the row of the encounter where it was found.
### Note: pivot_table takes the cell values from the columns that are used neither as index nor as columns.
### Here that is only the "FOR_COUNTING" column.

def _pivot_to_indicator_columns(events, encounter_col, category_col):
    """Pivot ``events`` to one row per (encounter, patient) and one column per ``category_col`` value."""
    selected = events[[encounter_col, 'PATIENT', category_col, 'FOR_COUNTING']]
    return pd.pivot_table(selected, index=[encounter_col, 'PATIENT'], columns=[category_col], fill_value=0)

# encounters identifies the encounter by 'Id' and is pivoted on its reason; the other tables
# reference the encounter through 'ENCOUNTER' and are pivoted on their description.
encounters = _pivot_to_indicator_columns(encounters, 'Id', 'REASONDESCRIPTION')
conditions = _pivot_to_indicator_columns(conditions, 'ENCOUNTER', 'DESCRIPTION')
careplans = _pivot_to_indicator_columns(careplans, 'ENCOUNTER', 'DESCRIPTION')
devices = _pivot_to_indicator_columns(devices, 'ENCOUNTER', 'DESCRIPTION')
procedures = _pivot_to_indicator_columns(procedures, 'ENCOUNTER', 'DESCRIPTION')
medications = _pivot_to_indicator_columns(medications, 'ENCOUNTER', 'DESCRIPTION')


## To do: figure out why this was needed. Possibly a leftover from trial/error.
# conditions_pivoted.fillna(0)
# careplans_pivoted.fillna(0)
# devices_pivoted.fillna(0)
# procedures_pivoted.fillna(0)
# medications_pivoted.fillna(0)

In [None]:
# Turn the (encounter, patient) row index of every pivoted table back into ordinary columns.
(
    encounters_pivoted,
    conditions_pivoted,
    careplans_pivoted,
    devices_pivoted,
    procedures_pivoted,
    medications_pivoted,
) = (
    pivoted.reset_index()
    for pivoted in (encounters, conditions, careplans, devices, procedures, medications)
)

In [None]:
# Persist every pivoted table so the expensive steps above do not have to be repeated.
for _pivoted, _pickle_name in (
    (encounters_pivoted, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_encounters_pivoted"),
    (conditions_pivoted, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_conditions_pivoted"),
    (careplans_pivoted, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_careplans_pivoted"),
    (devices_pivoted, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_devices_pivoted"),
    (procedures_pivoted, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_procedures_pivoted"),
    (medications_pivoted, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_medications_pivoted"),
):
    save_obj(_pivoted, _pickle_name)

In [None]:
# Reload the pivoted tables from their pickles (load_obj returns False for a missing file).
_pivot_pickles = {
    'encounters': "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_encounters_pivoted",
    'conditions': "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_conditions_pivoted",
    'careplans': "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_careplans_pivoted",
    'devices': "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_devices_pivoted",
    'procedures': "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_procedures_pivoted",
    'medications': "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_medications_pivoted",
}
encounters_pivoted = load_obj(_pivot_pickles['encounters'])
conditions_pivoted = load_obj(_pivot_pickles['conditions'])
careplans_pivoted = load_obj(_pivot_pickles['careplans'])
devices_pivoted = load_obj(_pivot_pickles['devices'])
procedures_pivoted = load_obj(_pivot_pickles['procedures'])
medications_pivoted = load_obj(_pivot_pickles['medications'])

In [None]:
# Combine all data into one DF, matched by encounter and patient.
# Every step is a right join onto the frame built so far, so the set of rows is fixed by
# encounters_pivoted. NOTE(review): confirm that dropping records whose encounter is not in
# encounters_pivoted is intended.
_encounter_key = ['ENCOUNTER', 'PATIENT']

join = conditions_pivoted.join(encounters_pivoted.set_index(['Id', 'PATIENT']), on=_encounter_key, how='right', lsuffix='_CO')
for _pivoted, _suffix in (
    (careplans_pivoted, '_CP'),
    (devices_pivoted, '_D'),
    (procedures_pivoted, '_P'),
    (medications_pivoted, '_M'),
):
    join = _pivoted.join(join.set_index(_encounter_key), on=_encounter_key, how='right', lsuffix=_suffix)

In [None]:
# join[join['PATIENT'] == '364e0a91-7cf8-d278-ae57-034f11674e84']
# Cells left empty (NaN) by the joins are filled with 0; then the first rows are previewed.
join = join.fillna(0)
join.head(5)

Unnamed: 0_level_0,ENCOUNTER,PATIENT,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING,FOR_COUNTING
DESCRIPTION,Unnamed: 1_level_1,Unnamed: 2_level_1,0.25 ML Leuprolide Acetate 30 MG/ML Prefilled Syringe,0.4 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe,0.67 ML anakinra 149 MG/ML Prefilled Syringe,1 ML DOCEtaxel 20 MG/ML Injection,1 ML Enoxaparin sodium 150 MG/ML Prefilled Syringe,1 ML Epinephrine 1 MG/ML Injection,1 ML Epoetin Alfa 4000 UNT/ML Injection [Epogen],1 ML Morphine Sulfate 5 MG/ML Injection,1 ML denosumab 60 MG/ML Prefilled Syringe,1 ML heparin sodium porcine 5000 UNT/ML Injection,1 ML medroxyPROGESTERone acetate 150 MG/ML Injection,1 ML medroxyprogesterone acetate 150 MG/ML Injection,10 ML Alfentanil 0.5 MG/ML Injection,10 ML Doxorubicin Hydrochloride 2 MG/ML Injection,10 ML Fentanyl 0.05 MG/ML Injection,10 ML Fluorouracil 50 MG/ML Injection,10 ML Furosemide 10 MG/ML Injection,10 ML Pamidronate Disodium 3 MG/ML Injection,10 ML oxaliplatin 5 MG/ML Injection,100 ML Epirubicin Hydrochloride 2 MG/ML Injection,100 ML Propofol 10 MG/ML Injection,100 ML zoledronic acid 0.04 MG/ML Injection,12 HR Cefaclor 500 MG Extended Release Oral Tablet,12 HR Hydrocodone Bitartrate 10 MG Extended Release Oral Capsule,120 ACTUAT Fluticasone propionate 0.044 MG/ACTUAT Metered Dose Inhaler,150 ML vancomycin 5 MG/ML Injection,168 HR Ethinyl Estradiol 0.00146 MG/HR / norelgestromin 0.00625 MG/HR Transdermal System,2 ML Ondansetron 2 MG/ML Injection,20 ML Ciprofloxacin 10 MG/ML Injection,20 ML tocilizumab 20 MG/ML Injection,24 HR Donepezil hydrochloride 10 MG / Memantine hydrochloride 28 MG Extended Release Oral Capsule,24 HR Metformin hydrochloride 500 MG Extended Release Oral Tablet,24hr nicotine transdermal patch,3 ML Amiodarone hydrocholoride 50 MG/ML Prefilled Syringe,3 ML liraglutide 6 MG/ML Pen Injector,4 ML norepinephrine 1 MG/ML Injection,5 ML SUFentanil 0.05 MG/ML Injection,5 ML fulvestrant 50 MG/ML Prefilled Syringe,...,Non-small cell carcinoma of lung TNM stage 1 (disorder),Non-small cell carcinoma of lung TNM stage 2 (disorder),Non-small cell lung cancer 
(disorder),Normal pregnancy,Open Removal of Gall Bladder,Opioid abuse (disorder),Osteoarthritis of hip,Osteoarthritis of knee,Otitis media,Overlapping malignant neoplasm of colon,Oxygen Therapy,Perennial allergic rhinitis,Perennial allergic rhinitis with seasonal variation,Polyp of colon,Posttraumatic stress disorder,Primary fibromyalgia syndrome,Primary malignant neoplasm of colon,Primary small cell malignant neoplasm of lung TNM stage 1 (disorder),Pulmonary emphysema (disorder),Pyelonephritis,Rheumatoid arthritis,Rupture of patellar tendon,Seasonal allergic rhinitis,Second degree burn,Secondary malignant neoplasm of colon,Seizure disorder,Sepsis (disorder),Sepsis caused by Pseudomonas (disorder),Sepsis caused by Staphylococcus aureus,Septic shock (disorder),Shock (disorder),Sinusitis (disorder),Small cell carcinoma of lung (disorder),Spina bifida occulta (disorder),Streptococcal sore throat (disorder),Suicidal deliberate poisoning,Suspected lung cancer (situation),Third degree burn,Tubal pregnancy,Viral sinusitis (disorder)
341942,000030b3-eb6b-0aec-b6ab-d5a5198e8be1,680c093a-7f94-1612-c6be-89650d4e9b9e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
341942,00005026-4a19-3d36-2068-a58cf5c8ccb8,8f36f44b-fca2-86cf-1c1b-d9eb6ae127fb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
341942,00008289-83ee-2769-3de1-6620e256a601,14dc8ba3-20b0-e131-46b8-9ebcb92f95f8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
341942,0000a876-a46b-59f4-5fdb-a6c1b2b60057,f661112a-306b-24c3-8c49-cd9eb9932a69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
341942,0000e03b-2c8b-14d1-aa01-c1e56416f6df,65ab6d59-20e2-04fb-f6ea-26374ed3b0fc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# The next few cells generate the large vectors used for training the autoencoder.
# With a lot of data the RAM fills up, so the processing was done in 2 steps -> first / second half.
# At the end the halves were combined and saved.
# This cell persists the combined `join` frame.
save_obj(join, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_joined")

In [None]:
# Save the initial combined data as an intermediate result.
# NOTE(review): this stores the same `join` object that was already saved with the "_joined"
# suffix; this second file is the one reloaded as `small_testing` afterwards.
save_obj(join, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3")

In [None]:

# Reload the combined frame (load_obj returns False if the pickle is missing).
small_testing = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3")
# test = test.groupby(test['PATIENT'])
# small_testing = join
# reset_index() moves the current row index into a new leading column.
small_testing = small_testing.reset_index()

In [None]:
# Number of rows in the combined frame (presumably used to pick the split sizes below).
small_testing.shape[0]

328655

In [None]:
# Used while testing methods for reshaping data. Keep for reference. 
# t = test.columns.to_flat_index().values
# t = [list(xi) for xi in t]
# # print(t)
# t = np.delete(t, 0, axis=1)
# t = t[4:]
# t1 = [list(x) for x in t if x != '']
# print(t1)
# test3 = pd.pivot_table(test, index=[['PATIENT']], columns=[t1], aggfunc="list", fill_value=0)
# test3.head()

[['0.25 ML Leuprolide Acetate 30 MG/ML Prefilled Syringe'], ['0.3 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe'], ['0.4 ML Enoxaparin sodium 100 MG/ML Prefilled Syringe'], ['0.67 ML anakinra 149 MG/ML Prefilled Syringe'], ['1 ML DOCEtaxel 20 MG/ML Injection'], ['1 ML Enoxaparin sodium 150 MG/ML Prefilled Syringe'], ['1 ML Epinephrine 1 MG/ML Injection'], ['1 ML Epoetin Alfa 4000 UNT/ML Injection [Epogen]'], ['1 ML Morphine Sulfate 5 MG/ML Injection'], ['1 ML Vasopressin (USP) 20 UNT/ML Injection'], ['1 ML denosumab 60 MG/ML Prefilled Syringe'], ['1 ML heparin sodium  porcine 5000 UNT/ML Injection'], ['1 ML medroxyPROGESTERone acetate 150 MG/ML Injection'], ['1 ML medroxyprogesterone acetate 150 MG/ML Injection'], ['10 ML Alfentanil 0.5 MG/ML Injection'], ['10 ML Doxorubicin Hydrochloride 2 MG/ML Injection'], ['10 ML Fentanyl 0.05 MG/ML Injection'], ['10 ML Fluorouracil 50 MG/ML Injection'], ['10 ML Furosemide 10 MG/ML Injection'], ['10 ML Pamidronate Disodium 3 MG/ML Injection'], [

In [None]:
# NOTE(review): small_testing was already reset_index()'d right after loading; this second
# call prepends one more positional column. It shifts where the feature columns start, so
# keep it in sync with the column offset used when the vectors are extracted.
small_testing = small_testing.reset_index()

In [None]:
# NOTE(review): despite the "first_half" name, tail() selects the LAST 178654 rows; the result
# is later saved under the ..._SECOND_HALF_BEFORE_GROUPING name. Presumably the first half was
# produced in a separate run of these cells — confirm.
temp_only_values_first_half = small_testing.tail(178654)

In [None]:
# Combine all feature columns into a list -> a vector, to be placed in a separate column ('test_col').
# Do not change the name of that column since it is used in other colabs.
# NOTE(review): the slice assumes the first 7 columns are not features (ids / index columns) —
# TODO confirm the offset against the actual column layout.
temp_only_values = temp_only_values_first_half.values[:, 7:]


In [None]:
# Store each row's feature vector as a list in 'test_col'.
# A new frame is built instead of assigning into the tail() slice, which avoids pandas'
# SettingWithCopyWarning. Only PATIENT is carried over because PATIENT and test_col are the
# only columns selected from this frame afterwards, so the wide feature columns are not copied.
temp_only_values_first_half = temp_only_values_first_half[['PATIENT']].assign(
    test_col=temp_only_values.tolist()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Preview the frame after adding 'test_col'.
temp_only_values_first_half.head()

In [None]:
# Keep only the patient id and the per-encounter vector.
temp_only_values_first_half = temp_only_values_first_half[['PATIENT', 'test_col']]

In [None]:
# Persist this half. NOTE(review): the variable is named "first_half" but the file is the SECOND half.
save_obj(temp_only_values_first_half, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_SECOND_HALF_BEFORE_GROUPING")


In [None]:
# Load both halves. NOTE(review): the FIRST_HALF pickle is not written by any cell shown in
# this notebook; it must already exist on Drive, otherwise load_obj returns False here.
temp_only_values_first_half_0 = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_FIRST_HALF_BEFORE_GROUPING")
temp_only_values_second_half_1 = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_SECOND_HALF_BEFORE_GROUPING")

In [None]:
# Stack the two halves; from here on `temp_only_values_first_half_0` holds the FULL data set.
temp_only_values_first_half_0 = pd.concat([temp_only_values_first_half_0, temp_only_values_second_half_1])
temp_only_values_first_half_0.head(2)

Unnamed: 0_level_0,PATIENT,test_col
DESCRIPTION,Unnamed: 1_level_1,Unnamed: 2_level_1
0,680c093a-7f94-1612-c6be-89650d4e9b9e,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,8f36f44b-fca2-86cf-1c1b-d9eb6ae127fb,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Persist the combined (still one row per encounter) data.
save_obj(temp_only_values_first_half_0, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_FULL_BEFORE_GROUPING")

In [None]:
# Reload the combined data (allows resuming from here after a kernel restart).
temp_only_values_first_half_0 = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_0_3_FULL_BEFORE_GROUPING")

In [None]:
# Group the rows by patient. Note that the name now refers to a GroupBy object, not a DataFrame.
temp_only_values_first_half_0 = temp_only_values_first_half_0.groupby(temp_only_values_first_half_0['PATIENT'])

In [None]:
# Collapse every group into one row: each column becomes the list of that patient's values.
temp_only_values_first_half_0 = temp_only_values_first_half_0.agg(list)

In [None]:
# Move the group key out of the row index into a column, then inspect the column labels.
temp_only_values_first_half_0 = temp_only_values_first_half_0.reset_index()
temp_only_values_first_half_0.columns

In [None]:
# Flatten the two-level column labels by keeping only the first level.
temp_only_values_first_half_0.columns = temp_only_values_first_half_0.columns.get_level_values(0)
temp_only_values_first_half_0.columns

Index(['PATIENT', 'test_col'], dtype='object')

In [None]:
# Rename the patient id column to 'Index'.
# NOTE(review): after this rename the frame has no 'PATIENT' column any more; anything that
# joins on this frame afterwards has to key on 'Index'.
temp_only_values_first_half_0 = temp_only_values_first_half_0.rename(columns={'PATIENT': 'Index'})

In [None]:
# The later classification needs the reason of death of every patient.
## This cell reloads the raw tables and derives it.

patients = pd.read_csv('/content/drive/MyDrive/SIAP/data/patients.csv')
encounters = pd.read_csv('/content/drive/MyDrive/SIAP/data/encounters.csv')

# Same exclusion of death reasons as applied to the encounters at the start of the notebook.
_ignored_reasons = ['Bullet wound', 'Drug overdose', 'Suicide - firearms', 'Suicide - suffocation']
encounters = encounters.loc[~encounters['REASONDESCRIPTION'].isin(_ignored_reasons)]

# Presumably the 'Death Certification' encounter carries the reason of death in REASONDESCRIPTION.
_is_death_certification = encounters['DESCRIPTION'] == 'Death Certification'
death_encounters = encounters.loc[_is_death_certification]
patients_death = patients[['Id', 'DEATHDATE']]
# patients_death.head()

# Right join: every patient from patients.csv is kept, with or without a death certification.
_death_rows = death_encounters.join(patients_death.set_index('Id'), on='PATIENT', how='right', lsuffix="_L")
patients_death_reason = _death_rows[['PATIENT', 'REASONDESCRIPTION']]

In [None]:
# Add every patient's reason of death.
# The patient id column was renamed from 'PATIENT' to 'Index' earlier, so joining on 'PATIENT'
# raises KeyError when the cells run top to bottom. Use 'Index' when that column exists and
# fall back to 'PATIENT' (a column or an index level) for frames that were not renamed.
_patient_key = 'Index' if 'Index' in temp_only_values_first_half_0.columns else 'PATIENT'
temp_only_values_first_half_0 = temp_only_values_first_half_0.join(patients_death_reason.set_index('PATIENT'), on=_patient_key, how='left', lsuffix="_L", rsuffix='_R')

In [None]:
# Keep only the per-patient vectors and the label (reason of death), then preview.
temp_only_values_first_half_0 = temp_only_values_first_half_0[['test_col', 'REASONDESCRIPTION']]
temp_only_values_first_half_0.head(2)

Unnamed: 0_level_0,test_col,REASONDESCRIPTION
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1
00032a55-fb87-c742-ad10-0773a82bb52b,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
00058442-c52b-8e4e-6297-a4063fe79a14,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause


In [None]:
# Replace the row index with a fresh positional one; the old index becomes a column that is
# dropped again by the column selection that follows.
temp_only_values_first_half_0 = temp_only_values_first_half_0.reset_index()

In [None]:
# Select the two output columns again (drops the column added by reset_index) and preview.
temp_only_values_first_half_0 = temp_only_values_first_half_0[['test_col', 'REASONDESCRIPTION']]
temp_only_values_first_half_0.head(2)

Unnamed: 0,test_col,REASONDESCRIPTION
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause


In [None]:
# Persist the final per-patient frame: list of encounter vectors + reason of death.
save_obj(temp_only_values_first_half_0, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL")

In [None]:
# Originally used to try to pad the data, but there was not enough RAM.
# So it is used to generate the labels separately.
data_to_pad = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL")

In [None]:
# Extract only the labels (reason of death) of the first 18300 rows, to be saved separately.
labels = data_to_pad['REASONDESCRIPTION'][0:18300]

18300


'Sudden Cardiac Death'

In [None]:
# Save only the labels. Previously the whole `data_to_pad` frame (vectors included) was pickled
# under the LABELS name, which defeats the purpose of a small, separate labels file.
# A one-column DataFrame is stored (rather than a Series) so that readers that index the
# loaded object with ['REASONDESCRIPTION'] keep working.
save_obj(data_to_pad[['REASONDESCRIPTION']].iloc[0:18300], "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_LABELS_18300")

In [None]:
# Keep only the vectors of rows 9000..18299 — the second half, later saved as ..._PADDED_SECOND_HALF.
# NOTE(review): presumably the first half ([0:9000]) was produced in a separate run — confirm.
data_to_pad = data_to_pad['test_col'][9000:18300]

In [None]:
# Number of patient sequences selected for padding.
len(data_to_pad)

9300

In [None]:
# Convert the selected Series to a NumPy array (via its underlying pandas array).
data_to_pad = data_to_pad.array
data_to_pad = data_to_pad.to_numpy()

In [None]:
# Pad / truncate every patient's list of encounter vectors to a fixed length.
# NOTE(review): tensorflow is imported here rather than with the other imports at the top.
import tensorflow as tf
input_seq_len = 30  # length every sequence is padded or truncated to
num_of_features = 789  # not used in this cell; presumably the width of one encounter vector
data_to_pad = np.array(data_to_pad)
# padding='pre' adds the fill value at the start of short sequences and truncating='pre'
# removes entries from the start of long ones; dtype='int32' makes the result an integer array.
data_to_pad = tf.keras.preprocessing.sequence.pad_sequences(
    data_to_pad, maxlen=input_seq_len, dtype='int32', padding='pre',
    truncating='pre', value=0.0
)

In [None]:
# Persist the padded second half.
save_obj(data_to_pad, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL_PADDED_SECOND_HALF")

In [None]:
# Load both padded halves. NOTE(review): the ..._PADDED_FIRST_HALF pickle is not written by any
# cell shown in this notebook; it must already exist, otherwise load_obj returns False here.
padded_data_0 = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL_PADDED_FIRST_HALF")
padded_data_1 = load_obj("/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL_PADDED_SECOND_HALF")

In [None]:
# Concatenate the halves along the first axis; `padded_data_0` now holds the complete array.
padded_data_0 = np.concatenate((padded_data_0, padded_data_1))
len(padded_data_0)

18300

In [None]:
# Persist the complete padded array. The concatenation result is bound to `padded_data_0`;
# `padded_data_final` is not assigned in any visible cell, so saving it raised NameError.
save_obj(padded_data_0, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete_0_3_FINAL_PADDED_ARRAY")

In [None]:
# NOTE: From here to the end can be used to generate raw vectors from a smaller amount of data.
# Extract only the needed data -> patient ID and the vector of each of their encounters.
# NOTE(review): this requires `small_testing` to already have a 'test_col' column; the cells
# above add that column to `temp_only_values_first_half` only — confirm how it gets here.
small_testing = small_testing[['PATIENT', 'test_col']]


In [None]:
# Move the row index into an 'index' column (it shows up in the grouped output further down).
small_testing = small_testing.reset_index()

In [None]:
# Group data by patient.
small_testing = small_testing.groupby(small_testing['PATIENT'])

In [None]:
# Combine all of a patient's data into one row. "test_col" now holds all vectors of one patient - one per encounter.
# The other columns are aggregated into lists as well; the result is indexed by PATIENT.
small_testing = small_testing.agg(list)
small_testing.head(2)

Unnamed: 0_level_0,index,PATIENT,test_col
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00032a55-fb87-c742-ad10-0773a82bb52b,[2457],[00032a55-fb87-c742-ad10-0773a82bb52b],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
00058442-c52b-8e4e-6297-a4063fe79a14,"[4517, 13967, 16501, 29196, 49453, 49858, 5206...","[00058442-c52b-8e4e-6297-a4063fe79a14, 0005844...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
# small_testing = small_testing['test_col']


In [None]:
# The later classification needs the reason of death of every patient.
## This cell reloads the raw tables and derives it.

patients = pd.read_csv('/content/drive/MyDrive/SIAP/data/patients.csv')
encounters = pd.read_csv('/content/drive/MyDrive/SIAP/data/encounters.csv')

# Death reasons that are excluded throughout the notebook.
_ignored_reasons = ['Bullet wound', 'Drug overdose', 'Suicide - firearms', 'Suicide - suffocation']
_keep_encounter = ~encounters['REASONDESCRIPTION'].isin(_ignored_reasons)
encounters = encounters.loc[_keep_encounter]

# Presumably the 'Death Certification' encounter carries the reason of death in REASONDESCRIPTION.
death_encounters = encounters.loc[encounters['DESCRIPTION'] == 'Death Certification']
patients_death = patients[['Id', 'DEATHDATE']]
# patients_death.head()

# Right join: every patient from patients.csv is kept, with or without a death certification.
_death_date_by_patient = patients_death.set_index('Id')
_death_rows = death_encounters.join(_death_date_by_patient, on='PATIENT', how='right', lsuffix="_L")
patients_death_reason = _death_rows[['PATIENT', 'REASONDESCRIPTION']]

In [None]:
# patients_death_reason.head(2)
# After groupby(...).agg(list) the frame has PATIENT both as its row index and as a column, so
# a plain reset_index() fails because the PATIENT column already exists. Naming the index
# 'Index' first avoids the clash and produces the 'Index' key column that the join with the
# death reasons uses.
small_testing = small_testing.rename_axis('Index').reset_index()

In [None]:
# `df2` is just a second name for the same object (no copy is made); inspect its column labels.
df2 = small_testing
df2.columns

MultiIndex([(   'Index', ''),
            (   'index', ''),
            ( 'PATIENT', ''),
            ('test_col', '')],
           )

In [None]:
# Flatten the two-level column labels by keeping only the first level.
# Because `df2` and `small_testing` are the same object, this changes both.
df2.columns = df2.columns.get_level_values(0)
df2.columns

Index(['Index', 'index', 'PATIENT', 'test_col'], dtype='object')

In [None]:
# Add a column with the reason of death to the data, matching df2['Index'] (the patient id)
# against the PATIENT column of patients_death_reason.
# small_testing = small_testing.rename_axis('Index') # - some debugging leftover. Keep for reference.

to_save = df2.join(patients_death_reason.set_index('PATIENT'), on='Index', how='left' , lsuffix="_L", rsuffix='_R')

In [None]:
# Preview the joined result.
to_save.head()

Unnamed: 0,Index,index,PATIENT,test_col,REASONDESCRIPTION
0,00032a55-fb87-c742-ad10-0773a82bb52b,[2457],[00032a55-fb87-c742-ad10-0773a82bb52b],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
1,00058442-c52b-8e4e-6297-a4063fe79a14,"[4517, 13967, 16501, 29196, 49453, 49858, 5206...","[00058442-c52b-8e4e-6297-a4063fe79a14, 0005844...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause
2,000cca33-5892-7015-edb0-e714ac012990,"[80605, 96188, 132677]","[000cca33-5892-7015-edb0-e714ac012990, 000cca3...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
3,000d0b7f-6196-f285-a9cb-4ead2b5e04ea,"[3996, 5460, 24027, 30893, 39071, 114360, 1453...","[000d0b7f-6196-f285-a9cb-4ead2b5e04ea, 000d0b7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Myocardial Infarction
4,000ee730-2474-459a-72ea-f31892298013,"[44717, 56408, 221852]","[000ee730-2474-459a-72ea-f31892298013, 000ee73...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death


In [None]:
# Some trial/error. Keep for reference. 
# to_save_test = to_save
# to_save.reset_index()

Unnamed: 0,Index,"(index, )","(PATIENT, )","(test_col, )",REASONDESCRIPTION
0,00032a55-fb87-c742-ad10-0773a82bb52b,[2457],[00032a55-fb87-c742-ad10-0773a82bb52b],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
1,00058442-c52b-8e4e-6297-a4063fe79a14,"[4517, 13967, 16501, 29196, 49453, 49858, 5206...","[00058442-c52b-8e4e-6297-a4063fe79a14, 0005844...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause
2,000cca33-5892-7015-edb0-e714ac012990,"[80605, 96188, 132677]","[000cca33-5892-7015-edb0-e714ac012990, 000cca3...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
3,000d0b7f-6196-f285-a9cb-4ead2b5e04ea,"[3996, 5460, 24027, 30893, 39071, 114360, 1453...","[000d0b7f-6196-f285-a9cb-4ead2b5e04ea, 000d0b7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Myocardial Infarction
4,000ee730-2474-459a-72ea-f31892298013,"[44717, 56408, 221852]","[000ee730-2474-459a-72ea-f31892298013, 000ee73...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
...,...,...,...,...,...
16836,fff977c2-7cc8-dd7d-b6f4-738953009683,"[384, 2949, 4358, 4617, 6067, 7012, 8577, 1046...","[fff977c2-7cc8-dd7d-b6f4-738953009683, fff977c...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Chronic congestive heart failure (disorder)
16837,fffb6ad6-12a7-f732-58f1-9b05ac017b03,"[97704, 129127, 152690, 166220, 167055, 239767]","[fffb6ad6-12a7-f732-58f1-9b05ac017b03, fffb6ad...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Myocardial Infarction
16838,fffe1250-d45a-9c5e-a20b-ea1ecf5524c8,"[497, 4512, 32376, 77894, 77911, 126507, 12884...","[fffe1250-d45a-9c5e-a20b-ea1ecf5524c8, fffe125...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",
16839,fffe7860-0fd0-918e-5779-5c7b8628d9af,"[12578, 37534, 118218, 168715]","[fffe7860-0fd0-918e-5779-5c7b8628d9af, fffe786...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause


In [None]:
# Extract the final needed data -> the vectors of every patient (one per encounter) & the reason of the patient's death.
to_save = to_save[['test_col', 'REASONDESCRIPTION']]
to_save.head()

Unnamed: 0,test_col,REASONDESCRIPTION
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Natural death with unknown cause
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Myocardial Infarction
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",Sudden Cardiac Death


In [None]:
# Check data before saving.
# NOTE(review): this displays the repr of the whole array; a small slice would keep the output short.
np.asarray(to_save.values)

array([[list([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [None]:
# Persist the result of the small-data path (vectors + reason of death per patient).
save_obj(to_save, "/content/drive/MyDrive/SIAP/helpers/vectors_lstm_no_observations_older_correct_complete")