In [3]:
%pip install gensim
#%pip install pyhealth

Collecting gensim
  Using cached gensim-4.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.1 smart-open-6.3.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
import pandas as pd
import numpy as np
#import pyhealth
import os
import warnings
import sagemaker
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Input
from tensorflow.keras.models import Model
#from sagemaker.tensorflow import TensorFlow


We will first import the data needed for this experiment

In [23]:
#1. The PubMed word2vec vectors
w2vec_url = 'http://evexdb.org/pmresources/vec-space-models/'
w2vec_filename = 'PubMed-and-PMC-w2v.bin'
if not os.path.isfile('./data/' + w2vec_filename):
    !cd data
    !wget {w2vec_url}{w2vec_filename}

In [24]:
#2. The admissions records from MIMIC III

admission_df = pd.read_csv(os.getcwd() + '/data/ADMISSIONS.csv.gz')
admission_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [25]:
#3. The ICD Diagnoses records from MIMIC III
diagnoses_df = pd.read_csv(os.getcwd() + '/data/DIAGNOSES_ICD.csv.gz')
diagnoses_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254


In [26]:
%%time
#4. The notes records from MIMIC III

notes_df = pd.read_csv(os.getcwd() + '/data/NOTEEVENTS.csv.gz')
notes_df.head()



CPU times: user 52.9 s, sys: 2.31 s, total: 55.2 s
Wall time: 55.2 s


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [27]:
# ICD-9 codes for heart failure. The paper had the origional ICD numbers with decimel in place. The MIMIC dataset removed all decimels.
# The codes below match the MIMIC III ICD codes (i.e. without decimals)
hf_icd9_codes = [
    '39891', '40201', '40211', '40291', '40401', '40403', '40411',
    '40413', '40491', '40493', '4280', '4281', '42820', '42821',
    '42822', '42823', '42830', '42831', '42832', '42833', '42840',
    '42841', '42842', '42843', '4289'
]

In [28]:
#Filter out all diagnoses that does not pertain to heart failure diagnoses
hf_diagnoses_df = diagnoses_df[diagnoses_df['ICD9_CODE'].isin(hf_icd9_codes)]
hf_diagnoses_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
51,1511,115,114585,10.0,4280
67,1527,117,140784,5.0,4280
150,1610,124,138376,12.0,42833
153,1613,124,138376,15.0,4280
211,1671,130,198214,2.0,4280


In [29]:
#filter out all the notes that are note of the category "Discharge summary". Also cleaned converted type of HADM_ID in notes_df becuase they
#were imported as floats while other other MIMIC III data imported this column as ints

discharge_notes_df = notes_df[notes_df["CATEGORY"] == "Discharge summary"]
discharge_notes_df = discharge_notes_df.round().astype({'HADM_ID':'int64'})
discharge_notes_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [30]:
#Create one dataframe that has all the data we want
intermediate_df = admission_df.merge(discharge_notes_df, how="left", on="HADM_ID")
admission_diagnoses_notes_df = intermediate_df.merge(hf_diagnoses_df, how="left", on="HADM_ID")
admission_diagnoses_notes_df.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,...,,Discharge summary,Report,,,Admission Date: [**2196-4-9**] Discharg...,,,,
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2153-9-3**] D...,,,,
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2157-10-18**] ...,,,,
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,...,,Discharge summary,Report,,,Admission Date: [**2139-6-6**] D...,,,,
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,...,,Discharge summary,Report,,,Admission Date: [**2160-11-2**] ...,,,,


In [31]:
#Drop all rows where the patient was NOT diagnosed with a heart failure related ICD9 code
admission_diagnoses_notes_df = admission_diagnoses_notes_df.dropna(subset=["ICD9_CODE"]).reset_index()
admission_diagnoses_notes_df.head()

Unnamed: 0,index,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,...,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE
0,5,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,...,,Discharge summary,Report,,,Admission Date: [**2126-5-6**] Discharge ...,186.0,26.0,3.0,4280
1,8,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,...,,Discharge summary,Report,,,Admission Date: [**2172-10-14**] Discha...,211.0,30.0,4.0,4281
2,12,33,34,115799,2186-07-18 16:46:00,2186-07-20 16:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,...,,,,,,,237.0,34.0,2.0,4280
3,13,34,34,144319,2191-02-23 05:23:00,2191-02-25 20:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,...,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,245.0,34.0,2.0,42822
4,14,34,34,144319,2191-02-23 05:23:00,2191-02-25 20:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,...,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,249.0,34.0,6.0,4280


In [32]:
#Drop all rows where a discharge summary does not exist
admission_diagnoses_notes_df= admission_diagnoses_notes_df.dropna(subset=["CATEGORY"]).reset_index()
admission_diagnoses_notes_df.head(2)

Unnamed: 0,level_0,index,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,...,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE
0,0,5,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,...,,Discharge summary,Report,,,Admission Date: [**2126-5-6**] Discharge ...,186.0,26.0,3.0,4280
1,1,8,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,...,,Discharge summary,Report,,,Admission Date: [**2172-10-14**] Discha...,211.0,30.0,4.0,4281


In [33]:
admission_diagnoses_notes_df = admission_diagnoses_notes_df.drop(["level_0", "index"], axis=1)
admission_diagnoses_notes_df.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE
0,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2126-5-6**] Discharge ...,186.0,26.0,3.0,4280
1,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2172-10-14**] Discha...,211.0,30.0,4.0,4281
2,34,34,144319,2191-02-23 05:23:00,2191-02-25 20:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,245.0,34.0,2.0,42822
3,34,34,144319,2191-02-23 05:23:00,2191-02-25 20:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,249.0,34.0,6.0,4280
4,39,37,188670,2183-08-21 16:48:00,2183-08-26 18:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2183-8-21**] Discharge...,297.0,37.0,2.0,4280


In [34]:
admission_diagnoses_notes_df["ADMITTIME"] = pd.to_datetime(admission_diagnoses_notes_df["ADMITTIME"])
admission_diagnoses_notes_df["DISCHTIME"] = pd.to_datetime(admission_diagnoses_notes_df["DISCHTIME"])
admission_diagnoses_notes_df.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE
0,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2126-5-6**] Discharge ...,186.0,26.0,3.0,4280
1,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2172-10-14**] Discha...,211.0,30.0,4.0,4281
2,34,34,144319,2191-02-23 05:23:00,2191-02-25 20:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,245.0,34.0,2.0,42822
3,34,34,144319,2191-02-23 05:23:00,2191-02-25 20:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2191-2-23**] ...,249.0,34.0,6.0,4280
4,39,37,188670,2183-08-21 16:48:00,2183-08-26 18:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,...,,Discharge summary,Report,,,Admission Date: [**2183-8-21**] Discharge...,297.0,37.0,2.0,4280


In [35]:
admission_diagnoses_notes_df["SAME_SUBJECT_ID"] = admission_diagnoses_notes_df["SUBJECT_ID_x"] == admission_diagnoses_notes_df["SUBJECT_ID_x"].shift(1)
admission_diagnoses_notes_df["SAME_HADM_ID"] = admission_diagnoses_notes_df["HADM_ID"] == admission_diagnoses_notes_df["HADM_ID"].shift(1)
admission_diagnoses_notes_df.head(2)

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,DESCRIPTION,CGID,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID
0,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,...,Report,,,Admission Date: [**2126-5-6**] Discharge ...,186.0,26.0,3.0,4280,False,False
1,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,Report,,,Admission Date: [**2172-10-14**] Discha...,211.0,30.0,4.0,4281,False,False


In [39]:
readmit_times = []

subject_id = 0
hadm_id = 0
dischtime = pd.Timedelta(0)
last_time = pd.Timedelta(0)
for i in range(len(admission_diagnoses_notes_df)):
    
    if admission_diagnoses_notes_df["SUBJECT_ID_x"][i] != subject_id:
        subject_id = admission_diagnoses_notes_df["SUBJECT_ID_x"][i]
        hadm_id = admission_diagnoses_notes_df["HADM_ID"][i]
        dischtime = admission_diagnoses_notes_df["DISCHTIME"][i]
        
        readmit_times.append(pd.Timedelta(0))
        
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] == hadm_id) and (readmit_times[-1] == pd.Timedelta(0)):
        readmit_times.append(pd.Timedelta(0))
        
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] != hadm_id):
        readmit_times.append(admission_diagnoses_notes_df["ADMITTIME"][i] - dischtime)
        
        last_time = admission_diagnoses_notes_df["ADMITTIME"][i] - dischtime
        hadm_id = admission_diagnoses_notes_df["HADM_ID"][i]
        dischtime = admission_diagnoses_notes_df["DISCHTIME"][i]
        
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] == hadm_id):
        readmit_times.append(last_time)
        
    else:
        readmit_times.append(pd.Timedelta(0))

In [40]:
admission_diagnoses_notes_df["READMISSION_TIMES"] = readmit_times

In [41]:
admission_diagnoses_notes_df[admission_diagnoses_notes_df["SUBJECT_ID_x"] == 357]

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,CGID,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES
6,456,357,174486,2197-12-06 07:13:00,2198-01-03 14:00:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,,,Admission Date: [**2197-12-6**] ...,4334.0,357.0,8.0,4280,False,False,0 days 00:00:00
7,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,,,Admission Date: [**2198-11-1**] ...,4297.0,357.0,14.0,4280,True,False,302 days 08:36:00
8,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,,,Admission Date: [**2198-11-1**] ...,4298.0,357.0,15.0,42823,True,True,302 days 08:36:00
9,459,357,101651,2199-10-20 12:05:00,2199-10-23 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,,Admission Date: [**2199-10-20**] ...,4246.0,357.0,2.0,42822,True,False,339 days 21:45:00
10,459,357,101651,2199-10-20 12:05:00,2199-10-23 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,,Admission Date: [**2199-10-20**] ...,4253.0,357.0,9.0,4280,True,True,339 days 21:45:00
11,460,357,117876,2199-12-21 22:37:00,2200-01-18 11:45:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,,Admission Date: [**2199-12-21**] ...,4271.0,357.0,10.0,42822,True,False,59 days 05:07:00
12,460,357,117876,2199-12-21 22:37:00,2200-01-18 11:45:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,,Admission Date: [**2199-12-21**] ...,4273.0,357.0,12.0,4280,True,True,59 days 05:07:00


In [43]:
general_readmission = []

subject_id = 0
hadm_id = 0
for i in range(len(admission_diagnoses_notes_df) - 1, -1, -1):
        
    if (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] != subject_id):
        subject_id = admission_diagnoses_notes_df["SUBJECT_ID_x"][i]
        hadm_id = admission_diagnoses_notes_df["HADM_ID"][i]
        
        general_readmission.append(0)
        
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] != hadm_id):
        general_readmission.append(1)
        
        
    else:
        general_readmission.append(0)

In [44]:
admission_diagnoses_notes_df["GENERAL_READMISSION"] = list(reversed(general_readmission))

In [45]:
admission_diagnoses_notes_df[admission_diagnoses_notes_df["SUBJECT_ID_x"] == 357]

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,ISERROR,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES,GENERAL_READMISSION
6,456,357,174486,2197-12-06 07:13:00,2198-01-03 14:00:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,,Admission Date: [**2197-12-6**] ...,4334.0,357.0,8.0,4280,False,False,0 days 00:00:00,1
7,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,,Admission Date: [**2198-11-1**] ...,4297.0,357.0,14.0,4280,True,False,302 days 08:36:00,1
8,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,,Admission Date: [**2198-11-1**] ...,4298.0,357.0,15.0,42823,True,True,302 days 08:36:00,1
9,459,357,101651,2199-10-20 12:05:00,2199-10-23 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,Admission Date: [**2199-10-20**] ...,4246.0,357.0,2.0,42822,True,False,339 days 21:45:00,1
10,459,357,101651,2199-10-20 12:05:00,2199-10-23 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,Admission Date: [**2199-10-20**] ...,4253.0,357.0,9.0,4280,True,True,339 days 21:45:00,1
11,460,357,117876,2199-12-21 22:37:00,2200-01-18 11:45:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,Admission Date: [**2199-12-21**] ...,4271.0,357.0,10.0,42822,True,False,59 days 05:07:00,0
12,460,357,117876,2199-12-21 22:37:00,2200-01-18 11:45:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,,Admission Date: [**2199-12-21**] ...,4273.0,357.0,12.0,4280,True,True,59 days 05:07:00,0


In [46]:
thirtyday_readmission = []

subject_id = 0
hadm_id = 0
for i in range(len(admission_diagnoses_notes_df) - 1, -1, -1):
        
    if (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] != subject_id):
        subject_id = admission_diagnoses_notes_df["SUBJECT_ID_x"][i]
        hadm_id = admission_diagnoses_notes_df["HADM_ID"][i]
        
        thirtyday_readmission.append(0)
        
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] != hadm_id) and (admission_diagnoses_notes_df["READMISSION_TIMES"][i] < pd.Timedelta(days=30)) and (admission_diagnoses_notes_df["READMISSION_TIMES"][i] > pd.Timedelta(days=0)):
        thirtyday_readmission.append(1)
        
    else:
        thirtyday_readmission.append(0)

In [47]:
admission_diagnoses_notes_df["THIRTYDAY_READMISSION"] = list(reversed(thirtyday_readmission))

In [48]:
admission_diagnoses_notes_df.shape[0]

23830

In [49]:
sum(admission_diagnoses_notes_df['GENERAL_READMISSION'])

5969

In [50]:
sum(admission_diagnoses_notes_df['THIRTYDAY_READMISSION'])

668

In [51]:
true_gen_readmit = admission_diagnoses_notes_df[admission_diagnoses_notes_df["GENERAL_READMISSION"] == True]

In [52]:
false_gen_readmit = admission_diagnoses_notes_df[admission_diagnoses_notes_df["GENERAL_READMISSION"] == False].sample(n=sum(admission_diagnoses_notes_df["GENERAL_READMISSION"]), random_state=1)

In [53]:
gen_readmission_dataset = pd.concat([true_gen_readmit, false_gen_readmit])
gen_readmission_dataset.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES,GENERAL_READMISSION,THIRTYDAY_READMISSION
6,456,357,174486,2197-12-06 07:13:00,2198-01-03 14:00:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,Admission Date: [**2197-12-6**] ...,4334.0,357.0,8.0,4280,False,False,0 days 00:00:00,1,0
7,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,Admission Date: [**2198-11-1**] ...,4297.0,357.0,14.0,4280,True,False,302 days 08:36:00,1,0
8,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,Admission Date: [**2198-11-1**] ...,4298.0,357.0,15.0,42823,True,True,302 days 08:36:00,1,0
9,459,357,101651,2199-10-20 12:05:00,2199-10-23 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,Admission Date: [**2199-10-20**] ...,4246.0,357.0,2.0,42822,True,False,339 days 21:45:00,1,0
10,459,357,101651,2199-10-20 12:05:00,2199-10-23 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,Admission Date: [**2199-10-20**] ...,4253.0,357.0,9.0,4280,True,True,339 days 21:45:00,1,0


In [54]:
gen_readmission_dataset.shape[0]

11938

In [55]:
true_thirty_readmit = admission_diagnoses_notes_df[admission_diagnoses_notes_df["THIRTYDAY_READMISSION"] == True]

In [56]:
false_thirty_readmit = admission_diagnoses_notes_df[admission_diagnoses_notes_df["THIRTYDAY_READMISSION"] == False].sample(n=sum(admission_diagnoses_notes_df["THIRTYDAY_READMISSION"]), random_state=1)

In [57]:
thirty_readmission_dataset = pd.concat([true_thirty_readmit, false_thirty_readmit])
thirty_readmission_dataset.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES,GENERAL_READMISSION,THIRTYDAY_READMISSION
235,650,518,153168,2109-07-07 07:49:00,2109-07-09 12:16:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,...,Admission Date: [**2109-7-7**] D...,6159.0,518.0,4.0,4280,True,False,7 days 14:49:00,1,1
597,1908,1569,103678,2177-12-16 21:31:00,2178-01-08 13:52:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,...,Admission Date: [**2177-12-16**] ...,17589.0,1569.0,8.0,42830,True,False,4 days 01:01:00,1,1
598,1908,1569,103678,2177-12-16 21:31:00,2178-01-08 13:52:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,...,Admission Date: [**2177-12-16**] ...,17590.0,1569.0,9.0,4280,True,True,4 days 01:01:00,1,1
686,1650,1339,109444,2138-09-22 08:36:00,2138-09-30 15:25:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,...,Admission Date: [**2138-9-22**] ...,15234.0,1339.0,5.0,42833,True,False,1 days 20:06:00,1,1
687,1651,1339,158998,2138-10-13 11:24:00,2138-10-21 14:40:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,Admission Date: [**2138-10-11**] ...,15303.0,1339.0,2.0,42823,True,False,12 days 19:59:00,1,1


In [58]:
thirty_readmission_dataset.shape[0]

1336

In [59]:
gen_readmission_dataset = gen_readmission_dataset.sample(frac=1)
ratio = 0.9
 
total_rows = gen_readmission_dataset.shape[0]
train_size = int(total_rows*ratio)
 
# Split data into test and train
train_gen = gen_readmission_dataset[0:train_size]
test_gen = gen_readmission_dataset[train_size:]

In [60]:
thirty_readmission_dataset = thirty_readmission_dataset.sample(frac=1)
ratio = 0.9
 
total_rows = thirty_readmission_dataset.shape[0]
train_size = int(total_rows*ratio)
 
# Split data into test and train
train_thirty = thirty_readmission_dataset[0:train_size]
test_thirty = thirty_readmission_dataset[train_size:]

In [61]:
train_gen.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES,GENERAL_READMISSION,THIRTYDAY_READMISSION
3972,8893,7275,125765,2135-11-19 17:22:00,2135-11-20 21:00:00,,EMERGENCY,EMERGENCY ROOM ADMIT,LEFT AGAINST MEDICAL ADVI,Medicare,...,Admission Date: [**2135-11-19**] ...,81257.0,7275.0,1.0,4280,True,False,381 days 03:22:00,1,0
1393,3910,3242,175206,2168-08-26 00:58:00,2168-09-02 10:45:00,2168-09-02 10:45:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Medicare,...,Admission Date: [**2168-8-26**] ...,36173.0,3242.0,6.0,4280,True,True,0 days 00:00:00,0,0
3720,9701,7911,102407,2115-09-17 02:17:00,2115-10-09 17:50:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Medicare,...,Admission Date: [**2115-9-17**] ...,88679.0,7911.0,2.0,4280,False,False,0 days 00:00:00,0,0
9632,30771,25184,191458,2190-12-19 15:17:00,2191-01-03 10:37:00,2191-01-03 10:37:00,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,...,Admission Date: [**2190-12-19**] Discha...,281578.0,25184.0,6.0,4280,True,False,8 days 00:29:00,0,0
19996,49315,69513,139550,2135-10-30 14:00:00,2135-11-05 13:30:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,...,Admission Date: [**2135-10-30**] ...,515456.0,69513.0,2.0,42832,False,False,0 days 00:00:00,0,0


In [62]:
test_gen.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES,GENERAL_READMISSION,THIRTYDAY_READMISSION
22519,55091,87962,174124,2172-06-25 19:54:00,2172-07-02 15:56:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,SNF,Medicare,...,Admission Date: [**2172-6-25**] ...,596367.0,87962.0,3.0,42833,False,False,0 days 00:00:00,1,0
94,597,472,173064,2172-09-28 19:21:00,2172-10-06 18:15:00,2172-10-06 18:15:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,...,Admission Date: [**2172-9-28**] ...,5641.0,472.0,5.0,4280,False,False,0 days 00:00:00,0,0
22971,57222,94597,133529,2169-11-17 16:59:00,2169-11-25 13:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,...,Admission Date: [**2169-11-17**] ...,625883.0,94597.0,1.0,42823,True,False,1 days 03:59:00,1,1
2512,9968,8141,123494,2122-03-15 21:17:00,2122-03-20 13:30:00,2122-03-20 13:30:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,...,Admission Date: [**2122-3-15**] ...,91111.0,8141.0,7.0,4280,False,False,0 days 00:00:00,0,0
10117,28043,22984,104849,2123-02-22 01:24:00,2123-03-16 15:45:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,...,Admission Date: [**2123-2-22**] ...,256071.0,22984.0,11.0,4280,True,True,0 days 00:00:00,0,0


In [63]:
train_thirty.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES,GENERAL_READMISSION,THIRTYDAY_READMISSION
13687,33753,27603,179237,2181-08-23 21:22:00,2181-08-30 14:47:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,...,"Name: [**Known lastname 12733**],[**Known fir...",311540.0,27603.0,8.0,4280,True,True,30 days 04:50:00,1,0
11638,28917,23688,175731,2147-12-04 13:00:00,2147-12-17 10:06:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,...,Admission Date: [**2147-12-4**] ...,264314.0,23688.0,3.0,4280,True,True,0 days 00:00:00,0,0
20425,52184,78565,110298,2142-06-14 01:49:00,2142-06-19 17:35:00,,EMERGENCY,EMERGENCY ROOM ADMIT,LONG TERM CARE HOSPITAL,Medicare,...,Admission Date: [**2142-6-14**] ...,555498.0,78565.0,4.0,42833,True,False,9 days 11:49:00,1,1
15888,42263,47861,124686,2192-09-25 21:15:00,2192-10-01 17:24:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Medicare,...,Admission Date: [**2192-9-25**] ...,414079.0,47861.0,7.0,4280,True,True,0 days 00:00:00,0,0
16771,44233,53856,155343,2146-08-05 17:38:00,2146-08-06 18:10:00,2146-08-06 18:10:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,DEAD/EXPIRED,Medicare,...,Admission Date: [**2146-8-5**] D...,442683.0,53856.0,12.0,4280,False,False,0 days 00:00:00,0,0


In [64]:
test_thirty.head()

Unnamed: 0,ROW_ID_x,SUBJECT_ID_x,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,TEXT,ROW_ID,SUBJECT_ID,SEQ_NUM,ICD9_CODE,SAME_SUBJECT_ID,SAME_HADM_ID,READMISSION_TIMES,GENERAL_READMISSION,THIRTYDAY_READMISSION
13113,38030,31260,107777,2109-04-01 10:57:00,2109-04-13 12:09:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicaid,...,Admission Date: [**2109-4-1**] D...,359760.0,31260.0,8.0,4280,True,True,27 days 15:06:00,1,1
20105,48258,66264,173568,2102-12-25 17:01:00,2103-01-15 15:45:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Private,...,Admission Date: [**2102-12-25**] ...,500025.0,66264.0,23.0,4280,True,True,123 days 04:31:00,1,0
22976,57224,94597,179353,2170-11-14 05:30:00,2170-11-27 14:18:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,...,Admission Date: [**2170-11-14**] ...,626011.0,94597.0,6.0,4280,True,True,6 days 12:15:00,1,1
19720,48954,68457,118480,2129-08-23 04:43:00,2129-09-01 16:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,LONG TERM CARE HOSPITAL,Medicare,...,Admission Date: [**2129-8-23**] ...,510028.0,68457.0,7.0,40411,True,True,10 days 13:03:00,1,1
19353,49403,69776,133201,2129-11-23 23:01:00,2129-11-24 17:00:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,LONG TERM CARE HOSPITAL,Medicaid,...,Admission Date: [**2129-11-23**] ...,516696.0,69776.0,8.0,4280,True,True,1 days 07:46:00,1,1


Now to start building out the CNN 

In [65]:
train_texts = train_gen['TEXT'].values
test_texts = test_gen['TEXT'].values

In [66]:
train_labels = train_gen['GENERAL_READMISSION'].values
test_labels = test_gen['GENERAL_READMISSION'].values

In [67]:
train_texts.shape[0]

10744

In [69]:
test_texts.shape[0]

1194

In [70]:
train_labels.shape[0]

10744

In [71]:
test_labels.shape[0]

1194

In [72]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

In [73]:
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [74]:
max_sequence_length = max([len(seq) for seq in train_sequences])
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

In [75]:
word2vec_path = os.getcwd() + '/data/PubMed-and-PMC-w2v.bin'
word2vec_model = KeyedVectors.load_word2vec_format(datapath(word2vec_path), binary=True)

embedding_dim = word2vec_model.vector_size
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

In [76]:
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True)(input_layer)
conv1 = Conv1D(filters=128, kernel_size=1, activation='relu')(embedding_layer)
conv2 = Conv1D(filters=128, kernel_size=2, activation='relu')(embedding_layer)
conv3 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding_layer)

pool1 = GlobalMaxPooling1D()(conv1)
pool2 = GlobalMaxPooling1D()(conv2)
pool3 = GlobalMaxPooling1D()(conv3)

concat = tf.concat([pool1, pool2, pool3], axis=-1)
output_layer = Dense(1, activation='sigmoid')(concat)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

2023-05-06 21:17:01.158752: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib/python3.9/site-packages/smdistributed/dataparallel/lib:/usr/local/lib:/opt/amazon/openmpi/lib/:/opt/amazon/efa/lib/:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/lib
2023-05-06 21:17:01.158779: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-05-06 21:17:01.158800: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (tensorflow-2-10-1-gp-ml-m5-2xlarge-e1e5905768965f7ca9d3d68b6ae2): /proc/driver/nvidia/version does not exist
2023-05-06 21:17:01.159026: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU 

Extension horovod.torch has not been built: /usr/local/lib/python3.9/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-39-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
[2023-05-06 21:17:01.494 tensorflow-2-10-1-gp-ml-m5-2xlarge-e1e5905768965f7ca9d3d68b6ae2:21 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-05-06 21:17:01.709 tensorflow-2-10-1-gp-ml-m5-2xlarge-e1e5905768965f7ca9d3d68b6ae2:21 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.




In [77]:
%%time
model.fit(train_padded, train_labels, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10


2023-05-06 21:18:18.878768: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 329248788 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 6h 30min 28s, sys: 1h 12min 2s, total: 7h 42min 31s
Wall time: 1h 3min 49s


<keras.callbacks.History at 0x7fc5400bb070>

In [78]:
loss, accuracy = model.evaluate(test_padded, test_labels)
print(f'Test set accuracy: {accuracy}')

Test set accuracy: 0.8257956504821777


In [79]:
from tensorflow.keras.models import load_model

# Make predictions on the test set
test_predictions = model.predict(test_padded)

# Set a threshold to classify predictions as positive or negative
threshold = 0.5
predicted_labels = [1 if pred >= threshold else 0 for pred in test_predictions]

# Evaluate the model's performance using various metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 score: {f1:.2f}")

Accuracy: 0.83
Precision: 0.80
Recall: 0.86
F1 score: 0.83


In [80]:
thirty_day_train_texts = train_thirty['TEXT'].values
thirty_day_test_texts = test_thirty['TEXT'].values

In [86]:
thirty_day_train_labels = train_thirty['GENERAL_READMISSION'].values
thirty_day_test_labels = test_thirty['GENERAL_READMISSION'].values

In [87]:
thirty_day_train_texts.shape[0]

1202

In [88]:
thirty_day_test_texts.shape[0]

134

In [89]:
thirty_day_train_labels.shape[0]

1202

In [90]:
thirty_day_test_labels.shape[0]

134

In [91]:
thirty_day_tokenizer = Tokenizer()
thirty_day_tokenizer.fit_on_texts(thirty_day_train_texts)

In [92]:
thirty_day_train_sequences = thirty_day_tokenizer.texts_to_sequences(thirty_day_train_texts)
thirty_day_test_sequences = thirty_day_tokenizer.texts_to_sequences(thirty_day_test_texts)

In [93]:
thirty_day_max_sequence_length = max([len(seq) for seq in thirty_day_train_sequences])
thirty_day_train_padded = pad_sequences(thirty_day_train_sequences, maxlen=max_sequence_length, padding='post')
thirty_day_test_padded = pad_sequences(thirty_day_test_sequences, maxlen=max_sequence_length, padding='post')

In [94]:
#word2vec_path = os.getcwd() + '/data/PubMed-and-PMC-w2v.bin'
#word2vec_model = KeyedVectors.load_word2vec_format(datapath(word2vec_path), binary=True)

thirty_day_embedding_dim = word2vec_model.vector_size
thirty_day_vocab_size = len(thirty_day_tokenizer.word_index) + 1

thirty_day_embedding_matrix = np.zeros((thirty_day_vocab_size, thirty_day_embedding_dim))

for word, i in thirty_day_tokenizer.word_index.items():
    if word in word2vec_model:
        thirty_day_embedding_matrix[i] = word2vec_model[word]

In [95]:
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(thirty_day_vocab_size,thirty_day_embedding_dim, weights=[thirty_day_embedding_matrix], trainable=True)(input_layer)
conv1 = Conv1D(filters=128, kernel_size=1, activation='relu')(embedding_layer)
conv2 = Conv1D(filters=128, kernel_size=2, activation='relu')(embedding_layer)
conv3 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding_layer)

pool1 = GlobalMaxPooling1D()(conv1)
pool2 = GlobalMaxPooling1D()(conv2)
pool3 = GlobalMaxPooling1D()(conv3)

concat = tf.concat([pool1, pool2, pool3], axis=-1)
output_layer = Dense(1, activation='sigmoid')(concat)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [96]:
%%time
model.fit(thirty_day_train_padded, thirty_day_train_labels, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 42min 28s, sys: 6min 35s, total: 49min 3s
Wall time: 6min 30s


<keras.callbacks.History at 0x7fc5267a50d0>

In [97]:
loss, accuracy = model.evaluate(thirty_day_test_padded, thirty_day_test_labels)
print(f'Test set accuracy: {accuracy}')

Test set accuracy: 0.8208954930305481


In [98]:

# Make predictions on the test set
test_predictions = model.predict(thirty_day_test_padded)

# Set a threshold to classify predictions as positive or negative
threshold = 0.5
predicted_labels = [1 if pred >= threshold else 0 for pred in test_predictions]

# Evaluate the model's performance using various metrics
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(thirty_day_test_labels, predicted_labels)
precision = precision_score(thirty_day_test_labels, predicted_labels)
recall = recall_score(thirty_day_test_labels, predicted_labels)
f1 = f1_score(thirty_day_test_labels, predicted_labels)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 score: {f1:.2f}")

Accuracy: 0.82
Precision: 0.80
Recall: 0.95
F1 score: 0.87
