In [None]:
%pip install gensim
#%pip install pyhealth

In [None]:
import pandas as pd
import numpy as np
#import pyhealth
import os
import warnings
import sagemaker
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Input
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from tensorflow.keras.models import Model
#from sagemaker.tensorflow import TensorFlow


We first load the data needed for this experiment: the pretrained PubMed word2vec vectors and the MIMIC-III admissions, diagnoses, and notes tables.

In [None]:
#1. The PubMed word2vec vectors
w2vec_url = 'http://evexdb.org/pmresources/vec-space-models/'
w2vec_filename = 'PubMed-and-PMC-w2v.bin'
if not os.path.isfile('./data/' + w2vec_filename):
    !cd data
    !wget {w2vec_url}{w2vec_filename}

In [None]:
#2. The admissions records from MIMIC III
# The path is resolved against the notebook's current working directory.
admissions_csv = os.getcwd() + '/data/ADMISSIONS.csv.gz'
admission_df = pd.read_csv(admissions_csv)
admission_df.head()

In [None]:
#3. The ICD Diagnoses records from MIMIC III
# The path is resolved against the notebook's current working directory.
diagnoses_csv = os.getcwd() + '/data/DIAGNOSES_ICD.csv.gz'
diagnoses_df = pd.read_csv(diagnoses_csv)
diagnoses_df.head()

In [None]:
%%time
#4. The notes records from MIMIC III
# The path is resolved against the notebook's current working directory.
notes_csv = os.getcwd() + '/data/NOTEEVENTS.csv.gz'
notes_df = pd.read_csv(notes_csv)
notes_df.head()

In [None]:
# ICD-9 codes for heart failure. The paper lists the original ICD codes with the decimal
# point in place, whereas the MIMIC dataset stores them with the decimal point removed.
# The codes below match the MIMIC III ICD codes (i.e. without decimals).
hf_icd9_codes = [
    '39891', '40201', '40211', '40291', '40401', '40403', '40411',
    '40413', '40491', '40493', '4280', '4281', '42820', '42821',
    '42822', '42823', '42830', '42831', '42832', '42833', '42840',
    '42841', '42842', '42843', '4289'
]

In [None]:
# Keep only the diagnosis rows whose ICD-9 code is one of the heart-failure codes.
is_hf_code = diagnoses_df['ICD9_CODE'].isin(hf_icd9_codes)
hf_diagnoses_df = diagnoses_df[is_hf_code]
hf_diagnoses_df.head()

In [None]:
# Keep only the notes whose CATEGORY is "Discharge summary". HADM_ID is then converted to
# int64 because it was imported as a float in notes_df, while the other MIMIC III tables
# imported this column as ints.
# NOTE(review): .round() is applied to the whole frame rather than to HADM_ID alone, and the
# int64 cast would raise if a discharge summary had a missing HADM_ID - confirm both are fine.

discharge_notes_df = notes_df[notes_df["CATEGORY"] == "Discharge summary"]
discharge_notes_df = discharge_notes_df.round().astype({'HADM_ID':'int64'})
discharge_notes_df.head()

In [None]:
# Build one frame: every admission, with its discharge notes and then its heart-failure
# diagnoses attached through left joins on HADM_ID.
intermediate_df = admission_df.merge(right=discharge_notes_df, on="HADM_ID", how="left")
admission_diagnoses_notes_df = intermediate_df.merge(right=hf_diagnoses_df, on="HADM_ID", how="left")
admission_diagnoses_notes_df.head()

In [None]:
# Drop all rows where the patient was NOT diagnosed with a heart failure related ICD9 code
# (after the left join those rows have no ICD9_CODE).
# reset_index() is called without drop=True, so the previous index is kept as an extra
# "index" column; that column is dropped in a later cell.
admission_diagnoses_notes_df = admission_diagnoses_notes_df.dropna(subset=["ICD9_CODE"]).reset_index()
admission_diagnoses_notes_df.head()

In [None]:
# Drop all rows where a discharge summary does not exist (CATEGORY is missing after the
# left join with the discharge notes).
# This second reset_index() keeps the previous index as another extra column ("level_0",
# because "index" is already taken); both are dropped in the next cell.
admission_diagnoses_notes_df= admission_diagnoses_notes_df.dropna(subset=["CATEGORY"]).reset_index()
admission_diagnoses_notes_df.head(2)

In [None]:
# Remove the two leftover index columns created by the reset_index() calls above.
admission_diagnoses_notes_df = admission_diagnoses_notes_df.drop(["level_0", "index"], axis=1)
admission_diagnoses_notes_df.head()

In [None]:
# Parse the admission and discharge timestamps so they can be subtracted from each other later.
for time_col in ("ADMITTIME", "DISCHTIME"):
    admission_diagnoses_notes_df[time_col] = pd.to_datetime(admission_diagnoses_notes_df[time_col])
admission_diagnoses_notes_df.head()

In [None]:
# Flag rows that share their patient / admission id with the row directly above them.
subject_ids = admission_diagnoses_notes_df["SUBJECT_ID_x"]
admission_diagnoses_notes_df["SAME_SUBJECT_ID"] = subject_ids == subject_ids.shift(1)
hadm_ids = admission_diagnoses_notes_df["HADM_ID"]
admission_diagnoses_notes_df["SAME_HADM_ID"] = hadm_ids == hadm_ids.shift(1)
admission_diagnoses_notes_df.head(2)

In [None]:
# Build one entry per row: the time between this row's ADMITTIME and the DISCHTIME of the
# previously seen admission (HADM_ID) of the same patient, or a zero Timedelta when there
# is no previous admission. Rows are walked top to bottom and compared against the state
# carried over from earlier rows.
# NOTE(review): this presumes each patient's rows are adjacent and in chronological order,
# and that the frame has a 0..n-1 index (the `[i]` lookups are label based) - confirm.
readmit_times = []

subject_id = 0  # patient whose rows are currently being walked
hadm_id = 0  # most recent admission seen for that patient
dischtime = pd.Timedelta(0)  # placeholder; overwritten with a DISCHTIME in the new-patient branch
last_time = pd.Timedelta(0)  # gap recorded for the most recent readmission
for i in range(len(admission_diagnoses_notes_df)):
    
    # New patient: remember this admission and its discharge time; no gap to report.
    if admission_diagnoses_notes_df["SUBJECT_ID_x"][i] != subject_id:
        subject_id = admission_diagnoses_notes_df["SUBJECT_ID_x"][i]
        hadm_id = admission_diagnoses_notes_df["HADM_ID"][i]
        dischtime = admission_diagnoses_notes_df["DISCHTIME"][i]
        
        readmit_times.append(pd.Timedelta(0))
        
    # Another row of the same admission whose previous row recorded a zero gap: repeat the zero.
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] == hadm_id) and (readmit_times[-1] == pd.Timedelta(0)):
        readmit_times.append(pd.Timedelta(0))
        
    # Same patient, different admission: record the gap since the previous discharge, then
    # make this admission the reference for the rows that follow.
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] != hadm_id):
        readmit_times.append(admission_diagnoses_notes_df["ADMITTIME"][i] - dischtime)
        
        last_time = admission_diagnoses_notes_df["ADMITTIME"][i] - dischtime
        hadm_id = admission_diagnoses_notes_df["HADM_ID"][i]
        dischtime = admission_diagnoses_notes_df["DISCHTIME"][i]
        
    # Another row of the same readmission: reuse the gap computed for its first row.
    elif (admission_diagnoses_notes_df["SUBJECT_ID_x"][i] == subject_id) and (admission_diagnoses_notes_df["HADM_ID"][i] == hadm_id):
        readmit_times.append(last_time)
        
    # Fallback; the branches above appear to cover every combination of matching ids.
    else:
        readmit_times.append(pd.Timedelta(0))

In [None]:
# Store the per-row gaps (one entry per row, in row order) as a new column.
admission_diagnoses_notes_df["READMISSION_TIMES"] = readmit_times

In [None]:
admission_diagnoses_notes_df[admission_diagnoses_notes_df["SUBJECT_ID_x"] == 357]

In [None]:
# Walk the rows bottom-up. The first row met for a patient belongs to the last admission
# listed for that patient; every row of that admission gets 0, and every row of the same
# patient with a different HADM_ID gets 1. The list is built in reverse row order.
general_readmission = []

subject_id = 0
hadm_id = 0
subject_col = admission_diagnoses_notes_df["SUBJECT_ID_x"]
hadm_col = admission_diagnoses_notes_df["HADM_ID"]
for i in reversed(range(len(admission_diagnoses_notes_df))):
    if subject_col[i] != subject_id:
        # Crossing into another patient's rows: remember the patient and this admission.
        subject_id = subject_col[i]
        hadm_id = hadm_col[i]
        flag = 0
    else:
        # hadm_id is deliberately left unchanged here, so it keeps pointing at the
        # admission remembered when this patient's rows were entered.
        flag = 1 if hadm_col[i] != hadm_id else 0
    general_readmission.append(flag)

In [None]:
# The flags were collected bottom-up, so flip them back into row order before storing.
admission_diagnoses_notes_df["GENERAL_READMISSION"] = general_readmission[::-1]

In [None]:
admission_diagnoses_notes_df[admission_diagnoses_notes_df["SUBJECT_ID_x"] == 357]

In [None]:
# Label every row with whether its admission was FOLLOWED by another admission of the same
# patient within 30 days of discharge (1) or not (0).
#
# READMISSION_TIMES on a row is the gap between the PREVIOUS discharge and THIS admission.
# The label for an admission therefore has to look at the READMISSION_TIMES of the NEXT
# admission of that patient. Testing the row's own value would instead flag admissions
# that were themselves readmissions, and a patient's first admission could never be positive.
#
# Rows are walked bottom-up, so a patient's later admissions are seen before the earlier
# ones; the list is built in reverse row order, exactly like general_readmission.
# NOTE(review): like the loops above, this presumes each patient's rows are adjacent and
# chronologically ordered, and that all rows of one admission share one READMISSION_TIMES.
thirtyday_readmission = []

subject_id = 0
hadm_id = 0
current_gap = pd.Timedelta(0)  # READMISSION_TIMES of the admission currently being walked
gap_to_next = None  # time from this admission's discharge to the patient's next admission
no_gap = pd.Timedelta(days=0)
thirty_days = pd.Timedelta(days=30)
for i in range(len(admission_diagnoses_notes_df) - 1, -1, -1):
    row_subject = admission_diagnoses_notes_df["SUBJECT_ID_x"][i]
    row_hadm = admission_diagnoses_notes_df["HADM_ID"][i]
    row_gap = admission_diagnoses_notes_df["READMISSION_TIMES"][i]

    if row_subject != subject_id:
        # Last admission listed for this patient: nothing follows it.
        subject_id = row_subject
        hadm_id = row_hadm
        current_gap = row_gap
        gap_to_next = None
    elif row_hadm != hadm_id:
        # Stepped back to an earlier admission; the admission just left is its successor,
        # and that successor's READMISSION_TIMES is the gap after this admission.
        gap_to_next = current_gap
        hadm_id = row_hadm
        current_gap = row_gap
    # Otherwise this is another row of the same admission: gap_to_next is unchanged.

    followed_within_30_days = gap_to_next is not None and no_gap < gap_to_next < thirty_days
    thirtyday_readmission.append(1 if followed_within_30_days else 0)

In [None]:
# The flags were collected bottom-up, so flip them back into row order before storing.
admission_diagnoses_notes_df["THIRTYDAY_READMISSION"] = thirtyday_readmission[::-1]

In [None]:
admission_diagnoses_notes_df.shape[0]

In [None]:
sum(admission_diagnoses_notes_df['GENERAL_READMISSION'])

In [None]:
sum(admission_diagnoses_notes_df['THIRTYDAY_READMISSION'])

In [None]:
# Positive class: rows flagged as a general readmission.
is_gen_positive = admission_diagnoses_notes_df["GENERAL_READMISSION"] == True
true_gen_readmit = admission_diagnoses_notes_df[is_gen_positive]

In [None]:
# Negative class, down-sampled (fixed seed) to as many rows as there are positives.
n_gen_positive = sum(admission_diagnoses_notes_df["GENERAL_READMISSION"])
is_gen_negative = admission_diagnoses_notes_df["GENERAL_READMISSION"] == False
false_gen_readmit = admission_diagnoses_notes_df[is_gen_negative].sample(n=n_gen_positive, random_state=1)

In [None]:
# Balanced general-readmission dataset: all positives followed by the sampled negatives.
gen_class_frames = [true_gen_readmit, false_gen_readmit]
gen_readmission_dataset = pd.concat(gen_class_frames)
gen_readmission_dataset.head()

In [None]:
gen_readmission_dataset.shape[0]

In [None]:
# Positive class: rows flagged as a thirty-day readmission.
is_thirty_positive = admission_diagnoses_notes_df["THIRTYDAY_READMISSION"] == True
true_thirty_readmit = admission_diagnoses_notes_df[is_thirty_positive]

In [None]:
# Negative class, down-sampled (fixed seed) to as many rows as there are positives.
n_thirty_positive = sum(admission_diagnoses_notes_df["THIRTYDAY_READMISSION"])
is_thirty_negative = admission_diagnoses_notes_df["THIRTYDAY_READMISSION"] == False
false_thirty_readmit = admission_diagnoses_notes_df[is_thirty_negative].sample(n=n_thirty_positive, random_state=1)

In [None]:
# Balanced thirty-day dataset: all positives followed by the sampled negatives.
thirty_class_frames = [true_thirty_readmit, false_thirty_readmit]
thirty_readmission_dataset = pd.concat(thirty_class_frames)
thirty_readmission_dataset.head()

In [None]:
thirty_readmission_dataset.shape[0]

In [None]:
# Shuffle the balanced general-readmission dataset, then cut it 90/10 into train and test.
# A fixed random_state (the same seed used for the negative sampling above) makes the
# shuffle, and therefore the split and every metric below, reproducible on a fresh run.
# NOTE(review): the merges may yield several rows per admission; such rows could end up on
# both sides of the split - confirm whether de-duplication per HADM_ID is wanted.
gen_readmission_dataset = gen_readmission_dataset.sample(frac=1, random_state=1)
ratio = 0.9

total_rows = gen_readmission_dataset.shape[0]
train_size = int(total_rows*ratio)

# Split data into test and train
train_gen = gen_readmission_dataset[0:train_size]
test_gen = gen_readmission_dataset[train_size:]

In [None]:
# Shuffle the balanced thirty-day dataset, then cut it 90/10 into train and test.
# A fixed random_state (the same seed used for the negative sampling above) makes the
# shuffle, and therefore the split and every metric below, reproducible on a fresh run.
# NOTE(review): the merges may yield several rows per admission; such rows could end up on
# both sides of the split - confirm whether de-duplication per HADM_ID is wanted.
thirty_readmission_dataset = thirty_readmission_dataset.sample(frac=1, random_state=1)
ratio = 0.9

total_rows = thirty_readmission_dataset.shape[0]
train_size = int(total_rows*ratio)

# Split data into test and train
train_thirty = thirty_readmission_dataset[0:train_size]
test_thirty = thirty_readmission_dataset[train_size:]

In [None]:
train_gen.head()

In [None]:
test_gen.head()

In [None]:
train_thirty.head()

In [None]:
test_thirty.head()

In [None]:
train_gen.shape[0]

In [None]:
train_thirty.shape[0]

Now we start building the models, including the CNN. First we define a few helper functions that can be reused across the ML models.

In [None]:
def values_labels(training_set, test_set, label_column='GENERAL_READMISSION'):
    """Extract note texts and binary labels from a train/test pair of DataFrames.

    Args:
        training_set: DataFrame with a 'TEXT' column and the `label_column` column.
        test_set: DataFrame with the same columns as `training_set`.
        label_column: Name of the column holding the target labels. Defaults to
            'GENERAL_READMISSION' (the previous hard-coded behavior); pass
            'THIRTYDAY_READMISSION' for the thirty-day task.

    Returns:
        Tuple `(train_texts, test_texts, train_labels, test_labels)` of the
        columns' underlying `.values` arrays.
    """
    train_texts = training_set['TEXT'].values
    test_texts = test_set['TEXT'].values
    train_labels = training_set[label_column].values
    test_labels = test_set[label_column].values
    return train_texts, test_texts, train_labels, test_labels


In [None]:
# Load the pretrained PubMed/PMC word2vec vectors (binary word2vec format).
word2vec_path = os.getcwd() + '/data/PubMed-and-PMC-w2v.bin'
# The path is passed directly: gensim.test.utils.datapath() is a helper for gensim's own
# bundled test fixtures and is not meant for user files.
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# Dimensionality of the pretrained vectors; read as a global by build_embedding_info.
embedding_dim = word2vec_model.vector_size

In [None]:
def build_embedding_info(training_texts, testing_texts):
    """Tokenize and pad the texts and build a word2vec-initialised embedding matrix.

    The tokenizer vocabulary is fitted on the training texts only. Both sets are
    post-padded to the length of the longest training sequence. Reads the
    module-level `word2vec_model` and `embedding_dim`.

    Args:
        training_texts: Iterable of training documents.
        testing_texts: Iterable of test documents.

    Returns:
        Tuple `(max_sequence_length, train_padded, test_padded, vocab_size,
        embedding_matrix)`. Row `i` of `embedding_matrix` is the pretrained
        vector of the word with tokenizer index `i`, or zeros when the word is
        not in `word2vec_model`.
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(training_texts)
    word_to_index = text_tokenizer.word_index

    train_sequences = text_tokenizer.texts_to_sequences(training_texts)
    test_sequences = text_tokenizer.texts_to_sequences(testing_texts)

    # Pad everything to the longest training sequence.
    max_sequence_length = max(map(len, train_sequences))
    pad_kwargs = dict(maxlen=max_sequence_length, padding='post')
    train_padded = pad_sequences(train_sequences, **pad_kwargs)
    test_padded = pad_sequences(test_sequences, **pad_kwargs)

    # One extra row: tokenizer indices start at 1, index 0 is the padding value.
    vocab_size = len(word_to_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in word_to_index.items():
        if token not in word2vec_model:
            continue  # unknown words keep their all-zero row
        embedding_matrix[row] = word2vec_model[token]

    return max_sequence_length, train_padded, test_padded, vocab_size, embedding_matrix

In [None]:
def random_forest(training_texts, testing_texts, max_features=25000, n_estimators=100, random_state=None):
    """Build TF-IDF features and an (unfitted) random forest classifier.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts.

    Args:
        training_texts: Iterable of training documents.
        testing_texts: Iterable of test documents.
        max_features: Vocabulary size cap passed to `TfidfVectorizer`.
        n_estimators: Number of trees passed to `RandomForestClassifier`.
        random_state: Seed passed to `RandomForestClassifier`; the default
            `None` keeps the previous unseeded behavior. Pass an int for
            reproducible forests.

    Returns:
        Tuple `(rf_clf, X_train_tfidf, X_test_tfidf)`; the classifier is
        returned unfitted, so the caller is responsible for calling `.fit`.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = vectorizer.fit_transform(training_texts)
    X_test_tfidf = vectorizer.transform(testing_texts)
    rf_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    return rf_clf,  X_train_tfidf, X_test_tfidf

In [None]:
def build_CNN(max_sequence_length,embedding_dim, vocab_size, embedding_matrix):
    """Build and compile the multi-kernel 1-D CNN text classifier.

    Architecture: trainable embedding initialised from `embedding_matrix`,
    three parallel Conv1D branches (256 filters each, kernel sizes 1, 2 and
    3), a global max pool per branch, concatenation of the pooled features,
    and a single sigmoid output unit.

    Args:
        max_sequence_length: Length of the padded input sequences.
        embedding_dim: Width of each embedding vector.
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: Array of shape `(vocab_size, embedding_dim)` used as
            the initial embedding weights.

    Returns:
        A compiled Keras `Model` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    input_layer = Input(shape=(max_sequence_length,))
    # Initialise through an initializer rather than `weights=[...]`: the `weights`
    # constructor argument is not accepted by every Keras version, while a Constant
    # initializer gives the same starting values everywhere.
    embedding_layer = Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=True,
    )(input_layer)
    conv1 = Conv1D(filters=256, kernel_size=1, activation='relu')(embedding_layer)
    conv2 = Conv1D(filters=256, kernel_size=2, activation='relu')(embedding_layer)
    conv3 = Conv1D(filters=256, kernel_size=3, activation='relu')(embedding_layer)

    pool1 = GlobalMaxPooling1D()(conv1)
    pool2 = GlobalMaxPooling1D()(conv2)
    pool3 = GlobalMaxPooling1D()(conv3)

    # Use the Keras layer instead of the raw tf.concat op: symbolic Keras tensors cannot
    # be fed to plain TensorFlow functions under Keras 3, and the layer works on both.
    concat = tf.keras.layers.Concatenate(axis=-1)([pool1, pool2, pool3])
    output_layer = Dense(1, activation='sigmoid')(concat)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def chi_square_scores(test_texts, test_labels):
    """Rank the words of `test_texts` by their chi-square score against `test_labels`.

    Each document is encoded as binary word-presence features before scoring.

    Args:
        test_texts: Iterable of documents.
        test_labels: Class label for each document.

    Returns:
        Tuple `(sorted_words, sorted_scores)` of arrays ordered from the
        highest chi-square score to the lowest.
    """
    presence_vectorizer = CountVectorizer(binary=True)
    presence_matrix = presence_vectorizer.fit_transform(test_texts)
    scores, _p_values = chi2(presence_matrix, test_labels)

    # argsort is ascending; reverse it to put the highest score first.
    descending = np.argsort(scores)[::-1]
    vocabulary = np.array(presence_vectorizer.get_feature_names_out())
    return vocabulary[descending], scores[descending]

In [None]:
# Texts and labels for the general-readmission task, followed by a size sanity check
# (texts and labels of each split should have the same length).
gen_train_texts, gen_test_texts, gen_train_labels, gen_test_labels = values_labels(train_gen, test_gen)
print(f'training text size={gen_train_texts.shape[0]}')
print(f'test text size={gen_test_texts.shape[0]}')
print(f'training labels size={gen_train_labels.shape[0]}')
print(f'test labels size={gen_test_labels.shape[0]}')

In [None]:
# Texts for the thirty-day task. values_labels() returns GENERAL_READMISSION labels by
# default, which are the wrong target for this task, so only its texts are used and the
# labels are read from the THIRTYDAY_READMISSION column of the same frames.
thirty_day_train_texts, thirty_day_test_texts = values_labels(train_thirty, test_thirty)[:2]
thirty_day_train_labels = train_thirty['THIRTYDAY_READMISSION'].values
thirty_day_test_labels = test_thirty['THIRTYDAY_READMISSION'].values
print(f'training text size={thirty_day_train_texts.shape[0]}')
print(f'test text size={thirty_day_test_texts.shape[0]}')
print(f'training labels size={thirty_day_train_labels.shape[0]}')
print(f'test labels size={thirty_day_test_labels.shape[0]}')

In [None]:
# Unfitted random forest plus TF-IDF features for the general-readmission task.
gen_rand_forest_classifier, gen_train_tfidf, gen_test_tfidf = random_forest(gen_train_texts, gen_test_texts)

In [None]:
%%time
# Fit the general-readmission random forest on the training TF-IDF features.
gen_rand_forest_classifier.fit(gen_train_tfidf, gen_train_labels)

In [None]:
# Score the general-readmission random forest on the held-out test set.
gen_y_pred = gen_rand_forest_classifier.predict(gen_test_tfidf)
accuracy, precision, recall, f1 = (
    score_fn(gen_test_labels, gen_y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 score: {f1:.2f}")

In [None]:
# Unfitted random forest plus TF-IDF features for the thirty-day readmission task.
thirty_day_rand_forest_classifier, thirty_day_train_tfidf, thirty_day_test_tfidf = random_forest(thirty_day_train_texts, thirty_day_test_texts)

In [None]:
%%time
# Fit the thirty-day random forest on the training TF-IDF features.
thirty_day_rand_forest_classifier.fit(thirty_day_train_tfidf, thirty_day_train_labels)

In [None]:
# Score the thirty-day random forest on the held-out test set.
thirty_day_y_pred = thirty_day_rand_forest_classifier.predict(thirty_day_test_tfidf)
accuracy = accuracy_score(thirty_day_test_labels, thirty_day_y_pred)
# Must be assigned to `precision` (this was misspelled `recision`), otherwise the print
# below reports the stale precision of the general-readmission model.
precision = precision_score(thirty_day_test_labels, thirty_day_y_pred)
recall = recall_score(thirty_day_test_labels, thirty_day_y_pred)
f1 = f1_score(thirty_day_test_labels, thirty_day_y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 score: {f1:.2f}")

In [None]:
# Padded sequences, vocabulary size and embedding matrix for the general-readmission CNN.
# `max_sequence_length` is a shared name: it is reassigned later for the thirty-day data.
max_sequence_length, gen_train_padded, gen_test_padded, gen_vocab_size, gen_embedding_matrix = build_embedding_info(gen_train_texts, gen_test_texts)

In [None]:
# Compiled CNN for the general-readmission task.
gen_model = build_CNN(max_sequence_length, embedding_dim, gen_vocab_size, gen_embedding_matrix)

In [None]:
%%time
# Train for 10 epochs with batches of 10, using validation_split=0.1 of the training data.
gen_model.fit(gen_train_padded, gen_train_labels, epochs=10, batch_size=10, validation_split=0.1)

In [None]:
# Loss and the compiled `accuracy` metric on the held-out general-readmission test set.
loss, accuracy = gen_model.evaluate(gen_test_padded, gen_test_labels)
print(f'Test set accuracy: {accuracy}')

In [None]:
# Predicted probabilities of the general-readmission CNN on the test set.
test_predictions = gen_model.predict(gen_test_padded)

# Set a threshold to classify predictions as positive or negative
threshold = 0.7
predicted_labels = (test_predictions >= threshold).astype(int).ravel().tolist()

accuracy, precision, recall, f1 = (
    score_fn(gen_test_labels, predicted_labels)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 score: {f1:.2f}")

In [None]:
# Keep only the test notes the general-readmission CNN classified correctly; these feed
# the chi-square word analysis further down.
gen_is_correct = predicted_labels == gen_test_labels
gen_correct_indices = np.where(gen_is_correct)[0]
gen_correct_texts, gen_correct_labels = gen_test_texts[gen_correct_indices], gen_test_labels[gen_correct_indices]

In [None]:
# Padded sequences, vocabulary size and embedding matrix for the thirty-day CNN.
# This reassigns `max_sequence_length`, which previously held the general-task value.
max_sequence_length, thirty_day_train_padded, thirty_day_test_padded, thirty_day_vocab_size, thirty_day_embedding_matrix = build_embedding_info(thirty_day_train_texts, thirty_day_test_texts)

In [None]:
# Compiled CNN for the thirty-day readmission task.
thirty_day_model = build_CNN(max_sequence_length, embedding_dim, thirty_day_vocab_size, thirty_day_embedding_matrix)

In [None]:
%%time
# Train for 10 epochs with batches of 10, using validation_split=0.1 of the training data.
thirty_day_model.fit(thirty_day_train_padded, thirty_day_train_labels, epochs=10, batch_size=10, validation_split=0.1)

In [None]:
# Loss and the compiled `accuracy` metric on the held-out thirty-day test set.
loss, accuracy = thirty_day_model.evaluate(thirty_day_test_padded, thirty_day_test_labels)
print(f'Test set accuracy: {accuracy}')

In [None]:

# Predicted probabilities of the thirty-day CNN on the test set.
test_predictions = thirty_day_model.predict(thirty_day_test_padded)

# Set a threshold to classify predictions as positive or negative
threshold = 0.7
predicted_labels = (test_predictions >= threshold).astype(int).ravel().tolist()

accuracy, precision, recall, f1 = (
    score_fn(thirty_day_test_labels, predicted_labels)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 score: {f1:.2f}")

In [None]:
# Keep only the test notes the thirty-day CNN classified correctly; these feed the
# chi-square word analysis further down.
thirty_day_is_correct = predicted_labels == thirty_day_test_labels
thirty_day_correct_indices = np.where(thirty_day_is_correct)[0]
thirty_day_correct_texts, thirty_day_correct_labels = thirty_day_test_texts[thirty_day_correct_indices], thirty_day_test_labels[thirty_day_correct_indices]

In [None]:
# Chi-square analysis on the correctly classified general-readmission notes:
# print the highest-scoring words.
gen_words, gen_scores = chi_square_scores(gen_correct_texts, gen_correct_labels)
num_features = 20
# Slicing (rather than indexing 0..num_features-1) keeps this from raising IndexError
# when the vocabulary holds fewer than num_features words.
for word, score in zip(gen_words[:num_features], gen_scores[:num_features]):
    print(f"{word}: {score:.2f}")

In [None]:
# Chi-square analysis on the correctly classified thirty-day readmission notes:
# print the highest-scoring words.
thirty_day_words, thirty_day_scores = chi_square_scores(thirty_day_correct_texts, thirty_day_correct_labels)
num_features = 20
# Slicing (rather than indexing 0..num_features-1) keeps this from raising IndexError
# when the vocabulary holds fewer than num_features words.
for word, score in zip(thirty_day_words[:num_features], thirty_day_scores[:num_features]):
    print(f"{word}: {score:.2f}")