In [2]:
# Importing necessary libraries
import pandas as pd
import regex as re
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping

2023-12-03 13:50:53.940252: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-03 13:50:54.259233: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-03 13:50:54.259327: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-03 13:50:54.273388: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-03 13:50:54.332193: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-03 13:50:54.335814: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [3]:
# Loading data
train_file = 'MIMIC_III_train.csv'
test_file = 'MIMIC_III_test.csv'
validation_file = 'MIMIC_III_validation.csv'

train = pd.read_csv(train_file, index_col=0)
test = pd.read_csv(test_file, index_col=0)
validation = pd.read_csv(validation_file, index_col=0)

In [4]:
# Selecting relevant columns
train = train[['TEXT', 'DIAGNOSIS']]
test = test[['TEXT', 'DIAGNOSIS']]
validation = validation[['TEXT', 'DIAGNOSIS']]

In [5]:
# Text preprocessing function
def preprocess(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, re.I|re.A)  # Remove non-alphanumeric characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)  # Remove single characters from the start
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Substitute multiple spaces with single space
    text = re.sub(r'^b\s+', '', text)  # Remove prefixed 'b'
    text = text.lower()  # Convert to lowercase
    return text

In [6]:
# Applying text preprocessing to the data
train['TEXT'] = train['TEXT'].apply(preprocess)
test['TEXT'] = test['TEXT'].apply(preprocess)
validation['TEXT'] = validation['TEXT'].apply(preprocess)

In [7]:
# Tokenizing and padding sequences
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 250

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train['TEXT'].values)
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Found 60530 unique tokens.


In [8]:
# Pad sequences for train, test, and validation
X_train = pad_sequences(tokenizer.texts_to_sequences(train['TEXT'].values), maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(tokenizer.texts_to_sequences(test['TEXT'].values), maxlen=MAX_SEQUENCE_LENGTH)
X_validation = pad_sequences(tokenizer.texts_to_sequences(validation['TEXT'].values), maxlen=MAX_SEQUENCE_LENGTH)

In [9]:
# For any diagnosis in test and validate not in train, replace with 'Other'
test['DIAGNOSIS'] = test['DIAGNOSIS'].apply(lambda x: 'Other' if x not in train['DIAGNOSIS'].unique() else x)
validation['DIAGNOSIS'] = validation['DIAGNOSIS'].apply(lambda x: 'Other' if x not in train['DIAGNOSIS'].unique() else x)

# One-hot encoding the target variable
Y_train = pd.get_dummies(train['DIAGNOSIS']).values

# Have Y_test and Y_validation use the same one-hot encoding as Y_train
Y_test = pd.get_dummies(test['DIAGNOSIS']).reindex(columns = pd.get_dummies(train['DIAGNOSIS']).columns, fill_value=False).values
Y_validation = pd.get_dummies(validation['DIAGNOSIS']).reindex(columns = pd.get_dummies(train['DIAGNOSIS']).columns, fill_value=False).values

# Print diagnoses and their corresponding column number
diagnoses = pd.get_dummies(train['DIAGNOSIS']).columns
for i in range(len(diagnoses)):
    print("Diagnosis: " + diagnoses[i] + " Column: " + str(i))

Diagnosis: ABDOMINAL PAIN Column: 0
Diagnosis: ACUTE CORONARY SYNDROME Column: 1
Diagnosis: ALTERED MENTAL STATUS Column: 2
Diagnosis: CARDIAC ARREST Column: 3
Diagnosis: CHEST PAIN Column: 4
Diagnosis: CONGESTIVE HEART FAILURE Column: 5
Diagnosis: CORONARY ARTERY DISEASE Column: 6
Diagnosis: CORONARY ARTERY DISEASE\CATH Column: 7
Diagnosis: CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA Column: 8
Diagnosis: CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT/SDA Column: 9
Diagnosis: DIABETIC KETOACIDOSIS Column: 10
Diagnosis: DYSPNEA Column: 11
Diagnosis: FEVER Column: 12
Diagnosis: GASTROINTESTINAL BLEED Column: 13
Diagnosis: HEAD BLEED Column: 14
Diagnosis: HYPOTENSION Column: 15
Diagnosis: HYPOXIA Column: 16
Diagnosis: INTRACRANIAL HEMORRHAGE Column: 17
Diagnosis: LIVER FAILURE Column: 18
Diagnosis: LOWER GI BLEED Column: 19
Diagnosis: MYOCARDIAL INFARCTION Column: 20
Diagnosis: NEWBORN Column: 21
Diagnosis: Other Column: 22
Diagnosis: PANCREATITIS Column: 23
Diagnosis:

In [10]:
# Print the shape of the data
print('Shape of train data tensor:', X_train.shape)
print('Shape of train label tensor:', Y_train.shape)
print('Shape of test data tensor:', X_test.shape)  
print('Shape of test label tensor:', Y_test.shape)
print('Shape of validation data tensor:', X_validation.shape)
print('Shape of validation label tensor:', Y_validation.shape)

# Validate that there is exactly one True value per row in Y_train, Y_test, and Y_validation
for i in range(len(Y_train)):
    if sum(Y_train[i]) != 1:
        print("Error: Y_train row " + str(i) + " has " + str(sum(Y_train[i])) + " True values")
for i in range(len(Y_test)):
    if sum(Y_test[i]) != 1:
        print("Error: Y_test row " + str(i) + " has " + str(sum(Y_test[i])) + " True values")
for i in range(len(Y_validation)):
    if sum(Y_validation[i]) != 1:
        print("Error: Y_validation row " + str(i) + " has " + str(sum(Y_validation[i])) + " True values")

Shape of train data tensor: (30000, 250)
Shape of train label tensor: (30000, 37)
Shape of test data tensor: (10000, 250)
Shape of test label tensor: (10000, 37)
Shape of validation data tensor: (15000, 250)
Shape of validation label tensor: (15000, 37)


In [11]:
# Print proportions of each class
unique, counts = np.unique(train['DIAGNOSIS'], return_counts=True)
total = sum(counts)
print("Proportions of each class:")
for i in range(len(unique)):
    print(unique[i], counts[i]/total)

Proportions of each class:
ABDOMINAL PAIN 0.0174
ACUTE CORONARY SYNDROME 0.0029333333333333334
ALTERED MENTAL STATUS 0.0109
CARDIAC ARREST 0.0023
CHEST PAIN 0.0025666666666666667
CONGESTIVE HEART FAILURE 0.013966666666666667
CORONARY ARTERY DISEASE 0.015866666666666668
CORONARY ARTERY DISEASE\CATH 0.0026
CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA 0.0011666666666666668
CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT/SDA 0.005666666666666667
DIABETIC KETOACIDOSIS 0.00046666666666666666
DYSPNEA 0.0031333333333333335
FEVER 0.0030666666666666668
GASTROINTESTINAL BLEED 0.017866666666666666
HEAD BLEED 0.0016666666666666668
HYPOTENSION 0.0047
HYPOXIA 0.0026
INTRACRANIAL HEMORRHAGE 0.0012
LIVER FAILURE 0.0014333333333333333
LOWER GI BLEED 0.0008
MYOCARDIAL INFARCTION 0.002533333333333333
NEWBORN 0.3191
Other 0.4522
PANCREATITIS 0.009266666666666666
PNEUMONIA 0.021766666666666667
PREMATURITY 0.015966666666666667
RESPIRATORY DISTRESS 6.666666666666667e-05
RESPIRATORY FAILURE 0

In [3]:
# Load embeddings from 'glove_model_fine_tuned.txt'
embeddings_index = {}
f = open('glove_model_fine_tuned.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        pass
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# Create embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = embeddings_index[word]
        embedding_matrix[i] = embedding_vector
    except:
        pass

In [12]:
# Create the model using embedding matrix
model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, input_length=X_train.shape[1], weights=[embedding_matrix], trainable=True))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

2023-12-03 13:55:28.668464: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:02:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-03 13:55:28.714341: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [13]:
print(X_train.dtype)
print(Y_train.dtype)

int32
bool


In [55]:
# Training the model
epochs = 5
batch_size = 64

history = model.fit(
    X_train, Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_validation, Y_validation),
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [56]:
# Evaluate model on test set
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

Test set
  Loss: 1.342
  Accuracy: 0.743


In [57]:
# Predictions on test set
y_pred = model.predict(X_test)

# Convert predictions to one-hot encoded format
y_pred_one_hot = np.zeros_like(y_pred)
y_pred_one_hot[np.arange(len(y_pred)), y_pred.argmax(axis=1)] = 1




In [58]:
# For each dianosis, calculate the accuracy, precision, recall, and F1 score; catch divide by zero
for i in range(len(diagnoses)):
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for j in range(len(y_pred_one_hot)):
        if y_pred_one_hot[j][i] == 1 and Y_test[j][i] == 1:
            TP += 1
        elif y_pred_one_hot[j][i] == 1 and Y_test[j][i] == 0:
            FP += 1
        elif y_pred_one_hot[j][i] == 0 and Y_test[j][i] == 1:
            FN += 1
        elif y_pred_one_hot[j][i] == 0 and Y_test[j][i] == 0:
            TN += 1
    try:
        accuracy = (TP + TN) / (TP + FP + FN + TN)
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1_score = 2 * precision * recall / (precision + recall)
        print(diagnoses[i])
        print("Accuracy: " + str(accuracy))
        print("Precision: " + str(precision))
        print("Recall: " + str(recall))
        print("F1 Score: " + str(f1_score))
        print()
    except:
        print(diagnoses[i])
        print("Accuracy: N/A")
        print("Precision: N/A")
        print("Recall: N/A")
        print("F1 Score: N/A")
        print()

ABDOMINAL PAIN
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

ACUTE CORONARY SYNDROME
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

ALTERED MENTAL STATUS
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

CARDIAC ARREST
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

CHEST PAIN
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

CONGESTIVE HEART FAILURE
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

CORONARY ARTERY DISEASE
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

CORONARY ARTERY DISEASE\CATH
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT/SDA
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

DIABETIC KETOACIDOSIS
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

DYSPNEA
Accuracy: N/A
Precision: N/A
Recall: N/A
F1 Score: N/A

FEVER
Accuracy: N/

In [59]:
# For each diagnosis, calculate the number of true positives, false positives, false negatives, and true negatives
for i in range(len(diagnoses)):
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for j in range(len(y_pred_one_hot)):
        if y_pred_one_hot[j][i] == 1 and Y_test[j][i] == 1:
            TP += 1
        elif y_pred_one_hot[j][i] == 1 and Y_test[j][i] == 0:
            FP += 1
        elif y_pred_one_hot[j][i] == 0 and Y_test[j][i] == 1:
            FN += 1
        elif y_pred_one_hot[j][i] == 0 and Y_test[j][i] == 0:
            TN += 1
    print(diagnoses[i])
    print("TP: " + str(TP))
    print("FP: " + str(FP))
    print("FN: " + str(FN))
    print("TN: " + str(TN))
    print()
    

ABDOMINAL PAIN
TP: 0
FP: 15
FN: 46
TN: 9939

ACUTE CORONARY SYNDROME
TP: 0
FP: 0
FN: 40
TN: 9960

ALTERED MENTAL STATUS
TP: 0
FP: 0
FN: 110
TN: 9890

CARDIAC ARREST
TP: 0
FP: 0
FN: 0
TN: 10000

CHEST PAIN
TP: 0
FP: 0
FN: 97
TN: 9903

CONGESTIVE HEART FAILURE
TP: 0
FP: 0
FN: 94
TN: 9906

CORONARY ARTERY DISEASE
TP: 0
FP: 7
FN: 427
TN: 9566

CORONARY ARTERY DISEASE\CATH
TP: 0
FP: 0
FN: 37
TN: 9963

CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA
TP: 0
FP: 0
FN: 0
TN: 10000

CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT/SDA
TP: 0
FP: 0
FN: 20
TN: 9980

DIABETIC KETOACIDOSIS
TP: 0
FP: 0
FN: 8
TN: 9992

DYSPNEA
TP: 0
FP: 0
FN: 48
TN: 9952

FEVER
TP: 0
FP: 0
FN: 7
TN: 9993

GASTROINTESTINAL BLEED
TP: 0
FP: 26
FN: 179
TN: 9795

HEAD BLEED
TP: 0
FP: 0
FN: 0
TN: 10000

HYPOTENSION
TP: 0
FP: 0
FN: 9
TN: 9991

HYPOXIA
TP: 0
FP: 0
FN: 0
TN: 10000

INTRACRANIAL HEMORRHAGE
TP: 0
FP: 0
FN: 280
TN: 9720

LIVER FAILURE
TP: 0
FP: 0
FN: 0
TN: 10000

LOWER GI BLEED
TP: 0
FP: 0
FN: 0
TN: 1

In [65]:
# Export this notebook to HTML
from subprocess import call
call(['python3', '-m', 'nbconvert', 'classification.ipynb' , '--to', 'html'])


[NbConvertApp] Converting notebook classification.ipynb to html
[NbConvertApp] Writing 327269 bytes to classification.html


0