In [1]:
import pandas as pd
import regex as re

train_file = 'MIMIC_III_train.csv'
test_file = 'MIMIC_III_test.csv'
validation_file = 'MIMIC_III_validation.csv'

train = pd.read_csv(train_file, index_col=0)
test = pd.read_csv(test_file, index_col=0)
validation = pd.read_csv(validation_file, index_col=0)

# Remove all columns except for TEXT and DIAGNOSIS
train = train[['TEXT', 'DIAGNOSIS']]
test = test[['TEXT', 'DIAGNOSIS']]
validation = validation[['TEXT', 'DIAGNOSIS']]

def preprocess(text):
    # Remove all non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, re.I|re.A)
    # Remove all numbers
    text = re.sub(r'\d+', '', text)
    # Remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    # Converting to Lowercase
    text = text.lower()
    return text


train['TEXT'] = train['TEXT'].apply(preprocess)
test['TEXT'] = test['TEXT'].apply(preprocess)
validation['TEXT'] = validation['TEXT'].apply(preprocess)

print(train.head())

                                                       TEXT DIAGNOSIS
ROW_ID_x                                                             
21        admission date discharge date date of birth se...     Other
21        normal sinus rhythm nondiagnostic repolarizati...     Other
21        normal sinus rhythm without diagnostic abnorma...     Other
21         pm chest portable ap clip clip number radiolo...     Other
21         am ct head wo contrast clip clip number radio...     Other


In [5]:
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train['TEXT'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

2023-11-20 12:35:06.293533: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-20 12:35:06.401850: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-20 12:35:06.401904: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-20 12:35:06.404772: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-20 12:35:06.419486: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-20 12:35:06.420709: I tensorflow/core/platform/cpu_feature_guard.cc:1

Found 60530 unique tokens.


In [8]:
# Pad sequences

# Train
X_train = tokenizer.texts_to_sequences(train['TEXT'].values)
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train data tensor:', X_train.shape)

# Test
X_test = tokenizer.texts_to_sequences(test['TEXT'].values)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test data tensor:', X_test.shape)

# Validation
X_validation = tokenizer.texts_to_sequences(validation['TEXT'].values)
X_validation = pad_sequences(X_validation, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of validation data tensor:', X_validation.shape)



Shape of train data tensor: (30000, 250)
Shape of test data tensor: (10000, 250)
Shape of validation data tensor: (15000, 250)


In [9]:
# Convert categorical labels to numbers
Y_train = pd.get_dummies(train['DIAGNOSIS']).values
print('Shape of train label tensor:', Y_train.shape)

Y_test = pd.get_dummies(test['DIAGNOSIS']).values
print('Shape of test label tensor:', Y_test.shape)

Y_validation = pd.get_dummies(validation['DIAGNOSIS']).values
print('Shape of validation label tensor:', Y_validation.shape)

Shape of train label tensor: (30000, 11)
Shape of test label tensor: (10000, 11)
Shape of validation label tensor: (15000, 11)


In [12]:
# Print proportions of each class
import numpy as np
unique, counts = np.unique(train['DIAGNOSIS'], return_counts=True)
total = sum(counts)
print(dict(zip(unique, counts/total)))

{'ABDOMINAL PAIN': 522, 'ALTERED MENTAL STATUS': 327, 'CHEST PAIN': 77, 'CONGESTIVE HEART FAILURE': 419, 'CORONARY ARTERY DISEASE': 476, 'FEVER': 92, 'INTRACRANIAL HEMORRHAGE': 36, 'NEWBORN': 9573, 'Other': 17129, 'PNEUMONIA': 653, 'SEPSIS': 696}


In [10]:
# Create the model
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_validation, Y_validation), callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

2023-11-20 12:41:29.606316: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:02:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-20 12:41:29.609044: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# Test accuracy
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

Test set
  Loss: 0.809
  Accuracy: 0.844
