# Group DARK Project 

In [1]:
#general imports
import pandas as pd 
from bs4 import BeautifulSoup
import bs4
import tensorflow as tf
import glob
import re
import xml.etree.ElementTree as etree
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [25]:
# First, the data needs to be parsed from the XML files
soup = BeautifulSoup() #create beautiful soup object 
path_list = glob.glob('data/*.xml') #get the path to each xml file in the data folder

data_dicts = [] #start a list where the dictionaries will be appended to create the data frame
for f in path_list: # loop through each xml and store the data into the beautiful soup object
    
    soup = BeautifulSoup(open(f), 'html.parser') # I had to use html in order to extract the CData from the XML

    #find the text of the note, which is stored in the CData 
    text = soup.find(text=lambda text: isinstance(text, bs4.CData)).string.strip()

    
    #use regular expression to clean the notes 
    text = re.sub(r'\n+', ' ', text) #remove new paragraph 
    text = re.sub(r'\s\s+\s*', '', text) #remove multiple spaces
    text = re.sub(r'\_+', '', text) #remove underline
    
    #gather smoker status
    try:
        smoker = soup.find('smoker').get('status') #find the smoker status
    except AttributeError: #if the smoker status is unlisted, it will be recorded as 'unknown'
        smoker = 'unknown'
   
    #find the status of coronary artery disease, 'unknown' will be stored if not known
    try:
        CAD = soup.find('cad').get('indicator')
        
    except AttributeError: 
        
        CAD = 'unknown'
    
    f = re.search(r'\d+\-\d+\.\w+', f).group()
    data_dicts.append({'Note': text, 'Smoker': smoker, 'CAD': CAD, 'File name': f})
    
    
    

df = pd.DataFrame(data_dicts) #create a pandas data frame from the dictionaries collected 
df = df.drop_duplicates() #in case some files get passed mulitple times, we'll delete any duplicates
        


In [26]:
df.head()

Unnamed: 0,Note,Smoker,CAD,File name
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,past,unknown,379-03.xml
1,Record date: 2093-01-13 Team X Intern Admissio...,unknown,test,279-03.xml
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",unknown,unknown,119-01.xml
3,Record date: 2062-03-27 Hematology Clinic Prog...,never,unknown,304-03.xml
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,past,test,204-03.xml


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1304 entries, 0 to 1303
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Note       1304 non-null   object
 1   Smoker     1304 non-null   object
 2   CAD        1304 non-null   object
 3   File name  1304 non-null   object
dtypes: object(4)
memory usage: 40.9+ KB


In [124]:
df.loc[20]['Note'] #display an example note from the data

"Record date: 2148-07-28 Internal Medicine Intern Admit Note Pt: Samuel Tuttle MR: 2793372 Cc: EKG changesHPI: 66 y.o. male with pmhx of DM, incr.cholestoral, and autonomic insufficeny admitted for EKG changes. The patient has significant risk factors for CAD of DM, incr. Cholesterol, and h/o htn.He had a adenosine stress test done in 11/47 which was negative for ischemia (lasting 5min and PDP of 13746). His last echo was done in 4/48 showed an EF of 59% presented to the health clinic with a 3 day hx of worsening L shoulder pain and episodes of diaphoresis. The shoulder pain is chronic and thought to be due to oa, and dm chor.He came to the clinic today for a BP check ( he is often hypotensive and orthostatic) and was noted to have TWI inferiorly and laterally on routine EKG.He denied any chest pain, nausea, vomiting when he went to the e.r.In the e.r., his first set of cardiac enzymes was negative, he was also started on heparin IV, and given IV lopressor, and an aspirin. Of note, the

In [27]:
df['Smoker'].value_counts() #view unique values in the 'smoker' column

unknown    635
never      304
past       262
current     91
ever        12
Name: Smoker, dtype: int64

In [28]:
# assuming, 'ever' was a typo of 'never'
df['Smoker'] = df['Smoker'].replace({'ever': 'never'})

In [29]:
df['Smoker'].value_counts() #verify the change took place

unknown    635
never      316
past       262
current     91
Name: Smoker, dtype: int64

In [30]:
df['CAD'].value_counts() # view unique CAD values

unknown    734
mention    337
test        91
event       83
symptom     59
Name: CAD, dtype: int64

In [31]:
df.to_csv('data.csv', index=False)

In [32]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Note,Smoker,CAD,File name
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,past,unknown,379-03.xml
1,Record date: 2093-01-13 Team X Intern Admissio...,unknown,test,279-03.xml
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",unknown,unknown,119-01.xml
3,Record date: 2062-03-27 Hematology Clinic Prog...,never,unknown,304-03.xml
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,past,test,204-03.xml


In [34]:
df['Smoker'] = df['Smoker'].replace({'unknown': 'unknown', 'never': 'no', 'past': 'yes', 'current': 'yes'})


df.head()

Unnamed: 0,Note,Smoker,CAD,File name
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,yes,unknown,379-03.xml
1,Record date: 2093-01-13 Team X Intern Admissio...,unknown,test,279-03.xml
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",unknown,unknown,119-01.xml
3,Record date: 2062-03-27 Hematology Clinic Prog...,no,unknown,304-03.xml
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,yes,test,204-03.xml


In [35]:
# we will make a binary function to classify each value to either "yes" or "unknown" for coronary artery disease 
def binary(row):
    if row == 'unknown':
        val = 'unknown'
    else:
        val = 'yes'
    return val


df['CAD'] = df['CAD'].apply(binary)

df.head()

Unnamed: 0,Note,Smoker,CAD,File name
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,yes,unknown,379-03.xml
1,Record date: 2093-01-13 Team X Intern Admissio...,unknown,yes,279-03.xml
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",unknown,unknown,119-01.xml
3,Record date: 2062-03-27 Hematology Clinic Prog...,no,unknown,304-03.xml
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,yes,yes,204-03.xml


In [36]:
# export df to csv to make re-running easier
df.to_csv('data.csv', index=False)

In [37]:
#read databack in
df = pd.read_csv('data.csv')

df.head()

Unnamed: 0,Note,Smoker,CAD,File name
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,yes,unknown,379-03.xml
1,Record date: 2093-01-13 Team X Intern Admissio...,unknown,yes,279-03.xml
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",unknown,unknown,119-01.xml
3,Record date: 2062-03-27 Hematology Clinic Prog...,no,unknown,304-03.xml
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,yes,yes,204-03.xml


In [39]:
#create a function to format the data frame and generate dummy variables 
def dummies():
    df = pd.read_csv('data.csv', usecols=['Smoker', 'CAD'])
    new_df = pd.get_dummies(df)
    df = pd.read_csv('data.csv', usecols=['Note'])
    new_df = pd.concat([df, new_df], axis=1)
    return new_df 

In [40]:
new_df = dummies()
new_df.head(10)

Unnamed: 0,Note,Smoker_no,Smoker_unknown,Smoker_yes,CAD_unknown,CAD_yes
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,0,0,1,1,0
1,Record date: 2093-01-13 Team X Intern Admissio...,0,1,0,0,1
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",0,1,0,1,0
3,Record date: 2062-03-27 Hematology Clinic Prog...,1,0,0,1,0
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,0,0,1,0,1
5,Record date: 2084-01-07Fellow/NP Xiang Reason ...,1,0,0,0,1
6,Record date: 2081-09-06 09/06/81 Follow up aft...,1,0,0,1,0
7,Record date: 2082-10-30 COLIN RAMEY CHH Unit N...,1,0,0,1,0
8,Record date: 2079-11-09 Cocke County Baptist H...,1,0,0,1,0
9,Record date: 2062-07-21 Halcyon House Intern A...,0,1,0,1,0


In [41]:
new_df.to_csv('formatted data.csv', index=False)

In [42]:
new_df = pd.read_csv('formatted data.csv')
new_df.head()

Unnamed: 0,Note,Smoker_no,Smoker_unknown,Smoker_yes,CAD_unknown,CAD_yes
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,0,0,1,1,0
1,Record date: 2093-01-13 Team X Intern Admissio...,0,1,0,0,1
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",0,1,0,1,0
3,Record date: 2062-03-27 Hematology Clinic Prog...,1,0,0,1,0
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,0,0,1,0,1


In [43]:
from keras.preprocessing.text import text_to_word_sequence
import pandas as pd


def prepareTextFeatures():
    df = pd.read_csv("formatted data.csv",usecols =['Note'])
    texts = df.values
    processeddocs = []
    for idx, sentence in enumerate(texts):
        processedSentence = text_to_word_sequence(sentence[0])
        corpus = ''
        for word in processedSentence:
            corpus = corpus + ' ' + word
        processeddocs.append(corpus)
        
    print(processeddocs[0])
    return processeddocs
corpus = prepareTextFeatures()

Using TensorFlow backend.


 record date 2093 04 28bmc emergency dept visitnancy jasmine nikkolas693 31 18 1visit date 04 28 93 the patient was seen interviewed and examined by myself as well asdr judd whose note i have reviewed and whose findings i haveconfirmed history of presenting complaint this patient is a 68 year oldmale who presents with slurred speech the patient has a historyof a prior cerebrovascular accident with residual right sideddeficit one day prior to arrival the patient was found to haveincreasing slurring of speech and he was found to increasingweakness over his baseline on the right side the patient arriveswithout acute complaints except for difficulty speaking he deniesany headache he also does complain of some throat pain review of systems review of systems is as per the written note the patient denies fevers chills chest pain shortness of breathor dysphagia past medical history the patient's past medical history issignificant for hypertension diabetes mellitus cerebrovascularaccident and o

## RNN

Try an recurrent neural network to classify all the labels

In [44]:
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Bidirectional, Flatten, Input, Embedding
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import roc_curve, auc
from keras_tqdm import TQDMNotebookCallback
LABELS = list(new_df.columns)
LABELS = LABELS[1:]


def create_model(voc):
    sequence_input = Input(shape=(100,), dtype='int32')
    embedded_sequences = Embedding(voc, 64, input_length=100)(sequence_input)
    rnn_layer = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3,recurrent_dropout=0),
                              merge_mode='concat')(embedded_sequences)
    l_flat = Flatten()(rnn_layer)
    l_dense = Dense(150, activation='relu')(l_flat)
    l_dense = Dense(120, activation='relu')(l_dense)
    l_dense = Dense(100, activation='relu')(l_dense)
    l_dense = Dense(50, activation='relu')(l_dense)
    l_dense = Dense(50, activation='relu')(l_dense)
    preds = Dense(len(LABELS), activation='sigmoid')(l_dense)
    model = Model(inputs=sequence_input, outputs=preds)
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model
df = pd.read_csv("formatted data.csv")
train, test = train_test_split(df, random_state=42, test_size=0.3, shuffle=True)


X_train = train["Note"].values
X_test = test["Note"].values
Y_train = train[LABELS].values
Y_test = test[LABELS].values




tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(df["Note"].values)
voc_size = len(tokenizer.word_index)+1


X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=100, padding='post')
voc_size = len(tokenizer.word_index)+1


X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=100, padding='post')

model = create_model(voc_size)

model.fit(x=X_train, y=Y_train, batch_size=128,validation_split=0.3, epochs=10,
          verbose=0,callbacks=[TQDMNotebookCallback(leave_inner=True)])


total=0
prediction = model.predict(X_test)
for idx,category in enumerate(LABELS):
    fpr, tpr, _ = roc_curve(Y_test[:, idx], prediction[:, idx])
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('{}: \n Test auc is {}'.format(category,roc_auc))

print("Macro Average AUC:" + str(total/len(LABELS)))

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 64)           2887104   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          66048     
_________________________________________________________________
flatten_1 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 150)               1920150   
_________________________________________________________________
dense_2 (Dense)              (None, 120)               18120     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               1210

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


HBox(children=(FloatProgress(value=0.0, description='Training', max=10.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=638.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=638.0, style=ProgressStyle(description_widt…



Smoker_no: 
 Test auc is 0.6792174632286835
Smoker_unknown: 
 Test auc is 0.8128600099043448
Smoker_yes: 
 Test auc is 0.719700168918919
CAD_unknown: 
 Test auc is 0.5736991535542759
CAD_yes: 
 Test auc is 0.580545015522594
Macro Average AUC:0.6732043622257635


## CNN

Try a convolutional neural network to accomplish the task

In [45]:
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Input, Embedding

def create_model(voc):
    sequence_input = Input(shape=(100,), dtype='int32')
    embedded_sequences = Embedding(voc, 64, input_length=100)(sequence_input)
    l_cov1= Conv1D(128, 3, activation='relu')(embedded_sequences)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(128, 3, activation='relu')(l_pool1)
    l_pool2 = MaxPooling1D(17)(l_cov2)  # global max pooling
    l_flat = Flatten()(l_pool2)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(5, activation='sigmoid')(l_dense)
    model = Model(inputs=sequence_input, outputs=preds)
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


model = create_model(voc_size)

model.fit(x=X_train, y=Y_train,batch_size=158,validation_split=0.2, epochs=10,verbose=0,callbacks=[TQDMNotebookCallback(leave_inner=True)])


total=0
prediction = model.predict(X_test)
for idx,category in enumerate(LABELS):
    fpr, tpr, _ = roc_curve(Y_test[:, idx], prediction[:, idx])
    roc_auc = auc(fpr, tpr)
    total+=roc_auc
    print('{} \n Test auc is {}'.format(category,roc_auc))

print("Macro Average AUC:" + str(total/len(LABELS)))

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 64)           2887104   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 128)           24704     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 19, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 17, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 128)               0   

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


HBox(children=(FloatProgress(value=0.0, description='Training', max=10.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=729.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=729.0, style=ProgressStyle(description_widt…



Smoker_no 
 Test auc is 0.5940230877117272
Smoker_unknown 
 Test auc is 0.7888810696692471
Smoker_yes 
 Test auc is 0.7101280968468469
CAD_unknown 
 Test auc is 0.5849762517579006
CAD_yes 
 Test auc is 0.5676758563961047
Macro Average AUC:0.6491368724763652
