# Group DARK Project 

In [50]:
import pandas as pd 
from bs4 import BeautifulSoup
import bs4
import tensorflow as tf
import glob
import re
import xml.etree.ElementTree as etree
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB


In [24]:
soup = BeautifulSoup()
path_list = glob.glob('data/*.xml')

data_dicts = []
for f in path_list:
    
     
    soup = BeautifulSoup(open(f), 'html.parser') # I had to use html in order to extract the CData from the XML

    #find the text of the note, which is stored in the CData 
    text = soup.find(text=lambda text: isinstance(text, bs4.CData)).string.strip()

    
    #use regular expression to clean the notes 
    text = re.sub(r'\n+', ' ', text) #remove new paragraph 
    text = re.sub(r'\s\s+\s*', '', text) #remove multiple spaces
    text = re.sub(r'\_+', '', text) #remove underline
    
    #gather smoker status
    try:
        smoker = soup.find('smoker').get('status') #find the smoker status
    except AttributeError:
        smoker = 'unknown'
   
    #find the status of coronary artery disease, 'unknown' will be stored if not known
    try:
        CAD = soup.find('cad').get('indicator')
        
    except AttributeError: 
        
        CAD = 'unknown'
    
    data_dicts.append({'Note': text, 'Smoker': smoker, 'CAD': CAD, 'File name': f})
    
    
    

df = pd.DataFrame(data_dicts)
df = df.drop_duplicates() #in case some files get passed mulitple times, we'll delete any duplicates
        


In [25]:
df.head()

Unnamed: 0,Note,Smoker,CAD,File name
0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,past,unknown,data/379-03.xml
1,Record date: 2093-01-13 Team X Intern Admissio...,unknown,test,data/279-03.xml
2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",unknown,unknown,data/119-01.xml
3,Record date: 2062-03-27 Hematology Clinic Prog...,never,unknown,data/304-03.xml
4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,past,test,data/204-03.xml


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1304 entries, 0 to 1303
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Note       1304 non-null   object
 1   Smoker     1304 non-null   object
 2   CAD        1304 non-null   object
 3   File name  1304 non-null   object
dtypes: object(4)
memory usage: 50.9+ KB


In [28]:
df.loc[20]['Note'] #display an example note from the data

"Record date: 2148-07-28 Internal Medicine Intern Admit Note Pt: Samuel Tuttle MR: 2793372 Cc: EKG changesHPI: 66 y.o. male with pmhx of DM, incr.cholestoral, and autonomic insufficeny admitted for EKG changes. The patient has significant risk factors for CAD of DM, incr. Cholesterol, and h/o htn.He had a adenosine stress test done in 11/47 which was negative for ischemia (lasting 5min and PDP of 13746). His last echo was done in 4/48 showed an EF of 59% presented to the health clinic with a 3 day hx of worsening L shoulder pain and episodes of diaphoresis. The shoulder pain is chronic and thought to be due to oa, and dm chor.He came to the clinic today for a BP check ( he is often hypotensive and orthostatic) and was noted to have TWI inferiorly and laterally on routine EKG.He denied any chest pain, nausea, vomiting when he went to the e.r.In the e.r., his first set of cardiac enzymes was negative, he was also started on heparin IV, and given IV lopressor, and an aspirin. Of note, the

In [29]:
df['Smoker'].value_counts()

unknown    635
never      304
past       262
current     91
ever        12
Name: Smoker, dtype: int64

In [30]:
# assuming, 'ever' was a typo of 'never'
df['Smoker'] = df['Smoker'].replace({'ever': 'never'})

In [31]:
df['Smoker'].value_counts() #verify the change took place

unknown    635
never      316
past       262
current     91
Name: Smoker, dtype: int64

In [32]:
df['CAD'].value_counts()

unknown    734
mention    337
test        91
event       83
symptom     59
Name: CAD, dtype: int64

In [35]:
df['Smoker'].value_counts()

unknown    635
never      316
past       262
current     91
Name: Smoker, dtype: int64

In [36]:
#create binary classifications for smoker and CAD 

#since we don't know if 'unknown' means non-smoker, we will drop it from the dataset
smoker_replacements = {'unknown': np.nan, 'never': 0, 'past': 1, 'current': 1}

#for CAD, we will assume 'unknown' to be someone for which there's no reason to assume coronary artery disease
def binary(row):
    if row == 'unknown':
        val = 0
    else:
        val = 1
    return val

df['binary smoker'] = df['Smoker'].replace(smoker_replacements)

df['binary CAD'] = df['CAD'].apply(binary)

df.head()

Unnamed: 0.1,Unnamed: 0,Note,Smoker,CAD,File name,binary smoker,binary CAD
0,0,Record date: 2093-04-28BMC EMERGENCY DEPT VISI...,past,unknown,data/379-03.xml,1.0,0
1,1,Record date: 2093-01-13 Team X Intern Admissio...,unknown,test,data/279-03.xml,,1
2,2,"Record date: 2088-05-21 Patient Name: CURTIS, ...",unknown,unknown,data/119-01.xml,,0
3,3,Record date: 2062-03-27 Hematology Clinic Prog...,never,unknown,data/304-03.xml,0.0,0
4,4,Record date: 2135-12-15CARDIOLOGYPERDUE MEDICA...,past,test,data/204-03.xml,1.0,1


In [69]:
# export df to csv to make re-running easier
df.to_csv('data.csv')

In [70]:
#read data back in
df = pd.read_csv('data.csv')

In [72]:
# since this project will require training two neural networks, and we have to drop some values for the smoker model, 
#we will split them into two seperate datasets. 

df_smoker = df[['Note', 'binary smoker']].dropna()

df_smoker.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 669 entries, 0 to 1303
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Note           669 non-null    object 
 1   binary smoker  669 non-null    float64
dtypes: float64(1), object(1)
memory usage: 15.7+ KB


In [73]:
df_CAD = df[['Note', 'binary CAD']]
df_CAD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1304 entries, 0 to 1303
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Note        1304 non-null   object
 1   binary CAD  1304 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.5+ KB


In [39]:
#import packages for the tensorflow model
# !pip install -q tensorflow-hub
# !pip install -q tfds-nightly
import tensorflow_hub as hub
import tensorflow_datasets as tfds

In [74]:
X_C = df_CAD['Note']
y_C = df_CAD['binary CAD']

X_S = df_smoker['Note']
y_S = df_smoker['binary smoker']

In [75]:
#split both of the datasets into testing and training
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, random_state=42, test_size=.2)

X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(X_S, y_S, random_state=42, test_size=.2)

In [81]:
#model architecture. Largely taken from TensorFlow documentation, further hyperparameter tuning will be needed

embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
#first, fit the model to the CAD data

model.fit(X_train_C, y_train_C,
          batch_size=50,
          epochs=30,
          verbose=1,
          validation_data=(X_test_C, y_test_C))

#get classification report to further analyze the model performance
pred = model.predict(X_test_C)
preds = np.where(pred > .5, 1, 0)

print(classification_report(y_test_C, preds, digits=2))
print(confusion_matrix(y_test_C, preds))

Train on 1043 samples, validate on 261 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
              precision    recall  f1-score   support

           0       0.74      0.83      0.78       150
           1       0.73      0.59      0.65       111

    accuracy                           0.73       261
   macro avg       0.73      0.71      0.72       261
weighted avg       0.73      0.73      0.73       261

[[125  25]
 [ 45  66]]


In [82]:
#Now fit the model to the smoker data

embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

model_S = tf.keras.Sequential()
model_S.add(hub_layer)
model_S.add(tf.keras.layers.Dense(4, activation='relu'))
model_S.add(tf.keras.layers.Dense(1))
model_S.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model_S.fit(X_train_S, y_train_S,
          batch_size=50,
          epochs=70,
          verbose=1,
          validation_data=(X_test_S, y_test_S))

#get classification report to further analyze the model performance
pred = model_S.predict(X_test_S)
preds = np.where(pred > .5, 1, 0)

print(classification_report(y_test_S, preds, digits=2))
print(confusion_matrix(y_test_S, preds))

Train on 535 samples, validate on 134 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70


Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
              precision    recall  f1-score   support

         0.0       0.60      0.74      0.66        62
         1.0       0.72      0.57      0.64        72

    accuracy                           0.65       134
   macro avg       0.66      0.66      0.65       134
weighted avg       0.66      0.65      0.65       134

[[46 16]
 [31 41]]


In [67]:
# Try Naive Bayes MultinomialNB classifier with TfidfVecorizer 
stop_words = set(stopwords.words('english')) #define the stopwords

#vectorizer = TfidfVectorizer(max_df= 2.0, min_df=2, stop_words=stop_words)

vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words)

X_C = vectorizer.fit_transform(df_CAD['Note'])
#y_C already defined

#re-split the data
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, random_state=42, test_size=.2)


clf = MultinomialNB(alpha=0)
clf.fit(X_train_C, y_train_C)
y_pred = clf.predict(X_test_C)

print(f'Accuracy: {accuracy_score(y_test_C, y_pred)}')
print(f'\nClassification report:\n{classification_report(y_test_C, y_pred)}')
print(f'\nConfusion matrix: \n{confusion_matrix(y_test_C, y_pred)}')


Accuracy: 0.7854406130268199

Classification report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81       150
           1       0.74      0.77      0.75       111

    accuracy                           0.79       261
   macro avg       0.78      0.78      0.78       261
weighted avg       0.79      0.79      0.79       261


Confusion matrix: 
[[119  31]
 [ 25  86]]


  'setting alpha = %.1e' % _ALPHA_MIN)


In [68]:
# Try the MultinomialNB on the smoker data 

stop_words = set(stopwords.words('english')) #define the stopwords

vectorizer = TfidfVectorizer(max_df= 2.0, min_df=2, stop_words=stop_words)

#vectorizer = CountVectorizer(stop_words=stop_words)

X_S = vectorizer.fit_transform(df_smoker['Note'])
#y_S already defined

#re-split the data
X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(X_S, y_S, random_state=42, test_size=.2)


clf = MultinomialNB(alpha=0)
clf.fit(X_train_S, y_train_S)
y_pred = clf.predict(X_test_S)

print(f'Accuracy: {accuracy_score(y_test_S, y_pred)}')
print(f'\nClassification report:\n{classification_report(y_test_S, y_pred)}')
print(f'\nConfusion matrix: \n{confusion_matrix(y_test_S, y_pred)}')

Accuracy: 0.6716417910447762

Classification report:
              precision    recall  f1-score   support

         0.0       0.65      0.63      0.64        62
         1.0       0.69      0.71      0.70        72

    accuracy                           0.67       134
   macro avg       0.67      0.67      0.67       134
weighted avg       0.67      0.67      0.67       134


Confusion matrix: 
[[39 23]
 [21 51]]


  'setting alpha = %.1e' % _ALPHA_MIN)
