This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
DATA_PATH = './obesity_data/'

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [72]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext
from nltk.corpus import stopwords

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

cachedStopWords = stopwords.words("english")

all_docs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')
all_docs_df_ns = pd.read_pickle(DATA_PATH + '/alldocs_df_ns.pkl')
all_annot_df = pd.read_pickle(DATA_PATH + '/alannot_df.pkl')

In [73]:
all_df = pd.merge(all_docs_df,all_annot_df, on='id')
all_df_ns = pd.merge(all_docs_df_ns,all_annot_df, on='id')

disease_list = all_df['disease'].unique().tolist()

print(disease_list)

['Asthma', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'Gout', 'Hypercholesterolemia', 'Hypertriglyceridemia', 'OA', 'OSA', 'Obesity', 'CAD', 'Hypertension', 'PVD', 'Venous Insufficiency', 'GERD']


In [74]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)

In [75]:
for index, entry in enumerate(all_df_ns['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df_ns.loc[index, 'text_final'] = str(Final_words)

In [81]:
#all_df = all_df[all_df['disease'] == 'CHF']
print(all_df['tok_lem_text'][0])

['wmc', 'am', 'anemia', 'signed', 'dis', 'admission', 'date', 'report', 'status', 'signed', 'discharge', 'date', 'attending', 'truka', 'deon', 'xavier', 'md', 'service', 'bh', 'principal', 'diagnosis', 'anemia', 'and', 'gi', 'bleed', 'secondary', 'diagnosis', 'diabetes', 'mitral', 'valve', 'replacement', 'atrial', 'fibrillation', 'and', 'chronic', 'kidney', 'disease', 'history', 'of', 'present', 'illness', 'the', 'patient', 'is', 'an', 'yearold', 'woman', 'with', 'a', 'history', 'of', 'diabetes', 'chronic', 'kidney', 'disease', 'congestive', 'heart', 'failure', 'with', 'ejection', 'fraction', 'of', 'to', 'who', 'present', 'from', 'clinic', 'with', 'a', 'chief', 'complaint', 'of', 'fatigue', 'and', 'weakness', 'for', 'one', 'week', 'she', 'had', 'had', 'worsening', 'right', 'groin', 'and', 'hip', 'pain', 'status', 'post', 'a', 'total', 'hip', 'replacement', 'approximately', 'year', 'ago', 'which', 'had', 'been', 'worsening', 'for', 'two', 'week', 'and', 'she', 'ha', 'also', 'recently', 

In [56]:
def get_splits(df):
    X_train, X_test, y_train, y_test = train_test_split(df['text_final'], df['judgment'], test_size=0.20, shuffle=True)
    X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(df['text_final'], df['judgment'], test_size=0.20, shuffle=True)

In [57]:
Encoder = LabelEncoder()

Train_Y  = Encoder.fit_transform(y_train)
Test_Y  = Encoder.fit_transform(y_test)

Train_Y_NS  = Encoder.fit_transform(y_train_ns)
Test_Y_NS = Encoder.fit_transform(y_test_ns)

In [58]:
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

Train_X_Tfidf = Tfidf_vect.fit_transform(X_train)
Test_X_Tfidf = Tfidf_vect.fit_transform(X_test)

Train_X_Tfidf_NS = Tfidf_vect_NS.fit_transform(X_train_ns)
Test_X_Tfidf_NS = Tfidf_vect_NS.fit_transform(X_test_ns)

In [None]:
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif

def getVocab(X_train, y_train, feature, max_tokens):
 
    ## Step 1: Determine the Initial Vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab = list(tokenizer.word_index.keys())

    ## Step 2: Create term  matrix
    vectors = tokenizer.texts_to_matrix(X_train, mode='count')

    ## Do feature selection on term matrix (column headers are words)
    X = vectors
    y = y_train

    ##Choose algorithm
    if feature == 'SelectKBest':
        selector = SelectKBest(score_func=f_classif, k=max_tokens).fit(X,y)
    else: 
        if feature == 'InfoGainAttributeVal':
            #This should be similar to the InfoGain?
            selector = SelectKBest(score_func=mutual_info_classif, k=max_tokens).fit(X,y)
        else:
            #default to ExtraTreeClassifier
            estimator = ExtraTreeClassifier(random_state = seed)
            #selector = SelectFromModel(estimator, max_features = tokens,threshold=-np.inf)
            selector = SelectFromModel(estimator, max_features = max_tokens)
            selector = selector.fit(X, y)

    support_idx = selector.get_support(True)
    
    #print("Vocab:", [vocab[i-1].replace("'","") for i in support_idx])
    tokenizer2 = Tokenizer()
    tokenizer2.fit_on_texts([vocab[i-1].replace("'","") for i in support_idx])
    new_vocab = list(tokenizer2.word_index.keys())

    return new_vocab

In [76]:
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

for _,disease in enumerate(disease_list):
    disease_data_df = all_df[all_df['disease'] == disease]
    disease_data_ns_df = all_df_ns[all_df_ns['disease'] == disease]
    
    X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
    X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(disease_data_ns_df['text_final'], disease_data_ns_df['judgment'], test_size=0.20, shuffle=True)
    
    Train_Y  = Encoder.fit_transform(y_train)
    Test_Y  = Encoder.fit_transform(y_test)

    Train_Y_NS  = Encoder.fit_transform(y_train_ns)
    Test_Y_NS = Encoder.fit_transform(y_test_ns)
    
    Train_X_Tfidf = Tfidf_vect.fit_transform(X_train)
    Test_X_Tfidf = Tfidf_vect.fit_transform(X_test)

    Train_X_Tfidf_NS = Tfidf_vect_NS.fit_transform(X_train_ns)
    Test_X_Tfidf_NS = Tfidf_vect_NS.fit_transform(X_test_ns)
    
    
    #SVM
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf, y_train)
    
    predictions_SVM = SVM.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_SVM)
    f1_macro = f1_score(y_test, predictions_SVM,average='macro')
    f1_micro = f1_score(y_test, predictions_SVM,average='micro')

    #print("SVM - ", disease, ": f1-score", f1)
    print("SVM - ", disease, ": f1-macro", f1_macro)
    print("SVM - ", disease, ": f1-micro", f1_micro)
    
    
    
    #KNN
    # fit the training dataset on the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=7)
    clf = knn.fit(Train_X_Tfidf, y_train)

    # predict the labels on validation dataset
    predictions_KNN = clf.predict(Test_X_Tfidf)

    #auroc = roc_auc_score(truth, pred[:,1])
    #f1 = f1_score(y_test, predictions_KNN)
    f1_macro = f1_score(y_test, predictions_KNN,average='macro')
    f1_micro = f1_score(y_test, predictions_KNN,average='micro')

    #print("KNN - ", disease, ": f1-score", f1)
    print("KNN - ", disease, ": f1-macro", f1_macro)
    print("KNN - ", disease, ": f1-micro", f1_micro)
    
    
    
    
    #Naive Bayes
    # fit the training dataset on the NB classifier
    Naive = naive_bayes.MultinomialNB()
    Naive.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_NB = Naive.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_NB)
    f1_macro = f1_score(y_test, predictions_NB,average='macro')
    f1_micro = f1_score(y_test, predictions_NB,average='micro')

    #print("NB - ", disease, ": f1-score", f1)
    print("NB - ", disease, ": f1-macro", f1_macro)
    print("NB - ", disease, ": f1-micro", f1_micro)
    
    
    
    
    #RF
    # fit the training dataset on the RF classifier
    classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
    classifier.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_RF = classifier.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_RF)
    f1_macro = f1_score(y_test, predictions_RF,average='macro')
    f1_micro = f1_score(y_test, predictions_RF,average='micro')

    #print("RF - ", disease, ": f1-score", f1)
    print("RF - ", disease, ": f1-macro", f1_macro)
    print("RF - ", disease, ": f1-micro", f1_micro)

SVM -  Asthma : f1-score 0.0
SVM -  Asthma : f1-macro 0.4752475247524752
SVM -  Asthma : f1-micro 0.9056603773584906
KNN -  Asthma : f1-score 0.0
KNN -  Asthma : f1-macro 0.47000000000000003
KNN -  Asthma : f1-micro 0.8867924528301887
NB -  Asthma : f1-score 0.0
NB -  Asthma : f1-macro 0.4752475247524752
NB -  Asthma : f1-micro 0.9056603773584906
Random Forest Accuracy Score ->  90.56603773584906
RF -  Asthma : f1-score 0.0
RF -  Asthma : f1-macro 0.4752475247524752
RF -  Asthma : f1-micro 0.9056603773584906
SVM -  CHF : f1-score 0.6298342541436464
SVM -  CHF : f1-macro 0.5075776775305387
SVM -  CHF : f1-micro 0.5379310344827586
KNN -  CHF : f1-score 0.7522935779816514
KNN -  CHF : f1-macro 0.5011467889908257
KNN -  CHF : f1-micro 0.6275862068965518
NB -  CHF : f1-score 0.771186440677966
NB -  CHF : f1-macro 0.385593220338983
NB -  CHF : f1-micro 0.6275862068965518
Random Forest Accuracy Score ->  64.13793103448275
RF -  CHF : f1-score 0.7739130434782608
RF -  CHF : f1-macro 0.45362318

Random Forest Accuracy Score ->  92.14659685863874
RF -  Venous Insufficiency : f1-score 0.0
RF -  Venous Insufficiency : f1-macro 0.47956403269754766
RF -  Venous Insufficiency : f1-micro 0.9214659685863874
SVM -  GERD : f1-score 0.0
SVM -  GERD : f1-macro 0.42901234567901236
SVM -  GERD : f1-micro 0.7513513513513513
KNN -  GERD : f1-score 0.0
KNN -  GERD : f1-macro 0.4254658385093168
KNN -  GERD : f1-micro 0.7405405405405405
NB -  GERD : f1-score 0.0
NB -  GERD : f1-macro 0.42901234567901236
NB -  GERD : f1-micro 0.7513513513513513
Random Forest Accuracy Score ->  75.13513513513513
RF -  GERD : f1-score 0.0
RF -  GERD : f1-macro 0.42901234567901236
RF -  GERD : f1-micro 0.7513513513513513


# Support Vector Machine (SVM)

https://link.springer.com/article/10.1007/BF00994018

In [82]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import ExtraTreesClassifier

#estimator = ExtraTreesClassifier()
#selector = RFECV(estimator, step=1, cv=5)
#selector = selector.fit(Train_X_Tfidf, y_train)

#print(selector)

# fit the training dataset on the SVM classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy: ",accuracy_score(predictions_SVM, y_test)*100)

f1 = f1_score(y_test, predictions_SVM)
f1_macro = f1_score(y_test, predictions_SVM,average='macro')
f1_micro = f1_score(y_test, predictions_SVM,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

KeyboardInterrupt: 

In [None]:
# fit the training dataset on the SVM classifier
SVM_NS = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM_NS.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_SVM_NS = SVM.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("SVM NS Accuracy: ",accuracy_score(predictions_SVM_NS, y_test_ns)*100)

# k-Nearest Neighbours (kNN)

https://link.springer.com/article/10.1007/BF00153759

In [61]:
# fit the training dataset on the KNN classifier
knn = KNeighborsClassifier(n_neighbors=7)
clf = knn.fit(Train_X_Tfidf, y_train)

# predict the labels on validation dataset
predictions_KNN = clf.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("kNN Accuracy: ",accuracy_score(predictions_KNN, y_test)*100)

#print(predictions_KNN)

#auroc = roc_auc_score(truth, pred[:,1])
f1 = f1_score(y_test, predictions_KNN)
f1_macro = f1_score(y_test, predictions_KNN,average='macro')
f1_micro = f1_score(y_test, predictions_KNN,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

kNN Accuracy:  64.82758620689654
0.7733333333333333
0.49435897435897436
0.6482758620689655


In [None]:
# fit the training dataset on the KNN classifier
knn_ns = KNeighborsClassifier(n_neighbors=7)
clf_ns = knn_ns.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_KNN_NS = clf_ns.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("kNN NS Accuracy: ",accuracy_score(predictions_KNN_NS, y_test_ns)*100)

# Naive Bayes

https://arxiv.org/abs/1302.4964

In [62]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

f1 = f1_score(y_test, predictions_NB)
f1_macro = f1_score(y_test, predictions_NB,average='macro')
f1_micro = f1_score(y_test, predictions_NB,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

Naive Bayes Accuracy Score ->  68.96551724137932
0.8034934497816593
0.5328942658744362
0.6896551724137931


In [None]:
# fit the training dataset on the NB classifier
Naive_NS = naive_bayes.MultinomialNB()
Naive_NS.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_NB_NS = Naive.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("Naive Bayes NS Accuracy Score -> ",accuracy_score(predictions_NB_NS, y_test_ns)*100)

# Random Forest

https://link.springer.com/article/10.1023/A:1010933404324

In [63]:
# fit the training dataset on the RF classifier
classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
classifier.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_RF = classifier.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Random Forest Accuracy Score -> ",accuracy_score(predictions_RF, y_test)*100)

f1 = f1_score(y_test, predictions_RF)
f1_macro = f1_score(y_test, predictions_RF,average='macro')
f1_micro = f1_score(y_test, predictions_RF,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

Random Forest Accuracy Score ->  73.79310344827587
0.8362068965517241
0.5905172413793103
0.7379310344827587


In [None]:
# fit the training dataset on the RF classifier
classifier_ns = RandomForestClassifier(n_estimators = 400, criterion = "entropy", random_state = 0)
classifier_ns.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_RF_NS = classifier_ns.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("Random Forest NS Accuracy Score -> ",accuracy_score(predictions_RF_NS, y_test_ns)*100)

# Random Tree

https://onlinelibrary.wiley.com/doi/10.1002/rsa.3240050207