This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
DATA_PATH = './obesity_data/'

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [48]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext
from nltk.corpus import stopwords

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

cachedStopWords = stopwords.words("english")

all_docs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')
all_docs_df_ns = pd.read_pickle(DATA_PATH + '/alldocs_df_ns.pkl')
all_annot_df = pd.read_pickle(DATA_PATH + '/alannot_df.pkl')

In [52]:
all_df = pd.merge(all_docs_df,all_annot_df, on='id')
all_df_ns = pd.merge(all_docs_df_ns,all_annot_df, on='id')

disease_list = all_df['disease'].unique().tolist()

print(disease_list)

['Asthma', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'Gout', 'Hypercholesterolemia', 'Hypertriglyceridemia', 'OA', 'OSA', 'Obesity', 'CAD', 'Hypertension', 'PVD', 'Venous Insufficiency', 'GERD']


In [53]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)

In [54]:
for index, entry in enumerate(all_df_ns['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df_ns.loc[index, 'text_final'] = str(Final_words)

In [66]:
all_df = all_df[all_df['disease'] == 'CHF']
#print(all_df)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(all_df['text_final'], all_df['judgment'], test_size=0.20, shuffle=True)
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(all_df_ns['text_final'], all_df_ns['judgment'], test_size=0.20, shuffle=True)

In [57]:
Encoder = LabelEncoder()

Train_Y  = Encoder.fit_transform(y_train)
Test_Y  = Encoder.fit_transform(y_test)

Train_Y_NS  = Encoder.fit_transform(y_train_ns)
Test_Y_NS = Encoder.fit_transform(y_test_ns)

In [58]:
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

Train_X_Tfidf = Tfidf_vect.fit_transform(X_train)
Test_X_Tfidf = Tfidf_vect.fit_transform(X_test)

Train_X_Tfidf_NS = Tfidf_vect_NS.fit_transform(X_train_ns)
Test_X_Tfidf_NS = Tfidf_vect_NS.fit_transform(X_test_ns)

# Support Vector Machine (SVM)

https://link.springer.com/article/10.1007/BF00994018

In [60]:
# fit the training dataset on the SVM classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy: ",accuracy_score(predictions_SVM, y_test)*100)

f1 = f1_score(y_test, predictions_SVM)
f1_macro = f1_score(y_test, predictions_SVM,average='macro')
f1_micro = f1_score(y_test, predictions_SVM,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

SVM Accuracy:  72.41379310344827
0.8058252427184466
0.6648173832639852
0.7241379310344829


In [None]:
# fit the training dataset on the SVM classifier
SVM_NS = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM_NS.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_SVM_NS = SVM.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("SVM NS Accuracy: ",accuracy_score(predictions_SVM_NS, y_test_ns)*100)

# k-Nearest Neighbours (kNN)

https://link.springer.com/article/10.1007/BF00153759

In [61]:
# fit the training dataset on the KNN classifier
knn = KNeighborsClassifier(n_neighbors=7)
clf = knn.fit(Train_X_Tfidf, y_train)

# predict the labels on validation dataset
predictions_KNN = clf.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("kNN Accuracy: ",accuracy_score(predictions_KNN, y_test)*100)

#print(predictions_KNN)

#auroc = roc_auc_score(truth, pred[:,1])
f1 = f1_score(y_test, predictions_KNN)
f1_macro = f1_score(y_test, predictions_KNN,average='macro')
f1_micro = f1_score(y_test, predictions_KNN,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

kNN Accuracy:  64.82758620689654
0.7733333333333333
0.49435897435897436
0.6482758620689655


In [None]:
# fit the training dataset on the KNN classifier
knn_ns = KNeighborsClassifier(n_neighbors=7)
clf_ns = knn_ns.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_KNN_NS = clf_ns.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("kNN NS Accuracy: ",accuracy_score(predictions_KNN_NS, y_test_ns)*100)

# Naive Bayes

https://arxiv.org/abs/1302.4964

In [62]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

f1 = f1_score(y_test, predictions_NB)
f1_macro = f1_score(y_test, predictions_NB,average='macro')
f1_micro = f1_score(y_test, predictions_NB,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

Naive Bayes Accuracy Score ->  68.96551724137932
0.8034934497816593
0.5328942658744362
0.6896551724137931


In [None]:
# fit the training dataset on the NB classifier
Naive_NS = naive_bayes.MultinomialNB()
Naive_NS.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_NB_NS = Naive.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("Naive Bayes NS Accuracy Score -> ",accuracy_score(predictions_NB_NS, y_test_ns)*100)

# Random Forest

https://link.springer.com/article/10.1023/A:1010933404324

In [63]:
# fit the training dataset on the RF classifier
classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
classifier.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_RF = classifier.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Random Forest Accuracy Score -> ",accuracy_score(predictions_RF, y_test)*100)

f1 = f1_score(y_test, predictions_RF)
f1_macro = f1_score(y_test, predictions_RF,average='macro')
f1_micro = f1_score(y_test, predictions_RF,average='micro')

print(f1)
print(f1_macro)
print(f1_micro)

Random Forest Accuracy Score ->  73.79310344827587
0.8362068965517241
0.5905172413793103
0.7379310344827587


In [None]:
# fit the training dataset on the RF classifier
classifier_ns = RandomForestClassifier(n_estimators = 400, criterion = "entropy", random_state = 0)
classifier_ns.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_RF_NS = classifier_ns.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("Random Forest NS Accuracy Score -> ",accuracy_score(predictions_RF_NS, y_test_ns)*100)

# Random Tree

https://onlinelibrary.wiley.com/doi/10.1002/rsa.3240050207