This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
DATA_PATH = './obesity_data/'

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [23]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

all_docs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')
all_docs_df_ns = pd.read_pickle(DATA_PATH + '/alldocs_df_ns.pkl')
all_annot_df = pd.read_pickle(DATA_PATH + '/alannot_df.pkl')

#corpus = pd.read_pickle(DATA_PATH + '/corpus.pkl')
#disease_list = test_df['disease'].unique().tolist()

vectorizer = TfidfVectorizer(max_features = 600) #stop_words = cachedStopWords, max_features = 600

In [24]:
all_df = pd.merge(all_docs_df,all_annot_df, on='id')
all_df_ns = pd.merge(all_docs_df_ns,all_annot_df, on='id')

In [26]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    #print(entry)
    for word in entry:
        #print(word)
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)

In [27]:
for index, entry in enumerate(all_df_ns['tok_lem_text']):
    Final_words = []
    #print(entry)
    for word in entry:
        #print(word)
        Final_words.append(word)
    all_df_ns.loc[index, 'text_final'] = str(Final_words)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(all_df['text_final'], all_df['judgment'], test_size=0.20, shuffle=True)
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(all_df_ns['text_final'], all_df_ns['judgment'], test_size=0.20, shuffle=True)

In [29]:
#y_train = all_df['judgement']

In [30]:
Encoder = LabelEncoder()

Train_Y  = Encoder.fit_transform(y_train)
Test_Y  = Encoder.fit_transform(y_test)

Train_Y_NS  = Encoder.fit_transform(y_train_ns)
Test_Y_NS = Encoder.fit_transform(y_test_ns)

In [31]:
Tfidf_vect = TfidfVectorizer(max_features=600)

Train_X_Tfidf = vectorizer.fit_transform(X_train)
Test_X_Tfidf = vectorizer.fit_transform(X_test)

Train_X_Tfidf_NS = vectorizer.fit_transform(X_train_ns)
Test_X_Tfidf_NS = vectorizer.fit_transform(X_test_ns)

# Support Vector Machine (SVM)

https://link.springer.com/article/10.1007/BF00994018

In [32]:
# fit the training dataset on the SVM classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy: ",accuracy_score(predictions_SVM, y_test)*100)

Accuracy:  68.48392036753445


In [33]:
# fit the training dataset on the SVM classifier
SVM_NS = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM_NS.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_SVM_NS = SVM.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("SVM NS Accuracy: ",accuracy_score(predictions_SVM_NS, y_test_ns)*100)

Accuracy:  67.41194486983154


# k-Nearest Neighbours (kNN)

https://link.springer.com/article/10.1007/BF00153759

In [34]:
# fit the training dataset on the KNN classifier
knn = KNeighborsClassifier(n_neighbors=7)
clf = knn.fit(Train_X_Tfidf, y_train)

# predict the labels on validation dataset
predictions_KNN = clf.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("kNN Accuracy: ",accuracy_score(predictions_KNN, y_test)*100)

Accuracy:  61.194486983154675


In [37]:
# fit the training dataset on the KNN classifier
knn_ns = KNeighborsClassifier(n_neighbors=7)
clf_ns = knn_ns.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_KNN_NS = clf_ns.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("kNN NS Accuracy: ",accuracy_score(predictions_KNN_NS, y_test_ns)*100)

Accuracy:  62.17457886676876


# Naive Bayes

https://arxiv.org/abs/1302.4964

In [35]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

Naive Bayes Accuracy Score ->  68.48392036753445


In [38]:
# fit the training dataset on the NB classifier
Naive_NS = naive_bayes.MultinomialNB()
Naive_NS.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_NB_NS = Naive.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("Naive Bayes NS Accuracy Score -> ",accuracy_score(predictions_NB_NS, y_test_ns)*100)

Naive Bayes Accuracy Score ->  67.41194486983154


# Random Forest

https://link.springer.com/article/10.1023/A:1010933404324

In [36]:
# fit the training dataset on the RF classifier
classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
classifier.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_RF = classifier.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Random Forest Accuracy Score -> ",accuracy_score(predictions_RF, y_test)*100)

Naive Bayes Accuracy Score ->  68.48392036753445


In [39]:
# fit the training dataset on the RF classifier
classifier_ns = RandomForestClassifier(n_estimators = 400, criterion = "entropy", random_state = 0)
classifier_ns.fit(Train_X_Tfidf_NS, y_train_ns)

# predict the labels on validation dataset
predictions_RF_NS = classifier_ns.predict(Test_X_Tfidf_NS)

# Use accuracy_score function to get the accuracy
print("Random Forest NS Accuracy Score -> ",accuracy_score(predictions_RF_NS, y_test_ns)*100)

Naive Bayes Accuracy Score ->  67.41194486983154


# Random Tree

https://onlinelibrary.wiley.com/doi/10.1002/rsa.3240050207