This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
DATA_PATH = './obesity_data/'

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

all_docs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')
all_docs_df_ns = pd.read_pickle(DATA_PATH + '/alldocs_df_ns.pkl')
all_annot_df = pd.read_pickle(DATA_PATH + '/alannot_df.pkl')

#corpus = pd.read_pickle(DATA_PATH + '/corpus.pkl')
#disease_list = test_df['disease'].unique().tolist()

vectorizer = TfidfVectorizer(max_features = 600) #stop_words = cachedStopWords, max_features = 600

In [3]:
all_df = pd.merge(all_docs_df,all_annot_df, on='id')
all_df_ns = pd.merge(all_docs_df_ns,all_annot_df, on='id')

In [4]:
y = all_df['judgment']

In [5]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    #print(entry)
    for word in entry:
        #print(word)
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(all_df['text_final'], y, test_size=0.20, shuffle=True)
X_train_ns, X_test_ns, y_train_ns, y_test_ns = train_test_split(all_df_ns, y, test_size=0.20, shuffle=True)

In [7]:
#y_train = all_df['judgement']

In [8]:
Encoder = LabelEncoder()
Train_Y  = Encoder.fit_transform(y_train)
Test_Y  = Encoder.fit_transform(y_test)

In [9]:
Tfidf_vect = TfidfVectorizer(max_features=600)
Train_X_Tfidf = vectorizer.fit_transform(X_train)
Test_X_Tfidf = vectorizer.fit_transform(X_test)

In [10]:
print(Train_X_Tfidf)
print(Tfidf_vect.vocabulary)

  (0, 177)	0.017982170312648454
  (0, 178)	0.019379691570302778
  (0, 136)	0.009830044548151642
  (0, 335)	0.02532068929174102
  (0, 203)	0.042016621001826335
  (0, 349)	0.04324854120830858
  (0, 155)	0.01947807196077127
  (0, 493)	0.011946805058742259
  (0, 105)	0.013076761678755974
  (0, 446)	0.02467512738221568
  (0, 92)	0.0257402280171135
  (0, 352)	0.022082116584457964
  (0, 518)	0.01854164200257686
  (0, 324)	0.02165195169639071
  (0, 597)	0.04714518685123831
  (0, 598)	0.09114501658244523
  (0, 103)	0.019131164972346328
  (0, 10)	0.017382679596036857
  (0, 78)	0.013756422238773663
  (0, 459)	0.021086487578244467
  (0, 302)	0.02817910869832115
  (0, 373)	0.020907371884007167
  (0, 27)	0.011687687142757341
  (0, 71)	0.013563822611536436
  (0, 383)	0.01684398576759141
  :	:
  (13059, 595)	0.020663899229518248
  (13059, 505)	0.029846715265981812
  (13059, 520)	0.15614131350986896
  (13059, 558)	0.01931381579980336
  (13059, 56)	0.02296622249350095
  (13059, 127)	0.04029863578059186


# Support Vector Machine (SVM)

https://link.springer.com/article/10.1007/BF00994018

In [11]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, y_train)

# predict labels
predictions_SVM = SVM.predict(Test_X_Tfidf)

# get the accuracy
print("Accuracy: ",accuracy_score(predictions_SVM, y_test)*100)

Accuracy:  68.48392036753445


# k-Nearest Neighbours (kNN)

https://link.springer.com/article/10.1007/BF00153759

# Naive Bayes

https://arxiv.org/abs/1302.4964

In [12]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,y_train)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

Naive Bayes Accuracy Score ->  68.48392036753445


# Random Forest

https://link.springer.com/article/10.1023/A:1010933404324

# Random Tree

https://onlinelibrary.wiley.com/doi/10.1002/rsa.3240050207