This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
DATA_PATH = './obesity_data/'

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [32]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

test_df = pd.read_pickle(DATA_PATH + '/test.pkl') 
train_df = pd.read_pickle(DATA_PATH + '/train.pkl') 
#corpus = pd.read_pickle(DATA_PATH + '/corpus.pkl')
disease_list = test_df['disease'].unique().tolist()

vectorizer = TfidfVectorizer(max_features = 600) #stop_words = cachedStopWords, max_features = 600

In [33]:
train_df

Unnamed: 0,disease,id,judgment,index,text,no_punc_text,no_numerics_text,lower_text,tokenized_text,tok_lem_text,word_count,one_hot,vector_tokenized
0,Asthma,1,False,,490646815 | WMC | 31530471 | | 9629480 | 11/23...,490646815 WMC 31530471 9629480 11232006 1...,WMC AM ANEMIA Signed DIS Admissi...,dr margarito nolting on at am code status the ...,"[wmc, am, anemia, signed, dis, admission, date...","[wmc, am, anemia, signed, dis, admission, date...",44,"[50, 8437, 24327, 6, 17, 73, 159, 36, 1, 13, 2...","[dr, margarito, nolting, on, at, am, code, sta..."
1,Asthma,1,False,,490646815 | WMC | 31530471 | | 9629480 | 11/23...,490646815 WMC 31530471 9629480 11232006 1...,WMC AM ANEMIA Signed DIS Admissi...,wmc am anemia signed dis admission date report...,"[wmc, am, anemia, signed, dis, admission, date...","[wmc, am, anemia, signed, dis, admission, date...",1429,"[5794, 73, 387, 123, 138, 26, 53, 108, 36, 123...","[wmc, am, anemia, signed, dis, admission, date..."
2,CHF,1,True,,490646815 | WMC | 31530471 | | 9629480 | 11/23...,490646815 WMC 31530471 9629480 11232006 1...,WMC AM ANEMIA Signed DIS Admissi...,dr margarito nolting on at am code status the ...,"[wmc, am, anemia, signed, dis, admission, date...","[wmc, am, anemia, signed, dis, admission, date...",44,"[50, 8437, 24327, 6, 17, 73, 159, 36, 1, 13, 2...","[dr, margarito, nolting, on, at, am, code, sta..."
3,CHF,1,True,,490646815 | WMC | 31530471 | | 9629480 | 11/23...,490646815 WMC 31530471 9629480 11232006 1...,WMC AM ANEMIA Signed DIS Admissi...,wmc am anemia signed dis admission date report...,"[wmc, am, anemia, signed, dis, admission, date...","[wmc, am, anemia, signed, dis, admission, date...",1429,"[5794, 73, 387, 123, 138, 26, 53, 108, 36, 123...","[wmc, am, anemia, signed, dis, admission, date..."
4,Depression,1,False,,490646815 | WMC | 31530471 | | 9629480 | 11/23...,490646815 WMC 31530471 9629480 11232006 1...,WMC AM ANEMIA Signed DIS Admissi...,dr margarito nolting on at am code status the ...,"[wmc, am, anemia, signed, dis, admission, date...","[wmc, am, anemia, signed, dis, admission, date...",44,"[50, 8437, 24327, 6, 17, 73, 159, 36, 1, 13, 2...","[dr, margarito, nolting, on, at, am, code, sta..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12636,CAD,175,True,0.0,293423942 | ZH | 49231835 | | 153750 | 11/10/1...,293423942 ZH 49231835 153750 11101990 120...,ZH AM Discharge Summary Unsigned ...,zh am discharge summary unsigned dis admission...,"[zh, am, discharge, summary, unsigned, dis, ad...","[zh, am, discharge, summary, unsigned, dis, ad...",794,"[6985, 73, 18, 126, 793, 138, 26, 53, 108, 36,...","[zh, am, discharge, summary, unsigned, dis, ad..."
12637,Diabetes,175,False,0.0,293423942 | ZH | 49231835 | | 153750 | 11/10/1...,293423942 ZH 49231835 153750 11101990 120...,ZH AM Discharge Summary Unsigned ...,zh am discharge summary unsigned dis admission...,"[zh, am, discharge, summary, unsigned, dis, ad...","[zh, am, discharge, summary, unsigned, dis, ad...",794,"[6985, 73, 18, 126, 793, 138, 26, 53, 108, 36,...","[zh, am, discharge, summary, unsigned, dis, ad..."
12638,Hypertension,175,True,0.0,293423942 | ZH | 49231835 | | 153750 | 11/10/1...,293423942 ZH 49231835 153750 11101990 120...,ZH AM Discharge Summary Unsigned ...,zh am discharge summary unsigned dis admission...,"[zh, am, discharge, summary, unsigned, dis, ad...","[zh, am, discharge, summary, unsigned, dis, ad...",794,"[6985, 73, 18, 126, 793, 138, 26, 53, 108, 36,...","[zh, am, discharge, summary, unsigned, dis, ad..."
12639,OA,175,True,0.0,293423942 | ZH | 49231835 | | 153750 | 11/10/1...,293423942 ZH 49231835 153750 11101990 120...,ZH AM Discharge Summary Unsigned ...,zh am discharge summary unsigned dis admission...,"[zh, am, discharge, summary, unsigned, dis, ad...","[zh, am, discharge, summary, unsigned, dis, ad...",794,"[6985, 73, 18, 126, 793, 138, 26, 53, 108, 36,...","[zh, am, discharge, summary, unsigned, dis, ad..."


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

ValueError: Found input variables with inconsistent numbers of samples: [12641, 11190]

In [35]:
Encoder = LabelEncoder()
Train_Y  = Encoder.fit_transform(y_train)
Test_Y  = Encoder.fit_transform(y_test)

NameError: name 'y_train' is not defined

In [36]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Train_X_Tfidf = vectorizer.fit_transform(Train_X)
Test_X_Tfidf = vectorizer.fit_transform(Test_X)

AttributeError: 'numpy.int32' object has no attribute 'lower'

In [37]:
print(Tfidf_vect.vocabulary)

None


# Support Vector Machine (SVM)

https://link.springer.com/article/10.1007/BF00994018

In [None]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf, Train_Y)

# predict labels
predictions_SVM = SVM.predict(Test_X_Tfidf)

# get the accuracy
print("Accuracy: ",accuracy_score(predictions_SVM, Test_Y)*100)

# k-Nearest Neighbours (kNN)

https://link.springer.com/article/10.1007/BF00153759

# Naive Bayes

https://arxiv.org/abs/1302.4964

In [None]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

# Random Forest

https://link.springer.com/article/10.1023/A:1010933404324

# Random Tree

https://onlinelibrary.wiley.com/doi/10.1002/rsa.3240050207

# J-48

http://server3.eca.ir/isi/forum/Programs%20for%20Machine%20Learning.pdf

# J-Rip

https://www.sciencedirect.com/science/article/pii/B9781558603776500232?via%3Dihub