This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
DATA_PATH = './obesity_data/'

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext
from nltk.corpus import stopwords
import tensorflow_hub as hub

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

cachedStopWords = stopwords.words("english")

#Download info for USE
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

all_docs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')
all_docs_df_ns = pd.read_pickle(DATA_PATH + '/alldocs_df_ns.pkl')
all_annot_df = pd.read_pickle(DATA_PATH + '/allannot_df.pkl')
all_df_expanded = pd.read_pickle(DATA_PATH + '/all_df_expanded.pkl')



In [3]:
all_df = pd.merge(all_docs_df,all_annot_df, on='id')
all_df_ns = pd.merge(all_docs_df_ns,all_annot_df, on='id')

disease_list = all_df['disease'].unique().tolist()
feature_list = ['All','ExtraTreeClassifier','SelectKBest','InfoGainAttributeVal']
embedding_list = ['GloVe', 'FastText', 'USE']
device = torch.device('cpu')

In [4]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)

In [5]:
for index, entry in enumerate(all_df_ns['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df_ns.loc[index, 'text_final'] = str(Final_words)

In [6]:
def performSVM(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf, y_train)

    predictions_SVM = SVM.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_SVM)
    f1_macro = f1_score(y_test, predictions_SVM,average='macro')
    f1_micro = f1_score(y_test, predictions_SVM,average='micro')

    #print("SVM - ", disease, ": f1-score", f1)
    print("SVM - ", feature, disease, ": f1-macro", f1_macro)
    print("SVM - ", feature, disease, ": f1-micro", f1_micro)
    
    return f1_macro, f1_micro

In [7]:
def performKNN(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    knn1 = KNeighborsClassifier(n_neighbors=1)
    knn5 = KNeighborsClassifier(n_neighbors=5)
    clf1 = knn1.fit(Train_X_Tfidf, y_train)
    clf5 = knn5.fit(Train_X_Tfidf, y_train)

    # predict the labels on validation dataset
    predictions_KNN1 = clf1.predict(Test_X_Tfidf)
    predictions_KNN5 = clf5.predict(Test_X_Tfidf)

    #auroc = roc_auc_score(truth, pred[:,1])
    #f1 = f1_score(y_test, predictions_KNN)
    f1_macro1 = f1_score(y_test, predictions_KNN1,average='macro')
    f1_macro5 = f1_score(y_test, predictions_KNN5,average='macro')
    f1_micro1 = f1_score(y_test, predictions_KNN1,average='micro')
    f1_micro5 = f1_score(y_test, predictions_KNN5,average='micro')

    #print("KNN - ", disease, ": f1-score", f1)
    print("KNN 1 k - ", feature, disease, ": f1-macro", f1_macro1)
    print("KNN 1 k - ", feature, disease, ": f1-micro", f1_micro1)
    print("KNN 5 k - ", feature, disease, ": f1-macro", f1_macro5)
    print("KNN 5 k - ", feature, disease, ": f1-micro", f1_micro5)
    
    return f1_macro1, f1_micro1, f1_macro5, f1_micro5

In [8]:
def performNB(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    scaler = MinMaxScaler()
    scaler.fit(Train_X_Tfidf)
    
    Naive = naive_bayes.MultinomialNB()
    Naive.fit(scaler.transform(Train_X_Tfidf),y_train)

    # predict the labels on validation dataset
    predictions_NB = Naive.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_NB)
    f1_macro = f1_score(y_test, predictions_NB,average='macro')
    f1_micro = f1_score(y_test, predictions_NB,average='micro')

    #print("NB - ", disease, ": f1-score", f1)
    print("NB - ", feature, disease, ": f1-macro", f1_macro)
    print("NB - ", feature, disease, ": f1-micro", f1_micro)
    
    return f1_macro, f1_micro

In [9]:
def performRF(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
    classifier.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_RF = classifier.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_RF)
    f1_macro = f1_score(y_test, predictions_RF,average='macro')
    f1_micro = f1_score(y_test, predictions_RF,average='micro')

    #print("RF - ", disease, ": f1-score", f1)
    print("RF - ", feature, disease, ": f1-macro", f1_macro)
    print("RF - ", feature, disease, ": f1-micro", f1_micro)
    
    return f1_macro, f1_micro

In [10]:
def vectorize_batch_GloVe(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.GloVe(name='6B', dim=embedding_size_used)
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()
        
        X[i,:] = vectors.flatten()
        
    return X

In [11]:
def vectorize_batch_FastText(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.FastText()
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()
        
        X[i,:] = vectors.flatten()
        
    return X

In [12]:
def vectorize_batch_USE(X_Train):
    embedding_size_used = 512
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        tensor_flow_vectors = embed(X_Train.iloc[i])
        array_vectors = tensor_flow_vectors.numpy()
        
        X[i,:] = array_vectors.flatten()
        
    return X

In [13]:
def generateXTrainAndTest(X_train, X_test, Tfidf_vect):
    X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
    X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
    X_training = np.asarray(X_training, dtype=float)
    X_training = torch.from_numpy(X_training).to(device)

    X_test_values_list = Tfidf_vect.transform(X_test).toarray()
    X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
    X_testing = np.asarray(X_testing, dtype=float)
    X_testing = torch.from_numpy(X_testing).to(device)  
    
    return X_training, X_testing

In [14]:
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif
from statistics import mean
from sklearn.preprocessing import MinMaxScaler

def getVocab(X_train, y_train, feature, max_tokens):
 
    ## Step 1: Determine the Initial Vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab = list(tokenizer.word_index.keys())

    ## Step 2: Create term  matrix
    vectors = tokenizer.texts_to_matrix(X_train, mode='count')

    ## Do feature selection on term matrix (column headers are words)
    X = vectors
    y = y_train

    ##Choose algorithm
    if feature == 'SelectKBest':
        selector = SelectKBest(score_func=f_classif, k=max_tokens).fit(X,y)
    else: 
        if feature == 'InfoGainAttributeVal':
            #This should be similar to the InfoGain?
            selector = SelectKBest(score_func=mutual_info_classif, k=max_tokens).fit(X,y)
        else:
            #default to ExtraTreeClassifier
            estimator = ExtraTreeClassifier(random_state = seed)
            selector = SelectFromModel(estimator, max_features = max_tokens)
            selector = selector.fit(X, y)

    support_idx = selector.get_support(True)
    tokenizer2 = Tokenizer()
    tokenizer2.fit_on_texts([vocab[i-1].replace("'","") for i in support_idx])
    new_vocab = list(tokenizer2.word_index.keys())

    return new_vocab

# Feature Selection

In [15]:
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

svm_f1_micro_scores = []
svm_f1_macro_scores = []
knn1_f1_micro_scores = []
knn1_f1_macro_scores = []
knn5_f1_micro_scores = []
knn5_f1_macro_scores = []
nb_f1_micro_scores = []
nb_f1_macro_scores = []
rf_f1_micro_scores = []
rf_f1_macro_scores = []

max_tokens = 600

for _,feature in enumerate(feature_list):
    for _,disease in enumerate(disease_list):
        disease_data_df = all_df[all_df['disease'] == disease]

        X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
        
        if feature != 'All':
            vocab = getVocab(X_train,y_train, feature, max_tokens)
            Tfidf_vect = TfidfVectorizer(max_features=max_tokens,vocabulary = vocab)
        else:
            Tfidf_vect = TfidfVectorizer(max_features=max_tokens)
  
        
        X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
        X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
        X_training = np.asarray(X_training, dtype=float)
        X_training = torch.from_numpy(X_training).to(device)

        X_test_values_list = Tfidf_vect.transform(X_test).toarray()
        X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
        X_testing = np.asarray(X_testing, dtype=float)
        X_testing = torch.from_numpy(X_testing).to(device)
        
        #tokens_to_use = X_training.shape[1]

        Train_Y  = Encoder.fit_transform(y_train)
        Test_Y  = Encoder.fit_transform(y_test)

        svm_f1_macro, svm_f1_micro = performSVM(X_training, X_testing, Train_Y, Test_Y, feature)
        svm_f1_macro_scores.append(svm_f1_macro)
        svm_f1_micro_scores.append(svm_f1_micro)

        knn_f1_macro1, knn_f1_micro1, knn_f1_macro5, knn_f1_micro5 = performKNN(X_training, X_testing, Train_Y, Test_Y, feature)
        knn1_f1_macro_scores.append(knn_f1_macro1)
        knn1_f1_micro_scores.append(knn_f1_micro1)
        knn5_f1_macro_scores.append(knn_f1_macro5)
        knn5_f1_micro_scores.append(knn_f1_micro5)

        nb_f1_macro, nb_f1_micro = performNB(X_training, X_testing, Train_Y, Test_Y, feature)
        nb_f1_macro_scores.append(nb_f1_macro)
        nb_f1_micro_scores.append(nb_f1_micro)

        rf_f1_macro, rf_f1_micro = performRF(X_training, X_testing, Train_Y, Test_Y, feature)
        rf_f1_macro_scores.append(rf_f1_macro)
        rf_f1_micro_scores.append(rf_f1_micro)
        
    print("Average SVM - ", feature, ": f1-micro", mean(svm_f1_micro_scores))
    print("Average SVM - ", feature, ": f1-macro", mean(svm_f1_macro_scores))
    
    print("Average KNN 1 - ", feature, ": f1-micro", mean(knn1_f1_micro_scores))
    print("Average KNN 1 - ", feature, ": f1-macro", mean(knn1_f1_macro_scores))
    print("Average KNN 5 - ", feature, ": f1-micro", mean(knn5_f1_micro_scores))
    print("Average KNN 5 - ", feature, ": f1-macro", mean(knn5_f1_macro_scores))
    
    print("Average NB - ", feature, ": f1-micro", mean(nb_f1_micro_scores))
    print("Average NB - ", feature, ": f1-macro", mean(nb_f1_macro_scores))
    
    print("Average RF - ", feature, ": f1-micro", mean(rf_f1_micro_scores))
    print("Average RF - ", feature, ": f1-macro", mean(rf_f1_macro_scores))
        

SVM -  All Asthma : f1-macro 0.609237775127399
SVM -  All Asthma : f1-micro 0.9198113207547169
KNN 1 k -  All Asthma : f1-macro 0.5623043623043623
KNN 1 k -  All Asthma : f1-micro 0.8537735849056604
KNN 5 k -  All Asthma : f1-macro 0.5623043623043623
KNN 5 k -  All Asthma : f1-micro 0.8537735849056604
NB -  All Asthma : f1-macro 0.6466666666666667
NB -  All Asthma : f1-micro 0.9245283018867925
RF -  All Asthma : f1-macro 0.6466666666666667
RF -  All Asthma : f1-micro 0.9245283018867925
SVM -  All CHF : f1-macro 0.7242442127352613
SVM -  All CHF : f1-micro 0.8000000000000002
KNN 1 k -  All CHF : f1-macro 0.6089352720450282
KNN 1 k -  All CHF : f1-micro 0.6827586206896552
KNN 5 k -  All CHF : f1-macro 0.6089352720450282
KNN 5 k -  All CHF : f1-micro 0.6827586206896552
NB -  All CHF : f1-macro 0.6120689655172413
NB -  All CHF : f1-micro 0.7517241379310344
RF -  All CHF : f1-macro 0.8396484488679355
RF -  All CHF : f1-micro 0.8689655172413793
SVM -  All Depression : f1-macro 0.454621105736

SVM -  ExtraTreeClassifier Asthma : f1-macro 0.9238718429557515
SVM -  ExtraTreeClassifier Asthma : f1-micro 0.9575471698113207
KNN 1 k -  ExtraTreeClassifier Asthma : f1-macro 0.6858358430135283
KNN 1 k -  ExtraTreeClassifier Asthma : f1-micro 0.8537735849056604
KNN 5 k -  ExtraTreeClassifier Asthma : f1-macro 0.6858358430135283
KNN 5 k -  ExtraTreeClassifier Asthma : f1-micro 0.8537735849056604
NB -  ExtraTreeClassifier Asthma : f1-macro 0.5470914972837698
NB -  ExtraTreeClassifier Asthma : f1-micro 0.8349056603773584
RF -  ExtraTreeClassifier Asthma : f1-macro 0.9144471347861178
RF -  ExtraTreeClassifier Asthma : f1-micro 0.9528301886792453
SVM -  ExtraTreeClassifier CHF : f1-macro 0.7820017182130584
SVM -  ExtraTreeClassifier CHF : f1-micro 0.8068965517241379
KNN 1 k -  ExtraTreeClassifier CHF : f1-macro 0.6640926640926641
KNN 1 k -  ExtraTreeClassifier CHF : f1-micro 0.6896551724137931
KNN 5 k -  ExtraTreeClassifier CHF : f1-macro 0.6640926640926641
KNN 5 k -  ExtraTreeClassifier 

RF -  ExtraTreeClassifier Hypertension : f1-macro 0.8035882908404155
RF -  ExtraTreeClassifier Hypertension : f1-micro 0.8990384615384617
SVM -  ExtraTreeClassifier PVD : f1-macro 0.5846774193548387
SVM -  ExtraTreeClassifier PVD : f1-micro 0.8543689320388349
KNN 1 k -  ExtraTreeClassifier PVD : f1-macro 0.586677367576244
KNN 1 k -  ExtraTreeClassifier PVD : f1-micro 0.8058252427184465
KNN 5 k -  ExtraTreeClassifier PVD : f1-macro 0.586677367576244
KNN 5 k -  ExtraTreeClassifier PVD : f1-micro 0.8058252427184465
NB -  ExtraTreeClassifier PVD : f1-macro 0.482565011820331
NB -  ExtraTreeClassifier PVD : f1-micro 0.8349514563106796
RF -  ExtraTreeClassifier PVD : f1-macro 0.6494611457742484
RF -  ExtraTreeClassifier PVD : f1-micro 0.8689320388349513
SVM -  ExtraTreeClassifier Venous Insufficiency : f1-macro 0.5176767676767676
SVM -  ExtraTreeClassifier Venous Insufficiency : f1-micro 0.8952879581151832
KNN 1 k -  ExtraTreeClassifier Venous Insufficiency : f1-macro 0.5460242005185825
KNN 1

  f = msb / msw


SVM -  SelectKBest Asthma : f1-macro 0.8343750000000001
SVM -  SelectKBest Asthma : f1-micro 0.9433962264150944
KNN 1 k -  SelectKBest Asthma : f1-macro 0.5892428117492057
KNN 1 k -  SelectKBest Asthma : f1-micro 0.8820754716981133
KNN 5 k -  SelectKBest Asthma : f1-macro 0.5892428117492057
KNN 5 k -  SelectKBest Asthma : f1-micro 0.8820754716981133
NB -  SelectKBest Asthma : f1-macro 0.5411255411255411
NB -  SelectKBest Asthma : f1-micro 0.8867924528301887
RF -  SelectKBest Asthma : f1-macro 0.7976547586583038
RF -  SelectKBest Asthma : f1-micro 0.9339622641509434


  f = msb / msw


SVM -  SelectKBest CHF : f1-macro 0.7214946743495723
SVM -  SelectKBest CHF : f1-micro 0.7724137931034483
KNN 1 k -  SelectKBest CHF : f1-macro 0.6574032403240324
KNN 1 k -  SelectKBest CHF : f1-micro 0.7103448275862069
KNN 5 k -  SelectKBest CHF : f1-macro 0.6574032403240324
KNN 5 k -  SelectKBest CHF : f1-micro 0.7103448275862069
NB -  SelectKBest CHF : f1-macro 0.706547619047619
NB -  SelectKBest CHF : f1-micro 0.7655172413793103
RF -  SelectKBest CHF : f1-macro 0.8356275303643725
RF -  SelectKBest CHF : f1-micro 0.8551724137931035


  f = msb / msw


SVM -  SelectKBest Depression : f1-macro 0.6754988028731046
SVM -  SelectKBest Depression : f1-micro 0.822429906542056
KNN 1 k -  SelectKBest Depression : f1-macro 0.4625816172777499
KNN 1 k -  SelectKBest Depression : f1-micro 0.7196261682242989
KNN 5 k -  SelectKBest Depression : f1-macro 0.4625816172777499
KNN 5 k -  SelectKBest Depression : f1-micro 0.7196261682242989
NB -  SelectKBest Depression : f1-macro 0.5243769801863148
NB -  SelectKBest Depression : f1-micro 0.7803738317757009
RF -  SelectKBest Depression : f1-macro 0.721387146407859
RF -  SelectKBest Depression : f1-micro 0.8457943925233644


  f = msb / msw


SVM -  SelectKBest Diabetes : f1-macro 0.7784679089026916
SVM -  SelectKBest Diabetes : f1-micro 0.850467289719626
KNN 1 k -  SelectKBest Diabetes : f1-macro 0.6177067248009205
KNN 1 k -  SelectKBest Diabetes : f1-micro 0.7242990654205608
KNN 5 k -  SelectKBest Diabetes : f1-macro 0.6177067248009205
KNN 5 k -  SelectKBest Diabetes : f1-micro 0.7242990654205608
NB -  SelectKBest Diabetes : f1-macro 0.6464080459770115
NB -  SelectKBest Diabetes : f1-micro 0.7850467289719625
RF -  SelectKBest Diabetes : f1-macro 0.9314641744548287
RF -  SelectKBest Diabetes : f1-micro 0.9485981308411215


  f = msb / msw


SVM -  SelectKBest Gallstones : f1-macro 0.5856273543900319
SVM -  SelectKBest Gallstones : f1-micro 0.8818181818181818
KNN 1 k -  SelectKBest Gallstones : f1-macro 0.4815396700706991
KNN 1 k -  SelectKBest Gallstones : f1-micro 0.8363636363636363
KNN 5 k -  SelectKBest Gallstones : f1-macro 0.4815396700706991
KNN 5 k -  SelectKBest Gallstones : f1-micro 0.8363636363636363
NB -  SelectKBest Gallstones : f1-macro 0.4991869918699187
NB -  SelectKBest Gallstones : f1-micro 0.8727272727272727
RF -  SelectKBest Gallstones : f1-macro 0.8543367909953652
RF -  SelectKBest Gallstones : f1-micro 0.9454545454545454


  f = msb / msw


SVM -  SelectKBest Gout : f1-macro 0.8350746268656717
SVM -  SelectKBest Gout : f1-micro 0.9457013574660633
KNN 1 k -  SelectKBest Gout : f1-macro 0.5620426829268292
KNN 1 k -  SelectKBest Gout : f1-micro 0.8823529411764706
KNN 5 k -  SelectKBest Gout : f1-macro 0.5620426829268292
KNN 5 k -  SelectKBest Gout : f1-micro 0.8823529411764706
NB -  SelectKBest Gout : f1-macro 0.46875
NB -  SelectKBest Gout : f1-micro 0.8823529411764706
RF -  SelectKBest Gout : f1-macro 0.8521379478133934
RF -  SelectKBest Gout : f1-micro 0.9502262443438914


  f = msb / msw


SVM -  SelectKBest Hypercholesterolemia : f1-macro 0.7952672378620971
SVM -  SelectKBest Hypercholesterolemia : f1-micro 0.7979274611398963
KNN 1 k -  SelectKBest Hypercholesterolemia : f1-macro 0.6786788399570354
KNN 1 k -  SelectKBest Hypercholesterolemia : f1-micro 0.6787564766839378
KNN 5 k -  SelectKBest Hypercholesterolemia : f1-macro 0.6786788399570354
KNN 5 k -  SelectKBest Hypercholesterolemia : f1-micro 0.6787564766839378
NB -  SelectKBest Hypercholesterolemia : f1-macro 0.7712777418659772
NB -  SelectKBest Hypercholesterolemia : f1-micro 0.7720207253886011
RF -  SelectKBest Hypercholesterolemia : f1-macro 0.901172348740062
RF -  SelectKBest Hypercholesterolemia : f1-micro 0.9015544041450776


  f = msb / msw


SVM -  SelectKBest Hypertriglyceridemia : f1-macro 0.8392686297438079
SVM -  SelectKBest Hypertriglyceridemia : f1-micro 0.9675925925925926
KNN 1 k -  SelectKBest Hypertriglyceridemia : f1-macro 0.6972091245061807
KNN 1 k -  SelectKBest Hypertriglyceridemia : f1-micro 0.9490740740740741
KNN 5 k -  SelectKBest Hypertriglyceridemia : f1-macro 0.6972091245061807
KNN 5 k -  SelectKBest Hypertriglyceridemia : f1-micro 0.9490740740740741
NB -  SelectKBest Hypertriglyceridemia : f1-macro 0.48201438848920863
NB -  SelectKBest Hypertriglyceridemia : f1-micro 0.9305555555555556
RF -  SelectKBest Hypertriglyceridemia : f1-macro 0.7378640776699029
RF -  SelectKBest Hypertriglyceridemia : f1-micro 0.9537037037037037


  f = msb / msw


SVM -  SelectKBest OA : f1-macro 0.666618559676721
SVM -  SelectKBest OA : f1-micro 0.7904761904761904
KNN 1 k -  SelectKBest OA : f1-macro 0.5712535728868926
KNN 1 k -  SelectKBest OA : f1-micro 0.7380952380952381
KNN 5 k -  SelectKBest OA : f1-macro 0.5712535728868926
KNN 5 k -  SelectKBest OA : f1-micro 0.7380952380952381
NB -  SelectKBest OA : f1-macro 0.5051353874883286
NB -  SelectKBest OA : f1-micro 0.7476190476190476
RF -  SelectKBest OA : f1-macro 0.7619047619047619
RF -  SelectKBest OA : f1-micro 0.8476190476190476


  f = msb / msw


SVM -  SelectKBest OSA : f1-macro 0.8929618768328447
SVM -  SelectKBest OSA : f1-micro 0.9452054794520548
KNN 1 k -  SelectKBest OSA : f1-macro 0.6161855670103092
KNN 1 k -  SelectKBest OSA : f1-micro 0.8447488584474886
KNN 5 k -  SelectKBest OSA : f1-macro 0.6161855670103092
KNN 5 k -  SelectKBest OSA : f1-micro 0.8447488584474886
NB -  SelectKBest OSA : f1-macro 0.5913486005089058
NB -  SelectKBest OSA : f1-micro 0.8493150684931505
RF -  SelectKBest OSA : f1-macro 0.8571552811236248
RF -  SelectKBest OSA : f1-micro 0.9315068493150684


  f = msb / msw


SVM -  SelectKBest Obesity : f1-macro 0.8335556148306216
SVM -  SelectKBest Obesity : f1-micro 0.8413461538461539
KNN 1 k -  SelectKBest Obesity : f1-macro 0.5871377530766176
KNN 1 k -  SelectKBest Obesity : f1-micro 0.5913461538461539
KNN 5 k -  SelectKBest Obesity : f1-macro 0.5871377530766176
KNN 5 k -  SelectKBest Obesity : f1-micro 0.5913461538461539
NB -  SelectKBest Obesity : f1-macro 0.6793186990676161
NB -  SelectKBest Obesity : f1-micro 0.6971153846153846
RF -  SelectKBest Obesity : f1-macro 0.9118644067796609
RF -  SelectKBest Obesity : f1-micro 0.9134615384615384


  f = msb / msw


SVM -  SelectKBest CAD : f1-macro 0.8076914002595258
SVM -  SelectKBest CAD : f1-micro 0.8133971291866029
KNN 1 k -  SelectKBest CAD : f1-macro 0.655717054263566
KNN 1 k -  SelectKBest CAD : f1-micro 0.6746411483253588
KNN 5 k -  SelectKBest CAD : f1-macro 0.655717054263566
KNN 5 k -  SelectKBest CAD : f1-micro 0.6746411483253588
NB -  SelectKBest CAD : f1-macro 0.8043071161048689
NB -  SelectKBest CAD : f1-micro 0.8086124401913876
RF -  SelectKBest CAD : f1-macro 0.9158794421952317
RF -  SelectKBest CAD : f1-micro 0.9186602870813397


  f = msb / msw


SVM -  SelectKBest Hypertension : f1-macro 0.609071544919183
SVM -  SelectKBest Hypertension : f1-micro 0.7836538461538463
KNN 1 k -  SelectKBest Hypertension : f1-macro 0.5317748917748918
KNN 1 k -  SelectKBest Hypertension : f1-micro 0.75
KNN 5 k -  SelectKBest Hypertension : f1-macro 0.5317748917748918
KNN 5 k -  SelectKBest Hypertension : f1-micro 0.75
NB -  SelectKBest Hypertension : f1-macro 0.5101362271992611
NB -  SelectKBest Hypertension : f1-micro 0.7548076923076922
RF -  SelectKBest Hypertension : f1-macro 0.6491477272727273
RF -  SelectKBest Hypertension : f1-micro 0.8173076923076923


  f = msb / msw


SVM -  SelectKBest PVD : f1-macro 0.7101041373487194
SVM -  SelectKBest PVD : f1-micro 0.9029126213592235
KNN 1 k -  SelectKBest PVD : f1-macro 0.7231182795698925
KNN 1 k -  SelectKBest PVD : f1-micro 0.9029126213592235
KNN 5 k -  SelectKBest PVD : f1-macro 0.7231182795698925
KNN 5 k -  SelectKBest PVD : f1-micro 0.9029126213592235
NB -  SelectKBest PVD : f1-macro 0.4966841186736475
NB -  SelectKBest PVD : f1-micro 0.8640776699029126
RF -  SelectKBest PVD : f1-macro 0.8218873517786562
RF -  SelectKBest PVD : f1-micro 0.9320388349514563


  f = msb / msw


SVM -  SelectKBest Venous Insufficiency : f1-macro 0.6877384196185286
SVM -  SelectKBest Venous Insufficiency : f1-micro 0.9528795811518325
KNN 1 k -  SelectKBest Venous Insufficiency : f1-macro 0.5335775335775337
KNN 1 k -  SelectKBest Venous Insufficiency : f1-micro 0.9162303664921466
KNN 5 k -  SelectKBest Venous Insufficiency : f1-macro 0.5335775335775337
KNN 5 k -  SelectKBest Venous Insufficiency : f1-micro 0.9162303664921466
NB -  SelectKBest Venous Insufficiency : f1-macro 0.4837837837837838
NB -  SelectKBest Venous Insufficiency : f1-micro 0.93717277486911
RF -  SelectKBest Venous Insufficiency : f1-macro 0.6292701863354038
RF -  SelectKBest Venous Insufficiency : f1-micro 0.9476439790575917


  f = msb / msw


SVM -  SelectKBest GERD : f1-macro 0.7657771717958707
SVM -  SelectKBest GERD : f1-micro 0.8594594594594595
KNN 1 k -  SelectKBest GERD : f1-macro 0.5524915336236091
KNN 1 k -  SelectKBest GERD : f1-micro 0.7837837837837838
KNN 5 k -  SelectKBest GERD : f1-macro 0.5524915336236091
KNN 5 k -  SelectKBest GERD : f1-micro 0.7837837837837838
NB -  SelectKBest GERD : f1-macro 0.4605646824550174
NB -  SelectKBest GERD : f1-micro 0.7783783783783784
RF -  SelectKBest GERD : f1-macro 0.7732843137254901
RF -  SelectKBest GERD : f1-micro 0.8702702702702703
Average SVM -  SelectKBest : f1-micro 0.8498288525422886
Average SVM -  SelectKBest : f1-macro 0.6647503728978593
Average KNN 1 -  SelectKBest : f1-micro 0.760994541536007
Average KNN 1 -  SelectKBest : f1-macro 0.5866424153346824
Average KNN 5 -  SelectKBest : f1-micro 0.760994541536007
Average KNN 5 -  SelectKBest : f1-macro 0.5866424153346824
Average NB -  SelectKBest : f1-micro 0.8104904535480079
Average NB -  SelectKBest : f1-macro 0.55542

RF -  InfoGainAttributeVal Obesity : f1-macro 0.9699277108433735
RF -  InfoGainAttributeVal Obesity : f1-micro 0.9711538461538461
SVM -  InfoGainAttributeVal CAD : f1-macro 0.8971095013714043
SVM -  InfoGainAttributeVal CAD : f1-micro 0.8995215311004785
KNN 1 k -  InfoGainAttributeVal CAD : f1-macro 0.7837844340190063
KNN 1 k -  InfoGainAttributeVal CAD : f1-micro 0.7942583732057417
KNN 5 k -  InfoGainAttributeVal CAD : f1-macro 0.7837844340190063
KNN 5 k -  InfoGainAttributeVal CAD : f1-micro 0.7942583732057417
NB -  InfoGainAttributeVal CAD : f1-macro 0.8046536796536796
NB -  InfoGainAttributeVal CAD : f1-micro 0.8181818181818182
RF -  InfoGainAttributeVal CAD : f1-macro 0.9052651668774006
RF -  InfoGainAttributeVal CAD : f1-micro 0.9090909090909091
SVM -  InfoGainAttributeVal Hypertension : f1-macro 0.7020667075915695
SVM -  InfoGainAttributeVal Hypertension : f1-micro 0.8653846153846154
KNN 1 k -  InfoGainAttributeVal Hypertension : f1-macro 0.6049545829892651
KNN 1 k -  InfoGainAt

# Word Embeddings

In [16]:
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

svm_f1_micro_scores = []
svm_f1_macro_scores = []
knn1_f1_micro_scores = []
knn1_f1_macro_scores = []
knn5_f1_micro_scores = []
knn5_f1_macro_scores = []
nb_f1_micro_scores = []
nb_f1_macro_scores = []
rf_f1_micro_scores = []
rf_f1_macro_scores = []

max_tokens = 600

for _, embedding in enumerate(embedding_list):
    for _,disease in enumerate(disease_list):
        disease_data_df = all_df_expanded [all_df_expanded ['disease'] == disease]

        if embedding == 'GloVe':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['vector_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_GloVe(X_train)
            X_test = vectorize_batch_GloVe(X_test)
        if embedding == 'FastText':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['vector_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_FastText(X_train)
            X_test = vectorize_batch_FastText(X_test)
        if embedding == 'USE':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['sentence_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_USE(X_train)
            X_test = vectorize_batch_USE(X_test)

        Train_Y  = Encoder.fit_transform(y_train)
        Test_Y  = Encoder.fit_transform(y_test)

        svm_f1_macro, svm_f1_micro = performSVM(X_train, X_test, Train_Y, Test_Y, embedding)
        svm_f1_macro_scores.append(svm_f1_macro)
        svm_f1_micro_scores.append(svm_f1_micro)

        knn_f1_macro1, knn_f1_micro1, knn_f1_macro5, knn_f1_micro5 = performKNN(X_train, X_test, Train_Y, Test_Y, embedding)
        knn1_f1_macro_scores.append(knn_f1_macro1)
        knn1_f1_micro_scores.append(knn_f1_micro1)
        knn5_f1_macro_scores.append(knn_f1_macro5)
        knn5_f1_micro_scores.append(knn_f1_micro5)

        nb_f1_macro, nb_f1_micro = performNB(X_train, X_test, Train_Y, Test_Y, embedding)
        nb_f1_macro_scores.append(nb_f1_macro)
        nb_f1_micro_scores.append(nb_f1_micro)

        rf_f1_macro, rf_f1_micro = performRF(X_train, X_test, Train_Y, Test_Y, embedding)
        rf_f1_macro_scores.append(rf_f1_macro)
        rf_f1_micro_scores.append(rf_f1_micro)
        
    print("Average SVM - ", embedding, ": f1-micro", mean(svm_f1_micro_scores))
    print("Average SVM - ", embedding, ": f1-macro", mean(svm_f1_macro_scores))

    print("Average KNN 1 - ", embedding, ": f1-micro", mean(knn1_f1_micro_scores))
    print("Average KNN 1 - ", embedding, ": f1-macro", mean(knn1_f1_macro_scores))
    print("Average KNN 5 - ", embedding, ": f1-micro", mean(knn5_f1_micro_scores))
    print("Average KNN 5 - ", embedding, ": f1-macro", mean(knn5_f1_macro_scores))

    print("Average NB - ", embedding, ": f1-micro", mean(nb_f1_micro_scores))
    print("Average NB - ", embedding, ": f1-macro", mean(nb_f1_macro_scores))

    print("Average RF - ", embedding, ": f1-micro", mean(rf_f1_micro_scores))
    print("Average RF - ", embedding, ": f1-macro", mean(rf_f1_macro_scores))

SVM -  GloVe Asthma : f1-macro 0.4557823129251701
SVM -  GloVe Asthma : f1-micro 0.8375
KNN 1 k -  GloVe Asthma : f1-macro 0.4557823129251701
KNN 1 k -  GloVe Asthma : f1-micro 0.8375
KNN 5 k -  GloVe Asthma : f1-macro 0.4557823129251701
KNN 5 k -  GloVe Asthma : f1-micro 0.8375
NB -  GloVe Asthma : f1-macro 0.4799711095060837
NB -  GloVe Asthma : f1-micro 0.8375
RF -  GloVe Asthma : f1-macro 0.4557823129251701
RF -  GloVe Asthma : f1-micro 0.8375
SVM -  GloVe CHF : f1-macro 0.7099485115701012
SVM -  GloVe CHF : f1-micro 0.8284023668639053
KNN 1 k -  GloVe CHF : f1-macro 0.1990521327014218
KNN 1 k -  GloVe CHF : f1-micro 0.2485207100591716
KNN 5 k -  GloVe CHF : f1-macro 0.1990521327014218
KNN 5 k -  GloVe CHF : f1-micro 0.2485207100591716
NB -  GloVe CHF : f1-macro 0.6253058103975535
NB -  GloVe CHF : f1-micro 0.6568047337278107
RF -  GloVe CHF : f1-macro 0.47742733457019176
RF -  GloVe CHF : f1-micro 0.7633136094674556
SVM -  GloVe Depression : f1-macro 0.4957010582010582
SVM -  GloV

NB -  GloVe GERD : f1-macro 0.4590637642072819
NB -  GloVe GERD : f1-micro 0.7403846153846154
RF -  GloVe GERD : f1-macro 0.42857142857142855
RF -  GloVe GERD : f1-micro 0.75
Average SVM -  GloVe : f1-micro 0.7933303976022056
Average SVM -  GloVe : f1-macro 0.5086363586785232
Average KNN 1 -  GloVe : f1-micro 0.6233822170652848
Average KNN 1 -  GloVe : f1-macro 0.3628547491570595
Average KNN 5 -  GloVe : f1-micro 0.6233822170652848
Average KNN 5 -  GloVe : f1-macro 0.3628547491570595
Average NB -  GloVe : f1-micro 0.619267218997998
Average NB -  GloVe : f1-macro 0.4852423700112217
Average RF -  GloVe : f1-micro 0.7855080620246021
Average RF -  GloVe : f1-macro 0.4577054621250818
SVM -  FastText Asthma : f1-macro 0.45080091533180777
SVM -  FastText Asthma : f1-micro 0.8208333333333333
KNN 1 k -  FastText Asthma : f1-macro 0.45080091533180777
KNN 1 k -  FastText Asthma : f1-micro 0.8208333333333333
KNN 5 k -  FastText Asthma : f1-macro 0.45080091533180777
KNN 5 k -  FastText Asthma : f1-

NB -  FastText PVD : f1-macro 0.3332107730489935
NB -  FastText PVD : f1-micro 0.33760683760683763
RF -  FastText PVD : f1-macro 0.4507042253521127
RF -  FastText PVD : f1-micro 0.8205128205128205
SVM -  FastText Venous Insufficiency : f1-macro 0.4782608695652174
SVM -  FastText Venous Insufficiency : f1-micro 0.9166666666666666
KNN 1 k -  FastText Venous Insufficiency : f1-macro 0.4782608695652174
KNN 1 k -  FastText Venous Insufficiency : f1-micro 0.9166666666666666
KNN 5 k -  FastText Venous Insufficiency : f1-macro 0.4782608695652174
KNN 5 k -  FastText Venous Insufficiency : f1-micro 0.9166666666666666
NB -  FastText Venous Insufficiency : f1-macro 0.13334906773660607
NB -  FastText Venous Insufficiency : f1-micro 0.13425925925925927
RF -  FastText Venous Insufficiency : f1-macro 0.4782608695652174
RF -  FastText Venous Insufficiency : f1-micro 0.9166666666666666
SVM -  FastText GERD : f1-macro 0.44285714285714284
SVM -  FastText GERD : f1-micro 0.7403846153846154
KNN 1 k -  FastT

NB -  USE Hypertension : f1-macro 0.18014432376134507
NB -  USE Hypertension : f1-micro 0.21518987341772153
RF -  USE Hypertension : f1-macro 0.5712482645462578
RF -  USE Hypertension : f1-micro 0.8185654008438819
SVM -  USE PVD : f1-macro 0.6648575305291724
SVM -  USE PVD : f1-micro 0.8376068376068376
KNN 1 k -  USE PVD : f1-macro 0.6149135349502253
KNN 1 k -  USE PVD : f1-micro 0.7393162393162394
KNN 5 k -  USE PVD : f1-macro 0.6149135349502253
KNN 5 k -  USE PVD : f1-micro 0.7393162393162394
NB -  USE PVD : f1-macro 0.4608548455353364
NB -  USE PVD : f1-micro 0.7905982905982905
RF -  USE PVD : f1-macro 0.6741955445544554
RF -  USE PVD : f1-micro 0.8461538461538461
SVM -  USE Venous Insufficiency : f1-macro 0.6854368932038835
SVM -  USE Venous Insufficiency : f1-micro 0.9444444444444444
KNN 1 k -  USE Venous Insufficiency : f1-macro 0.6181046676096181
KNN 1 k -  USE Venous Insufficiency : f1-micro 0.9074074074074074
KNN 5 k -  USE Venous Insufficiency : f1-macro 0.6181046676096181
KN