This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
DATA_PATH = './obesity_data/'

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext
from nltk.corpus import stopwords
import tensorflow_hub as hub

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

cachedStopWords = stopwords.words("english")

#Download info for USE
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

all_docs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')
all_docs_df_ns = pd.read_pickle(DATA_PATH + '/alldocs_df_ns.pkl')
all_annot_df = pd.read_pickle(DATA_PATH + '/allannot_df.pkl')
all_df_expanded = pd.read_pickle(DATA_PATH + '/all_df_expanded.pkl')



In [3]:
all_df = pd.merge(all_docs_df,all_annot_df, on='id')
all_df_ns = pd.merge(all_docs_df_ns,all_annot_df, on='id')

disease_list = all_df['disease'].unique().tolist()
feature_list = ['All','ExtraTreeClassifier','SelectKBest','InfoGainAttributeVal']
embedding_list = ['GloVe', 'FastText', 'USE']
device = torch.device('cpu')

In [4]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)

In [5]:
for index, entry in enumerate(all_df_ns['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df_ns.loc[index, 'text_final'] = str(Final_words)

In [6]:
def performSVM(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf, y_train)

    predictions_SVM = SVM.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_SVM)
    f1_macro = f1_score(y_test, predictions_SVM,average='macro')
    f1_micro = f1_score(y_test, predictions_SVM,average='micro')

    #print("SVM - ", disease, ": f1-score", f1)
    print("SVM - ", feature, disease, ": f1-macro", f1_macro)
    print("SVM - ", feature, disease, ": f1-micro", f1_micro)

In [7]:
def performKNN(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    knn = KNeighborsClassifier(n_neighbors=7)
    clf = knn.fit(Train_X_Tfidf, y_train)

    # predict the labels on validation dataset
    predictions_KNN = clf.predict(Test_X_Tfidf)

    #auroc = roc_auc_score(truth, pred[:,1])
    #f1 = f1_score(y_test, predictions_KNN)
    f1_macro = f1_score(y_test, predictions_KNN,average='macro')
    f1_micro = f1_score(y_test, predictions_KNN,average='micro')

    #print("KNN - ", disease, ": f1-score", f1)
    print("KNN - ", feature, disease, ": f1-macro", f1_macro)
    print("KNN - ", feature, disease, ": f1-micro", f1_micro)

In [8]:
def performNB(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    Naive = naive_bayes.MultinomialNB()
    Naive.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_NB = Naive.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_NB)
    f1_macro = f1_score(y_test, predictions_NB,average='macro')
    f1_micro = f1_score(y_test, predictions_NB,average='micro')

    #print("NB - ", disease, ": f1-score", f1)
    print("NB - ", feature, disease, ": f1-macro", f1_macro)
    print("NB - ", feature, disease, ": f1-micro", f1_micro)

In [9]:
def performRF(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature):
    classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
    classifier.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_RF = classifier.predict(Test_X_Tfidf)

    #f1 = f1_score(y_test, predictions_RF)
    f1_macro = f1_score(y_test, predictions_RF,average='macro')
    f1_micro = f1_score(y_test, predictions_RF,average='micro')

    #print("RF - ", disease, ": f1-score", f1)
    print("RF - ", feature, disease, ": f1-macro", f1_macro)
    print("RF - ", feature, disease, ": f1-micro", f1_micro)

In [10]:
def vectorize_batch_GloVe(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.GloVe(name='6B', dim=embedding_size_used)    
    
    X = [0] * X_Train.shape[0]
    
    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()

        X[i] = vectors.flatten()

    return X

In [11]:
def vectorize_batch_FastText(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.FastText()

    X = [0] * X_Train.shape[0]

    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()

        X[i] = vectors.flatten()

    return X

In [12]:
def vectorize_batch_USE(X_Train):
    embedding_size_used = 512

    X = [0] * X_Train.shape[0]

    for i in range(len(X_Train)):
        
        tensor_flow_vectors = embed(x)
        array_vectors = tensor_flow_vectors.numpy()

        X[i] = torch.tensor(array_vectors).float()
        Y[i] = torch.tensor(float(y == True))

    return X,Y 

In [13]:
def generateXTrainAndTest(X_train, X_test, Tfidf_vect):
    X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
    X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
    X_training = np.asarray(X_training, dtype=float)
    X_training = torch.from_numpy(X_training).to(device)

    X_test_values_list = Tfidf_vect.transform(X_test).toarray()
    X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
    X_testing = np.asarray(X_testing, dtype=float)
    X_testing = torch.from_numpy(X_testing).to(device)  
    
    return X_training, X_testing

In [14]:
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif

def getVocab(X_train, y_train, feature, max_tokens):
 
    ## Step 1: Determine the Initial Vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab = list(tokenizer.word_index.keys())

    ## Step 2: Create term  matrix
    vectors = tokenizer.texts_to_matrix(X_train, mode='count')

    ## Do feature selection on term matrix (column headers are words)
    X = vectors
    y = y_train

    ##Choose algorithm
    if feature == 'SelectKBest':
        selector = SelectKBest(score_func=f_classif, k=max_tokens).fit(X,y)
    else: 
        if feature == 'InfoGainAttributeVal':
            #This should be similar to the InfoGain?
            selector = SelectKBest(score_func=mutual_info_classif, k=max_tokens).fit(X,y)
        else:
            #default to ExtraTreeClassifier
            estimator = ExtraTreeClassifier(random_state = seed)
            selector = SelectFromModel(estimator, max_features = max_tokens)
            selector = selector.fit(X, y)

    support_idx = selector.get_support(True)
    tokenizer2 = Tokenizer()
    tokenizer2.fit_on_texts([vocab[i-1].replace("'","") for i in support_idx])
    new_vocab = list(tokenizer2.word_index.keys())

    return new_vocab

# Feature Selection

In [15]:
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

max_tokens = 600

for _,disease in enumerate(disease_list):
    for _,feature in enumerate(feature_list):
        disease_data_df = all_df[all_df['disease'] == disease]

        X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
        
        if feature != 'All':
            vocab = getVocab(X_train,y_train, feature, max_tokens)
            Tfidf_vect = TfidfVectorizer(max_features=max_tokens,vocabulary = vocab)
        else:
            Tfidf_vect = TfidfVectorizer(max_features=max_tokens)
  
        
        X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
        X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
        X_training = np.asarray(X_training, dtype=float)
        X_training = torch.from_numpy(X_training).to(device)

        X_test_values_list = Tfidf_vect.transform(X_test).toarray()
        X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
        X_testing = np.asarray(X_testing, dtype=float)
        X_testing = torch.from_numpy(X_testing).to(device)
        
        tokens_to_use = X_training.shape[1]

        Train_Y  = Encoder.fit_transform(y_train)
        Test_Y  = Encoder.fit_transform(y_test)

        performSVM(X_training, X_testing, Train_Y, Test_Y, feature)

        performKNN(X_training, X_testing, Train_Y, Test_Y, feature)

        performNB(X_training, X_testing, Train_Y, Test_Y, feature)

        performRF(X_training, X_testing, Train_Y, Test_Y, feature)
        

SVM -  All Asthma : f1-macro 0.609237775127399
SVM -  All Asthma : f1-micro 0.9198113207547169
KNN -  All Asthma : f1-macro 0.5846627483907081
KNN -  All Asthma : f1-micro 0.9009433962264151
NB -  All Asthma : f1-macro 0.6466666666666667
NB -  All Asthma : f1-micro 0.9245283018867925
RF -  All Asthma : f1-macro 0.6466666666666667
RF -  All Asthma : f1-micro 0.9245283018867925
SVM -  ExtraTreeClassifier Asthma : f1-macro 0.6002514142049027
SVM -  ExtraTreeClassifier Asthma : f1-micro 0.8726415094339622
KNN -  ExtraTreeClassifier Asthma : f1-macro 0.49066108656901497
KNN -  ExtraTreeClassifier Asthma : f1-micro 0.8537735849056604
NB -  ExtraTreeClassifier Asthma : f1-macro 0.4953623902158746
NB -  ExtraTreeClassifier Asthma : f1-micro 0.8632075471698113
RF -  ExtraTreeClassifier Asthma : f1-macro 0.5750286368843069
RF -  ExtraTreeClassifier Asthma : f1-micro 0.8679245283018869


  f = msb / msw


SVM -  SelectKBest Asthma : f1-macro 0.8923311325545962
SVM -  SelectKBest Asthma : f1-micro 0.9433962264150944
KNN -  SelectKBest Asthma : f1-macro 0.6476063829787234
KNN -  SelectKBest Asthma : f1-micro 0.8584905660377359
NB -  SelectKBest Asthma : f1-macro 0.5004776157422148
NB -  SelectKBest Asthma : f1-micro 0.8254716981132075
RF -  SelectKBest Asthma : f1-macro 0.8110853680270896
RF -  SelectKBest Asthma : f1-micro 0.9056603773584906
SVM -  InfoGainAttributeVal Asthma : f1-macro 0.8885223959080262
SVM -  InfoGainAttributeVal Asthma : f1-micro 0.9481132075471698
KNN -  InfoGainAttributeVal Asthma : f1-macro 0.6892406918792143
KNN -  InfoGainAttributeVal Asthma : f1-micro 0.8820754716981133
NB -  InfoGainAttributeVal Asthma : f1-macro 0.5706404078497103
NB -  InfoGainAttributeVal Asthma : f1-micro 0.8632075471698113
RF -  InfoGainAttributeVal Asthma : f1-macro 0.8479850853291266
RF -  InfoGainAttributeVal Asthma : f1-micro 0.9292452830188679
SVM -  All CHF : f1-macro 0.844006568144

  f = msb / msw


SVM -  SelectKBest CHF : f1-macro 0.8228181876648016
SVM -  SelectKBest CHF : f1-micro 0.8689655172413793
KNN -  SelectKBest CHF : f1-macro 0.6610389610389611
KNN -  SelectKBest CHF : f1-micro 0.7517241379310344
NB -  SelectKBest CHF : f1-macro 0.5607626514611547
NB -  SelectKBest CHF : f1-micro 0.7655172413793103
RF -  SelectKBest CHF : f1-macro 0.872997997997998
RF -  SelectKBest CHF : f1-micro 0.903448275862069
SVM -  InfoGainAttributeVal CHF : f1-macro 0.9243846474760118
SVM -  InfoGainAttributeVal CHF : f1-micro 0.9310344827586207
KNN -  InfoGainAttributeVal CHF : f1-macro 0.6754662840746055
KNN -  InfoGainAttributeVal CHF : f1-micro 0.7310344827586208
NB -  InfoGainAttributeVal CHF : f1-macro 0.3958333333333333
NB -  InfoGainAttributeVal CHF : f1-micro 0.6551724137931034
RF -  InfoGainAttributeVal CHF : f1-macro 0.9065721649484535
RF -  InfoGainAttributeVal CHF : f1-micro 0.9172413793103448
SVM -  All Depression : f1-macro 0.42933333333333334
SVM -  All Depression : f1-micro 0.75

  f = msb / msw


SVM -  SelectKBest Depression : f1-macro 0.6845490686559164
SVM -  SelectKBest Depression : f1-micro 0.8457943925233644
KNN -  SelectKBest Depression : f1-macro 0.4427083333333333
KNN -  SelectKBest Depression : f1-micro 0.794392523364486
NB -  SelectKBest Depression : f1-macro 0.4885044388800365
NB -  SelectKBest Depression : f1-micro 0.8037383177570093
RF -  SelectKBest Depression : f1-macro 0.8029465930018416
RF -  SelectKBest Depression : f1-micro 0.897196261682243
SVM -  InfoGainAttributeVal Depression : f1-macro 0.8366162772942434
SVM -  InfoGainAttributeVal Depression : f1-micro 0.9065420560747663
KNN -  InfoGainAttributeVal Depression : f1-macro 0.6462809917355372
KNN -  InfoGainAttributeVal Depression : f1-micro 0.8177570093457944
NB -  InfoGainAttributeVal Depression : f1-macro 0.5403027823240589
NB -  InfoGainAttributeVal Depression : f1-micro 0.8037383177570093
RF -  InfoGainAttributeVal Depression : f1-macro 0.7935558556820372
RF -  InfoGainAttributeVal Depression : f1-mic

  f = msb / msw


SVM -  SelectKBest Diabetes : f1-macro 0.8010934937124111
SVM -  SelectKBest Diabetes : f1-micro 0.8411214953271028
KNN -  SelectKBest Diabetes : f1-macro 0.6656249999999999
KNN -  SelectKBest Diabetes : f1-micro 0.7476635514018691
NB -  SelectKBest Diabetes : f1-macro 0.4174458515800686
NB -  SelectKBest Diabetes : f1-micro 0.677570093457944
RF -  SelectKBest Diabetes : f1-macro 0.9265478547854786
RF -  SelectKBest Diabetes : f1-micro 0.9392523364485983
SVM -  InfoGainAttributeVal Diabetes : f1-macro 0.9317384370015949
SVM -  InfoGainAttributeVal Diabetes : f1-micro 0.9392523364485983
KNN -  InfoGainAttributeVal Diabetes : f1-macro 0.7781822778409808
KNN -  InfoGainAttributeVal Diabetes : f1-micro 0.808411214953271
NB -  InfoGainAttributeVal Diabetes : f1-macro 0.43433147907050146
NB -  InfoGainAttributeVal Diabetes : f1-micro 0.6869158878504673
RF -  InfoGainAttributeVal Diabetes : f1-macro 0.9516893671457596
RF -  InfoGainAttributeVal Diabetes : f1-micro 0.9579439252336449
SVM -  Al

  f = msb / msw


SVM -  SelectKBest Gallstones : f1-macro 0.4735582675281167
SVM -  SelectKBest Gallstones : f1-micro 0.8181818181818182
KNN -  SelectKBest Gallstones : f1-macro 0.45
KNN -  SelectKBest Gallstones : f1-micro 0.8181818181818182
NB -  SelectKBest Gallstones : f1-macro 0.45
NB -  SelectKBest Gallstones : f1-micro 0.8181818181818182
RF -  SelectKBest Gallstones : f1-macro 0.8240074102143068
RF -  SelectKBest Gallstones : f1-micro 0.9136363636363637
SVM -  InfoGainAttributeVal Gallstones : f1-macro 0.8608475648323846
SVM -  InfoGainAttributeVal Gallstones : f1-micro 0.9272727272727272
KNN -  InfoGainAttributeVal Gallstones : f1-macro 0.4901787684837784
KNN -  InfoGainAttributeVal Gallstones : f1-micro 0.8090909090909091
NB -  InfoGainAttributeVal Gallstones : f1-macro 0.4735582675281168
NB -  InfoGainAttributeVal Gallstones : f1-micro 0.8181818181818182
RF -  InfoGainAttributeVal Gallstones : f1-macro 0.9003623188405796
RF -  InfoGainAttributeVal Gallstones : f1-micro 0.9454545454545454
SVM 

  f = msb / msw


SVM -  SelectKBest Gout : f1-macro 0.8474358974358975
SVM -  SelectKBest Gout : f1-micro 0.9366515837104072
KNN -  SelectKBest Gout : f1-macro 0.519059319059319
KNN -  SelectKBest Gout : f1-micro 0.8597285067873304
NB -  SelectKBest Gout : f1-macro 0.4596577017114914
NB -  SelectKBest Gout : f1-micro 0.8506787330316742
RF -  SelectKBest Gout : f1-macro 0.8606489789979144
RF -  SelectKBest Gout : f1-micro 0.9411764705882353
SVM -  InfoGainAttributeVal Gout : f1-macro 0.9564102564102565
SVM -  InfoGainAttributeVal Gout : f1-micro 0.9819004524886877
KNN -  InfoGainAttributeVal Gout : f1-macro 0.6813725490196079
KNN -  InfoGainAttributeVal Gout : f1-micro 0.9095022624434389
NB -  InfoGainAttributeVal Gout : f1-macro 0.5043133195307109
NB -  InfoGainAttributeVal Gout : f1-micro 0.8823529411764706
RF -  InfoGainAttributeVal Gout : f1-macro 0.8886649874055417
RF -  InfoGainAttributeVal Gout : f1-micro 0.9592760180995475
SVM -  All Hypercholesterolemia : f1-macro 0.7525641025641026
SVM -  All 

  f = msb / msw


SVM -  SelectKBest Hypercholesterolemia : f1-macro 0.7651548672566372
SVM -  SelectKBest Hypercholesterolemia : f1-micro 0.7720207253886011
KNN -  SelectKBest Hypercholesterolemia : f1-macro 0.6678325560737286
KNN -  SelectKBest Hypercholesterolemia : f1-micro 0.6787564766839378
NB -  SelectKBest Hypercholesterolemia : f1-macro 0.6961313540260909
NB -  SelectKBest Hypercholesterolemia : f1-micro 0.7098445595854923
RF -  SelectKBest Hypercholesterolemia : f1-macro 0.8303669523181719
RF -  SelectKBest Hypercholesterolemia : f1-micro 0.8341968911917098
SVM -  InfoGainAttributeVal Hypercholesterolemia : f1-macro 0.8746753246753247
SVM -  InfoGainAttributeVal Hypercholesterolemia : f1-micro 0.8756476683937824
KNN -  InfoGainAttributeVal Hypercholesterolemia : f1-macro 0.6630677087529879
KNN -  InfoGainAttributeVal Hypercholesterolemia : f1-micro 0.6632124352331606
NB -  InfoGainAttributeVal Hypercholesterolemia : f1-macro 0.6616409537166901
NB -  InfoGainAttributeVal Hypercholesterolemia : 

  f = msb / msw


SVM -  SelectKBest Hypertriglyceridemia : f1-macro 0.8623677838664456
SVM -  SelectKBest Hypertriglyceridemia : f1-micro 0.9768518518518519
KNN -  SelectKBest Hypertriglyceridemia : f1-macro 0.758273381294964
KNN -  SelectKBest Hypertriglyceridemia : f1-micro 0.9675925925925926
NB -  SelectKBest Hypertriglyceridemia : f1-macro 0.6431062970442445
NB -  SelectKBest Hypertriglyceridemia : f1-micro 0.9583333333333334
RF -  SelectKBest Hypertriglyceridemia : f1-macro 0.7856839121190645
RF -  SelectKBest Hypertriglyceridemia : f1-micro 0.9675925925925926
SVM -  InfoGainAttributeVal Hypertriglyceridemia : f1-macro 0.758273381294964
SVM -  InfoGainAttributeVal Hypertriglyceridemia : f1-micro 0.9675925925925926
KNN -  InfoGainAttributeVal Hypertriglyceridemia : f1-macro 0.7047163362952836
KNN -  InfoGainAttributeVal Hypertriglyceridemia : f1-micro 0.9629629629629629
NB -  InfoGainAttributeVal Hypertriglyceridemia : f1-macro 0.48693586698337293
NB -  InfoGainAttributeVal Hypertriglyceridemia : f

  f = msb / msw


SVM -  SelectKBest OA : f1-macro 0.640472521828454
SVM -  SelectKBest OA : f1-micro 0.8095238095238095
KNN -  SelectKBest OA : f1-macro 0.5357565163744795
KNN -  SelectKBest OA : f1-micro 0.7952380952380952
NB -  SelectKBest OA : f1-macro 0.47783783783783784
NB -  SelectKBest OA : f1-micro 0.780952380952381
RF -  SelectKBest OA : f1-macro 0.765886287625418
RF -  SelectKBest OA : f1-micro 0.8714285714285714
SVM -  InfoGainAttributeVal OA : f1-macro 0.8645575206218812
SVM -  InfoGainAttributeVal OA : f1-micro 0.9095238095238095
KNN -  InfoGainAttributeVal OA : f1-macro 0.6864280237297712
KNN -  InfoGainAttributeVal OA : f1-micro 0.8238095238095238
NB -  InfoGainAttributeVal OA : f1-macro 0.5635783674061186
NB -  InfoGainAttributeVal OA : f1-micro 0.7952380952380952
RF -  InfoGainAttributeVal OA : f1-macro 0.8241013875232164
RF -  InfoGainAttributeVal OA : f1-micro 0.8904761904761904
SVM -  All OSA : f1-macro 0.5229319988224904
SVM -  All OSA : f1-micro 0.8310502283105022
KNN -  All OSA :

  f = msb / msw


SVM -  SelectKBest OSA : f1-macro 0.8624371859296482
SVM -  SelectKBest OSA : f1-micro 0.954337899543379
KNN -  SelectKBest OSA : f1-macro 0.5384031700531152
KNN -  SelectKBest OSA : f1-micro 0.8858447488584474
NB -  SelectKBest OSA : f1-macro 0.5093353248693054
NB -  SelectKBest OSA : f1-micro 0.8904109589041096
RF -  SelectKBest OSA : f1-macro 0.9097438916691198
RF -  SelectKBest OSA : f1-micro 0.9680365296803652
SVM -  InfoGainAttributeVal OSA : f1-macro 0.9473304473304474
SVM -  InfoGainAttributeVal OSA : f1-micro 0.9817351598173516
KNN -  InfoGainAttributeVal OSA : f1-macro 0.9058180254346624
KNN -  InfoGainAttributeVal OSA : f1-micro 0.9680365296803652
NB -  InfoGainAttributeVal OSA : f1-macro 0.7869809203142537
NB -  InfoGainAttributeVal OSA : f1-micro 0.9406392694063926
RF -  InfoGainAttributeVal OSA : f1-macro 0.9473304473304474
RF -  InfoGainAttributeVal OSA : f1-micro 0.9817351598173516
SVM -  All Obesity : f1-macro 0.6856476856476856
SVM -  All Obesity : f1-micro 0.71634615

  f = msb / msw


SVM -  SelectKBest Obesity : f1-macro 0.803030303030303
SVM -  SelectKBest Obesity : f1-micro 0.8076923076923077
KNN -  SelectKBest Obesity : f1-macro 0.6530690049561569
KNN -  SelectKBest Obesity : f1-micro 0.6634615384615384
NB -  SelectKBest Obesity : f1-macro 0.7448957506673402
NB -  SelectKBest Obesity : f1-micro 0.7548076923076922
RF -  SelectKBest Obesity : f1-macro 0.9613095238095237
RF -  SelectKBest Obesity : f1-micro 0.9615384615384616
SVM -  InfoGainAttributeVal Obesity : f1-macro 0.9415292353823088
SVM -  InfoGainAttributeVal Obesity : f1-micro 0.9423076923076923
KNN -  InfoGainAttributeVal Obesity : f1-macro 0.766654732068266
KNN -  InfoGainAttributeVal Obesity : f1-micro 0.7740384615384616
NB -  InfoGainAttributeVal Obesity : f1-macro 0.8428854687942593
NB -  InfoGainAttributeVal Obesity : f1-micro 0.8461538461538461
RF -  InfoGainAttributeVal Obesity : f1-macro 0.9806547619047619
RF -  InfoGainAttributeVal Obesity : f1-micro 0.9807692307692307
SVM -  All CAD : f1-macro 

  f = msb / msw


SVM -  SelectKBest CAD : f1-macro 0.8518150879183211
SVM -  SelectKBest CAD : f1-micro 0.8564593301435407
KNN -  SelectKBest CAD : f1-macro 0.7403726708074534
KNN -  SelectKBest CAD : f1-micro 0.7703349282296652
NB -  SelectKBest CAD : f1-macro 0.787459384378798
NB -  SelectKBest CAD : f1-micro 0.8038277511961722
RF -  SelectKBest CAD : f1-macro 0.9030919784269225
RF -  SelectKBest CAD : f1-micro 0.9090909090909091
SVM -  InfoGainAttributeVal CAD : f1-macro 0.8913310644734354
SVM -  InfoGainAttributeVal CAD : f1-micro 0.8947368421052632
KNN -  InfoGainAttributeVal CAD : f1-macro 0.7833992839645751
KNN -  InfoGainAttributeVal CAD : f1-micro 0.7894736842105263
NB -  InfoGainAttributeVal CAD : f1-macro 0.798712489862125
NB -  InfoGainAttributeVal CAD : f1-micro 0.8181818181818182
RF -  InfoGainAttributeVal CAD : f1-macro 0.8838243468593663
RF -  InfoGainAttributeVal CAD : f1-micro 0.8899521531100477
SVM -  All Hypertension : f1-macro 0.4363143631436314
SVM -  All Hypertension : f1-micro 0

  f = msb / msw


SVM -  SelectKBest Hypertension : f1-macro 0.4765694076038903
SVM -  SelectKBest Hypertension : f1-micro 0.8221153846153845
KNN -  SelectKBest Hypertension : f1-macro 0.4497354497354497
KNN -  SelectKBest Hypertension : f1-micro 0.8173076923076923
NB -  SelectKBest Hypertension : f1-macro 0.4497354497354497
NB -  SelectKBest Hypertension : f1-micro 0.8173076923076923
RF -  SelectKBest Hypertension : f1-macro 0.7362637362637363
RF -  SelectKBest Hypertension : f1-micro 0.8846153846153846
SVM -  InfoGainAttributeVal Hypertension : f1-macro 0.7726775956284153
SVM -  InfoGainAttributeVal Hypertension : f1-micro 0.9038461538461539
KNN -  InfoGainAttributeVal Hypertension : f1-macro 0.5535609756097561
KNN -  InfoGainAttributeVal Hypertension : f1-micro 0.8413461538461539
NB -  InfoGainAttributeVal Hypertension : f1-macro 0.4583333333333333
NB -  InfoGainAttributeVal Hypertension : f1-micro 0.8461538461538461
RF -  InfoGainAttributeVal Hypertension : f1-macro 0.8009569377990431
RF -  InfoGain

  f = msb / msw


SVM -  SelectKBest PVD : f1-macro 0.6494611457742484
SVM -  SelectKBest PVD : f1-micro 0.8689320388349513
KNN -  SelectKBest PVD : f1-macro 0.536166619757951
KNN -  SelectKBest PVD : f1-micro 0.8446601941747571
NB -  SelectKBest PVD : f1-macro 0.4535809018567639
NB -  SelectKBest PVD : f1-micro 0.8300970873786409
RF -  SelectKBest PVD : f1-macro 0.7406135531135531
RF -  SelectKBest PVD : f1-micro 0.8932038834951457
SVM -  InfoGainAttributeVal PVD : f1-macro 0.8531717747683535
SVM -  InfoGainAttributeVal PVD : f1-micro 0.941747572815534
KNN -  InfoGainAttributeVal PVD : f1-macro 0.7508064516129032
KNN -  InfoGainAttributeVal PVD : f1-micro 0.912621359223301
NB -  InfoGainAttributeVal PVD : f1-macro 0.5595394736842105
NB -  InfoGainAttributeVal PVD : f1-micro 0.8737864077669902
RF -  InfoGainAttributeVal PVD : f1-macro 0.8378443838934302
RF -  InfoGainAttributeVal PVD : f1-micro 0.9368932038834952
SVM -  All Venous Insufficiency : f1-macro 0.47237569060773477
SVM -  All Venous Insufficie

  f = msb / msw


SVM -  SelectKBest Venous Insufficiency : f1-macro 0.6086065573770492
SVM -  SelectKBest Venous Insufficiency : f1-micro 0.93717277486911
KNN -  SelectKBest Venous Insufficiency : f1-macro 0.48097826086956524
KNN -  SelectKBest Venous Insufficiency : f1-micro 0.9267015706806283
NB -  SelectKBest Venous Insufficiency : f1-macro 0.48097826086956524
NB -  SelectKBest Venous Insufficiency : f1-micro 0.9267015706806283
RF -  SelectKBest Venous Insufficiency : f1-macro 0.48097826086956524
RF -  SelectKBest Venous Insufficiency : f1-micro 0.9267015706806283
SVM -  InfoGainAttributeVal Venous Insufficiency : f1-macro 0.6877384196185286
SVM -  InfoGainAttributeVal Venous Insufficiency : f1-micro 0.9528795811518325
KNN -  InfoGainAttributeVal Venous Insufficiency : f1-macro 0.6738387978142076
KNN -  InfoGainAttributeVal Venous Insufficiency : f1-micro 0.9476439790575917
NB -  InfoGainAttributeVal Venous Insufficiency : f1-macro 0.4837837837837838
NB -  InfoGainAttributeVal Venous Insufficiency :

  f = msb / msw


SVM -  SelectKBest GERD : f1-macro 0.8155901116427433
SVM -  SelectKBest GERD : f1-micro 0.8918918918918919
KNN -  SelectKBest GERD : f1-macro 0.43768996960486317
KNN -  SelectKBest GERD : f1-micro 0.7783783783783784
NB -  SelectKBest GERD : f1-macro 0.48969221835075494
NB -  SelectKBest GERD : f1-micro 0.7945945945945945
RF -  SelectKBest GERD : f1-macro 0.8299632352941176
RF -  SelectKBest GERD : f1-micro 0.9027027027027028
SVM -  InfoGainAttributeVal GERD : f1-macro 0.7888127853881278
SVM -  InfoGainAttributeVal GERD : f1-micro 0.8594594594594595
KNN -  InfoGainAttributeVal GERD : f1-macro 0.6032475490196079
KNN -  InfoGainAttributeVal GERD : f1-micro 0.772972972972973
NB -  InfoGainAttributeVal GERD : f1-macro 0.460448782810547
NB -  InfoGainAttributeVal GERD : f1-micro 0.7351351351351352
RF -  InfoGainAttributeVal GERD : f1-macro 0.8369086100499559
RF -  InfoGainAttributeVal GERD : f1-micro 0.8864864864864865


# Word Embeddings

In [16]:
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

max_tokens = 600

for _,disease in enumerate(disease_list):
    for _, embedding in enumerate(embedding_list):
        disease_data_df = all_df_expanded [all_df_expanded ['disease'] == disease]

        

        if embedding == 'GloVe':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['vector_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_GloVe(X_train)
            X_test = vectorize_batch_GloVe(X_test)
        if embedding == 'FastText':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['vector_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_FastText(X_train)
            X_test = vectorize_batch_FastText(X_test)
        if embedding == 'USE':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['sentence_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_USE(X_train)
            X_test = vectorize_batch_USE(X_test)
            
            vectorize_batch_USE

        Train_Y  = Encoder.fit_transform(y_train)
        Test_Y  = Encoder.fit_transform(y_test)

        performSVM(X_train, X_test, Train_Y, Test_Y, embedding)

        performKNN(X_train, X_test, Train_Y, Test_Y, embedding)

        #performNB(X_train, X_test, Train_Y, Test_Y, embedding)

        performRF(X_train, X_test, Train_Y, Test_Y, embedding)

SVM -  GloVe Asthma : f1-macro 0.47019867549668876
SVM -  GloVe Asthma : f1-micro 0.8875
KNN -  GloVe Asthma : f1-macro 0.47019867549668876
KNN -  GloVe Asthma : f1-micro 0.8875
RF -  GloVe Asthma : f1-macro 0.47019867549668876
RF -  GloVe Asthma : f1-micro 0.8875
SVM -  FastText Asthma : f1-macro 0.4618834080717488
SVM -  FastText Asthma : f1-micro 0.8583333333333333
KNN -  FastText Asthma : f1-macro 0.4618834080717488
KNN -  FastText Asthma : f1-micro 0.8583333333333333
RF -  FastText Asthma : f1-macro 0.4618834080717488
RF -  FastText Asthma : f1-micro 0.8583333333333333


NameError: name 'x' is not defined