This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
#pip install tensorflow==2.10.*
#pip install tensorflow-text==2.10.0
#pip install tensorflow_hub

**Classical Machine Learning - TF-IDF - All Features**

![CML TFIDF All](images\cml-tfidf-all.gif)

**Classical Machine Learning - TF-IDF - ExtraTreesClassifier Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-extra.gif)

**Classical Machine Learning - TF-IDF - InfoGain Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-infogain.gif)

**Classical Machine Learning - TF-IDF - SelectKBest Features**

![CML TFIDF ExtraTrees](images\cml-tfidf-selectkbest.gif)

**Classical Machine Learning - Word Embeddings - No Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swno.gif)

**Classical Machine Learning - Word Embeddings - Stopwords**

![CML TFIDF ExtraTrees](images\cml-we-swyes.gif)

In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext
from nltk.corpus import stopwords
import tensorflow_hub as hub
import pickle

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

if os.path.exists(RESULTS_PATH) == False:
    os.mkdir(RESULTS_PATH)
if os.path.exists(MODELS_PATH) == False:
    os.mkdir(MODELS_PATH)

cachedStopWords = stopwords.words("english")

#Download info for USE
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

all_docs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')
all_docs_df_ns = pd.read_pickle(DATA_PATH + '/alldocs_df_ns.pkl')
all_annot_df = pd.read_pickle(DATA_PATH + '/allannot_df.pkl')
all_df_expanded = pd.read_pickle(DATA_PATH + '/all_df_expanded.pkl')




In [3]:
all_df = pd.merge(all_docs_df,all_annot_df, on='id')
all_df_ns = pd.merge(all_docs_df_ns,all_annot_df, on='id')

disease_list = all_df['disease'].unique().tolist()
print(disease_list)
#disease_list = ['Asthma']
feature_list = ['All','ExtraTreeClassifier','SelectKBest','InfoGainAttributeVal']
embedding_list = ['GloVe', 'FastText', 'USE']
device = torch.device('cpu')

['Asthma', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'Gout', 'Hypercholesterolemia', 'Hypertriglyceridemia', 'OA', 'OSA', 'Obesity', 'CAD', 'Hypertension', 'PVD', 'Venous Insufficiency', 'GERD']


In [4]:
result_cols = ['Batch','Disease','Classifier','Feature', 'F1_MACRO', 'F1_MICRO', 'Total Run (secs)']

def write_to_file(file, batch_name, disease, clfr, feature,f1_macro,f1_micro,runtime_sec):
    #Pass TFIDF or Embeddings
    
    results_file = f'{RESULTS_PATH}CML_{file}_results.csv'

    if os.path.exists(results_file):
        results = pd.read_csv(results_file)
    else:
        results = pd.DataFrame(columns=result_cols)

    result = pd.DataFrame(columns=result_cols,data=[[batch_name, disease,clfr,feature,f1_macro,f1_micro,runtime_sec]])
    results = pd.concat([results,result])

    #Save results - overwrite so we can see progress
    results.to_csv(results_file, index=False)

In [5]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)

In [6]:
for index, entry in enumerate(all_df_ns['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df_ns.loc[index, 'text_final'] = str(Final_words)

In [10]:
def performSVM(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature, disease):
    model_name = 'SVM_' + feature + '_' + disease
    start_time = time.time()
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf, y_train)

    predictions_SVM = SVM.predict(Test_X_Tfidf)
    end_time = time.time()
    runtime_sec = end_time-start_time

    #f1 = f1_score(y_test, predictions_SVM)
    f1_macro = f1_score(y_test, predictions_SVM,average='macro')
    f1_micro = f1_score(y_test, predictions_SVM,average='micro')

    #print("SVM - ", disease, ": f1-score", f1)
    print("SVM - ", feature, disease, ": f1-macro", f1_macro)
    print("SVM - ", feature, disease, ": f1-micro", f1_micro)
    
    write_to_file(file, batch_name, disease, "SVM", feature,f1_macro, f1_micro, runtime_sec)
    
    #Save model
    torch.save(SVM, f'{MODELS_PATH}{model_name}.pkl')

    return f1_macro, f1_micro

In [11]:
def performKNN(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature, disease):
    first_model_name = 'KNN1_' + feature + '_' + disease
    second_model_name = 'KNN5_' + feature + '_' + disease
    start_time = time.time()
    knn1 = KNeighborsClassifier(n_neighbors=1)
    clf1 = knn1.fit(Train_X_Tfidf, y_train)
    # predict the labels on validation dataset
    predictions_KNN1 = clf1.predict(Test_X_Tfidf)
    end_time = time.time()
    runtime_sec1 = end_time-start_time

    start_time = time.time()
    knn5 = KNeighborsClassifier(n_neighbors=5)
    clf5 = knn5.fit(Train_X_Tfidf, y_train)
    # predict the labels on validation dataset
    predictions_KNN5 = clf5.predict(Test_X_Tfidf)
    end_time = time.time()
    runtime_sec5 = end_time-start_time


    #auroc = roc_auc_score(truth, pred[:,1])
    #f1 = f1_score(y_test, predictions_KNN)
    f1_macro1 = f1_score(y_test, predictions_KNN1,average='macro')
    f1_macro5 = f1_score(y_test, predictions_KNN5,average='macro')
    f1_micro1 = f1_score(y_test, predictions_KNN1,average='micro')
    f1_micro5 = f1_score(y_test, predictions_KNN5,average='micro')

    #print("KNN - ", disease, ": f1-score", f1)
    print("KNN 1 k - ", feature, disease, ": f1-macro", f1_macro1)
    print("KNN 1 k - ", feature, disease, ": f1-micro", f1_micro1)
    print("KNN 5 k - ", feature, disease, ": f1-macro", f1_macro5)
    print("KNN 5 k - ", feature, disease, ": f1-micro", f1_micro5)
    
    write_to_file(file, batch_name, disease, "KNN n=1", feature,f1_macro1,f1_micro1, runtime_sec1)
    write_to_file(file, batch_name, disease, "KNN n=5", feature,f1_macro5,f1_micro5, runtime_sec5)
    
    #Save model
    torch.save(clf1, f'{MODELS_PATH}{first_model_name}.pkl')
    torch.save(clf5, f'{MODELS_PATH}{second_model_name}.pkl')

    return f1_macro1, f1_micro1, f1_macro5, f1_micro5

In [12]:
def performNB(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature, disease):
    model_name = 'NB_' + feature + '_' + disease
    start_time = time.time()
    scaler = MinMaxScaler()
    scaler.fit(Train_X_Tfidf)
    
    Naive = naive_bayes.MultinomialNB()
    Naive.fit(scaler.transform(Train_X_Tfidf),y_train)

    # predict the labels on validation dataset
    predictions_NB = Naive.predict(Test_X_Tfidf)
    end_time = time.time()
    runtime_sec = end_time-start_time

    #f1 = f1_score(y_test, predictions_NB)
    f1_macro = f1_score(y_test, predictions_NB,average='macro')
    f1_micro = f1_score(y_test, predictions_NB,average='micro')

    #print("NB - ", disease, ": f1-score", f1)
    print("NB - ", feature, disease, ": f1-macro", f1_macro)
    print("NB - ", feature, disease, ": f1-micro", f1_micro)

    write_to_file(file, batch_name, disease, "Naive Bayes", feature,f1_macro, f1_micro, runtime_sec)
    
    #Save model
    torch.save(Naive, f'{MODELS_PATH}{model_name}.pkl')
    
    return f1_macro, f1_micro

In [13]:
def performRF(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test, feature, disease):
    model_name = 'RF_' + feature + '_' + disease
    start_time = time.time()

    classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
    classifier.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_RF = classifier.predict(Test_X_Tfidf)
    end_time = time.time()
    runtime_sec = end_time-start_time

    #f1 = f1_score(y_test, predictions_RF)
    f1_macro = f1_score(y_test, predictions_RF,average='macro')
    f1_micro = f1_score(y_test, predictions_RF,average='micro')

    #print("RF - ", disease, ": f1-score", f1)
    print("RF - ", feature, disease, ": f1-macro", f1_macro)
    print("RF - ", feature, disease, ": f1-micro", f1_micro)
    
    write_to_file(file, batch_name, disease, "Random Forest", feature,f1_macro, f1_micro, runtime_sec)
    
    #Save model
    torch.save(classifier, f'{MODELS_PATH}{model_name}.pkl')

    return f1_macro, f1_micro

In [14]:
def vectorize_batch_GloVe(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.GloVe(name='6B', dim=embedding_size_used)
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()
        
        X[i,:] = vectors.flatten()
        
    return X

In [15]:
def vectorize_batch_FastText(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.FastText()
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()
        
        X[i,:] = vectors.flatten()
        
    return X

In [16]:
def vectorize_batch_USE(X_Train):
    embedding_size_used = 512
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        tensor_flow_vectors = embed(X_Train.iloc[i])
        array_vectors = tensor_flow_vectors.numpy()
        
        X[i,:] = array_vectors.flatten()
        
    return X

In [17]:
def generateXTrainAndTest(X_train, X_test, Tfidf_vect):
    X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
    X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
    X_training = np.asarray(X_training, dtype=float)
    X_training = torch.from_numpy(X_training).to(device)

    X_test_values_list = Tfidf_vect.transform(X_test).toarray()
    X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
    X_testing = np.asarray(X_testing, dtype=float)
    X_testing = torch.from_numpy(X_testing).to(device)  
    
    return X_training, X_testing

In [18]:
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif
from statistics import mean
from sklearn.preprocessing import MinMaxScaler

def getVocab(X_train, y_train, feature, max_tokens):
 
    ## Step 1: Determine the Initial Vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab = list(tokenizer.word_index.keys())

    ## Step 2: Create term  matrix
    vectors = tokenizer.texts_to_matrix(X_train, mode='count')

    ## Do feature selection on term matrix (column headers are words)
    X = vectors
    y = y_train

    ##Choose algorithm
    if feature == 'SelectKBest':
        selector = SelectKBest(score_func=f_classif, k=max_tokens).fit(X,y)
    else: 
        if feature == 'InfoGainAttributeVal':
            #This should be similar to the InfoGain?
            selector = SelectKBest(score_func=mutual_info_classif, k=max_tokens).fit(X,y)
        else:
            #default to ExtraTreeClassifier
            estimator = ExtraTreeClassifier(random_state = seed)
            selector = SelectFromModel(estimator, max_features = max_tokens)
            selector = selector.fit(X, y)

    support_idx = selector.get_support(True)
    tokenizer2 = Tokenizer()
    tokenizer2.fit_on_texts([vocab[i-1].replace("'","") for i in support_idx])
    new_vocab = list(tokenizer2.word_index.keys())

    return new_vocab

# Feature Selection

In [9]:
file = 'TFIDF'
result_time = datetime.datetime.now()
result_name = result_time.strftime("%Y-%m-%d-%H-%M-%S")
batch_name = f'CML_tfidf_results_{result_name}'

Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

svm_f1_micro_scores = []
svm_f1_macro_scores = []
knn1_f1_micro_scores = []
knn1_f1_macro_scores = []
knn5_f1_micro_scores = []
knn5_f1_macro_scores = []
nb_f1_micro_scores = []
nb_f1_macro_scores = []
rf_f1_micro_scores = []
rf_f1_macro_scores = []

max_tokens = 600

for _,feature in enumerate(feature_list):
    for _,disease in enumerate(disease_list):
        disease_data_df = all_df[all_df['disease'] == disease]

        X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
        
        if feature != 'All':
            vocab = getVocab(X_train,y_train, feature, max_tokens)
            Tfidf_vect = TfidfVectorizer(max_features=max_tokens,vocabulary = vocab)
        else:
            Tfidf_vect = TfidfVectorizer(max_features=max_tokens)
  
        
        X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
        X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
        X_training = np.asarray(X_training, dtype=float)
        X_training = torch.from_numpy(X_training).to(device)

        X_test_values_list = Tfidf_vect.transform(X_test).toarray()
        X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
        X_testing = np.asarray(X_testing, dtype=float)
        X_testing = torch.from_numpy(X_testing).to(device)
        
        #tokens_to_use = X_training.shape[1]

        Train_Y  = Encoder.fit_transform(y_train)
        Test_Y  = Encoder.fit_transform(y_test)

        svm_f1_macro, svm_f1_micro = performSVM(X_training, X_testing, Train_Y, Test_Y, feature, disease)
        svm_f1_macro_scores.append(svm_f1_macro)
        svm_f1_micro_scores.append(svm_f1_micro)

        knn_f1_macro1, knn_f1_micro1, knn_f1_macro5, knn_f1_micro5 = performKNN(X_training, X_testing, Train_Y, Test_Y, feature, disease)
        knn1_f1_macro_scores.append(knn_f1_macro1)
        knn1_f1_micro_scores.append(knn_f1_micro1)
        knn5_f1_macro_scores.append(knn_f1_macro5)
        knn5_f1_micro_scores.append(knn_f1_micro5)

        nb_f1_macro, nb_f1_micro = performNB(X_training, X_testing, Train_Y, Test_Y, feature, disease)
        nb_f1_macro_scores.append(nb_f1_macro)
        nb_f1_micro_scores.append(nb_f1_micro)

        rf_f1_macro, rf_f1_micro = performRF(X_training, X_testing, Train_Y, Test_Y, feature, disease)
        rf_f1_macro_scores.append(rf_f1_macro)
        rf_f1_micro_scores.append(rf_f1_micro)
        
    print("Average SVM - ", feature, ": f1-micro", mean(svm_f1_micro_scores))
    print("Average SVM - ", feature, ": f1-macro", mean(svm_f1_macro_scores))
    
    print("Average KNN 1 - ", feature, ": f1-micro", mean(knn1_f1_micro_scores))
    print("Average KNN 1 - ", feature, ": f1-macro", mean(knn1_f1_macro_scores))
    print("Average KNN 5 - ", feature, ": f1-micro", mean(knn5_f1_micro_scores))
    print("Average KNN 5 - ", feature, ": f1-macro", mean(knn5_f1_macro_scores))
    
    print("Average NB - ", feature, ": f1-micro", mean(nb_f1_micro_scores))
    print("Average NB - ", feature, ": f1-macro", mean(nb_f1_macro_scores))
    
    print("Average RF - ", feature, ": f1-micro", mean(rf_f1_micro_scores))
    print("Average RF - ", feature, ": f1-macro", mean(rf_f1_macro_scores))
        

KNN 1 k -  All Asthma : f1-macro 0.5623043623043623
KNN 1 k -  All Asthma : f1-micro 0.8537735849056604
KNN 5 k -  All Asthma : f1-macro 0.5846627483907081
KNN 5 k -  All Asthma : f1-micro 0.9009433962264151
KNN 1 k -  All CHF : f1-macro 0.6410891089108911
KNN 1 k -  All CHF : f1-micro 0.696551724137931
KNN 5 k -  All CHF : f1-macro 0.5880681818181819
KNN 5 k -  All CHF : f1-micro 0.7103448275862069
KNN 1 k -  All Depression : f1-macro 0.5882870838360452
KNN 1 k -  All Depression : f1-micro 0.7242990654205608
KNN 5 k -  All Depression : f1-macro 0.5879792399129417
KNN 5 k -  All Depression : f1-micro 0.7850467289719625
KNN 1 k -  All Diabetes : f1-macro 0.6443392425463335
KNN 1 k -  All Diabetes : f1-micro 0.6915887850467289
KNN 5 k -  All Diabetes : f1-macro 0.597922192749779
KNN 5 k -  All Diabetes : f1-micro 0.6822429906542056
KNN 1 k -  All Gallstones : f1-macro 0.53125
KNN 1 k -  All Gallstones : f1-micro 0.7
KNN 5 k -  All Gallstones : f1-macro 0.5009829124451837
KNN 5 k -  All G

StatisticsError: mean requires at least one data point

# Word Embeddings

In [19]:
file = 'Embeddings'
result_time = datetime.datetime.now()
result_name = result_time.strftime("%Y-%m-%d-%H-%M-%S")
batch_name = f'CML_tfidf_results_{result_name}'

Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer(max_features=600)
Tfidf_vect_NS = TfidfVectorizer(max_features = 600, stop_words = cachedStopWords)

svm_f1_micro_scores = []
svm_f1_macro_scores = []
knn1_f1_micro_scores = []
knn1_f1_macro_scores = []
knn5_f1_micro_scores = []
knn5_f1_macro_scores = []
nb_f1_micro_scores = []
nb_f1_macro_scores = []
rf_f1_micro_scores = []
rf_f1_macro_scores = []

max_tokens = 600

embedding_list = ['FastText']

for _, embedding in enumerate(embedding_list):
    for _,disease in enumerate(disease_list):
        disease_data_df = all_df_expanded [all_df_expanded ['disease'] == disease]

        if embedding == 'GloVe':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['vector_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_GloVe(X_train)
            X_test = vectorize_batch_GloVe(X_test)
        if embedding == 'FastText':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['vector_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_FastText(X_train)
            X_test = vectorize_batch_FastText(X_test)
        if embedding == 'USE':
            X_train, X_test, y_train, y_test = train_test_split(disease_data_df['sentence_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
            X_train = vectorize_batch_USE(X_train)
            X_test = vectorize_batch_USE(X_test)

        Train_Y  = Encoder.fit_transform(y_train)
        Test_Y  = Encoder.fit_transform(y_test)

        svm_f1_macro, svm_f1_micro = performSVM(X_train, X_test, Train_Y, Test_Y, embedding, disease)
        svm_f1_macro_scores.append(svm_f1_macro)
        svm_f1_micro_scores.append(svm_f1_micro)

        knn_f1_macro1, knn_f1_micro1, knn_f1_macro5, knn_f1_micro5 = performKNN(X_train, X_test, Train_Y, Test_Y, embedding, disease)
        knn1_f1_macro_scores.append(knn_f1_macro1)
        knn1_f1_micro_scores.append(knn_f1_micro1)
        knn5_f1_macro_scores.append(knn_f1_macro5)
        knn5_f1_micro_scores.append(knn_f1_micro5)

        nb_f1_macro, nb_f1_micro = performNB(X_train, X_test, Train_Y, Test_Y, embedding, disease)
        nb_f1_macro_scores.append(nb_f1_macro)
        nb_f1_micro_scores.append(nb_f1_micro)

        rf_f1_macro, rf_f1_micro = performRF(X_train, X_test, Train_Y, Test_Y, embedding, disease)
        rf_f1_macro_scores.append(rf_f1_macro)
        rf_f1_micro_scores.append(rf_f1_micro)
        
    print("Average SVM - ", embedding, ": f1-micro", mean(svm_f1_micro_scores))
    print("Average SVM - ", embedding, ": f1-macro", mean(svm_f1_macro_scores))

    print("Average KNN 1 - ", embedding, ": f1-micro", mean(knn1_f1_micro_scores))
    print("Average KNN 1 - ", embedding, ": f1-macro", mean(knn1_f1_macro_scores))
    print("Average KNN 5 - ", embedding, ": f1-micro", mean(knn5_f1_micro_scores))
    print("Average KNN 5 - ", embedding, ": f1-macro", mean(knn5_f1_macro_scores))

    print("Average NB - ", embedding, ": f1-micro", mean(nb_f1_micro_scores))
    print("Average NB - ", embedding, ": f1-macro", mean(nb_f1_macro_scores))

    print("Average RF - ", embedding, ": f1-micro", mean(rf_f1_micro_scores))
    print("Average RF - ", embedding, ": f1-macro", mean(rf_f1_macro_scores))

KNN 1 k -  FastText Asthma : f1-macro 0.45945945945945943
KNN 1 k -  FastText Asthma : f1-micro 0.85
KNN 5 k -  FastText Asthma : f1-macro 0.45945945945945943
KNN 5 k -  FastText Asthma : f1-micro 0.85
KNN 1 k -  FastText CHF : f1-macro 0.41724137931034483
KNN 1 k -  FastText CHF : f1-micro 0.7159763313609467
KNN 5 k -  FastText CHF : f1-macro 0.41724137931034483
KNN 5 k -  FastText CHF : f1-micro 0.7159763313609467
KNN 1 k -  FastText Depression : f1-macro 0.1947194719471947
KNN 1 k -  FastText Depression : f1-micro 0.24180327868852458
KNN 5 k -  FastText Depression : f1-macro 0.4312354312354313
KNN 5 k -  FastText Depression : f1-micro 0.7581967213114754
KNN 1 k -  FastText Diabetes : f1-macro 0.40243902439024387
KNN 1 k -  FastText Diabetes : f1-micro 0.673469387755102
KNN 5 k -  FastText Diabetes : f1-macro 0.40243902439024387
KNN 5 k -  FastText Diabetes : f1-micro 0.673469387755102
KNN 1 k -  FastText Gallstones : f1-macro 0.12280701754385964
KNN 1 k -  FastText Gallstones : f1-m

StatisticsError: mean requires at least one data point