This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import torchtext

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection, svm, naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
import tensorflow_hub as hub
import datetime
import time
import os



In [2]:
seed = 24
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

MODELS_DIRECTORY = './final_models/'
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'

all_df_expanded = pd.read_pickle(DATA_PATH + '/all_df_expanded.pkl')

DISEASES_LIST = ['Asthma', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'Gout', 'Hypercholesterolemia', 'Hypertriglyceridemia', 'OA', 'OSA', 'Obesity', 'CAD', 'Hypertension', 'PVD', 'Venous Insufficiency', 'GERD']
#MODELS_LIST = ['DL_FastText', 'DL_GloVe', 'SVM_ExtraTreeClassifier', 'RF_SelectKBest', 'KNN1_USE']
MODELS_LIST = ['DL_USE', 'DL_GloVe', 'SVM_ExtraTreeClassifier', 'RF_SelectKBest', 'KNN1_FastText']

# SVM_InfoGain, TF_InfoGain, SVM_USE, DL_OPENAI, DL_USE

device = torch.device('cpu')

#This is created in the embeddings file
#max_tokens = 1416
#max_sentences = 380
(max_tokens, max_sentences) = torch.load(DATA_PATH + '/counts.obj')
max_sentences_aoai = 381
oai_col = 'bab_v1_sent'


word_embedding_size = 300
sentence_embedding_size = 512
aoai_embedding_size = 2048
fasttext_embeddings = torchtext.vocab.FastText()
glove_embeddings = torchtext.vocab.GloVe(name='6B', dim=word_embedding_size)    

In [3]:
result_cols = ['Batch','Disease', 'F1_MACRO', 'F1_MICRO', 'Total Run (secs)']

result_time = datetime.datetime.now()
result_name = result_time.strftime("%Y-%m-%d-%H-%M-%S")
batch_name = f'Ensemble_results_{result_name}'


def write_to_file(batch_name, disease,f1_macro,f1_micro,runtime_sec):
    #Pass TFIDF or Embeddings
    
    results_file = f'{RESULTS_PATH}Ensemble_results.csv'

    if os.path.exists(results_file):
        results = pd.read_csv(results_file)
    else:
        results = pd.DataFrame(columns=result_cols)

    result = pd.DataFrame(columns=result_cols,data=[[batch_name, disease,f1_macro,f1_micro,runtime_sec]])
    results = pd.concat([results,result])

    #Save results - overwrite so we can see progress
    results.to_csv(results_file, index=False)

In [4]:
for index, entry in enumerate(all_df_expanded['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df_expanded.loc[index, 'text_final'] = str(Final_words)

In [5]:
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif
from statistics import mean
from sklearn.preprocessing import MinMaxScaler

def getVocab(X_train, y_train, feature, max_tokens):
 
    ## Step 1: Determine the Initial Vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab = list(tokenizer.word_index.keys())

    ## Step 2: Create term  matrix
    vectors = tokenizer.texts_to_matrix(X_train, mode='count')

    ## Do feature selection on term matrix (column headers are words)
    X = vectors
    y = y_train

    ##Choose algorithm
    if feature == 'SelectKBest':
        selector = SelectKBest(score_func=f_classif, k=max_tokens).fit(X,y)
    else: 
        if feature == 'InfoGainAttributeVal':
            #This should be similar to the InfoGain?
            selector = SelectKBest(score_func=mutual_info_classif, k=max_tokens).fit(X,y)
        else:
            #default to ExtraTreeClassifier
            estimator = ExtraTreeClassifier(random_state = seed)
            selector = SelectFromModel(estimator, max_features = max_tokens)
            selector = selector.fit(X, y)

    support_idx = selector.get_support(True)
    tokenizer2 = Tokenizer()
    tokenizer2.fit_on_texts([vocab[i-1].replace("'","") for i in support_idx])
    new_vocab = list(tokenizer2.word_index.keys())

    return new_vocab

In [6]:
def vectorize_batch_FastText(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.FastText()
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()
        
        X[i,:] = vectors.flatten()
        
    return X

In [7]:
def vectorize_batch_USE(X_Train):
    embedding_size_used = 512
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        tensor_flow_vectors = embed(X_Train.iloc[i])
        array_vectors = tensor_flow_vectors.numpy()
        
        X[i,:] = array_vectors.flatten()
        
    return X

In [8]:
def performSVM(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test):

    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf, y_train)

    predictions_SVM = SVM.predict(Test_X_Tfidf)

    return predictions_SVM

In [9]:
def performKNN(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test):

    knn = KNeighborsClassifier(n_neighbors=1)
    clf = knn.fit(Train_X_Tfidf, y_train)
    
    predictions_KNN = clf.predict(Test_X_Tfidf)

    return predictions_KNN

In [10]:
def performRF(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test):
    
    classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
    classifier.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_RF = classifier.predict(Test_X_Tfidf)

    return predictions_RF

In [11]:
voc = torch.load(DATA_PATH + '/voc.obj')

def vectorize_DL_batch_words(batch):
    embedding_size_used = 300
 
    Xi = batch.iloc[0]
    batch_size = len(batch)

    X = torch.zeros(batch_size, len(Xi), dtype=torch.long)
    
    for i in range(len(batch)):
        x = batch.iloc[i]
        vectors = voc.lookup_indices(x)

        X[i] = torch.tensor(vectors).long()

    return X

def vectorize_DL_batch_USE(batch):
    embedding_size_used = 512

    Xi = batch.iloc[0]
    batch_size = len(batch)

    X = torch.zeros(batch_size, len(Xi), embedding_size_used, dtype=torch.float)

    for i in range(len(batch)):
        x = batch.iloc[i]
        
        tensor_flow_vectors = embed(x)
        array_vectors = tensor_flow_vectors.numpy()

        X[i] = torch.tensor(array_vectors).float()

    return X


In [12]:
matrix_len = len(voc)
glove_weights_matrix = np.zeros((matrix_len, word_embedding_size))
fasttext_weights_matrix = np.zeros((matrix_len, word_embedding_size))

#GloVe
for i in range(0,matrix_len-1):
    word = voc.lookup_token(i)
    try: 
        glove_weights_matrix[i] = glove_embeddings.get_vecs_by_tokens(word)
    except KeyError:
        glove_weights_matrix[i] = np.random.normal(scale=0.6, size=(word_embedding_size, ))
#FastText
for i in range(0,matrix_len-1):
    word = voc.lookup_token(i)
    try: 
        fasttext_weights_matrix[i] = fasttext_embeddings.get_vecs_by_tokens(word)
    except KeyError:
        fasttext_weights_matrix[i] = np.random.normal(scale=0.6, size=(word_embedding_size, ))

In [13]:
class ClincalNoteEmbeddingNet(nn.Module):
    def __init__(self, embedding_type, max_tokens, dropout, hidden_size):
        super(ClincalNoteEmbeddingNet, self).__init__()
        
        self.max_tokens = max_tokens
        self.dropout = dropout
        self.hidden_size = hidden_size

        if(embedding_type == 'USE'):
            self.embedding_dimension = sentence_embedding_size
            self.em = None
        else:
            if embedding_type == 'AOAI':
                self.embedding_dimension = aoai_embedding_size
                self.em = None                                        
            else:
                self.embedding_dimension = word_embedding_size
                if(embedding_type == 'GloVe'):
                    self.em = nn.Embedding.from_pretrained(torch.tensor(glove_weights_matrix).float(), freeze=False)
                else:
                    self.em = nn.Embedding.from_pretrained(torch.tensor(fasttext_weights_matrix).float(), freeze=False)

        self.hidden_dim1 = self.hidden_size
        self.hidden_dim2 = int(self.hidden_size/2)
        self.num_layers = 1

        #Because it is bidirectional, the output from LTSM is coming in twice the size of the hidden states required.
        #input is (batch, #of tokens * embedding_dimension)
        self.bilstm1 = nn.LSTM(input_size = self.embedding_dimension, hidden_size = int(self.hidden_dim1/2), bidirectional = True,  
                               batch_first = True, num_layers = self.num_layers) 
        
        self.bilstm2 = nn.LSTM(input_size = self.hidden_dim1, hidden_size = int(self.hidden_dim2/2), bidirectional = True,  
                               batch_first = True, num_layers=self.num_layers)

        self.do = nn.Dropout(self.dropout)
        self.flatten = nn.Flatten()

        self.fc1 = nn.Linear(self.hidden_dim2 * self.max_tokens, 2)


    def forward(self, x):
        #using an embedding layer instead of just vectors
        if self.em is not None:
            x = self.em(x)  

        x, states = self.bilstm1(x)
        x, states = self.bilstm2(x)

        x = self.flatten(x)
        x = self.do(x)
        x = self.fc1(x)

        return x 


In [14]:
MAX_TOKENS = 600
Encoder = LabelEncoder()

for _, disease in enumerate(DISEASES_LIST):
    disease_data_df = all_df_expanded[all_df_expanded['disease'] == disease]
    
    model_predictions = []
    
    X_train, X_test, y_train, y_test = train_test_split(disease_data_df, disease_data_df['judgment'], test_size=0.20, shuffle=True)
    
    #X_train = X_train['text_final']
    #X_test = X_test['text_final']
    Train_Y  = Encoder.fit_transform(y_train)
    Test_Y  = Encoder.fit_transform(y_test)
    
    for _, model in enumerate(MODELS_LIST):
        start_time = time.time()

        print(f"Evaluating model {model}")        
        if model[0:3] == 'DL_':
            print("Do DL data preprocessing")

            embedding = model.split("_")[1]
            
            model_path = MODELS_DIRECTORY + model + '_' + disease + '.pkl'

            if embedding == 'GloVe' or embedding == 'FastText':
                print('Word', max_tokens)
                X_testing = vectorize_DL_batch_words(X_test['vector_tokenized'].copy())
                model_tokens = max_tokens
            if embedding == 'USE':
                print('Sentence', max_sentences)
                X_testing = vectorize_DL_batch_USE(X_test['sentence_tokenized'].copy())
                model_tokens = max_sentences

            cn_model = ClincalNoteEmbeddingNet(embedding, max_tokens = model_tokens, dropout = .1, hidden_size = 128)
            cn_model = cn_model.to(device)

            cn_model.load_state_dict(torch.load(model_path))

            device = device or torch.device('cpu')
            cn_model.eval()

            pred = cn_model(X_testing)
            pred = np.argmax(pred.detach().to(device),axis=1)
                
            model_predictions.append(pred)

            del cn_model

        else:
            if model[0:3] == 'SVM':
                #feature = 'ExtraTreeClassifier'
                feature = 'InfoGainAttributeVal'
                
                #X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
                X_training = X_train['text_final']
                X_testing = X_test['text_final']
                
                vocab = getVocab(X_training,y_train, feature, MAX_TOKENS)
                Tfidf_vect = TfidfVectorizer(max_features=MAX_TOKENS,vocabulary = vocab)

                X_train_values_list = Tfidf_vect.fit_transform(X_training).toarray()
                X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_training = np.asarray(X_training, dtype=float)
                X_training = torch.from_numpy(X_training).to(device)

                X_test_values_list = Tfidf_vect.transform(X_testing).toarray()
                X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_testing = np.asarray(X_testing, dtype=float)
                X_testing = torch.from_numpy(X_testing).to(device)
                
                pred = performSVM(X_training, X_testing, Train_Y, Test_Y)
                
                model_predictions.append(pred)
                
            elif model[0:2] == 'RF':
                #feature = 'SelectKBest'
                feature = 'InfoGainAttributeVal'
                
                #X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
                X_training = X_train['text_final']
                X_testing = X_test['text_final']
                
                vocab = getVocab(X_training,y_train, feature, MAX_TOKENS)
                Tfidf_vect = TfidfVectorizer(max_features=MAX_TOKENS,vocabulary = vocab)

                X_train_values_list = Tfidf_vect.fit_transform(X_training).toarray()
                X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_training = np.asarray(X_training, dtype=float)
                X_training = torch.from_numpy(X_training).to(device)

                X_test_values_list = Tfidf_vect.transform(X_testing).toarray()
                X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_testing = np.asarray(X_testing, dtype=float)
                X_testing = torch.from_numpy(X_testing).to(device)
                
                pred = performRF(X_training, X_testing, Train_Y, Test_Y)

                model_predictions.append(pred)
                
            else:
                X_training = X_train['sentence_tokenized']
                X_testing = X_test['sentence_tokenized']
                
                X_training = vectorize_batch_USE(X_training)
                X_testing = vectorize_batch_USE(X_testing)
                
                pred = performKNN(X_training, X_testing, Train_Y, Test_Y)
                
                model_predictions.append(pred)
        
    pred_final = (model_predictions[0] + model_predictions[1] + model_predictions[2]+ model_predictions[3]+ model_predictions[4])/5.0
    
    pred_final = pred_final > 0.5
    
    f1_macro = f1_score(Test_Y, pred_final,average='macro')
    f1_micro = f1_score(Test_Y, pred_final,average='micro')
    
    print("Ensemble - ", disease, ": f1-macro", f1_macro)
    print("Ensemble - ", disease, ": f1-micro", f1_micro)
    #print(mean_squared_error(Test_Y, pred_final))
    end_time = time.time()
    runtime_secs = end_time-start_time

    write_to_file(batch_name, disease, f1_macro,f1_micro, runtime_secs)
    
    #model_1 = torch.load(MODELS_DIRECTORY + 'SVM_ExtraTreeClassifier' + '_' + disease + '.pkl')
    #model_2 = torch.load(MODELS_DIRECTORY + 'RF_SelectKBest' + '_' + disease + '.pkl')
        
    #final_model = VotingClassifier(estimators=[('svm', model_1), ('rf', model_3)], voting='hard')
        
        

Evaluating model DL_USE
Do DL data preprocessing
Sentence 380
Evaluating model DL_GloVe
Do DL data preprocessing
Word 1416
Evaluating model SVM_ExtraTreeClassifier
Evaluating model RF_SelectKBest
Evaluating model KNN1_FastText
Ensemble -  Asthma : f1-macro 0.8945981554677206
Ensemble -  Asthma : f1-micro 0.9500000000000001
Evaluating model DL_USE
Do DL data preprocessing
Sentence 380
Evaluating model DL_GloVe
Do DL data preprocessing
Word 1416
Evaluating model SVM_ExtraTreeClassifier
Evaluating model RF_SelectKBest
Evaluating model KNN1_FastText
Ensemble -  CHF : f1-macro 0.8506858870030226
Ensemble -  CHF : f1-micro 0.8875739644970414
Evaluating model DL_USE
Do DL data preprocessing
Sentence 380
Evaluating model DL_GloVe
Do DL data preprocessing
Word 1416
Evaluating model SVM_ExtraTreeClassifier
Evaluating model RF_SelectKBest
Evaluating model KNN1_FastText
Ensemble -  Depression : f1-macro 0.9090778800126122
Ensemble -  Depression : f1-micro 0.9467213114754098
Evaluating model DL_USE