This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

**Ensemble Models**

![Ensemble Models](images\ensemble.gif)

In [1]:
import pickle
import torch
import numpy as np
import pandas as pd
import torchtext

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection, svm, naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error



In [2]:
seed = 24

MODELS_DIRECTORY = './final_models/'
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'

all_df_expanded = pd.read_pickle(DATA_PATH + '/all_df_expanded.pkl')

DISEASES_LIST = ['Asthma', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'Gout', 'Hypercholesterolemia', 'Hypertriglyceridemia', 'OA', 'OSA', 'Obesity', 'CAD', 'Hypertension', 'PVD', 'Venous Insufficiency', 'GERD']
#MODELS_LIST = ['DL_FastText', 'DL_GloVe', 'SVM_ExtraTreeClassifier', 'RF_SelectKBest', 'KNN1_FastText']
MODELS_LIST = ['SVM_ExtraTreeClassifier', 'RF_SelectKBest', 'KNN1_FastText']

device = torch.device('cpu')

In [3]:
for index, entry in enumerate(all_df_expanded['tok_lem_text']):
    Final_words = []
    for word in entry:
        Final_words.append(word)
    all_df_expanded.loc[index, 'text_final'] = str(Final_words)

In [4]:
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif
from statistics import mean
from sklearn.preprocessing import MinMaxScaler

def getVocab(X_train, y_train, feature, max_tokens):
 
    ## Step 1: Determine the Initial Vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab = list(tokenizer.word_index.keys())

    ## Step 2: Create term  matrix
    vectors = tokenizer.texts_to_matrix(X_train, mode='count')

    ## Do feature selection on term matrix (column headers are words)
    X = vectors
    y = y_train

    ##Choose algorithm
    if feature == 'SelectKBest':
        selector = SelectKBest(score_func=f_classif, k=max_tokens).fit(X,y)
    else: 
        if feature == 'InfoGainAttributeVal':
            #This should be similar to the InfoGain?
            selector = SelectKBest(score_func=mutual_info_classif, k=max_tokens).fit(X,y)
        else:
            #default to ExtraTreeClassifier
            estimator = ExtraTreeClassifier(random_state = seed)
            selector = SelectFromModel(estimator, max_features = max_tokens)
            selector = selector.fit(X, y)

    support_idx = selector.get_support(True)
    tokenizer2 = Tokenizer()
    tokenizer2.fit_on_texts([vocab[i-1].replace("'","") for i in support_idx])
    new_vocab = list(tokenizer2.word_index.keys())

    return new_vocab

In [5]:
def vectorize_batch_FastText(X_Train):
    embedding_size_used = 300
    vec = torchtext.vocab.FastText()
    
    X =  np.zeros((X_Train.shape[0], embedding_size_used * len(X_Train.iloc[0])))
    
    for i in range(len(X_Train)):
        vectors = vec.get_vecs_by_tokens(X_Train.iloc[i]).float().numpy()
        
        X[i,:] = vectors.flatten()
        
    return X

In [6]:
def performSVM(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test):

    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(Train_X_Tfidf, y_train)

    predictions_SVM = SVM.predict(Test_X_Tfidf)

    return predictions_SVM

In [7]:
def performKNN(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test):

    knn = KNeighborsClassifier(n_neighbors=1)
    clf = knn.fit(Train_X_Tfidf, y_train)
    
    predictions_KNN = clf.predict(Test_X_Tfidf)

    return predictions_KNN

In [8]:
def performRF(Train_X_Tfidf, Test_X_Tfidf, y_train, y_test):
    
    classifier=RandomForestClassifier(n_estimators =400,criterion="entropy",random_state =0)
    classifier.fit(Train_X_Tfidf,y_train)

    # predict the labels on validation dataset
    predictions_RF = classifier.predict(Test_X_Tfidf)

    return predictions_RF

In [12]:
MAX_TOKENS = 600
Encoder = LabelEncoder()

for _, disease in enumerate(DISEASES_LIST):
    disease_data_df = all_df_expanded[all_df_expanded['disease'] == disease]
    
    model_predictions = []
    
    for _, model in enumerate(MODELS_LIST):
        
        if model[0:3] == 'DL_':
            print("Do DL data preprocessing")
            
            #model_path = MODELS_DIRECTORY + model + '_' + disease + '.pkl'
        
            #loaded_model = torch.load(model_path)

            #pred = loaded_model.predict(X_testing)

            #model_predictions.append(pred)
            
            #X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
        
        else:
            
            if model[0:3] == 'SVM':
                feature = 'ExtraTreeClassifier'
                
                X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
                
                vocab = getVocab(X_train,y_train, feature, MAX_TOKENS)
                Tfidf_vect = TfidfVectorizer(max_features=MAX_TOKENS,vocabulary = vocab)

                X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
                X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_training = np.asarray(X_training, dtype=float)
                X_training = torch.from_numpy(X_training).to(device)

                X_test_values_list = Tfidf_vect.transform(X_test).toarray()
                X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_testing = np.asarray(X_testing, dtype=float)
                X_testing = torch.from_numpy(X_testing).to(device)

                Train_Y  = Encoder.fit_transform(y_train)
                Test_Y  = Encoder.fit_transform(y_test)
                
                pred = performSVM(X_training, X_testing, Train_Y, Test_Y)
                
                model_predictions.append(pred)
                
            elif model[0:2] == 'RF':
                feature = 'SelectKBest'
                
                X_train, X_test, y_train, y_test = train_test_split(disease_data_df['text_final'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
                
                vocab = getVocab(X_train,y_train, feature, MAX_TOKENS)
                Tfidf_vect = TfidfVectorizer(max_features=MAX_TOKENS,vocabulary = vocab)

                X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
                X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_training = np.asarray(X_training, dtype=float)
                X_training = torch.from_numpy(X_training).to(device)

                X_test_values_list = Tfidf_vect.transform(X_test).toarray()
                X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_testing = np.asarray(X_testing, dtype=float)
                X_testing = torch.from_numpy(X_testing).to(device)

                Train_Y  = Encoder.fit_transform(y_train)
                Test_Y  = Encoder.fit_transform(y_test)
                
                pred = performRF(X_training, X_testing, Train_Y, Test_Y)
                
                model_predictions.append(pred)
                
            else:
                feature = 'FastText'
                
                X_train, X_test, y_train, y_test = train_test_split(disease_data_df['vector_tokenized'], disease_data_df['judgment'], test_size=0.20, shuffle=True)
                
                X_train = vectorize_batch_FastText(X_train)
                X_test = vectorize_batch_FastText(X_test)
                
                Train_Y  = Encoder.fit_transform(y_train)
                Test_Y  = Encoder.fit_transform(y_test)
                
                pred = performKNN(X_train, X_test, Train_Y, Test_Y)
                
                model_predictions.append(pred)
        
    pred_final = (model_predictions[0] + model_predictions[1] + model_predictions[2])/3.0
    
    print(disease)
    print(mean_squared_error(Test_Y, pred_final))
    
    #model_1 = torch.load(MODELS_DIRECTORY + 'SVM_ExtraTreeClassifier' + '_' + disease + '.pkl')
    #model_2 = torch.load(MODELS_DIRECTORY + 'RF_SelectKBest' + '_' + disease + '.pkl')
        
    #final_model = VotingClassifier(estimators=[('svm', model_1), ('rf', model_3)], voting='hard')
        
        

  f = msb / msw


Asthma
0.1263888888888889


  f = msb / msw


CHF
0.26561472715318873


  f = msb / msw


Depression
0.23679417122040075


  f = msb / msw


Diabetes
0.2657596371882086


  f = msb / msw


Gallstones
0.23111111111111116


  f = msb / msw


Gout
0.1593625498007968


  f = msb / msw


Hypercholesterolemia
0.29375951293759517


  f = msb / msw


Hypertriglyceridemia
0.07588075880758807


  f = msb / msw


OA
0.22733612273361228


  f = msb / msw


OSA
0.144


  f = msb / msw


Obesity
0.3055555555555557


  f = msb / msw


CAD
0.3272896327289633


  f = msb / msw


Hypertension
0.1847163619315518


  f = msb / msw


PVD
0.1623931623931624


  f = msb / msw


Venous Insufficiency
0.10905349794238683


  f = msb / msw


GERD
0.25320512820512825
