This notebook was created to support the data preparation required to support our CS 598 DLH project.  The paper we have chosen for the reproducibility project is:
***Ensembling Classical Machine Learning and Deep Learning Approaches for Morbidity Identification from Clinical Notes ***



 

The data cannot be shared publicly due to the agreements required to obtain the data so we are storing the data locally and not putting in GitHub.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime
from datetime import timedelta
from tqdm import tqdm
import torchtext
from torch.utils.data import SubsetRandomSampler
from sklearn.model_selection import KFold,train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
# define data path
DATA_PATH = './obesity_data/'
RESULTS_PATH = './results/'
MODELS_PATH = './models/'

if os.path.exists(RESULTS_PATH) == False:
    os.mkdir(RESULTS_PATH)
if os.path.exists(MODELS_PATH) == False:
    os.mkdir(MODELS_PATH)


#test_df = pd.read_pickle(DATA_PATH + '/test.pkl') 
#train_df = pd.read_pickle(DATA_PATH + '/train.pkl') 
#corpus = pd.read_pickle(DATA_PATH + '/corpus.pkl')
all_df = pd.read_pickle(DATA_PATH + '/all_df.pkl') 
all_df_expanded = pd.read_pickle(DATA_PATH + '/all_df_expanded.pkl')

#Get this from the create embeddings file
max_tokens = 1416

disease_list = all_df['disease'].unique().tolist()
embedding_list = ['GloVe', "FastText"]
result_cols = ['Batch','Disease','Embedding','AUROC','F1','F1_MACRO', 'F1_MICRO', 'Exec Time', 'Total Run (secs)','Epochs', 'LR', 'CV']

***Common training and evaluation code***

In [2]:
eps=1e-10

def train_model(tmodel, train_dataloader, n_epoch=5, lr=0.003, device=None, model_name='unk', use_decay=False):
    import torch.optim as optim
    
    device = device or torch.device('cpu')
    tmodel.train()

    loss_history = []

    # your code here
    optimizer = optim.Adam(tmodel.parameters(), lr=lr)
    # want to decay the learning rate as teh number of epochs get larger
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma = 0.1)
    if use_decay:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
            factor=0.1, patience=10, threshold=0.0001, threshold_mode='abs')

    #loss_func = nn.BCELoss()
    loss_func = nn.CrossEntropyLoss()
    #loss_func = nn.NLLLoss()

    for epoch in range(n_epoch):
        epoch = epoch+1
        curr_epoch_loss = []
        start = time.time()
        for X, Y in tqdm(train_dataloader,desc=f"Training {model_name}-Lr{str(lr)}-Epoch{epoch}..."):
            # your code here
            optimizer.zero_grad()

            y_hat = tmodel(X)

            loss = loss_func(y_hat, Y)
            #loss = loss_func(torch.log(y_hat+ eps), Y)
            
            loss.backward()
            optimizer.step()
            if use_decay:
                scheduler.step(loss)
            
            curr_epoch_loss.append(loss.cpu().data.numpy())


        end = time.time()
        if epoch % 10 == 0:
            print(f"epoch{epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)},execution_time={str(datetime.timedelta(seconds = (end-start)))},lr={optimizer.param_groups[0]['lr']}")

        #scheduler.step()
        loss_history += curr_epoch_loss
    return tmodel, loss_history

def eval_model(emodel, dataloader, device=None, model_name='unk'):
    """
    :return:
        pred_all: prediction of model on the dataloder.
        Y_test: truth labels. Should be an numpy array of ints
    TODO:
        evaluate the model using on the data in the dataloder.
        Add all the prediction and truth to the corresponding list
        Convert pred_all and Y_test to numpy arrays 
    """
    device = device or torch.device('cpu')
    emodel.eval()
    pred_all = []
    Y_test = []
    for X, Y in tqdm(dataloader, desc=f"Evaluating {model_name}..."):
        # your code here
        y_hat = emodel(X)
        
        pred_all.append(y_hat.detach().to('cpu'))
        Y_test.append(Y.detach().to('cpu'))
        
    pred_all = np.concatenate(pred_all, axis=0)
    Y_test = np.concatenate(Y_test, axis=0)

    return pred_all, Y_test

In [3]:
def evaluate_predictions(truth, pred):
    """
    TODO: Evaluate the performance of the predictoin via AUROC, and F1 score
    each prediction in pred is a vector representing [p_0, p_1].
    When defining the scores we are interesed in detecting class 1 only
    (Hint: use roc_auc_score and f1_score from sklearn.metrics, be sure to read their documentation)
    return: auroc, f1
    """
    from sklearn.metrics import roc_auc_score, f1_score

    # your code here
    auroc = roc_auc_score(truth, pred[:,1])
    f1 = f1_score(truth, np.argmax(pred,axis=1))
    f1_macro = f1_score(truth, np.argmax(pred,axis=1),average='macro')
    f1_micro = f1_score(truth, np.argmax(pred,axis=1),average='micro')

    return auroc, f1, f1_macro, f1_micro

In [4]:
def trainAndEvaluate(train_loader, val_loader, model, model_desc, batch_name, results_file, disease, lr,  dataformat, embedding, device, n_epoch, cv, use_decay):
            
    return_val = False

    start_train = time.time()
    model, loss_history = train_model(model, train_loader, n_epoch=n_epoch, lr = lr, device=device, model_name=model_desc, use_decay=use_decay)
    end_train = time.time()

    try:
        #Evaluate model
        start_eval = time.time()
        pred, truth = eval_model(model, val_loader, device=device, model_name=model_desc)
        end_eval = time.time()

        auroc, f1, f1_macro, f1_micro = evaluate_predictions(truth, pred)
        runtime = f"Trn,Eval,Ttl={str(datetime.timedelta(seconds = (end_train-start_train)))},{str(datetime.timedelta(seconds = (end_eval-start_eval)))},{str(datetime.timedelta(seconds = (end_eval-start_train)))}"
        runtime_sec = end_eval-start_train

        return_val = True

    except:
        auroc = -1
        f1=-1
        f1_macro = -1
        f1_micro = -1
        runtime_sec = end_train-start_train
        runtime = 'Failure'
        print("Failure!")


    #Append to results
    if os.path.exists(results_file):
        results = pd.read_csv(results_file)
    else:
        results = pd.DataFrame(columns=result_cols)

    result = pd.DataFrame(columns=result_cols,data=[[batch_name, disease,embedding,auroc,f1,f1_macro,f1_micro,runtime,runtime_sec,n_epoch,lr,str(cv)]])
    results = pd.concat([results,result])

    #Save results - overwrite so we can see progress
    results.to_csv(results_file, index=False)

    return return_val

*** DL TF-IDF ***

In [5]:
for index, entry in enumerate(all_df['tok_lem_text']):
    Final_words = []
    #print(entry)
    for word in entry:
        #print(word)
        Final_words.append(word)
    all_df.loc[index, 'text_final'] = str(Final_words)



In [6]:
class TDFClinicalNotesDataset(Dataset):
    def __init__(self, X_array, y):
        df = pd.DataFrame(index=y.index)
        
        df['tfidf_vector'] = [vector.tolist() for vector in X_array]
        
        self.tfidf_vector = df.tfidf_vector.tolist()
        self.targets = y.tolist()

    def __getitem__(self, i):
        return (self.tfidf_vector[i], self.targets[i])
    
    def __len__(self):
        return len(self.targets)

def collate_fn(batch):
    tfidf = torch.tensor([item[0] for item in batch]).float()
    target = torch.tensor([int(item[1]==True) for item in batch]).long()

    return tfidf, target        

In [7]:
class ClincalNoteTDFNet(nn.Module):
    def __init__(self, tokens):
        super(ClincalNoteTDFNet, self).__init__()
        
        self.tokens = tokens

        self.hidden_dim1 = 128
        self.hidden_dim2 = 64
        self.hidden_dim3 = 32
        self.num_layers = 1

        #Because it is bidirectional, the output from LTSM is coming in twice the size of the hidden states required.
        #input is (batch, #of tokens * embedding_dimension)
        self.bilstm1 = nn.LSTM(input_size = self.tokens, hidden_size = self.hidden_dim1, bidirectional = True, batch_first = True, num_layers = self.num_layers, bias = False) 
        self.bilstm2 = nn.LSTM(input_size = self.hidden_dim1 * 2, hidden_size = self.hidden_dim2, bidirectional = True, batch_first = True, num_layers=self.num_layers, bias = False)
 
        self.fc1 = nn.Linear(self.hidden_dim2 * 2, self.hidden_dim2)
        self.fc2 = nn.Linear(self.hidden_dim2, 2)

 
    def forward(self, x):

        x, states = self.bilstm1(x)
        x, states = self.bilstm2(x)
        
        x = self.fc1(x)

        x = self.fc2(x)


        return x 


In [50]:
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV, RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif

def getVocab(X_train, y_train, feature, max_tokens):
 
    ## Step 1: Determine the Initial Vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    vocab = list(tokenizer.word_index.keys())

    ## Step 2: Create term  matrix
    vectors = tokenizer.texts_to_matrix(X_train, mode='count')

    ## Do feature selection on term matrix (column headers are words)
    X = vectors
    y = y_train

    ##Choose algorithm
    if feature == 'SelectKBest':
        selector = SelectKBest(score_func=f_classif, k=max_tokens).fit(X,y)
    else: 
        if feature == 'InfoGainAttributeVal':
            #This should be similar to the InfoGain?
            selector = SelectKBest(score_func=mutual_info_classif, k=max_tokens).fit(X,y)
        else:
            #default to ExtraTreeClassifier
            estimator = ExtraTreeClassifier(random_state = seed)
            #selector = SelectFromModel(estimator, max_features = tokens,threshold=-np.inf)
            selector = SelectFromModel(estimator, max_features = max_tokens)
            selector = selector.fit(X, y)

    support_idx = selector.get_support(True)
    
    #print("Vocab:", [vocab[i-1].replace("'","") for i in support_idx])
    tokenizer2 = Tokenizer()
    tokenizer2.fit_on_texts([vocab[i-1].replace("'","") for i in support_idx])
    new_vocab = list(tokenizer2.word_index.keys())

    return new_vocab



In [46]:
def iterateTrainAndEvaluateTFIDF(df, k, disease_list, feature_list, lr_list, 
                            batch_name, batch_size, results_file, dataformat, device, tokens, n_epoch, cv = False, use_decay=False):

    for _,disease in enumerate(disease_list):
        for _,feature in enumerate(feature_list):
            for _,lr in enumerate(lr_list):
                #Create a name for the model
                model_name = f"{disease}_{feature}_{batch_name}"

                disease_df = df[df['disease'] == disease].copy()

                X_train, X_test, y_train, y_test = train_test_split(disease_df[dataformat], disease_df['judgment'], test_size=0.2, random_state=seed)

                if feature != 'All':
                    vocab = getVocab(X_train,y_train, feature, tokens)
                    Tfidf_vect = TfidfVectorizer(max_features=tokens,vocabulary = vocab)
                else:
                    Tfidf_vect = TfidfVectorizer(max_features=tokens)

                X_train_values_list = Tfidf_vect.fit_transform(X_train).toarray()
                X_training = pd.DataFrame(X_train_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_training = np.asarray(X_training, dtype=float)
                X_training = torch.from_numpy(X_training).to(device)

                X_test_values_list = Tfidf_vect.transform(X_test).toarray()
                X_testing = pd.DataFrame(X_test_values_list, columns=Tfidf_vect.get_feature_names_out())
                X_testing = np.asarray(X_testing, dtype=float)
                X_testing = torch.from_numpy(X_testing).to(device)

                tokens_to_use = X_training.shape[1]
                print(tokens_to_use)

                #Create model
                model = ClincalNoteTDFNet(tokens = tokens_to_use)
                model = model.to(device)

                ds_train = TDFClinicalNotesDataset(X_training, y_train)
                ds_test = TDFClinicalNotesDataset(X_testing, y_test)

                #Load Data 
                train_loader = torch.utils.data.DataLoader(ds_train, batch_size = batch_size, collate_fn=collate_fn)
                val_loader = torch.utils.data.DataLoader(ds_test, batch_size = batch_size,collate_fn=collate_fn)

                model_desc = f"{disease}_{feature}"

                trainAndEvaluate(train_loader, val_loader, model, model_desc, batch_name, results_file, disease, lr, dataformat, feature, device, n_epoch, False, use_decay)

                #Save model
                torch.save(model.state_dict(), f'{MODELS_PATH}{model_name}.pkl')

                #Delete model
                del model

                

In [51]:
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
print(f'Using device: {device}')

#Override these if need be
#disease_list = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'OSA', 'PVD', 'Venous Insufficiency', 'Obesity']
disease_list = ['Asthma']
feature_list = ['All','ExtraTreeClassifier','SelectKBest','InfoGainAttributeVal']
results_file = f'{RESULTS_PATH}DL_tfidf_results.csv'

#0.01 seems to be the most effective with no decay
#lr_list = [0.1,0.01,0.001, 0.0001]
lr_list = [0.01]

#training parameters
n_epoch = 50
batch_size = 128
k = 2

#These should not change
dataformat = 'text_final'
tokens = 600

result_time = datetime.datetime.now()
result_name = result_time.strftime("%Y-%m-%d-%H-%M-%S")
batch_name = f'DL_tfidf_results_{result_name}'

iterateTrainAndEvaluateTFIDF(all_df, k, disease_list, feature_list, lr_list, batch_name, batch_size, results_file, dataformat, device, tokens, n_epoch, False, False)


Using device: cpu
600


Training Asthma_All-Lr0.01-Epoch1...: 100%|██████████| 7/7 [00:00<00:00, 29.03it/s]
Training Asthma_All-Lr0.01-Epoch2...: 100%|██████████| 7/7 [00:00<00:00, 30.49it/s]
Training Asthma_All-Lr0.01-Epoch3...: 100%|██████████| 7/7 [00:00<00:00, 28.54it/s]
Training Asthma_All-Lr0.01-Epoch4...: 100%|██████████| 7/7 [00:00<00:00, 25.15it/s]
Training Asthma_All-Lr0.01-Epoch5...: 100%|██████████| 7/7 [00:00<00:00, 25.36it/s]
Training Asthma_All-Lr0.01-Epoch6...: 100%|██████████| 7/7 [00:00<00:00, 30.42it/s]
Training Asthma_All-Lr0.01-Epoch7...: 100%|██████████| 7/7 [00:00<00:00, 27.91it/s]
Training Asthma_All-Lr0.01-Epoch8...: 100%|██████████| 7/7 [00:00<00:00, 26.57it/s]
Training Asthma_All-Lr0.01-Epoch9...: 100%|██████████| 7/7 [00:00<00:00, 25.58it/s]
Training Asthma_All-Lr0.01-Epoch10...: 100%|██████████| 7/7 [00:00<00:00, 28.77it/s]


epoch10: curr_epoch_loss=0.16729100048542023,execution_time=0:00:00.244411,lr=0.01


Training Asthma_All-Lr0.01-Epoch11...: 100%|██████████| 7/7 [00:00<00:00, 24.70it/s]
Training Asthma_All-Lr0.01-Epoch12...: 100%|██████████| 7/7 [00:00<00:00, 27.78it/s]
Training Asthma_All-Lr0.01-Epoch13...: 100%|██████████| 7/7 [00:00<00:00, 27.67it/s]
Training Asthma_All-Lr0.01-Epoch14...: 100%|██████████| 7/7 [00:00<00:00, 24.14it/s]
Training Asthma_All-Lr0.01-Epoch15...: 100%|██████████| 7/7 [00:00<00:00, 28.07it/s]
Training Asthma_All-Lr0.01-Epoch16...: 100%|██████████| 7/7 [00:00<00:00, 29.86it/s]
Training Asthma_All-Lr0.01-Epoch17...: 100%|██████████| 7/7 [00:00<00:00, 27.44it/s]
Training Asthma_All-Lr0.01-Epoch18...: 100%|██████████| 7/7 [00:00<00:00, 31.56it/s]
Training Asthma_All-Lr0.01-Epoch19...: 100%|██████████| 7/7 [00:00<00:00, 27.82it/s]
Training Asthma_All-Lr0.01-Epoch20...: 100%|██████████| 7/7 [00:00<00:00, 31.41it/s]


epoch20: curr_epoch_loss=0.0002622074098326266,execution_time=0:00:00.226112,lr=0.01


Training Asthma_All-Lr0.01-Epoch21...: 100%|██████████| 7/7 [00:00<00:00, 31.68it/s]
Training Asthma_All-Lr0.01-Epoch22...: 100%|██████████| 7/7 [00:00<00:00, 30.12it/s]
Training Asthma_All-Lr0.01-Epoch23...: 100%|██████████| 7/7 [00:00<00:00, 28.34it/s]
Training Asthma_All-Lr0.01-Epoch24...: 100%|██████████| 7/7 [00:00<00:00, 32.24it/s]
Training Asthma_All-Lr0.01-Epoch25...: 100%|██████████| 7/7 [00:00<00:00, 29.09it/s]
Training Asthma_All-Lr0.01-Epoch26...: 100%|██████████| 7/7 [00:00<00:00, 30.34it/s]
Training Asthma_All-Lr0.01-Epoch27...: 100%|██████████| 7/7 [00:00<00:00, 30.16it/s]
Training Asthma_All-Lr0.01-Epoch28...: 100%|██████████| 7/7 [00:00<00:00, 29.89it/s]
Training Asthma_All-Lr0.01-Epoch29...: 100%|██████████| 7/7 [00:00<00:00, 28.98it/s]
Training Asthma_All-Lr0.01-Epoch30...: 100%|██████████| 7/7 [00:00<00:00, 29.59it/s]


epoch30: curr_epoch_loss=5.6355118431383744e-05,execution_time=0:00:00.237646,lr=0.01


Training Asthma_All-Lr0.01-Epoch31...: 100%|██████████| 7/7 [00:00<00:00, 25.90it/s]
Training Asthma_All-Lr0.01-Epoch32...: 100%|██████████| 7/7 [00:00<00:00, 31.71it/s]
Training Asthma_All-Lr0.01-Epoch33...: 100%|██████████| 7/7 [00:00<00:00, 28.54it/s]
Training Asthma_All-Lr0.01-Epoch34...: 100%|██████████| 7/7 [00:00<00:00, 30.62it/s]
Training Asthma_All-Lr0.01-Epoch35...: 100%|██████████| 7/7 [00:00<00:00, 30.74it/s]
Training Asthma_All-Lr0.01-Epoch36...: 100%|██████████| 7/7 [00:00<00:00, 28.24it/s]
Training Asthma_All-Lr0.01-Epoch37...: 100%|██████████| 7/7 [00:00<00:00, 29.62it/s]
Training Asthma_All-Lr0.01-Epoch38...: 100%|██████████| 7/7 [00:00<00:00, 28.32it/s]
Training Asthma_All-Lr0.01-Epoch39...: 100%|██████████| 7/7 [00:00<00:00, 30.69it/s]
Training Asthma_All-Lr0.01-Epoch40...: 100%|██████████| 7/7 [00:00<00:00, 28.92it/s]


epoch40: curr_epoch_loss=2.61446966760559e-05,execution_time=0:00:00.245041,lr=0.01


Training Asthma_All-Lr0.01-Epoch41...: 100%|██████████| 7/7 [00:00<00:00, 29.55it/s]
Training Asthma_All-Lr0.01-Epoch42...: 100%|██████████| 7/7 [00:00<00:00, 31.40it/s]
Training Asthma_All-Lr0.01-Epoch43...: 100%|██████████| 7/7 [00:00<00:00, 28.96it/s]
Training Asthma_All-Lr0.01-Epoch44...: 100%|██████████| 7/7 [00:00<00:00, 30.13it/s]
Training Asthma_All-Lr0.01-Epoch45...: 100%|██████████| 7/7 [00:00<00:00, 27.16it/s]
Training Asthma_All-Lr0.01-Epoch46...: 100%|██████████| 7/7 [00:00<00:00, 30.70it/s]
Training Asthma_All-Lr0.01-Epoch47...: 100%|██████████| 7/7 [00:00<00:00, 29.21it/s]
Training Asthma_All-Lr0.01-Epoch48...: 100%|██████████| 7/7 [00:00<00:00, 28.81it/s]
Training Asthma_All-Lr0.01-Epoch49...: 100%|██████████| 7/7 [00:00<00:00, 28.10it/s]
Training Asthma_All-Lr0.01-Epoch50...: 100%|██████████| 7/7 [00:00<00:00, 30.33it/s]


epoch50: curr_epoch_loss=1.2390669326123316e-05,execution_time=0:00:00.232785,lr=0.01


Evaluating Asthma_All...: 100%|██████████| 2/2 [00:00<00:00, 62.51it/s]


Vocab: ['is', 'chest', 'medication', 's', 'normal', 'one', 'ekg', 'rhythm', 'appointment', 'episode', 'ef', 'recent', 'platelet', 'scheduled', 'notice', 'distress', 'plan', 'asthma', 'hypercholesterolemia', 'prednisone', 'mass', 'tenderness', 'skin', 'pump', 'drop', 'diaphoresis', 'nebulizer', 'unclear', 'evaluated', 'sl', 'focal', 'hypertrophy', 'lvh', 'schedule', 'gain', 'thrombosis', 'main', 'neurology', 'inhaled', 'begun', 'transient', 'polys', 'flovent', 'admitting', 'origin', 'idiopathic', 'fe', 'perform', 'yourself', 'tubular', 'vh', 'arthroplasty', 'digit', 'hypercarbic', 'rhinitis', 'replace', 'halfway', 'dvtpe', 'montelukast', 'sulfonamide', 'stand', 'midepigastric', 'postnasal', 'fidel', 'jared', 'dar', 'investigated', 'dmh', 'beclovent', 'elmo', 'aminophylline', 'pendulum', 'sant', 'hyperventilation', 'chondral', 'nidiffer', 'othernasal', 'amibi', 'preadmit', 'subcentimeter', 'nutritionist', 'sobchest', 'kushiner', 'ugarte', 'dykhoff', 'seriously', 'puzzling', 'hallucinatin

Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch1...: 100%|██████████| 7/7 [00:00<00:00, 50.71it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch2...: 100%|██████████| 7/7 [00:00<00:00, 50.07it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch3...: 100%|██████████| 7/7 [00:00<00:00, 50.15it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch4...: 100%|██████████| 7/7 [00:00<00:00, 48.92it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch5...: 100%|██████████| 7/7 [00:00<00:00, 55.47it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch6...: 100%|██████████| 7/7 [00:00<00:00, 56.22it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch7...: 100%|██████████| 7/7 [00:00<00:00, 49.72it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch8...: 100%|██████████| 7/7 [00:00<00:00, 55.20it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch9...: 100%|██████████| 7/7 [00:00<00:00, 55.17it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch10...: 100%|██████████| 7/7 [00:00<00:00, 56.46it/s]

epoch10: curr_epoch_loss=0.005808359477669001,execution_time=0:00:00.125983,lr=0.01


Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch11...: 100%|██████████| 7/7 [00:00<00:00, 54.31it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch12...: 100%|██████████| 7/7 [00:00<00:00, 47.27it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch13...: 100%|██████████| 7/7 [00:00<00:00, 58.16it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch14...: 100%|██████████| 7/7 [00:00<00:00, 51.07it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch15...: 100%|██████████| 7/7 [00:00<00:00, 62.91it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch16...: 100%|██████████| 7/7 [00:00<00:00, 54.14it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch17...: 100%|██████████| 7/7 [00:00<00:00, 54.75it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch18...: 100%|██████████| 7/7 [00:00<00:00, 55.98it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch19...: 100%|██████████| 7/7 [00:00<00:00, 59.68it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch20...: 100%|██████████| 7/7 [00:00<00:00, 5

epoch20: curr_epoch_loss=0.0003544053470250219,execution_time=0:00:00.137364,lr=0.01


Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch21...: 100%|██████████| 7/7 [00:00<00:00, 54.52it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch22...: 100%|██████████| 7/7 [00:00<00:00, 59.10it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch23...: 100%|██████████| 7/7 [00:00<00:00, 59.47it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch24...: 100%|██████████| 7/7 [00:00<00:00, 48.97it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch25...: 100%|██████████| 7/7 [00:00<00:00, 56.34it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch26...: 100%|██████████| 7/7 [00:00<00:00, 58.14it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch27...: 100%|██████████| 7/7 [00:00<00:00, 57.21it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch28...: 100%|██████████| 7/7 [00:00<00:00, 53.96it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch29...: 100%|██████████| 7/7 [00:00<00:00, 54.44it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch30...: 100%|██████████| 7/7 [00:00<00:00, 6

epoch30: curr_epoch_loss=0.00025270887999795377,execution_time=0:00:00.116337,lr=0.01


Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch31...: 100%|██████████| 7/7 [00:00<00:00, 55.18it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch32...: 100%|██████████| 7/7 [00:00<00:00, 59.62it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch33...: 100%|██████████| 7/7 [00:00<00:00, 58.39it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch34...: 100%|██████████| 7/7 [00:00<00:00, 58.65it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch35...: 100%|██████████| 7/7 [00:00<00:00, 62.10it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch36...: 100%|██████████| 7/7 [00:00<00:00, 55.09it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch37...: 100%|██████████| 7/7 [00:00<00:00, 58.75it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch38...: 100%|██████████| 7/7 [00:00<00:00, 55.96it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch39...: 100%|██████████| 7/7 [00:00<00:00, 53.71it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch40...: 100%|██████████| 7/7 [00:00<00:00, 5

epoch40: curr_epoch_loss=2.3726754079689272e-05,execution_time=0:00:00.128792,lr=0.01


Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch41...: 100%|██████████| 7/7 [00:00<00:00, 43.56it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch42...: 100%|██████████| 7/7 [00:00<00:00, 60.58it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch43...: 100%|██████████| 7/7 [00:00<00:00, 60.96it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch44...: 100%|██████████| 7/7 [00:00<00:00, 54.51it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch45...: 100%|██████████| 7/7 [00:00<00:00, 58.70it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch46...: 100%|██████████| 7/7 [00:00<00:00, 58.80it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch47...: 100%|██████████| 7/7 [00:00<00:00, 55.27it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch48...: 100%|██████████| 7/7 [00:00<00:00, 59.40it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch49...: 100%|██████████| 7/7 [00:00<00:00, 60.89it/s]
Training Asthma_ExtraTreeClassifier-Lr0.01-Epoch50...: 100%|██████████| 7/7 [00:00<00:00, 6

epoch50: curr_epoch_loss=1.2229303138155956e-05,execution_time=0:00:00.116340,lr=0.01


Evaluating Asthma_ExtraTreeClassifier...: 100%|██████████| 2/2 [00:00<00:00, 146.47it/s]
  f = msb / msw


Vocab: ['she', 'her', 'he', 'his', 'qd', 'bid', 'left', 'home', 'artery', 'pt', 'cardiac', 'if', 'prn', 'coronary', 'lasix', 'allergy', 'x', 'instruction', 'then', 'without', 't', 'iv', 'pulmonary', 'dc', 'your', 'low', 'infection', 'w', 'please', 'give', 'shortness', 'catheterization', 'treated', 'call', 'lopressor', 'resume', 'graft', 'bypass', 'levofloxacin', 'before', 'admit', 'respiratory', 'today', 'qid', 'obesity', 'causing', 'cough', 'obese', 'taken', 'avoid', 'sleep', 'woman', 'f', 'asthma', 'prednisone', 'puff', 'fu', 'dvt', 'unless', 'exacerbation', 'albuterol', 'grapefruit', 'instructs', 'meq', 'descending', 'inhaler', 'apnea', 'headache', 'can', 'furosemide', 'circumflex', 'kcl', 'dysfunction', 'obstructive', 'operating', 'inh', 'large', 'pulm', 'peak', 'wheezing', 'sputum', 'morbid', 'outpt', 'nebulizer', 'diastolic', 'diffuse', 'wheeze', 'steroid', 'sulfa', 'spray', 'much', 'taper', 'contrast', 'lvh', 'product', 'flare', 'reflux', 'neb', 'atrovent', 'sr', 'advair', 'calf

Training Asthma_SelectKBest-Lr0.01-Epoch1...: 100%|██████████| 7/7 [00:00<00:00, 25.51it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch2...: 100%|██████████| 7/7 [00:00<00:00, 29.04it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch3...: 100%|██████████| 7/7 [00:00<00:00, 31.84it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch4...: 100%|██████████| 7/7 [00:00<00:00, 28.89it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch5...: 100%|██████████| 7/7 [00:00<00:00, 27.79it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch6...: 100%|██████████| 7/7 [00:00<00:00, 29.20it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch7...: 100%|██████████| 7/7 [00:00<00:00, 31.51it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch8...: 100%|██████████| 7/7 [00:00<00:00, 24.88it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch9...: 100%|██████████| 7/7 [00:00<00:00, 25.10it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch10...: 100%|██████████| 7/7 [00:00<00:00, 24.53it/s]


epoch10: curr_epoch_loss=0.005130314733833075,execution_time=0:00:00.288708,lr=0.01


Training Asthma_SelectKBest-Lr0.01-Epoch11...: 100%|██████████| 7/7 [00:00<00:00, 29.47it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch12...: 100%|██████████| 7/7 [00:00<00:00, 28.52it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch13...: 100%|██████████| 7/7 [00:00<00:00, 25.82it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch14...: 100%|██████████| 7/7 [00:00<00:00, 27.13it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch15...: 100%|██████████| 7/7 [00:00<00:00, 26.87it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch16...: 100%|██████████| 7/7 [00:00<00:00, 26.98it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch17...: 100%|██████████| 7/7 [00:00<00:00, 23.51it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch18...: 100%|██████████| 7/7 [00:00<00:00, 24.75it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch19...: 100%|██████████| 7/7 [00:00<00:00, 24.89it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch20...: 100%|██████████| 7/7 [00:00<00:00, 24.90it/s]


epoch20: curr_epoch_loss=1.2209254236950073e-05,execution_time=0:00:00.282305,lr=0.01


Training Asthma_SelectKBest-Lr0.01-Epoch21...: 100%|██████████| 7/7 [00:00<00:00, 25.30it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch22...: 100%|██████████| 7/7 [00:00<00:00, 25.98it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch23...: 100%|██████████| 7/7 [00:00<00:00, 32.18it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch24...: 100%|██████████| 7/7 [00:00<00:00, 30.87it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch25...: 100%|██████████| 7/7 [00:00<00:00, 28.17it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch26...: 100%|██████████| 7/7 [00:00<00:00, 29.80it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch27...: 100%|██████████| 7/7 [00:00<00:00, 30.00it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch28...: 100%|██████████| 7/7 [00:00<00:00, 30.64it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch29...: 100%|██████████| 7/7 [00:00<00:00, 28.22it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch30...: 100%|██████████| 7/7 [00:00<00:00, 28.26it/s]


epoch30: curr_epoch_loss=6.766650130884955e-06,execution_time=0:00:00.250900,lr=0.01


Training Asthma_SelectKBest-Lr0.01-Epoch31...: 100%|██████████| 7/7 [00:00<00:00, 30.42it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch32...: 100%|██████████| 7/7 [00:00<00:00, 29.09it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch33...: 100%|██████████| 7/7 [00:00<00:00, 31.26it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch34...: 100%|██████████| 7/7 [00:00<00:00, 31.90it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch35...: 100%|██████████| 7/7 [00:00<00:00, 26.49it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch36...: 100%|██████████| 7/7 [00:00<00:00, 29.77it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch37...: 100%|██████████| 7/7 [00:00<00:00, 25.63it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch38...: 100%|██████████| 7/7 [00:00<00:00, 31.33it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch39...: 100%|██████████| 7/7 [00:00<00:00, 30.70it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch40...: 100%|██████████| 7/7 [00:00<00:00, 29.71it/s]


epoch40: curr_epoch_loss=4.5931046770419925e-06,execution_time=0:00:00.238693,lr=0.01


Training Asthma_SelectKBest-Lr0.01-Epoch41...: 100%|██████████| 7/7 [00:00<00:00, 27.73it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch42...: 100%|██████████| 7/7 [00:00<00:00, 29.13it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch43...: 100%|██████████| 7/7 [00:00<00:00, 30.04it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch44...: 100%|██████████| 7/7 [00:00<00:00, 32.76it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch45...: 100%|██████████| 7/7 [00:00<00:00, 29.06it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch46...: 100%|██████████| 7/7 [00:00<00:00, 29.47it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch47...: 100%|██████████| 7/7 [00:00<00:00, 30.71it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch48...: 100%|██████████| 7/7 [00:00<00:00, 31.03it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch49...: 100%|██████████| 7/7 [00:00<00:00, 31.18it/s]
Training Asthma_SelectKBest-Lr0.01-Epoch50...: 100%|██████████| 7/7 [00:00<00:00, 28.54it/s]


epoch50: curr_epoch_loss=3.360822574904887e-06,execution_time=0:00:00.246657,lr=0.01


Evaluating Asthma_SelectKBest...: 100%|██████████| 2/2 [00:00<00:00, 42.25it/s]


Vocab: ['with', 'no', 'her', 'disease', 'pressure', 'service', 'potentially', 'hypertension', 'infection', 'code', 'hour', 'abdomen', 'following', 'graft', 'known', 'back', 'before', 'overridden', 'subcutaneously', 'found', 'plavix', 'function', 'platelet', 'small', 'antibiotic', 'diagnosisconditions', 'atorvastatin', 'asthma', 'puff', 'second', 'albuterol', 'swelling', 'goal', 'cancer', 'block', 'inhaler', 'apnea', 'movement', 'changed', 'palpitation', 'operating', 'inh', 'pleural', 'wheezing', 'nebulizer', 'later', 'january', 'angioplasty', 'patent', 'sl', 'son', 'lima', 'oropharynx', 'ptca', 'below', 'product', 'neb', 'better', 'advair', 'grew', 'verapamil', 'stated', 'cta', 'receive', 'obtuse', 'fluticasone', 'midline', 'heel', 'cyclosporine', 'excellent', 'maximum', 'ldl', 'single', 'tni', 'dyslipidemia', 'cord', 'habitus', 'restrictive', 'propionate', 'noninsulin', 'diskus', 'capillary', 'plain', 'hypoxia', 'know', 'successful', 'reg', 'decompensated', 'consults', 'metamucil', 's

Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch1...: 100%|██████████| 7/7 [00:00<00:00, 31.08it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch2...: 100%|██████████| 7/7 [00:00<00:00, 31.14it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch3...: 100%|██████████| 7/7 [00:00<00:00, 29.50it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch4...: 100%|██████████| 7/7 [00:00<00:00, 29.41it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch5...: 100%|██████████| 7/7 [00:00<00:00, 33.96it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch6...: 100%|██████████| 7/7 [00:00<00:00, 29.48it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch7...: 100%|██████████| 7/7 [00:00<00:00, 31.43it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch8...: 100%|██████████| 7/7 [00:00<00:00, 31.98it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch9...: 100%|██████████| 7/7 [00:00<00:00, 29.59it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch10...: 100%|██████████| 7/7 [00:00<00:00, 

epoch10: curr_epoch_loss=0.0013369983062148094,execution_time=0:00:00.238001,lr=0.01


Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch11...: 100%|██████████| 7/7 [00:00<00:00, 30.66it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch12...: 100%|██████████| 7/7 [00:00<00:00, 27.65it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch13...: 100%|██████████| 7/7 [00:00<00:00, 31.52it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch14...: 100%|██████████| 7/7 [00:00<00:00, 31.10it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch15...: 100%|██████████| 7/7 [00:00<00:00, 27.93it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch16...: 100%|██████████| 7/7 [00:00<00:00, 29.09it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch17...: 100%|██████████| 7/7 [00:00<00:00, 30.00it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch18...: 100%|██████████| 7/7 [00:00<00:00, 30.67it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch19...: 100%|██████████| 7/7 [00:00<00:00, 31.99it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch20...: 100%|██████████| 7/7 [00:0

epoch20: curr_epoch_loss=1.031039391818922e-05,execution_time=0:00:00.247635,lr=0.01


Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch21...: 100%|██████████| 7/7 [00:00<00:00, 29.72it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch22...: 100%|██████████| 7/7 [00:00<00:00, 28.15it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch23...: 100%|██████████| 7/7 [00:00<00:00, 30.11it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch24...: 100%|██████████| 7/7 [00:00<00:00, 29.78it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch25...: 100%|██████████| 7/7 [00:00<00:00, 35.08it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch26...: 100%|██████████| 7/7 [00:00<00:00, 31.96it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch27...: 100%|██████████| 7/7 [00:00<00:00, 27.78it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch28...: 100%|██████████| 7/7 [00:00<00:00, 27.63it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch29...: 100%|██████████| 7/7 [00:00<00:00, 28.60it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch30...: 100%|██████████| 7/7 [00:0

epoch30: curr_epoch_loss=4.807474851986626e-06,execution_time=0:00:00.221990,lr=0.01


Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch31...: 100%|██████████| 7/7 [00:00<00:00, 29.53it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch32...: 100%|██████████| 7/7 [00:00<00:00, 31.86it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch33...: 100%|██████████| 7/7 [00:00<00:00, 29.41it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch34...: 100%|██████████| 7/7 [00:00<00:00, 31.48it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch35...: 100%|██████████| 7/7 [00:00<00:00, 29.39it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch36...: 100%|██████████| 7/7 [00:00<00:00, 29.43it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch37...: 100%|██████████| 7/7 [00:00<00:00, 27.61it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch38...: 100%|██████████| 7/7 [00:00<00:00, 31.53it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch39...: 100%|██████████| 7/7 [00:00<00:00, 29.46it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch40...: 100%|██████████| 7/7 [00:0

epoch40: curr_epoch_loss=3.1108479561225977e-06,execution_time=0:00:00.224469,lr=0.01


Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch41...: 100%|██████████| 7/7 [00:00<00:00, 27.99it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch42...: 100%|██████████| 7/7 [00:00<00:00, 27.59it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch43...: 100%|██████████| 7/7 [00:00<00:00, 28.45it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch44...: 100%|██████████| 7/7 [00:00<00:00, 32.02it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch45...: 100%|██████████| 7/7 [00:00<00:00, 30.12it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch46...: 100%|██████████| 7/7 [00:00<00:00, 29.83it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch47...: 100%|██████████| 7/7 [00:00<00:00, 29.91it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch48...: 100%|██████████| 7/7 [00:00<00:00, 27.93it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch49...: 100%|██████████| 7/7 [00:00<00:00, 29.41it/s]
Training Asthma_InfoGainAttributeVal-Lr0.01-Epoch50...: 100%|██████████| 7/7 [00:0

epoch50: curr_epoch_loss=2.2280257780948887e-06,execution_time=0:00:00.245770,lr=0.01


Evaluating Asthma_InfoGainAttributeVal...: 100%|██████████| 2/2 [00:00<00:00, 62.88it/s]


**Deep Learning - Bag of Words - All Feature Selections - Averaged**

![DL BagOfWords AllFeatures Averaged](images\DL-BagOfWords-ByFeatureSelection.gif)

****DL Model using word embeddings****

First we start by creating a dataset.  Note this will have to take the disease as part of the init and filter just for those records.

In [None]:

class ClinicalNoteDataset(Dataset):

    def __init__(self, dataframe, disease, dataformat):
        """
        TODO: init the Dataset instance.  datafomat is just the column to use from the dataframe 'vector_tokenized' , 'one_hot'
        """
        # your code here
        self.disease = disease
        self.dataformat = dataformat
        self.df = dataframe[dataframe['disease'] == disease].copy()
        self.df = self.df.reset_index()

    def __len__(self):
        """
        TODO: Denotes the total number of samples
        """
        return len(self.df)

    def __getitem__(self, i):
        """
        TODO: Generates one sample of data
            return X, y for the i-th data.
        """
        #Cannot make tensors yet, will need to happen in collate
        Y = self.df.iloc[i]['judgment']
        X = self.df.iloc[i][self.dataformat]

        return X,Y
        
def vectorize_batch_GloVe(batch):
    embedding_size_used = 300
    vec = torchtext.vocab.GloVe(name='6B', dim=embedding_size_used)    
    Xi, Yi = batch[0]
    batch_size = len(batch)

    X = torch.zeros(batch_size, len(Xi), embedding_size_used, dtype=torch.float)
    Y = torch.zeros((batch_size), dtype=torch.long)
    
    for i in range(len(batch)):
        x, y = batch[i]
        #vectors = vec.get_vecs_by_tokens(voc.lookup_tokens(x.tolist()))
        vectors = vec.get_vecs_by_tokens(x)

        X[i] = vectors.float()
        Y[i] = torch.tensor(float(y == True))

    return X,Y

def vectorize_batch_FastText(batch):
    embedding_size_used = 300
    vec = torchtext.vocab.FastText()
    Xi, Yi = batch[0]
    batch_size = len(batch)

    X = torch.zeros(batch_size, len(Xi), embedding_size_used, dtype=torch.float)
    Y = torch.zeros((batch_size), dtype=torch.long)

    for i in range(len(batch)):
        x, y = batch[i]
        #vectors = vec.get_vecs_by_tokens(voc.lookup_tokens(x.tolist()))
        vectors = vec.get_vecs_by_tokens(x)

        X[i] = vectors.float()
        Y[i] = torch.tensor(float(y == True))

    return X,Y 
 
          


In [None]:
##Test DataLoader
#batch_size = 128
#train_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(train_df, 'Asthma', 'vector_tokenized'), batch_size = batch_size, shuffle=True, collate_fn=vectorize_batch_FastText)
#val_loader = torch.utils.data.DataLoader(ClinicalNoteDataset(test_df, 'Asthma', 'vector_tokenized'), batch_size = batch_size, shuffle=False, collate_fn=vectorize_batch_FastText)

#print("# of train batches:", len(train_loader))
#print("# of val batches:", len(val_loader))

#train_iter = iter(train_loader)
#x,y = next(train_iter)

#print(x.shape)
#print(y.shape)


In [None]:
class ClincalNoteEmbeddingNet(nn.Module):
    def __init__(self, embedding_type, max_tokens):
        super(ClincalNoteEmbeddingNet, self).__init__()
        
        self.max_tokens = max_tokens

        if(embedding_type == 'USE'):
            self.embedding_dimension = 512
        else:
            self.embedding_dimension = 300

        self.hidden_dim1 = 128
        self.hidden_dim2 = 64
        self.num_layers = 1

        #Because it is bidirectional, the output from LTSM is coming in twice the size of the hidden states required.
        #input is (batch, #of tokens * embedding_dimension)
        self.bilstm1 = nn.LSTM(input_size = self.embedding_dimension, hidden_size = self.hidden_dim1, bidirectional = True, 
                               batch_first = True, num_layers = self.num_layers) 
        self.bilstm2 = nn.LSTM(input_size = self.hidden_dim1 * 2, hidden_size = self.hidden_dim2, bidirectional = True, 
                               batch_first = True, num_layers=self.num_layers)
        self.flatten = nn.Flatten()

        self.fc1 = nn.Linear(self.hidden_dim2 * self.max_tokens * 2, self.hidden_dim2)
        self.fc2 = nn.Linear(self.hidden_dim2, 2)


    def forward(self, x):
        x, states = self.bilstm1(x)
        x, states = self.bilstm2(x)
        
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)

        return x #F.sigmoid(x).squeeze(dim=-1)



Need to create a loop to train and evaluate

In [None]:
def iterateTrainAndEvaluate(df, k, disease_list, embedding_list, lr_list, 
                            batch_name, results_file, dataformat, device, max_tokens, n_epoch, cv = False, use_decay = False):

    for _,disease in enumerate(disease_list):
        for _,embedding in enumerate(embedding_list):
            for _,lr in enumerate(lr_list):
                #Create a name for the model
                model_name = f"{disease}_{embedding}_{batch_name}"

                #Create model
                model = ClincalNoteEmbeddingNet(embedding, max_tokens = max_tokens)
                model = model.to(device)

                ds = ClinicalNoteDataset(df, disease, dataformat)
                if embedding == 'GloVe':
                    custom_collate=vectorize_batch_GloVe
                if embedding == 'FastText':
                    custom_collate=vectorize_batch_FastText

                ds_train, ds_test = train_test_split(ds, test_size=0.20, shuffle=True, random_state = seed)

                #Load Data 
                train_loader = torch.utils.data.DataLoader(ds_train, batch_size = batch_size, collate_fn=custom_collate)
                val_loader = torch.utils.data.DataLoader(ds_test, batch_size = batch_size, collate_fn=custom_collate)
                
                model_desc = f"{disease}_{embedding}"

                trainAndEvaluate(train_loader, val_loader, model, model_desc, batch_name, results_file, disease, lr, dataformat, embedding, device, n_epoch, False, use_decay)

                #Save model
                torch.save(model.state_dict(), f'{MODELS_PATH}{model_name}.pkl')

                #Delete model
                del model

                if cv:
                    #note, cross validation is only used to validate the model works consistently
                    splits=KFold(n_splits=k,shuffle=True,random_state=seed)

                    for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(ds)))):
                        #for now, let's keep the results at the fold level
                        model = ClincalNoteEmbeddingNet(embedding, max_tokens = max_tokens)
                        model = model.to(device)
                        
                        train_sampler = SubsetRandomSampler(train_idx)
                        val_sampler = SubsetRandomSampler(val_idx)
                        #Load Data 
                        train_loader = torch.utils.data.DataLoader(ds, batch_size = batch_size, sampler=train_sampler, collate_fn=custom_collate)
                        val_loader = torch.utils.data.DataLoader(ds, batch_size = batch_size, sampler=val_sampler, collate_fn=custom_collate)
                        
                        model_desc = f"{disease}_{embedding}_Fold{fold+1}"

                        trainAndEvaluate(train_loader, val_loader, model, model_desc, batch_name, results_file, disease, lr, dataformat, embedding, device, n_epoch, cv, use_decay)

                        del model
                

In [None]:
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
print(f'Using device: {device}')

#Override these if need be
#disease_list = ['Asthma', 'CAD', 'CHF', 'Depression', 'Diabetes', 'Gallstones', 'GERD', 'Gout', 'Hypercholesterolemia', 'Hypertension', 'Hypertriglyceridemia', 'OA', 'OSA', 'PVD', 'Venous Insufficiency', 'Obesity']
disease_list = ['Asthma']
embedding_list = ['GloVe','FastText']
results_file = f'{RESULTS_PATH}DL_embedding_results.csv'

#0.01 seems to be the most effective, added decay logic - starting at 0.1 seems to cause NaNs, if fix those it gets "stuck"
lr_list = [0.01]

#training parameters
n_epoch = 10
batch_size = 128
k = 2

#These should not change
dataformat = 'vector_tokenized'

result_time = datetime.datetime.now()
result_name = result_time.strftime("%Y-%m-%d-%H-%M-%S")
batch_name = f'DL_embedding_results_{result_name}'

iterateTrainAndEvaluate(all_df_expanded, k, disease_list, embedding_list, lr_list, batch_name, results_file, dataformat, device, max_tokens, n_epoch, False, False)



In [None]:
results = pd.read_csv(results_file)
results

**Deep Learning - Word Embeddings - All Features - With Stop Words**

![DL BagOfWords AllFeatures Averaged](images\dl-we-swyes.gif)