In [1]:
# standard modules
import os
import re
import shutil
import string
import time
import wget
from tqdm import tqdm, trange
import sqlalchemy as sa
from sqlalchemy.engine import URL
from IPython.display import display
import itertools

# data science modules
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression #import classifiers from sklearn
from sklearn import metrics #import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.manifold import TSNE #scikit learn's TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# nlp modules
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# imports deep learning modules
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from pytorch_transformers import BertModel
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch as torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# custom
from sql_alchemy_utility.sql_alchemy_utility import SqlOperations as Sql

%matplotlib inline
%load_ext autotime

plt.style.use('ggplot') # apply matplotlib's 'ggplot' style sheet to every figure in this notebook
nltk.download('punkt') # download Punkt sentence tokenization models
nltk.download('stopwords') # download the stopword lists read via stopwords.words(...) further down
# pd.set_option('display.max_colwidth', None) # set max width of displayed columns to None to see entire transcriptions

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dqtma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dqtma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

time: 312 ms (started: 2023-07-03 11:31:12 -04:00)


In [None]:
# load the data
# Select every column of the noteevents_05012023 table.
query = "SELECT * FROM noteevents_05012023"

# Sql is the project's SqlOperations helper (see imports); connection details live there.
# NOTE(review): presumably returns a pandas DataFrame (later cells call .head()/.dropna()) - confirm.
df_dd = Sql.load_data_from_db(query)

# Preview the first rows as the cell output.
df_dd.head()

In [None]:
## keep track of viable records
# These hand-curated ID lists are matched against the "row_id" column when
# the "label" column is built in a later cell.

# viable records that show the patient receiving proper VTE interventions
gold_inter = [192, 47760, 44381, 5410]

# viable records that show a contraindication for VTE
# NOTE(review): 46599 and 7892 are listed twice (harmless for .isin, but
# probably a copy/paste slip). 5410 also appears in gold_inter; because the
# contraindication label is assigned last, that record ends up labelled
# "contraindication" - confirm which label is intended.
gold_cont = [1846, 2741, 3675, 4558, 7224, 5909, 10840, 9438, 10111, 10715, 10048
    , 616712, 534, 1945, 7201, 10849, 8785, 16097, 25055, 40481, 45948
    , 46269, 46599, 46599, 33873, 52421, 52477, 43083, 54603, 49972, 327873
    , 2542, 4252, 5410, 5409, 7892, 7892, 6291, 8877, 8878, 8879]

# viable records that show a contraindication for non-VTE
# NOTE(review): not referenced by any cell visible in this part of the notebook.
silver_cont = [1258221]

# non-viable records
not_rel = [56, 979]

In [None]:
# active learning dataframe
# Build the labelled frame as a NEW object: the original `df_bs = df_dd` only
# aliased the raw frame, so adding the "label" column silently mutated df_dd.
row_ids = df_dd["row_id"]

# np.select returns the choice of the FIRST matching condition, so the order
# below reproduces the precedence of the original sequential overwrites:
# contraindication > intervention > nonrelevant. An ID present in more than
# one list (e.g. 5410 in both gold_inter and gold_cont) therefore resolves to
# the higher-precedence label.
labels = np.select(
    [row_ids.isin(gold_cont), row_ids.isin(gold_inter), row_ids.isin(not_rel)],
    ["contraindication", "intervention", "nonrelevant"],
    default="",  # "" marks a record that has not been labelled yet
)

# drop rows with missing text so tokenization / model training does not break
df_bs = df_dd.assign(label=labels).dropna(subset=['text'])

In [None]:
# separate gold from the rest.
shortlist = ['contraindication', 'intervention', 'nonrelevant']

gold = df_bs[df_bs['label'].isin(shortlist)]
rest = df_bs[~df_bs['label'].isin(shortlist)]

print('gold:', gold.shape)
print('rest:', rest.shape)

In [None]:
# English stopwords ("a", "the", etc.) collected into a set
mystopwords = {*stopwords.words("english")}

# Function to tokenize text
def preprocess_corpus(texts):
    """Tokenize each document and strip stopwords, punctuation and digits.

    Parameters
    ----------
    texts : iterable of str
        Raw documents; each one is passed to ``word_tokenize``.

    Returns
    -------
    list of list of str
        One list of lower-cased tokens per input document, in input order.
    """
    # initialize English stopwords and the punctuation character set once per call
    mystopwords = set(stopwords.words("english"))
    punctuation_chars = set(string.punctuation)

    def is_punctuation(token):
        # True when every character is punctuation, so multi-character
        # tokens such as "..." or "``" are caught as well as single marks.
        return all(char in punctuation_chars for char in token)

    def remove_stops_digits(tokens):
        # Nested function that removes stopwords, punctuation and digits from a list of tokens.
        # The previous one-line filter used the chained comparison
        # `token not in string.punctuation not in mystopwords`, which Python
        # evaluates as `... and (string.punctuation not in mystopwords)` - always
        # True - so stopwords were never removed. Each test is now explicit and
        # applied to the lower-cased token.
        kept = []
        for token in tokens:
            lowered = token.lower()
            if not lowered:
                continue
            if lowered in mystopwords or is_punctuation(lowered) or lowered.isdigit():
                continue
            kept.append(lowered)
        return kept

    # Tokenize every document, then filter its tokens with the helper above.
    return [remove_stops_digits(word_tokenize(content)) for content in texts]

# Labelled (gold) notes form the training set; the unlabelled rest is what the model will score.
train_cats, test_cats = gold['label'], rest['label']
train_content = preprocess_corpus(gold['text'])
test_content = preprocess_corpus(rest['text'])

# sanity check: token lists and label series should have matching lengths
print("length of train data:", len(train_content), len(train_cats))
print("length of test data:", len(test_content), len(test_cats))

# show the token list produced for the first training document
first_tokens = train_content[0]
print ("Tokenize the first sentence:")
print (first_tokens)

In [2]:
# prepare training data in doc2vec format: one TaggedDocument per training
# note, tagged with its position in train_content
train_doc2vec = [TaggedDocument(tokens, tags=[str(idx)]) for idx, tokens in enumerate(train_content)]

# Train a doc2vec model to learn representations.
# seed + workers=1 make training repeatable between runs; without them the
# embeddings (and every downstream prediction) change on each execution.
# NOTE: gensim documents that fully deterministic runs on Python 3 also
# require PYTHONHASHSEED to be fixed in the environment.
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm=1, epochs=100, seed=42, workers=1)
model.build_vocab(train_doc2vec)
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)

# persist the trained model so the inference cell can reload it from disk
model.save("d2v.model")
print("Model Saved")

NameError: name 'train_content' is not defined

In [None]:
# Reload the persisted Doc2Vec model and embed both the training and test documents with it
model = Doc2Vec.load("d2v.model")

# No explicit `epochs` is passed to infer_vector here, so the number of
# inference passes is whatever gensim defaults to for this model.
train_vectors = list(map(model.infer_vector, train_content))
test_vectors = list(map(model.infer_vector, test_content))

# Plain logistic regression on the document vectors.
# class_weight="balanced" because the classes are not balanced;
# max_iter=1000 gives the solver enough iterations to converge.
myclass = LogisticRegression(max_iter=1000, class_weight="balanced")
myclass.fit(train_vectors, train_cats)

# Predict labels for the rest of the dataset (non-gold) to drive bootstrapping
preds = myclass.predict(test_vectors)

In [None]:
# grab the selected contraindications
value = 'contraindication'

# Report the same identifier the labels are keyed on ("row_id"), so printed
# IDs can be copied straight back into the gold lists; this cell previously
# read an "id" column that no other cell uses.
# preds is positionally aligned with `rest` (test_content was built from
# rest['text']), so a boolean mask over preds selects the matching rows.
matched_ids = rest['row_id'].to_numpy()[preds == value]

print(f"number of predicted {value}:{len(matched_ids)}")
for matched_id in matched_ids:
    print(f"id:{matched_id}, pred:{value}")

In [None]:
# grab the selected interventions
# The class label is 'intervention' (singular) - the value assigned when the
# label column is built. The previous 'interventions' never matched any
# prediction, so this cell always reported 0 records.
value = 'intervention'

# Report "row_id", the identifier the labels are keyed on, instead of the
# "id" column this cell previously read.
# preds is positionally aligned with `rest` (test_content was built from
# rest['text']), so a boolean mask over preds selects the matching rows.
matched_ids = rest['row_id'].to_numpy()[preds == value]

print(f"number of predicted {value}:{len(matched_ids)}")
for matched_id in matched_ids:
    print(f"id:{matched_id}, pred:{value}")

In [None]:
# grab the selected nonrelevant
limit = 10  # maximum number of matching records to print
value = 'nonrelevant'

# Report "row_id", the identifier the labels are keyed on, instead of the
# "id" column this cell previously read.
# preds is positionally aligned with `rest` (test_content was built from
# rest['text']), so a boolean mask over preds selects the matching rows.
matched_ids = rest['row_id'].to_numpy()[preds == value]

print(f"number of predicted {value}:{len(matched_ids)}")
# Show the first `limit` MATCHES. The previous loop only inspected the first
# `limit` predictions (often printing nothing) and raised IndexError when
# fewer than `limit` predictions existed.
for matched_id in matched_ids[:limit]:
    print(f"id:{matched_id}, pred:{value}")

In [None]:
# examine a single record with the print-out-transcription helper
# NOTE(review): printOutTran is not defined in the visible part of this
# notebook - confirm it is defined or imported before this cell runs.
# NOTE(review): presumably record_id is matched against the frame's record
# identifier column; verify which column the helper uses.
record_id = 2467
printOutTran(record_id, df_dd)

In [None]:
# add newly labelled data to fresh copies of the gold lists
gold_interv_b = [*gold_inter, 9, 71, 749]
gold_contrain_b = list(gold_cont)  # no new contraindication records yet
not_rel_b = [
    *not_rel,
    913, 970, 1104, 1216, 1471, 1575, 1737, 1756, 2328, 2465, 2466, 2467, 2468,
    2481, 2622, 2821, 2891, 2993, 3240, 3261, 4027, 4129, 4223, 4232, 4269, 4665,
]