In [None]:
# import libraries
import pandas as pd
import numpy as np
import pickle

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, concatenate, Average
from tensorflow.keras.layers import Embedding
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, LSTM, TimeDistributed, GRU, Bidirectional, Layer
from tensorflow.keras import backend as K

import tensorflow as tf
import os

In [None]:
# Restore the previously fitted tokenizer from disk instead of re-fitting it.
# NOTE: unpickling executes code embedded in the file, so only load a pickle
# that was produced by a trusted run of the training notebook.
with open('notes_tokenizer_ps_find.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Read the clinical notes table and replace every missing value with an empty
# string so downstream text handling never sees NaN.
data = pd.read_csv("all_clinical_notes (Valid PS).csv").fillna("")
data

In [None]:
# Partition the notes by the value of the "split" column.
# Each subset is an explicit copy: later cells add prediction columns to these
# frames, and writing into a boolean-mask slice of `data` would otherwise
# raise pandas' SettingWithCopyWarning.
train_data = data[data["split"] == "train"].copy()
valid_data = data[data["split"] == "validation"].copy()
test_data = data[data["split"] == "test"].copy()

In [None]:
vocab_size = 10000
max_note_length = 2000


def _notes_to_padded_ids(frame):
    """Tokenize the 'text_no_ps' column of `frame` and post-pad to max_note_length."""
    note_texts = list(map(str, frame['text_no_ps']))
    token_ids = tokenizer.texts_to_sequences(note_texts)
    return sequence.pad_sequences(token_ids, maxlen=max_note_length, padding='post')


x_text_train = _notes_to_padded_ids(train_data)
x_text_valid = _notes_to_padded_ids(valid_data)
x_text_test = _notes_to_padded_ids(test_data)

In [None]:
def get_simple_model(target):
    """Build and compile the single-convolution text CNN.

    Architecture: Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense + ReLU
    -> Dense(1, linear). The final layer emits one raw logit per input, and
    the model is compiled with binary cross-entropy on logits
    (``from_logits=True``), the Adam optimizer and an accuracy metric.

    Args:
        target: Name of the prediction target. It is not used while building
            the graph; the parameter is kept so existing callers keep working.

    Returns:
        A compiled ``Model`` whose input has shape ``(max_note_length,)`` and
        whose output is a single linear unit.
    """
    vocab_size = 10000
    embedding_dims = 256
    # The convolution has always been built with 500 filters; the named
    # constant now reflects that (a stale, unused `filters = 250` used to sit
    # here). Changing this value changes the Conv1D/Dense weight shapes.
    filters = 500
    kernel_size = 3
    hidden_dims = 128
    max_note_length = 2000

    # make model
    # NOTE(review): token ids are declared as float32 here; kept as-is so the
    # model matches the one the saved weights came from.
    text_input = Input(shape=(max_note_length,), dtype='float32')

    text_embed = Embedding(vocab_size, embedding_dims, input_length=max_note_length, mask_zero=False)(text_input)

    cnn1 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='valid')(text_embed)
    x = GlobalMaxPooling1D()(cnn1)

    hidden = Dense(hidden_dims)(x)
    hidden = Activation('relu')(hidden)

    # Linear output: the value is a logit, not a probability.
    output = Dense(1, activation='linear')(hidden)

    model = Model(inputs=text_input, outputs=output)

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Rebuild the network, then restore the weights saved for this target.
target_name = 'ps_high'
this_model = get_simple_model(target_name)
this_model.load_weights(target_name + '.h5')

In [None]:
# Score the train, validation and test inputs (in that order) with the
# restored model; each result holds the model's raw outputs for that split.
train_logits_list, val_logits_list, test_logits_list = [
    this_model.predict(padded_notes)
    for padded_notes in (x_text_train, x_text_valid, x_text_test)
]

In [None]:
def generate_logits_and_probability(logits_list):
    """Turn single-logit model outputs into per-class logits, probabilities and labels.

    The sigmoid is evaluated once over the whole batch and reused for the
    class-0 probability, instead of launching two scalar TensorFlow ops per row.

    Args:
        logits_list: Sequence of rows; the first element of each row is the
            class-1 logit (e.g. the ``(n, 1)`` array returned by ``predict``).

    Returns:
        A 5-tuple of equal-length lists:
        ``(logit_class_1, logit_class_0, probability_class_1,
        probability_class_0, prediction)`` where ``logit_class_0`` is the
        negated logit, ``probability_class_1`` is ``sigmoid(logit)``,
        ``probability_class_0`` is ``1 - probability_class_1`` and
        ``prediction`` is 1 when class 1 is strictly more probable, else 0.
    """
    # Keep only the first value of every row; works for arrays and nested lists
    # and yields an empty array for empty input.
    logits = np.asarray([row[0] for row in logits_list])

    # Probabilities: one vectorized sigmoid, complement derived from it.
    probability_class_1 = tf.math.sigmoid(logits).numpy()
    probability_class_0 = 1 - probability_class_1

    # Ties (probability exactly 0.5) go to class 0, as the comparison is strict.
    prediction = (probability_class_1 > probability_class_0).astype(int).tolist()

    # Convert back to lists of scalars so callers receive the same structure.
    return (
        list(logits),
        list(-logits),
        list(probability_class_1),
        list(probability_class_0),
        prediction,
    )

In [None]:
(
    train_logit_class_1,
    train_logit_class_0,
    train_probability_class_1,
    train_probability_class_0,
    train_prediction,
) = generate_logits_and_probability(train_logits_list)

# assign() builds a new frame rather than writing column-by-column into a
# filtered slice of `data`, which avoids pandas' SettingWithCopyWarning.
# Column order matches the original sequence of assignments.
train_data = train_data.assign(**{
    "Prediction": train_prediction,
    "Logits (Class 0)": train_logit_class_0,
    "Logits (Class 1)": train_logit_class_1,
    "Probability (Class 0)": train_probability_class_0,
    "Probability (Class 1)": train_probability_class_1,
})
train_data

In [None]:
# Persist the scored training split; the DataFrame index is not written.
train_data.to_csv(
    "CNN train result (Valid PS - PS Removed text).csv",
    index=False,
)

In [None]:
(
    val_logit_class_1,
    val_logit_class_0,
    val_probability_class_1,
    val_probability_class_0,
    val_prediction,
) = generate_logits_and_probability(val_logits_list)

# assign() builds a new frame rather than writing column-by-column into a
# filtered slice of `data`, which avoids pandas' SettingWithCopyWarning.
# Column order matches the original sequence of assignments.
valid_data = valid_data.assign(**{
    "Prediction": val_prediction,
    "Logits (Class 0)": val_logit_class_0,
    "Logits (Class 1)": val_logit_class_1,
    "Probability (Class 0)": val_probability_class_0,
    "Probability (Class 1)": val_probability_class_1,
})
valid_data

In [None]:
# Persist the scored validation split; the DataFrame index is not written.
valid_data.to_csv(
    "CNN validation result (Valid PS - PS Removed text).csv",
    index=False,
)

In [None]:
(
    test_logit_class_1,
    test_logit_class_0,
    test_probability_class_1,
    test_probability_class_0,
    test_prediction,
) = generate_logits_and_probability(test_logits_list)

# assign() builds a new frame rather than writing column-by-column into a
# filtered slice of `data`, which avoids pandas' SettingWithCopyWarning.
# Column order matches the original sequence of assignments.
test_data = test_data.assign(**{
    "Prediction": test_prediction,
    "Logits (Class 0)": test_logit_class_0,
    "Logits (Class 1)": test_logit_class_1,
    "Probability (Class 0)": test_probability_class_0,
    "Probability (Class 1)": test_probability_class_1,
})
test_data

In [None]:
# Persist the scored test split; the DataFrame index is not written.
test_data.to_csv(
    "CNN test result (Valid PS - PS Removed text).csv",
    index=False,
)