In [None]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import normalize
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

import re

In [None]:
# Load data from csv file
with open('all_clinical_notes.csv') as f:
    all_notes = pd.read_csv(f)

all_notes.dfci_mrn = pd.to_numeric(all_notes.dfci_mrn)
# Turn line breaks into spaces. regex=True is spelled out because pandas >= 2.0
# treats the pattern as a literal string by default, in which case '\n|\r' would
# never match and every line break would be left in the text.
all_notes.text = all_notes.text.str.replace('\n|\r', ' ', regex=True)
# Lower-case the notes; the PS indicator phrases searched for further down are lower-case
all_notes.text = all_notes.text.str.lower()

In [None]:
# Rule-based finder function for ECOG or Karnofsky PS
# Input = text string, output = the PS number if it is explicitly labelled using a known rule

def find_ps(string):
    """Extract an explicitly labelled performance-status number from a note.

    The text is split on the indicator phrases ``ecog``, ``karnofsky``,
    ``performance status`` and ``ps:``. Only the text that follows the FIRST
    indicator is inspected, and only its first 20 characters.

    Parameters
    ----------
    string : object
        Note text. It is passed through ``str()``, so non-string values are
        accepted. Matching is case-sensitive and the phrases are lower-case.

    Returns
    -------
    int or float
        The first stand-alone integer in the inspected window (a token made up
        only of decimal digits, delimited by whitespace or one of ``; . : ,``),
        or ``np.nan`` when no indicator phrase or no such number is found.
    """
    PS_search_phrase = "ecog|karnofsky|performance status|ps:"
    # Likely has some false positives after "ps:"
    splitup = re.split(PS_search_phrase, str(string))
    if len(splitup) < 2:
        # No indicator phrase at all. np.nan (not np.NaN, which NumPy 2.0 removed).
        return np.nan

    # Look at the first 20 characters after the first indicator
    window = splitup[1][:20]
    # Fix the case when two indicator phrases are consecutive: "ecog performance status 2"
    if len(window) < 3 and len(splitup) > 2:
        window = (splitup[1] + splitup[2])[:20]

    for token in re.split(r'[;.:,\s]\s*', window):
        # isdecimal() rather than isdigit(): isdigit() is also true for characters
        # such as superscript digits, which int() cannot parse.
        if token.isdecimal():
            return int(token)
    return np.nan

In [None]:
# Note-text column and the word limit handed to the tokenizer
text = all_notes['text']
vocab_size = 10_000

In [None]:
# Fit the tokenizer on every note (each value cast to str) and pickle it to disk
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(list(map(str, text)))
with open('notes_tokenizer_ps_find.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=3)

In [None]:
# Run the rule-based extractor over every note and store the result in the 'ps' column
all_notes['ps'] = all_notes['text'].map(find_ps)

In [None]:
## Split the notes into those with an extracted PS and those without
# Extracted values above 100 are discarded, i.e. treated as "no PS found".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
all_notes.loc[all_notes['ps'] > 100, 'ps'] = np.nan

# .copy() gives each subset its own data, so columns added to the subsets
# are not written into a slice of all_notes (SettingWithCopyWarning).
has_ps = all_notes['ps'].notna()
all_notes_valid_ps = all_notes[has_ps].copy()
all_notes_no_ps = all_notes[~has_ps].copy()

In [None]:
# Function that removes the phrases that contain the extracted ECOG PS

def remove_ps(string):
    """Strip the performance-status indicator phrases and the labelled number.

    The text is split on the indicator phrases ``ecog``, ``karnofsky``,
    ``performance status`` and ``ps:``. If a stand-alone integer is found in
    the first 20 characters after the FIRST indicator, that one number is
    removed and the pieces are re-joined without any of the indicator phrases.
    Numbers that follow later indicators are left in place.

    Parameters
    ----------
    string : object
        Note text. It is passed through ``str()`` for matching, so non-string
        values are accepted.

    Returns
    -------
    object
        The cleaned text (``str``) when a number was removed; otherwise the
        input is returned unchanged, indicator phrases included.
    """
    PS_search_phrase = "ecog|karnofsky|performance status|ps:"
    # Likely has some false positives after "ps:"
    splitup = re.split(PS_search_phrase, str(string))
    if len(splitup) < 2:
        return string

    # Look at the first 20 characters after the first indicator
    window = splitup[1][:20]
    # Fix the case when two indicator phrases are consecutive: "ecog performance status 2".
    # The len(splitup) > 2 guard avoids an IndexError when the text simply ends
    # right after a single indicator phrase.
    if len(window) < 3 and len(splitup) > 2:
        window = (splitup[1] + splitup[2])[:20]

    # Locate the first token made up only of decimal digits. Tokens are runs of
    # characters that are neither whitespace nor one of ; . : ,
    number_span = None
    for token in re.finditer(r'[^;.:,\s]+', window):
        # isdecimal() rather than isdigit(): isdigit() is also true for characters
        # such as superscript digits, which int() cannot parse.
        if token.group().isdecimal():
            number_span = token.span()
            break
    if number_span is None:
        return string

    # Cut out exactly that token, so other numbers in the window that merely
    # contain the same digits (e.g. "71" when the PS is 1) stay intact.
    start, end = number_span
    window_no_number = window[:start] + window[end:]

    # Joining the split pieces drops every indicator phrase, including later ones
    rejoin = ''.join(splitup[1:])
    # Skip exactly the characters covered by the window; the window can be
    # shorter than 20 characters, and a fixed [20:] would then drop text.
    return splitup[0] + window_no_number + rejoin[len(window):]

In [None]:
## Remove performance status text
# Each note is passed through remove_ps twice. .assign() returns a new DataFrame,
# so the new column is never written into a slice of all_notes
# (which is what triggers pandas' SettingWithCopyWarning).
all_notes_valid_ps = all_notes_valid_ps.assign(
    text_no_ps=all_notes_valid_ps['text'].apply(remove_ps).apply(remove_ps)
)
all_notes_no_ps = all_notes_no_ps.assign(
    text_no_ps=all_notes_no_ps['text'].apply(remove_ps).apply(remove_ps)
)

In [None]:
## Overall design is to distinguish good ECOG (0 or 1) from poor ECOG (2-4)
## ECOG PS labels are strongly unbalanced, with the majority being 0-1

# NOTE(review): find_ps also matches "karnofsky"; if those values are on a 0-100
# scale where higher is better, they would all be labelled ps_high = 1 here - confirm.
# ps_high is 1 where ps > 1 and 0 otherwise. .assign() returns a new DataFrame,
# so the column is not written into a slice of all_notes (SettingWithCopyWarning).
all_notes_valid_ps = all_notes_valid_ps.assign(
    ps_high=(all_notes_valid_ps['ps'] > 1).astype('int64')
)

In [None]:
# Write both subsets to CSV, without the index column
for frame, out_path in (
    (all_notes_valid_ps, "all_clinical_notes (Valid PS).csv"),
    (all_notes_no_ps, "all_clinical_notes (No PS).csv"),
):
    frame.to_csv(out_path, index=False)