In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
import numpy as np
from acquire import remove_stopwords, basic_clean, tokenize, prep_and_split_data
from prepare_jag import basic_clean3
import re
from re import search
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow import keras
import io
import os
import shutil
import string

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

from nltk.corpus import stopwords
import nltk

Download the data from the [Kaggle Competition Site](https://www.kaggle.com/c/medicalnotes-2019/data)

# Data Dictionary
descriptor: the value held in the 'feature_text' column. These are features that describe the individual.

In [None]:
# Read csv files into a Pandas dataframe.
features = pd.read_csv('features.csv')

In [None]:
notes = pd.read_csv('patient_notes.csv')

In [None]:
train = pd.read_csv('train.csv')

In [None]:
train

In [None]:
# Get familiar with the 'features' dataframe.
features

# Set sights on target:
'feature_text' targeted

I will have to create a function that will iterate through the students' patient notes and identify the different ways different students express the descriptors.

Tentative plan: 
1. Rename {'case_num':'case', 'feature_text':'target'}
2. Rename {'pn_num':'note_id', 'case_num':'case', 'pn_history':'student_notes'}
3. Normalize the text in features.feature_text and notes.pn_history.
    * clean it
4. Create a dataframe that holds the original text and the clean.
5. Split data in train, validate, and test.

In [None]:
# Rename columns in the features dataframe.
features.rename(columns={'feature_num':'feature_id', 'case_num':'case', 'feature_text':'target'}, inplace=True)

In [None]:
# Verify
features

In [None]:
features.target.value_counts().head(50)

In [None]:
len(features.target)

In [None]:
# Rename columns in notes dataframe.
notes.rename(columns={'pn_num':'note_id', 'case_num':'case', 'pn_history':'student_notes'}, inplace=True)

In [None]:
# Verify
notes

In [None]:
# Check 'features' dataframe for null values and data types.
features.info()

# Takeaways
* The 'target' column holds values related to the individual patient. 
* There are no null values and the data types make sense.

In [None]:
# Check the type of values in the 'feature_text' column.
features.target.value_counts().head(50)

# Takeaways
* It seems as if they created a unique list of descriptors for each patient.

In [None]:
features.case.value_counts()

In [None]:
features.case.value_counts().describe()

# Takeaways
* Descriptors for each patient ranges from 9 - 18.
* Average amount of descriptors per patient is 14.

In [None]:
notes.case.value_counts()

# Takeaways
* Student notes for patient_3 has close to 10,000 submissions.

In [None]:
def prep_text2(df, column, extra_words=[], exclude_words=['no','i']):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    return df[['case', column, 'clean']]

In [None]:
prep_text2(notes, 'student_notes')

In [None]:
notes

In [None]:
notes.student_notes[0]

In [None]:
notes.clean[0]

# Takeaways
* All of the symbols in the original are causing the cleaned version to produce concatenated words which will must be fixed.
* I will use regular expression to convert all symbols into spaces. From there I can locate low value words and add them to the stopword list.

In [None]:
# Use regex to substitute everything that is not a number or leter with an empty space.
# re.sub(r"[\W]", ' ')

# Takeaways
* The regex method produces a more coherent output. I will use it on the entire column.

In [None]:
notes.student_notes

In [None]:
student_notes_words = ' '.join(notes.clean)

In [None]:
''' This line of code slows up the notebook. I will keep it commented out for now.'''
# Get a peak:
# student_notes_words

In [None]:
student_notes_corpus = student_notes_words

### Analyze the student notes

In [None]:
len(student_notes_corpus.split())

# Takeaways
* There is a grand total of 3,990,311 words written by students.
* The average reading speed for an adult is 200 - 250 words per minute.
* It would take the average person 15961.2 - 19952.60 minutes to read all this.
* 266.00 - 332.50 hours.
* 11.10 - 13.90 days.

In [None]:
word_frequencies = pd.Series(student_notes_corpus.split()).value_counts()

In [None]:
word_frequencies.tail(50)

# Takeaway
* Most of the words that only show up once are typos.

In [None]:
len(pd.Series(student_notes_corpus.split()).unique())

# Takeaways
* There are 44770 unique words that show up in student notes. Most of these could be typos.

In [None]:
features.target.head(50)

## Break down target into individual features

In [None]:
# Run text through 'basic_clean' function.
cleaned_targets = features.target.apply(basic_clean)

In [None]:
# Verify.
cleaned_targets

In [None]:
# Create a list of all individual targets.
lists_of_targets = []
for target in cleaned_targets:
    # This line of code will split targets that have the word 'or' in it at that word.
    lists_of_targets.append(list(re.split(r'\bor', target)))

In [None]:
# Create a list that separates nested lists. 
list_of_targets = []
for ailments in lists_of_targets:
    for ailment in ailments:
        list_of_targets.append(ailment)

In [None]:
list_of_targets

In [None]:
# Strip all whitespaces.
list_of_targets = [s.strip() for s in list_of_targets]

In [None]:
list_of_targets

In [None]:
# This function completes all the above tasks.
def boil_it_down(df, column):
    cleaned_column = df[column].apply(basic_clean)
    lists_of_targets = []
    for target in cleaned_column:
        lists_of_targets.append(list(re.split(r'\bor', target)))
    list_of_targets = []
    for ailments in lists_of_targets:
        for ailment in ailments:
            list_of_targets.append(ailment)
    list_of_targets = [s.strip() for s in list_of_targets]
    return list_of_targets

In [None]:
len(list_of_targets)

In [None]:
boil_it_down(features, 'target')

In [None]:
# Create a for loop that checks for perfect matches.
perfect_match = []
for ailment in list_of_targets:
    for note in notes.clean:
        if ailment in note:
            perfect_match.append(ailment)

In [None]:
pd.Series(perfect_match).value_counts()

In [None]:
len(pd.Series(perfect_match).value_counts())

# Takeaways
* Out of the 179 unique targets, 97 have shown up, word-for-word, in the student notes.

In [None]:
pd.Series(re.findall(r"\bno\b", list_of_targets[6])) + ' ' + list_of_targets[6].split()[1]

In [None]:
list_of_targets[5]

In [None]:
list_of_targets

In [None]:
new_list = []
for ailment in list_of_targets:
    new_list.append(remove_stopwords(ailment, exclude_words = ['no', 'i']))

In [None]:
new_list

In [None]:
len(new_list[0].split())

In [None]:
list_of_ailment_in_notes = []
for ailment in new_list:    
    for i in range(len(ailment.split())):
        if ailment.split()[i] in notes.clean[0]:
            list_of_ailment_in_notes.append(ailment.split()[i])

In [None]:
list_of_ailment_in_notes

In [None]:
pd.Series(list_of_ailment_in_notes).unique()

In [None]:
new_list

In [None]:
notes

In [None]:
list(cleaned_targets)

## Create a dataframe that combines all targets of a case into one list.

In [None]:
# Apply the 'basic_clean' function to the targets
features['cleaned_targets'] = features.target.apply(basic_clean)

In [None]:
# Create a list of targets for each case
case_0_targets = list(features[features.case == 0].cleaned_targets)
case_1_targets = list(features[features.case == 1].cleaned_targets)
case_2_targets = list(features[features.case == 2].cleaned_targets)
case_3_targets = list(features[features.case == 3].cleaned_targets)
case_4_targets = list(features[features.case == 4].cleaned_targets)
case_5_targets = list(features[features.case == 5].cleaned_targets)
case_6_targets = list(features[features.case == 6].cleaned_targets)
case_7_targets = list(features[features.case == 7].cleaned_targets)
case_8_targets = list(features[features.case == 8].cleaned_targets)
case_9_targets = list(features[features.case == 9].cleaned_targets)

In [None]:
case_targets = pd.DataFrame({'case':[n for n in np.arange(10)], 'targets':[case_0_targets,case_1_targets,case_2_targets,case_3_targets,case_4_targets,case_5_targets,case_6_targets,case_7_targets,case_8_targets,case_9_targets]})

In [None]:
case_targets

## Merge the newly created dataframe to the 'notes' dataframe.

In [None]:
df = notes.merge(case_targets, how='inner', on='case')

In [None]:
df

In [None]:
df.targets[0]

In [None]:
df.clean[0]

## Use stratified splits to ensure each case exists in the splits.

In [None]:
test_split = 0.1

# Initial train and test split.
train_df, test_df = train_test_split(
    df, test_size=test_split, stratify=df['case'].values,
)

# Splitting the test set further into validation and new test set.
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)

print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")
print(f"Number of rows in test set: {len(test_df)}")

# Prepare and split the full dataset.

In [2]:
train_df, validate_df, test_df = prep_and_split_data()

Number of rows in training set: 37931
Number of rows in validation set: 2108
Number of rows in test set: 2107


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Prep and split process:
    * Renamed columns.
    * Normalize text:
        - lowercase
        - add space before and after punctuation
    * Merged student notes data with feature text
    * Split data and stratify on case number

In [None]:
train_df

# Takeaways
* I just realized I might have a problem because the stopwords are removed from the student notes, but not the targets.
* I will run this through for the MVP and do it again with the stopwords removed from the targets as well.

## Multi-label binarization
Let's preprocess our labels using the [StringLookup](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup) layer.

In [3]:
targets = tf.ragged.constant(train_df['targets'].values)
lookup = tf.keras.layers.StringLookup(output_mode='multi_hot')
lookup.adapt(targets)
vocab = lookup.get_vocabulary()

2022-02-28 15:14:06.722979: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
len(train_df.targets)

37931

In [5]:
vocab

['[UNK]',
 'female',
 'nausea',
 '35 year',
 'male',
 'post prandial bloating or fullness with meals',
 'nsaid use or nonsteroidal anti inflammatory drug use',
 'no blood in stool',
 'minimal to no change with tums',
 'intermittent',
 'getting worse or progressive or symptoms now daily',
 'fhx of pud or family history of peptic ulcer disease',
 'epigastric discomfort',
 'duration 2 months',
 'darker bowel movements',
 'burning or gnawing or burning and gnawing',
 'awakens at night',
 '2 to 3 beers a week',
 'recent visit to emergency department with negative workup',
 'onset 5 years ago',
 'no illicit drug use',
 'no chest pain',
 'no caffeine use',
 'increased stress',
 'increased frequency recently',
 'feels hot or feels clammy',
 'fatigue or difficulty concentrating',
 'episodes of heart racing',
 'episodes last 15 to 30 minutes',
 'episode of hand numbness or episode of finger numbness',
 'associated throat tightness',
 'associated sob or associated shortness of breath',
 'associat

In [6]:
def invert_multi_hot(encoded_labels):
    '''Reverse a single multi-hot encoded label to a tuple of vocab terms.'''
    hot_indices = np.argwhere(encoded_labels == 1.0)[...,0]
    return np.take(vocab, hot_indices)

print('Vocabulary:\n')
print(vocab)

Vocabulary:

['[UNK]', 'female', 'nausea', '35 year', 'male', 'post prandial bloating or fullness with meals', 'nsaid use or nonsteroidal anti inflammatory drug use', 'no blood in stool', 'minimal to no change with tums', 'intermittent', 'getting worse or progressive or symptoms now daily', 'fhx of pud or family history of peptic ulcer disease', 'epigastric discomfort', 'duration 2 months', 'darker bowel movements', 'burning or gnawing or burning and gnawing', 'awakens at night', '2 to 3 beers a week', 'recent visit to emergency department with negative workup', 'onset 5 years ago', 'no illicit drug use', 'no chest pain', 'no caffeine use', 'increased stress', 'increased frequency recently', 'feels hot or feels clammy', 'fatigue or difficulty concentrating', 'episodes of heart racing', 'episodes last 15 to 30 minutes', 'episode of hand numbness or episode of finger numbness', 'associated throat tightness', 'associated sob or associated shortness of breath', 'associated nausea', 'associ

### Separate the individual targets from the label pool and then use it to represent a given label set with 0's and 1's

In [7]:
sample_label = train_df['targets'].iloc[0]
print(f'Original label: {sample_label}')

label_binarized = lookup([sample_label])
print(f'Label-binarized representation: {label_binarized}')

Original label: ['prior normal periods', 'last pap smear i year ago', 'iud', 'sexually active', 'vaginal dryness', 'irregular menses', 'recent nausea vomiting or recent flulike symptoms', 'no premenstrual symptoms', 'female', 'stress', 'lmp 2 months ago or last menstrual period 2 months ago', 'hot flashes', 'irregular flow or irregular frequency or irregular intervals', 'onset 3 years ago', 'heavy sweating', 'sleep disturbance or early awakenings', '44 year']
Label-binarized representation: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [8]:
label_binarized

<tf.Tensor: shape=(1, 132), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)>

## Data preprocessing and [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) objects

In [9]:
train_df['clean'].apply(lambda x: len(x.split(" "))).describe()

count    37931.000000
mean        97.789486
std         16.338358
min          7.000000
25%         88.000000
50%        100.000000
75%        109.000000
max        150.000000
Name: clean, dtype: float64

# Takeaways
* Half of the student notes have a length of 97 words.

In [10]:
max_seqlen = 97
batch_size = 128
padding_token = '<pad>'
auto = tf.data.AUTOTUNE

def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe['targets'].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe['clean'].values, label_binarized)
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)

## Prepare the [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) objects.

In [11]:
train_dataset = make_dataset(train_df, is_train=True)
validate_dataset = make_dataset(validate_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)

## Preview the dataset

In [12]:
text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f'Student note: {text}')
    print(f'Targets: {invert_multi_hot(label[0])}')
    print(' ')

Student note: b'mr hamilton 35 yo presents 2 month history gnawing burning epigastric pain rates pain 5 10 frequency pain increasing 2 times day waking 3 times per week night pain associated nausea bloating emesis noticed melena last 2 weeks also decreased appetite due pain noticed weight loss fever sob fatigue tums used help pain stopped working takes motrin 1 time per week currently several stressors including divorce working high heights construction job concerned pain begun awakening sleep pt counseled ros negative except meds pmh psh fh sh no medical problems no surgeries uncle bleeding ulcer current 5 1 ppd smoker x20 yrs'
Targets: ['nausea' '35 year' 'male' 'post prandial bloating or fullness with meals'
 'nsaid use or nonsteroidal anti inflammatory drug use'
 'no blood in stool' 'minimal to no change with tums' 'intermittent'
 'getting worse or progressive or symptoms now daily'
 'fhx of pud or family history of peptic ulcer disease'
 'epigastric discomfort' 'duration 2 months'

## Vectorization
Vectorize the text to represent it as a quantitative value. We will use [TextVectorization layer](https://keras.io/api/layers/preprocessing_layers/text/text_vectorization)

In [13]:
# Get unique words in student notes.
vocabulary = set()
train_df['clean'].str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

42172


## Now we create our vectorization layer and map() to the [tf.data.Datasets](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) created earlier.

In [16]:
text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode='tf_idf'
)

with tf.device('/CPU:0'):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

train_dataset = train_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
validate_dataset = validate_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
test_dataset = test_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)

ValueError: in user code:

    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/engine/base_preprocessing_layer.py", line 118, in adapt_step  *
        self.update_state(data)
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/layers/preprocessing/text_vectorization.py", line 431, in update_state  **
        self._lookup_layer.update_state(self._preprocess(data))
    File "/usr/local/anaconda3/lib/python3.8/site-packages/keras/layers/preprocessing/text_vectorization.py", line 520, in _preprocess
        raise ValueError(

    ValueError: When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 42172) with rank=2


## Create a text classification model

In [17]:
def make_model():
    shallow_mlp_model = keras.Sequential(
        [
            layers.Dense(512, activation="relu"),
            layers.Dense(256, activation="relu"),
            layers.Dense(lookup.vocabulary_size(), activation="softmax"),
        ]  # More on why "sigmoid" has been used here in a moment.
    )
    return shallow_mlp_model

## Train our model

In [None]:
epochs = 10

shallow_mlp_model = make_model()
shallow_mlp_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["categorical_accuracy"]
)

history = shallow_mlp_model.fit(
    train_dataset, validation_data=validation_dataset, epochs=epochs
)


def plot_result(item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_result("loss")
plot_result("categorical_accuracy")

## Evaluate the model

In [None]:
_, categorical_acc = shallow_mlp_model.evaluate(test_dataset)
print(f"Categorical accuracy on the test set: {round(categorical_acc * 100, 2)}%.")

In [None]:
# Create a model for inference.
model_for_inference = keras.Sequential([text_vectorizer, shallow_mlp_model])

# Create a small dataset just for demoing inference.
inference_dataset = make_dataset(test_df.sample(100), is_train=False)
text_batch, label_batch = next(iter(inference_dataset))
predicted_probabilities = model_for_inference.predict(text_batch)

# Perform inference.
for i, text in enumerate(text_batch[0]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Student notes: {text}")
    print(f"Targets(s): {invert_multi_hot(label[0])}")
    predicted_proba = [proba for proba in predicted_probabilities[i]]
    top_15_labels = [
        x
        for _, x in sorted(
            zip(predicted_probabilities[i], lookup.get_vocabulary()),
            key=lambda pair: pair[0],
            reverse=True,
            
        )
    ][:15]
    print(f"Predicted Targets(s): ({', '.join([label for label in all_labels])})")
    print(" ")

# Takeaways
* My model seems to be predicting every target for every case. I will have to look into this tomorrow.

# Here I will conduct word embedding

In [None]:
train_df

## Let's try on case 0.

In [None]:
case0 = train_df[train_df.case == 0]