### Data extraction
Extract data from MIMIC-III

In [1]:
import pandas as pd
import nltk.data
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Porter stemmer instance (not referenced by any cell visible in this notebook).
stemmer = PorterStemmer()
# Load the English Punkt tokenizer pickle from the local NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [2]:
# Step 1: read the two SEPSIS cohort tables (alive first, then dead).
alive_df, dead_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/inputs/SEPSIS/alive_df.csv',
        'data/inputs/SEPSIS/dead_df.csv',
    )
)

In [3]:
# Report the number of distinct SUBJECT_ID_x values per cohort.
# label_0 is the dead cohort and label_1 the alive cohort.
print(f"Number of patients in label_0: {dead_df['SUBJECT_ID_x'].nunique()}")
print(f"Number of patients in label_1: {alive_df['SUBJECT_ID_x'].nunique()}")

Number of patients in label_0: 260
Number of patients in label_1: 842


In [4]:
import json
# Read the word dictionary from JSON.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('data/inputs/SEPSIS/word_dict.json', 'r', encoding='utf-8') as fp:
    word_dict = json.load(fp)

In [5]:
from functions import find_cooc_per_patient

KeyboardInterrupt: 

In [None]:
# Run find_cooc_per_patient on each cohort; the suffix matches the label
# numbering used above (_0 = dead_df, _1 = alive_df).
# NOTE(review): the meaning of the third argument (0.15) is not visible here —
# presumably a threshold; confirm against functions.find_cooc_per_patient.
patient_node_0, patient_cooc_0, patient_note_num_0 = find_cooc_per_patient(dead_df, word_dict, 0.15)
patient_node_1, patient_cooc_1, patient_note_num_1 = find_cooc_per_patient(alive_df, word_dict, 0.15)

In [None]:
# Union of every node that appears for any patient in either cohort
# (label_0 entries are visited first, then label_1).
patient_node_set = {
    node
    for nodes_by_patient in (patient_node_0, patient_node_1)
    for patient_nodes in nodes_by_patient.values()
    for node in patient_nodes
}
        

In [None]:
# Load the per-node table for the SEPSIS cohort from the working directory.
# (An earlier variant read 'patient_square_node_id_data.csv' instead.)
# patient_square_node_data = pd.read_csv('patient_square_node_id_data.csv')
patient_square_node_data = pd.read_csv('patient_node_data_sepsis.csv')
# Display the loaded table.
patient_square_node_data

In [None]:
# Re-index the node table by its "node" column so rows can be looked up by node.
patient_square_node_id_data = patient_square_node_data.set_index("node")
# Disabled override that would set every row's subject to "positive".
# patient_square_node_id_data['subject'] = "positive"
# Display the re-indexed table.
patient_square_node_id_data

In [None]:
# Series of "subject" values indexed by node; used later to colour the plots.
patient_subjects = patient_square_node_id_data["subject"]
# Display the series.
patient_subjects

### word2vec embeddings

In [None]:
import os
import numpy as np
# Locate the input directory for the chosen disease and load the word2vec embeddings.
disease_name = 'SEPSIS'
inputs_path = os.path.join('data/inputs/', disease_name)
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file —
# only load files from a trusted source.
word2vec_emb = np.load(os.path.join(inputs_path,'word2vec_emb.npy'), allow_pickle=True)

In [None]:
# Spot-check: unwrap the loaded array with [()] and show the entry stored under 'cmo'.
word2vec_emb[()]['cmo']

In [None]:
# Collect the word2vec vector of every patient node, in set-iteration order.
# Nodes without a vector are printed and skipped, so emb_list can end up
# shorter than patient_node_set.

# Unwrap the loaded array once instead of on every loop iteration.
word2vec_vectors = word2vec_emb[()]

emb_list = []

for node in tqdm(list(patient_node_set)):
    if node not in word2vec_vectors:
        print(node)
    else:
        emb_list.append(word2vec_vectors[node])
    

In [None]:
# Number of nodes for which a vector was found.
len(emb_list)

In [None]:
# Peek at the first collected vector.
emb_list[0]

### Visualise Node Embeddings generated by word2vec

In [None]:
# Node IDs that have a word2vec vector, in the same iteration order that was
# used to build emb_list. Taking every node of patient_node_set here would make
# the labels longer than (and shifted against) the embedding rows whenever a
# node was skipped for lacking a vector.
patient_node_ids = [node for node in patient_node_set if node in word2vec_emb[()]]


In [None]:
# Look up each node's subject label, in patient_node_ids order, as a categorical.
# NOTE(review): the previous comment referred to gensim/StellarGraph ordering;
# neither library is used in the cells visible here.
patient_node_targets = patient_subjects.loc[patient_node_ids].astype("category")

In [None]:
""" As patient_node_embeddings contains embeddings from all possible words
    we narrow down the scope to only words that are contained in patient_node_ids
    from emb_list
"""
# Convert the collected per-node vectors into a single array.
patient_node_embeddings = np.asarray(emb_list)

# emb_list skips nodes that have no vector, so it can be shorter than the label
# series; fail here with a clear message instead of plotting misaligned colours.
if len(patient_node_embeddings) != len(patient_node_targets):
    raise ValueError(
        f"{len(patient_node_embeddings)} embeddings but {len(patient_node_targets)} labels; "
        "nodes without an embedding must be dropped from patient_node_ids"
    )

In [None]:
# Confirm the container type produced by np.asarray.
type(patient_node_embeddings)

In [None]:
# Dimensions of the embedding array.
patient_node_embeddings.shape

In [None]:
# Apply t-SNE transformation on node embeddings:
# project to 2 components, with random_state fixed at 42 to seed the run.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
patient_node_embeddings_2d = tsne.fit_transform(patient_node_embeddings)

In [None]:
# Draw the 2-D t-SNE points, coloured by the category code of each node's subject.
import matplotlib.pyplot as plt
alpha = 0.7  # point transparency

plt.figure(figsize=(10, 8))
plt.scatter(
    patient_node_embeddings_2d[:, 0],
    patient_node_embeddings_2d[:, 1],
    c=patient_node_targets.cat.codes,
    cmap="jet",
    alpha=alpha,  # use the variable defined above instead of repeating the literal
)
plt.title("t-SNE projection of word2vec node embeddings")
plt.show()

### Fasttext

In [None]:
import os
import numpy as np
# Locate the input directory for the chosen disease and load the fastText embeddings.
disease_name = 'SEPSIS'
inputs_path = os.path.join('data/inputs/', disease_name)
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file —
# only load files from a trusted source.
fasttext_emb = np.load(os.path.join(inputs_path,'fasttext_emb.npy'), allow_pickle=True)

In [None]:
# Collect the fastText vector of every patient node, in set-iteration order.
# Nodes without a vector are printed and skipped, so emb_list can end up
# shorter than patient_node_set.

# Unwrap the loaded array once instead of on every loop iteration.
fasttext_vectors = fasttext_emb[()]

emb_list = []

for node in tqdm(list(patient_node_set)):
    if node not in fasttext_vectors:
        print(node)
    else:
        emb_list.append(fasttext_vectors[node])

In [None]:
# Show only the first few vectors; echoing the whole list floods the cell output.
emb_list[:3]

### Visualise Node Embeddings generated by fasttext

In [None]:
# Node IDs that have a fastText vector, in the same iteration order that was
# used to build emb_list. Taking every node of patient_node_set here would make
# the labels longer than (and shifted against) the embedding rows whenever a
# node was skipped for lacking a vector.
patient_node_ids = [node for node in patient_node_set if node in fasttext_emb[()]]

# Look up each kept node's subject label, in patient_node_ids order, as a categorical.
patient_node_targets = patient_subjects.loc[patient_node_ids].astype("category")

In [None]:
""" As patient_node_embeddings contains embeddings from all possible words
    we narrow down the scope to only words that are contained in patient_node_ids
    from emb_list
"""
# Convert the collected per-node vectors into a single array.
patient_node_embeddings = np.asarray(emb_list)

# emb_list skips nodes that have no vector, so it can be shorter than the label
# series; fail here with a clear message instead of plotting misaligned colours.
if len(patient_node_embeddings) != len(patient_node_targets):
    raise ValueError(
        f"{len(patient_node_embeddings)} embeddings but {len(patient_node_targets)} labels; "
        "nodes without an embedding must be dropped from patient_node_ids"
    )

In [None]:
# Confirm the container type produced by np.asarray.
type(patient_node_embeddings)

In [None]:
# Dimensions of the embedding array.
patient_node_embeddings.shape
# NOTE(review): the value below was recorded for the pneumonia data, not for
# the SEPSIS data loaded in this notebook.
# (633, 128) for pneumonia

In [None]:
# Apply t-SNE transformation on node embeddings:
# project to 2 components, with random_state fixed at 42 to seed the run.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
patient_node_embeddings_2d = tsne.fit_transform(patient_node_embeddings)

In [None]:
# Draw the 2-D t-SNE points, coloured by the category code of each node's subject.
import matplotlib.pyplot as plt
alpha = 0.7  # point transparency

plt.figure(figsize=(10, 8))
plt.scatter(
    patient_node_embeddings_2d[:, 0],
    patient_node_embeddings_2d[:, 1],
    c=patient_node_targets.cat.codes,
    cmap="jet",
    alpha=alpha,  # use the variable defined above instead of repeating the literal
)
plt.title("t-SNE projection of fastText node embeddings")
plt.show()

### GloVe

In [None]:
import os
import numpy as np
# Locate the input directory for the chosen disease and load the GloVe embeddings.
disease_name = 'SEPSIS'
inputs_path = os.path.join('data/inputs/', disease_name)
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file —
# only load files from a trusted source.
glove_emb = np.load(os.path.join(inputs_path,'glove_emb.npy'), allow_pickle=True)

### Visualise Node Embeddings generated by GloVe

In [None]:
# Collect the GloVe vector of every patient node, in set-iteration order.
# Nodes without a vector are printed and skipped, so emb_list can end up
# shorter than patient_node_set.

# Unwrap the loaded array once instead of on every loop iteration.
glove_vectors = glove_emb[()]

emb_list = []

for node in tqdm(list(patient_node_set)):
    if node not in glove_vectors:
        print(node)
    else:
        emb_list.append(glove_vectors[node])

In [None]:
# Node IDs that have a GloVe vector, in the same iteration order that was
# used to build emb_list. Taking every node of patient_node_set here would make
# the labels longer than (and shifted against) the embedding rows whenever a
# node was skipped for lacking a vector.
patient_node_ids = [node for node in patient_node_set if node in glove_emb[()]]

# Look up each kept node's subject label, in patient_node_ids order, as a categorical.
patient_node_targets = patient_subjects.loc[patient_node_ids].astype("category")

In [None]:
""" As patient_node_embeddings contains embeddings from all possible words
    we narrow down the scope to only words that are contained in patient_node_ids
    from emb_list
"""
# Convert the collected per-node vectors into a single array.
patient_node_embeddings = np.asarray(emb_list)

# emb_list skips nodes that have no vector, so it can be shorter than the label
# series; fail here with a clear message instead of plotting misaligned colours.
if len(patient_node_embeddings) != len(patient_node_targets):
    raise ValueError(
        f"{len(patient_node_embeddings)} embeddings but {len(patient_node_targets)} labels; "
        "nodes without an embedding must be dropped from patient_node_ids"
    )

In [None]:
# Confirm the container type produced by np.asarray.
type(patient_node_embeddings)

In [None]:
# Dimensions of the embedding array.
patient_node_embeddings.shape

In [None]:
# Apply t-SNE transformation on node embeddings:
# project to 2 components, with random_state fixed at 42 to seed the run.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
patient_node_embeddings_2d = tsne.fit_transform(patient_node_embeddings)

In [None]:
# Draw the 2-D t-SNE points, coloured by the category code of each node's subject.
import matplotlib.pyplot as plt
alpha = 0.7  # point transparency

plt.figure(figsize=(10, 8))
plt.scatter(
    patient_node_embeddings_2d[:, 0],
    patient_node_embeddings_2d[:, 1],
    c=patient_node_targets.cat.codes,
    cmap="jet",
    alpha=alpha,  # use the variable defined above instead of repeating the literal
)
plt.title("t-SNE projection of GloVe node embeddings")
plt.show()

### seq2vec not weighted

In [None]:
import os
import numpy as np
# Locate the input directory for the chosen disease and load the non-weighted
# sequence2vec embeddings.
disease_name = 'SEPSIS'
inputs_path = os.path.join('data/inputs/', disease_name)
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file —
# only load files from a trusted source.
sequence2vec_notWeighted = np.load(os.path.join(inputs_path,'sequence2vec_notWeighted.npy'), allow_pickle=True)

### Visualise Node Embeddings generated by seq2vec (not weighted)


In [None]:
# Collect the non-weighted sequence2vec vector of every patient node, in
# set-iteration order. Nodes without a vector are printed and skipped, so
# emb_list can end up shorter than patient_node_set.

# Unwrap the loaded array once instead of on every loop iteration.
seq2vec_nw_vectors = sequence2vec_notWeighted[()]

emb_list = []

for node in tqdm(list(patient_node_set)):
    if node not in seq2vec_nw_vectors:
        print(node)
    else:
        emb_list.append(seq2vec_nw_vectors[node])

In [None]:
# Node IDs that have a non-weighted sequence2vec vector, in the same iteration
# order that was used to build emb_list. Taking every node of patient_node_set
# here would make the labels longer than (and shifted against) the embedding
# rows whenever a node was skipped for lacking a vector.
patient_node_ids = [node for node in patient_node_set if node in sequence2vec_notWeighted[()]]

# Look up each kept node's subject label, in patient_node_ids order, as a categorical.
patient_node_targets = patient_subjects.loc[patient_node_ids].astype("category")

In [None]:
""" As patient_node_embeddings contains embeddings from all possible words
    we narrow down the scope to only words that are contained in patient_node_ids
    from emb_list
"""
# Convert the collected per-node vectors into a single array.
patient_node_embeddings = np.asarray(emb_list)

# emb_list skips nodes that have no vector, so it can be shorter than the label
# series; fail here with a clear message instead of plotting misaligned colours.
if len(patient_node_embeddings) != len(patient_node_targets):
    raise ValueError(
        f"{len(patient_node_embeddings)} embeddings but {len(patient_node_targets)} labels; "
        "nodes without an embedding must be dropped from patient_node_ids"
    )

In [None]:
# Apply t-SNE transformation on node embeddings:
# project to 2 components, with random_state fixed at 42 to seed the run.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
patient_node_embeddings_2d = tsne.fit_transform(patient_node_embeddings)

In [None]:
# Draw the 2-D t-SNE points, coloured by the category code of each node's subject.
import matplotlib.pyplot as plt
alpha = 0.7  # point transparency

plt.figure(figsize=(10, 8))
plt.scatter(
    patient_node_embeddings_2d[:, 0],
    patient_node_embeddings_2d[:, 1],
    c=patient_node_targets.cat.codes,
    cmap="jet",
    alpha=alpha,  # use the variable defined above instead of repeating the literal
)
plt.title("t-SNE projection of seq2vec (not weighted) node embeddings")
plt.show()

### seq2vec

In [None]:
# Load the sequence2vec embeddings; relies on os, np and inputs_path set by earlier cells.
# NOTE(review): allow_pickle=True unpickles arbitrary objects from the file —
# only load files from a trusted source.
sequence2vec = np.load(os.path.join(inputs_path,'sequence2vec.npy'), allow_pickle=True)

In [None]:
# Collect the sequence2vec vector of every patient node, in set-iteration order.
# Nodes without a vector are printed and skipped, so emb_list can end up
# shorter than patient_node_set.

# Unwrap the loaded array once instead of on every loop iteration.
seq2vec_vectors = sequence2vec[()]

emb_list = []

for node in tqdm(list(patient_node_set)):
    if node not in seq2vec_vectors:
        print(node)
    else:
        emb_list.append(seq2vec_vectors[node])

In [None]:
# Node IDs that have a sequence2vec vector, in the same iteration order that
# was used to build emb_list. Taking every node of patient_node_set here would
# make the labels longer than (and shifted against) the embedding rows whenever
# a node was skipped for lacking a vector.
patient_node_ids = [node for node in patient_node_set if node in sequence2vec[()]]

# Look up each kept node's subject label, in patient_node_ids order, as a categorical.
patient_node_targets = patient_subjects.loc[patient_node_ids].astype("category")

In [None]:
""" As patient_node_embeddings contains embeddings from all possible words
    we narrow down the scope to only words that are contained in patient_node_ids
    from emb_list
"""
# Convert the collected per-node vectors into a single array.
patient_node_embeddings = np.asarray(emb_list)

# emb_list skips nodes that have no vector, so it can be shorter than the label
# series; fail here with a clear message instead of plotting misaligned colours.
if len(patient_node_embeddings) != len(patient_node_targets):
    raise ValueError(
        f"{len(patient_node_embeddings)} embeddings but {len(patient_node_targets)} labels; "
        "nodes without an embedding must be dropped from patient_node_ids"
    )

In [None]:
# Apply t-SNE transformation on node embeddings:
# project to 2 components, with random_state fixed at 42 to seed the run.
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
patient_node_embeddings_2d = tsne.fit_transform(patient_node_embeddings)

In [None]:
# Draw the 2-D t-SNE points, coloured by the category code of each node's subject.
import matplotlib.pyplot as plt
alpha = 0.7  # point transparency

plt.figure(figsize=(10, 8))
plt.scatter(
    patient_node_embeddings_2d[:, 0],
    patient_node_embeddings_2d[:, 1],
    c=patient_node_targets.cat.codes,
    cmap="jet",
    alpha=alpha,  # use the variable defined above instead of repeating the literal
)
plt.title("t-SNE projection of seq2vec node embeddings")
plt.show()