In [2]:
# Describe the disease
import os
import pandas as pd
import numpy as np
import json
import tensorflow as tf
import matplotlib.pyplot as plt
import functions
from functions import extract_data, count_notes_per_patient, logger, count_words_per_patient, find_frequent_word, find_cooc_per_patient
from functions import cooc_log_odd_score, sequence2vec, other_emb
from functions import create_graphs_lists, train_model, create_graph
from nltk.stem import PorterStemmer
from sklearn import model_selection

In [17]:
# Input vars --->
disease_name = 'SEPSIS'
database_path = '../MIMIC-III'
inputs_path = os.path.join('data/inputs/', disease_name)

patient_id_to_num_notes = {}

number_of_patients = {}
note_appearance_counter = {}
## Step 4
n_fold = float(3)
threshold = float(0.01)
frequent_word_lists = {}

min_sup = 0.15
# Input vars ---<

In [18]:
alive_df = pd.read_csv(os.path.join(inputs_path,'alive_df.csv'))
dead_df = pd.read_csv(os.path.join(inputs_path,'dead_df.csv'))

logger.info(f"Number of patients in label_0: {dead_df['SUBJECT_ID_x'].nunique()}")
logger.info(f"Number of patients in label_1: {alive_df['SUBJECT_ID_x'].nunique()}")

2021-01-22 16:04:26,106 - Number of patients in label_0: 260
2021-01-22 16:04:26,108 - Number of patients in label_1: 842


In [19]:
alive_df

Unnamed: 0,SUBJECT_ID_x,TEXT
0,94,Probable atrial fibrillation. Downsloping ST s...
1,94,Sinus rhythm. First degree A-V block. Probable...
2,94,[**2176-2-25**] 11:12 AM\n CTA CHEST W&W/O C &...
3,94,[**2176-2-25**] 2:20 PM\n CHEST (PORTABLE AP);...
4,94,[**2176-2-25**] 5:04 PM\n CHEST PORT. LINE PLA...
...,...,...
29131,99973,TITLE:\n Chief Complaint:\n 24 Hour Events...
29132,99973,Chief Complaint:\n I saw and examined the pa...
29133,99973,This is a 65 yr woman with nka who was admitte...
29134,99973,[**2180-11-29**] 3:43 PM\n HIP UNILAT MIN 2 VI...


In [20]:
# import text handling tool
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm
from collections import Counter

import sys
import re
import csv
import os.path

# define constants
# RESULT_HEADER = "WORD, FREQUENCY\n"
MIN_SEQ_LEN = 4
USE_1_N_SEQ = 2


# words that do not have meaning (can be modified later)
USELESS_WORDS = ["a", "the", "he", "she", ",", ".", "?", "!", ":", ";", "+", "*", "**"\
                 "your", "you"]

# count up the frequency of every word in every disease file
stemmer = PorterStemmer()
# create set of words to ignore in text
stop_words = set(stopwords.words('english'))

for word in USELESS_WORDS:
    stop_words.add(word)

Unnamed: 0,SUBJECT_ID_x,TEXT
0,111,PATIENT/TEST INFORMATION:\nIndication: s/p PEA...
1,111,Compared to the previous tracing QRS voltage i...
2,111,"Normal sinus rhythm, rate 80. Biatrial abnorm..."
3,111,Normal sinus rhythm. Q waves in leads V1-V2 c...
4,111,Chief Complaint: respiratory distress\n HPI...
...,...,...
46129,99937,[**2128-5-11**] 11:57 AM\n CHEST (PORTABLE AP)...
46130,99937,[**2128-5-11**] 1:44 PM\n CTA CHEST W&W/O C&RE...
46131,99937,Normal sinus rhythm. Compared to tracing #1 no...
46132,99937,[**2128-5-12**] 7:27 AM\n CHEST (PORTABLE AP) ...


In [27]:
import pickle
disease_df = pd.concat([dead_df, alive_df])
stemmed_words_dict = {}
for index, row in tqdm(disease_df.iterrows(), total=disease_df.shape[0]):
    note = re.sub(r'\[\*\*(.*?)\*\*\]|[_,\d\*:~=\.\-\+\\/]+', ' ', row['TEXT'])
    tokenized_note = word_tokenize(note)
    for word in tokenized_note:
        stemmed_word = stemmer.stem(word.lower())
        if not stemmed_word in stop_words:
            if stemmed_word in all_words_set:
                # Found a word. Put it into a dictionary

                if stemmed_word in stemmed_words_dict:
                    stemmed_word_set = stemmed_words_dict[stemmed_word]["words"]
                    if word not in stemmed_word_set:
                        stemmed_word_set.add(word.lower())
                        notes_list = stemmed_words_dict[stemmed_word]["notes"]
                        notes_list.append(row['TEXT'])
                        stemmed_words_dict[stemmed_word] = {"words" : stemmed_word_set, "notes" : notes_list}
                else:
                    stemmed_words_dict[stemmed_word] = {"words" : {word.lower()}, "notes" : [row['TEXT']]}

# https://stackoverflow.com/questions/7100125/storing-python-dictionaries
# Write pickle file
with open('stemmed_words_dict_SEPSIS.pickle', 'wb') as fp:
    pickle.dump(stemmed_words_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
# Read pickle file
# with open('data.pickle', 'rb') as fp:
#     data = pickle.load(fp)    


100%|██████████| 40684/40684 [04:36<00:00, 147.38it/s]


In [15]:
for item in stemmed_words_dict.items():
    print(item)
    break

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [28]:
len(stemmed_words_dict)

680

In [None]:
word = "PEA"
stemmed_word = 'pea'
stemmed_words_dict[stemmed_word] = {"words" : word.lower()}
print(stemmed_words_dict)
print(stemmed_words_dict['pea']['words'])

In [None]:
stemmed_words_dict

In [None]:
all_words_set

In [None]:
# # Read json
with open(os.path.join(inputs_path,'word_dict.json'), 'r') as fp:
    word_dict = json.load(fp)

In [25]:
# Read json
with open(os.path.join(inputs_path,'patient_node_0.json'), 'r') as fp:
    patient_node_0 = json.load(fp)
with open(os.path.join(inputs_path,'patient_node_1.json'), 'r') as fp:
    patient_node_1 = json.load(fp)
# Read txt with tuples
dic = ''
with open(os.path.join(inputs_path,'patient_cooc_0.txt'),'r') as f:
         for i in f.readlines():
            dic=i #string
patient_cooc_0 = eval(dic) # this is orignal dict with instace dict

dic = ''
with open(os.path.join(inputs_path,'patient_cooc_1.txt'),'r') as f:
         for i in f.readlines():
            dic=i #string
patient_cooc_1 = eval(dic) # this is orignal dict with instace dict

# Clean empty patients taht do not have any co-occurrences
patient_cooc_0 = {k: v for k, v in patient_cooc_0.items() if bool(v)}
patient_cooc_1 = {k: v for k, v in patient_cooc_1.items() if bool(v)}

# Step 6
logger.info("Get and normalize weights in co-occurrences...")
patient_cooc_set, normalized_cooc_odd_scores = cooc_log_odd_score(patient_cooc_0, patient_cooc_1, )

2021-01-22 16:05:07,186 - Get and normalize weights in co-occurrences...


In [None]:
normalized_cooc_odd_scores

In [None]:
patient_cooc_0

In [None]:
# Load npy file
sequence2vec = np.load(os.path.join(inputs_path,'sequence2vec.npy'), allow_pickle=True)

In [None]:
sequence2vec[()]['NaN'] = np.zeros(128)
sequence2vec[()]['NaN']

In [None]:
len(all_words_set)

In [None]:
print(len(word_dict))
print(word_dict)

In [26]:
all_words_set = set()
for item in patient_cooc_set:
    all_words_set.add(item[0])
    all_words_set.add(item[1])

print(len(all_words_set))

680


In [None]:
graphs, graph_labels, train_index, test_index = create_graphs_lists(patient_cooc_0, patient_cooc_1, normalized_cooc_odd_scores, sequence2vec)

In [None]:
graph_labels


In [None]:
graph_labels.value_counts()

In [None]:
item_counts = graph_labels[1].value_counts()
print(item_counts)

In [None]:
print(graphs[52].__dict__)
print()
print(graphs[52].__dir__())

In [None]:
graphs[52].edges()

In [None]:
tmp_graph = graphs[52]
print(tmp_graph.info())

In [None]:
for i in tmp_graph.nodes():
    print(i)

In [None]:
graphs[52]._edge_weights('pea', 'arrest')

In [None]:
print(graphs[52].info())

In [None]:
# Create a list of patient's dictionaries
patient_list_of_dicts = []
for p in patient_cooc_0.items():
    patient_list_of_dicts.append(p[1])
    
for p in patient_cooc_1.items():
    patient_list_of_dicts.append(p[1])
    
len(patient_list_of_dicts)

In [None]:
tmp_graphs = copy.deepcopy(graphs)
# tmp_graphs.pop()
print(len(tmp_graphs))
print(len(graphs))

In [None]:
for p in graphs:
    print(p.nodes())
    break

In [None]:
for word in all_words_set:
    print("Word:", word)
    for p_idx, p_graph in enumerate(graphs):
        # p_idx: num of a patient in a graphs list
        # p_graph: graph of that patient
        if word in p_graph.nodes():
            print(p_idx)
            patient_nan_dict = create_nan_patient(patient_list_of_dicts[p_idx], word)
            patient_nan_graph = create_graph(patient_nan_dict,normalized_cooc_odd_scores, sequence2vec)
            tmp_graphs[p_idx] = patient_nan_graph
            break
    break
            

In [None]:
print(graphs[467].edges())
print()
print(tmp_graphs[467].edges())

In [None]:
# Check dict in a list and dict in a graph
# They should be the same
if pa

In [None]:
p_cooc = {('oob', 'stair'): 18.0, ('ctx', 'stair'): 18.0, ('ctx', 'oob'): 18.0, ('ambul', 'ctx'): 18.0, ('ambul', 'stair'): 18.0, ('ambul', 'oob'): 18.0, ('phenylephrin', 'video'): 15.0, ('barium', 'video'): 15.0}
p_cooc

In [None]:
# create a new dict with NaN, instead of searched word
# This dictionary then becomes an input to create a new graph

def create_nan_patient(p_cooc, search_w):
    new_dict = {}
    for cooc in p_cooc:
        if search_w in cooc[0]:
            new_dict[('NaN', cooc[1])] = p_cooc[cooc]
        elif search_w in cooc[1]:
            new_dict[(cooc[0], 'NaN')] = p_cooc[cooc]
        else:
            new_dict[cooc] = p_cooc[cooc]
    return new_dict

# search_w = 'stair'
# new_dict = {}
# for cooc in p_cooc:
#     if search_w in cooc[0]:
#         new_dict[('NaN', cooc[1])] = p_cooc[cooc]
#     elif search_w in cooc[1]:
#         new_dict[(cooc[0], 'NaN')] = p_cooc[cooc]
#     else:
#         new_dict[cooc] = p_cooc[cooc]
# new_dict

In [None]:
new_test_index = np.concatenate((train_index, test_index))
new_test_index

In [None]:
patient_idx_0 = [35]
for idx in patient_idx_0:
    for num, item in enumerate(patient_tmp_0):
        if num == idx:
            print(item, patient_tmp_0[item])
            new_graph = create_graph(patient_tmp_0[item],normalized_cooc_odd_scores, sequence2vec)
print(new_graph)
print(new_graph.info())

In [None]:
from collections import defaultdict
source = []
target = []
edge_weight = []
node_idx = []
cooc_odd_scores = normalized_cooc_odd_scores
node_emb_dict = sequence2vec

for cooc in patient_cooc_0[16684]:

    source.extend([cooc[0], cooc[1]])
    target.extend([cooc[1], cooc[0]])
    edge_weight.extend([cooc_odd_scores[cooc], cooc_odd_scores[cooc]])
        
node_idx = list(set(source + target))

print(node_idx)
print(len(edge_weight))
print(edge_weight)
# Create a dataframe of only nodes
square_node_data = pd.DataFrame(
    index=node_idx)
        
# Create a dictionary for each column for a vector
node_features = defaultdict(list)
for node in node_idx:
    # Case 1: Use in defaul embeddings training 
    # for i, vec in enumerate(node_emb_dict[node]):
    # Case 2: Use when load npy file
    for i, vec in enumerate(node_emb_dict[()][node]):
        node_features['w_' + str(i)].append(vec)

# Add columns to a dataframe
for k, v in node_features.items():
              
    square_node_data[k] = v

square_edges = pd.DataFrame({ 
    "source": source, 
    "target": target, 
    "weight":edge_weight
})
print(square_node_data)

In [None]:
import copy
patient_tmp_0 = copy.deepcopy(patient_cooc_0)
patient_tmp_1 = copy.deepcopy(patient_cooc_1)
print(len(patient_tmp_0))
print(len(patient_tmp_1))

In [None]:
"""
Go through each patient and find if the patient has a word for which we are checking.
If it does, mask the word as nan. 
- Add nan into emb_dict and set it is vector to zeros.
- If there will be a co-occurrence with nan then set it's edge weigth to 0.5
- Re-create graphs of these patients and check the accurracy
"""

# Find out if the word is in patients co-occurrences
patient_idx_0 = []
patient_idx_1 = []

for word in all_words_set:
    word = 'cmo'
#     print("Word:", word)
#     print('='*60)
    # Go through every patient
    cnt_0 = 0
    for k, v in patient_tmp_0.items():
        # Go through every patient co-occurrence
        for key, val in v.items():
            if key[0] == word or key[1] == word:
                print('-'*30)
                print("Word:", word)
                print(f"patient_tmp_0 -> key:{k}, cnt:{cnt_0}")
        cnt_0 += 1
                break
#     cnt_1 = 0
#     for k, v in patient_tmp_1.items():
#         if set_item in v:
#             print('-'*30)
#             print(f"patient_tmp_1 -> key:{k}, cnt:{cnt_1}")
#             del v[set_item]
#         cnt_1 += 1
    break
            
    

In [None]:
for set_item in patient_cooc_set:
    set_item = ('declin', 'dic')
    print('='*60)
    print("Removing co-occurrence:", set_item)
    cnt_0 = 0
    for k, v in patient_tmp_0.items():
        if set_item in v:
            print('-'*30)
            print(f"patient_tmp_0 -> key:{k}, cnt:{cnt_0}")
            del v[set_item]
        cnt_0 += 1
    cnt_1 = 0
    for k, v in patient_tmp_1.items():
        if set_item in v:
            print('-'*30)
            print(f"patient_tmp_1 -> key:{k}, cnt:{cnt_1}")
            del v[set_item]
        cnt_1 += 1
            
    break

In [None]:
# Dead always start from 0
start_index_alive = len(patient_cooc_0)
print("Start index for alive:", start_index_alive)

In [None]:
# Load npy file
sequence2vec = np.load(os.path.join(inputs_path,'sequence2vec.npy'), allow_pickle=True)
# word2vec_emb = np.load(os.path.join(inputs_path,'word2vec_emb.npy'), allow_pickle=True)
# fasttext_emb = np.load(os.path.join(inputs_path,'fasttext_emb.npy'), allow_pickle=True)
# glove_emb = np.load(os.path.join(inputs_path,'glove_emb.npy'), allow_pickle=True)
# sequence2vec_notWeighted = np.load(os.path.join(inputs_path,'sequence2vec_notWeighted.npy'), allow_pickle=True)


In [None]:
patient_tmp_0 = copy.deepcopy(patient_cooc_0)
patient_tmp_1 = copy.deepcopy(patient_cooc_1)

import time
result_list = []
for i_cnt, set_item in enumerate(patient_cooc_set):

    # Read co-occurrences for dead patients
    dic = ''
    with open(os.path.join(inputs_path,'patient_cooc_0.txt'),'r') as f:
            for i in f.readlines():
                dic=i #string
    patient_cooc_0 = eval(dic) # this is orignal dict with instace dict

    # Read co-occurrences for alive patients
    dic = ''
    with open(os.path.join(inputs_path,'patient_cooc_1.txt'),'r') as f:
            for i in f.readlines():
                dic=i #string
    patient_cooc_1 = eval(dic) # this is orignal dict with instace dict

    # Count how many times set_item appears in each group
    set_item_cnt_0 = 0
    set_item_cnt_1 = 0
    print('-' * 60)
    print("Co-occurrence:", set_item)
    print(f"Train model...iter: {i_cnt} out of {len(patient_cooc_set)}")
    
    print(f"Before patient_cooc_0:{sum_keys(patient_cooc_0)}, patient_cooc_1:{sum_keys(patient_cooc_1)}")


    for _, v in patient_cooc_0.items():
        if set_item in v:
            set_item_cnt_0 += 1
            removed_items.append(set_item)
            del v[set_item]

    for _, v in patient_cooc_1.items():
        if set_item in v:
            set_item_cnt_1 += 1
            removed_items.append(set_item)
            del v[set_item]
            
    print(f"set_item_cnt_0: {set_item_cnt_0}, set_item_cnt_1: {set_item_cnt_1}")
    print(f"After patient_cooc_0:{sum_keys(patient_cooc_0)}, patient_cooc_1:{sum_keys(patient_cooc_1)}")


In [None]:
import time
start = time.time()
logger.info("Create graphs, graph labels, train and test data...")
graphs, graph_labels, train_index, test_index = create_graphs_lists(patient_cooc_0, patient_cooc_1, normalized_cooc_odd_scores, sequence2vec)
end = time.time()
print("Running time:", end - start)

In [None]:
logger.info("Train model...")
test_accs, test_f1_score, test_precision, test_recall, test_auc = train_model(graphs, graph_labels, train_index, test_index, "seq2vec", disease_name)
logger.info(f"Accuracy over all folds mean: {np.mean(test_accs)*100:.3}% and std: {np.std(test_accs)*100:.2}%")
logger.info(f"F1_socre over all folds mean: {np.mean(test_f1_score)*100:.3}% and std: {np.std(test_f1_score)*100:.2}%")
logger.info(f"Precision over all folds mean: {np.mean(test_precision)*100:.3}% and std: {np.std(test_precision)*100:.2}%")
logger.info(f"Recall over all folds mean: {np.mean(test_recall)*100:.3}% and std: {np.std(test_recall)*100:.2}%")
logger.info(f"AUC over all folds mean: {np.mean(test_auc)*100:.3}% and std: {np.std(test_auc)*100:.2}%")

In [None]:


def sum_keys(d):
    return 0 if not isinstance(d, dict) else len(d) + sum(sum_keys(v) for v in d.values())
def removekey(d, key):
    r = dict(d)
    del r[key]
    return r  

In [None]:
type(patient_cooc_0)

In [None]:
patient_cooc_1[84858]

In [None]:
count_all_cooc(patient_cooc_1)

In [None]:
def sum_keys(d):
    return 0 if not isinstance(d, dict) else len(d) + sum(sum_keys(v) for v in d.values())
print(f"Before patient_cooc_0:{sum_keys(patient_cooc_0)}, patient_cooc_1:{sum_keys(patient_cooc_1)}")

logger.info(f"Start the loop for set_item...{disease_name}")
"""
Method to count change of accurracy depending on removing a co-occurrence
"""
result_list = []
for i_cnt, set_item in enumerate(patient_cooc_set):

    # Read co-occurrences for dead patients
    dic = ''
    with open(os.path.join(inputs_path,'patient_cooc_0.txt'),'r') as f:
            for i in f.readlines():
                dic=i #string
    patient_cooc_0 = eval(dic) # this is orignal dict with instace dict

    # Read co-occurrences for alive patients
    dic = ''
    with open(os.path.join(inputs_path,'patient_cooc_1.txt'),'r') as f:
            for i in f.readlines():
                dic=i #string
    patient_cooc_1 = eval(dic) # this is orignal dict with instace dict

    print('-' * 60)
    print(f"Before patient_cooc_0:{sum_keys(patient_cooc_0)}, patient_cooc_1:{sum_keys(patient_cooc_1)}")
    # Count how many times set_item appears in each group
    set_item_cnt_0 = 0
    set_item_cnt_1 = 0

    for _, v in patient_cooc_0.items():
        if set_item in v:
            set_item_cnt_0 += 1
            del v[set_item]

    for _, v in patient_cooc_1.items():
        if set_item in v:
            set_item_cnt_1 += 1
            del v[set_item]

    
    print("Co-occurrence:", set_item)
    logger.info("Create graphs, graph labels, train and test data...")
    graphs, graph_labels, train_index, test_index = create_graphs_lists(patient_cooc_0, patient_cooc_1, normalized_cooc_odd_scores, sequence2vec)
    new_test_index = np.concatenate((train_index, test_index))
    print(f"Train model...iter: {i_cnt} out of {len(patient_cooc_set)}")
    test_accs, test_f1_score, test_precision, test_recall, test_auc = train_model(graphs, graph_labels, train_index, new_test_index, "seq2vec", disease_name)

    accs = np.mean(test_accs)*100
    accs_std = np.std(test_accs)*100
    f1_score = np.mean(test_f1_score)*100
    f1_score_std = np.std(test_f1_score)*100
    precision = np.mean(test_precision)*100
    precision_std = np.std(test_precision)*100
    recall = np.mean(test_recall)*100
    recall_std = np.std(test_recall)*100
    auc = np.mean(test_auc)*100
    auc_std = np.std(test_auc)*100


    print(f"set_item_cnt_0: {set_item_cnt_0}, set_item_cnt_1: {set_item_cnt_1}")
    print(f"After patient_cooc_0:{sum_keys(patient_cooc_0)}, patient_cooc_1:{sum_keys(patient_cooc_1)}")
    # print(f"test_accs: {accs} accs_std: {accs_std}, test_f1_score: {f1_score} f1_score_std: {f1_score_std}, test_precision: {precision} precision_std: {precision_std}, test_recall: {recall} recall_std: {recall_std}, test_auc: {auc} auc_std: {auc_std}")
    print(f"test_accs: {accs}, test_f1_score: {f1_score}, test_precision: {precision}, test_recall: {recall}, test_auc: {auc}")
    result_list.append([set_item, set_item_cnt_0, set_item_cnt_1, accs, f1_score, precision, recall, auc])

with open(os.path.join(inputs_path,'remove_cooc_result_list.json'), 'w') as fp:
    json.dump(result_list, fp)

### Put together remove_cooc_result_list and stemmed_words_dict

In [1]:
# Describe the disease
import os
import pandas as pd
import numpy as np
import json
import tensorflow as tf
import matplotlib.pyplot as plt
import functions
from functions import extract_data, count_notes_per_patient, logger, count_words_per_patient, find_frequent_word, find_cooc_per_patient
from functions import cooc_log_odd_score, sequence2vec, other_emb
from functions import create_graphs_lists, train_model, create_graph
from nltk.stem import PorterStemmer
from sklearn import model_selection

In [18]:
disease_name = 'PNEUMONIA'
database_path = '../MIMIC-III'
inputs_path = os.path.join('data/inputs/', disease_name)

import pickle
# Read pickle file
file_name = "stemmed_words_dict_PNEUMONIA.pickle"
with open(os.path.join(inputs_path, file_name), 'rb') as fp:
    stemmed_words_dict = pickle.load(fp)   

In [19]:
len(stemmed_words_dict)

607

In [20]:
for item in stemmed_words_dict.items():
    print(item)
    break

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [21]:
stemmed_words_df = pd.DataFrame(stemmed_words_dict) 

In [22]:
stemmed_words_df

Unnamed: 0,pea,arrest,precordium,cpr,norepinephrin,paralyt,phenylephrin,vasopressin,bicarbon,pcv,...,passi,muir,knowledg,passey,autoset,spacer,xanax,klonopin,ciwa,coach
words,"{pea, peas}","{arrest, arrested, arrests}",{precordium},{cpr},"{norepinephrin, norepinephrine}","{paralytic, paralytics}",{phenylephrine},{vasopressin},{bicarbonate},{pcv},...,"{passie, passy}","{muir, muire}","{knowledgeable, knowledge}",{passey},{autoset},{spacer},{xanax},"{klonopin, klonopins}","{ciwa, ciwas}","{coaching, coach, coached}"
notes,[PATIENT/TEST INFORMATION:\nIndication: s/p PE...,[PATIENT/TEST INFORMATION:\nIndication: s/p PE...,[Compared to the previous tracing QRS voltage ...,[Chief Complaint: respiratory distress\n HP...,[Chief Complaint: respiratory distress\n HP...,[Chief Complaint: respiratory distress\n HP...,[Chief Complaint: respiratory distress\n HP...,[Chief Complaint: respiratory distress\n HP...,[Chief Complaint: respiratory distress\n HP...,[Chief Complaint: respiratory distress\n HP...,...,[85 y/o M with respiratory failure [**3-5**] t...,"[Pneumonia, bacterial, ventilator acquired (VA...","[Respiratory failure, acute (not ARDS/[**Docto...","[Respiratory failure, acute (not ARDS/[**Docto...",[Demographics\n Day of intubation:\n Day o...,[CCU Nursing Progress Note (MICU service) 0700...,[NURSING PROGRESS NOTE 0400-0700\nREPORT RECEI...,[NPN 7a-7P:\n REview of systems;\n Nuer...,[[**Hospital Unit Name **] nsg note: 7:00-19:0...,[M/SICU NURSING PROGRESS NOTE.\n SEE CAREV...


In [23]:
with open(os.path.join(inputs_path,'remove_cooc_result_list.json'), 'r') as fp:
    remove_cooc_result_list = json.load(fp)
len(remove_cooc_result_list)

607

In [24]:
remove_cooc_result_list[:1]

[['forth',
  0,
  1,
  91.89189076423645,
  83.56708288192749,
  83.04682970046997,
  86.38889193534851,
  95.20286321640015]]

In [25]:
columns = ['word', 'num_of_patients_0', 'num_of_patients_1', 'accs', 'f1_score', 'precision', 'recall', 'auc']

In [26]:
# Create the pandas DataFrame 
remove_cooc_result_df = pd.DataFrame(remove_cooc_result_list, columns = columns) 

In [27]:
remove_cooc_result_df

Unnamed: 0,word,num_of_patients_0,num_of_patients_1,accs,f1_score,precision,recall,auc
0,forth,0,1,91.891891,83.567083,83.046830,86.388892,95.202863
1,cocain,0,15,90.315318,82.766289,83.046830,84.833336,93.916863
2,ggo,0,1,91.891891,83.567083,83.046830,86.388892,95.206696
3,diamox,0,1,91.891891,83.567083,83.046830,86.388892,95.200318
4,hugger,8,5,91.666669,83.388782,83.042079,86.038470,94.969475
...,...,...,...,...,...,...,...,...
602,treitz,1,2,91.779280,83.510590,83.046830,86.277783,95.132726
603,captopril,1,19,89.977479,82.570648,83.036953,84.465808,93.366325
604,adenosin,0,3,91.666669,83.454096,83.046830,86.166674,95.073205
605,thrive,2,4,91.666669,83.441174,83.074605,86.055565,95.163333


In [28]:
# import copy
# copy.deepcopy(patient_cooc_0)

In [29]:
from tqdm import tqdm
# Create a combined list
for num, item in tqdm(enumerate(remove_cooc_result_list)):
    stemmed_words = str(stemmed_words_dict[remove_cooc_result_list[num][0]]['words'])
    stemmed_words_notes = str(stemmed_words_dict[remove_cooc_result_list[num][0]]['notes'][:2])
    item.append(stemmed_words)
    item.append(stemmed_words_notes)

607it [00:00, 13165.69it/s]


In [30]:
remove_cooc_result_list[0]

['forth',
 0,
 1,
 91.89189076423645,
 83.56708288192749,
 83.04682970046997,
 86.38889193534851,
 95.20286321640015,
 "{'forth', 'forthe'}",
 '["NPN 0700-1900\\n\\n68yo female w/metastatic NSCLC stage 4B with metastatic brain lesions s/p craniotomy 3 weeks ago, presented with fever, rigors, dyspnea, and cough; found to have large R sided post obstructive pna.\\n\\nNeuro: Sedated on fentanyl/versed 25/1; opens eyes and follows commands occasionally, also seemed agitated at times, moving head back and forth and pulling on restraints; given fentanyl boluses prn. Maintenance dose not increased d/t hemodynamic instability. No seizure activity, remains on phenytoin, level this am subtherapeutic at 8.1\\n\\nCV: ABP 76-124/45-72, given 1L fluid bolus for low CVP and uo; levo weaned off in am but turned back on for ABP in the 70\'s; also started on vasopressin per Dr. [**Last Name (STitle) **] w/ plan to wean levo then keep vasopressin for 24hrs after levo off. HR 67-99, SR no ectopy, rate has

In [31]:
columns = ['word', 'num_of_patients_0', 'num_of_patients_1', 'accs', 'f1_score', 'precision', 'recall', 'auc', 'references_for_stemmed_word', 'notes_with_stemmed_word']
# Create the pandas DataFrame of stemmed words and texts for each word
combined_df = pd.DataFrame(remove_cooc_result_list, columns = columns) 

In [32]:
combined_df.tail()

Unnamed: 0,word,num_of_patients_0,num_of_patients_1,accs,f1_score,precision,recall,auc,references_for_stemmed_word,notes_with_stemmed_word
602,treitz,1,2,91.77928,83.51059,83.04683,86.277783,95.132726,{'treitz'},"[""[**2144-5-9**] 3:23 PM\n N-G TUBE PLACEMENT ..."
603,captopril,1,19,89.977479,82.570648,83.036953,84.465808,93.366325,"{'captopril', 'captoprile'}",['MICU A Nursing Progress Note (0700-1900)\n\n...
604,adenosin,0,3,91.666669,83.454096,83.04683,86.166674,95.073205,{'adenosine'},"[""PMICU Nursing Progress Note 7a-7p\n\n Pt and..."
605,thrive,2,4,91.666669,83.441174,83.074605,86.055565,95.163333,{'thrive'},"[""[**2123-7-6**] 3:57 PM\n CT CHEST W/CONTRAST..."
606,sxn'ing,0,1,91.891891,83.567083,83.04683,86.388892,95.192242,"{""sxn'ing""}","[""Nursing progress note (7pm-7am):\n\nEvents: ..."


In [33]:
combined_df.to_csv (r'remove_one_word_PNEUMONIA.csv', index = False, header=True)