Create dictionary of clinical notes indexed by patient_id

In [2]:
from collections import defaultdict
import numpy as np 
import math 
import re 
import pickle
import datetime
import pandas

In [46]:
notes_file = 'data/NOTEEVENTS.csv'

notes_dict = defaultdict(list)
note = []
note_words = ""
nurse_count = 0 
# categories = set()
patient_id_to_save = "-1"

# make sure to delete the original header and start from the actual data 
with open(notes_file, 'r') as f:
    header = ""
    saveWords = False 

    for line in f:
        note_data = line.split(",")

        # check if we've encountered a new patient note 
        # header is just a pointer that may be a 'true header'
        header = line.split(",")

        if len(header) >= 2:
            row_id = header[0]
            if row_id == '"ROW_ID"':
                continue 

            # to check if this is a real header
            patient_id = header[1]
            real_header_length = 11

            if len(header) == real_header_length and row_id.isdigit() and patient_id.isdigit():

                # if there's an old note that is not empty  
                if not len(note_words) == 0:
                    # add the text 
                    note.append(note_words)
                    
                    # save the data to dict
                    notes_dict[patient_id_to_save].append(note)
                    
                    # reset the notes and text 
                    note = []
                    note_words = ""

                # update the patient_id to the current one you're reading 
                patient_id_to_save = patient_id 

                # check category to save words if it is a nursing note or physician's note 
                category = header[6]
                # categories.add(category)
                right_category = (category == '"Nursing"' or category == '"Nursing/other"' or category == '"Physician"')
                is_error = (header[9] == '1')
                if right_category and not is_error: 
                    saveWords = True 
                    nurse_count += 1
                else:
                    saveWords = False 

        if saveWords:
            words = re.findall(r'\w+', line)
            if len(note) == 0:
                # set the first index to be the date and the second to be the time 
                date_and_time = header[4].split(" ")
                note = [date_and_time]

            if len(words) > 0:
                note_words += line
             #   note_words.extend(words)

# printing useful data 
print "# of nursing notes", nurse_count 
print "# of patients", len(notes_dict)

# of nursing notes 874035
# of patients 35823


In [None]:
# print "Saving the 1000 patients' data"
# new_keys = notes_dict.keys()[:1000]
# new_dict = defaultdict(list)
# print "Iterating through keys."
# for key in new_keys: 
#     new_dict[key] = notes_dict[key]
# print "Saving 1000 notes to a pickle file."  

In [22]:
# pickle.dump( new_dict, open( "notes_dict_1000.p", "wb" ) )
# print "Done saving 1000 notes."

Done saving 1000 notes.


**DO NOT EDIT ANYTHING ABOVE THIS LINE.

In [None]:
print "Number of Patients Stored:", len(notes_dict)

In [163]:
### ======== EXTRACTING PATIENT IDS FOR SEPSIS DATA 

### This script extracts the patient_id from the icustayid found in the Sepsis.xls file 

# store the matching into a dictionary to look up in constant time! 

icustays_file = 'data/ICUSTAYS.csv'

id_dict = defaultdict(list)

with open(icustays_file, 'r') as f:

    count = 0 
    for line in f: 
        if count == 0: 
            count += 1
            continue 
        row = line.split(",")
        subject_id = row[1]

        # add 200,000
        icustay_id = int(row[3])
        icustay_id -= 200000
        icustay_id = str(icustay_id)
        id_dict[icustay_id] = subject_id 

print id_dict['1']
print id_dict['12']
print id_dict['1791']
print id_dict['1785']

# There are 61532 unique icustayids 
print "Length of dictionary", len(id_dict)

55973
28448
7973
19213
Length of dictionary 61532


In [156]:
# FUNCTIONS FOR DATA CONVERSION 

# EXTRACT THE CORRECT DATE AND TIME 
def convert_time(i_time):
    dt = datetime.datetime.fromtimestamp(int(i_time)).strftime('%Y-%m-%d %H:%M:%S')
    dt = dt.split(" ")
    s_date = dt[0]
    s_time = dt[1]
    return s_date, s_time 


# EXTRACT PATIENT ID 
def convert_from_icu_id(i_id):
    s_id = id_dict[i_id]
    return s_id 

convert_time('7245052800')

print notes_dict.values()[3][0]

[['2166-08-25', '17:59:00'], '1413376,11543,107879,2166-08-25,2166-08-25 17:59:00,2166-08-25 18:01:00,"Nursing/other","Report",18795,,"Resp. Care Note\nPt received from OR S/P CABGx 2 and placed on vent with settings as per resp flowsheet. Pt weaned to PSV 5 peep 5 and 50% with good ABG. Plan is to extubate when more awake.\n']


In [168]:
import pandas as pd

In [161]:
sepsis_train = pd.read_csv("data/df-train.csv").as_matrix()
sepsis_test = pd.read_csv("data/df-test.csv").as_matrix()
print len(id_dict)

69242


In [169]:
def gen_dataset(data):
    col_names = ["label", "action", "age", "hr", "bp", "spo2", "lactate", "ph", "creatinine", "sofa", "note"]
    dataset = pd.DataFrame(columns=col_names)
    current_id = -1 
    patient_all_notes = []

    missing_date_count = 0
    missing_note_count = 0
    note_exists = 0
    counter = 0

    dataset_list = []
    for row in data:

        counter += 1
        if counter % 1000 == 0:
            print counter
        icustayid = row[2]
        sepsis_id = convert_from_icu_id(str(int(icustayid)))
        
        if not isinstance(sepsis_id, str):
            print icustayid
            print sepsis_id
            continue

        # convert time 
        sepsis_charttime = row[3]
        sepsis_date, sepsis_time = convert_time(sepsis_charttime)

        # extract action
        sepsis_action = row[60]
        sepsis_mortality = row[59]
        sepsis_sofa = row[8]

        # check to see if you hit new patient, so that you can gather new collection
        if not current_id == sepsis_id: 

            # update the current id
            current_id = sepsis_id

            # clear the most recent note collection
            patient_all_notes = [] 

            # try to extract patient notes
            patient_all_notes = notes_dict[sepsis_id]

            # check for key error; ie: no patient notes exist, so move on 
            if len(patient_all_notes) == 0: 
              #  print "No note collection found for this patient."
                missing_note_count += 1
                continue 

        # at this point, patient_all_notes should be the notes relevant to the patient 
        # if the code reaches this point, then tehre should be a clinical note available 
        # new goal: find the most recent note for the patient 
        # setup of patient_all_notes: each index is data from a note
        # For clinical note 0: (should be in chronological order, according to Matthieu)
        # patient_all_notes[0][0] = ['2018-10-13', '01:07:00']
        # patient_all_notes[0][1] = ['Hi', 'this', 'is', 'text', 'data']]

        most_recent_note = []
        for i, note in enumerate(patient_all_notes):
            note_date = ""
            note_time = ""
            if not len(note[0]) < 2:
                note_date = note[0][0]
                note_time = note[0][1]
            else:
                missing_date_count += 0
                continue

            if (note_date == sepsis_date):

                # find the time closest to this time 
                sepsis_hour = int(sepsis_time[:2])
                note_hour = int(sepsis_time[:2])

                # check to see if the last note was the most recent one 
                if note_hour > sepsis_hour:
                    continue 
                else: 
                    most_recent_note = note 

        # if there is no exact date match ==> need to find solution

        ### ======== GENERATING A VECTOR OF THE RL-FEATURES AND THE MORTALITY LABEL 
        if most_recent_note:
            note_exists += 1
            # features 
            age = row[5]
            hr = row[12]
            bp = row[14]
            spo2 = row[18]
            lactate = row[45]
            ph = row[41]
            creatinine = row[26]

            data_from_sepsis = [sepsis_mortality, sepsis_action, age, hr, bp, spo2, lactate, ph, creatinine, sepsis_sofa]
           # rl_features = [age, hr, bp, spo2, lactate, ph, creatinine, sepsis_sofa]

            ### GENERATE A PRELIMINARY DATASET WITHOUT CLINICAL NOTE EMBEDDING

            dataset_row = pd.DataFrame(index=range(1), columns=col_names)
            dataset_row.iloc[0, :] = np.append(data_from_sepsis, most_recent_note[1])
            dataset_list.append(dataset_row)

            # clear most recent note in prep for next icustay's note
            most_recent_note = []

    print "# of notes for icustays", note_exists
    print "# of note collections missing", missing_note_count
    print "# of missing dates", missing_date_count
    return pd.concat(dataset_list).reset_index()

In [170]:
# generate the actual datasets 
final_train = gen_dataset(sepsis_train)
final_test = gen_dataset(sepsis_test)

final_train.shape
final_test.shape

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
# of notes for icustays 26627
# of note collections missing 1191
# of missing dates 0


(26627, 12)

In [133]:
final_dataset.head()

Unnamed: 0,index,label,action,age,hr,bp,spo2,lactate,ph,creatinine,sofa,note
0,0,0.0,0.0,12049.2173,93.4,79.4,96.4,1.3,7.44,0.66,3.0,"1633344,28448,177527,2153-12-23,2153-12-23 05:..."
1,0,0.0,0.0,12049.2173,92.2,83.0,98.8,1.166666667,7.443333333,0.64,2.0,"1633344,28448,177527,2153-12-23,2153-12-23 05:..."
2,0,0.0,0.0,12049.2173,107.0,70.0,97.0,1.033333333,7.446666667,0.62,5.0,"1633344,28448,177527,2153-12-23,2153-12-23 05:..."
3,0,1.0,3.0,30946.97,69.6,71.2,98.2,0.96,7.42,0.6,4.0,"1388060,9514,127229,2105-02-17,2105-02-17 17:3..."
4,0,1.0,2.0,30946.97,69.33333333,82.16666667,98.16666667,1.0,7.4,0.6,4.0,"1388060,9514,127229,2105-02-17,2105-02-17 17:3..."


In [134]:
final_train.to_csv("train_dataset.csv")

In [171]:
final_test.to_csv("test_dataset.csv")