# Looking at MIMIC-3 Notes

Make sure to expand the table of contents in the left sidebar.

### Main Python imports

In [1]:
import os, re
import pandas as pd
import numpy as np


# Plotting
from matplotlib import pyplot as plt
import seaborn as sns

# Turn off FutureWarnings so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)

# Record the pandas version and the CPU count reported by os.cpu_count() for this run.
print("Pandas version:", pd.__version__)
print("CPU threads detected:", os.cpu_count())

Pandas version: 1.3.3
CPU threads detected: 32


### Import dataset

The cell below loads the full dataset (about 2.08 million notes, which takes roughly 36 seconds). To speed things up while iterating, the read could instead be restricted to the first 500k rows.

In [2]:
%%time

# Import the raw notes data.
# N_ROWS caps how many rows are read from the CSV. None (the default) loads
# the whole file; set it to e.g. 500_000 for a faster, partial load.
N_ROWS = None

# NOTE(review): cache_file is assigned here but not referenced by any visible
# cell - presumably the intended location for a cached copy of the notes.
cache_file = "data/cache/note-events.feather"
df = pd.read_csv('data-raw/mimic/NOTEEVENTS.csv', low_memory = False, nrows = N_ROWS)

# Lowercase the column names for easier typing.
df.columns = df.columns.str.lower()

print("Dataframe shape:", df.shape)
print("Dataframe columns:", df.columns)

Dataframe shape: (2083180, 11)
Dataframe columns: Index(['row_id', 'subject_id', 'hadm_id', 'chartdate', 'charttime',
       'storetime', 'category', 'description', 'cgid', 'iserror', 'text'],
      dtype='object')
CPU times: user 30.2 s, sys: 5.92 s, total: 36.2 s
Wall time: 36.1 s


## Exploratory data analysis

In [3]:
# Preview the first rows (DataFrame.head() shows five by default) to check the loaded columns.
df.head()

Unnamed: 0,row_id,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


### Category distribution

In [4]:
# Review distribution of the notes category - Nursing/other is the most frequent,
# followed by Radiology, Nursing and ECG (see the counts printed below).
df.category.value_counts()

Nursing/other        822497
Radiology            522279
Nursing              223556
ECG                  209051
Physician            141624
Discharge summary     59652
Echo                  45794
Respiratory           31739
Nutrition              9418
General                8301
Rehab Services         5431
Social Work            2670
Case Management         967
Pharmacy                103
Consult                  98
Name: category, dtype: int64

### Review sample notes

In [None]:
# Raise the per-column display width to 100 characters so note text is
# truncated less aggressively when shown (text longer than that is still cut).
pd.set_option('display.max_colwidth', 100)

# Alternative raw dump of the first five notes, left disabled.
#print(df.text[:5].values)

# display / HTML are used by pretty_print below to render notes as an HTML table.
from IPython.display import display, HTML

# Via StackOverflow - lost the URL though.
# We can definitely improve this display over time.
def pretty_print(df, n=3):
    """Display the first ``n`` notes of ``df.text`` as an HTML table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding the note bodies.
    n : int, default 3
        Number of leading notes to show. The default keeps the original
        behaviour of showing the first three.

    Returns
    -------
    None
        Whatever ``IPython.display.display`` returns; the table is rendered
        as a side effect.
    """
    table_html = df.text[:n].to_frame().to_html()
    # Replace the literal backslash + "n" character pair found in the generated
    # HTML with <br> tags (presumably how to_html() renders embedded newlines),
    # so the notes keep their line breaks in the browser.
    return display(HTML(table_html.replace("\\n", "<br>")))

# Render the first three notes with their line breaks shown as <br>.
pretty_print(df)

In [6]:
def _notes_in(category):
    """Return the ``text`` column of the rows of ``df`` whose category equals ``category`` exactly."""
    return df.loc[df.category == category, 'text']


# NOTE(review): 'Physician ' and 'Respiratory ' carry a trailing space. This
# looks deliberate (presumably it matches the raw category values) - confirm
# with df.category.unique() before "cleaning" these literals.
discharge_summaries = _notes_in('Discharge summary')
physician = _notes_in('Physician ')
general = _notes_in('General')
consult = _notes_in('Consult')
nursing = _notes_in('Nursing')
respiratory = _notes_in('Respiratory ')
rehab = _notes_in('Rehab Services')
nutrition = _notes_in('Nutrition')

In [7]:
def split_sentence(text, min_length=6):
    """Split free text into rough sentence fragments.

    Newlines are flattened to spaces, the text is split wherever a period is
    followed by a space, and fragments shorter than ``min_length`` characters
    are discarded. The period-plus-space delimiter is consumed by the split,
    so fragments lose their trailing period (a final period with nothing
    after it is kept).

    Parameters
    ----------
    text : str
        Text to split.
    min_length : int, default 6
        Minimum number of characters a fragment needs in order to be kept.
        The default reproduces the original ``len(fragment) > 5`` filter.

    Returns
    -------
    list of str
        The surviving fragments, in their original order.
    """
    flattened = text.replace('\n', ' ')
    return [
        fragment
        for fragment in re.split(r'\. ', flattened)
        if len(fragment) >= min_length
    ]

In [8]:
def _sentence_frame(notes):
    """Build a one-column (``text``) DataFrame of sentence fragments from a Series of notes.

    Each note is split on its own with ``split_sentence``. Splitting note by
    note, instead of joining every note into one string first, keeps the last
    fragment of one note from fusing with the first fragment of the next, and
    avoids building a single huge intermediate string. Missing notes are
    dropped before splitting.
    """
    fragments = [
        fragment
        for note in notes.dropna()
        for fragment in split_sentence(note)
    ]
    return pd.DataFrame(fragments, columns=['text'])


dc_sums = _sentence_frame(discharge_summaries)
physician_notes = _sentence_frame(physician)
general_notes = _sentence_frame(general)
consult_notes = _sentence_frame(consult)
nursing_notes = _sentence_frame(nursing)
resp_notes = _sentence_frame(respiratory)
rehab_notes = _sentence_frame(rehab)
nutrition_notes = _sentence_frame(nutrition)

In [9]:
# Fixed sample size and seed so the sampled sentences are the same on every run.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42


def _sample_sentences(sentences, n=SAMPLE_SIZE, seed=SAMPLE_SEED):
    """Return ``n`` randomly chosen rows of ``sentences`` with a fresh 0..n-1 index.

    ``seed`` is passed to ``DataFrame.sample`` as ``random_state`` so the draw
    is reproducible. As with a bare ``sample(n=...)`` call, a frame with fewer
    than ``n`` rows raises ``ValueError``.
    """
    return sentences.sample(n=n, random_state=seed).reset_index(drop=True)


dc_sum_sample = _sample_sentences(dc_sums)
physician_sample = _sample_sentences(physician_notes)
general_sample = _sample_sentences(general_notes)
consult_sample = _sample_sentences(consult_notes)
nursing_sample = _sample_sentences(nursing_notes)
resp_sample = _sample_sentences(resp_notes)
rehab_sample = _sample_sentences(rehab_notes)
nutrition_sample = _sample_sentences(nutrition_notes)

In [10]:
#Sample of 500 sentences per note type (for smaller saved file)
# Each pair is (sampled frame, CSV file name); the files are written to the
# current working directory without the index column.
sample_exports = [
    (dc_sum_sample, 'MIMIC_sentence_discharge_summaries.csv'),
    (physician_sample, 'MIMIC_sentence_physician_notes.csv'),
    (general_sample, 'MIMIC_sentence_general_notes.csv'),
    (consult_sample, 'MIMIC_sentence_consult_notes.csv'),
    (nursing_sample, 'MIMIC_sentence_nursing_notes.csv'),
    (resp_sample, 'MIMIC_sentence_resp_notes.csv'),
    (rehab_sample, 'MIMIC_sentence_rehab_notes.csv'),
    (nutrition_sample, 'MIMIC_sentence_nutrition_notes.csv'),
]
for sample_frame, csv_name in sample_exports:
    sample_frame.to_csv(csv_name, index=False)

In [13]:
#All sentences for labelling
# Make sure the target directory exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs('data', exist_ok=True)

# Each pair is (full sentence frame, CSV path); the index column is not written.
full_exports = [
    (dc_sums, 'data/MIMIC_sentence_discharge_summaries.csv'),
    (physician_notes, 'data/MIMIC_sentence_physician_notes.csv'),
    (general_notes, 'data/MIMIC_sentence_general_notes.csv'),
    (consult_notes, 'data/MIMIC_sentence_consult_notes.csv'),
    (nursing_notes, 'data/MIMIC_sentence_nursing_notes.csv'),
    (resp_notes, 'data/MIMIC_sentence_resp_notes.csv'),
    (rehab_notes, 'data/MIMIC_sentence_rehab_notes.csv'),
    (nutrition_notes, 'data/MIMIC_sentence_nutrition_notes.csv'),
]
for sentence_frame, csv_path in full_exports:
    sentence_frame.to_csv(csv_path, index=False)

In [None]:
#pd.read_csv('~/Desktop/MIMIC_sentence_nutrition_notes.csv')