This is the EDA notebook for the MIMIC-III dataset. A summary of the findings can be found here: https://docs.google.com/document/d/1ZjDQ9wB8KuJkA-l58zpXj9fJ3q7fIgcHs_jye2gbOdc/edit#

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime so the dataset files are reachable.
drive.mount('/content/drive')
# Directory prefix (note the trailing slash) that the CSV file names are appended to.
data_dir = "/content/drive/MyDrive/Capstone/MIMIC/"

In [None]:
# Load the two tables explored in this notebook; data_dir is used as a plain
# string prefix for the file names.
admissions_path = f"{data_dir}ADMISSIONS.csv"
notes_path = f"{data_dir}NOTEEVENTS.csv"
admissions = pd.read_csv(admissions_path)
notes = pd.read_csv(notes_path)

## Exploring number of notes and categories

In [None]:
# Share of all notes that falls into each category (fractions, not percent).
category_counts = notes["CATEGORY"].value_counts()
categories = category_counts / len(notes)

In [None]:
# `categories` holds fractions of the total note count, so scale to percent
# to make the values agree with the "Percentage of notes" axis label.
(categories * 100).plot(kind='bar', title='Distribution of Note categories', xlabel="Category", ylabel="Percentage of notes")

In [None]:
# Pie chart of the first ten entries of the category counts
# (value_counts orders them from most to least frequent).
categories = notes["CATEGORY"].value_counts()
top_categories = categories.iloc[:10]
top_categories.plot(kind='pie', figsize=(11, 11), textprops={'fontsize': 15})

In [None]:
# Number of distinct DESCRIPTION values, displayed as a 1-tuple.
notes["DESCRIPTION"].unique().shape

In [None]:
# Fraction of all notes covered by each of the 15 most frequent descriptions.
description_counts = notes["DESCRIPTION"].value_counts()
descriptions = description_counts.iloc[:15] / len(notes)

In [None]:
# `descriptions` holds fractions of the total note count, so scale to percent
# to make the values agree with the "Percentage of notes" axis label.
(descriptions * 100).plot(kind='bar', title='Distribution of Note Description', xlabel="Description", ylabel="Percentage of notes")

In [None]:
# Pie chart of the raw counts of the ten most frequent descriptions.
description_counts = notes["DESCRIPTION"].value_counts()
descriptions = description_counts.iloc[:10]
descriptions.plot(kind='pie', figsize=(11, 11))

In [None]:
# Number of distinct SUBJECT_ID values, displayed as a 1-tuple.
notes["SUBJECT_ID"].unique().shape

In [None]:
# Average number of notes per patient: total rows over distinct SUBJECT_ID values.
n_patients = len(notes["SUBJECT_ID"].unique())
len(notes) / n_patients

In [None]:
# Number of distinct HADM_ID values, displayed as a 1-tuple.
# unique() reports a missing value as one of the distinct values if any row lacks a HADM_ID.
notes["HADM_ID"].unique().shape

In [None]:
# Average number of notes per admission (some patients have multiple admissions).
# Notes with a missing HADM_ID belong to no admission, so they are left out of
# the numerator; nunique() keeps NaN from being counted as an admission in the
# denominator.
notes["HADM_ID"].notna().sum() / notes["HADM_ID"].nunique()

In [None]:
# Raw number of notes in each category, most frequent first.
categories = notes["CATEGORY"].value_counts()

In [None]:
# Average number of notes of each category per patient.
n_patients = len(notes["SUBJECT_ID"].unique())
categories / n_patients

In [None]:
# Average number of notes of each category per admission.
# Only notes that carry a HADM_ID are counted and nunique() ignores NaN, so
# notes that belong to no admission do not inflate the averages.
linked_counts = notes.loc[notes["HADM_ID"].notna(), "CATEGORY"].value_counts()
# Reindex so every category of `categories` stays present, in the same order.
linked_counts.reindex(categories.index, fill_value=0) / notes["HADM_ID"].nunique()

In [None]:
# Per-admission average for each category.
# Only notes that carry a HADM_ID are counted and nunique() ignores NaN, so
# notes that belong to no admission do not inflate the averages.
linked_counts = notes.loc[notes["HADM_ID"].notna(), "CATEGORY"].value_counts()
# Reindex so `a` keeps the same categories, in the same order, as `categories`.
a = linked_counts.reindex(categories.index, fill_value=0) / notes["HADM_ID"].nunique()
a.plot(kind='bar', title = 'Average number of notes from each category per admission', xlabel = "category", ylabel = "# of notes")

In [None]:
# Per-patient average for each category: counts divided by distinct SUBJECT_ID values.
b = categories/notes.SUBJECT_ID.unique().shape[0]
# The title names the patient-level denominator used above (it previously said
# "per admission", copied from the per-admission plot).
b.plot(kind='bar', title = 'Average number of notes from each category per patient', xlabel = "category", ylabel = "# of notes")

In [None]:
# Grouped bar chart: per-admission averages (a) next to per-subject averages (b).
X = a.index.to_list()
X_axis = np.arange(len(X))
bar_width = 0.4

fig, ax = plt.subplots(figsize=(15, 10))

# Shift each series by half a bar width so the two bars sit side by side.
ax.bar(X_axis - bar_width / 2, a, bar_width, label='per admission')
ax.bar(X_axis + bar_width / 2, b, bar_width, label='per subject')

ax.set_xticks(X_axis)
ax.set_xticklabels(X, rotation='vertical')
ax.set_xlabel("Categories")
ax.set_ylabel("Number of notes")
ax.set_title("Average number of notes from each category")
ax.legend()
plt.show()

In [None]:
# Number of notes for every (subject, admission, category) combination.
notes.value_counts(subset=["SUBJECT_ID", "HADM_ID", "CATEGORY"])

In [None]:
# All notes of subject 6975.
notes.loc[notes["SUBJECT_ID"] == 6975]

In [None]:
# Print the text of the first discharge summary of subject 6975.
# Both conditions are combined into one mask built on `notes` itself. Applying
# a second full-length mask to an already filtered frame makes pandas reindex
# the mask and emit a UserWarning.
is_target = (notes["SUBJECT_ID"] == 6975) & (notes["CATEGORY"] == 'Discharge summary')
print(notes.loc[is_target, "TEXT"].iloc[0])

In [None]:
# Number of notes for every (subject, admission, category) combination.
key_columns = ["SUBJECT_ID", "HADM_ID", "CATEGORY"]
notes[key_columns].value_counts()

In [None]:
# Copy of `notes` with a constant helper column, so that summing "count"
# within a group yields the number of rows in that group.
notes_bis = notes.assign(count=1)

In [None]:
# Number of notes per subject.
# Select the helper column before aggregating: summing the whole frame would
# also aggregate every other column (including the free-text ones) only for
# the result to be thrown away.
counts_patients = notes_bis.groupby("SUBJECT_ID")["count"].sum()

In [None]:
# Summary statistics of the per-subject note counts.
counts_patients.describe()

In [None]:
# Histogram of how many documents each subject has.
fig, ax = plt.subplots()
ax.hist(counts_patients, bins=20)
ax.set_title("Distribution of number of documents per subject")
plt.show()

In [None]:
# Number of notes per admission, then its histogram.
# Select the helper column before aggregating: summing the whole frame would
# also aggregate every other column (including the free-text ones) only for
# the result to be thrown away.
counts_adm = notes_bis.groupby("HADM_ID")["count"].sum()
plt.hist(counts_adm, bins = 20)
plt.title("Distribution of number of documents per admission")
plt.show()

In [None]:
# Join notes with their admission record on HADM_ID.
# An outer join keeps rows from both sides even when the other table has no
# matching HADM_ID; the unmatched side's columns are then left empty.
full = notes.merge(admissions, how='outer', on='HADM_ID')

In [None]:
# Column names of the merged notes/admissions frame.
full.columns

In [None]:
# Number of merged rows for each admission type.
types = full["ADMISSION_TYPE"].value_counts()
types

In [None]:
# Mean number of merged rows per admission, broken down by admission type.
rows_per_admission = full[["HADM_ID", "ADMISSION_TYPE"]].value_counts()
rows_per_admission.groupby(level="ADMISSION_TYPE").mean()

In [None]:
# Bar chart of the mean number of merged rows per admission for each admission type.
rows_per_admission = full[["HADM_ID", "ADMISSION_TYPE"]].value_counts()
a = rows_per_admission.groupby(level="ADMISSION_TYPE").mean()
a.plot(kind='bar', title='Average number of notes per adm type', xlabel="Type", ylabel="# of notes")

In [None]:
# Mean number of merged rows per admission, broken down by admission location.
rows_per_admission = full[["HADM_ID", "ADMISSION_LOCATION"]].value_counts()
rows_per_admission.groupby(level="ADMISSION_LOCATION").mean()

In [None]:
# Bar chart of the mean number of merged rows per admission for each admission location.
rows_per_admission = full[["HADM_ID", "ADMISSION_LOCATION"]].value_counts()
a = rows_per_admission.groupby(level="ADMISSION_LOCATION").mean()
a.plot(kind='bar', title='Average number of notes per adm loc', xlabel="Location", ylabel="# of notes")

Potential resources: https://notebook.community/MIT-LCP/mimic-code/tutorials/data_viz/01_data_viz_basic

## Exploring text features

In [None]:
pip install wordcloud

In [None]:
import wordcloud

In [None]:
# Build the word-cloud corpus from a random sample of notes.
# The sample size is capped by the actual number of rows instead of relying on
# a hard-coded row count, and notes are drawn without replacement so no note
# is counted twice.
sample_size = min(20000, len(notes))
sampled_text = notes["TEXT"].sample(n=sample_size)

# Join with a space so the last word of one note does not fuse with the first
# word of the next, which is what summing the strings did.
text = " ".join(sampled_text)

# Generate a word cloud image
cloud = wordcloud.WordCloud().generate(text)

In [None]:
# Render the word cloud as an image without axes.
fig, ax = plt.subplots(figsize=(20, 20))
ax.imshow(cloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Notes that have no HADM_ID.
notes.loc[notes["HADM_ID"].isna()]

In [None]:
# Character count of each note.
notes["length"] = notes["TEXT"].map(len)

In [None]:
# Number of whitespace-separated tokens in each note.
notes["words"] = [len(text.split()) for text in notes["TEXT"]]

In [None]:
# Print the second note (position 1) among those with fewer than 500 tokens.
short_notes = notes.loc[notes["words"] < 500, "TEXT"]
print(short_notes.iloc[1])

In [None]:
# Histogram of note lengths in characters.
fig, ax = plt.subplots()
ax.hist(notes["length"], bins=20)
ax.set_title("Distribution of length")
plt.show()

In [None]:
# Histogram of token counts with a logarithmic count axis.
fig, ax = plt.subplots()
ax.hist(notes["words"], bins=20, log=True)
ax.set_title("Distribution of number of tokens, log scale")
plt.show()

In [None]:
# Histogram of token counts on a linear count axis.
fig, ax = plt.subplots()
ax.hist(notes["words"], bins=20)
ax.set_title("Distribution of number of tokens")
plt.show()

In [None]:
# Summary statistics of the per-note token counts.
notes.words.describe()