<a href="https://colab.research.google.com/github/dongspam0209/medCAT_study/blob/main/MedCAT_Tutorial_%7C_Part_2_Dataset_Analysis_and_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MedCAT tutorial - Dataset Analysis and Preparation

Welcome to the MedCAT tutorials!

Before we begin extracting information from patient records, and as at the beginning of every data science project,
let's explore the data that we are dealing with.



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

### Datasets

If you are using MIMIC-III you will have to create the `patients.csv` and `noteevents.csv` files and place them into the folder specified below. How to prepare the CSV files is explained in the blog post [MedCAT | Dataset Analysis and Preparation](https://towardsdatascience.com/medcat-dataset-analysis-and-preparation-be8bc910bd6d).

In [None]:
# Folder that holds the input CSV files and receives the cleaned output.
# A separate `! DATA_DIR=...` shell line is not needed: each `!` command runs in its own
# subshell, so a shell assignment would be lost immediately, and IPython already expands
# `$DATA_DIR` in later `!` commands from this Python variable.
DATA_DIR = "./data_p2/"

### Download the data (only if using the pre-made datasets and not MIMIC-III)

In [None]:
# Load files if in google colab, otherwise skip this step.
# Downloads the pre-made noteevents.csv and patients.csv into DATA_DIR.
# wget flags: -N only re-downloads when the remote copy is newer than the local one,
# -P sets the target directory ($DATA_DIR is filled in from the Python variable).
! wget -N https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/noteevents.csv -P $DATA_DIR
! wget -N https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/patients.csv -P $DATA_DIR

### Set plot sizes and style

In [None]:
# Start from seaborn's defaults, then use a compact 4x5 figure on a white background
sns.reset_defaults()
small_fig_rc = {'figure.figsize': (4, 5)}
sns.set(style="white", rc=small_fig_rc)

## Statistics on patients

In [None]:
# Read the patient table from DATA_DIR and preview the first rows
patients_path = DATA_DIR + "patients.csv"
patients = pd.read_csv(patients_path)
patients.head()

In [None]:
# Bar chart of the number of patients per gender.
# The column is passed as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, so a bare positional Series is no longer read as x.
sns.countplot(x=patients['gender'])
plt.show()

In [None]:
# Headline counts: all patients, then those recorded as 'M' and as 'F'
is_male = patients['gender'] == 'M'
is_female = patients['gender'] == 'F'
print(f"Total number of patients: {len(patients)}")
print(f"Male: {is_male.sum()}")
print(f"Female: {is_female.sum()}")

## Statistics on note events

Note: This dataset is large and requires a bit more RAM to fully load into memory.

In [None]:
# Switch to a wider 10x6 figure (white background) for the note-event plots
sns.reset_defaults()
wide_fig_rc = {'figure.figsize': (10, 6)}
sns.set(style="white", rc=wide_fig_rc)

In [None]:
# Read the note events (one row per document) from DATA_DIR and preview the first rows
noteevents_path = DATA_DIR + "noteevents.csv"
noteevents = pd.read_csv(noteevents_path)
noteevents.head()

In [None]:
# Peek at the 'category' column of the note events
noteevents['category']

In [None]:
# The column 'text' contains the free text / unstructured text body of the document;
# print the first note in full as an example.
print(noteevents.text.iloc[0])

In [None]:
# Keep an untouched copy so the cleaned data can be compared with the original later on
noteevents_original = noteevents.copy(deep=True)

In [None]:
# Length (in characters) of every document; `lns` is reused by the following cells.
lns = noteevents.text.str.len().tolist()

# Histogram of document lengths. `sns.histplot` replaces `sns.distplot`, which is
# deprecated and scheduled for removal from seaborn.
ax = sns.histplot(x=lns)
ax.set_xlabel('Document length')
plt.show()

In [None]:
# Sort lengths so the shortest / longest documents sit at the two ends of the list
lns = sorted(lns)

# Take 5% of the documents as the removal size (applied at each end).
# Multiplying before the integer division avoids first rounding the document count
# down to a whole number of hundreds.
rm_size = len(lns) * 5 // 100

# Now plot without the shortest / longest 5% of documents.
# The end index is written out explicitly because `lns[0:-0]` would be an empty list
# when rm_size is 0 (fewer than 20 documents).
ax = sns.histplot(x=lns[rm_size:len(lns) - rm_size])
ax.set_xlabel('Document length')
plt.show()

### Cleaning based on document length

In [None]:
# Remove rows from the dataframe based on document length of top / bottom 5%.
# `lns` (sorted document lengths) and `rm_size` come from the previous cells.
# With rm_size == 0 there is nothing to trim, and max()/min() of an empty slice
# would raise, so the filter is skipped in that case.
if rm_size > 0:
    min_ln = max(lns[0:rm_size])   # longest document among the shortest `rm_size`
    max_ln = min(lns[-rm_size:])   # shortest document among the longest `rm_size`
    doc_lens = noteevents.text.str.len()  # computed once and reused for both bounds
    # Strict comparisons: documents exactly at either threshold are dropped as well
    noteevents = noteevents[(doc_lens > min_ln) & (doc_lens < max_ln)]
noteevents.head()

In [None]:
# Row counts: cleaned frame vs. the untouched copy
n_clean = len(noteevents)
n_original = len(noteevents_original)
print(f"Length after cleaning : {n_clean}")
print(f"Length of the original: {n_original}")

## Number of documents per patient

In [None]:
# Histogram of how many documents each patient has.
# `sns.histplot` replaces `sns.distplot`, which is deprecated and scheduled for removal.
ax = sns.histplot(x=noteevents['subject_id'].value_counts().values)
ax.set_xlabel('Documents per patient')
plt.show()

In [None]:
# Again a bit of clean-up, let's remove the bottom/top 1% of patients based on the number of
# documents they have.
docs_per_pt = noteevents['subject_id'].value_counts()

# np.sort returns a sorted *copy*. Sorting `docs_per_pt.values` in place would reorder the
# counts underneath the Series while its index stays put, pairing patients with the wrong
# counts (or fail outright when the array is read-only).
docs_per_pt_vals = np.sort(docs_per_pt.values)

# 1% of the patients, removed at each end
rm_size = len(docs_per_pt_vals) // 100

if rm_size > 0:
    min_ln = max(docs_per_pt_vals[0:rm_size])   # highest count within the bottom 1%
    max_ln = min(docs_per_pt_vals[-rm_size:])   # lowest count within the top 1%
    # Strict comparisons: patients whose count equals a threshold are dropped as well
    in_range = (docs_per_pt > min_ln) & (docs_per_pt < max_ln)
    keep_subject_id = set(docs_per_pt.index[in_range])
else:
    # Fewer than 100 patients: 1% rounds down to nobody, so keep every patient
    keep_subject_id = set(docs_per_pt.index)

noteevents = noteevents[noteevents['subject_id'].isin(keep_subject_id)]
noteevents.head()

In [None]:
# Row counts after the per-patient clean-up, next to the original size
rows_now, rows_before = len(noteevents), len(noteevents_original)
print(f"Length after cleaning : {rows_now}")
print(f"Length of the original: {rows_before}")

In [None]:
# Documents-per-patient histogram again, now on the cleaned data.
# `sns.histplot` replaces `sns.distplot`, which is deprecated and scheduled for removal.
ax = sns.histplot(x=noteevents['subject_id'].value_counts().values)
ax.set_xlabel('Documents per patient')
plt.show()

### There are different sources for the documents, we'll also plot that...

In [None]:
# Number of documents per note category.
# The column is passed as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, so a bare positional Series is no longer read as x.
plot = sns.countplot(x=noteevents['category'])
# Rotate the category names so the labels do not overlap
_ = plot.set_xticklabels(plot.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.show()

## Age based statistics

In [None]:
# Convert to pandas dates and add a year column.
# `noteevents` is a filtered selection at this point, so the columns are added with
# assign() (which returns a new frame) instead of writing into the selection, which
# would trigger pandas' SettingWithCopyWarning.
chartdate = pd.to_datetime(noteevents['chartdate'])
noteevents = noteevents.assign(chartdate=chartdate, create_year=chartdate.dt.year)

dob = pd.to_datetime(patients['dob'])
patients = patients.assign(dob=dob, dob_year=dob.dt.year)

In [None]:
# Join noteevents with patients: every note keeps its row and gains the patient columns
pt_notes = pd.merge(noteevents, patients, how='left', on='subject_id')

In [None]:
# Age at the time of the note, as the difference between note year and birth year
pt_notes['age_year'] = pt_notes['create_year'].sub(pt_notes['dob_year'])

In [None]:
# Keep only notes written when the patient was between 16 and 89 years old (both inclusive)
age_in_range = pt_notes['age_year'].between(16, 89)
pt_notes = pt_notes[age_in_range]

In [None]:
# It is possible that the cleaning above created some patients with only one document
docs_per_subject = pt_notes['subject_id'].value_counts()
print("Number of patients with only one doc: " + str(sum(docs_per_subject.values == 1)))

# Remove them if there are any.
# Selecting on the counts directly avoids Series.iteritems(), which no longer exists
# in pandas 2.x.
remove_subject = set(docs_per_subject.index[docs_per_subject == 1])
pt_notes = pt_notes[~pt_notes.subject_id.isin(remove_subject)]
print("After removal: " + str(sum(pt_notes['subject_id'].value_counts().values == 1)))

In [None]:
# Number of remaining documents per note category
pt_notes['category'].value_counts()

In [None]:
# Measure, per patient, how many years their documents span (a span above 0 means the
# patient's age changes across their notes) and their median age over all notes.
# A groupby replaces the row-by-row iterrows() loop, which is very slow on large frames.
# sort=False keeps patients in order of first appearance, as the loop did.
age_by_pt = pt_notes.groupby('subject_id', sort=False)['age_year']

# subject_id -> list of the ages recorded on that patient's documents
dif_pt = {sid: ages.tolist() for sid, ages in age_by_pt}

# One entry per patient, in the same patient order for both lists
ehr_length = (age_by_pt.max() - age_by_pt.min()).tolist()
median_age = age_by_pt.median().tolist()

In [None]:
# We show this mainly to be sure that most patients have an EHR that spans less than 1 year.
# `sns.histplot` replaces `sns.distplot`, which is deprecated and scheduled for removal.
ax = sns.histplot(x=ehr_length)
ax.set_xlabel('EHR Length in Years')
plt.show()

In [None]:
# Distribution of the (rounded) median age per patient.
# 74 bins matches the number of whole ages in the 16..89 range kept by the age filter.
# `sns.histplot` replaces `sns.distplot`, which is deprecated and scheduled for removal.
ax = sns.histplot(x=np.round(median_age), bins=74)
ax.set_xlabel('Patient Age')
plt.show()

In [None]:
# Number of patients after/before cleaning
for frame in (pt_notes, noteevents_original):
    # dropna=False counts a missing id as a value too, exactly like len(unique())
    print(frame['subject_id'].nunique(dropna=False))

In [None]:
# Average document length (in characters) after/before cleaning
for frame in (pt_notes, noteevents_original):
    char_counts = [len(str(doc)) for doc in frame['text']]
    print(np.average(char_counts))

In [None]:
# Number of documents after/before cleaning
for frame in (pt_notes, noteevents_original):
    print(frame.shape[0])

In [None]:
# Min number of documents per patient after/before cleaning
for frame in (pt_notes, noteevents_original):
    docs_per_subject = frame['subject_id'].value_counts().values
    print(docs_per_subject.min())

In [None]:
# Max number of documents per patient after/before cleaning
for frame in (pt_notes, noteevents_original):
    docs_per_subject = frame['subject_id'].value_counts().values
    print(docs_per_subject.max())

In [None]:
# Write the cleaned, patient-joined notes to DATA_DIR without the index column
pt_notes_path = DATA_DIR + "pt_notes.csv"
pt_notes.to_csv(pt_notes_path, index=False)

End of Dataset Analysis and Preparation tutorial