This notebook parses patient notes and categorises it as 'Clinical' or 'Non-clinical'.

In [None]:
import pandas as pd
import csv

df = pd.read_csv('./patient_note_log.csv',  parse_dates=True, engine='python', error_bad_lines=False)

df.shape

In [2]:
# !aws s3 cp s3://patient-data-v1/patient_note_log.csv ./patient_note_log.csv
# !aws s3 cp s3://patient-data-v1/Chronic_condns.csv ./chronic_condns.csv

In [None]:
!pip install -U spacy

In [None]:
!python -m spacy download en_core_web_sm

In [5]:
import spacy
from spacy.matcher import PhraseMatcher
import pandas as pd
nlp = spacy.load('en_core_web_sm')

# Create Phrase Matcher Object
phrase_matcher = PhraseMatcher(nlp.vocab)


Assumption: Clinical note contains keywords such as 'MD', 'ICD'
If these words are found, the note is labelled as 'Clinical', 'Non-clinical' otherwise

In [6]:
clinical_patterns = [nlp(text) for text in ('MD', 'ICD')]
phrase_matcher.add('Clinical', None, *clinical_patterns)

def search_clinical_terms(note):
    is_clinical = False
    if (note is None) or isinstance(note, float):
        return None
    text = nlp(note)
    matched_phrases = phrase_matcher(text)
    if len(matched_phrases):
        is_clinical = True

    return is_clinical

In [7]:
df['is_clinical_note'] = df['note_text'].map(search_clinical_terms)
df

Unnamed: 0,id,mrn,noteDate,note_text,is_clinical_note
0,3,10743972,2022-01-10 12:10:00,INSTRUCTIONS FOR YOU:<br />Your Care Provider ...,False
1,4,3546328,2022-01-10 12:39:00,<br />Outpatient Procedure Nursing Follow up:...,False
2,5,10602162,2022-01-10 12:46:00,<br />\t\tPage 1<br /> <br />Discharge Reconc...,False
3,6,10315049,2022-01-10 11:04:00,Bronx Care Allergy and Asthma Center:<br />Enc...,True
4,10,3336997,2022-01-10 10:21:00,<br />Encounter Type:<br /> Encounter Type: ...,True
...,...,...,...,...,...
3925,6737,10728252,2022-01-26 21:35:00,HPI:<br />Arrival Information:<br /> MD First...,True
3926,6738,10320959,2022-01-26 21:55:00,MEDICAID TRANSPORTATION JUSTIFICATION REQUEST ...,True
3927,6739,10320959,2022-01-26 20:15:00,HPI:<br />Arrival Information:<br /> Date/Tim...,True
3928,6740,10320959,2022-01-26 20:15:00,HPI:<br />Arrival Information:<br /> Date/Tim...,True


In [8]:
df['is_clinical_note'].value_counts()

True     2191
False    1611
Name: is_clinical_note, dtype: int64

In [None]:
df.iloc[0,3]