In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
import numpy as np

Download the data from the [Kaggle Competition Site](https://www.kaggle.com/c/medicalnotes-2019/data)

# Data Dictionary
**descriptor**: a value held in the 'feature_text' column. Descriptors are the features that describe the individual patient.

In [4]:
# Load the feature definitions and the patient notes into Pandas dataframes.
# Both CSV files are expected in the notebook's working directory.
features, notes = (
    pd.read_csv(csv_name) for csv_name in ('features.csv', 'patient_notes.csv')
)

In [34]:
# Get familiar with the 'features' dataframe: one row per descriptor, with
# the columns feature_num, case_num and feature_text. Pandas truncates the
# middle rows when the whole frame is displayed.
features

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


# Set sights on target:
The 'feature_text' column is the target.

I will need to write a function that iterates through the students' patient notes and identifies the different ways in which students express each descriptor.

Tentative plan: 
1. Normalize the text in features.feature_text and notes.pn_history.
    * clean it, stem it, lemmatize it

In [17]:
# Check the 'features' dataframe for null values and confirm the dtype of
# each column (non-null counts and dtypes are printed per column).
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   feature_num   143 non-null    int64 
 1   case_num      143 non-null    int64 
 2   feature_text  143 non-null    object
dtypes: int64(2), object(1)
memory usage: 3.5+ KB


# Takeaways
* The 'feature_text' column seems to hold values related to the individual patient. 
* There are no null values and the data types seem to make sense.

In [39]:
# Look at the 50 most frequent values in the 'feature_text' column to see
# what kind of descriptors it holds and how often each one repeats.
(
    features['feature_text']
    .value_counts()
    .head(50)
)

Female                                                              7
Male                                                                3
20-year                                                             2
Nausea                                                              2
17-year                                                             2
35-year                                                             2
Minimal-to-no-change-with-Tums                                      1
Global-headache-OR-diffuse-headache                                 1
No-shortness-of-breath                                              1
Episodes-last-15-to-30-minutes                                      1
Increased-frequency-recently                                        1
Family-history-of-migraines                                         1
No-rash                                                             1
No-suicidal-ideations                                               1
Few-months-duration 

# Takeaways
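* Most descriptors appear only once, so it seems a unique list of features was created for each patient case; only a few generic ones (e.g. 'Female', 'Male', 'Nausea') repeat.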

In [43]:
# Count the descriptors listed for each case, then summarise the
# distribution of those per-case counts.
(
    features['case_num']
    .value_counts()
    .describe()
)

count    10.000
mean     14.300
std       3.335
min       9.000
25%      12.250
50%      14.500
75%      17.000
max      18.000
Name: case_num, dtype: float64

# Takeaways
* Each patient case has ~14 descriptors on average (minimum 9, maximum 18).

In [27]:
# List the descriptors that belong to case 2 and how often each appears.
features.loc[features['case_num'] == 2, 'feature_text'].value_counts()

Vaginal-dryness                                                 1
LMP-2-months-ago-or-Last-menstrual-period-2-months-ago          1
Sexually-active                                                 1
Recent-nausea-vomiting-OR-Recent-flulike-symptoms               1
44-year                                                         1
IUD                                                             1
Hot-flashes                                                     1
Sleep-disturbance-OR-Early-awakenings                           1
Female                                                          1
Last-Pap-smear-I-year-ago                                       1
Prior-normal-periods                                            1
No-premenstrual-symptoms                                        1
Stress                                                          1
Irregular-menses                                                1
Onset-3-years-ago                                               1
Irregular-

In [6]:
# Get familiar with the 'notes' dataframe: one row per patient note, with
# the columns pn_num, case_num and pn_history. Pandas truncates the middle
# rows when the whole frame is displayed.
notes

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [10]:
# Read the first patient note in full to see the raw free-text format.
notes.loc[0, 'pn_history']

"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\r\n-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\r\n-associated with dispnea on exersion and rest,stressed out about school\r\n-reports fe feels like his heart is jumping out of his chest\r\n-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\r\n-pmh:non,meds :aderol (from a friend),nkda\r\n-fh:father had MI recently,mother has thyroid dz\r\n-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\r\n-sh:no std"

In [14]:
# Count how many patient notes were written for each case.
notes['case_num'].value_counts()

3    9753
5    6909
4    5405
9    5151
8    4196
7    4101
0    2268
2    1958
6    1597
1     808
Name: case_num, dtype: int64

In [29]:
# Preview the first five patient notes written for case 2.
notes.loc[notes['case_num'] == 2, 'pn_history'].head(5)

3076    44 yo F. C/o irregular mestrual periods.  prev...
3077    CC: 44 yo female c/o irregular periods\r\nHPI:...
3078    Dolores Montgomery, a 44-year-old female, has ...
3079    HPI: 44 yo female presenting with menstrual ir...
3080    CC: irregular mentrual bleeding \r\n44 yo F G2...
Name: pn_history, dtype: object