In [185]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
import numpy as np
from acquire import remove_stopwords, basic_clean, tokenize 
import re
from re import search

from nltk.corpus import stopwords
import nltk

Download the data from the [Kaggle Competition Site](https://www.kaggle.com/c/medicalnotes-2019/data)

# Data Dictionary
descriptor: the value held in the 'feature_text' column. These are features that describe the individual.

In [2]:
# Read csv files into a Pandas dataframe.
features = pd.read_csv('features.csv')

In [317]:
notes = pd.read_csv('patient_notes.csv')

In [3]:
# Get familiar with the 'features' dataframe.
features

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


# Set sights on target:
'feature_text' targeted

I will have to create a function that will iterate through the students' patient notes and identify the different ways different students express the descriptors.

Tentative plan: 
1. Rename {'case_num':'case', 'feature_text':'target'}
2. Rename {'pn_num':'note_id', 'case_num':'case', 'pn_history':'student_notes'}
3. Normalize the text in features.feature_text and notes.pn_history.
    * clean it
4. Create a dataframe that holds the original text and the clean.
5. Split data in train, validate, and test.

In [4]:
# Rename columns in the features dataframe.
features.rename(columns={'feature_num':'feature_id', 'case_num':'case', 'feature_text':'target'}, inplace=True)

In [5]:
# Verify
features

Unnamed: 0,feature_id,case,target
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
...,...,...,...
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts


In [6]:
features.target.value_counts().head(50)

Female                                                                7
Male                                                                  3
35-year                                                               2
20-year                                                               2
Nausea                                                                2
17-year                                                               2
No-rash                                                               1
getting-worse-OR-progressive-OR-symptoms-now-daily                    1
heart-pounding-OR-heart-racing                                        1
Sleeping-medication-ineffective                                       1
No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance      1
No-shortness-of-breath                                                1
Stress-due-to-caring-for-elderly-parents                              1
Last-Pap-smear-I-year-ago                                       

In [7]:
len(features.target)

143

In [318]:
# Rename columns in notes dataframe.
notes.rename(columns={'pn_num':'note_id', 'case_num':'case', 'pn_history':'student_notes'}, inplace=True)

In [9]:
# Verify
notes

Unnamed: 0,note_id,case,student_notes
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...
42144,95333,9,Stephanie madden is a 20 year old woman compla...


In [10]:
# Check 'features' dataframe for null values and data types.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   feature_id  143 non-null    int64 
 1   case        143 non-null    int64 
 2   target      143 non-null    object
dtypes: int64(2), object(1)
memory usage: 3.5+ KB


# Takeaways
* The 'target' column holds values related to the individual patient. 
* There are no null values and the data types make sense.

In [11]:
# Check the type of values in the 'feature_text' column.
features.target.value_counts().head(50)

Female                                                                7
Male                                                                  3
35-year                                                               2
20-year                                                               2
Nausea                                                                2
17-year                                                               2
No-rash                                                               1
getting-worse-OR-progressive-OR-symptoms-now-daily                    1
heart-pounding-OR-heart-racing                                        1
Sleeping-medication-ineffective                                       1
No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance      1
No-shortness-of-breath                                                1
Stress-due-to-caring-for-elderly-parents                              1
Last-Pap-smear-I-year-ago                                       

# Takeaways
* It seems as if they created a unique list of descriptors for each patient.

In [12]:
features.case.value_counts()

5    18
8    18
2    17
9    17
3    16
0    13
1    13
6    12
4    10
7     9
Name: case, dtype: int64

In [13]:
features.case.value_counts().describe()

count    10.000
mean     14.300
std       3.335
min       9.000
25%      12.250
50%      14.500
75%      17.000
max      18.000
Name: case, dtype: float64

# Takeaways
* Descriptors for each patient ranges from 9 - 18.
* Average amount of descriptors per patient is 14.

In [14]:
notes.case.value_counts()

3    9753
5    6909
4    5405
9    5151
8    4196
7    4101
0    2268
2    1958
6    1597
1     808
Name: case, dtype: int64

# Takeaways
* Student notes for patient_3 has close to 10,000 submissions.

In [15]:
def prep_text(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    return df[['case', column, 'clean']]

In [320]:
prep_text(notes, 'student_notes')

Unnamed: 0,case,student_notes,clean
0,0,"17-year-old male, has come to the student heal...",17 year old male come student health clinic co...
1,0,17 yo male with recurrent palpitations for the...,17 yo male recurrent palpitations past 3 mo la...
2,0,Dillon Cleveland is a 17 y.o. male patient wit...,dillon cleveland 17 male patient significant p...
3,0,a 17 yo m c/o palpitation started 3 mos ago; \...,17 yo c palpitation started 3 mos ago nothing ...
4,0,17yo male with no pmh here for evaluation of p...,17yo male pmh evaluation palpitations states l...
...,...,...,...
42141,9,Ms. Madden is a 20 yo female presenting w/ the...,ms madden 20 yo female presenting w worst ha l...
42142,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...,20 yo f came complain dull 8 10 headache assoc...
42143,9,Ms. Madden is a 20yo female who presents with ...,ms madden 20yo female presents headache 1 days...
42144,9,Stephanie madden is a 20 year old woman compla...,stephanie madden 20 year old woman complaining...


In [321]:
notes

Unnamed: 0,note_id,case,student_notes,clean
0,0,0,"17-year-old male, has come to the student heal...",17 year old male come student health clinic co...
1,1,0,17 yo male with recurrent palpitations for the...,17 yo male recurrent palpitations past 3 mo la...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...,dillon cleveland 17 male patient significant p...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...,17 yo c palpitation started 3 mos ago nothing ...
4,4,0,17yo male with no pmh here for evaluation of p...,17yo male pmh evaluation palpitations states l...
...,...,...,...,...
42141,95330,9,Ms. Madden is a 20 yo female presenting w/ the...,ms madden 20 yo female presenting w worst ha l...
42142,95331,9,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...,20 yo f came complain dull 8 10 headache assoc...
42143,95332,9,Ms. Madden is a 20yo female who presents with ...,ms madden 20yo female presents headache 1 days...
42144,95333,9,Stephanie madden is a 20 year old woman compla...,stephanie madden 20 year old woman complaining...


In [18]:
notes.student_notes[0]

"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\r\n-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\r\n-associated with dispnea on exersion and rest,stressed out about school\r\n-reports fe feels like his heart is jumping out of his chest\r\n-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\r\n-pmh:non,meds :aderol (from a friend),nkda\r\n-fh:father had MI recently,mother has thyroid dz\r\n-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\r\n-sh:no std"

In [19]:
notes.clean[0]

'17 year old male come student health clinic complaining heart pounding mr cleveland mother given verbal consent history physical examination treatment began 2 3 months ago sudden intermittent 2 days lasting 3 4 min worsening non allev aggrav associated dispnea exersion rest stressed school reports fe feels like heart jumping chest ros denies chest pain dyaphoresis wt loss chills fever nausea vomiting pedal edeam pmh non meds aderol friend nkda fh father mi recently mother thyroid dz sh non smoker mariguana 5 6 months ago 3 beers weekend basketball school sh std'

# Takeaways
* All of the symbols in the original are causing the cleaned version to produce concatenated words which will must be fixed.
* I will use regular expression to convert all symbols into spaces. From there I can locate low value words and add them to the stopword list.

In [None]:
# Use regex to substitute everything that is not a number or leter with an empty space.
# re.sub(r"[\W]", ' ')

# Takeaways
* The regex method produces a more coherent output. I will use it on the entire column.

In [19]:
notes.student_notes

0        17-year-old male, has come to the student heal...
1        17 yo male with recurrent palpitations for the...
2        Dillon Cleveland is a 17 y.o. male patient wit...
3        a 17 yo m c/o palpitation started 3 mos ago; \...
4        17yo male with no pmh here for evaluation of p...
                               ...                        
42141    Ms. Madden is a 20 yo female presenting w/ the...
42142    A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143    Ms. Madden is a 20yo female who presents with ...
42144    Stephanie madden is a 20 year old woman compla...
42145    patient is a 20 yo F who presents with a heada...
Name: student_notes, Length: 42146, dtype: object

In [23]:
student_notes_words = ' '.join(notes.clean)

In [21]:
''' This line of code slows up the notebook. I will keep it commented out for now.'''
# Get a peak:
# student_notes_words

' This line of code slows up the notebook. I will keep it commented out for now.'

In [24]:
student_notes_corpus = student_notes_words

### Analyze the student notes

In [25]:
len(student_notes_corpus.split())

3990311

# Takeaways
* There is a grand total of 3,990,311 words written by students.
* The average reading speed for an adult is 200 - 250 words per minute.
* It would take the average person 15961.2 - 19952.60 minutes to read all this.
* 266.00 - 332.50 hours.
* 11.10 - 13.90 days.

In [26]:
word_frequencies = pd.Series(student_notes_corpus.split()).value_counts()

In [27]:
word_frequencies.tail(50)

deceseaed             1
aggraavte             1
depresseion           1
negtiave              1
dependance            1
drawing               1
dwindled              1
amputated             1
mother0               1
usred                 1
willingly             1
decreasedxappetite    1
scholesterol          1
hgeadache             1
nonths                1
motivates             1
seeinh                1
epusides              1
unintestionally       1
inuprofen             1
hyperventilate        1
cuttently             1
inefective            1
unremakrbale          1
reciopnist            1
sensitiviity          1
dayes                 1
persiration           1
tighterning           1
ruminations           1
concentracion         1
wiocks                1
anxsiety              1
precipatitng          1
tylnole               1
yys                   1
roughening            1
hospitilation         1
symnptoms             1
yeatrs                1
abnornal              1
cceased         

# Takeaway
* Most of the words that only show up once are typos.

In [28]:
len(pd.Series(student_notes_corpus.split()).unique())

44770

# Takeaways
* There are 44770 unique words that show up in student notes. Most of these could be typos.

In [29]:
features.target.head(50)

0     Family-history-of-MI-OR-Family-history-of-myoc...
1                    Family-history-of-thyroid-disorder
2                                        Chest-pressure
3                                 Intermittent-symptoms
4                                           Lightheaded
5     No-hair-changes-OR-no-nail-changes-OR-no-tempe...
6                                          Adderall-use
7                                   Shortness-of-breath
8                                          Caffeine-use
9                        heart-pounding-OR-heart-racing
10                                  Few-months-duration
11                                              17-year
12                                                 Male
13                                 No-vaginal-discharge
14                                          Weight-loss
15                                  Not-sexually-active
16                           Prior-episodes-of-diarrhea
17                                              

In [319]:
notes[notes.case == 0]

Unnamed: 0,note_id,case,student_notes
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...
...,...,...,...
2263,2447,0,17 yo M comes to clinic c/o of 1st time episod...
2264,2448,0,17 yo m c/o heart pounding for the past 2-3 mo...
2265,2449,0,Pt is a 17yM with c/o Heart pounding\r\n- Onse...
2266,2450,0,17 year old male presents with heart pounding ...


## Break down target into individual features

In [253]:
# Run text through 'basic_clean' function.
cleaned_targets = features.target.apply(basic_clean)

In [254]:
# Verify.
cleaned_targets

0      family history of mi or family history of myoc...
1                     family history of thyroid disorder
2                                         chest pressure
3                                  intermittent symptoms
4                                            lightheaded
                             ...                        
138                          family history of migraines
139                                               female
140                                          photophobia
141                            no known illness contacts
142                                     subjective fever
Name: target, Length: 143, dtype: object

In [280]:
# Create a list of all individual targets.
lists_of_targets = []
for target in cleaned_targets:
    # This line of code will split targets that have the word 'or' in it at that word.
    lists_of_targets.append(list(re.split(r'\bor', target)))

In [283]:
# Create a list that separates nested lists. 
list_of_targets = []
for ailments in lists_of_targets:
    for ailment in ailments:
        list_of_targets.append(ailment)

In [286]:
list_of_targets

['family history of mi ',
 ' family history of myocardial infarction',
 'family history of thyroid disorder',
 'chest pressure',
 'intermittent symptoms',
 'lightheaded',
 'no hair changes ',
 ' no nail changes ',
 ' no temperature intolerance',
 'adderall use',
 'shortness of breath',
 'caffeine use',
 'heart pounding ',
 ' heart racing',
 'few months duration',
 '17 year',
 'male',
 'no vaginal discharge',
 'weight loss',
 'not sexually active',
 'prior episodes of diarrhea',
 '20 year',
 'no bloody bowel movements',
 'recurrent bouts over past 6 months',
 'right sided lq abdominal pain ',
 ' right lower quadrant abdominal pain',
 'no urinary symptoms',
 'diminished appetite',
 'normal lmp 2 weeks ago ',
 ' normal last menstrual period 2 weeks ago',
 '8 to 10 hours of acute pain',
 'female',
 'prior normal periods',
 'last pap smear i year ago',
 'iud',
 'sexually active',
 'vaginal dryness',
 'irregular menses',
 'recent nausea vomiting ',
 ' recent flulike symptoms',
 'no premenstr

In [287]:
# Strip all whitespaces.
list_of_targets = [s.strip() for s in list_of_targets]

In [288]:
list_of_targets

['family history of mi',
 'family history of myocardial infarction',
 'family history of thyroid disorder',
 'chest pressure',
 'intermittent symptoms',
 'lightheaded',
 'no hair changes',
 'no nail changes',
 'no temperature intolerance',
 'adderall use',
 'shortness of breath',
 'caffeine use',
 'heart pounding',
 'heart racing',
 'few months duration',
 '17 year',
 'male',
 'no vaginal discharge',
 'weight loss',
 'not sexually active',
 'prior episodes of diarrhea',
 '20 year',
 'no bloody bowel movements',
 'recurrent bouts over past 6 months',
 'right sided lq abdominal pain',
 'right lower quadrant abdominal pain',
 'no urinary symptoms',
 'diminished appetite',
 'normal lmp 2 weeks ago',
 'normal last menstrual period 2 weeks ago',
 '8 to 10 hours of acute pain',
 'female',
 'prior normal periods',
 'last pap smear i year ago',
 'iud',
 'sexually active',
 'vaginal dryness',
 'irregular menses',
 'recent nausea vomiting',
 'recent flulike symptoms',
 'no premenstrual symptoms',

In [295]:
# This function completes all the above tasks.
def boil_it_down(df, column):
    cleaned_column = df[column].apply(basic_clean)
    lists_of_targets = []
    for target in cleaned_column:
        lists_of_targets.append(list(re.split(r'\bor', target)))
    list_of_targets = []
    for ailments in lists_of_targets:
        for ailment in ailments:
            list_of_targets.append(ailment)
    list_of_targets = [s.strip() for s in list_of_targets]
    return list_of_targets

In [299]:
list_of_targets

['family history of mi',
 'family history of myocardial infarction',
 'family history of thyroid disorder',
 'chest pressure',
 'intermittent symptoms',
 'lightheaded',
 'no hair changes',
 'no nail changes',
 'no temperature intolerance',
 'adderall use',
 'shortness of breath',
 'caffeine use',
 'heart pounding',
 'heart racing',
 'few months duration',
 '17 year',
 'male',
 'no vaginal discharge',
 'weight loss',
 'not sexually active',
 'prior episodes of diarrhea',
 '20 year',
 'no bloody bowel movements',
 'recurrent bouts over past 6 months',
 'right sided lq abdominal pain',
 'right lower quadrant abdominal pain',
 'no urinary symptoms',
 'diminished appetite',
 'normal lmp 2 weeks ago',
 'normal last menstrual period 2 weeks ago',
 '8 to 10 hours of acute pain',
 'female',
 'prior normal periods',
 'last pap smear i year ago',
 'iud',
 'sexually active',
 'vaginal dryness',
 'irregular menses',
 'recent nausea vomiting',
 'recent flulike symptoms',
 'no premenstrual symptoms',

In [296]:
boil_it_down(features, 'target')

['family history of mi',
 'family history of myocardial infarction',
 'family history of thyroid disorder',
 'chest pressure',
 'intermittent symptoms',
 'lightheaded',
 'no hair changes',
 'no nail changes',
 'no temperature intolerance',
 'adderall use',
 'shortness of breath',
 'caffeine use',
 'heart pounding',
 'heart racing',
 'few months duration',
 '17 year',
 'male',
 'no vaginal discharge',
 'weight loss',
 'not sexually active',
 'prior episodes of diarrhea',
 '20 year',
 'no bloody bowel movements',
 'recurrent bouts over past 6 months',
 'right sided lq abdominal pain',
 'right lower quadrant abdominal pain',
 'no urinary symptoms',
 'diminished appetite',
 'normal lmp 2 weeks ago',
 'normal last menstrual period 2 weeks ago',
 '8 to 10 hours of acute pain',
 'female',
 'prior normal periods',
 'last pap smear i year ago',
 'iud',
 'sexually active',
 'vaginal dryness',
 'irregular menses',
 'recent nausea vomiting',
 'recent flulike symptoms',
 'no premenstrual symptoms',

In [322]:
# Create a for loop that checks for perfect matches.
perfect_match = []
for ailment in list_of_targets:
    for note in notes.clean:
        if ailment in notes:
            perfect_match.append(ailment)

In [323]:
perfect_match

[]

In [324]:
for note in notes[notes.case == 0].clean:
    if 'family' in note:
        print('yup')
    else:
        print('nope')

nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
yup
yup
nope
nope
yup
yup
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
yup
yup
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
yup
nope
yup
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
yup
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
yup
yup
yup
nope
nope
nope
nope
nope
nope
yup
nope
yup
nope
nope
nope
yup
nope
nope
yup
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
yup
nope
nope
nope
yup
nope
nope
nope
yup
yup
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
yup
yup
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nop

nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
yup
yup
nope
nope
yup
nope
nope
yup
nope
nope
nope
yup
nope
nope
nope
nope
yup
nope
nope
nope
yup
nope
yup
nope
yup
nope
nope
nope
nope
nope
nope
yup
nope
nope
yup
nope
nope
nope
yup
yup
nope
yup
nope
nope
nope
nope
nope
nope
yup
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
yup
nope
nope
nope
yup
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
yup
nope
yup
nope
yup
nope
nope
nope
nope
yup
yup
nope
nope
nope
nope
nope
nope
nope
nope
nope
yup
nope
nope
nope
nope
yup
nope
nope
nope
yup
no

In [326]:
notes.clean[5]

'17 yo presenting palpitations increased heart rate 5 6 episodes past months recently 2 days ago 2 days ago episode associated chest pressure shortness breath lightheadedness diaphoresis vomiting tremor loss consciousness fever nausea vomiting diarrhea rash change skin colour appetite good denies anxiety prior episode phx healthy medications taking friend prescription adderral help study tests prescribed taking times per week reported temporal relation palpitations allergies none substances cigarettes 3 4 beers weekends adderall cocaine use substances social history freshman college reported concerns school family history father mi 52 mother thyroid disease'

In [327]:
notes.clean[4]

'17yo male pmh evaluation palpitations states last 3 4mo felt heart intermittently beat chest associated difficulty catching breath states recent event 2 days ago activity soccer game seem note specific precipitatinig factors time also states feels faint events lost consciousness point furthermore endorse theses attacks occuring 1 2 times month peak 4 mins denies stressors home ros denies weight loss fevers recnet illness change bowel habits pmh negative psh negative fhx mom thyroid disorder dad heart condition mi 52yo shx tobacco etoh weekends marijuana tried month ago med taking roommates adderoll intermittently last 2 days ago prior event knda'

In [339]:
for ailment in list_of_targets[:5]:
    for word in ailment.split():
        print(word)

family
history
of
mi
family
history
of
myocardial
infarction
family
history
of
thyroid
disorder
chest
pressure
intermittent
symptoms


In [340]:
list_of_targets

['family history of mi',
 'family history of myocardial infarction',
 'family history of thyroid disorder',
 'chest pressure',
 'intermittent symptoms',
 'lightheaded',
 'no hair changes',
 'no nail changes',
 'no temperature intolerance',
 'adderall use',
 'shortness of breath',
 'caffeine use',
 'heart pounding',
 'heart racing',
 'few months duration',
 '17 year',
 'male',
 'no vaginal discharge',
 'weight loss',
 'not sexually active',
 'prior episodes of diarrhea',
 '20 year',
 'no bloody bowel movements',
 'recurrent bouts over past 6 months',
 'right sided lq abdominal pain',
 'right lower quadrant abdominal pain',
 'no urinary symptoms',
 'diminished appetite',
 'normal lmp 2 weeks ago',
 'normal last menstrual period 2 weeks ago',
 '8 to 10 hours of acute pain',
 'female',
 'prior normal periods',
 'last pap smear i year ago',
 'iud',
 'sexually active',
 'vaginal dryness',
 'irregular menses',
 'recent nausea vomiting',
 'recent flulike symptoms',
 'no premenstrual symptoms',

In [342]:
features.target.head(50)

0     Family-history-of-MI-OR-Family-history-of-myoc...
1                    Family-history-of-thyroid-disorder
2                                        Chest-pressure
3                                 Intermittent-symptoms
4                                           Lightheaded
5     No-hair-changes-OR-no-nail-changes-OR-no-tempe...
6                                          Adderall-use
7                                   Shortness-of-breath
8                                          Caffeine-use
9                        heart-pounding-OR-heart-racing
10                                  Few-months-duration
11                                              17-year
12                                                 Male
13                                 No-vaginal-discharge
14                                          Weight-loss
15                                  Not-sexually-active
16                           Prior-episodes-of-diarrhea
17                                              

In [367]:
for ailment in list_of_targets:
    print(pd.Series(re.findall(r"\bno\b", ailment)) + ' hi')

Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
0    no hi
dtype: object
0    no hi
dtype: object
0    no hi
dtype: object
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
0    no hi
dtype: object
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
0    no hi
dtype: object
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
0    no hi
dtype: object
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dt

  print(pd.Series(re.findall(r"\bno\b", ailment)) + ' hi')


In [370]:
for ailment in list_of_targets:
    print(pd.Series(re.findall(r"\bno\b", ailment)) + ailment.split()[1])

Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)
Series([], dtype: float64)


  print(pd.Series(re.findall(r"\bno\b", ailment)) + ailment.split()[1])


IndexError: list index out of range

In [362]:
pd.Series(re.findall(r"\bno\b", list_of_targets[6])) + ' ' + list_of_targets[6].split()[1]

0    no hair
dtype: object

In [372]:
list_of_targets[5]

'lightheaded'

In [373]:
list_of_targets

['family history of mi',
 'family history of myocardial infarction',
 'family history of thyroid disorder',
 'chest pressure',
 'intermittent symptoms',
 'lightheaded',
 'no hair changes',
 'no nail changes',
 'no temperature intolerance',
 'adderall use',
 'shortness of breath',
 'caffeine use',
 'heart pounding',
 'heart racing',
 'few months duration',
 '17 year',
 'male',
 'no vaginal discharge',
 'weight loss',
 'not sexually active',
 'prior episodes of diarrhea',
 '20 year',
 'no bloody bowel movements',
 'recurrent bouts over past 6 months',
 'right sided lq abdominal pain',
 'right lower quadrant abdominal pain',
 'no urinary symptoms',
 'diminished appetite',
 'normal lmp 2 weeks ago',
 'normal last menstrual period 2 weeks ago',
 '8 to 10 hours of acute pain',
 'female',
 'prior normal periods',
 'last pap smear i year ago',
 'iud',
 'sexually active',
 'vaginal dryness',
 'irregular menses',
 'recent nausea vomiting',
 'recent flulike symptoms',
 'no premenstrual symptoms',

In [381]:
new_list = []
for ailment in list_of_targets:
    new_list.append(remove_stopwords(ailment, exclude_words = ['no', 'i']))

In [388]:
new_list

['family history mi',
 'family history myocardial infarction',
 'family history thyroid disorder',
 'chest pressure',
 'intermittent symptoms',
 'lightheaded',
 'no hair changes',
 'no nail changes',
 'no temperature intolerance',
 'adderall use',
 'shortness breath',
 'caffeine use',
 'heart pounding',
 'heart racing',
 'months duration',
 '17 year',
 'male',
 'no vaginal discharge',
 'weight loss',
 'sexually active',
 'prior episodes diarrhea',
 '20 year',
 'no bloody bowel movements',
 'recurrent bouts past 6 months',
 'right sided lq abdominal pain',
 'right lower quadrant abdominal pain',
 'no urinary symptoms',
 'diminished appetite',
 'normal lmp 2 weeks ago',
 'normal last menstrual period 2 weeks ago',
 '8 10 hours acute pain',
 'female',
 'prior normal periods',
 'last pap smear i year ago',
 'iud',
 'sexually active',
 'vaginal dryness',
 'irregular menses',
 'recent nausea vomiting',
 'recent flulike symptoms',
 'no premenstrual symptoms',
 'female',
 'stress',
 'lmp 2 mon

In [390]:
len(new_list[0].split())

3

In [400]:
list_of_ailment_in_notes = []
for ailment in new_list:    
    for i in range(len(ailment.split())):
        if ailment.split()[i] in notes.clean[0]:
            list_of_ailment_in_notes.append(ailment.split()[i])

In [403]:
pd.Series(list_of_ailment_in_notes).unique()

array(['history', 'mi', 'thyroid', 'chest', 'intermittent', 'no', 'use',
       'heart', 'pounding', 'months', '17', 'year', 'male', 'loss', '6',
       'pain', '2', 'ago', 'last', 'i', 'recent', 'nausea', 'vomiting',
       'stress', '3', 'worse', 'beers', 'week', '5', 'associated',
       'recently', 'feels', 'x', '1', 'day', '7', 'days', 'fever'],
      dtype=object)

In [404]:
notes.clean[0]

'17 year old male come student health clinic complaining heart pounding mr cleveland mother given verbal consent history physical examination treatment began 2 3 months ago sudden intermittent 2 days lasting 3 4 min worsening non allev aggrav associated dispnea exersion rest stressed school reports fe feels like heart jumping chest ros denies chest pain dyaphoresis wt loss chills fever nausea vomiting pedal edeam pmh non meds aderol friend nkda fh father mi recently mother thyroid dz sh non smoker mariguana 5 6 months ago 3 beers weekend basketball school sh std'

In [405]:
features[features.case == 0]

Unnamed: 0,feature_id,case,target
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
5,5,0,No-hair-changes-OR-no-nail-changes-OR-no-tempe...
6,6,0,Adderall-use
7,7,0,Shortness-of-breath
8,8,0,Caffeine-use
9,9,0,heart-pounding-OR-heart-racing


In [406]:
tup = ('family','history')

In [407]:
tup

('family', 'history')

In [409]:
tup[0]

'family'

In [411]:
if 'family history' in new_list[0]:
    print('yup')
else:
    print('nope')

yup
