In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

import re, os
import unicodedata
import json

import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime

from wordcloud import WordCloud

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

import acquire

import spacy
pd.set_option('display.max_colwidth', None)

import prepare_jag

### Acquire and prepare data

We will be using the 1000 labeled notes in `test.csv`

In [4]:
# Load test data labels and merge on notes
df = pd.read_csv('train.csv')
notes = pd.read_csv('patient_notes.csv')
df = df.merge(notes, how='inner', on='pn_num')
df.rename(columns={'pn_history': 'original'}, inplace=True)
df = prepare_jag.prep_article_data(df, 'original', extra_words=[], exclude_words=['no'])

df.head(1)


Renamed 'pn_history' column to 'original'
Added a basic clean column lowercaseing and removing special characters
Added stemmed column with tokenized words and stopwords removed
Added lemmatized column with lemmatized words and stopwords removed
Data preparation complete


Unnamed: 0,id,case_num_x,pn_num,feature_num,annotation,location,case_num_y,original,clean,stemmed,lemmatized
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],0,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms",hpi 17yo presents palpitations patient reports 34 months intermittent episodes heart beatingpounding chest 2 days ago soccer game episode time chest pressure felt going pass lose conciousness note patient endorses abusing adderall primarily study 13 times per week recent soccer game took adderrall night morning game denies shortness breath diaphoresis fevers chills headache fatigue changes sleep changes visionhearing abdominal paun changes bowel urinary habits pmhx none rx uses friends adderrall fhx mom thyroid disease dad recent heart attcak none immunizations date shx freshmen college endorses 34 drinks 3 nights week weekends denies tabacco endorses trying marijuana sexually active girlfriend x 1 year uses condoms,hpi 17yo present palpit patient report 34 month intermitt episod heart beatingpound chest 2 day ago dure soccer game episod thi time chest pressur felt go pass lose concious note patient endors abus adderal primarili studi 13 time per week befor recent soccer game took adderral night befor morn game deni short breath diaphoresi fever chill headach fatigu chang sleep chang visionhear abdomin paun chang bowel urinari habit pmhx none rx use friend adderral fhx mom thyroid diseas dad recent heart attcak none immun date shx freshmen colleg endors 34 drink 3 night week weekend deni tabacco endors tri marijuana sexual activ girlfriend x 1 year use condom,hpi 17yo present palpitation patient report 34 month intermittent episode heart beatingpounding chest 2 day ago soccer game episode time chest pressure felt going pas lose conciousness note patient endorses abusing adderall primarily study 13 time per week recent soccer game took adderrall night morning game denies shortness breath diaphoresis fever chill headache fatigue change sleep change visionhearing abdominal paun change bowel urinary habit pmhx none rx us friend adderrall fhx mom thyroid disease dad recent heart attcak none immunization date shx freshman college endorses 34 drink 3 night week weekend denies tabacco endorses trying marijuana sexually active girlfriend x 1 year us condom


### Split Data

- X is lemmatized text
- y is feature number
- convert y to object

In [8]:
# Split X Y
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.lemmatized)
y = df.feature_num.astype('object')

In [12]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state=123)

In [13]:
df.feature_num.value_counts()

0      100
611    100
605    100
606    100
607    100
      ... 
305    100
306    100
307    100
308    100
916    100
Name: feature_num, Length: 143, dtype: int64

All features appear to be equally represented, so pick any one fot baseline


In [16]:
# baseline
df[df['feature_num'] == 0].feature_num.value_counts()/sum(df.feature_num.value_counts())

0    0.006993
Name: feature_num, dtype: float64

Baseline is 0.70%


In [24]:
#Create evaluation dataframe
train = pd.DataFrame(dict(actual=y_train))
train['baseline']='11'


In [34]:
train.actual = train.actual.astype('category')
train.baseline = train.baseline.astype('category')


In [44]:
y_train = y_train.astype('category')

In [45]:
y_train

544       11
2856     201
1371     106
11268    800
7986     506
        ... 
10029    703
7847     511
3222     210
6862     402
9819     603
Name: feature_num, Length: 11440, dtype: category
Categories (143, int64): [0, 1, 2, 3, ..., 913, 914, 915, 916]

In [35]:
train.head()

Unnamed: 0,actual,baseline
544,11,11
2856,201,11
1371,106,11
11268,800,11
7986,506,11


In [36]:
train.dtypes

actual      category
baseline    category
dtype: object

In [37]:
# Calculate baseline model performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.baseline)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.actual, train.baseline))
print('---')
print(classification_report(train.actual, train.baseline))


Accuracy: 0.70%
---
Confusion Matrix
baseline  11
actual      
0         80
1         80
2         80
3         80
4         80
...       ..
912       80
913       80
914       80
915       80
916       80

[143 rows x 1 columns]
---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00        80
           2       0.00      0.00      0.00        80
           3       0.00      0.00      0.00        80
           4       0.00      0.00      0.00        80
           5       0.00      0.00      0.00        80
           6       0.00      0.00      0.00        80
           7       0.00      0.00      0.00        80
           8       0.00      0.00      0.00        80
           9       0.00      0.00      0.00        80
          10       0.00      0.00      0.00        80
          11       0.01      1.00      0.01        80
          12       0.00      0.00      0.00        80
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


---

## Create Models

### Logistic Regression

In [46]:
y_train

544       11
2856     201
1371     106
11268    800
7986     506
        ... 
10029    703
7847     511
3222     210
6862     402
9819     603
Name: feature_num, Length: 11440, dtype: category
Categories (143, int64): [0, 1, 2, 3, ..., 913, 914, 915, 916]

In [47]:
# Make and fit the object
lm = LogisticRegression().fit(X_train, y_train)
# Use it to make predictions
train['lm_predicted'] = lm.predict(X_train)
# Asssess accuracy
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.lm_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.lm_predicted))

Accuracy: 8.74%
---
Confusion Matrix
actual         0  1   2   3   4   5   6   7   8   9  ...  907  908  909  910  \
lm_predicted                                         ...                       
0             10  5   8   7   6   9  10  10  10   7  ...    0    0    0    0   
1              4  7   5   6   5   6   6   7   5   5  ...    0    0    0    0   
2              5  4   5   4   4   3   3   4   3   2  ...    0    0    0    0   
3              9  7   8  11  10   9   9   7   6   8  ...    0    0    0    0   
4              7  9  11   8  13  10  13  11  12  12  ...    0    0    0    0   
...           .. ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...  ...   
912            0  0   0   0   0   0   0   0   0   0  ...    4    5    6    4   
913            0  0   0   0   0   0   0   0   0   0  ...    8    9   10   10   
914            0  0   0   0   0   0   0   0   0   0  ...    3    4    3    4   
915            0  0   0   0   0   0   0   0   0   0  ...    6    7    6    5   
916

In [49]:
#cross_val_score(lm, X_train, y_train, cv = 5)


### Decision Tree

In [50]:
# Make and fit the object
dtc = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train)
# Use the object
train['dt_predicted'] = dtc.predict(X_train)
# Determine performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.dt_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.dt_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.dt_predicted))

Accuracy: 2.05%
---
Confusion Matrix
actual         0   1   2   3   4   5   6   7   8   9  ...  907  908  909  910  \
dt_predicted                                          ...                       
3             79  78  79  80  80  78  79  78  78  78  ...   80   80   80   80   
409            1   2   1   0   0   2   1   2   2   2  ...    0    0    0    0   
601            0   0   0   0   0   0   0   0   0   0  ...    0    0    0    0   
604            0   0   0   0   0   0   0   0   0   0  ...    0    0    0    0   

actual        911  912  913  914  915  916  
dt_predicted                                
3              80   80   80   80   80   80  
409             0    0    0    0    0    0  
601             0    0    0    0    0    0  
604             0    0    0    0    0    0  

[4 rows x 143 columns]
---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00        80
           2   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [51]:
# Make and fit object
rf = RandomForestClassifier(bootstrap = True, 
                            class_weight = None, 
                            criterion = 'gini',
                            min_samples_leaf = 3,
                            n_estimators = 100,
                            max_depth = 8, 
                            random_state = 123).fit(X_train, y_train)
# Use it to make predictions
train['rf_predicted'] = rf.predict(X_train)
# Assess performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.rf_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.rf_predicted))

Accuracy: 8.74%
---
Confusion Matrix
actual        0   1   2  3   4  5  6   7   8   9  ...  907  908  909  910  \
rf_predicted                                      ...                       
0             6   6   2  5   4  5  4   4   5   5  ...    0    0    0    0   
1             2   6   5  6   5  2  6   5   5   4  ...    0    0    0    0   
2             7   6   7  7   5  5  6   6   6   4  ...    0    0    0    0   
3             4   4   5  6   5  6  5   6   5   5  ...    0    0    0    0   
4             9  12  12  8  13  9  9  11  10  11  ...    0    0    0    0   
...          ..  ..  .. ..  .. .. ..  ..  ..  ..  ...  ...  ...  ...  ...   
912           0   0   0  0   0  0  0   0   0   0  ...    3    3    5    5   
913           0   0   0  0   0  0  0   0   0   0  ...    5    5    4    6   
914           0   0   0  0   0  0  0   0   0   0  ...    3    3    4    1   
915           0   0   0  0   0  0  0   0   0   0  ...    3    2    2    2   
916           0   0   0  0   0  0  0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## KNN

In [52]:
# Make and fit the object
knn = KNeighborsClassifier(n_neighbors = 4).fit(X_train, y_train)
# Use the object 
train['knn_predicted'] = knn.predict(X_train)
# Evaluate performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.knn_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.knn_predicted))

Accuracy: 8.74%
---
Confusion Matrix
actual          0   1   2   3   4   5   6   7   8   9  ...  907  908  909  \
knn_predicted                                          ...                  
0              34  28  24  27  27  26  26  25  27  27  ...    0    0    0   
1              19  27  27  21  20  20  20  24  22  21  ...    0    0    0   
2              11  10  14   9  11  13  12  12  11  12  ...    0    0    0   
3               5   5   5   9   7   7   9   6   8   6  ...    0    0    0   
4               4   3   5   6   6   5   6   5   3   4  ...    0    0    0   
...            ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...   
904             0   0   0   0   0   0   0   0   0   0  ...    4    5    6   
905             0   0   0   0   0   0   0   0   0   0  ...    3    3    4   
906             0   0   0   0   0   0   0   0   0   0  ...    9   10   10   
907             0   0   0   0   0   0   0   0   0   0  ...    5    5    5   
908             0   0   0   0   0   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Support Vector Classifier

In [53]:
# Make and fit the object
svc = LinearSVC(random_state=0).fit(X_train, y_train)
# Use the object
train['svc_predicted'] = svc.predict(X_train)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.svc_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.svc_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.svc_predicted))

Accuracy: 8.74%
---
Confusion Matrix
actual          0  1   2  3   4   5   6  7   8   9  ...  907  908  909  910  \
svc_predicted                                       ...                       
0              10  7   9  8   7   8   9  8  10   7  ...    0    0    0    0   
1               5  7   4  6   5   7   7  7   5   6  ...    0    0    0    0   
2               5  4   7  6   6   6   6  5   4   4  ...    0    0    0    0   
3               7  4   4  8   7   7   7  5   5   7  ...    0    0    0    0   
4               8  8  10  8  12  10  12  9  11  11  ...    0    0    0    0   
...            .. ..  .. ..  ..  ..  .. ..  ..  ..  ...  ...  ...  ...  ...   
912             0  0   0  0   0   0   0  0   0   0  ...    5    6    6    5   
913             0  0   0  0   0   0   0  0   0   0  ...    7    7    9    9   
914             0  0   0  0   0   0   0  0   0   0  ...    4    5    4    5   
915             0  0   0  0   0   0   0  0   0   0  ...    5    6    5    5   
916            

### Gaussian Naive Bayes

In [55]:
from sklearn.naive_bayes import GaussianNB


In [57]:
X_train = X_train.toarray()


In [58]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

train['gnb_predicted'] = gnb.predict(X_train)
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.gnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.gnb_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.gnb_predicted))

Accuracy: 8.74%
---
Confusion Matrix
actual          0   1   2   3   4   5   6   7   8   9  ...  907  908  909  \
gnb_predicted                                          ...                  
6              11  13  12  15  14  11  16  10  15  14  ...    0    0    0   
7               1   1   0   1   1   1   0   1   1   1  ...    0    0    0   
10              2   3   3   2   2   3   0   2   3   3  ...    0    0    0   
11             66  63  65  62  63  65  64  67  61  62  ...    0    0    0   
101             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
105             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
111             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
112             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
204             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
205             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
213             0   0   0   0   0   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Multimoial Naive Bayes

In [60]:
from sklearn.naive_bayes import MultinomialNB


In [62]:
# Multinomial naive bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

train['mnb_predicted'] = mnb.predict(X_train)

print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.mnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.mnb_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.mnb_predicted))

Accuracy: 8.74%
---
Confusion Matrix
actual          0   1   2   3   4   5   6   7   8   9  ...  907  908  909  \
mnb_predicted                                          ...                  
0               5   3   4   4   3   4   4   5   5   4  ...    0    0    0   
2               6   6   6   5   4   4   2   6   5   4  ...    0    0    0   
3               7   7   5   8   8   8   6   5   7   6  ...    0    0    0   
4              14  17  17  16  22  16  21  17  16  22  ...    0    0    0   
5               1   1   1   0   0   1   1   1   1   1  ...    0    0    0   
...            ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...   
912             0   0   0   0   0   0   0   0   0   0  ...    1    2    2   
913             0   0   0   0   0   0   0   0   0   0  ...   24   26   27   
914             0   0   0   0   0   0   0   0   0   0  ...    1    1    1   
915             0   0   0   0   0   0   0   0   0   0  ...    7    7    6   
916             0   0   0   0   0   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


---

Test knn model

In [68]:
y_test = y_test.astype('category')

In [72]:
X_test = X_test.toarray()


In [74]:
test.sample(100)

Unnamed: 0,actual,gnb_predicted
9644,608,603
3977,200,204
13875,900,913
14102,906,913
296,10,11
...,...,...
1854,108,105
10158,706,700
2057,103,105
10800,800,809


In [78]:
len(train)

11440

In [79]:
len(train[train.actual==train.knn_predicted])

1000

In [75]:
test[test.actual==test.gnb_predicted]

Unnamed: 0,actual,gnb_predicted


In [73]:
# Create testing dataframe
test = pd.DataFrame(dict(actual=y_test))
test['gnb_predicted'] = gnb.predict(X_test)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.gnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.gnb_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.gnb_predicted))

Accuracy: 0.00%
---
Confusion Matrix
actual          0   1   2   3   4   5   6   7   8   9  ...  907  908  909  \
gnb_predicted                                          ...                  
6               5   3   4   1   2   5   0   6   1   2  ...    0    0    0   
7               0   0   1   0   0   0   1   0   0   0  ...    0    0    0   
10              1   0   0   1   1   0   3   1   0   0  ...    0    0    0   
11             14  17  15  18  17  15  16  13  19  18  ...    0    0    0   
101             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
105             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
111             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
112             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
204             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
205             0   0   0   0   0   0   0   0   0   0  ...    0    0    0   
213             0   0   0   0   0   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [71]:
y_test

4936     312
4107     211
10080    700
9886     610
3720     215
        ... 
10250    708
12688    903
4666     314
13035    910
5615     303
Name: feature_num, Length: 2860, dtype: category
Categories (143, int64): [0, 1, 2, 3, ..., 913, 914, 915, 916]

In [69]:
# Create testing dataframe
test = pd.DataFrame(dict(actual=y_test))
test['knn_predicted'] = knn.predict(X_test)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.knn_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.knn_predicted))

Accuracy: 0.00%
---
Confusion Matrix
actual         0  1   2  3  4  5  6  7  8  9  ...  907  908  909  910  911  \
knn_predicted                                 ...                            
0              0  6  10  7  7  8  8  9  7  7  ...    0    0    0    0    0   
1              8  0   0  6  7  7  7  3  5  6  ...    0    0    0    0    0   
2              3  4   0  5  3  1  2  2  3  2  ...    0    0    0    0    0   
3              4  4   4  0  2  2  0  3  1  3  ...    0    0    0    0    0   
4              2  3   1  0  0  1  0  1  3  2  ...    0    0    0    0    0   
...           .. ..  .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...  ...   
904            0  0   0  0  0  0  0  0  0  0  ...    3    2    1    3    4   
905            0  0   0  0  0  0  0  0  0  0  ...    2    2    1    0    1   
906            0  0   0  0  0  0  0  0  0  0  ...    1    0    0    2    1   
907            0  0   0  0  0  0  0  0  0  0  ...    0    0    0    0    1   
908            0  0   0  0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
# Calculate percent improvement from baseline accuracy (.67%)
pct_improvement = round(((8.74-0.70)/(0.70)*100), 2)
print(f'Our 8.74% accuracy represent a {pct_improvement}% improvement from baseline')

Our 8.74% accuracy represent a 1148.57% improvement from baseline
