In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

import re, os
import unicodedata
import json

import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime

from wordcloud import WordCloud

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# make sure these imports are before ->
import sys
directory_path = '/Users/jaredgodar/codeup-data-science/nlp-capstone/src'
sys.path.append(directory_path)

# <- these imports
import acquire
import prepare_jag

import spacy
pd.set_option('display.max_colwidth', None)

### Acquire and prepare data

We will be using the 1000 labeled notes in `test.csv`

In [3]:
# Load test data labels and merge on notes
df = pd.read_csv('../../data/train.csv')
notes = pd.read_csv('../../data/patient_notes.csv')
df = df.merge(notes, how='inner', on='pn_num')
df.rename(columns={'pn_history': 'original'}, inplace=True)
df = prepare_jag.prep_article_data(df, 'original', extra_words=[], exclude_words=['no'])

df.head(1)


Renamed 'pn_history' column to 'original'
Added a basic clean column lowercaseing and removing special characters
Added stemmed column with tokenized words and stopwords removed
Added lemmatized column with lemmatized words and stopwords removed
Data preparation complete


KeyError: "['case_num', 'feature_text'] not in index"

In [4]:
df.sample(1)

Unnamed: 0,id,case_num_x,pn_num,feature_num,annotation,location,case_num_y,original,clean,stemmed,lemmatized
6659,44005_409,4,44005,409,['45YO'],['0 4'],4,"45YO female c/o nervousness for the past few weeks....feels as if she is ""loosing my mind"". Seems to notice these feelings mostly on Sunday evenings as she prepares for work on Monday morning. Has had difficulty falling asleep and a decreased appetite as well. Has never had similar symptoms. Denies feeling as if she is an anxious person at baseline. No previous psych hx or care. Has been under a lot of stress lately since changing positions in her job and with demands at home. Denies recent illness, fevers, chills, CP, palpitations, SOB, Abd pain, N/V/D/C, bowel or bladder changes, rashes, or swelling. \r\nROS: Neg except as mentioned above. \r\nPMH: neg\r\nPSH: neg\r\nMeds: Tylenol\r\nAllergies: NKDA\r\nSocial: Denies tobacco or illicit drug use; Occasional/social alcohol use; Drinks 5-6cups of coffee per day, no recent increase in amount. Is married. English professor. Cares for mother and child at home. Assist w/ care of inlaws. \r\nFly hx: neg",45yo female c/o nervousness past weeks feels loosing mind seems notice feelings mostly sunday evenings prepares work monday morning difficulty falling asleep decreased appetite well never similar symptoms denies feeling anxious person baseline no previous psych hx care lot stress lately since changing positions job demands home denies recent illness fevers chills cp palpitations sob abd pain n/v/d/c bowel bladder changes rashes swelling ros neg except mentioned pmh neg psh neg meds tylenol allergies nkda social denies tobacco illicit drug use occasional/social alcohol use drinks 5-6cups coffee per day no recent increase amount married english professor cares mother child home assist w/ care inlaws fly hx neg,45yo femal c/o nervous past week feel loos mind seem notic feel mostli sunday even prepar work monday morn ha difficulti fall asleep decreas appetit well ha never similar symptom deni feel anxiou person baselin no previou psych hx care ha lot stress late sinc chang posit job demand home deni recent ill fever chill cp palpit sob abd pain n/v/d/c bowel bladder chang rash swell ro neg except mention abov pmh neg psh neg med tylenol allergi nkda social deni tobacco illicit drug use occasional/soci alcohol use drink 5-6cup coffe per day no recent increas amount marri english professor care mother child home assist w/ care inlaw fli hx neg,45yo female c/o nervousness past week feel loosing mind seems notice feeling mostly sunday evening prepares work monday morning ha difficulty falling asleep decreased appetite well ha never similar symptom denies feeling anxious person baseline no previous psych hx care ha lot stress lately since changing position job demand home denies recent illness fever chill cp palpitation sob abd pain n/v/d/c bowel bladder change rash swelling ro neg except mentioned pmh neg psh neg med tylenol allergy nkda social denies tobacco illicit drug use occasional/social alcohol use drink 5-6cups coffee per day no recent increase amount married english professor care mother child home assist w/ care inlaws fly hx neg


In [5]:
df.shape

(14300, 11)

### Split Data

- X is lemmatized text
- y is feature number
- convert y to object

In [6]:
# Split X Y
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.lemmatized)
y = df.feature_num.astype('object')

In [7]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state=123)

In [8]:
df.feature_num.value_counts()

0      100
611    100
605    100
606    100
607    100
      ... 
305    100
306    100
307    100
308    100
916    100
Name: feature_num, Length: 143, dtype: int64

All features appear to be equally represented, so pick any one fot baseline


In [9]:
# baseline
df[df['feature_num'] == 0].feature_num.value_counts()/sum(df.feature_num.value_counts())

0    0.006993
Name: feature_num, dtype: float64

Baseline is 0.70%


In [10]:
#Create evaluation dataframe
train = pd.DataFrame(dict(actual=y_train))
train['baseline']='11'


In [11]:
train.actual = train.actual.astype('category')
train.baseline = train.baseline.astype('category')


In [12]:
y_train = y_train.astype('category')

In [13]:
y_train

544       11
2856     201
1371     106
11268    800
7986     506
        ... 
10029    703
7847     511
3222     210
6862     402
9819     603
Name: feature_num, Length: 11440, dtype: category
Categories (143, int64): [0, 1, 2, 3, ..., 913, 914, 915, 916]

In [14]:
train.head()

Unnamed: 0,actual,baseline
544,11,11
2856,201,11
1371,106,11
11268,800,11
7986,506,11


In [15]:
train.dtypes

actual      category
baseline    category
dtype: object

In [16]:
# Calculate baseline model performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.baseline)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.actual, train.baseline))
print('---')
print(classification_report(train.actual, train.baseline))


Accuracy: 0.00%
---
Confusion Matrix
baseline  11
actual      
0         80
1         80
2         80
3         80
4         80
...       ..
912       80
913       80
914       80
915       80
916       80

[143 rows x 1 columns]
---


ValueError: Mix of label input types (string and number)

---

## Create Models

### Logistic Regression

In [None]:
y_train

In [None]:
# Make and fit the object
lm = LogisticRegression().fit(X_train, y_train)
# Use it to make predictions
train['lm_predicted'] = lm.predict(X_train)
# Asssess accuracy
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.lm_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.lm_predicted))

In [None]:
#cross_val_score(lm, X_train, y_train, cv = 5)


### Decision Tree

In [None]:
# Make and fit the object
dtc = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train)
# Use the object
train['dt_predicted'] = dtc.predict(X_train)
# Determine performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.dt_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.dt_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.dt_predicted))

### Random Forest

In [None]:
# Make and fit object
rf = RandomForestClassifier(bootstrap = True, 
                            class_weight = None, 
                            criterion = 'gini',
                            min_samples_leaf = 3,
                            n_estimators = 100,
                            max_depth = 8, 
                            random_state = 123).fit(X_train, y_train)
# Use it to make predictions
train['rf_predicted'] = rf.predict(X_train)
# Assess performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.rf_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.rf_predicted))

## KNN

In [None]:
# Make and fit the object
knn = KNeighborsClassifier(n_neighbors = 4).fit(X_train, y_train)
# Use the object 
train['knn_predicted'] = knn.predict(X_train)
# Evaluate performance
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.knn_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.knn_predicted))

### Support Vector Classifier

In [None]:
# Make and fit the object
svc = LinearSVC(random_state=0).fit(X_train, y_train)
# Use the object
train['svc_predicted'] = svc.predict(X_train)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.svc_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.svc_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.svc_predicted))

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB


In [None]:
X_train = X_train.toarray()


In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

train['gnb_predicted'] = gnb.predict(X_train)
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.gnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.gnb_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.gnb_predicted))

### Multimoial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Multinomial naive bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

train['mnb_predicted'] = mnb.predict(X_train)

print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.mnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.mnb_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.mnb_predicted))

---

Test knn model

In [None]:
y_test = y_test.astype('category')

In [None]:
X_test = X_test.toarray()


In [None]:
test.sample(100)

In [None]:
len(train)

In [None]:
len(train[train.actual==train.knn_predicted])

In [None]:
test[test.actual==test.gnb_predicted]

In [None]:
# Create testing dataframe
test = pd.DataFrame(dict(actual=y_test))
test['gnb_predicted'] = gnb.predict(X_test)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.gnb_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.gnb_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.gnb_predicted))

In [None]:
y_test

In [None]:
# Create testing dataframe
test = pd.DataFrame(dict(actual=y_test))
test['knn_predicted'] = knn.predict(X_test)
# Evaluate model
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.knn_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.knn_predicted))

In [None]:
# Calculate percent improvement from baseline accuracy (.67%)
pct_improvement = round(((8.74-0.70)/(0.70)*100), 2)
print(f'Our 8.74% accuracy represent a {pct_improvement}% improvement from baseline of 0.70')