In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt

import re, os
import unicodedata
import json

import nltk
import nltk.sentiment
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from time import strftime

from wordcloud import WordCloud

from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

import acquire

import spacy
pd.set_option('display.max_colwidth', None)

import prepare_jag

### Acquire and prepare data

We will be using the 1000 labeled notes in `train.csv`

In [2]:
# Load the labeled rows from train.csv and attach the note text from patient_notes.csv
df = pd.read_csv('train.csv')
notes = pd.read_csv('patient_notes.csv')
df = (
    df.merge(notes, how='inner', on='pn_num')
      .rename(columns={'pn_history': 'original'})
)
# NOTE(review): the saved output of this cell ends in
# KeyError "['case_num', 'feature_text'] not in index" — merging on pn_num alone may
# leave those columns missing or suffixed; verify what prep_article_data expects.
df = prepare_jag.prep_article_data(df, 'original', extra_words=[], exclude_words=['no'])

df.head(1)


Renamed 'pn_history' column to 'original'
Added a basic clean column lowercaseing and removing special characters
Added stemmed column with tokenized words and stopwords removed
Added lemmatized column with lemmatized words and stopwords removed
Data preparation complete


KeyError: "['case_num', 'feature_text'] not in index"

In [None]:
# Spot-check one random row of the frame returned by prep_article_data
df.sample(1)

### Split Data

- X is lemmatized text
- y is feature number
- convert y to object

In [None]:
# Build the feature matrix (X) and the target (y)
# NOTE(review): the vectorizer is fit on every row before the train/test split below,
# so held-out documents contribute to the vocabulary and IDF weights — consider
# fitting on the training rows only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['lemmatized'])  # TF-IDF features from the lemmatized text
y = df['feature_num'].astype(object)       # feature number, stored as object dtype

In [None]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=123,
)

In [None]:
# Count the rows per feature_num to see how the classes are distributed
df.feature_num.value_counts()

All features appear to be equally represented, so pick any one for the baseline


In [None]:
# Baseline: share of all labeled rows that belong to feature 0
feature_counts = df['feature_num'].value_counts()
feature_counts[feature_counts.index == 0] / feature_counts.sum()

Baseline is 0.70%


In [None]:
# Create the evaluation dataframe: one row per training sample, holding the actual label
train = pd.DataFrame(dict(actual=y_train))
# The baseline predicts one constant feature for every row. Use the numeric label 0 —
# the same feature whose share is computed in the baseline cell above — rather than the
# string '11', which can never equal a numeric feature_num and so always scores 0%.
train['baseline'] = 0


In [None]:
# Store both evaluation columns as pandas categoricals
for column in ('actual', 'baseline'):
    train[column] = train[column].astype('category')


In [None]:
# Cast the training labels to pandas' category dtype
y_train = y_train.astype('category')

In [None]:
# Display the training labels (category dtype after the cast above)
y_train

In [None]:
# Preview the first rows of the evaluation frame
train.head()

In [None]:
# Show the dtype of each evaluation-frame column
train.dtypes

In [None]:
# Score the constant baseline prediction against the actual training labels
print(
    f'Accuracy: {accuracy_score(train["actual"], train["baseline"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['actual'], train['baseline']))
print('---')
print(classification_report(train['actual'], train['baseline']))


---

## Create Models

### Logistic Regression

In [None]:
# Display the training labels before fitting
# NOTE(review): repeats the earlier y_train display; likely leftover debugging
y_train

In [None]:
# Fit a logistic regression on the training features
lm = LogisticRegression()
lm.fit(X_train, y_train)
# Record its predictions on the training split next to the actual labels
train['lm_predicted'] = lm.predict(X_train)
# Assess accuracy, confusion matrix and per-class metrics on the training split
print(
    f'Accuracy: {accuracy_score(train["actual"], train["lm_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['lm_predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['lm_predicted']))

In [None]:
#cross_val_score(lm, X_train, y_train, cv = 5)


### Decision Tree

In [None]:
# Fit a depth-2 decision tree on the training features
dtc = DecisionTreeClassifier(max_depth=2)
dtc.fit(X_train, y_train)
# Record its predictions on the training split
train['dt_predicted'] = dtc.predict(X_train)
# Report accuracy, confusion matrix and per-class metrics on the training split
print(
    f'Accuracy: {accuracy_score(train["actual"], train["dt_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['dt_predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['dt_predicted']))

### Random Forest

In [None]:
# Fit a 100-tree random forest (depth capped at 8, at least 3 samples per leaf)
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_leaf=3,
    criterion='gini',
    bootstrap=True,
    class_weight=None,
    random_state=123,
)
rf.fit(X_train, y_train)
# Record its predictions on the training split
train['rf_predicted'] = rf.predict(X_train)
# Report accuracy, confusion matrix and per-class metrics on the training split
print(
    f'Accuracy: {accuracy_score(train["actual"], train["rf_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['rf_predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['rf_predicted']))

### KNN

In [None]:
# Fit a 4-nearest-neighbours classifier on the training features
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
# Record its predictions on the training split
train['knn_predicted'] = knn.predict(X_train)
# Report accuracy, confusion matrix and per-class metrics on the training split
print(
    f'Accuracy: {accuracy_score(train["actual"], train["knn_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['knn_predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['knn_predicted']))

### Support Vector Classifier

In [None]:
# Fit a linear support vector classifier on the training features
svc = LinearSVC(random_state=0)
svc.fit(X_train, y_train)
# Record its predictions on the training split
train['svc_predicted'] = svc.predict(X_train)
# Report accuracy, confusion matrix and per-class metrics on the training split
print(
    f'Accuracy: {accuracy_score(train["actual"], train["svc_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['svc_predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['svc_predicted']))

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB


In [None]:
# Convert the sparse TF-IDF training matrix to a dense array for the Naive Bayes cells
# below. Guarded so re-running this cell does not fail once X_train is already an
# ndarray (which has no .toarray method).
if hasattr(X_train, 'toarray'):
    X_train = X_train.toarray()


In [None]:
# Fit Gaussian Naive Bayes on the training features
gnb = GaussianNB().fit(X_train, y_train)

# Record its predictions on the training split and report the usual metrics
train['gnb_predicted'] = gnb.predict(X_train)
print(
    f'Accuracy: {accuracy_score(train["actual"], train["gnb_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['gnb_predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['gnb_predicted']))

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Fit Multinomial Naive Bayes on the training features
mnb = MultinomialNB().fit(X_train, y_train)

# Record its predictions on the training split
train['mnb_predicted'] = mnb.predict(X_train)

# Report accuracy, confusion matrix and per-class metrics on the training split
print(
    f'Accuracy: {accuracy_score(train["actual"], train["mnb_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(train['mnb_predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['mnb_predicted']))

---

Test knn model

In [None]:
# Cast the test labels to category dtype, mirroring the cast applied to y_train
y_test = y_test.astype('category')

In [None]:
# Convert the sparse TF-IDF test matrix to a dense array, matching the dense X_train.
# Guarded so re-running this cell does not fail once X_test is already an ndarray
# (which has no .toarray method).
if hasattr(X_test, 'toarray'):
    X_test = X_test.toarray()


In [None]:
# `test` is otherwise only created in a later cell, so build it here from y_test to keep
# the notebook runnable top-to-bottom from a fresh kernel.
test = pd.DataFrame(dict(actual=y_test))
# Cap the sample size at the frame length: sample(100) raises on a shorter frame.
test.sample(min(100, len(test)))

In [None]:
# Number of rows in the training evaluation frame
len(train)

In [None]:
# Count the training rows where the KNN prediction equals the actual label
int((train['actual'] == train['knn_predicted']).sum())

In [None]:
# Rows of the test split that Gaussian Naive Bayes labels correctly.
# `test` and its gnb_predicted column are otherwise only created in the next cell, so
# build them here to keep the notebook runnable from a fresh kernel.
test = pd.DataFrame(dict(actual=y_test))
test['gnb_predicted'] = gnb.predict(X_test)
test[test.actual == test.gnb_predicted]

In [None]:
# Evaluate Gaussian Naive Bayes on the held-out split
test = pd.DataFrame({'actual': y_test})
test['gnb_predicted'] = gnb.predict(X_test)
# Report accuracy, confusion matrix and per-class metrics on the test split
print(
    f'Accuracy: {accuracy_score(test["actual"], test["gnb_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(test['gnb_predicted'], test['actual']))
print('---')
print(classification_report(test['actual'], test['gnb_predicted']))

In [None]:
# Display the test labels
y_test

In [None]:
# Evaluate the KNN model on the held-out split (rebuilds `test` from scratch)
test = pd.DataFrame({'actual': y_test})
test['knn_predicted'] = knn.predict(X_test)
# Report accuracy, confusion matrix and per-class metrics on the test split
print(
    f'Accuracy: {accuracy_score(test["actual"], test["knn_predicted"]):.2%}',
    '---',
    'Confusion Matrix',
    sep='\n',
)
print(pd.crosstab(test['knn_predicted'], test['actual']))
print('---')
print(classification_report(test['actual'], test['knn_predicted']))

In [None]:
# Calculate the percent improvement of the KNN test accuracy over the baseline.
# Both figures are computed from the notebook's own objects instead of being typed in
# by hand, so the summary cannot drift from the results shown above.
# Baseline: share of labeled rows belonging to feature 0 (same quantity as the baseline cell).
baseline_pct = (df.feature_num == 0).sum() / df.feature_num.count() * 100
# Model: accuracy of the KNN predictions stored on the test evaluation frame.
knn_test_pct = accuracy_score(test.actual, test.knn_predicted) * 100
pct_improvement = round((knn_test_pct - baseline_pct) / baseline_pct * 100, 2)
print(f'Our {knn_test_pct:.2f}% accuracy represents a {pct_improvement}% improvement from baseline of {baseline_pct:.2f}%')