In [11]:
#Import all the things
import pandas as pd    
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#Load the tab-separated keyword export; row 0 of the file supplies the column names
data = pd.read_csv(
    "health_data.tsv",
    sep="\t",
    header=0,
)

In [12]:
#Preview the first ten rows to check that the columns parsed as expected
data.head(10)

Unnamed: 0,CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS_0001,3486,poison oak pictures,Resource,General
1,CS_0002,3486,best foods for hypothyroidism,Direct Answer,Expert
2,CS_0003,3486,kidney stones in women,Generic,Expert
3,CS_0004,3484,what spider bites look like,Resource,General
4,CS_0005,3484,fifths disease picture of rash,Resource,General
5,CS_0006,3484,multi myeloma life expectancy,Direct Answer,Expert
6,CS_0007,3483,pictures of impetigo,Resource,General
7,CS_0008,3483,how serious is spinal stenosis,Direct Answer,Expert
8,CS_0009,3483,how to correct hammer toe,Guide,Expert
9,CS_0010,3482,ear wax removal peroxide,Direct Answer,Expert


In [6]:
#Drop columns that will not be used in the analysis
health_data = data.drop(["CS_ID", "Keyword_ID", "Authority"], axis=1)

#Change User Intent labels to numerical values.
#Series.map turns any label that is missing from this dict into NaN, so an
#unexpected intent shows up as NaN instead of silently receiving a wrong code.
intent_codes = {
    'Generic': 0,
    'Direct Answer': 1,
    'Guide': 2,
    'Navigational': 3,
    'Resource': 4,
    'Transactional': 5,
}
health_data['Intent_1'] = health_data['Intent_1'].map(intent_codes)

#Keep a handle on the keyword column for the cleaning step
keywords = health_data['Keyword']

#Preview the prepared frame. Only the last expression of a cell is rendered,
#so the preview has to sit at the end of the cell to be shown at all.
health_data.head(10)

In [14]:
#Function to clean individual keyword
def clean_keywords(keyword):
    """Normalise one search keyword for bag-of-words vectorisation.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, and English stopwords are removed.

    Parameters
    ----------
    keyword : str
        Raw keyword text. Missing values (None or NaN) are accepted and
        treated as empty text.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if nothing is left).
    """
    #Missing values cannot be passed to re.sub (it raises TypeError), so map them to empty text.
    #NaN is the only value that is not equal to itself, hence the self-comparison.
    if keyword is None or keyword != keyword:
        return ""

    #Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", keyword)

    #Lowercase & split the keyword on white space so we can iterate through its individual words
    tokenized_keyword = letters_only.lower().split()

    #Stopwords are terms that occur frequently but generally do not carry much meaning within a sentence/phrase (i.e. "the", "her")
    #The NLTK toolkit has a built-in stopword list. Fetching it and building the set is the same work for every keyword,
    #so it is done on the first call only and cached on the function for all later calls.
    stopword = getattr(clean_keywords, "_stopword_set", None)
    if stopword is None:
        stopword = set(stopwords.words("english"))
        clean_keywords._stopword_set = stopword

    #Remove the words contained in the stopword list
    meaningful_words = [kw for kw in tokenized_keyword if kw not in stopword]

    #Re-join the surviving words with single spaces
    return " ".join(meaningful_words)

#Apply function to all the keywords!
health_data['Keyword'] = keywords.apply(clean_keywords)

#Preview the column that was just written. `keywords` was captured before this assignment,
#so whether it reflects the cleaned text depends on how the installed pandas version shares column data.
health_data['Keyword'].head(10)

0              poison oak pictures
1        best foods hypothyroidism
2              kidney stones women
3           spider bites look like
4      fifths disease picture rash
5    multi myeloma life expectancy
6                pictures impetigo
7          serious spinal stenosis
8               correct hammer toe
9         ear wax removal peroxide
Name: Keyword, dtype: object

In [15]:
#Keep only the rows whose intent code is below 2, i.e. Generic (0) and Direct Answer (1)
health_data_gda = health_data.loc[health_data['Intent_1'] < 2]
health_data_gda.head(10)

Unnamed: 0,Keyword,Intent_1
1,best foods hypothyroidism,1
2,kidney stones women,0
5,multi myeloma life expectancy,1
7,serious spinal stenosis,1
9,ear wax removal peroxide,1
10,side effects diabetes,1
13,pancreas,1
14,causes blood clots,1
15,canker sores tongue,0
16,enlarged prostate,0


In [16]:
#Split into training and test sets
#train_test_split is already imported in the first cell, so it is not re-imported here.
#NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on newer versions the
#top-of-notebook import has to come from sklearn.model_selection instead.
#random_state pins the shuffle so the split is reproducible; no test_size is passed,
#so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    health_data_gda.Keyword, health_data_gda.Intent_1, random_state=1)

#A single-string print() call produces the same line on Python 2 and Python 3
print(" ".join(str(part.shape) for part in (X_train, y_train, X_test, y_test)))

(55627,) (55627,) (18543,) (18543,)


In [17]:
#Vectorize and turn into a document term matrix
#fit_transform learns the vocabulary from the training keywords only and returns their
#term-count matrix (one row per keyword, one column per vocabulary term)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(decode_error = 'ignore')
X_train_counts = vect.fit_transform(X_train)
X_train_counts.shape

(55627, 7798)

In [18]:
#Re-weight the raw term counts with TF-IDF; the transformer is fitted on the training matrix only
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_dtm = tfidf_transformer.fit_transform(X_train_counts)
train_dtm.shape

(55627, 7798)

In [19]:
#Create the multinomial Naive Bayes classifier and train it on the TF-IDF training matrix
nb = MultinomialNB().fit(train_dtm, y_train)

#fit() hands back the estimator itself; echo it so the cell still shows the model's parameters
nb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
#Transform test data as we did training data
#Only transform() is called here: the vocabulary and TF-IDF weights fitted on the
#training set are reused rather than re-learned from the test set
X_test_counts = vect.transform(X_test)
test_dtm = tfidf_transformer.transform(X_test_counts)

In [21]:
#Predict over test data
#The trailing bare `preds` echoes the predicted class codes as the cell output
preds = nb.predict(test_dtm)
preds

array([1, 0, 0, ..., 1, 1, 0])

In [27]:
#Report the overall accuracy and the confusion matrix for the held-out test set
#(confusion matrix rows are the true labels, columns are the predicted labels)
print("Accuracy Score: " + str(metrics.accuracy_score(y_test, preds)))
print("Confusion Matrix: " + "\n" + str(metrics.confusion_matrix(y_test, preds)))

Accuracy Score: 0.761203688724
Confusion Matrix: 
[[4495 3255]
 [1173 9620]]


In [28]:
# Inspect one kind of model error: test keywords whose true label is
# Direct Answer (1) but which the classifier predicted as Generic (0).
X_test.loc[(y_test == 1) & (preds == 0)]

45002                   ct scan detect
25300      ablation procedure recovery
31480                liver test levels
86166      heart transplant statistics
45951     large red blood cells cancer
20263                pain lung surgery
57343            coughing clear phlegm
9673            high white blood count
6171          radiculopathy definition
39314                 skin cancer hurt
47415    bifocal vs progressive lenses
84614               articles addiction
34976               bone density score
19964                  marfan syndrome
29808          dr rachael ross married
11796         free icd training online
70996                       pdr online
78053                  physician codes
9600     free medical terminology test
32014        prolozone therapy reviews
76192                  surgeon ratings
17139        ablation procedure uterus
14055    hepatitis c antibody positive
62485         lipid panel test results
53052                  physician codes
46955             laminec