In [4]:
import pandas as pd
import numpy as np
import re



#For Naive Bayes 
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

from sklearn import metrics
data = pd.read_csv("health_data2.tsv", header=0, \
                    delimiter="\t")

In [5]:
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword,Intent_1,Authority
0,CS-0001,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,normal blood pressure chart,Resource,General
2,CS-0003,what is normal bloodpressure,Direct Answer,General
3,CS-0004,what is hyperlipidemia,Generic,Expert
4,CS-0005,pill identifier,Generic,General
5,CS-0006,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,blood pressure range chart,Resource,General
7,CS-0008,ask webmd question,Navigational,General
8,CS-0009,photos of bed bug bites,Resource,General
9,CS-0010,lyme disease and symptoms,Direct Answer,Expert


In [6]:
data.Intent_1.value_counts()

Generic          43151
Direct Answer    37765
Guide             9138
Resource          7293
Transactional     1238
Navigational      1196
Name: Intent_1, dtype: int64

In [5]:
data.Keyword.describe()

count                         90835
unique                        55000
top       major depression symptoms
freq                              4
Name: Keyword, dtype: object

In [6]:
#Change User Intent labels to numerical values
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })

health_data = data.drop(['CS_ID', 'Keyword_ID', 'Authority'], axis=1)
health_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90835 entries, 0 to 90834
Data columns (total 2 columns):
Keyword     90835 non-null object
Intent_1    90835 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.1+ MB


In [14]:
health_data['duplicates'] = health_data.duplicated('Keyword')
health_data[health_data['duplicates']==True]

Unnamed: 0,Keyword,Intent_1,duplicates
8690,major depression symptoms,1,True
11062,signs of depression in men,1,True
40833,arthritis symptoms,1,True
40835,shingles causes and cures,1,True
40837,a1c average blood sugar,1,True
40839,how does shingles spread,1,True
40843,itar compliance checklist,1,True
40844,edgepark medical supplies,1,True
40846,severe bad breath causes,1,True
40849,causes of chronic dry cough,1,True


In [8]:
healthy = health_data.drop_duplicates('Keyword')
keywords = healthy['Keyword']



In [9]:
healthy.info()
healthy.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55000 entries, 0 to 65832
Data columns (total 3 columns):
Keyword       55000 non-null object
Intent_1      55000 non-null int64
duplicates    55000 non-null bool
dtypes: bool(1), int64(1), object(1)
memory usage: 1.3+ MB


1    21958
0    21868
2     5280
4     4631
5      642
3      621
Name: Intent_1, dtype: int64

In [10]:
#vect = CountVectorizer(decode_error = 'ignore', stop_words = 'english')
#vect.fit(keywords)
#kw_features = vect.get_feature_names()
#instantiate CountVectorizer, fit the data, get the features

#kw_dtm = vect.transform(keywords)
#kw_array = kw_dtm.toarray()


#kw_token_counts = pd.DataFrame({'token':kw_features, 'count':np.sum(kw_array, axis=0)})
#kw_token_counts.sort_values(by='count', ascending=False)

my_ks = range(2,16)
silhouettes = []
for k in my_ks:
    temp_kn = KMeans(k)
    temp_kn.fit(X)
    temp_labels = temp_kn.labels_
    new_score = silhouette_score(X,temp_labels,metric='euclidean')
    silhouettes.append(new_score)
silhouettes

In [160]:
#da_frame= healthy[healthy['Intent_1'] == 1]
#kws = da_frame['Keyword']

da_gen_frame = healthy[healthy['Intent_1']<2].copy()
kws = da_gen_frame['Keyword']

da_gen_frame.info()
da_gen_frame.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43826 entries, 1 to 65832
Data columns (total 2 columns):
Keyword     43826 non-null object
Intent_1    43826 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


1    21958
0    21868
Name: Intent_1, dtype: int64

#Adding this tanks precision from 78 - 74%
#Function to clean individual keyword
def clean_keywords(keyword):
    
    #Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", keyword)
    
    #Lowercase & splits the keyword on white space so we can iterate through individual words of the keyword
    tokenized_keyword = letters_only.lower().split()
    
    #Stopwords are terms that occur frequently but generally do not carry much meaning within sentence/phrase (i.e. "the", "her")
    #The NLTK toolkit has a built-in stopword list. This line grabs the list and turns it into a set for faster processing.
    stopword = set(stopwords.words("english"))
    
    #Removes words contain in the stopword list
    meaningful_words = [kw for kw in tokenized_keyword if not kw in stopword]
    
    #Replaces the whitespace
    return( " ".join( meaningful_words ))

#Apply function to all the keywords!
da_gen_frame['Keyword'] = kws.apply(clean_keywords)
kws.head(10)

In [161]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(da_gen_frame.Keyword, da_gen_frame.Intent_1, random_state=1)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(32869,) (32869,) (10957,) (10957,)


In [162]:
#TFIDF vectorize and count
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

train_dtm = tfidf_vectorizer.fit_transform(X_train) #fit the vectorizer to keywords
train_features = tfidf_vectorizer.get_feature_names() #list of features/vocab used in tf-idf matrix
train_array = X_train_dtm.toarray() #turn the matrix into an array

test_dtm = tfidf_vectorizer.transform(X_test)
test_array = X_test_dtm.toarray()
#kws_token_counts = pd.DataFrame({'token':X_train_features, 'count':np.sum(X_train_array, axis=0)}) #create a dataframe with token
#kws_token_counts.sort_values(by='count', ascending=False)


In [None]:
#Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)

In [147]:
# make predictions on test data using test_dtm
preds = nb.predict(test_dtm)
preds

array([1, 0, 0, ..., 1, 1, 0])

In [148]:
# compare predictions to true labels
from sklearn import metrics

print metrics.accuracy_score(y_test, preds)
print metrics.confusion_matrix(y_test, preds)


0.778406498129
[[4108 1367]
 [1061 4421]]


In [149]:
cm = metrics.confusion_matrix(y_test,preds)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,4108,1367
Predicted Direct Answer,1061,4421


In [150]:
X_test[(y_test == 0) & (preds == 1)]

38831     irregular menstruation period
19907          health benefits of cacao
54448                 normal gfr by age
60692          involuntary head shaking
55379                overactive thyroid
39287          magnesium as a sleep aid
60077              liver detoxification
37271            icy hot medicated roll
17604              lithotripsy recovery
47778         blood sugar normal ranges
26396       foods with fiber for adults
19757          bronchitis home remedies
60586           types of brain seizures
44846      denture cream zinc poisoning
55756         what is black cohosh root
36003                antibiotic listing
19317                     flu shot 2014
52211             cherry juice and gout
3124                 lung cancer stages
54781              severe carpal tunnel
19798                      uvula cancer
34769                 senokot laxatives
33672            herbal muscle relaxers
58991          cure for liver cirrhosis
37406                  hiv aids summary
