In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

from sklearn import metrics
data = pd.read_csv("health_data.tsv", header=0, \
                    delimiter="\t")

In [2]:
data.head(10)

Unnamed: 0,CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS_0001,3486,poison oak pictures,Resource,General
1,CS_0002,3486,best foods for hypothyroidism,Direct Answer,Expert
2,CS_0003,3486,kidney stones in women,Generic,Expert
3,CS_0004,3484,what spider bites look like,Resource,General
4,CS_0005,3484,fifths disease picture of rash,Resource,General
5,CS_0006,3484,multi myeloma life expectancy,Direct Answer,Expert
6,CS_0007,3483,pictures of impetigo,Resource,General
7,CS_0008,3483,how serious is spinal stenosis,Direct Answer,Expert
8,CS_0009,3483,how to correct hammer toe,Guide,Expert
9,CS_0010,3482,ear wax removal peroxide,Direct Answer,Expert


In [3]:
data.Intent_1.value_counts()

Direct Answer    43198
Generic          30972
Guide             7642
Resource          6948
Transactional     1137
Navigational       938
Name: Intent_1, dtype: int64

In [4]:
data.Keyword.describe()

count                         90835
unique                        55000
top       major depression symptoms
freq                              4
Name: Keyword, dtype: object

In [5]:
#Change User Intent labels to numerical values
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })

health_data = data.drop(['CS_ID', 'Keyword_ID', 'Authority'], axis=1)
health_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90835 entries, 0 to 90834
Data columns (total 2 columns):
Keyword     90835 non-null object
Intent_1    90835 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.1+ MB


In [6]:
#health_data['duplicates'] = health_data.duplicated('Keyword')
health_data.head(20)

Unnamed: 0,Keyword,Intent_1
0,poison oak pictures,4
1,best foods for hypothyroidism,1
2,kidney stones in women,0
3,what spider bites look like,4
4,fifths disease picture of rash,4
5,multi myeloma life expectancy,1
6,pictures of impetigo,4
7,how serious is spinal stenosis,1
8,how to correct hammer toe,2
9,ear wax removal peroxide,1


In [7]:
healthy = health_data.drop_duplicates('Keyword')
keywords = healthy['Keyword']



In [8]:
healthy.info()
healthy.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55000 entries, 0 to 65832
Data columns (total 2 columns):
Keyword     55000 non-null object
Intent_1    55000 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


1    21958
0    21868
2     5280
4     4631
5      642
3      621
Name: Intent_1, dtype: int64

In [9]:
#vect = CountVectorizer(decode_error = 'ignore', stop_words = 'english')
#vect.fit(keywords)
#kw_features = vect.get_feature_names()
#instantiate CountVectorizer, fit the data, get the features

#kw_dtm = vect.transform(keywords)
#kw_array = kw_dtm.toarray()


#kw_token_counts = pd.DataFrame({'token':kw_features, 'count':np.sum(kw_array, axis=0)})
#kw_token_counts.sort_values(by='count', ascending=False)

my_ks = range(2,16)
silhouettes = []
for k in my_ks:
    temp_kn = KMeans(k)
    temp_kn.fit(X)
    temp_labels = temp_kn.labels_
    new_score = silhouette_score(X,temp_labels,metric='euclidean')
    silhouettes.append(new_score)
silhouettes

In [10]:
#da_frame= healthy[healthy['Intent_1'] == 1]
#kws = da_frame['Keyword']

da_gen_frame = healthy[healthy['Intent_1']<2].copy()
kws = da_gen_frame['Keyword']

da_gen_frame.info()
da_gen_frame.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43826 entries, 1 to 65832
Data columns (total 2 columns):
Keyword     43826 non-null object
Intent_1    43826 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


1    21958
0    21868
Name: Intent_1, dtype: int64

#Adding this tanks precision from 78 - 74%
#Function to clean individual keyword
def clean_keywords(keyword):
    
    #Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", keyword)
    
    #Lowercase & splits the keyword on white space so we can iterate through individual words of the keyword
    tokenized_keyword = letters_only.lower().split()
    
    #Stopwords are terms that occur frequently but generally do not carry much meaning within sentence/phrase (i.e. "the", "her")
    #The NLTK toolkit has a built-in stopword list. This line grabs the list and turns it into a set for faster processing.
    stopword = set(stopwords.words("english"))
    
    #Removes words contain in the stopword list
    meaningful_words = [kw for kw in tokenized_keyword if not kw in stopword]
    
    #Replaces the whitespace
    return( " ".join( meaningful_words ))

#Apply function to all the keywords!
da_gen_frame['Keyword'] = kws.apply(clean_keywords)
kws.head(10)

In [161]:
#from sklearn.cross_validation import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(da_gen_frame.Keyword, da_gen_frame.Intent_1, random_state=1)
#print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(32869,) (32869,) (10957,) (10957,)


In [15]:
#TFIDF vectorize and count
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

kws_dtm = tfidf_vectorizer.fit_transform(kws) #fit the vectorizer to keywords
kws_features = tfidf_vectorizer.get_feature_names() #list of features/vocab used in tf-idf matrix

X = kws_dtm.toarray() #turn the matrix into an array
y = da_gen_frame['Intent_1']

In [None]:
#Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)

In [147]:
# make predictions on test data using test_dtm
preds = nb.predict(test_dtm)
preds

array([1, 0, 0, ..., 1, 1, 0])

In [148]:
# compare predictions to true labels
from sklearn import metrics

print metrics.accuracy_score(y_test, preds)
print metrics.confusion_matrix(y_test, preds)


0.778406498129
[[4108 1367]
 [1061 4421]]


In [149]:
cm = metrics.confusion_matrix(y_test,preds)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,4108,1367
Predicted Direct Answer,1061,4421


In [150]:
X_test[(y_test == 0) & (preds == 1)]

38831     irregular menstruation period
19907          health benefits of cacao
54448                 normal gfr by age
60692          involuntary head shaking
55379                overactive thyroid
39287          magnesium as a sleep aid
60077              liver detoxification
37271            icy hot medicated roll
17604              lithotripsy recovery
47778         blood sugar normal ranges
26396       foods with fiber for adults
19757          bronchitis home remedies
60586           types of brain seizures
44846      denture cream zinc poisoning
55756         what is black cohosh root
36003                antibiotic listing
19317                     flu shot 2014
52211             cherry juice and gout
3124                 lung cancer stages
54781              severe carpal tunnel
19798                      uvula cancer
34769                 senokot laxatives
33672            herbal muscle relaxers
58991          cure for liver cirrhosis
37406                  hiv aids summary
