In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# import from model_selection when it exists and fall back only on older installs.
try:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score

from sklearn import metrics
# Load the tab-separated keyword dataset; row 0 of the file supplies the column names.
data = pd.read_csv("health_data.tsv", sep="\t", header=0)

In [7]:
# Preview the first ten rows of the freshly loaded table.
data.head(n=10)

Unnamed: 0,CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS_0001,3486,poison oak pictures,Resource,General
1,CS_0002,3486,best foods for hypothyroidism,Direct Answer,Expert
2,CS_0003,3486,kidney stones in women,Generic,Expert
3,CS_0004,3484,what spider bites look like,Resource,General
4,CS_0005,3484,fifths disease picture of rash,Resource,General
5,CS_0006,3484,multi myeloma life expectancy,Direct Answer,Expert
6,CS_0007,3483,pictures of impetigo,Resource,General
7,CS_0008,3483,how serious is spinal stenosis,Direct Answer,Expert
8,CS_0009,3483,how to correct hammer toe,Guide,Expert
9,CS_0010,3482,ear wax removal peroxide,Direct Answer,Expert


In [8]:
#Remove the identifier and authority columns; they are not used from here on
health_data = data.drop(labels=["CS_ID", "Keyword_ID", "Authority"], axis=1)

#Encode each user-intent label as an integer class id
#(a label that is not listed in the mapping becomes NaN)
health_data['Intent_1'] = health_data['Intent_1'].map({
    'Resource': 0,
    'Direct Answer': 1,
    'Guide': 2,
    'Navigational': 3,
    'Generic': 4,
    'Transactional': 5,
})


In [9]:
#Pull out the keyword column and preview its first 20 entries
keywords = health_data['Keyword']
print(keywords.iloc[:20])

0                poison oak pictures
1      best foods for hypothyroidism
2             kidney stones in women
3        what spider bites look like
4     fifths disease picture of rash
5      multi myeloma life expectancy
6               pictures of impetigo
7     how serious is spinal stenosis
8          how to correct hammer toe
9           ear wax removal peroxide
10          side effects of diabetes
11           pictures of foot rashes
12       zenni optical discount code
13                 where is pancreas
14           what causes blood clots
15        canker sores on the tongue
16         what is enlarged prostate
17      what is bursitis in the knee
18          what is cataract surgery
19         what is plantar fasciitis
Name: Keyword, dtype: object


In [10]:
#Stopwords stripped from every keyword. Built once at definition time rather
#than on each call, and named so it does not shadow nltk's `stopwords` import.
KEYWORD_STOPWORDS = frozenset([
    'a', 'about', 'above', 'after', 'again', 'against', 'aint', 'all', 'am',
    'an', 'and', 'any', 'anybody', 'anyone', 'are', 'arent', 'as', 'at', 'be',
    'because', 'been', 'before', 'being', 'between', 'both', 'but', 'by',
    'can', 'cant', 'could', 'couldnt', 'couldve', 'd', 'didnt', 'doesnt',
    'doing', 'don', 'dont', 'during', 'each', 'early', 'everybody',
    'everyone', 'fact', 'few', 'for', 'from', 'further', 'gotta', 'had',
    'hadnt', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hed', 'her',
    'here', 'hers', 'herself', 'hes', 'him', 'himself', 'his', 'i', 'if',
    'im', 'in', 'into', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'itself',
    'ive', 'lets', 'll', 'm', 'maam', 'many', 'me', 'mean', 'meaning', 'more',
    'most', 'much', 'mustnt', 'mustve', 'my', 'myself', 'need', 'neednt',
    'no', 'nobody', 'noone', 'nor', 'not', 'notve', 'now', 'of', 'off', 'on',
    'once', 'one', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
    'over', 'own', 're', 's', 'same', 'shall', 'shant', 'she', 'shes',
    'should', 'shouldnt', 'shouldve', 'so', 'some', 'somebody', 'someone',
    'such', 't', 'than', 'that', 'thats', 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'thered', 'therere', 'theres', 'these',
    'they', 'theyd', 'theyll', 'theyre', 'theyve', 'this', 'those', 'through',
    'too', 'us', 've', 'very', 'was', 'wasnt', 'we', 'were', 'werent', 'weve',
    'what', 'whatll', 'whatre', 'whats', 'whatve', 'which', 'while', 'with',
    'wont', 'would', 'wouldnt', 'wouldve', 'yet', 'you', 'youd', 'youll',
    'your', 'youre', 'yours', 'yourself', 'yourselves', 'youve', 'below',
    'best', 'down', 'fun', 'interesting', 'just', 'might', 'mightnt',
    'mightve', 'new', 'top', 'under', 'until', 'up', 'will',
])

def clean_keywords(keyword):
    """Return `keyword` with its stopword tokens removed.

    The keyword is split on whitespace, every token found in
    KEYWORD_STOPWORDS is dropped (the comparison is case-sensitive), and the
    surviving tokens are joined back together with single spaces.

    A value that has no ``split`` method -- for example None, or the float
    NaN pandas stores for a missing cell -- yields an empty string instead of
    raising AttributeError.
    """
    try:
        tokens = keyword.split()
    except AttributeError:
        # Not a string: treat it as a keyword with no usable words.
        return ""
    return " ".join(tok for tok in tokens if tok not in KEYWORD_STOPWORDS)

#Clean every keyword and write the result back onto the frame
health_data['Keyword'] = keywords.apply(clean_keywords)
#Re-bind `keywords` to the cleaned column: the old Series was taken before the
#assignment above, and whether it still shares data with the frame depends on
#the pandas version, so the preview must not rely on it
keywords = health_data['Keyword']
print(keywords.head(20))

0               poison oak pictures
1              foods hypothyroidism
2               kidney stones women
3            spider bites look like
4       fifths disease picture rash
5     multi myeloma life expectancy
6                 pictures impetigo
7       how serious spinal stenosis
8         how to correct hammer toe
9          ear wax removal peroxide
10            side effects diabetes
11             pictures foot rashes
12      zenni optical discount code
13                   where pancreas
14               causes blood clots
15              canker sores tongue
16                enlarged prostate
17                    bursitis knee
18                 cataract surgery
19                plantar fasciitis
Name: Keyword, dtype: object


In [13]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer model_selection and fall back only when running on an older install.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split keywords (features) and intent ids (labels) into train and test sets.
# No test_size is passed, so the library default applies; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    health_data.Keyword, health_data.Intent_1, random_state=1)

In [15]:
# Use the call form of print, as the other cells do, so this cell runs on
# both Python 2 and Python 3 (the statement form is a SyntaxError on 3).
print(X_train.shape)
print(X_train[:20])

(68126,)
48440                     good diabetes
84236      urinary tract infection food
34039    bladder inflammation treatment
82000            side effects hida scan
54315                             merca
68830                 seizures sleeping
19104                  spider look like
89192                proxy form example
79518          normal vital signs chart
13616      cervical fusion side effects
10312            signs viral infections
48174            green tea side effects
7034             gallstone symptoms men
4390        thyroid problems older cats
67719               eggs good diabetics
60482     shoulder joint pain treatment
53808               bowel movement size
17901          pre diabetes information
28893             vietnam ptsd symptoms
18789                list foot problems
Name: Keyword, dtype: object


TypeError: get_feature_names() takes exactly 1 argument (2 given)