In [7]:
import pandas as pd
import numpy as np
import re



#For Naive Bayes 
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

from sklearn import metrics
data = pd.read_csv("health_data.tsv", header=0, \
                    delimiter="\t")

In [8]:
data.head(10)

Unnamed: 0,CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS_0001,3486,poison oak pictures,Resource,General
1,CS_0002,3486,best foods for hypothyroidism,Direct Answer,Expert
2,CS_0003,3486,kidney stones in women,Generic,Expert
3,CS_0004,3484,what spider bites look like,Resource,General
4,CS_0005,3484,fifths disease picture of rash,Resource,General
5,CS_0006,3484,multi myeloma life expectancy,Direct Answer,Expert
6,CS_0007,3483,pictures of impetigo,Resource,General
7,CS_0008,3483,how serious is spinal stenosis,Direct Answer,Expert
8,CS_0009,3483,how to correct hammer toe,Guide,Expert
9,CS_0010,3482,ear wax removal peroxide,Direct Answer,Expert


In [9]:
data.Intent_1.value_counts()

Direct Answer    43198
Generic          30972
Guide             7642
Resource          6948
Transactional     1137
Navigational       938
Name: Intent_1, dtype: int64

In [13]:
data.Keyword.describe()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90835 entries, 0 to 90834
Data columns (total 5 columns):
CS_ID         90835 non-null object
Keyword_ID    5000 non-null float64
Keyword       90835 non-null object
Intent_1      90835 non-null int64
Authority     90185 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ MB


In [11]:
#Change User Intent labels to numerical values
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })

health_data = data.drop(['CS_ID', 'Keyword_ID', 'Authority'], axis=1)
health_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90835 entries, 0 to 90834
Data columns (total 2 columns):
Keyword     90835 non-null object
Intent_1    90835 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.1+ MB


In [12]:
#health_data['duplicates'] = health_data.duplicated('Keyword')
health_data.head(20)

Unnamed: 0,Keyword,Intent_1
0,poison oak pictures,4
1,best foods for hypothyroidism,1
2,kidney stones in women,0
3,what spider bites look like,4
4,fifths disease picture of rash,4
5,multi myeloma life expectancy,1
6,pictures of impetigo,4
7,how serious is spinal stenosis,1
8,how to correct hammer toe,2
9,ear wax removal peroxide,1


In [36]:
healthy = health_data.drop_duplicates('Keyword')
keywords = healthy['Keyword']

In [37]:
healthy.info()
healthy.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55000 entries, 0 to 65832
Data columns (total 2 columns):
Keyword     55000 non-null object
Intent_1    55000 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


1    21958
0    21868
2     5280
4     4631
5      642
3      621
Name: Intent_1, dtype: int64

In [38]:
da_gen_frame = healthy[healthy['Intent_1']<2].copy()
kws = da_gen_frame['Keyword']

da_gen_frame.info()
da_gen_frame.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43826 entries, 1 to 65832
Data columns (total 2 columns):
Keyword     43826 non-null object
Intent_1    43826 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


1    21958
0    21868
Name: Intent_1, dtype: int64

#Function to clean individual keywords
def clean_keywords(keyword):
    stopwords = {'a', 'about', 'above', 'after', 'again', 'against', 'aint', 'all', 'am', 'an', 'and', 'any', 'anybody', 'anyone', 'are', 'arent', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'between', 'both', 'but', 'by', 'can', 'cant', 'could', 'couldnt', 'couldve', 'd', 'didnt', 'doesnt', 'doing', 'don', 'dont', 'during', 'each', 'early', 'everybody', 'everyone', 'fact', 'few', 'for', 'from', 'further', 'gotta', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hed', 'her', 'here', 'hers', 'herself', 'hes', 'him', 'himself', 'his', 'i', 'if', 'im', 'in', 'into', 'is', 'isnt', 'it', 'itd', 'itll', 'its', 'itself', 'ive', 'lets', 'll', 'm', 'maam', 'many', 'me', 'mean', 'meaning', 'more', 'most', 'much', 'mustnt', 'mustve', 'my', 'myself', 'need', 'neednt', 'no', 'nobody', 'noone', 'nor', 'not', 'notve', 'now', 'of', 'off', 'on', 'once', 'one', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shall', 'shant', 'she', 'shes', 'should', 'shouldnt', 'shouldve', 'so', 'some', 'somebody', 'someone', 'such', 't', 'than', 'that', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'thered', 'therere', 'theres', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'this', 'those', 'through', 'too', 'us', 've', 'very', 'was', 'wasnt', 'we', 'were', 'werent', 'weve', 'what', 'whatll', 'whatre', 'whats', 'whatve', 'which', 'while', 'with', 'wont', 'would', 'wouldnt', 'wouldve', 'yet', 'you', 'youd', 'youll', 'your', 'youre', 'yours', 'yourself', 'yourselves', 'youve', 'below', 'best', 'down', 'fun', 'interesting', 'just', 'might', 'mightnt', 'mightve', 'new', 'top', 'under', 'until', 'up', 'will'}
    tokenized_keyword = keyword.split()
    meaningful_words = [kw for kw in tokenized_keyword if not kw in stopwords]
    return( " ".join( meaningful_words ))

da_gen_frame['Keyword'] = kws.apply(clean_keywords)
da_gen_frame['Keyword'].head(20)

In [39]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(da_gen_frame.Keyword, da_gen_frame.Intent_1, random_state=1)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(32869,) (32869,) (10957,) (10957,)


In [40]:
#TFIDF vectorize and count
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

train_dtm = tfidf_vectorizer.fit_transform(X_train) #fit the vectorizer to keywords
train_features = tfidf_vectorizer.get_feature_names() #list of features/vocab used in tf-idf matrix
train_array = train_dtm.toarray() #turn the matrix into an array

test_dtm = tfidf_vectorizer.transform(X_test)
test_array = test_dtm.toarray()
#kws_token_counts = pd.DataFrame({'token':X_train_features, 'count':np.sum(X_train_array, axis=0)}) #create a dataframe with token
#kws_token_counts.sort_values(by='count', ascending=False)


In [41]:
#Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
# make predictions on test data using test_dtm
preds = nb.predict(test_dtm)
preds

array([1, 0, 0, ..., 1, 1, 0])

In [43]:
# compare predictions to true labels
from sklearn import metrics

print metrics.accuracy_score(y_test, preds)
print metrics.confusion_matrix(y_test, preds)


0.778406498129
[[4108 1367]
 [1061 4421]]


In [56]:
cm = metrics.confusion_matrix(y_test,preds)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,4108,1367
Predicted Direct Answer,1061,4421


In [94]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)

print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)

0.778406498129
[[4108 1367]
 [1061 4421]]


In [98]:
#SGDClassifier
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=42)), ])


from sklearn.linear_model import SGDClassifier


text_clf = text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)

print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)

0.838185634754
[[4955  520]
 [1253 4229]]


In [100]:
cm = metrics.confusion_matrix(y_test,preds)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,4108,1367
Predicted Direct Answer,1061,4421


In [99]:
X_test[(y_test == 1) & (preds == 0)]

5720       what is a normal cholesterol
38955         bilirubin levels in women
19083              what is a normal inr
35329        swelling in one ankle only
5934           normal troponin 1 levels
15314                what is a erythema
16819         health care questionnaire
23948             breast cancer article
12561         diet after hernia surgery
13513          what is hansen's disease
7207                what is glutathione
39879               pacemaker questions
11979      stomach bloating weight gain
16096         low white blood platelets
10850         rite aid shingles vaccine
13178            leg pain heart disease
18330          municipal code violation
13903                 what is pollution
36789    back pain and kidney infection
24087             lung and liver cancer
23962         shoulder bursitis therapy
11491     prediabetic blood sugar level
5431            psa test results by age
6062       stage 4 bone cancer survival
12372            swollen parotid glands


In [45]:
# Output false negatives/positives to file? X_test[(y_test == 0) & (preds == 1)]

In [88]:
#SGDClassifier Gridsearch
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1,3)],'tfidf__use_idf': (True, False),'clf__alpha': (1e-2, 1e-3),}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

gs_clf = gs_clf.fit(X_train[:400], y_train[:400])

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 1)
