In [93]:
import pandas as pd
import numpy as np
import re



#For Naive Bayes 
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline

from sklearn import metrics
data = pd.read_csv("healthy.txt", header=0, \
                    delimiter="\t")

In [94]:
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General
5,CS-0006,98502,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,98115,blood pressure range chart,Resource,General
7,CS-0008,94122,ask webmd question,Navigational,General
8,CS-0009,90914,photos of bed bug bites,Resource,General
9,CS-0010,83756,lyme disease and symptoms,Direct Answer,Expert


In [95]:
data.Intent_1.value_counts()

Generic          43151
Direct Answer    37765
Guide             9138
Resource          7293
Transactional     1238
Navigational      1196
Name: Intent_1, dtype: int64

In [96]:
data.Keyword.describe()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99997 entries, 0 to 99996
Data columns (total 5 columns):
﻿CS_ID        99997 non-null object
Keyword_ID    10000 non-null float64
Keyword       99997 non-null object
Intent_1      99781 non-null object
Authority     99678 non-null object
dtypes: float64(1), object(4)
memory usage: 4.6+ MB


In [97]:
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,1,Expert
1,CS-0002,177403,normal blood pressure chart,4,General
2,CS-0003,159458,what is normal bloodpressure,1,General
3,CS-0004,152734,what is hyperlipidemia,0,Expert
4,CS-0005,139086,pill identifier,0,General
5,CS-0006,98502,blood clots in leg symptoms,1,Expert
6,CS-0007,98115,blood pressure range chart,4,General
7,CS-0008,94122,ask webmd question,3,General
8,CS-0009,90914,photos of bed bug bites,4,General
9,CS-0010,83756,lyme disease and symptoms,1,Expert


In [98]:
data.columns

Index([u'﻿CS_ID', u'Keyword_ID', u'Keyword', u'Intent_1', u'Authority'], dtype='object')

In [99]:
data.columns[0]

'\xef\xbb\xbfCS_ID'

In [100]:
#Drop unused columns, null rows, and reindex

health_data = data.drop(['\xef\xbb\xbfCS_ID', 'Keyword_ID', 'Authority'], axis=1)
health_data = health_data.dropna()
health_data = health_data.drop_duplicates('Keyword')
health_data_reindex = health_data.reset_index(drop=True)
health_data_reindex.info()
health_data_reindex.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98140 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     98140 non-null object
Intent_1    98140 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.2+ MB


Unnamed: 0,Keyword,Intent_1
0,signs of gallbladder problems,1
1,normal blood pressure chart,4
2,what is normal bloodpressure,1
3,what is hyperlipidemia,0
4,pill identifier,0
5,blood clots in leg symptoms,1
6,blood pressure range chart,4
7,ask webmd question,3
8,photos of bed bug bites,4
9,lyme disease and symptoms,1


In [101]:
#Make a new dataframe with just Generic and Direct Answer
da_gen_frame = health_data_reindex[health_data_reindex['Intent_1']<2].copy()
kws = da_gen_frame['Keyword']

da_gen_frame.info()
da_gen_frame.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79684 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     79684 non-null object
Intent_1    79684 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.8+ MB


0    42953
1    36731
Name: Intent_1, dtype: int64

In [102]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(da_gen_frame.Keyword, da_gen_frame.Intent_1, random_state=1)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(59763,) (59763,) (19921,) (19921,)


In [103]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=42)), ])


from sklearn.linear_model import SGDClassifier


text_clf = text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)

print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)

0.833140906581
[[9858  906]
 [2418 6739]]


In [89]:
cm = metrics.confusion_matrix(y_test,predictions)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,9860,904
Predicted Direct Answer,2422,6735


In [45]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        0.0       0.80      0.92      0.86     10764
        1.0       0.88      0.74      0.80      9157

avg / total       0.84      0.83      0.83     19921



In [76]:
#Errors CSV
incorrect_DAs = X_test[(y_test == 0) & (predictions == 1)]
incorrect_GENs = X_test[(y_test ==1) & (predictions == 0)]

with open('errors_output.txt', 'w') as w:
    for i in incorrect_DAs:
        w.write(i+'\t'+'False Direct Answer'+'\n')
    for g in incorrect_GENs:
        w.write(g+'\t'+'False Generic'+'\n')