In [44]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn import metrics


In [45]:
data = pd.read_csv("healthy.txt", header=0, \
                    delimiter="\t")
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General
5,CS-0006,98502,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,98115,blood pressure range chart,Resource,General
7,CS-0008,94122,ask webmd question,Navigational,General
8,CS-0009,90914,photos of bed bug bites,Resource,General
9,CS-0010,83756,lyme disease and symptoms,Direct Answer,Expert


In [46]:
#The "Intent_1" column contains the values that we'll be working with for the classifier
data.Intent_1.value_counts()

Generic          43151
Direct Answer    37765
Guide             9138
Resource          7293
Transactional     1238
Navigational      1196
Name: Intent_1, dtype: int64

In [47]:
#Change the values in the 'Intent_1' column to numeric labels
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })

In [48]:
#Some data cleanup: unused columns, null rows, remove duplicate keywords, and reindex the data.
health_data = data.drop(['\xef\xbb\xbfCS_ID', 'Keyword_ID', 'Authority'], axis=1)
health_data = health_data.dropna()
health_data = health_data.drop_duplicates('Keyword')
health_data_reindex = health_data.reset_index(drop=True)

In [49]:
#Text Pre-Processing
kws = health_data_reindex['Keyword']

def clean_keywords(keyword):
    stopwords = set('i me you he she it they them this that these those a an the to of in for and on'.split())
    letters_only = re.sub("[^a-zA-Z]", " ", keyword) 
    tokenized_keyword = letters_only.split()
    meaningful_words = [kw for kw in tokenized_keyword if not kw in stopwords]
    return( " ".join( meaningful_words ))

health_data_reindex['Keyword'] = kws.apply(clean_keywords)

In [50]:
#Split DataFrame: One Frame for Generic/Direct Answer, one for the other Intents

da_gen_frame = health_data_reindex[health_data_reindex['Intent_1']<2].copy()
other_intents_frame = health_data_reindex[health_data_reindex['Intent_1']>1].copy()
da_gen_frame.head(20)

Unnamed: 0,Keyword,Intent_1
0,signs gallbladder problems,1
2,what is normal bloodpressure,1
3,what is hyperlipidemia,0
4,pill identifier,0
5,blood clots leg symptoms,1
9,lyme disease symptoms,1
12,what is gluten,1
13,when is shingles contagious,1
15,shingles vaccine side effects,1
16,health benefits chia seeds,1


In [51]:
#Split the Generic/Direct Answer data 
kw_features = da_gen_frame.Keyword
kw_target = da_gen_frame.Intent_1

X_train, X_test, y_train, y_test = train_test_split(kw_features, kw_target, random_state=1)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(59763,) (59763,) (19921,) (19921,)


In [52]:
#The SGDClassifier is a linear SVM model with stochastic gradient descent (SGD) learning.
                                                 
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=42)) ])

#Fit the model
text_clf_fit = text_clf.fit(X_train, y_train)

#Output predictions
predictions = text_clf_fit.predict(X_test)

In [53]:
#Evaluation Block
accuracy_score = metrics.accuracy_score(y_test, predictions)
auc_score = metrics.roc_auc_score(y_test,predictions)
cross_val_accuracy = cross_val_score(text_clf, kw_features, kw_target, cv=5).mean()
 
print('Accuracy Score: ' + str(accuracy_score))
print('AUC Score: ' + str(auc_score))
print('Cross Validated Accuracy: ' + str(cross_val_accuracy))

Accuracy Score: 0.83203654435
AUC Score: 0.824765384492
Cross Validated Accuracy: 0.828346165856


In [54]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        0.0       0.80      0.91      0.85     10764
        1.0       0.88      0.73      0.80      9157

avg / total       0.84      0.83      0.83     19921



In [55]:
cm = metrics.confusion_matrix(y_test,predictions)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,9848,916
Predicted Direct Answer,2430,6727


In [56]:
#Evaluation CSV: Outputs a CSV containing the errors from my model
incorrect_DAs = X_test[(y_test == 0) & (predictions == 1)]
incorrect_GENs = X_test[(y_test ==1) & (predictions == 0)]

with open('errors_output.txt', 'w') as w:
    for i in incorrect_DAs:
        w.write(i+'\t'+'False Direct Answer'+'\n')
    for g in incorrect_GENs:
        w.write(g+'\t'+'False Generic'+'\n')

In [57]:
#Split the Other Intents data
kw_features = other_intents_frame.Keyword
kw_target = other_intents_frame.Intent_1

X_train, X_test, y_train, y_test = train_test_split(kw_features, kw_target, random_state=1)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(13842,) (13842,) (4614,) (4614,)


In [59]:
#The SGDClassifier is a linear SVM model with stochastic gradient descent (SGD) learning.
                                                 
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=42, class_weight='balanced')) ])

#Fit the model
text_clf_fit = text_clf.fit(X_train, y_train)

#Output predictions
predictions = text_clf_fit.predict(X_test)

In [60]:
#Evaluation Block
accuracy_score = metrics.accuracy_score(y_test, predictions)
cross_val_accuracy = cross_val_score(text_clf, kw_features, kw_target, cv=5).mean()
 
print('Accuracy Score: ' + str(accuracy_score))
print('Cross Validated Accuracy: ' + str(cross_val_accuracy))

Accuracy Score: 0.968140442133
Cross Validated Accuracy: 0.96266718674


In [61]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        2.0       0.98      0.99      0.98      2197
        3.0       0.81      0.94      0.87       299
        4.0       0.98      0.96      0.97      1800
        5.0       0.97      0.94      0.96       318

avg / total       0.97      0.97      0.97      4614



In [43]:
cm = metrics.confusion_matrix(y_test,predictions)
cm_df = pd.DataFrame(cm, index=['Predicted GUIDE (2)', 'Predicted NAV (3)', 'Predicted RES (4)', 'Predicted BUY (5)'], 
                     columns=['Actual GUIDE (2)', 'Actual NAV (3)', 'Actual RES (4)', 'Actual BUY (5)'])

cm_df

Unnamed: 0,Actual GUIDE (2),Actual NAV (3),Actual RES (4),Actual BUY (5)
Predicted GUIDE (2),2166,9,20,2
Predicted NAV (3),2,281,12,4
Predicted RES (4),33,43,1722,2
Predicted BUY (5),3,13,4,298
