In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import metrics



In [3]:
data = pd.read_csv("healthy.txt", header=0, \
                    delimiter="\t")
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General
5,CS-0006,98502,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,98115,blood pressure range chart,Resource,General
7,CS-0008,94122,ask webmd question,Navigational,General
8,CS-0009,90914,photos of bed bug bites,Resource,General
9,CS-0010,83756,lyme disease and symptoms,Direct Answer,Expert


In [4]:
#The "Intent_1" column contains the values that we'll be working with for the classifier
data.Intent_1.value_counts()

Generic          43151
Direct Answer    37765
Guide             9138
Resource          7293
Transactional     1238
Navigational      1196
Name: Intent_1, dtype: int64

In [5]:
#Change the values in the 'Intent_1' column to numeric labels, so they can be processed by the classifier
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,1,Expert
1,CS-0002,177403,normal blood pressure chart,4,General
2,CS-0003,159458,what is normal bloodpressure,1,General
3,CS-0004,152734,what is hyperlipidemia,0,Expert
4,CS-0005,139086,pill identifier,0,General
5,CS-0006,98502,blood clots in leg symptoms,1,Expert
6,CS-0007,98115,blood pressure range chart,4,General
7,CS-0008,94122,ask webmd question,3,General
8,CS-0009,90914,photos of bed bug bites,4,General
9,CS-0010,83756,lyme disease and symptoms,1,Expert


In [11]:
#Some data cleanup: unused columns, null rows, remove duplicate keywords, and reindex the data.
health_data = data.drop(['\xef\xbb\xbfCS_ID', 'Keyword_ID', 'Authority'], axis=1).dropna()
health_data = health_data.drop_duplicates('Keyword')
health_data_reindex = health_data.reset_index(drop=True)
health_data_reindex.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98140 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     98140 non-null object
Intent_1    98140 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.2+ MB


In [12]:
#Make a new dataframe with just Generic and Direct Answer. The first portion of the classifier deals with the two largest intents
da_gen_frame = health_data_reindex[health_data_reindex['Intent_1']<2].copy()
kws = da_gen_frame['Keyword']

da_gen_frame.info()
da_gen_frame.Intent_1.value_counts(1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79684 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     79684 non-null object
Intent_1    79684 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.8+ MB


0    0.539042
1    0.460958
Name: Intent_1, dtype: float64

In [13]:

#Split the data into training and test sets
kw_features = da_gen_frame.Keyword
kw_target = da_gen_frame.Intent_1

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(kw_features, kw_target, random_state=1)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(59763,) (59763,) (19921,) (19921,)


In [10]:
from sklearn.svm import SVC

text_clf= Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(kernel='linear'))])

#Fit and train model
text_clf_fit = text_clf.fit(X_train, y_train)

#Output predictions
predictions = text_clf_fit.predict(X_test)

#Let's look at some metrics!
print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)
print(roc_auc_score(y_test,predictions))

cross_val_accuracy = cross_val_score(text_clf, kw_features, kw_target, cv=5).mean()
cross_val_accuracy

NameError: name 'X_train' is not defined

In [32]:
#The SGDClassifier is a linear SVM model with stochastic gradient descent (SGD) learning.

from sklearn.linear_model import SGDClassifier

# Classifier pipeline-- CountVectorizer and TfidfTransformer vectorize the keywords and transform them into numerical values
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=42)), ])


#Fit and train model
text_clf_fit = text_clf.fit(X_train, y_train)

#Output predictions
predictions = text_clf_fit.predict(X_test)

#Let's look at some metrics!
print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)
print(roc_auc_score(y_test,predictions))

cross_val_accuracy = cross_val_score(text_clf, kw_features, kw_target, cv=5).mean()
cross_val_accuracy

0.833140906581
[[9858  906]
 [2418 6739]]
0.825885132257


0.82893598749556774

In [33]:
#Data frame with our confusion matrix-- shows the # of labeling errors
cm = metrics.confusion_matrix(y_test,predictions)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,9858,906
Predicted Direct Answer,2418,6739


In [34]:
#Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        0.0       0.80      0.92      0.86     10764
        1.0       0.88      0.74      0.80      9157

avg / total       0.84      0.83      0.83     19921



In [None]:
#Outputs a CSV containing the errors from my model
incorrect_DAs = X_test[(y_test == 0) & (predictions == 1)]
incorrect_GENs = X_test[(y_test ==1) & (predictions == 0)]

with open('errors_output.txt', 'w') as w:
    for i in incorrect_DAs:
        w.write(i+'\t'+'False Direct Answer'+'\n')
    for g in incorrect_GENs:
        w.write(g+'\t'+'False Generic'+'\n')