In [180]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics


In [181]:
data = pd.read_csv("healthy.txt", header=0, \
                    delimiter="\t")
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General
5,CS-0006,98502,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,98115,blood pressure range chart,Resource,General
7,CS-0008,94122,ask webmd question,Navigational,General
8,CS-0009,90914,photos of bed bug bites,Resource,General
9,CS-0010,83756,lyme disease and symptoms,Direct Answer,Expert


In [182]:
data.Intent_1.value_counts()

Generic          43151
Direct Answer    37765
Guide             9138
Resource          7293
Transactional     1238
Navigational      1196
Name: Intent_1, dtype: int64

In [183]:
#Change User Intent labels to numerical values
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })

#Drop unused columns, null rows, and reindex

health_data = data.drop(['\xef\xbb\xbfCS_ID', 'Keyword_ID', 'Authority'], axis=1).dropna()
health_data = health_data.drop_duplicates('Keyword')
health_data_reindex = health_data.reset_index(drop=True)
health_data_reindex.info()
health_data_reindex.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98140 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     98140 non-null object
Intent_1    98140 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.2+ MB


Unnamed: 0,Keyword,Intent_1
0,signs of gallbladder problems,1
1,normal blood pressure chart,4
2,what is normal bloodpressure,1
3,what is hyperlipidemia,0
4,pill identifier,0
5,blood clots in leg symptoms,1
6,blood pressure range chart,4
7,ask webmd question,3
8,photos of bed bug bites,4
9,lyme disease and symptoms,1


In [184]:
da_other_intents = health_data_reindex[health_data_reindex['Intent_1']>1].copy()
da_other_intents.Intent_1.value_counts()

kw_target = da_other_intents.Intent_1
kw_features = da_other_intents.Keyword

X_train, X_test, Y_train, Y_test = train_test_split(kw_features, kw_target, test_size=0.2, random_state=1) 

#da_other_intents.Intent_1.value_counts()

#train, test = train_test_split(da_other_intents, test_size=0.2, random_state=1)

In [185]:
from sklearn.linear_model import SGDClassifier

# Classifier pipeline-- CountVectorizer and TfidfTransformer vectorize the keywords and transform them into numerical values
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=42, class_weight='balanced')), ])


#Fit and train model
text_clf_fit = text_clf.fit(X_train, Y_train)

#Output predictions
predictions = text_clf_fit.predict(X_test)

In [186]:
#Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        2.0       0.98      0.98      0.98      1773
        3.0       0.84      0.95      0.89       237
        4.0       0.97      0.96      0.97      1431
        5.0       0.98      0.94      0.96       251

avg / total       0.97      0.97      0.97      3692



In [169]:
#Undersample Classes
classes = [2,3,4,5]
subsets = []
for i in classes:
    subset = train[train.Intent_1 == i].sample(921)
    subsets.append(subset)
    
train_subset = pd.concat(subsets)

X_train, y_train = train_subset['Keyword'], train_subset['Intent_1']
X_test, y_test = test['Keyword'], test['Intent_1']

In [170]:
#LinearSVC
from sklearn.svm import LinearSVC

text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', LinearSVC()), ])

text_clf_fit = text_clf.fit(X_train, y_train)
predictions = text_clf_fit.predict(X_test)

In [171]:
#Evaluation Block
print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)

cross_val_accuracy = cross_val_score(text_clf, kw_features, kw_target, cv=5).mean()
cross_val_accuracy

cm = metrics.confusion_matrix(y_test,predictions)
cm_df = pd.DataFrame(cm, index=['Predicted GUIDE (2)', 'Predicted NAV (3)', 'Predicted RES (4)', 'Predicted BUY (5)'], 
                     columns=['Actual GUIDE (2)', 'Actual NAV (3)', 'Actual RES (4)', 'Actual BUY (5)'])

cm_df

0.947724810401
[[1710   23   39    1]
 [   2  229    2    4]
 [  38   70 1322    1]
 [   0   13    0  238]]


Unnamed: 0,Actual GUIDE (2),Actual NAV (3),Actual RES (4),Actual BUY (5)
Predicted GUIDE (2),1710,23,39,1
Predicted NAV (3),2,229,2,4
Predicted RES (4),38,70,1322,1
Predicted BUY (5),0,13,0,238


In [172]:
#Evaluation Block
print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)

cross_val_accuracy = cross_val_score(text_clf, kw_features, kw_target, cv=5).mean()
cross_val_accuracy

cm = metrics.confusion_matrix(y_test,predictions)
cm_df = pd.DataFrame(cm, index=['Predicted GUIDE (2)', 'Predicted NAV (3)', 'Predicted RES (4)', 'Predicted BUY (5)'], 
                     columns=['Actual GUIDE (2)', 'Actual NAV (3)', 'Actual RES (4)', 'Actual BUY (5)'])

cm_df

0.947724810401
[[1710   23   39    1]
 [   2  229    2    4]
 [  38   70 1322    1]
 [   0   13    0  238]]


Unnamed: 0,Actual GUIDE (2),Actual NAV (3),Actual RES (4),Actual BUY (5)
Predicted GUIDE (2),1710,23,39,1
Predicted NAV (3),2,229,2,4
Predicted RES (4),38,70,1322,1
Predicted BUY (5),0,13,0,238


In [173]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        2.0       0.98      0.96      0.97      1773
        3.0       0.68      0.97      0.80       237
        4.0       0.97      0.92      0.95      1431
        5.0       0.98      0.95      0.96       251

avg / total       0.96      0.95      0.95      3692



#Evaluation Block?
classes = [2,3,4,5]

with open('errors_output.txt', 'w') as w:
    for i in classes:
        errors = X_test[(y_test == i) & (predictions != i)]
        w.write(errors + '\t'+ i )
for i in classes:
    errors = X_test[(y_test == i) & (predictions != i)]
    

errors = X_test[(y_test == 3) & (predictions != 3)]


with open('errors_output.txt', 'w') as w:
    for i in incorrect_DAs:
        w.write(i+'\t'+'False Direct Answer'+'\n')
    for g in incorrect_GENs:
        w.write(g+'\t'+'False Generic'+'\n')

In [158]:
errors = X_test[(y_test == 3) & (predictions != 3)]
errors

69557               edgepark supplies
1535        zenni optical coupon code
30376       coupon code zenni optical
78774             shalom nursing home
86922               somnapure walmart
79566           airamid nursing homes
63194    renaissance treatment center
12257     crisis relief international
5270      take a depression test free
40037          alcohol addiction quiz
Name: Keyword, dtype: object