In [6]:
import pandas as pd    
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

data = pd.read_csv("user_intent.tsv", header=0, \
                    delimiter="\t")

In [7]:
data.head(10)
keywords = data['keyword']
data['user_intent'].value_counts()

DA      990
GEN     910
BUY     695
INFO    588
RES     383
NAV     290
Name: user_intent, dtype: int64

In [8]:
#Change User Intent labels to numerical values
data['user_intent'] = data.user_intent.map({'GEN': 0 , 'DA': 1, 'BUY': 2, 'INFO': 3, 'RES': 4, 'NAV': 5 })

#Return a new dataframe with only the Generic and Direct Answer user intents
data_gda = data[data.user_intent<2]
data_gda.head(10)

Unnamed: 0,keyword,user_intent
1,what is this coin worth,1
2,what color is sable brown,1
4,what is carcinoma,1
7,remove dark spots on skin,1
8,how do you get spider mites,1
9,air climate control,0
11,havanese dogs,0
12,mini goldendoodles rescue,0
13,what causes hair to turn gray,1
14,poplar trees,0


In [9]:
#Split into training and test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_gda.keyword, data_gda.user_intent, random_state=1)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(1425,) (1425,) (475,) (475,)


In [10]:
#Vectorize and turn into a document term matrix
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(decode_error = 'ignore')
X_train_counts = vect.fit_transform(X_train)
X_train_counts.shape

(1425, 2389)

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_dtm = tfidf_transformer.fit_transform(X_train_counts)
train_dtm.shape

(1425, 2389)

In [12]:
#Instantiate and fit model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
#Transform test data as we did training data
X_test_counts = vect.transform(X_test)
test_dtm = tfidf_transformer.transform(X_test_counts)

In [14]:
#Predict over test data
preds = nb.predict(test_dtm)
preds

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0,

In [15]:
from sklearn import metrics

print metrics.accuracy_score(y_test, preds)
print metrics.confusion_matrix(y_test, preds)

0.747368421053
[[127  98]
 [ 22 228]]


In [17]:
cm = metrics.confusion_matrix(y_test,preds)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,127,98
Predicted Direct Answer,22,228


In [18]:
X_test[(y_test == 0) & (preds == 1)]

1861                 hybrid oak trees
233                         toy worth
3090                double gold stock
310           ubiquinol form of coq10
2230                      220 voltage
2673                 current politics
2080          change telephone number
907                   valuable stamps
1905                    new born baby
391      new product pricing strategy
2713          free ads for nonprofits
2579     hdl ldl triglycerides levels
2043        animals that are reptiles
797       perm for color treated hair
376              natural cure for hiv
1680                    stomach tumor
3540                    gondola rides
194              civil lawsuits cases
133          cure for prostate cancer
982              small garden lizards
2134                  bathroom suites
2183               aristotle elements
1645          magnesium for sleep aid
2447    architectural project manager
2363                   edgar painting
173                    valuable coins
3185        