In [28]:
import pandas as pd
import numpy as np
import nltk 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


from sklearn import metrics
data = pd.read_csv("healthy.txt", header=0, \
                    delimiter="\t")

In [29]:
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General
5,CS-0006,98502,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,98115,blood pressure range chart,Resource,General
7,CS-0008,94122,ask webmd question,Navigational,General
8,CS-0009,90914,photos of bed bug bites,Resource,General
9,CS-0010,83756,lyme disease and symptoms,Direct Answer,Expert


In [30]:
#The "Intent_1" column contains the values that we'll be working with for the classifier
data.Intent_1.value_counts()

Generic          43151
Direct Answer    37765
Guide             9138
Resource          7293
Transactional     1238
Navigational      1196
Name: Intent_1, dtype: int64

In [31]:
#Checking out the keywords column
data.Keyword.describe()
#Checking out the dataframe overall
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99997 entries, 0 to 99996
Data columns (total 5 columns):
﻿CS_ID        99997 non-null object
Keyword_ID    10000 non-null float64
Keyword       99997 non-null object
Intent_1      99781 non-null object
Authority     99678 non-null object
dtypes: float64(1), object(4)
memory usage: 4.6+ MB


In [32]:
#Change the values in the 'Intent_1' column to numeric labels, so they can be processed by the classifier
#data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })
data.head(10)

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General
5,CS-0006,98502,blood clots in leg symptoms,Direct Answer,Expert
6,CS-0007,98115,blood pressure range chart,Resource,General
7,CS-0008,94122,ask webmd question,Navigational,General
8,CS-0009,90914,photos of bed bug bites,Resource,General
9,CS-0010,83756,lyme disease and symptoms,Direct Answer,Expert


In [42]:
#Some data cleanup: unused columns, null rows, remove duplicate keywords, and reindex the data.
health_data = data.drop(['\xef\xbb\xbfCS_ID', 'Keyword_ID', 'Authority'], axis=1)
health_data = health_data.dropna().drop_duplicates('Keyword')
health_data_reindex = health_data.reset_index(drop=True)
health_data_reindex.info()
health_data_reindex.head(10)

health_data_reindex.Intent_1.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98140 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     98140 non-null object
Intent_1    98140 non-null object
dtypes: object(2)
memory usage: 2.2+ MB


Generic          42953
Direct Answer    36731
Guide             9008
Resource          7064
Transactional     1226
Navigational      1158
Name: Intent_1, dtype: int64

In [40]:
classes = ['Guide', 'Resource']
subsets = []
for i in classes:
    subset = health_data_reindex[health_data_reindex.Intent_1 == i].sample(10)
    subsets.append(subset)
    
train_subset = pd.concat(subsets)
train_subset

Unnamed: 0,Keyword,Intent_1
75509,how to increase hormones,Guide
59819,facet arthropathy treatment,Guide
97011,intertrigo treatment,Guide
23376,treatment for eye,Guide
95214,home remedies for nasal drip,Guide
4314,stomach bloating treatment,Guide
8576,hysteroscopy procedure,Guide
35832,magnilife sciatica relief,Guide
56501,tooth pain relief,Guide
90372,tooth abscess cure,Guide


In [25]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
 
evaluate_classifier(bigram_word_feats)

<class 'pandas.core.frame.DataFrame'>


In [10]:
#Make a new dataframe with just Generic and Direct Answer. The first portion of the classifier deals with the two largest intents
#da_gen_frame = health_data_reindex[health_data_reindex['Intent_1']<2].copy()
#kws = da_gen_frame['Keyword']

#da_gen_frame.info()
#da_gen_frame.Intent_1.value_counts(1)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_intent_kw_counts(intent):
    intent_df = health_data[health_data['Intent_1']==intent]
    intent_kws = intent_df['Keyword']
    vect = CountVectorizer()
    vect.fit(intent_kws)
    intent_kw_features = vect.get_feature_names()
    intent_kw_counts = vect.transform(intent_kws)
    intent_kw_array = intent_kw_counts.toarray()
    intent_kw_token_counts = pd.DataFrame({'token':intent_kw_features, 'count':np.sum(intent_kw_array, axis=0)}).sort_values(by='count', ascending=False)
    return intent_kw_token_counts.head(20)


    
def get_intent_idf_kw_counts(intent):
    intent_df = health_data[health_data['Intent_1']==intent]
    intent_kws = intent_df['Keyword']
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(intent_kws)
    intent_kw_features = vect.get_feature_names()
    intent_kw_counts = vect.transform(intent_kws)
    intent_kw_array = intent_kw_counts.toarray()
    intent_kw_token_counts = pd.DataFrame({'token':intent_kw_features, 'count':np.sum(intent_kw_array, axis=0)}).sort_values(by='count', ascending=False)
    return intent_kw_token_counts.head(10)


get_intent_kw_counts('Resource')

  result = getattr(x, name)(y)


TypeError: invalid type comparison

In [13]:
health_data['Intent_1'].value_counts()

Generic          42953
Direct Answer    36731
Guide             9008
Resource          7064
Transactional     1226
Navigational      1158
Name: Intent_1, dtype: int64

Determining classes statistics... 6 classes detected: {'Resource': 5703, 'Transactional': 970, 'Generic': 34290, 'Navigational': 930, 'Direct Answer': 29356, 'Guide': 7263}
Under-sampling performed: Counter({nan: 4015, 'Navigational': 930, 'Direct Answer': 304, 'Generic': 215, 'Resource': 58, 'Guide': 49, 'Transactional': 9})


ValueError: Shape of passed values is (1, 5580), indices imply (5, 5580)

In [11]:
#The SGDClassifier is a linear SVM model with stochastic gradient descent (SGD) learning.

from sklearn.linear_model import SGDClassifier

# Classifier pipeline-- CountVectorizer and TfidfTransformer vectorize the keywords and transform them into numerical values
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2', n_iter=5, random_state=42)), ])

#Fit and train model
text_clf_fit = text_clf.fit(X_train, y_train)

#Output predictions
predictions = text_clf_fit.predict(X_test)

#Let's look at some metrics!
print metrics.accuracy_score(y_test, predictions)
print metrics.confusion_matrix(y_test, predictions)
print(roc_auc_score(y_test,predictions))

cross_val_accuracy = cross_val_score(text_clf, kw_features, kw_target, cv=5).mean()
cross_val_accuracy

0.833140906581
[[9858  906]
 [2418 6739]]
0.825885132257


0.82893598749556774

In [89]:
#Data frame with our confusion matrix-- shows the # of labeling errors
cm = metrics.confusion_matrix(y_test,predictions)
cm_df = pd.DataFrame(cm, index=['Predicted Generic', 'Predicted Direct Answer'], 
                     columns=['Actual Generic', 'Actual Direct Answer'])

cm_df

Unnamed: 0,Actual Generic,Actual Direct Answer
Predicted Generic,9860,904
Predicted Direct Answer,2422,6735


In [45]:
#Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        0.0       0.80      0.92      0.86     10764
        1.0       0.88      0.74      0.80      9157

avg / total       0.84      0.83      0.83     19921



In [76]:
#Outputs a CSV containing the errors from my model
incorrect_DAs = X_test[(y_test == 0) & (predictions == 1)]
incorrect_GENs = X_test[(y_test ==1) & (predictions == 0)]

with open('errors_output.txt', 'w') as w:
    for i in incorrect_DAs:
        w.write(i+'\t'+'False Direct Answer'+'\n')
    for g in incorrect_GENs:
        w.write(g+'\t'+'False Generic'+'\n')