In [51]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC
from pprint import pprint

from sklearn import metrics
data = pd.read_csv("healthy.txt", header=0, \
                    delimiter="\t")

In [52]:
data.head()

Unnamed: 0,﻿CS_ID,Keyword_ID,Keyword,Intent_1,Authority
0,CS-0001,188195,signs of gallbladder problems,Direct Answer,Expert
1,CS-0002,177403,normal blood pressure chart,Resource,General
2,CS-0003,159458,what is normal bloodpressure,Direct Answer,General
3,CS-0004,152734,what is hyperlipidemia,Generic,Expert
4,CS-0005,139086,pill identifier,Generic,General


In [53]:
#Change User Intent labels to numerical values
data['Intent_1'] = data.Intent_1.map({'Generic': 0 , 'Direct Answer': 1, 'Guide': 2, 'Navigational': 3, 'Resource': 4, 'Transactional': 5 })

#Drop unused columns, null rows, and reindex
health_data = data.drop(['\xef\xbb\xbfCS_ID', 'Keyword_ID', 'Authority'], axis=1).dropna()
health_data = health_data.drop_duplicates('Keyword')
health_data_reindex = health_data.reset_index(drop=True)
health_data_reindex.info()
health_data_reindex.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98140 entries, 0 to 98139
Data columns (total 2 columns):
Keyword     98140 non-null object
Intent_1    98140 non-null float64
dtypes: float64(1), object(1)
memory usage: 2.2+ MB


Unnamed: 0,Keyword,Intent_1
0,signs of gallbladder problems,1
1,normal blood pressure chart,4
2,what is normal bloodpressure,1
3,what is hyperlipidemia,0
4,pill identifier,0
5,blood clots in leg symptoms,1
6,blood pressure range chart,4
7,ask webmd question,3
8,photos of bed bug bites,4
9,lyme disease and symptoms,1


In [54]:
da_other_intents = health_data_reindex[health_data_reindex['Intent_1']<2].copy()
da_other_intents.Intent_1.value_counts()

kw_target = da_other_intents.Intent_1
kw_features = da_other_intents.Keyword

X_train, X_test, y_train, y_test = train_test_split(kw_features, kw_target, test_size=0.2, random_state=1) 

#da_other_intents.Intent_1.value_counts()

#train, test = train_test_split(da_other_intents, test_size=0.2, random_state=1)
print(len(X_train))
print(len(Y_train))

63747
63747


#Undersample Classes
classes = [2,3,4,5]
subsets = []
for i in classes:
    subset = train[train.Intent_1 == i].sample(921)
    subsets.append(subset)
    
train_subset = pd.concat(subsets)

X_train, y_train = train_subset['Keyword'], train_subset['Intent_1']
X_test, y_test = test['Keyword'], test['Intent_1']

In [50]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC()) ])

from sklearn.grid_search import GridSearchCV

parameters = {
    'vect__max_df': (5,10,100,1000),
    'vect__min_df': [1,2,3],
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    #'clf__loss': ('hinge', 'squared_hinge'), 
    #'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l1', 'l2'),
    #'clf__n_iter': (10, 50, 80),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

fit_grid = gs_clf.fit(X_train,y_train)

fit_grid.score(X_test,y_test)

fit_grid.best_params_

{'tfidf__norm': 'l2',
 'tfidf__use_idf': False,
 'vect__max_df': 1000,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2)}

In [60]:
#GridSearch for GEN/DA

from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge')) ])

from sklearn.grid_search import GridSearchCV
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

fit_grid = gs_clf.fit(X_train,y_train)

fit_grid.score(X_test,y_test)

fit_grid.best_params_

{'clf__alpha': 1e-05,
 'clf__penalty': 'l2',
 'tfidf__norm': 'l1',
 'tfidf__use_idf': True,
 'vect__max_df': 0.5,
 'vect__max_features': 50000,
 'vect__ngram_range': (1, 2)}

In [61]:
other_intents = health_data_reindex[health_data_reindex['Intent_1']>1].copy()
other_intents.Intent_1.value_counts()

kw_target = da_other_intents.Intent_1
kw_features = da_other_intents.Keyword

X_train, X_test, y_train, y_test = train_test_split(kw_features, kw_target, test_size=0.2, random_state=1) 

In [62]:
#GridSearch for OtherIntents

from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge')) ])

from sklearn.grid_search import GridSearchCV
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1)

fit_grid = gs_clf.fit(X_train,y_train)

fit_grid.score(X_test,y_test)

fit_grid.best_params_

{'clf__alpha': 1e-05,
 'clf__penalty': 'elasticnet',
 'tfidf__norm': 'l1',
 'tfidf__use_idf': True,
 'vect__max_df': 1.0,
 'vect__max_features': 50000,
 'vect__ngram_range': (1, 1)}

In [None]:
class sklearn.feature_extraction.text.TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)[source]¶

In [None]:
CountVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, 
                preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1),
                analyzer='word', max_df=1.0, min_df=1, 
                max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)[source]¶

In [None]:

#All three at once!
model_names = ['Logistic Regression', 'KNN', 'Naive Bayes']
models = [LogisticRegression(C=1), KNeighborsClassifier(5), GaussianNB()]
i = 0
for model in models:
    print model
    model.fit(osx, osy)  
    predictions = model.predict(X_test)

    cm = confusion_matrix(Y_test,predictions)

    #lets make that easier to read
    cm_df = pd.DataFrame(cm, columns=['Predicted Class 0', 'Predicted Class 1'], 
                             index=['Actual Class 0', 'Actual Class 1'])

    print "Results of {}: ".format(model_names[i])
    print cm_df,'\n'
    print "Precision:",float(cm[1,1])/(cm[0,1]+cm[1,1])
    print "Recall:",float(cm[1,1])/(cm[1,0]+cm[1,1]),'\n'
    print "============================================"
    i +=1

