In [1]:
from spacy.en import English
# Build the spaCy English pipeline once; transform_doc_1 and transform_doc_2
# below both call this shared `parser` on every document.
parser = English()

In [2]:
import foodbornenyc.models.models as models
from foodbornenyc.models.businesses import Business, business_category_table
from foodbornenyc.models.documents import YelpReview, Tweet, Document
from foodbornenyc.models.locations import Location
from foodbornenyc.models.metadata import metadata

In [3]:
from sklearn.externals import joblib
from foodbornenyc.settings import yelp_classify_config as config

# Old classifier: the pipeline stored at the path given by the project config.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
sick = joblib.load("../"+config['model_file'])
# Show the pipeline steps; their names ('count', 'tfidf', 'log') are the
# prefixes used when building a parameter grid for this estimator.
sick.steps

[('count',
  CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=0.95, max_features=None, min_df=1,
          ngram_range=(1, 3), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('tfidf',
  TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('log',
  LogisticRegression(C=100, class_weight=None, dual=True, fit_intercept=True,
            intercept_scaling=0.01, max_iter=100, multi_class='ovr',
            n_jobs=1, penalty='l2', random_state=57, solver='liblinear',
            tol=0.0001, verbose=0, warm_start=False))]

In [4]:
import numpy as np
from sklearn.metrics import roc_auc_score
def analyze(reviews, classifier):
    """Print the ROC AUC and confusion-matrix breakdown of a classifier.

    Args:
        reviews: dict with 'X' (documents handed to ``predict_proba``) and
            'y' (labels; 1.0 is treated as positive, 0.0 as negative,
            anything else is left out of the breakdown).
        classifier: object with a ``predict_proba`` method; column 1 of its
            output is read as the positive-class probability.

    Prints:
        The ROC AUC score, the true/false positive/negative counts as
        fractions of ``len(reviews['y'])``, and the FP/TP and FN/TN ratios.

    A review counts as predicted positive when its probability is strictly
    above 0.5; a probability of exactly 0.5 counts as a negative prediction
    rather than being dropped from every bucket.  Ratios whose denominator
    is zero print as ``inf`` (``nan`` for 0/0) instead of raising
    ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # A sample with no true positives (or no true negatives) is a
        # legitimate outcome, so report it instead of crashing.
        if denominator == 0:
            return float('inf') if numerator > 0 else float('nan')
        return numerator / denominator

    probabilities = classifier.predict_proba(reviews['X'])
    positive_probs = np.array([row[1] for row in probabilities])
    print("ROC_AUC SCORE :: %s" % (roc_auc_score(reviews['y'], positive_probs, average='micro'),))

    # determine true/false positive/negative rates
    counts = {'tp': 0, 'fn': 0, 'fp': 0, 'tn': 0}
    for label, prob in zip(reviews['y'], positive_probs):
        predicted_positive = prob > 0.5
        if label == 1.0:
            counts['tp' if predicted_positive else 'fn'] += 1
        elif label == 0.0:
            counts['fp' if predicted_positive else 'tn'] += 1

    # float() keeps the divisions below true divisions under Python 2.
    total = float(len(reviews['y']))
    tp_rate = counts['tp'] / total
    fn_rate = counts['fn'] / total
    fp_rate = counts['fp'] / total
    tn_rate = counts['tn'] / total

    print("True positive :: %s" % (tp_rate,))
    print("False negative :: %s" % (fn_rate,))
    print("False positive :: %s" % (fp_rate,))
    print("True negative :: %s" % (tn_rate,))
    print("FP / TP :: %s" % (_ratio(fp_rate, tp_rate),))
    print("FN / TN :: %s" % (_ratio(fn_rate, tn_rate),))

In [5]:
#importing and reading data files
import xlrd

# Labelled reviews: column 1 supplies the documents ('X') and column 2 the
# labels ('y'); the first row is skipped (presumably a header -- confirm).
sheet1 = xlrd.open_workbook('data/yelp_sick_classifier_data.xlsx').sheet_by_index(0)
reviews = dict(
    X=np.array([cell.value for cell in sheet1.col(1)[1:]]),
    y=np.array([cell.value for cell in sheet1.col(2)[1:]]),
)

# Second sheet: documents come from column 0 and labels from column 2.
sheet2 = xlrd.open_workbook('data/sick_test_preds.xlsx').sheet_by_index(0)
reviews2 = dict(
    X=np.array([cell.value for cell in sheet2.col(0)[1:]]),
    y=np.array([cell.value for cell in sheet2.col(2)[1:]]),
)

In [6]:
# Keywords to watch for: "poisoning", "sick".
# Tokens that strictly express negation: not, n't, no, none, nobody, neither.
# If a negation word's head == a keyword's head, prepend "not" to the keyword and remove the negation word.
# The only known issue is double negatives, e.g. "no one didn't get food poisoning", but this is a first step.
from spacy import attrs
# Three test sentences: a negated keyword ("none of us gets sick"), a negated
# verb with a keyword object ("didn't order food poisoning"), and a negation
# in a different clause from the keyword ("do not think ... got food poisoning").
example = u"I hope none of us gets sick tonight. I didn't order food poisoning. I do not think you should come here because I got food poisoning."

# Uncomment the following to print, for each token of `example`, its text,
# dependency label, head and children.
# parsedEx = parser(example)
# for token in parsedEx:
#     print token.orth_, token.dep_, token.head, [t.orth_ for t in token.children]

In [7]:
def transform_doc_1(doc):
    """if root of sentence had negation and sentence contained kw

    Lower-cases `doc`, parses it with the module-level `parser`, and for
    every sentence whose root has a negation word among its children and
    which contains a keyword, removes the first negation word of the
    sentence and puts "not" directly in front of the first keyword.

    Args:
        doc: the document text (unicode).

    Returns:
        All tokens of the (possibly rewritten) sentences joined by spaces.
    """
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    sents = list(parsedDoc.sents)
    tokens = [[t.orth_ for t in s] for s in sents] #this will be modified
    for sent, sent_tokens in zip(sents, tokens):
        # each span has only one root
        if not any(c.orth_ in neg for c in sent.root.children): continue #if there's no negation
        neg_list = [j for j, word in enumerate(sent_tokens) if word in neg]
        kw_list = [j for j, word in enumerate(sent_tokens) if word in kw]

        if len(neg_list) == 0 or len(kw_list) == 0: continue

        neg_i = neg_list[0]
        kw_i = kw_list[0]

        #now modify
        # Remove the negation first, then insert "not" right before the
        # keyword.  Popping shifts the keyword one slot left only when the
        # negation preceded it; inserting first (as before) removed the
        # wrong token whenever the negation came after the keyword.
        sent_tokens.pop(neg_i)
        sent_tokens.insert(kw_i - 1 if neg_i < kw_i else kw_i, "not")
    #now we join everything with spaces
    return " ".join(" ".join(sent_tokens) for sent_tokens in tokens)

print transform_doc_1(example)

i hope none of us gets sick tonight . i did order food not poisoning . i do think you should come here because i got food not poisoning .


In [8]:
def transform_doc_2(doc):
    """if negation and kw share a head

    Lower-cases `doc`, parses it with the module-level `parser`, and for
    every sentence in which a negation word and a keyword have the same
    head, removes that negation word and puts "not" directly in front of
    that keyword.  Sentences where an even number of (negation, keyword)
    pairs share a head are treated as double negatives and left unchanged.

    Args:
        doc: the document text (unicode).

    Returns:
        All tokens of the (possibly rewritten) sentences joined by spaces.
    """
    kw = ['poisoning', 'sick']
    neg = ['not', "n't", 'no', 'none', 'nobody', 'neither']
    parsedDoc = parser(doc.lower())
    sents = list(parsedDoc.sents)
    tokens = [[t.orth_ for t in s] for s in sents] #this will be modified
    for sent, sent_tokens in zip(sents, tokens):
        neg_list = [j for j, word in enumerate(sent_tokens) if word in neg]
        kw_list = [j for j, word in enumerate(sent_tokens) if word in kw]

        if len(neg_list) == 0 or len(kw_list) == 0: continue

        # attempt at handling double negatives: every matching pair flips the
        # flag, so it ends up False only after an odd number of matches
        double_negative = True
        kw_i = -1
        neg_i = -1

        for j in neg_list:
            for k in kw_list:
                if sent[j].head == sent[k].head:
                    if double_negative:
                        neg_i = j
                        kw_i = k
                    double_negative = not double_negative

        if double_negative: continue

        #now modify
        # Remove the negation first, then insert "not" right before the
        # keyword.  Popping shifts the keyword one slot left only when the
        # negation preceded it; inserting first (as before) removed the
        # wrong token whenever the negation came after the keyword.
        sent_tokens.pop(neg_i)
        sent_tokens.insert(kw_i - 1 if neg_i < kw_i else kw_i, "not")
    #now we join everything with spaces
    return " ".join(" ".join(sent_tokens) for sent_tokens in tokens)

print transform_doc_2(example)

i hope of us gets not sick tonight . i did order food not poisoning . i do not think you should come here because i got food poisoning .


In [9]:
from sklearn.base import TransformerMixin

class NegationTransformer(TransformerMixin):
    """ Brings negation words closer to relevant key terms to make it detectable with n-gram detector

    Stateless pipeline step: `fit` learns nothing and `transform` applies
    `transform_doc` to every document.  `get_params`/`set_params` expose
    `transform_doc` so the step can be cloned and tuned by a grid search
    (e.g. through a 'negTransformer__transform_doc' grid entry).
    """

    def __init__(self, transform_doc=transform_doc_1):
        # Callable taking one document and returning the rewritten text.
        self.transform_doc = transform_doc

    def transform(self, X, **transform_params):
        """Return a numpy array holding `transform_doc(doc)` for each doc in X."""
        return np.array([self.transform_doc(doc) for doc in X])

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns self so the step can be chained."""
        return self

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (`deep` is accepted but unused)."""
        return {'transform_doc' : self.transform_doc}

    def set_params(self, **params):
        """Set constructor parameters by name and return self.

        Raises:
            ValueError: if a name is not a parameter of this transformer.
        """
        valid = self.get_params(deep=False)
        for name, value in params.items():
            if name not in valid:
                raise ValueError("Invalid parameter %s for NegationTransformer" % (name,))
            setattr(self, name, value)
        return self


In [10]:
#from yelp classifier training notebook
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def my_roc_auc(ground_truth, predictions):
    """Micro-averaged ROC AUC of `predictions` against `ground_truth`.

    Both arguments are converted to numpy arrays before scoring.
    """
    y_true, y_score = np.array(ground_truth), np.array(predictions)
    return metrics.roc_auc_score(y_true, y_score, average='micro')

# Scorer wrapping my_roc_auc for GridSearchCV: higher is better, and it is
# fed continuous decision values rather than predicted labels.
my_roc_auc_scorer = metrics.make_scorer(
    my_roc_auc,
    greater_is_better=True,
    needs_threshold=True,
)

# Wider grid kept for reference (not searched here):
# param_grid = {
#     'count__ngram_range':[(1,1),(1,2),(1,3)],
#     'tfidf__norm':['l1', 'l2'],
#     'tfidf__use_idf':[True, False],
#     'tfidf__sublinear_tf':[True,False],
#     'logreg__C':[.001, .01, .1]
# }

# Every hyperparameter is pinned to a single value, so a grid search over
# this dict fits exactly one candidate.  Keys are '<step name>__<parameter>'.
param_grid = dict(
    # negTransformer__transform_doc=[transform_doc_1, transform_doc_2],
    # --- CountVectorizer ---
    count__ngram_range=[(1, 3)],
    count__max_df=[0.95],
    count__stop_words=[None],
    count__lowercase=[True],
    count__max_features=[None],
    count__strip_accents=['unicode'],
    # --- TfidfTransformer ---
    tfidf__use_idf=[True],
    tfidf__norm=['l2'],
    # --- LogisticRegression ---
    logreg__C=[100],
    logreg__dual=[True],
    logreg__fit_intercept=[True],
    logreg__penalty=['l2'],
    logreg__intercept_scaling=[0.01],
    logreg__random_state=[57],
    logreg__solver=['liblinear'],
)

In [11]:
# Two pipelines that differ only in the negation rewrite applied before
# vectorising: pipe1 uses transform_doc_1, pipe2 uses transform_doc_2.
pipe1 = Pipeline(steps=[
    ('negTransformer', NegationTransformer(transform_doc_1)),
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression()),
])
pipe2 = Pipeline(steps=[
    ('negTransformer', NegationTransformer(transform_doc_2)),
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression()),
])

# One grid search per pipeline, scored with the ROC AUC scorer and run on
# all available cores.
grid_search1 = GridSearchCV(estimator=pipe1,
                            param_grid=param_grid,
                            scoring=my_roc_auc_scorer,
                            n_jobs=-1,
                            verbose=1)
grid_search2 = GridSearchCV(estimator=pipe2,
                            param_grid=param_grid,
                            scoring=my_roc_auc_scorer,
                            n_jobs=-1,
                            verbose=1)
# Open question: does it make a difference if the rewrite is plugged in as
# the vectorizer's preprocessor instead of a separate step?

In [12]:
def split_dev_test(data, test_size=.2):
    """Splits input data into a training set and a testing set"""
    train_data = {}
    test_data = {}
    for train, test in cross_validation.StratifiedShuffleSplit(data['y'], n_iter=1, test_size=test_size, random_state=0):
        train_data['X'] = data['X'][train]
        train_data['y'] = data['y'][train]
        test_data['X'] = data['X'][test]
        test_data['y'] = data['y'][test]
        
    print "Training/Dev data shape: ", train_data['X'].shape, train_data['y'].shape
    print "Test data shape: ",test_data['X'].shape, test_data['y'].shape
    return train_data, test_data

train_data, test_data = split_dev_test(reviews)

Training/Dev data shape:  (1113,) (1113,)
Test data shape:  (279,) (279,)


In [29]:
def reveal_fp_fn(reviews, classifier):
    """Returns false positives and false negatives from the classifier

    Args:
        reviews: dict with 'X' (documents) and 'y' (labels; 1.0 positive,
            0.0 negative).
        classifier: object with a ``predict_proba`` method; column 1 of its
            output is read as the positive-class probability.

    Returns:
        Tuple ``(fp, fn)`` of dicts, each with 'X' (list of misclassified
        reviews) and 'y' (list of their positive-class probabilities), in
        the order the reviews appear in ``reviews['X']``.

    A review is predicted positive when its probability is strictly above
    0.5, so a positive review scored at exactly 0.5 is reported as a false
    negative (it used to be left out of both lists).
    """
    fp = {'X': [], 'y': []}
    fn = {'X': [], 'y': []}
    prediction = classifier.predict_proba(reviews['X'])
    # Single pass instead of re-zipping the data once per output list.
    for label, review, pred in zip(reviews['y'], reviews['X'], prediction):
        prob = pred[1]
        if label == 0.0 and prob > 0.5:
            fp['X'].append(review)
            fp['y'].append(prob)
        elif label == 1.0 and prob <= 0.5:
            fn['X'].append(review)
            fn['y'].append(prob)
    return (fp, fn)

#reveal_fp(reviews2, grid_search2.best_estimator_)
#analyze(reviews2, grid_search2.best_estimator_)

def split_and_test(data, grid_search, test_size=.2, n_iter=1):
    """Tests performance of a grid search pipeline, returns false positives and false negatives"""
    train_data = {}
    test_data = {}
    out = (None, None)
    for train, test in cross_validation.StratifiedShuffleSplit(data['y'], n_iter=n_iter, test_size=test_size, random_state=0):
        train_data['X'] = data['X'][train]
        train_data['y'] = data['y'][train]
        test_data['X'] = data['X'][test]
        test_data['y'] = data['y'][test]
        %time grid_search.fit(train_data['X'], train_data['y'])
        print("Best score: %0.3f" % grid_search.best_score_)
        %time analyze(test_data, grid_search.best_estimator_)
        out = reveal_fp_fn(test_data, grid_search.best_estimator_)
    return out
        

In [30]:
fp1, fn1 = split_and_test(reviews, grid_search1)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 18.2 s, sys: 357 ms, total: 18.6 s
Wall time: 44.6 s
Best score: 0.889
ROC_AUC SCORE :: 0.885131646877
True positive :: 0.444444444444
False negative :: 0.089605734767
False positive :: 0.114695340502
True negative :: 0.351254480287
FP / TP :: 0.258064516129
FN / TN :: 0.255102040816
CPU times: user 4.26 s, sys: 29.4 ms, total: 4.29 s
Wall time: 4.32 s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   26.0s finished


In [32]:
fp2, fn2 = split_and_test(reviews, grid_search2)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 18.9 s, sys: 472 ms, total: 19.4 s
Wall time: 46.2 s
Best score: 0.891
ROC_AUC SCORE :: 0.886525554982
True positive :: 0.448028673835
False negative :: 0.0860215053763
False positive :: 0.111111111111
True negative :: 0.354838709677
FP / TP :: 0.248
FN / TN :: 0.242424242424
CPU times: user 4.12 s, sys: 58.9 ms, total: 4.18 s
Wall time: 4.26 s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   26.6s finished


In [68]:
# Single-candidate grid for the old classifier.  Its logistic-regression
# step is named 'log', so the keys use the 'log__' prefix, and there is no
# 'count__strip_accents' entry.
param_grid_temp = dict(
    # --- CountVectorizer ---
    count__ngram_range=[(1, 3)],
    count__max_df=[0.95],
    count__stop_words=[None],
    count__lowercase=[True],
    count__max_features=[None],
    # --- TfidfTransformer ---
    tfidf__use_idf=[True],
    tfidf__norm=['l2'],
    # --- LogisticRegression ---
    log__C=[100],
    log__dual=[True],
    log__fit_intercept=[True],
    log__penalty=['l2'],
    log__intercept_scaling=[0.01],
    log__random_state=[57],
    log__solver=['liblinear'],
)
# Baseline: refit the old classifier through the same split-and-test path
# (single job) so its errors can be compared with the negation pipelines.
grid_search_sick = GridSearchCV(estimator=sick,
                                param_grid=param_grid_temp,
                                scoring=my_roc_auc_scorer,
                                n_jobs=1,
                                verbose=1)
fp_sick, fn_sick = split_and_test(reviews, grid_search_sick)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
CPU times: user 8.77 s, sys: 299 ms, total: 9.07 s
Wall time: 10.7 s
Best score: 0.890
ROC_AUC SCORE :: 0.887816210635
True positive :: 0.451612903226
False negative :: 0.0824372759857
False positive :: 0.114695340502
True negative :: 0.351254480287
FP / TP :: 0.253968253968
FN / TN :: 0.234693877551
CPU times: user 210 ms, sys: 6.15 ms, total: 216 ms
Wall time: 227 ms


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.3s finished


In [62]:
# transform_doc_1 leaves the false positives unchanged.
# False positives of the old classifier that pipeline 1 no longer produces:
print set(fp_sick['X']) - set(fp1['X'])
# False positives that only pipeline 1 produces:
print set(fp1['X']) - set(fp_sick['X'])

set([])
set([])


In [63]:
# transform_doc_2 improves the false positive rate by one review.
# False positives of the old classifier that pipeline 2 no longer produces:
print set(fp_sick['X']) - set(fp2['X'])
# False positives that only pipeline 2 produces:
print set(fp2['X']) - set(fp_sick['X'])
# Probability the old classifier gave the fixed review (looked up by its position in fp_sick).
# NOTE: the [0] indexing here and below is only unambiguous while the difference holds one review.
print 'Old prob :: ', fp_sick['y'][fp_sick['X'].index(list(set(fp_sick['X']) - set(fp2['X']))[0])]
# Probability pipeline 2 gives the same review.
print 'New prob :: ', grid_search2.best_estimator_.predict_proba(set(fp_sick['X']) - set(fp2['X']))[0][1]

set([u"Gross.\n\nJust ordered fish tacos and my boyfriend a fish burrito. The fish was old and smelly so we took it out and just ate it veggie. Hopefully we won't get sick tonight..."])
set([])
Old prob ::  0.564806287835
New prob ::  0.47147320544


In [72]:
# transform_doc_1 worsens the false negative rate by two reviews.
# False negatives of the old classifier that pipeline 1 no longer produces:
print set(fn_sick['X']) - set(fn1['X'])
# False negatives that only pipeline 1 produces:
print set(fn1['X']) - set(fn_sick['X'])
# Probabilities pipeline 1 gave the two new false negatives (looked up by position in fn1).
# NOTE: the [0]/[1] indexing assumes the difference holds exactly two reviews and relies on set iteration order.
print 'New probs :: ', fn1['y'][fn1['X'].index(list(set(fn1['X']) - set(fn_sick['X']))[0])], fn1['y'][fn1['X'].index(list(set(fn1['X']) - set(fn_sick['X']))[1])]
# Probabilities the old classifier gives the same reviews.
print 'Old probs :: ', grid_search_sick.best_estimator_.predict_proba(set(fn1['X']) - set(fn_sick['X']))[:,1]

set([])
set([u"While there was a line of 4 customers, I saw the staff jokingly do nothing in front of my eyes while there was nothing left to do other than put the food into the boxes. This went on for 10 minutes. There were customers who didn't even have their orders taken.\n\nOn top of that they got multiple orders wrong, and there were only 4 customers on line! They had 4 people behind the counter. It's mind boggling how incompetent they were.\n\nOn top of the bad service, I puked up my lunch. It was gross and disgusting. \n\nI should've known better. If the service was that terrible then how do I know that the food is fresh, not passed the expiration date, how do I know that the cooks washed their hands.\n\nI don't. \n\nDon't go here. You'll get food poisoning.", u"The first thing I noticed when going here for lunch was the rap music that was blasting. This isn't a club/bar, it's a restaurant. I won't be going back just so I don't have to listen to that all through my lunch. \n\nTh

In [74]:
# transform_doc_2 worsens the false negative rate by one review.
# False negatives of the old classifier that pipeline 2 no longer produces:
print set(fn_sick['X']) - set(fn2['X'])
# False negatives that only pipeline 2 produces:
print set(fn2['X']) - set(fn_sick['X'])
# Probability the old classifier gives the new false negative.
# NOTE: the [0] indexing here and below is only unambiguous while the difference holds one review.
print 'Old prob :: ', grid_search_sick.best_estimator_.predict_proba(set(fn2['X']) - set(fn_sick['X']))[0][1]
# Probability pipeline 2 gave the same review (looked up by its position in fn2).
print 'New prob :: ', fn2['y'][fn2['X'].index(list(set(fn2['X']) - set(fn_sick['X']))[0])]

set([])
set([u"The first thing I noticed when going here for lunch was the rap music that was blasting. This isn't a club/bar, it's a restaurant. I won't be going back just so I don't have to listen to that all through my lunch. \n\nThai iced tea, probably the best I've had and tom yum soup was decent, but Pad thai was not good, couldn't bring myself to finish it. Lacking in flavor all around.  The worst part was a few minutes after I left this place I felt sick to my stomach for a few hours. Wouldn't recommend this place."])
Old prob ::  0.573606147071
New prob ::  0.472581011559
