In [4]:
import copy
import pandas
import numpy

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV

In [2]:
class ReviewSentiment:
    """Fit, tune and evaluate a TF-IDF + linear classifier on labelled text.

    The instance wraps a DataFrame holding a text column and a label column,
    and a two-step scikit-learn ``Pipeline`` ('vectorizer' -> 'classifier').

    Attributes:
        data: the DataFrame passed to the constructor. ``predictSentiment``
            adds a 'sentiment' column to this very object (in place).
        label: name of the column in ``data`` holding the true labels.
        text: name of the column in ``data`` holding the raw text.
        classifier: the estimator used as the final pipeline step.
        pipeline: the most recently built/fitted estimator. It is the plain
            ``Pipeline`` after construction and after ``predictSentiment``,
            and the fitted ``GridSearchCV`` after ``tuneClassifier``.
    """

    # Template for the default classifier (hinge loss, L2 penalty). It is never
    # fitted itself: every instance that does not supply its own classifier
    # receives an independent deep copy of it.
    _classifier = SGDClassifier(loss='hinge',
                                penalty='l2',
                                n_iter=5
                               )

    # Calculated from `subset = reviews.sample(frac=.01, random_state=2016)`
    # (see 'review_data_classify_tune' module).
    # Used by `predictSentiment` when no explicit parameters are given.
    _optimalParameters = {'vectorizer__ngram_range': (1, 2),
                          'vectorizer__use_idf': True,
                          'vectorizer__token_pattern': "[a-z']{2,}",
                          'vectorizer__stop_words': 'english',
                          'vectorizer__min_df': 0.01,
                          'vectorizer__max_df': 0.99,
                          'classifier__alpha': 0.001
                         }

    def __init__(self, data, label, text, classifier=None):
        """Store the data set and build the vectorizer/classifier pipeline.

        Args:
            data: DataFrame containing at least the ``label`` and ``text``
                columns.
            label: name of the column with the true labels.
            text: name of the column with the raw text.
            classifier: estimator to use as the 'classifier' pipeline step.
                When omitted, a fresh deep copy of the class-level template
                is used, so instances never share a fitted model.
        """
        # A default evaluated in the signature would be created once and
        # shared by every instance; copy per instance instead.
        if classifier is None:
            classifier = copy.deepcopy(self._classifier)

        self.data = data
        self.label = label
        self.text = text
        self.classifier = classifier
        self.pipeline = Pipeline([('vectorizer', TfidfVectorizer()),
                                  ('classifier', self.classifier)
                                ])

    def _basePipeline(self):
        """Return the plain Pipeline, unwrapping a grid search if one is stored."""
        if isinstance(self.pipeline, GridSearchCV):
            # `estimator` is the (unfitted) pipeline the search was built from.
            return self.pipeline.estimator
        return self.pipeline

    def tuneClassifier(self, tuningParameters, kfolds):
        """Grid-search the pipeline's parameters with cross-validation.

        After the call ``self.pipeline`` is the fitted ``GridSearchCV``.
        The method may be called repeatedly, and ``predictSentiment`` may be
        called afterwards; both always operate on the underlying Pipeline.

        Args:
            tuningParameters: parameter grid, keyed by pipeline parameter
                names such as 'vectorizer__min_df' or 'classifier__alpha'.
            kfolds: cross-validation setting passed to ``GridSearchCV`` as
                ``cv``.

        Returns:
            DataFrame with one row per parameter combination and the columns
            'parameters', 'mean_score' and 'confidence' (twice the standard
            deviation of the per-fold scores).
        """
        # Always search over the plain pipeline: wrapping an earlier search
        # would make the 'vectorizer__*' / 'classifier__*' grid keys invalid.
        search = GridSearchCV(estimator=self._basePipeline(),
                              param_grid=tuningParameters,
                              cv=kfolds,
                              n_jobs=-1,
                              # score a failing combination as NaN instead of
                              # aborting the whole search
                              error_score=numpy.nan
                             )
        self.pipeline = search.fit(self.data[self.text], self.data[self.label])

        records = [(params, mean_score, scores.std() * 2)
                   for params, mean_score, scores in self.pipeline.grid_scores_]
        return pandas.DataFrame(records,
                                columns=['parameters', 'mean_score', 'confidence'])

    def predictSentiment(self, modelParameters=None):
        """Fit the pipeline on the stored data and predict a label per row.

        Note that the model is fitted and evaluated on the same rows, and
        that the predictions are written into ``self.data`` in place as the
        column 'sentiment'.

        Args:
            modelParameters: pipeline parameters to apply before fitting
                (same key format as the tuning grid). Defaults to the
                class-level ``_optimalParameters``; those include
                'classifier__alpha', so a custom classifier presumably needs
                an ``alpha`` parameter for the default to be usable.

        Returns:
            ``self.data`` with the added 'sentiment' column.
        """
        if modelParameters is None:
            modelParameters = self._optimalParameters

        # A stored grid search does not accept 'vectorizer__*' keys; go back
        # to the plain pipeline before applying the parameters.
        self.pipeline = self._basePipeline()
        (self.pipeline
         .set_params(**modelParameters)
         .fit(self.data[self.text], self.data[self.label])
        )
        self.data['sentiment'] = self.pipeline.predict(self.data[self.text])
        return self.data

    def accuracy(self):
        """Return the fraction of rows whose 'sentiment' equals the true label.

        Requires ``predictSentiment`` to have been called first.
        """
        return numpy.mean(self.data['sentiment'] == self.data[self.label])

    def classificationReport(self):
        """Return scikit-learn's text classification report (true vs. predicted).

        Requires ``predictSentiment`` to have been called first.
        """
        return metrics.classification_report(self.data[self.label], self.data['sentiment'])

    def confusionMatrix(self):
        """Return the confusion matrix of true labels vs. predicted 'sentiment'.

        Requires ``predictSentiment`` to have been called first.
        """
        return metrics.confusion_matrix(self.data[self.label], self.data['sentiment'])