In [1]:
import copy
import pandas
import numpy

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV

In [2]:
class ReviewSentiment:
    """Train, tune and evaluate a text classifier over a table of reviews.

    The default model is a bag-of-words pipeline: ``CountVectorizer`` ->
    ``TfidfTransformer`` -> linear ``SGDClassifier`` with hinge loss and an
    L2 penalty.

    Attributes
    ----------
    data : table indexed by column name (used as ``data[column]``)
        The reviews.  ``predictSentiment`` adds a ``'sentiment'`` column to
        this object in place.
    label : column key
        Column of ``data`` holding the target labels.
    text : column key
        Column of ``data`` holding the raw review text.
    pipeline : sklearn.pipeline.Pipeline
        The estimator that is tuned / fitted.
    """

    # Template pipeline.  It is never fitted directly: every instance that is
    # not given its own pipeline receives a deep copy of this object.
    # NOTE(review): `n_iter`, `sklearn.grid_search` and `grid_scores_` belong
    # to a legacy scikit-learn API -- confirm the pinned scikit-learn version
    # before upgrading the environment.
    _PIPELINE = Pipeline([('vectorizer', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('classifier', SGDClassifier(loss='hinge',
                                                       penalty='l2',
                                                       n_iter=5
                                                      )
                          )
                         ])

    # Calculated from `subset = reviews.sample(frac=.01, random_state=0)`
    # (see 'review_data_classify_tune' module).
    # Used by `predictSentiment` when no explicit parameters are supplied.
    _OPTIMAL_PARAMETERS = {'vectorizer__ngram_range': (1, 3),
                           'classifier__alpha': 0.0001,
                           'tfidf__use_idf': True
                          }

    def __init__(self, data, label, text, pipeline=None):
        """Store the data set, the column keys and the pipeline to use.

        Parameters
        ----------
        data : table indexed by column name
            Must support ``data[text]``, ``data[label]`` and column
            assignment (``data['sentiment'] = ...``).
        label : column key
            Column holding the target labels.
        text : column key
            Column holding the review text.
        pipeline : sklearn.pipeline.Pipeline, optional
            Estimator to use.  When omitted, a fresh deep copy of the class
            template ``_PIPELINE`` is created for this instance.
        """
        self.data = data
        self.label = label
        self.text = text
        # A default argument is evaluated only once, so a default of
        # `copy.deepcopy(_PIPELINE)` would hand the *same* pipeline object to
        # every instance; fitting one instance would then silently replace
        # the fitted model of all the others.  Copy per instance instead.
        self.pipeline = copy.deepcopy(self._PIPELINE) if pipeline is None else pipeline

    def tuneClassifier(self, tuningParameters):
        """Grid-search `tuningParameters` with 3-fold cross-validation.

        Parameters
        ----------
        tuningParameters : dict or list of dict
            Parameter grid passed straight to ``GridSearchCV``.

        Returns
        -------
        pandas.DataFrame
            One row per entry of ``grid_scores_`` with the columns
            ``'parameters'``, ``'mean_score'`` and ``'confidence'``, where
            ``'confidence'`` is twice the standard deviation of that entry's
            scores (presumably one score per CV fold -- confirm against the
            installed scikit-learn).
        """
        # error_score=numpy.nan: a parameter combination that fails to fit is
        # scored as NaN instead of aborting the whole search.
        gridSearch = GridSearchCV(self.pipeline, tuningParameters, cv=3, n_jobs=-1,
                                  error_score=numpy.nan)
        gridSearch.fit(self.data[self.text], self.data[self.label])

        records = [(params, mean_score, scores.std() * 2)
                   for params, mean_score, scores in gridSearch.grid_scores_]
        return pandas.DataFrame(records, columns=['parameters', 'mean_score', 'confidence'])

    def predictSentiment(self, modelParameters=None):
        """Fit the pipeline on the whole data set and predict every row.

        Parameters
        ----------
        modelParameters : dict, optional
            Pipeline parameters in ``step__param`` form, forwarded to
            ``Pipeline.set_params``.  Defaults to ``_OPTIMAL_PARAMETERS``.

        Returns
        -------
        The ``data`` object itself, with the predictions stored in a new (or
        overwritten) ``'sentiment'`` column.  The object is modified in
        place, not copied.
        """
        if modelParameters is None:
            modelParameters = self._OPTIMAL_PARAMETERS

        (self.pipeline
         .set_params(**modelParameters)
         .fit(self.data[self.text], self.data[self.label])
        )
        # Predictions are made on the same rows the model was just fitted on,
        # so the metrics below describe training performance.
        self.data['sentiment'] = self.pipeline.predict(self.data[self.text])
        return self.data

    def accuracy(self):
        """Return the fraction of rows whose 'sentiment' equals the label.

        Requires `predictSentiment` to have been called first; otherwise the
        'sentiment' column does not exist.
        """
        return numpy.mean(self.data['sentiment'] == self.data[self.label])

    def classificationReport(self):
        """Return ``metrics.classification_report`` of labels vs. 'sentiment'.

        Requires `predictSentiment` to have been called first.
        """
        return metrics.classification_report(self.data[self.label], self.data['sentiment'])

    def confusionMatrix(self):
        """Return ``metrics.confusion_matrix`` of labels vs. 'sentiment'.

        The true labels are passed first and the predictions second.
        Requires `predictSentiment` to have been called first.
        """
        return metrics.confusion_matrix(self.data[self.label], self.data['sentiment'])