In [1]:
import pandas as pd

In [11]:
# Load the reviews CSV; the path is relative to the notebook's working directory.
review_data = pd.read_csv('data/Reviews.csv')

In [12]:
# Non-null values per column (a quick check for missing data).
review_data.count()

Id                        568454
ProductId                 568454
UserId                    568454
ProfileName               568438
HelpfulnessNumerator      568454
HelpfulnessDenominator    568454
Score                     568454
Time                      568454
Summary                   568428
Text                      568454
dtype: int64

In [13]:
# Work on a 10,000-row random subset. The seed is fixed so that a fresh
# "Restart & Run All" draws the same rows and reproduces the results below.
review_data = review_data.sample(n=10000, random_state=42)

In [14]:
# Non-null values per column after sampling.
review_data.count()

Id                        10000
ProductId                 10000
UserId                    10000
ProfileName               10000
HelpfulnessNumerator      10000
HelpfulnessDenominator    10000
Score                     10000
Time                      10000
Summary                    9999
Text                      10000
dtype: int64

In [15]:
# Preview the first rows of the sampled frame.
review_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
275349,275350,B005VOONI0,ADS5APY1NKTL4,"D. Pawl ""Dani""",1,2,5,1334448000,Nice and strong....,This coffee is like none I have ever had. For ...
446008,446009,B0029NIMYW,A23CIZQS0Q3K93,Lee Schuler,0,0,5,1345593600,Spoiled chihuahua loves it!!!,I started feeding this when my vet told me to ...
192432,192433,B006GA666U,A1NAM2T05NTLVX,mac,0,0,5,1346716800,big easy is great,as good a strong coffee as any other quality b...
390000,390001,B0084B8Z90,A1A97CBY4L4R8I,Quetzal09,0,0,5,1344729600,Great Taste!!!,"This hot sauce taste just great on anything, i..."
236220,236221,B0018OJJZO,ATVSO72U28NQX,"M. Papadopoulos ""Mihalis the Greek""",1,1,5,1318118400,Absolutely delicious and addicting!,These small crackers are baked so they are not...


In [16]:
# Keep only the two text columns and the rating.
review_data = review_data.loc[:, ['Text', 'Summary', 'Score']]

#### Remove all rows with score 3 (neutral comments)

In [17]:
# Keep every row whose Score is not 3.
review_data = review_data.loc[review_data['Score'].ne(3)]

In [31]:
# Drop rows with any missing value. Rebinding the result (instead of
# inplace=True) avoids mutating a frame that was produced by filtering, which
# can trigger SettingWithCopyWarning, and keeps the cell chain-friendly.
review_data = review_data.dropna()
review_data.count()

Text       9267
Summary    9267
Score      9267
dtype: int64

In [20]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [25]:
from sklearn_pandas import DataFrameMapper
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest

In [21]:
# Both text columns get the same treatment: word counts with English stop
# words removed, followed by a TF-IDF transform. Each column gets its own
# freshly constructed transformer pair.
mapper = DataFrameMapper([
    (column, [CountVectorizer(stop_words='english'), TfidfTransformer()])
    for column in ('Text', 'Summary')
])

In [22]:
from sklearn.pipeline import Pipeline

In [28]:
# Three named stages, run in order: feature extraction, feature selection,
# and a multinomial naive Bayes classifier.
mn_pipeline = Pipeline(steps=[
    ('mapper', mapper),
    ('select', SelectKBest(k=1000)),  # keep the 1000 best-scoring features
    ('clf', MultinomialNB()),
])

In [32]:
# Fit the whole pipeline: the two text columns are the inputs, Score is the target.
mn_pipeline.fit(review_data[['Text','Summary']], review_data.Score)

Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('Text', [CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
  ...assif at 0x000002616342AF28>)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [33]:
# NOTE(review): this scores the pipeline on the very rows it was fitted on in
# the previous cell, so the figure is a training-set score, not a held-out
# estimate. TODO: evaluate on a separate test split.
mn_pipeline.score(review_data[['Text','Summary']], review_data.Score)

0.74576454084385457

In [35]:
# Spot-check: predicted scores for the first two reviews.
mn_pipeline.predict(review_data[['Text', 'Summary']].head(2))

array([5, 5], dtype=int64)

In [36]:
# Number of rows per Score value, i.e. the class balance of the target.
review_data.groupby('Score').count()

Unnamed: 0_level_0,Text,Summary
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
1,877,877
2,520,520
4,1396,1396
5,6474,6474


In [41]:
class FoodClassifier:
    """Fit and compare several classifiers behind the same feature pipeline.

    Every classifier is wrapped in a ``Pipeline`` made of the module-level
    ``mapper``, a ``SelectKBest(k=1000)`` step and the classifier itself.

    Attributes:
        classifiers: the estimators passed to the constructor.
        pipelines: one pipeline per classifier, in the same order; filled by
            ``create_pipelines()``.
        scores: results of the most recent ``score()`` call, in the same
            order as ``pipelines``.
    """

    def __init__(self, classifiers):
        """Store the classifiers; pipelines are built later.

        Args:
            classifiers: iterable of estimator objects, each used as the
                final ``'clf'`` step of its own pipeline.
        """
        self.classifiers = classifiers
        # Initialised here so train()/score() never hit a missing attribute.
        self.pipelines = []
        self.scores = []

    def create_pipelines(self):
        """(Re)build one pipeline per classifier, replacing any existing ones."""
        # NOTE: every pipeline references the same module-level `mapper`
        # object rather than a private copy.
        self.pipelines = [
            Pipeline([
                ('mapper', mapper),
                ('select', SelectKBest(k=1000)),
                ('clf', classifier)])
            for classifier in self.classifiers
        ]

    def train(self, trainX, trainY):
        """Fit every pipeline on the same training data.

        Pipelines are created on demand if ``create_pipelines()`` has not
        been called yet.

        Args:
            trainX: training inputs, passed unchanged to each pipeline's ``fit``.
            trainY: training targets, passed unchanged to each pipeline's ``fit``.
        """
        if not self.pipelines:
            self.create_pipelines()
        for pipeline in self.pipelines:
            pipeline.fit(trainX, trainY)

    def score(self, testX, testY):
        """Score every pipeline and return the results.

        The results are stored in ``self.scores``. They must not be stored
        in ``self.score``: that would replace this method with a list on the
        instance and make any later ``score()`` call fail.

        Args:
            testX: evaluation inputs, passed to each pipeline's ``score``.
            testY: evaluation targets, passed to each pipeline's ``score``.

        Returns:
            list: one score per pipeline, in the same order as ``pipelines``.
        """
        self.scores = [pipeline.score(testX, testY) for pipeline in self.pipelines]
        return self.scores
    
        

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [43]:
# Compare three model families on identical features.
clf = FoodClassifier([LogisticRegression(), RandomForestClassifier(), MultinomialNB()])

In [44]:
# Wrap each classifier in its own mapper -> select -> clf pipeline.
clf.create_pipelines()

In [45]:
# Fit every pipeline on the text columns with Score as the target.
clf.train(review_data[['Text','Summary']], review_data.Score)

In [46]:
# NOTE(review): scored on the same rows used for training in the preceding
# cell, so the results are training-set scores rather than held-out estimates.
clf.score(review_data[['Text','Summary']], review_data.Score)