# Yelp Reviews Sentiment Analysis

In [1]:
import gensim.models as m
import numpy as np
import json

### Building a Corpus Class

In [2]:
class Corpus(object):
    """Line-oriented corpus backed by a text file.

    Iterating over an instance yields the file's lines one at a time, so the
    file is streamed rather than loaded in full.
    """

    def __init__(self, path):
        """Store the location of the corpus; the file is not opened here.

        Args:
            path: Path of the text file, handed to ``open`` on each iteration.
        """
        self.path = path

    def __iter__(self):
        """Yield every raw line of the file, line terminators included.

        The file is reopened on each call, so the corpus can be iterated
        more than once.
        """
        # The context manager guarantees the handle is closed when iteration
        # finishes or the generator is closed early, instead of leaving the
        # descriptor open until garbage collection.
        with open(self.path) as handle:
            for line in handle:
                yield line


class JsonCorpus(Corpus):
    """Corpus whose file stores one JSON document per line."""

    def __init__(self, path):
        """Store the path of the JSON-lines file (see ``Corpus.__init__``)."""
        super(JsonCorpus, self).__init__(path)

    def __parse_json(self, line):
        """Decode one line of the file into a Python object."""
        return json.loads(line)

    def head(self, n, return_type='json', pos_threshold=3):
        """Return the first ``n`` records of the file in the requested shape.

        Args:
            n: Maximum number of records to read. If the file holds fewer
                lines, only the available records are returned.
            return_type: One of
                ``'json'`` (the decoded records),
                ``'text_rating'`` (``[text, stars]`` pairs), or
                ``'text_sentiment'`` (``[text, 'pos' | 'neg']`` pairs).
            pos_threshold: A record is labelled ``'pos'`` when its ``'stars'``
                value is strictly greater than this, otherwise ``'neg'``.

        Returns:
            A list with one entry per record read.

        Raises:
            NameError: If ``return_type`` is not one of the supported values.
            KeyError: If a ``text_*`` shape is requested and a record lacks
                the ``'text'`` or ``'stars'`` key.
        """
        # Reject bad input before touching the file. Values are compared with
        # equality rather than ``is``: identity checks on strings only work
        # when both sides happen to be the same interned object, so a unicode
        # or computed string would wrongly be reported as invalid.
        if return_type not in ('json', 'text_rating', 'text_sentiment'):
            raise NameError('invalid return_type')

        records = []
        with open(self.path) as handle:
            for line in handle:
                # Stop once enough records are collected; a short file simply
                # ends the loop instead of leaking StopIteration to the caller.
                if len(records) >= n:
                    break
                records.append(self.__parse_json(line.strip()))

        if return_type == 'json':
            return records
        if return_type == 'text_rating':
            return [[record['text'], record['stars']] for record in records]
        return [[record['text'], 'pos' if record['stars'] > pos_threshold else 'neg']
                for record in records]

In [3]:
# Relative path to the Yelp review dump that the rest of the notebook reads.
REVIEWS_PATH = '../dataset/yelp_academic_dataset_review.json'
corpus = JsonCorpus(REVIEWS_PATH)

In [4]:
# Peek at the first review as a [text, sentiment-label] pair.
corpus.head(n=1, return_type='text_sentiment')

[[u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.',
  'pos']]

## NLP with scikit-learn

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Playground Example 

In [6]:
# Read the first 100 labelled reviews once and split them into parallel lists,
# rather than opening and parsing the file separately for each list.
playground_reviews = corpus.head(100, 'text_sentiment')
vocabulary = [text for text, label in playground_reviews]   # review texts
sentiment = [label for text, label in playground_reviews]   # 'pos' / 'neg' labels

In [7]:
# Learn the token vocabulary from the playground reviews, then encode those
# same reviews as a document-term count matrix.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(vocabulary)
X_train_counts = count_vectorizer.transform(vocabulary)

In [8]:
# Shape of the count matrix: one row per playground review; the columns are
# presumably one per distinct token learned by the vectorizer.
X_train_counts.shape

(100, 2176)

In [9]:
# Train multinomial naive Bayes on the token counts. `clf` is kept as a short
# alias for the fitted estimator, which is what `fit` hands back.
multinomial_naive_bayes = MultinomialNB()
multinomial_naive_bayes.fit(X_train_counts, sentiment)
clf = multinomial_naive_bayes

In [10]:
# Sanity-check the toy classifier on one clearly positive and one clearly
# negative phrase; each is vectorized with the already-fitted vocabulary.
for phrase in ['this is good food', 'that place was bad']:
    print(clf.predict(count_vectorizer.transform([phrase])))

['pos']
['neg']


### Improving the classifier

In [84]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [56]:
# Load the first 20000 labelled reviews once (the file used to be read and
# parsed twice, once for the texts and once for the labels).
labelled_reviews = corpus.head(20000, 'text_sentiment')
X = [text for text, label in labelled_reviews]   # review texts
y = [label for text, label in labelled_reviews]  # 'pos' / 'neg' labels

# Hold out a third of the reviews for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [59]:
# Baseline: raw token counts fed straight into multinomial naive Bayes.
baseline_steps = [
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
]
text_clf_1 = Pipeline(baseline_steps)

# Challenger: token counts -> TF-IDF weighting -> linear model trained by SGD
# with hinge loss and L2 regularisation.
# NOTE(review): `n_iter` was replaced by `max_iter` in later scikit-learn
# releases - confirm against the installed version before upgrading.
linear_sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    n_iter=5,
    random_state=42,
)
text_clf_2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', linear_sgd),
])

In [74]:
%time text_clf_1.fit(X_train, y_train)
print '\ntext_clf_1\n %s \n' %metrics.classification_report(y_test, 
                                                            text_clf_1.predict(X_test))

%time text_clf_2.fit(X_train, y_train)
print '\ntext_clf_2\n %s \n' %metrics.classification_report(y_test, 
                                                            text_clf_2.predict(X_test))

CPU times: user 1.93 s, sys: 57.3 ms, total: 1.98 s
Wall time: 1.97 s

text_clf_1
              precision    recall  f1-score   support

        neg       0.79      0.71      0.75      2374
        pos       0.85      0.89      0.87      4226

avg / total       0.82      0.83      0.82      6600
 

CPU times: user 2.09 s, sys: 217 ms, total: 2.31 s
Wall time: 2.2 s

text_clf_2
              precision    recall  f1-score   support

        neg       0.90      0.58      0.70      2374
        pos       0.80      0.96      0.88      4226

avg / total       0.84      0.83      0.81      6600
 



In [68]:
# Spot-check the TF-IDF + SGD pipeline on two hand-written reviews. The
# pipeline takes raw text, so no manual vectorizing is needed here.
sample_reviews = [
    'oh I love this place it is so good the food is nice',
    'the food was really super bad',
]
for review in sample_reviews:
    print(text_clf_2.predict([review]))

['pos']
['neg']


In [77]:
# Hyper-parameter grid for the text_clf_2 pipeline. Keys use the
# `<step name>__<parameter>` form so each value list is routed to that step.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),        # with / without inverse-document-frequency weighting
    clf__alpha=(1e-2, 1e-3),             # SGD regularisation strength
)

In [80]:
# Exhaustive search over `parameters` for the TF-IDF + SGD pipeline;
# n_jobs=-1 asks for all available CPU cores.
gs_clf = GridSearchCV(
    estimator=text_clf_2,
    param_grid=parameters,
    n_jobs=-1,
)

In [81]:
# Run the grid search on the training split; %time reports how long the
# whole search takes.
%time gs_clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (0.01, 0.001)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [88]:
# Tabulate the cross-validation results, best mean test score first.
cv_results = pd.DataFrame(gs_clf.cv_results_)
cv_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__alpha,param_tfidf__use_idf,param_vect__ngram_range,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
4,2.514964,1.176068,0.815149,0.840485,0.001,True,"(1, 1)","{u'vect__ngram_range': (1, 1), u'tfidf__use_id...",1,0.80752,0.841245,0.814375,0.842064,0.823556,0.838146,0.061501,0.010844,0.00657,0.001687
6,2.677888,1.220552,0.793806,0.807724,0.001,False,"(1, 1)","{u'vect__ngram_range': (1, 1), u'tfidf__use_id...",2,0.790734,0.806538,0.788401,0.809828,0.802284,0.806805,0.122359,0.060093,0.006069,0.001492
7,8.585097,2.645326,0.790896,0.810709,0.001,False,"(1, 2)","{u'vect__ngram_range': (1, 2), u'tfidf__use_id...",3,0.789167,0.810233,0.784147,0.808708,0.799373,0.813186,0.373139,0.02462,0.006335,0.001859
5,11.255774,3.961326,0.722239,0.725859,0.001,True,"(1, 2)","{u'vect__ngram_range': (1, 2), u'tfidf__use_id...",4,0.727171,0.731079,0.721451,0.723416,0.718092,0.72308,0.208553,0.050215,0.003748,0.003694
2,2.716545,1.379582,0.628731,0.628657,0.01,False,"(1, 1)","{u'vect__ngram_range': (1, 1), u'tfidf__use_id...",5,0.628469,0.628751,0.629198,0.628498,0.628527,0.628722,0.036988,0.060439,0.000331,0.000113
0,2.585405,1.151708,0.628507,0.628507,0.01,True,"(1, 1)","{u'vect__ngram_range': (1, 1), u'tfidf__use_id...",6,0.628469,0.628527,0.628527,0.628498,0.628527,0.628498,0.044785,0.006754,2.7e-05,1.4e-05
1,11.339675,3.655587,0.628507,0.628507,0.01,True,"(1, 2)","{u'vect__ngram_range': (1, 2), u'tfidf__use_id...",6,0.628469,0.628527,0.628527,0.628498,0.628527,0.628498,0.331686,0.031999,2.7e-05,1.4e-05
3,10.036358,3.26035,0.628507,0.628507,0.01,False,"(1, 2)","{u'vect__ngram_range': (1, 2), u'tfidf__use_id...",6,0.628469,0.628527,0.628527,0.628498,0.628527,0.628498,0.298908,0.074574,2.7e-05,1.4e-05
