# Yelp Reviews Sentiment Analysis

In [1]:
import gensim.models as m
import numpy as np
import json

### Building a Corpus Class

In [2]:
class Corpus(object):
    """Line-oriented text corpus backed by a file on disk.

    Iterating over an instance yields the file's lines one at a time
    (line terminators included), so the file is streamed rather than
    loaded into memory as a whole.
    """

    def __init__(self, path):
        """Remember where the corpus lives; the file is not opened here.

        Args:
            path: Filesystem path of the text file to read.
        """
        self.path = path

    def __iter__(self):
        """Yield every raw line of the file, reopening it on each iteration."""
        # The context manager closes the handle once the generator is
        # exhausted or closed, instead of leaving it open until the file
        # object happens to be garbage collected.
        with open(self.path) as handle:
            for line in handle:
                yield line


class JsonCorpus(Corpus):
    """Corpus whose file holds one JSON document per line."""

    def __init__(self, path):
        """Store the path of the JSON-lines file (nothing is read yet).

        Args:
            path: Filesystem path of the file to read.
        """
        super(JsonCorpus, self).__init__(path)

    def __parse_json(self, line):
        """Decode a single line of text into a Python object."""
        return json.loads(line)

    def head(self, n, return_type='json', pos_threshold=3):
        """Return the first ``n`` records of the file.

        Args:
            n: Maximum number of records to read. If the file has fewer
                lines, every available record is returned; a value of zero
                or less yields an empty list.
            return_type: Shape of each returned item:
                ``'json'`` - the parsed record as-is;
                ``'text_rating'`` - ``[record['text'], record['stars']]``;
                ``'text_sentiment'`` - ``[record['text'], label]`` where the
                label is ``'pos'`` or ``'neg'``.
            pos_threshold: A record is labelled ``'pos'`` when its ``stars``
                value is strictly greater than this, ``'neg'`` otherwise.

        Returns:
            A list with one item per record, in file order.

        Raises:
            NameError: If ``return_type`` is not one of the values above.
                (The exception type is kept as-is so existing callers that
                catch it keep working.)
        """
        records = []
        with open(self.path) as handle:
            for line in handle:
                # Stop as soon as enough records were collected; this also
                # copes with files shorter than ``n`` without raising.
                if len(records) >= n:
                    break
                records.append(self.__parse_json(line.strip()))

        # Strings are compared by value: identity (``is``) only happens to
        # work when the interpreter interns both literals.
        if return_type == 'json':
            return records
        elif return_type == 'text_rating':
            return [[r['text'], r['stars']] for r in records]
        elif return_type == 'text_sentiment':
            return [[r['text'], 'pos' if r['stars'] > pos_threshold else 'neg']
                    for r in records]
        else:
            raise NameError('invalid return_type')

In [3]:
# Path of the review dump, relative to this notebook. Building the corpus
# only stores the path; the file is opened later, when records are requested.
REVIEWS_PATH = '../dataset/yelp_academic_dataset_review.json'
corpus = JsonCorpus(REVIEWS_PATH)

In [4]:
# Peek at the first review as a [text, 'pos'/'neg'] pair.
corpus.head(n=1, return_type='text_sentiment')

[[u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.',
  'pos']]

## NLP with scikit-learn

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Playground Example 

In [6]:
# Read the first 100 labelled reviews once, then split them into parallel
# lists of texts and labels (calling head() twice would parse the file twice).
playground_samples = corpus.head(100, 'text_sentiment')
vocabulary = [sample[0] for sample in playground_samples]
sentiment = [sample[1] for sample in playground_samples]

In [7]:
# Learn the token vocabulary from the review texts and encode every review
# as a row of token counts in a single pass.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(raw_documents=vocabulary)

In [8]:
# (number of reviews, number of distinct tokens in the fitted vocabulary)
X_train_counts.shape

(100, 2176)

In [9]:
# Train multinomial Naive Bayes on the token counts. `fit` returns the
# estimator itself, so `clf` and `multinomial_naive_bayes` are the same object.
multinomial_naive_bayes = MultinomialNB()
clf = multinomial_naive_bayes.fit(X=X_train_counts, y=sentiment)

In [10]:
# Sanity-check the toy classifier on two hand-written sentences. New text must
# go through the already-fitted vectorizer (transform, not fit_transform) so
# its columns line up with the training matrix.
for sentence in ['this is good food', 'that place was bad']:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(clf.predict(count_vectorizer.transform([sentence])))

['pos']
['neg']


### Improving the classifier

In [45]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [56]:
# Load the first 20000 labelled reviews once (calling head() twice would read
# and JSON-decode the same 20000 lines twice), then separate texts from labels.
labelled_reviews = corpus.head(20000, 'text_sentiment')
X = [review[0] for review in labelled_reviews]
y = [review[1] for review in labelled_reviews]

# Hold out a third of the reviews for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [59]:
# Baseline: raw token counts fed straight into multinomial Naive Bayes.
text_clf_1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Alternative: TF-IDF re-weighted counts fed into a linear model trained with
# stochastic gradient descent (hinge loss with an L2 penalty).
# NOTE(review): `n_iter` was renamed to `max_iter` in later scikit-learn
# releases - confirm against the installed version before upgrading.
text_clf_2 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          n_iter=5, random_state=42)),
])

In [60]:
%time text_clf_1.fit(X_train, y_train)
print 'text_clf_1 is achieving %0.2f accuracy \n' %np.mean(text_clf_1.predict(X_test) == y_test)

%time text_clf_2.fit(X_train, y_train)
print 'text_clf_2 is achieving %0.2f accuracy \n' %np.mean(text_clf_2.predict(X_test) == y_test)

CPU times: user 1.92 s, sys: 44.9 ms, total: 1.97 s
Wall time: 1.98 s
text_clf_1 is achieving 0.83 accuracy 

CPU times: user 2.03 s, sys: 188 ms, total: 2.21 s
Wall time: 2.11 s
text_clf_2 is achieving 0.83 accuracy 



In [68]:
# Try the tuned pipeline on two hand-written reviews. The pipeline vectorizes
# raw text itself, so plain strings can be passed directly.
for review in ['oh I love this place it is so good the food is nice',
               'the food was really super bad']:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(text_clf_2.predict([review]))

['pos']
['neg']
