# Yelp Reviews Sentiment Analysis

In [1]:
import itertools
import json

import gensim.models as m
import numpy as np

### Building a Corpus Class

In [55]:
class Corpus(object):
    """Re-iterable view over the lines of a text file.

    Each call to ``iter()`` opens the file afresh, so the same instance can
    be traversed any number of times.
    """

    def __init__(self, path):
        """Store ``path``; the file is not opened until iteration starts."""
        self.path = path

    def __iter__(self):
        """Yield every line of the file unchanged (line terminator included)."""
        # The context manager guarantees the handle is closed when the
        # generator is exhausted or closed early, instead of leaving that to
        # the garbage collector.
        with open(self.path) as handle:
            for line in handle:
                yield line


class JsonCorpus(Corpus):
    """Corpus whose file holds one JSON document per line.

    Line-by-line iteration is inherited from ``Corpus``; ``head`` adds a way
    to peek at the first few parsed records.
    """

    def __init__(self, path):
        """Store ``path``; the file is only opened when data is requested."""
        super(JsonCorpus, self).__init__(path)

    def __parse_json(self, line):
        """Decode the single JSON document contained in the string ``line``."""
        return json.loads(line)

    def head(self, n, return_type='json', pos_threshold=3):
        """Return the first ``n`` records of the file.

        Parameters
        ----------
        n : int
            Maximum number of lines to read. If the file is shorter, only the
            available records are returned. Values <= 0 give an empty list.
        return_type : str
            Shape of each returned item:

            * ``'json'`` - the parsed record as returned by ``json.loads``;
            * ``'text_rating'`` - ``[record['text'], record['stars']]``;
            * ``'text_sentiment'`` - ``[record['text'], label]`` where
              ``label`` is ``'pos'`` when ``record['stars'] > pos_threshold``
              and ``'neg'`` otherwise.
        pos_threshold : number
            Star rating a review must strictly exceed to be labelled
            ``'pos'``. Only used for ``'text_sentiment'``.

        Returns
        -------
        list
            One item per record, in file order.

        Raises
        ------
        NameError
            If ``return_type`` is not one of the supported values. The
            exception type is kept for backward compatibility.
        """
        with open(self.path) as handle:
            # islice stops quietly at end-of-file, so a short file yields
            # fewer records instead of leaking a StopIteration.
            records = [self.__parse_json(line.strip())
                       for line in itertools.islice(handle, max(n, 0))]

        # Compare by value: identity (`is`) only works for strings that
        # happen to be interned.
        if return_type == 'json':
            return records
        if return_type == 'text_rating':
            return [[record['text'], record['stars']] for record in records]
        if return_type == 'text_sentiment':
            return [[record['text'],
                     'pos' if record['stars'] > pos_threshold else 'neg']
                    for record in records]
        raise NameError('invalid return_type')

In [53]:
# Path of the Yelp review file that JsonCorpus parses line by line.
reviews_path = '../dataset/yelp_academic_dataset_review.json'
corpus = JsonCorpus(reviews_path)

In [54]:
# Preview the first review as a [text, 'pos'/'neg'] pair.
corpus.head(n=1, return_type='text_sentiment')

[[u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.',
  'pos']]

## NLP with scikit-learn

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

### Playground Example 

In [60]:
# Read and parse the first 100 reviews once, then split the [text, label]
# pairs into two parallel lists (previously the file was read twice).
labelled_reviews = corpus.head(100, 'text_sentiment')
vocabulary = [text for text, _ in labelled_reviews]
sentiment = [label for _, label in labelled_reviews]

In [63]:
# Fit a bag-of-words vectorizer on the review texts and encode each review
# as a row of token counts.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(vocabulary)

In [65]:
# Sanity-check the matrix dimensions; presumably
# (number of reviews, number of distinct tokens) - confirm against the
# scikit-learn CountVectorizer documentation.
X_train_counts.shape

(100, 2176)

In [67]:
# Train a multinomial Naive Bayes classifier on the count features, using the
# 'pos'/'neg' labels in `sentiment` as targets.
multinomial_naive_bayes = MultinomialNB()
# NOTE(review): scikit-learn documents fit() as returning the estimator
# itself, so `clf` should be the same object as `multinomial_naive_bayes`.
clf = multinomial_naive_bayes.fit(X_train_counts, sentiment)

In [73]:
# Classify two unseen sentences. New text must go through the already-fitted
# vectorizer (transform, not fit_transform) so it is encoded with the
# training vocabulary.
# print(...) with one parenthesised argument prints the same on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(clf.predict(count_vectorizer.transform(['this is good food'])))
print(clf.predict(count_vectorizer.transform(['that place was bad'])))

['pos']
['neg']
