In [2]:
import os
import numpy as np
import pandas as pd

from subprocess import check_output
print(check_output(['ls', './data/']).decode('utf8'))

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import json

yelp_academic_dataset_business.json
yelp_academic_dataset_review.json



In [3]:
from itertools import islice

from sklearn.feature_extraction.text import CountVectorizer

review_file = 'data/yelp_academic_dataset_review.json'
biz_file = 'data/yelp_academic_dataset_business.json'

# Number of reviews read from the top of the file for the n-gram experiments.
n_reviews = 1000

# The review file is parsed one JSON object per line. islice() stops cleanly at
# end-of-file, whereas calling readline() a fixed number of times hands an
# empty string to json.loads (which raises) when the file is shorter than
# n_reviews. The encoding is given explicitly so the result does not depend on
# the platform's default locale.
with open(review_file, encoding='utf-8') as f:
    js = [json.loads(line) for line in islice(f, n_reviews)]

review_df = pd.DataFrame(js)

# take a look at the first five reviews
review_df[['text']][:5]


                                                text
0  My wife took me here on my birthday for breakf...
1  I have no idea why some people give bad review...
2  love the gyro plate. Rice is so good and I als...
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4  General Manager Scott Petello is a good egg!!!...

In [4]:
#bag-of-words
def feature_names(vectorizer):
    """Return the vocabulary of a fitted vectorizer as a list of strings.

    Newer scikit-learn releases expose the vocabulary through
    `get_feature_names_out()` (which returns an array) and no longer provide
    `get_feature_names()`; older releases only have the latter. This helper
    uses whichever is available and always returns a plain list so that
    slicing and printing behave the same on either version.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# The custom token pattern keeps single-character tokens, which the default
# CountVectorizer pattern would drop.
bow_converter = CountVectorizer(token_pattern='(?u)\\b\\w+\\b') # unigram
bigram_converter = CountVectorizer(ngram_range=(2, 2),
                                   token_pattern='(?u)\\b\\w+\\b')
trigram_converter = CountVectorizer(ngram_range=(3, 3),
                                    token_pattern='(?u)\\b\\w+\\b')

bow_converter.fit(review_df['text'])
words = feature_names(bow_converter)

bigram_converter.fit(review_df['text'])
bigrams = feature_names(bigram_converter)

trigram_converter.fit(review_df['text'])
trigrams = feature_names(trigram_converter)

# Vocabulary size for unigrams, bigrams and trigrams respectively
print(len(words), len(bigrams), len(trigrams))


9925 65263 111809


In [5]:
# Show the first five unigrams and the last five bigrams / trigrams,
# with a ruler line printed between consecutive samples.
samples = (words[:5], bigrams[-5:], trigrams[-5:])
for position, sample in enumerate(samples):
    if position:
        print('='*80)
    print(sample)


['0', '00', '000', '00pm', '02']
['zucchini the', 'zucchini was', 'zuch and', 'zupas officially', 'zuzus room']
['zucchini the zucchini', 'zucchini was really', 'zuch and asparagus', 'zupas officially opens', 'zuzus room service']


In [6]:

# Count distinct entries in each vocabulary; the trigram count is left as the
# cell's last expression so it is shown as the cell output.
n_unique_words = len(set(words))
n_unique_bigrams = len(set(bigrams))
summary = " unique unigrams={}\n unique bigrams={}"
print(summary.format(n_unique_words, n_unique_bigrams))

len(set(trigrams))

 unique unigrams=9925
 unique bigrams=65263


111809

## Use of Stemming

In [7]:
import nltk

# Build a Porter stemmer and stem one sample word; the stem is the cell output.
stemmer = nltk.stem.PorterStemmer()
sample_word = 'zeroes'
stemmer.stem(sample_word)

'zero'

In [11]:
import spacy

# use the first 10 reviews for exploration
review_df2 = review_df[:10]

# Load the English pipeline by its package name. The 'en' shortcut only exists
# in older spaCy installs (it is a link created at download time), so it is
# kept purely as a fallback for such environments.
# NOTE(review): assumes the 'en' link pointed at en_core_web_sm -- confirm.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

# Run the pipeline on every review text; each entry of doc_df is a parsed document
doc_df = review_df2['text'].apply(nlp)

# Iterating over a parsed document yields its tokens; print the surface text,
# the coarse part-of-speech and the fine-grained tag of each one.
for token in doc_df[4]:
    print([token.text, token.pos_, token.tag_])


['General', 'PROPN', 'NNP']
['Manager', 'PROPN', 'NNP']
['Scott', 'PROPN', 'NNP']
['Petello', 'PROPN', 'NNP']
['is', 'VERB', 'VBZ']
['a', 'DET', 'DT']
['good', 'ADJ', 'JJ']
['egg', 'NOUN', 'NN']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['Not', 'ADV', 'RB']
['to', 'PART', 'TO']
['go', 'VERB', 'VB']
['into', 'ADP', 'IN']
['detail', 'NOUN', 'NN']
[',', 'PUNCT', ',']
['but', 'CCONJ', 'CC']
['let', 'VERB', 'VB']
['me', 'PRON', 'PRP']
['assure', 'VERB', 'VB']
['you', 'PRON', 'PRP']
['if', 'ADP', 'IN']
['you', 'PRON', 'PRP']
['have', 'VERB', 'VBP']
['any', 'DET', 'DT']
['issues', 'NOUN', 'NNS']
['(', 'PUNCT', '-LRB-']
['albeit', 'ADP', 'IN']
['rare', 'ADJ', 'JJ']
[')', 'PUNCT', '-RRB-']
['speak', 'VERB', 'VBP']
['with', 'ADP', 'IN']
['Scott', 'PROPN', 'NNP']
['and', 'CCONJ', 'CC']
['treat', 'VERB', 'VB']
['the', 'DET', 'DT']
['guy', 'NOUN', 'NN']
['with', 'ADP', 'IN']
['some', 'DET', 'DT']
['respect', 'NOUN', 'NN']
['as', 'ADP', 'IN']
['you', 'PRON', 'PRP']
['state', 'VERB'

In [12]:
# noun chunking: collect the noun chunks of the review at label 4 into a list
fifth_doc = doc_df[4]
print(list(fifth_doc.noun_chunks))

[General Manager Scott Petello, a good egg, detail, me, you, you, any issues, Scott, the guy, some respect, you, your case, I, you, I, I, Mistakes, it, we, them, Thanks, Scott, his awesome staff, You, a customer, life]


## Use `textblob`

In [17]:
# If TextBlob complains about a missing tokenizer resource, run
# nltk.download('punkt') once and re-run this cell.

from textblob import TextBlob

# Wrap each of the ten sample reviews in a TextBlob
review_texts = review_df2['text']
blob_df = review_texts.apply(TextBlob)

# Part-of-speech tags of the review at label 4 (shown as the cell output)
fifth_blob = blob_df[4]
fifth_blob.tags

[('General', 'NNP'),
 ('Manager', 'NNP'),
 ('Scott', 'NNP'),
 ('Petello', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('egg', 'NN'),
 ('Not', 'RB'),
 ('to', 'TO'),
 ('go', 'VB'),
 ('into', 'IN'),
 ('detail', 'NN'),
 ('but', 'CC'),
 ('let', 'VB'),
 ('me', 'PRP'),
 ('assure', 'VB'),
 ('you', 'PRP'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('have', 'VBP'),
 ('any', 'DT'),
 ('issues', 'NNS'),
 ('albeit', 'IN'),
 ('rare', 'NN'),
 ('speak', 'NN'),
 ('with', 'IN'),
 ('Scott', 'NNP'),
 ('and', 'CC'),
 ('treat', 'VB'),
 ('the', 'DT'),
 ('guy', 'NN'),
 ('with', 'IN'),
 ('some', 'DT'),
 ('respect', 'NN'),
 ('as', 'IN'),
 ('you', 'PRP'),
 ('state', 'NN'),
 ('your', 'PRP$'),
 ('case', 'NN'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ("'d", 'MD'),
 ('be', 'VB'),
 ('surprised', 'VBN'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('walk', 'VB'),
 ('out', 'RP'),
 ('totally', 'RB'),
 ('satisfied', 'JJ'),
 ('as', 'IN'),
 ('I', 'PRP'),
 ('just', 'RB'),
 ('did', 'VBD'),
 ('Like', 'IN'),
 ('

In [21]:
# Noun phrases of the review at label 4. list() is used instead of a
# comprehension whose loop variable was named `np`, which shadowed the numpy
# alias imported at the top of the notebook.
print(list(blob_df[4].noun_phrases))

['general manager', 'scott petello', 'good egg', 'scott', "n't walk", '... ..', 'mistakes', 'thanks', 'scott', 'awesome staff', '... ... ...']


## Tf-Idf (term frequency-inverse document frequency)

- $\text{bow}(w, d)$ = number of times word $w$ appears in document $d$
- $\text{tf-idf}(w, d)$ = $\text{bow}(w, d) \times N$ / (number of documents in which word $w$ appears)

> $N$ is the total number of documents

In [27]:
# load datasets

def load_json_lines(path):
    """Read a file holding one JSON object per line into a DataFrame.

    The file handle is iterated lazily rather than via readlines(), so the raw
    text of the whole file is never held in memory next to the parsed records.
    Blank lines are skipped instead of being passed to json.loads, and the
    encoding is fixed so the result does not depend on the platform default.
    """
    with open(path, encoding='utf-8') as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])


biz_df = load_json_lines(biz_file)

# NOTE: this rebinds review_df from the 1000-review sample to the full file.
review_df = load_json_lines(review_file)
   

In [28]:
def has_any_category(categories, *wanted):
    """Return True if `categories` contains at least one of the `wanted` names.

    `categories` is the value of one cell of the 'categories' column. Values
    that are not a container or string (e.g. a missing entry) count as having
    no categories instead of raising a TypeError.
    """
    if not isinstance(categories, (list, tuple, set, frozenset, str)):
        return False
    return any(name in categories for name in wanted)


# Keep only businesses tagged as Nightlife or Restaurants. The test is applied
# to the 'categories' column alone rather than row-wise over the whole frame,
# because only that one column is inspected.
is_two_biz = biz_df['categories'].apply(has_any_category,
                                        args=('Nightlife', 'Restaurants'))
two_biz = biz_df[is_two_biz]

# Attach every review to its business; the inner join drops businesses without reviews
twobiz_reviews = two_biz.merge(review_df, on='business_id', how='inner')

# create a target column: True for Nightlife businesses, False otherwise
twobiz_reviews['target'] = twobiz_reviews['categories'].apply(has_any_category,
                                                              args=('Nightlife',))

In [29]:
# Preview the first five merged business/review rows
twobiz_reviews.head()

                                          attributes             business_id  \
0  {'Take-out': True, 'Wi-Fi': 'no', 'Good For': ...  mQfT3JYu18HN22DVylcE7A   
1  {'Take-out': True, 'Wi-Fi': 'no', 'Good For': ...  mQfT3JYu18HN22DVylcE7A   
2  {'Take-out': True, 'Wi-Fi': 'no', 'Good For': ...  mQfT3JYu18HN22DVylcE7A   
3  {'Take-out': True, 'Wi-Fi': 'no', 'Good For': ...  mQfT3JYu18HN22DVylcE7A   
4  {'Take-out': True, 'Wi-Fi': 'no', 'Good For': ...  mQfT3JYu18HN22DVylcE7A   

                                          categories     city  \
0  [Bakeries, Food, Breakfast & Brunch, Sandwiche...  Phoenix   
1  [Bakeries, Food, Breakfast & Brunch, Sandwiche...  Phoenix   
2  [Bakeries, Food, Breakfast & Brunch, Sandwiche...  Phoenix   
3  [Bakeries, Food, Breakfast & Brunch, Sandwiche...  Phoenix   
4  [Bakeries, Food, Breakfast & Brunch, Sandwiche...  Phoenix   

                                 full_address  \
0  3134 E Indian School Rd\nPhoenix, AZ 85018   
1  3134 E Indian School Rd\nPh