In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# 1. Extracting NYT data

### 1.1 Extracting HTML w/ BeautifulSoup

#### Import NYT text file 

This text file is manually generated by copying XML from: https://www.nytimes.com/reviews/dining, reviews starting from 2012. We then use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to format the data for data extraction.

In [2]:
# import nyt html text file (manually generated from: https://www.nytimes.com/reviews/dining)
text = open('import/nyt.txt', 'r')
text = text.read()

# use BeautifulSoup to format
soup = BeautifulSoup(text,"html.parser")

# Extract restaurant review sections
top_rest = soup.find_all('div', attrs={'class': 'story-meta'})

#### Extract names of reviewed restaurants in NYT.

In [3]:
list_rest = []
for tr in top_rest:
    result = tr.find_all('h2', attrs={'class': 'headline'})
    result_name = result[0].string.split()
    result_name = " ".join(result_name)
    list_rest.append(result_name)

In [4]:
print('Number of reviews: ', len(list_rest))

Number of reviews:  590


#### Extract date/time of review, which is contained in a separate XML section.

In [5]:
# Extract HTML sections containing review time/date
top_rest_time = soup.find_all('footer', attrs={'class':'story-footer'})

# Extract date/time
list_time = []
for tr in top_rest_time:
    
    result = tr.find_all('time', attrs={'class': 'dateline'})
    result = result[0]
    list_time.append(result.string)

In [6]:
df_nyt = pd.DataFrame({'nyt_name':list_rest, 'review_time':list_time})
df_nyt.head()

Unnamed: 0,nyt_name,review_time
0,Davelle,"June 7, 2018"
1,Lahi,"May 31, 2018"
2,Don Angie,"May 29, 2018"
3,Bar Patrón by Rockpool,"May 24, 2018"
4,Rangoon Spoon,"May 24, 2018"


# 2. Yelp API

We use the Python library [YelpAPI](https://github.com/gfairchild/yelpapi). Accessing Yelp's API requires an API key, which is obtained by registering project as an app.

In [636]:
from yelpapi import YelpAPI

api_key = ''
yelp_api = YelpAPI(api_key)

### 2.1 Extract Yelp ID and other basic business info.
Using our generated list of NYT restaurants, conduct a search query to Yelp's API to identify the restaurant's Yelp ID, among other basic info. This ID is used to look up more details in the next step.

In [None]:
import string

y_id = []
y_isclosed = []
y_name = []
y_price = []
y_rating = []
y_reviewcount = []
y_url = []

for rest in list_rest:
    
    # Conduct business search in Yelp API
    
    rest_punct = rest.translate(string.punctuation) # remove punctuation to avoid confounding Yelp search
    response = yelp_api.search_query(term=rest_punct, location='new york city, ny', sort_by='rating', limit=1) # limit=1 for simplicity; we assume the top 1 result is the actual restaurant
    
    if not response['businesses']: # no search result
        y_id.append(np.nan)
        y_isclosed.append(np.nan)
        y_name.append(rest)
        y_price.append(np.nan)
        y_rating.append(np.nan)
        y_reviewcount.append(np.nan)
        y_url.append(np.nan)
        continue
        
    else:
        response = response['businesses'][0]
        # Append to lists, to combine into dataframe of all restaurants' Yelp data
        y_id.append(response['id'])
        y_isclosed.append(response.get('is_closed',np.nan)) 
        y_name.append(response['name'])
        y_price.append(response.get('price',np.nan))
        y_rating.append(response['rating'])
        y_reviewcount.append(response['review_count']) 
        y_url.append(response['url'])

df_yelp = pd.DataFrame({
            'id':y_id,
            'yelp_name':y_name,
            'rating':y_rating,
            'reviewcount':y_reviewcount,
            'isclosed':y_isclosed,
            'y_url':y_url,
            'y_price':y_price})

Convert "$" Yelp price level format to numeric string.

In [638]:
df_yelp['y_price'] = df_yelp['y_price'].fillna('#') # fill NaN's in 'yelp_price' column w/ "#"
price_dict = {"$":"1", "$$":"2", "$$$":"3", "$$$$":"4", "#":"0"} 

for idx, row in df_yelp.iterrows():
    price = row['y_price']
    if price == "$":
        price = 1
    if price == "$$":
        price = 2
    if price == "$$$":
        price = 3
    if price == "$$$$":
        price = 4
    df_yelp.loc[idx, 'y_price'] = price

df_yelp.head()

Unnamed: 0,id,isclosed,rating,reviewcount,y_price,y_url,yelp_name
0,PwrYnl3j3YCGzILX_5UHpw,False,4.0,47.0,2,https://www.yelp.com/biz/davelle-new-york-2?ad...,Davelle
1,kDqW1eeZEDTitXerwzwCzw,False,4.5,4.0,1,https://www.yelp.com/biz/lahi-queens?adjust_cr...,LAHI
2,h37t9rA06Sr4EetJjKrfzw,False,4.5,127.0,3,https://www.yelp.com/biz/don-angie-new-york?ad...,Don Angie
3,,,,,#,,Bar Patrón by Rockpool
4,chdmH_y3YReZgIlNzj8djQ,False,3.5,55.0,2,https://www.yelp.com/biz/rangoon-spoon-brookly...,Rangoon Spoon


#### Combine all info so far into one dataframe

In [647]:
df_nyt_yelp = pd.concat([df_nyt, df_yelp], axis=1)
df_nyt_yelp.head()

Unnamed: 0,nyt_name,review_time,id,isclosed,rating,reviewcount,y_price,y_url,yelp_name
0,Davelle,"June 7, 2018",PwrYnl3j3YCGzILX_5UHpw,False,4.0,47.0,2,https://www.yelp.com/biz/davelle-new-york-2?ad...,Davelle
1,Lahi,"May 31, 2018",kDqW1eeZEDTitXerwzwCzw,False,4.5,4.0,1,https://www.yelp.com/biz/lahi-queens?adjust_cr...,LAHI
2,Don Angie,"May 29, 2018",h37t9rA06Sr4EetJjKrfzw,False,4.5,127.0,3,https://www.yelp.com/biz/don-angie-new-york?ad...,Don Angie
3,Bar Patrón by Rockpool,"May 24, 2018",,,,,#,,Bar Patrón by Rockpool
4,Rangoon Spoon,"May 24, 2018",chdmH_y3YReZgIlNzj8djQ,False,3.5,55.0,2,https://www.yelp.com/biz/rangoon-spoon-brookly...,Rangoon Spoon


### 2.2 Fix errors & fill missing data

#### Export data, manually inspect for errors, import back.
During this process, the dates were manually re-formatted to be in the same format as well. Errors usually due to:
1. NYT and Yelp restaurant spelling was different - thus, using NYT's spelling to conduct Yelp API search yielded inaccurate results.
2. Restaurant closed and did not turn up in Yelp API search (but Yelp listing/URL still exists)

In [289]:
df_nyt_yelp.to_csv('import/df_nyt_yelp.csv')

Edited errors manually in Excel sheet and renamed `df_nyt_yelp2.csv`, to import back into this notebook.

In [648]:
df = pd.read_csv('import/df_nyt_yelp2.csv')

df.loc[35, 'id'] = '-Exv1AEsaKU6Cdb2TO_ZUg'
df.loc[112, 'id'] = '-sUqq_Gty6LTWcOzosjBrQ'
df.loc[116, 'id'] = '-hUvO0C0A-pGZdiSKvjoFw'
df.loc[240, 'id'] = '-ic13XJ9ekpEfshQFPKL4A'
df.loc[309, 'id'] = '-u9yPjmvZilrXYM8JlKG6w'
df.loc[411, 'id'] = '-i0BX3tNEhVPRuhvWxlayQ'
df.loc[428, 'id'] = '-vF3hX7v1R4oAjfNgifZvA'
df.loc[541, 'id'] = '-VvrQWfbkQWeuGC4L0KcfQ'

df_nyt_yelp['id'] = df['id']
df_nyt_yelp['isclosed'] = df['isclosed']
df_nyt_yelp['rating'] = df['rating']
df_nyt_yelp['reviewcount'] = df['reviewcount']

#### Delete the establishments without Yelp data. 

This missing info is usually due to:
- Restaurant closed before listing was active on Yelp.
- Restaurant was not located in New York area (other countries, across the US, etc).
- Restaurant was an upstairs/subsection bar/restaurant of another establishment, which has since been defunct.

In [649]:
df_nyt_yelp = df_nyt_yelp.drop(df[df['delete']=='y']['index'])

Rename columns so that origin of data (NYT or Yelp) is clear.

In [650]:
df_nyt_yelp = df_nyt_yelp.rename({'review_time': 'nyt_review_time',
                    'id':'yelp_id',
                    'rating':'yelp_rating',
                    'y_url':'yelp_url',
                    'y_price':'yelp_price',}, axis='columns')
#df_nyt_yelp = df_nyt_yelp.drop('Unnamed: 0', axis=1)
df_nyt_yelp.head()

Unnamed: 0,nyt_name,nyt_review_time,yelp_id,isclosed,yelp_rating,reviewcount,yelp_price,yelp_url,yelp_name
0,Davelle,"June 7, 2018",PwrYnl3j3YCGzILX_5UHpw,False,4.0,47.0,2,https://www.yelp.com/biz/davelle-new-york-2?ad...,Davelle
1,Lahi,"May 31, 2018",kDqW1eeZEDTitXerwzwCzw,False,4.5,4.0,1,https://www.yelp.com/biz/lahi-queens?adjust_cr...,LAHI
2,Don Angie,"May 29, 2018",h37t9rA06Sr4EetJjKrfzw,False,4.5,126.0,3,https://www.yelp.com/biz/don-angie-new-york?ad...,Don Angie
4,Rangoon Spoon,"May 24, 2018",chdmH_y3YReZgIlNzj8djQ,False,3.5,55.0,2,https://www.yelp.com/biz/rangoon-spoon-brookly...,Rangoon Spoon
5,Wokuni,"May 22, 2018",D6ZEcG1FCZ18nekMWrhgMg,False,4.0,60.0,3,https://www.yelp.com/biz/wokuni-new-york?adjus...,Wokuni


#### Append NYT review "stars" (1-5) and "Critic's Pick" (y/n). 

This is done by importing the manually generated `df_nyt_stars.csv`, which was assembled by reviewing info on NYT website.

In [651]:
df_nyt_stars = pd.read_csv('import/df_nyt_stars.csv')
df_nyt_stars.head()

Unnamed: 0.1,Unnamed: 0,nyt_name,critics_pick,nyt_stars
0,0,Davelle,y,
1,1,Lahi,,
2,2,Don Angie,y,2.0
3,4,Rangoon Spoon,,
4,5,Wokuni,,1.0


In [652]:
df_nyt_yelp['critics_pick'] = df_nyt_stars['critics_pick']
df_nyt_yelp['nyt_stars'] = df_nyt_stars['nyt_stars']

df_nyt_yelp['critics_pick'] = df_nyt_yelp['critics_pick'].fillna('n') # fill NaN's under "Critic's Pick" column w/ "n" (no)

df_nyt_yelp.head()

Unnamed: 0,nyt_name,nyt_review_time,yelp_id,isclosed,yelp_rating,reviewcount,yelp_price,yelp_url,yelp_name,critics_pick,nyt_stars
0,Davelle,"June 7, 2018",PwrYnl3j3YCGzILX_5UHpw,False,4.0,47.0,2,https://www.yelp.com/biz/davelle-new-york-2?ad...,Davelle,y,
1,Lahi,"May 31, 2018",kDqW1eeZEDTitXerwzwCzw,False,4.5,4.0,1,https://www.yelp.com/biz/lahi-queens?adjust_cr...,LAHI,n,
2,Don Angie,"May 29, 2018",h37t9rA06Sr4EetJjKrfzw,False,4.5,126.0,3,https://www.yelp.com/biz/don-angie-new-york?ad...,Don Angie,y,2.0
4,Rangoon Spoon,"May 24, 2018",chdmH_y3YReZgIlNzj8djQ,False,3.5,55.0,2,https://www.yelp.com/biz/rangoon-spoon-brookly...,Rangoon Spoon,n,1.0
5,Wokuni,"May 22, 2018",D6ZEcG1FCZ18nekMWrhgMg,False,4.0,60.0,3,https://www.yelp.com/biz/wokuni-new-york?adjus...,Wokuni,y,


#### Fill missing URL's

Some restaurants did not receive a URL to fill their row in the `yelp_url` column. This was either because:
1. Their Yelp API response had an anomalous data layout.
2. Their info (except for URL) was manually generated in step 2.2 and URL was left out.

In [653]:
missing_url = df_nyt_yelp[df_nyt_yelp['yelp_url'].isnull()].index.values
print('Number of restaurants missing URLs: ', len(missing_url))

Number of restaurants missing URLs:  9


Since there are only 8 restaurants missing URL's, we proceed with manually correcting the mistakes (saves time, ensures accuracy).

In [654]:
df_nyt_yelp.loc[13, 'yelp_url'] = 'https://www.yelp.com/biz/ludas-dumplings-brooklyn'
df_nyt_yelp.loc[72, 'yelp_url'] = 'https://www.yelp.com/biz/cervos-new-york'
df_nyt_yelp.loc[227, 'yelp_url'] = 'https://www.yelp.com/biz/pravue-cafe-and-albanian-grill-ridgewood'
df_nyt_yelp.loc[280, 'yelp_url'] = 'https://www.yelp.com/biz/hanyang-boonshik-flushing-2'
df_nyt_yelp.loc[404, 'yelp_url'] = 'https://www.yelp.com/biz/gastronomia-culinaria-new-york'
df_nyt_yelp.loc[502, 'yelp_url'] = 'https://www.yelp.com/biz/fishermans-dawta-brooklyn'
df_nyt_yelp.loc[523, 'yelp_url'] = 'https://www.yelp.com/biz/miss-lilys-new-york'
df_nyt_yelp.loc[528, 'yelp_url'] = 'https://www.yelp.com/biz/alison-eighteen-new-york'
df_nyt_yelp.loc[559, 'yelp_url'] = 'https://www.yelp.com/biz/alfama-restaurant-new-york'

# Check that missing URLs are now filled
df_nyt_yelp.loc[missing_url]

# Reset index to account for removed restaurants
df_nyt_yelp = df_nyt_yelp.reset_index()
df_nyt_yelp = df_nyt_yelp.drop('index', axis=1)

#### Fix Yelp URLs

Trim the URLs in `yelp_url` column (i.e. URL to restaurant's Yelp page, provided by Yelp API). Our Yelp webscraper in the next section requires the basic URL that users navigate to when using a desktop computer, not Yelp API's URL - it works for ultimately arriving at business page, but also contains a string of identifiers about how the URL was obtained, etc.

In [655]:
yelp_url_replace = []
for idx, restaurant in df_nyt_yelp.iterrows():
    s = restaurant['yelp_url']
    s = s.split('?')[0] # basic URL is string up to ('?') in Yelp API's URL output
    yelp_url_replace.append(s)

df_nyt_yelp['yelp_url'] = yelp_url_replace

# 3. Scrape Yelp reviews

Yelp restricts review access on its API and only provides snippet of each review's text. We therefore rely on web scraping to extract review text for each restaurant in our database.

In [656]:
df_nyt_yelp_backup = pd.DataFrame.copy(df_nyt_yelp)
df_nyt_yelp['yelp_reviews'] = np.nan
df_nyt_yelp.head()

Unnamed: 0,nyt_name,nyt_review_time,yelp_id,isclosed,yelp_rating,reviewcount,yelp_price,yelp_url,yelp_name,critics_pick,nyt_stars,yelp_reviews
0,Davelle,"June 7, 2018",PwrYnl3j3YCGzILX_5UHpw,False,4.0,47.0,2,https://www.yelp.com/biz/davelle-new-york-2,Davelle,y,,
1,Lahi,"May 31, 2018",kDqW1eeZEDTitXerwzwCzw,False,4.5,4.0,1,https://www.yelp.com/biz/lahi-queens,LAHI,n,,
2,Don Angie,"May 29, 2018",h37t9rA06Sr4EetJjKrfzw,False,4.5,126.0,3,https://www.yelp.com/biz/don-angie-new-york,Don Angie,y,2.0,
3,Rangoon Spoon,"May 24, 2018",chdmH_y3YReZgIlNzj8djQ,False,3.5,55.0,2,https://www.yelp.com/biz/rangoon-spoon-brooklyn,Rangoon Spoon,n,1.0,
4,Wokuni,"May 22, 2018",D6ZEcG1FCZ18nekMWrhgMg,False,4.0,60.0,3,https://www.yelp.com/biz/wokuni-new-york,Wokuni,y,,


In [None]:
from urllib.request import urlopen as url
import io
import re
import time

# Create list to store all Yelp review data. 
# Each dict corresponds to a restaurant, where values = each review's info and text for that restaurant.
rest_dict_list = []
counter = 0

for idx, restaurant in df_nyt_yelp.iloc[80:100].iterrows():
        
    user_count_total = []
    friend_count_total = []
    #photo_count_total = []
    review_count_total = []
    elite_count_total = []
    funny_count_total = []
    cool_count_total = []
    useful_count_total = []
    length_count_total = []
    #checkin_count_total = []
    reviews_total = []
    rating_total = []
    
    num_pages = int(np.ceil(restaurant['reviewcount'] / 20)) # number of pages of Yelp reviews for this restaurant (to navigate through to scrape all reviews)

    for _ in range(num_pages): 
        
        counter = counter + 1
        if counter%2 == 0:
            time.sleep(300) # pause for 5 min after every 2 queries
            
        my_url= restaurant['yelp_url']+'?start='+str(_*20)
        request=url(my_url)
        htmlscrap=request.read()
        request.close()
        page_soup=BeautifulSoup(htmlscrap,"html.parser")
        container=page_soup.findAll("div",{"class":"review review--with-sidebar"}) # the class name where all the features are contained
        links = page_soup.find_all('div', {'class':'review-list'})
        
        # Extract reviews 
        
        for i in container:
            
            # Just extract review info (all info except the text of the review itself)
            
            friend_counter=i.findAll("li",{"class":"friend-count responsive-small-display-inline-block"})
            friend_count=friend_counter[0].b.text
            review_counter=i.findAll("li",{"class":"review-count responsive-small-display-inline-block"})
            review_count=review_counter[0].b.text
            photo_counter=i.findAll("li",{"class":"photo-count responsive-small-display-inline-block"})

            if photo_counter:
                photo_count=photo_counter[0].b.text
            else:
                photo_count=0
            elite_counter=i.findAll("li",{"class":"is-elite responsive-small-display-inline-block"})
            if elite_counter:
                elite_count=1
            else:
                elite_count=0
            funny_counter=i.findAll("a",{"class":"ybtn ybtn--small funny js-analytics-click"})
            funny_count1=funny_counter[0].findAll("span",{"class":"count"})
            funny_count=funny_count1[0].text
            if funny_count:
                funny_count=funny_count
            else:
                funny_count=0
            cool_counter=i.findAll("a",{"class":"ybtn ybtn--small cool js-analytics-click"})
            cool_count1=cool_counter[0].findAll("span",{"class":"count"})
            cool_count=cool_count1[0].text
            if cool_count:
                cool_count=cool_count
            else:
                cool_count=0
            useful_counter=i.findAll("a",{"class":"ybtn ybtn--small useful js-analytics-click"})
            useful_count1=useful_counter[0].findAll("span",{"class":"count"})
            useful_count=useful_count1[0].text
            if useful_count:
                useful_count=useful_count
            else:
                useful_count=0
            user_counter=i.findAll("a",{"class":"user-display-name js-analytics-click"})
            user_count=user_counter[0].text
            rating_counter=i.findAll("div",{"class":"biz-rating biz-rating-large clearfix"})
            rating_count=rating_counter[0].div.div["title"]
            rating_count=(int(rating_count[0]))

            length_counter=i.findAll("p",{"lang":"en"})
            xx=str(length_counter[0])
            length_count=len(xx)

            #checkin_counter=i.findAll("li",{"class":"review-tags_item"})
            #if checkin_counter:
            #    var1=checkin_counter[0].text.strip()
            #    if var1[0] in ['F','R']: # checks for 'First to Review' and 'ROTD (Review of the Day)' labels, which are placed in same area as checkins
            #        checkin_count=0
            #    else:
            #        checkin_count=(int(var1[0]))      
            #else:
            #    checkin_count=0
            
            # extract rating
            rating = i.findAll("div",{"class":'i-stars i-stars--regular-1 rating-large'})
            if not rating:
                rating = i.findAll("div",{"class":'i-stars i-stars--regular-2 rating-large'})
                if not rating:
                    rating = i.findAll("div",{"class":'i-stars i-stars--regular-3 rating-large'})
                    if not rating:
                        rating = i.findAll("div",{"class":'i-stars i-stars--regular-4 rating-large'})
                        if not rating:
                            rating = i.findAll("div",{"class":'i-stars i-stars--regular-5 rating-large'})
                        else:
                            rating = np.nan
                
            user_count_total.append(user_count)
            friend_count_total.append(friend_count)
            #photo_count_total.append(photo_count)
            review_count_total.append(review_count)
            elite_count_total.append(elite_count)
            funny_count_total.append(funny_count)
            cool_count_total.append(cool_count)
            useful_count_total.append(useful_count)
            length_count_total.append(length_count)
            rating_total.append(rating)
            # checkin_count_total.append(checkin_count)
            
        # Extract review text

        rest_list = links[0].find_all('div', {'class':'review-content'})

        for rest in rest_list:
            # text = rest[0].find_all('div', {'class':'review-content'})
            # text = text[0].find_all('p', {'lang':'en'})
            text = rest.find_all('p', {'lang':'en'})[0]

            text_revised = []
            for tag in text:
                if str(type(tag)) == "<class 'bs4.element.NavigableString'>":
                    text_revised.append(tag)
                else:
                    tag = ". "
                    text_revised.append(tag)

            text_revised = ''.join(text_revised)
            text_revised = text_revised.replace(u'\xa0', u' ')

            reviews_total.append(text_revised)
            
        # Save extracted data, to combine into dataframe
        
        rest_dict = {}
        rest_dict = {'user_count':user_count_total,
                     'friend_count':friend_count_total,
                     'review_count':review_count_total,
                     'elite_count':elite_count_total,
                     'funny_count':funny_count_total,
                     'cool_count':cool_count_total,
                     'useful_count':useful_count_total,
                     'length_count':length_count_total,
                     'review_text':reviews_total,
                     'rating':rating_total}
        
    rest_dict_list.append(rest_dict)

# 4. Yelp reviews NLP

### 4.1 Working with the first 80 restaurants

Still in the process of automating Yelp webscraping - prevented from scraping at regular intervals. The Yelp webscraping data we have so far is stored in Pickle files. We run into a `maximum recursion depth exceeded` error if saving ~>40 restaurants' worth of data into Pickle file, so info is stored in separate Pickle files in the format: `yelp_reviews_` + `(max index of restaurant)`.

#### Load Pickle files

Load the Pickle files of what we have so far of Yelp review text/data. Combine with `df_nyt_yelp` dataframe, up to the restaurants we have Yelp review data for. Save new dataframe as `df_master`.

In [623]:
rest_dict_list1 = pd.read_pickle('yelp_reviews_5.pkl')
rest_dict_list2 = pd.read_pickle('yelp_reviews_47.pkl')
rest_dict_list3 = pd.read_pickle('yelp_reviews_73.pkl')
rest_dict_list4 = pd.read_pickle('yelp_reviews_79.pkl')
rest_dict_list = pd.concat([rest_dict_list1, rest_dict_list2, rest_dict_list3, rest_dict_list4])

In [624]:
df_master = pd.DataFrame.copy(df_nyt_yelp)
df_master = df_master.drop('yelp_reviews', axis=1)
df_master = df_master.iloc[:80]
df_master['yelp_reviews'] = rest_dict_list['yelp_reviews'].values
df_master.head()

Unnamed: 0,nyt_name,nyt_review_time,yelp_id,isclosed,yelp_rating,reviewcount,yelp_price,yelp_url,yelp_name,critics_pick,nyt_stars,yelp_reviews
0,Davelle,"June 7, 2018",PwrYnl3j3YCGzILX_5UHpw,False,4.0,47.0,2,https://www.yelp.com/biz/davelle-new-york-2,Davelle,y,,"{'user_count': ['Jennie C.', 'Yvonne C.', 'Hel..."
1,Lahi,"May 31, 2018",kDqW1eeZEDTitXerwzwCzw,False,4.5,4.0,1,https://www.yelp.com/biz/lahi-queens,LAHI,n,,"{'user_count': ['Achi A.', 'Amy L.', 'Marissa ..."
2,Don Angie,"May 29, 2018",h37t9rA06Sr4EetJjKrfzw,False,4.5,126.0,3,https://www.yelp.com/biz/don-angie-new-york,Don Angie,y,2.0,"{'user_count': ['Joanna C.', 'Jen S.', 'Emi B...."
3,Rangoon Spoon,"May 24, 2018",chdmH_y3YReZgIlNzj8djQ,False,3.5,55.0,2,https://www.yelp.com/biz/rangoon-spoon-brooklyn,Rangoon Spoon,n,1.0,"{'user_count': ['Tiffany L.', 'Marissa L.', 'J..."
4,Wokuni,"May 22, 2018",D6ZEcG1FCZ18nekMWrhgMg,False,4.0,60.0,3,https://www.yelp.com/biz/wokuni-new-york,Wokuni,y,,"{'user_count': ['Jin C.', 'Enrique G.', 'Valer..."


### 4.2 Generate an NLP corpus

In this section, we tokenize and pre-process our documents for inputting into NLP algorithms. Our final output is a gensim corpus, which consists of a list of documents:

- Each document <i>represents</i> a tokenized, pre-processed Yelp review's text 
- Each document <i>structure</i> is a numerically-mapped token and its frequency in the document 


#### Generate pre-processed, tokenized list of documents

Generate a pre-processed, tokenized list of documents in preparation for using gensim to create a corpus. Here, a <u>document</u> = a review's text. Each document is converted to a list of pre-processed tokens (not unique - will list all instances of tokens).

Pre-processing steps: 
- Lowercase
- Remove non-alphabetic characters/punctuation
- Remove stop words
- Lemmatize

In [599]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

corpus_tokenized = [] # list of all documents (review text), with contents in tokenized form
reviewcount = []
idx_tracker = []

for idx, restaurant in df_master.iterrows():
    
    review_list = restaurant['yelp_reviews']['review_text']
    reviewcount.append(len(review_list))
    
    # lowercase document (where document = review text)
    review_list = [t.lower() for t in review_list]
    
    # tokenize each document
    doc_list = [word_tokenize(r) for r in review_list]
    
    # retain only alphabetic words
    alpha_only = [[t for t in d if t.isalpha()] for d in doc_list]

    # remove stopwords
    stop_words = set(stopwords.words('english')) # generate stopwords
    no_stops = [[t for t in d if t not in stop_words] for d in alpha_only]

    # lemmatize tokens
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [[wordnet_lemmatizer.lemmatize(t) for t in d] for d in no_stops]
    
    [corpus_tokenized.append(d) for d in lemmatized]
    [idx_tracker.append(idx) for d in lemmatized]

#### Create corpus w/ gensim

We use [gensim](https://radimrehurek.com/gensim/) to create a corpus, where each token is mapped to a unique numerical ID and word count (i.e. bag of words, BoW) in order to set up structure for inputting to NLP algorithms.

In [601]:
from gensim.corpora.dictionary import Dictionary

# create dictionary from list of pre-processed tokens (all instances) across all documents ('lemmatized')
dictionary = Dictionary(corpus_tokenized)

# generate corpus
corpus = [dictionary.doc2bow(document) for document in corpus_tokenized] # .doc2bow method converts documents into BoW format

# combine BoW corpus with df_corpus (dataframe of info for each document, i.e. review, in corpus)
df_corpus['bow'] = corpus
#df_corpus.head(1)

Visualize a sample review under our different processing steps leading up to gensim corpus.

In [408]:
print('Review (after pre-processing): ', review_list[-1], '\n')
print('Review (after document tokenization, removing stopwords, lemmatization): ', corpus_tokenized[-1], '\n')
print('Review (after gensim corpus): ', corpus[-1])

Review (after pre-processing):  we went this past weekend on saturday and was really surprised at how well organized it all seemed. we got there around 6, easily found parking and didn't have to stand in a line for more than 10 minutes at each food stall. i think it's great than queens was the first borough to get a night market going, would definitely recommend stopping by when they are back in july. 

Review (after document tokenization, removing stopwords, lemmatization):  ['went', 'past', 'weekend', 'saturday', 'really', 'surprised', 'well', 'organized', 'seemed', 'got', 'around', 'easily', 'found', 'parking', 'stand', 'line', 'minute', 'food', 'stall', 'think', 'great', 'queen', 'first', 'borough', 'get', 'night', 'market', 'going', 'would', 'definitely', 'recommend', 'stopping', 'back', 'july'] 

Review (after gensim corpus):  [(40, 1), (44, 1), (123, 1), (131, 1), (176, 1), (178, 1), (213, 1), (243, 1), (312, 1), (313, 1), (320, 1), (343, 1), (365, 1), (386, 1), (424, 1), (465, 

#### Create dataframe of corpus, which tracks the restaurant that the review belongs to

In [602]:
df_corpus = pd.DataFrame({'restaurant_idx':idx_tracker, 'corpus':corpus})
df_corpus.head()

Unnamed: 0,corpus,restaurant_idx
0,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0
1,"[(17, 1), (26, 1), (34, 1), (42, 1), (72, 1), ...",0
2,"[(14, 1), (29, 1), (30, 1), (42, 1), (44, 1), ...",0
3,"[(23, 1), (26, 1), (42, 1), (60, 2), (61, 1), ...",0
4,"[(4, 1), (13, 1), (29, 1), (42, 2), (50, 2), (...",0


### 4.3 Perform some EDA

#### Find most frequent words in best-rated and worst-rated Yelp restaurants

- "Best-rating" Yelp reviews have ratings >= 4.5/5 
- "Worst-rating" Yelp reviews have ratings <= 3

In [590]:
# Best-rating reviews

idx_best = df_master[df_master['yelp_rating'] >= 4.5].index
idx_best_doc = [t for t,j in df_corpus['restaurant_idx'].iteritems() if j in idx_best] # index of docs belonging to those restaurants

subcorpus_best = []
subcorpus_best = [(subcorpus_best + doc) for idx, doc in df_corpus.iloc[idx_best_doc]['corpus'].iteritems()]

# Worst-rating reviews

idx_worst = df_master[df_master['yelp_rating'] <= 3].index
idx_worst_doc = [t for t,j in df_corpus['restaurant_idx'].iteritems() if j in idx_worst] # index of docs belonging to those restaurants

subcorpus_worst = []
subcorpus_worst = [(subcorpus_worst + doc) for idx, doc in df_corpus.iloc[idx_worst_doc]['corpus'].iteritems()]

Print top 10 words for "best-rating" and "worst-rating" Yelp reviews.

In [593]:
import collections
import itertools

# Best-rating reviews

total_word_count_best = collections.defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(subcorpus_best):
    total_word_count_best[word_id] += word_count

sorted_word_count_best = sorted(total_word_count_best.items(), key=lambda w: w[1], reverse=True) 

print('Top 10 words for BEST-rating Yelp reviews:','\n')
for word_id, word_count in sorted_word_count_best[:10]:
    print(dictionary.get(word_id), word_count)
print('\n')

# Worst-rating reviews

total_word_count_worst = collections.defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(subcorpus_worst):
    total_word_count_worst[word_id] += word_count

sorted_word_count_worst = sorted(total_word_count_worst.items(), key=lambda w: w[1], reverse=True) 

print('Top 10 words for WORST-rating Yelp reviews:','\n')
for word_id, word_count in sorted_word_count_worst[:10]:
    print(dictionary.get(word_id), word_count)

Top 10 words for BEST-rating Yelp reviews: 

food 1617
place 1118
good 952
great 781
dish 772
like 720
restaurant 684
also 636
one 598
really 594


Top 10 words for WORST-rating Yelp reviews: 

duck 855
food 475
dish 360
restaurant 350
good 323
dadong 318
service 289
like 259
one 248
would 229


### 4.4 tf-idf

In [619]:
from gensim.models.tfidfmodel import TfidfModel

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

# Calculate the tfidf weights of doc: tfidf_weights
doc = corpus[0]
tfidf_weights = tfidf[doc]

# Print the first five weights
print('tfidf weights: ', '\n', tfidf_weights[:5], '\n')

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 5 weighted words
print('Top 5 weighted words:')
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary[term_id], weight)

print('\n')
print('Text: ','\n', ' '.join(corpus_tokenized[0]))

tfidf weights:  
 [(0, 0.1055642607631973), (1, 0.05957501646551434), (2, 0.05994039811573356), (3, 0.020509934091441764), (4, 0.028228376627908995)] 

Top 5 weighted words:
oden 0.38078435163064966
dashi 0.30155229186255794
uh 0.2365325200743029
spaghetti 0.20231486911864288
mentaiko 0.18664453616596044


Text:  
 davelle uh oden uh foodie trippin get order right uh shawty look good eatin oden oden dish drink dashi davelle oden moonlight xxxtentacion rip everything amazing u dining tiny cozy cramped beautiful little spot got oden set karaage cod spaghetti hokkaido spaghetti uni tomato cold dish topped kinda optional light cheese drink dashi aaaalllll dish good soft blanched skinless savory daikon served spicy yuzu paste use sparingly pretty big kick red miso paste soft mushy perfectly cooked heart shaped daikon mochi lightly fried bag soft gooey delicious mochi def drink dashi scallion enoki mushroom ginger hanpen white fish cake soft texture airy typical fish cake denseness fishcake 

### Summary

Much more work remaining. So far, we've set up a pipeline for converting documents (Yelp reviews) into token-wordcount mappings. As seen from the top wordcounts of "best-rating" and "worst_rating" Yelp reviews, there are many confounding terms that probably won't serve as good predictors for "good" or "bad" restaurants. We'll need more work - for instance, introducing word embeddings, sentiment analysis, etc.

Next task is to try using tf-idf weights to:
- Perform linear regression in order to predict Yelp and NYT rating.
- Obtain timestamps of Yelp reviews to do time-series analysis of rating changes (esp during major events, such as publication of a NYT critics review of restaurant)