In [1]:
import warnings
warnings.filterwarnings('ignore')
from pymongo import MongoClient
import pandas as pd
import numpy as np
# Requests sends and receives HTTP requests.
import requests
# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

# Load MongoDB

In [2]:
# Connect to the MongoDB server on this machine (default port) and select the
# collection whose documents are read via their 'html' field in later cells.
client = MongoClient(host='localhost', port=27017)
db = client['tripadvisor_hon_eats_reviews']
pages = db['pages']

In [3]:
# pages.count_documents({})
# ->15,903 documents

# Create & Test Function for Saving Data from MongoDB

In [4]:
def _find_all(container, name, attrs):
    """Return all descendants of ``container`` matching ``name``/``attrs``.

    Returns an empty list when ``container`` is ``None`` so that a page
    section that is absent yields "no matches" instead of an AttributeError.
    """
    if container is None:
        return []
    return container.find_all(name, attrs)


def _bubble_rating(tag):
    """Convert a ``ui_bubble_rating`` tag into a numeric rating.

    The score is read from a ``bubble_NN`` CSS class and divided by ten
    (e.g. ``bubble_45`` -> 4.5). Reading the class list avoids depending on
    the exact character offsets of the tag's serialized HTML.

    Returns ``np.nan`` when no ``bubble_NN`` class is present.
    """
    classes = tag.get("class") or []
    if isinstance(classes, str):
        # Some parsers expose ``class`` as one space-separated string.
        classes = classes.split()
    prefix = "bubble_"
    for css_class in classes:
        suffix = css_class[len(prefix):]
        if css_class.startswith(prefix) and suffix.isdigit():
            return int(suffix) / 10
    return np.nan


def get_curr_page_info(soup):
    """Extract restaurant details and reviews from one parsed listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single restaurant page.

    Returns
    -------
    list
        Twenty values, in this order: restaurant_name, description, url,
        top_details, about, tags, more_details, overall_rating, food_rating,
        service_rating, value_rating, atmosphere_rating, num_reviews, ranking,
        address, location, image_url, user_names, bubble_ratings,
        review_contents.

        The ``' | '``-joined text fields (top_details, tags, more_details) are
        ``''`` when nothing is found; image_url is ``''`` when no image is
        found; other missing scalar fields are ``np.nan``. The last three
        items are lists with one entry per matching element on the page.

    Raises
    ------
    AttributeError, TypeError, KeyError
        If the name heading, the description ``<meta>`` tag or the English
        alternate ``<link>`` is missing. These three are treated as required.
    """
    # --- required page-level fields -------------------------------------
    restaurant_name = soup.find("h1", {"class": "header heading masthead masthead_h1"}).get_text()
    description = soup.find("meta", {"name": "description"})['content']
    url = soup.find("link", {"rel": "alternate", "hreflang": "en"})['href']

    # --- optional page sections (each may be None on sparse listings) ---
    listing_details = soup.find("div", {"id": "taplc_detail_overview_cards_0"})
    main_details = soup.find("div", {"id": "taplc_details_card_0"})
    top_info = soup.find("div", {"id": "taplc_top_info_0"})
    reviews = soup.find("div", {"class": "listContainer"})

    # top_details
    top_details = ' | '.join(
        link.get_text()
        for link in _find_all(top_info, "a", {"class": "restaurants-detail-top-info-TopInfo__tagLink--2LkIo"})
    )

    # about
    try:
        about = main_details.find("div", {"class": "restaurants-details-card-DesktopView__desktopAboutText--1VvQH"}).get_text()
    except AttributeError:
        about = np.nan

    # tags
    tags = ' | '.join(
        tag.get_text()
        for tag in _find_all(main_details, "div", {"class": "restaurants-details-card-TagCategories__tagText--Yt3iG"})
    )

    # more_details (lives in the same overview-cards container as the ratings)
    more_details = ' | '.join(
        detail.get_text()
        for detail in _find_all(listing_details, "div", {"class": "restaurants-detail-overview-cards-DetailsSectionOverviewCard__tagText--1OH6h"})
    )

    # overall_rating: only the first three characters of the text are parsed
    try:
        overall_rating = listing_details.find("span", {"class": "restaurants-detail-overview-cards-RatingsOverviewCard__overallRating--nohTl"})
        overall_rating = float(overall_rating.get_text()[:3])
    except (AttributeError, ValueError):
        overall_rating = np.nan

    # more overall ratings (food, service, value, atmosphere)
    rating_labels = [
        label.get_text()
        for label in _find_all(listing_details, "span", {"class": "restaurants-detail-overview-cards-RatingsOverviewCard__ratingText--1P1Lq"})
    ]
    # The first bubble in the overview card is skipped; the rest are paired
    # positionally with the labels above.
    rating_values = [
        _bubble_rating(bubble)
        for bubble in _find_all(listing_details, "span", {"class": "ui_bubble_rating"})[1:]
    ]
    # zip() stops at the shorter list, so a label without a bubble (or vice
    # versa) no longer raises IndexError.
    sub_ratings = dict(zip(rating_labels, rating_values))

    food_rating = sub_ratings.get('Food', np.nan)
    service_rating = sub_ratings.get('Service', np.nan)
    value_rating = sub_ratings.get('Value', np.nan)
    atmosphere_rating = sub_ratings.get('Atmosphere', np.nan)

    # num_reviews: first whitespace-separated token, thousands separators removed
    try:
        num_reviews = listing_details.find("a", {"class": "restaurants-detail-overview-cards-RatingsOverviewCard__ratingCount--DFxkG"})
        num_reviews = int(num_reviews.get_text().replace(',', '').split(' ')[0])
    except (AttributeError, ValueError):
        num_reviews = np.nan

    # ranking: the last ranking element on the card is used
    ranking_cards = _find_all(listing_details, "div", {"class": "restaurants-detail-overview-cards-RatingsOverviewCard__ranking--17CmN"})
    ranking = ranking_cards[-1].get_text() if ranking_cards else np.nan

    # location info
    location_info = _find_all(listing_details, "span", {"class": "restaurants-detail-overview-cards-LocationOverviewCard__detailLinkText--co3ei"})

    # address
    address = location_info[0].get_text() if location_info else np.nan

    # location: text of the last <div> inside the second location entry
    try:
        location = location_info[1].find_all("div")[-1].get_text()
    except IndexError:
        location = np.nan

    # image_url: the container, the <img>, or its lazy-load attribute may each
    # be absent (AttributeError, TypeError and KeyError respectively).
    try:
        image_urls = soup.find("div", {"class": "mosaic_photos"})
        image_url = image_urls.find("img", {"class": "basicImg"})['data-lazyurl']
    except (AttributeError, TypeError, KeyError):
        image_url = ''

    # user_names
    user_names = [
        name.get_text()
        for name in soup.find_all("div", {"class": "info_text pointer_cursor"})
    ]

    # bubble_ratings (one per review bubble inside the review list)
    bubble_ratings = [
        _bubble_rating(bubble)
        for bubble in _find_all(reviews, "span", {"class": "ui_bubble_rating"})
    ]

    # review_contents, with the truncation marker removed
    review_contents = [
        entry.get_text().replace('...More', '')
        for entry in _find_all(reviews, "p", {"class": "partial_entry"})
    ]

    return [restaurant_name, description, url, top_details, about, tags, more_details,
            overall_rating, food_rating, service_rating, value_rating, atmosphere_rating,
            num_reviews, ranking, address, location, image_url, user_names, bubble_ratings, review_contents]

In [5]:
# Parse the HTML of the first stored document so the extractor can be tried on it.
soup = BeautifulSoup(markup=pages.find_one()['html'], features="html.parser")

In [6]:
# Reuse the soup parsed in the previous cell rather than querying MongoDB and
# re-parsing the same document a second time.
get_curr_page_info(soup)

['Hanks Cafe Honolulu',
 'Hanks Cafe Honolulu, Honolulu: See 5 unbiased reviews of Hanks Cafe Honolulu, rated 5 of 5 on Tripadvisor and ranked #1,269 of 2,220 restaurants in Honolulu.',
 'https://www.tripadvisor.com/Restaurant_Review-g60982-d4685440-Reviews-Hanks_Cafe_Honolulu-Honolulu_Oahu_Hawaii.html',
 '',
 nan,
 '',
 '',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 '',
 ['waynen95', 'Maria H', 'RowerChicago', 'Kim J', 'cg148777'],
 [4.0, 5.0, 5.0, 5.0, 5.0],
 ['a small pub in Chinatown with great service and great live music upstairs. Service was friendly and efficient',
  'Friendly and a good selection of beers and. Liquors. Nice crowd.crowdSay sports bar.Say mahalo to Dave at the bar.',
  'Very casual fun spot for all kinds of hot dogs and burgers. Order at counter, short wait as food prepared to order. Great outside seating in up and changing Honolulu hood. Great fare for fair price.',
  "If you are looking for a fun night out with live entertainment you must go to Han

In [7]:
# Run the extractor over the first 50 stored pages as a smoke test.
info = []
for page in pages.find({}).limit(50):
    soup = BeautifulSoup(page['html'], features="html.parser")
    curr_page_info = get_curr_page_info(soup)
    info.append(curr_page_info)

# Build the frame directly from the list of rows. Going through np.array() is
# unsafe here: the last three fields of each row are variable-length lists, so
# the array is ragged. NumPy >= 1.24 raises ValueError for that, and older
# versions coerce every column to object dtype.
info_df = pd.DataFrame(info,
                   columns=['restaurant_name', 'description', 'url', 'top_details', 'about', 'tags',
                            'more_details', 'overall_rating', 'food_rating', 'service_rating',
                            'value_rating', 'atmosphere_rating', 'num_reviews', 'ranking',
                            'address', 'location', 'image_url', 'user_names',
                            'bubble_ratings', 'review_contents'])

In [8]:
# Preview the first five parsed pages.
info_df.head(n=5)

Unnamed: 0,restaurant_name,description,url,top_details,about,tags,more_details,overall_rating,food_rating,service_rating,value_rating,atmosphere_rating,num_reviews,ranking,address,location,image_url,user_names,bubble_ratings,review_contents
0,Hanks Cafe Honolulu,"Hanks Cafe Honolulu, Honolulu: See 5 unbiased ...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,"[waynen95, Maria H, RowerChicago, Kim J, cg148...","[4.0, 5.0, 5.0, 5.0, 5.0]",[a small pub in Chinatown with great service a...
1,"Bethel Union, Honolulu","Bethel Union, Honolulu: See 9 unbiased reviews...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"[28franniel, LW811055, cindyq2, grantkidani, v...","[4.0, 4.0, 5.0, 4.0, 2.0, 5.0, 5.0]",[Our son took us to this new and hip spot to t...
2,"Cafe Anasia, Honolulu","Cafe Anasia, Honolulu: See 5 unbiased reviews ...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"[Amy M, EmmJoans, Craig O, Tim K]","[5.0, 5.0, 5.0, 4.0]",[This is an unexpected find. Great bar atmosph...
3,"Starbucks, Honolulu","Starbucks, Honolulu: See 8 unbiased reviews of...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"[vrite, jujuju123, tryblueocean, bombguy64, ga...","[4.0, 5.0, 5.0, 5.0, 4.0]",[Super busy Starbucks location but efficient a...
4,"La Vela Wine & Spa, Honolulu","La Vela Wine & Spa, Honolulu: See 2 unbiased r...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"[bobika, Dianne W]","[5.0, 5.0]","[Great ambiance, wine and delicious salmon on ..."


In [9]:
# Expand each column's values into one row per element, keyed by URL: every
# cell is turned into a Series and stacked, then the positional index level is
# dropped and 'url' is restored as a regular column. In the rendered output,
# list columns yield one row per review and scalar columns are filled only on
# the first row for each URL.
(
    info_df
    .set_index('url')
    .apply(lambda column: column.apply(pd.Series).stack())
    .reset_index(level=1, drop=True)
    .reset_index()
)

Unnamed: 0,url,restaurant_name,description,top_details,about,tags,more_details,overall_rating,food_rating,service_rating,value_rating,atmosphere_rating,num_reviews,ranking,address,location,image_url,user_names,bubble_ratings,review_contents
0,https://www.tripadvisor.com/Restaurant_Review-...,"The Cream Pot, Honolulu","The Cream Pot, Honolulu: See 468 unbiased revi...",,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,y0gapam,5.0,"The most amazing soufflé pancakes & omelettes,..."
1,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,vansancopp,4.0,For all the diners who dream of starring in a ...
2,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,rick o,5.0,Wonderful food with a delight to the eyes and ...
3,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,rainb0wss,1.0,The staff who waited our table was friendly an...
4,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,Thomas M,5.0,"Had eggs and a banana pancake soufflé,it was v..."
5,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,margie r,5.0,Step into this lovely garden that leads you in...
6,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,Judy D,5.0,The Cream Pot now has floor to ceiling view wi...
7,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,NashLuz,5.0,Had breakfast and tried soufflé pancakes . It ...
8,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,tinar0s,5.0,This is a great place for breakfast and brunch...
9,https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,,,ashtahini,3.0,I came here with friends and we ordered a few ...


# Load Saved Scraped Data

In [10]:
# Load the saved scrape results; the first CSV column holds the row index.
# NOTE(review): in the preview below, the list-valued columns (user_names,
# bubble_ratings, review_contents) appear to come back as strings — confirm
# before iterating over them.
hon_eats = pd.read_csv(filepath_or_buffer='data/hon_eats_data.csv', index_col=0)
hon_eats.shape

(15903, 20)

In [11]:
# Preview the first five rows of the loaded dataset.
hon_eats.head(n=5)

Unnamed: 0,restaurant_name,description,url,top_details,about,tags,more_details,overall_rating,food_rating,service_rating,value_rating,atmosphere_rating,num_reviews,ranking,address,location,image_url,user_names,bubble_ratings,review_contents
0,Hanks Cafe Honolulu,"Hanks Cafe Honolulu, Honolulu: See 5 unbiased ...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,,"['waynen95', 'Maria H', 'RowerChicago', 'Kim J...","[4.0, 5.0, 5.0, 5.0, 5.0]",['a small pub in Chinatown with great service ...
1,"Bethel Union, Honolulu","Bethel Union, Honolulu: See 9 unbiased reviews...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"['28franniel', 'LW811055', 'cindyq2', 'grantki...","[4.0, 4.0, 5.0, 4.0, 2.0, 5.0, 5.0]",['Our son took us to this new and hip spot to ...
2,"Cafe Anasia, Honolulu","Cafe Anasia, Honolulu: See 5 unbiased reviews ...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"['Amy M', 'EmmJoans', 'Craig O', 'Tim K']","[5.0, 5.0, 5.0, 4.0]",['This is an unexpected find. Great bar atmosp...
3,"Starbucks, Honolulu","Starbucks, Honolulu: See 8 unbiased reviews of...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"['vrite', 'jujuju123', 'tryblueocean', 'bombgu...","[4.0, 5.0, 5.0, 5.0, 4.0]",['Super busy Starbucks location but efficient ...
4,"La Vela Wine & Spa, Honolulu","La Vela Wine & Spa, Honolulu: See 2 unbiased r...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,,,,,,,https://media-cdn.tripadvisor.com/media/photo-...,"['bobika', 'Dianne W']","[5.0, 5.0]","['Great ambiance, wine and delicious salmon on..."
