In [1]:
import warnings
warnings.filterwarnings('ignore')
from pymongo import MongoClient
import pandas as pd
import numpy as np
# Requests sends and recieves HTTP requests.
import requests
# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

# Load MongoDB with Scraped Page Data

In [2]:
client = MongoClient('localhost', 27017)
db = client.tripadvisor_hon_eats_reviews
pages = db.pages

In [3]:
# pages.count_documents({})
# ->15,903 documents

# Create & Test Function for Saving Data from MongoDB

In [4]:
def get_curr_page_info(soup):
    # restaurant_name
    restaurant_name = soup.find("h1", {"class": "header heading masthead masthead_h1"}).getText()
#     print(restaurant_name) #<-debug

    # description
    description = soup.find("meta", {"name": "description"})['content']
    
    # url
    url = soup.find("link", {"rel": "alternate", "hreflang": "en"})['href']
    
    
    # Listing Details
    
    # overall listing info
    listing_details = soup.find("div", {"id": "taplc_detail_overview_cards_0"})
    
    # top_details
    top_details = soup.find("div", {"id": "taplc_top_info_0"})
    top_details = top_details.findAll("a", {"class": "_2mn01bsa"})
    top_details =[top_detail.getText() for top_detail in top_details]
    top_details = ' | '.join(top_details)

    # main_details
    main_details = soup.find("div", {"id": "taplc_details_card_0"})

    # about
    try:
        about = main_details.find("div", {"class": "_1lSTB9ov"}).getText()
    except AttributeError:
        about = np.nan

    # tag_categories
    tag_cats = main_details.findAll("div", {"class": "o3o2Iihq"})
    tag_cats = [tag_cat.getText() for tag_cat in tag_cats]
    tag_cats = ' | '.join(tag_cats)
    # tags
    tags = main_details.findAll("div", {"class": "_2170bBgV"})
    tags = [tag.getText() for tag in tags]
    tags = ' | '.join(tags)
    
    # more details
    more_details = soup.find("div", {"id": "taplc_detail_overview_cards_0"})
    # more_details categories
    more_details_cats = more_details.findAll("div", {"class": "_14zKtJkz"})
    more_details_cats = [more_details_cat.getText() for more_details_cat in more_details_cats]
    more_details_cats = ' | '.join(more_details_cats)
    # more_details
    more_details = more_details.findAll("div", {"class": "_1XLfiSsv"})
    more_details = [more_detail.getText() for more_detail in more_details]
    more_details = ' | '.join(more_details)

    
    # overall_rating
    try: 
        overall_rating = listing_details.find("span", {"class": "r2Cf69qf"})
        overall_rating = float(overall_rating.getText()[:3])
    except AttributeError:
        overall_rating = np.nan

    # more overall ratings (food, service, value, atmosphere)
    more_overall_rating_types = listing_details.findAll("span", {"class": "_2vS3p6SS"})
    more_overall_rating_types = [rating.getText() for rating in more_overall_rating_types]

    more_overall_ratings_raw = listing_details.findAll("span", {"class": "ui_bubble_rating"})[1:]
    more_overall_ratings = [int(str(rating)[37:39])/10 for rating in more_overall_ratings_raw]

    more_overall_ratings_dict = {}
    for i in range(len(more_overall_rating_types)):
        more_overall_ratings_dict[more_overall_rating_types[i]] = more_overall_ratings[i]

    try: 
        food_rating = more_overall_ratings_dict['Food']
    except KeyError:
        food_rating = np.nan
    try: 
        service_rating = more_overall_ratings_dict['Service']
    except KeyError:
        service_rating = np.nan
    try: 
        value_rating = more_overall_ratings_dict['Value']
    except KeyError:
        value_rating = np.nan
    try: 
        atmosphere_rating = more_overall_ratings_dict['Atmosphere']
    except KeyError:
        atmosphere_rating = np.nan
        
        
    # num_reviews
    try: 
        num_reviews = listing_details.find("a", {"class": "_10Iv7dOs"})
        num_reviews = int(num_reviews.getText().replace(',', '').split(' ')[0])
    except AttributeError:
        num_reviews = np.nan

    # ranking
    try:
        ranking = listing_details.findAll("div", {"class": "_3-W4EexF"})
        ranking = ranking[-1].getText()
    except IndexError:
        ranking = np.nan

    # location info
    location_info = listing_details.findAll("span", {"class": "_2saB_OSe"})

    # address
    try:
        address = location_info[0].getText()
    except IndexError:
        address = np.nan

    # location
    try: 
        location = location_info[1]
        location = location.findAll("div")[-1].getText()
    except IndexError:
        location = np.nan
        
    # image_url
    try: 
        image_urls = soup.find("div", {"class": "mosaic_photos"})
        image_url = image_urls.find("img", {"class": "basicImg"})['data-lazyurl']
    except TypeError:
        image_url = ''
    
    
    # Reviews

    # user_names
    user_names = []
    user_names_raw = soup.findAll("div", {"class": "info_text pointer_cursor"})
    for name in user_names_raw:
        user_names.append(name.getText())

    # bubble_ratings
    bubble_ratings = []
    reviews = soup.find("div", {"class": "listContainer"})
    bubble_ratings_raw = reviews.findAll("span", {"class": "ui_bubble_rating"})
    for rating in bubble_ratings_raw:
        bubble_ratings.append(int(str(rating)[37:39])/10)

    # review_contents
    review_contents = []
    review_contents_raw = reviews.findAll("p", {"class": "partial_entry"})
    for review in review_contents_raw:
        review_contents.append(review.getText().replace('...More', ''))

    return [restaurant_name, description, url, top_details, about, tag_cats, tags, 
            more_details_cats, more_details, 
            overall_rating, food_rating, service_rating, value_rating, atmosphere_rating, 
            num_reviews, ranking, address, location, image_url, user_names, bubble_ratings, 
            review_contents]

In [5]:
get_curr_page_info(soup = BeautifulSoup(pages.find_one()['html'], features="html.parser"))

['Hanks Cafe Honolulu',
 'Hanks Cafe Honolulu, Honolulu: See 5 unbiased reviews of Hanks Cafe Honolulu, rated 5 of 5 on Tripadvisor and ranked #1,269 of 2,220 restaurants in Honolulu.',
 'https://www.tripadvisor.com/Restaurant_Review-g60982-d4685440-Reviews-Hanks_Cafe_Honolulu-Honolulu_Oahu_Hawaii.html',
 '',
 nan,
 '',
 '',
 'Meals',
 'Breakfast, Lunch, Dinner',
 5.0,
 nan,
 nan,
 nan,
 nan,
 5,
 '#943 of 1,581 Restaurants in Honolulu',
 '1038 Nuuanu Ave, Honolulu, Oahu, HI 96817-5117',
 '0.1 miles from Downtown Honolulu',
 '',
 ['waynen95', 'Maria H', 'RowerChicago', 'Kim J', 'cg148777'],
 [4.0, 5.0, 5.0, 5.0, 5.0],
 ['a small pub in Chinatown with great service and great live music upstairs. Service was friendly and efficient',
  'Friendly and a good selection of beers and. Liquors. Nice crowd.crowdSay sports bar.Say mahalo to Dave at the bar.',
  'Very casual fun spot for all kinds of hot dogs and burgers. Order at counter, short wait as food prepared to order. Great outside seatin

In [6]:
info = []
for page in pages.find({}).limit(50):
    soup = BeautifulSoup(page['html'], features="html.parser")
    curr_page_info = get_curr_page_info(soup)
    info.append(curr_page_info)
info_df = pd.DataFrame(np.array(info),
                   columns=['restaurant_name', 'description', 'url', 'top_details', 'about', 
                            'tag_cats', 
                            'tags', 'more_details_cats', 
                            'more_details', 'overall_rating', 'food_rating', 
                            'service_rating', 
                            'value_rating', 'atmosphere_rating', 'num_reviews', 'ranking', 
                            'address', 'location', 'image_url', 'user_names', 
                            'bubble_ratings', 'review_contents'])

In [7]:
info_df.head(50)

Unnamed: 0,restaurant_name,description,url,top_details,about,tag_cats,tags,more_details_cats,more_details,overall_rating,...,value_rating,atmosphere_rating,num_reviews,ranking,address,location,image_url,user_names,bubble_ratings,review_contents
0,Hanks Cafe Honolulu,"Hanks Cafe Honolulu, Honolulu: See 5 unbiased ...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,Meals,"Breakfast, Lunch, Dinner",5.0,...,,,5.0,"#943 of 1,581 Restaurants in Honolulu","1038 Nuuanu Ave, Honolulu, Oahu, HI 96817-5117",0.1 miles from Downtown Honolulu,,"[waynen95, Maria H, RowerChicago, Kim J, cg148...","[4.0, 5.0, 5.0, 5.0, 5.0]",[a small pub in Chinatown with great service a...
1,"Bethel Union, Honolulu","Bethel Union, Honolulu: See 9 unbiased reviews...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$,,,,FEATURES,Reservations,4.0,...,,,9.0,"#832 of 1,581 Restaurants in Honolulu","1115 Bethel St, Honolulu, Oahu, HI 96813-2202",0.1 miles from Downtown Honolulu,https://media-cdn.tripadvisor.com/media/photo-...,"[28franniel, LW811055, cindyq2, grantkidani, v...","[4.0, 4.0, 5.0, 4.0, 2.0, 5.0, 5.0]",[Our son took us to this new and hip spot to t...
2,"Cafe Anasia, Honolulu","Cafe Anasia, Honolulu: See 5 unbiased reviews ...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | Vietnamese,,,,CUISINES | Meals,"Vietnamese | Lunch, Dinner",4.5,...,,,5.0,"#903 of 1,581 Restaurants in Honolulu","2227 S Beretania St, Honolulu, Oahu, HI 96826-...",1.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[Amy M, EmmJoans, Craig O, Tim K]","[5.0, 5.0, 5.0, 4.0]",[This is an unexpected find. Great bar atmosph...
3,"Starbucks, Honolulu","Starbucks, Honolulu: See 8 unbiased reviews of...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$,,,,Meals,"Breakfast, Lunch, Dinner",4.5,...,,,8.0,"#749 of 1,581 Restaurants in Honolulu","949 Auahi St, Honolulu, Oahu, HI 96814",1.1 miles from Downtown Honolulu,https://media-cdn.tripadvisor.com/media/photo-...,"[vrite, jujuju123, tryblueocean, bombguy64, ga...","[4.0, 5.0, 5.0, 5.0, 4.0]",[Super busy Starbucks location but efficient a...
4,"La Vela Wine & Spa, Honolulu","La Vela Wine & Spa, Honolulu: See 2 unbiased r...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,5.0,...,,,2.0,"#1,040 of 1,581 Restaurants in Honolulu","2375 Ala Wai Blvd Waikiki Sand Villa Hotel, Ho...",0.4 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[bobika, Dianne W]","[5.0, 5.0]","[Great ambiance, wine and delicious salmon on ..."
5,"Golden City Restaurant, Honolulu","Golden City Restaurant, Honolulu: See 4 unbias...",https://www.tripadvisor.com/Restaurant_Review-...,$,,,,Meals,"Lunch, Dinner",4.0,...,,,4.0,"#1,238 of 1,581 Restaurants in Honolulu","1418 N School St, Honolulu, Oahu, HI 96817-1914",1.5 miles from Downtown Honolulu,https://media-cdn.tripadvisor.com/media/photo-...,"[116kristenf, Derek P, bf96819, travelmeisteren]","[3.0, 5.0, 4.0, 3.0]",[Went to the gas station next door and thought...
6,"Chinatown Kitchen, Honolulu","Chinatown Kitchen, Honolulu: See 3 unbiased re...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,Meals,"Breakfast, Lunch, Dinner",4.0,...,,,3.0,"#1,183 of 1,581 Restaurants in Honolulu","119 N Hotel St Chinatown, Honolulu, Oahu, HI 9...",0.2 miles from Downtown Honolulu,https://media-cdn.tripadvisor.com/media/photo-...,"[ToroFarm, 221simont]","[5.0, 4.0]",[We ate at this mom and pop restaurant in Chin...
7,"Vima Cafe, Honolulu","Vima Cafe, Honolulu: See unbiased reviews of V...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,,,,...,,,,,"1114 Fort Street Mall, Honolulu, Oahu, HI 9681...",0.1 miles from Downtown Honolulu,,[],[],[]
8,"Hunan Cuisine, Honolulu","Hunan Cuisine, Honolulu: See 14 unbiased revie...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | Chinese | Asian | Szechuan,,,,CUISINES | Meals | FEATURES,"Chinese, Asian | Lunch, Dinner | Seating, Tabl...",4.0,...,4.0,,14.0,"#977 of 1,581 Restaurants in Honolulu","53 N Beretania St, Honolulu, Oahu, HI 96817-4711",0.1 miles from Downtown Honolulu,https://media-cdn.tripadvisor.com/media/photo-...,"[yin1104hr, lornah649, G8523SUpeterp, CourtGra...","[5.0, 5.0, 3.0, 1.0, 5.0, 4.0, 3.0, 2.0, 5.0, ...",[This 5 star review is straightly for the hous...
9,"Hunan Cuisine, Honolulu","Hunan Cuisine, Honolulu: See 14 unbiased revie...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | Chinese | Asian | Szechuan,,,,CUISINES | Meals | FEATURES,"Chinese, Asian | Lunch, Dinner | Seating, Tabl...",4.0,...,4.0,,14.0,"#977 of 1,581 Restaurants in Honolulu","53 N Beretania St, Honolulu, Oahu, HI 96817-4711",0.1 miles from Downtown Honolulu,https://media-cdn.tripadvisor.com/media/photo-...,"[Joel J, leonardmarty]","[5.0, 4.0]",[This is a hidden gem in Honolulu and has the ...


In [8]:
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 22 columns):
restaurant_name      50 non-null object
description          50 non-null object
url                  50 non-null object
top_details          50 non-null object
about                7 non-null object
tag_cats             50 non-null object
tags                 50 non-null object
more_details_cats    50 non-null object
more_details         50 non-null object
overall_rating       48 non-null object
food_rating          41 non-null object
service_rating       41 non-null object
value_rating         41 non-null object
atmosphere_rating    32 non-null object
num_reviews          48 non-null object
ranking              48 non-null object
address              50 non-null object
location             50 non-null object
image_url            50 non-null object
user_names           50 non-null object
bubble_ratings       50 non-null object
review_contents      50 non-null object
dtypes: object(22)