In [1]:
import warnings
warnings.filterwarnings('ignore')
from pymongo import MongoClient
import pandas as pd
import numpy as np
# Requests sends and recieves HTTP requests.
import requests
# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

# Load MongoDB with Scraped Page Data

In [2]:
client = MongoClient('localhost', 27017)
db = client.tripadvisor_hon_eats_reviews
pages = db.pages

In [3]:
# pages.count_documents({})
# ->15,903 documents

# Create & Test Function for Saving Data from MongoDB

In [4]:
def get_curr_page_info(soup):
    """Return list of info selected from current page"""
    # restaurant_name
    restaurant_name = soup.find("h1", {"class": "header heading masthead masthead_h1"}).getText()
#     print(restaurant_name) #<-debug

    # description
    description = soup.find("meta", {"name": "description"})['content']
    
    # url
    url = soup.find("link", {"rel": "alternate", "hreflang": "en"})['href']
    
    
    # Listing Details
    
    # overall listing info
    listing_details = soup.find("div", {"id": "taplc_detail_overview_cards_0"})
    
    # top_details
    top_details = soup.find("div", {"id": "taplc_top_info_0"})
    top_details = top_details.findAll("a", {"class": "_2mn01bsa"})
    top_details =[top_detail.getText() for top_detail in top_details]
    top_details = ' | '.join(top_details)

    # main_details
    main_details = soup.find("div", {"id": "taplc_details_card_0"})

    # about
    try:
        about = main_details.find("div", {"class": "_1lSTB9ov"}).getText()
    except AttributeError:
        about = np.nan

    # check main tags for restaurant details
    tag_cats = main_details.findAll("div", {"class": "o3o2Iihq"})
    if len(tag_cats) == 0:
        # get details from another section
        details = soup.find("div", {"id": "taplc_detail_overview_cards_0"})
        # get detail categories
        detail_cats = details.findAll("div", {"class": "_14zKtJkz"})
        detail_cats = [detail_cat.getText() for detail_cat in detail_cats]
        details = details.findAll("div", {"class": "_1XLfiSsv"})
        details = [detail.getText() for detail in details]
    else:
        # proceed to get tags and their categories
        tag_cats = [tag_cat.getText() for tag_cat in tag_cats]
        details = main_details.findAll("div", {"class": "_2170bBgV"})
        details = [detail.getText() for detail in details]
        detail_cats = tag_cats

    details_dict = dict(zip(detail_cats, details))

    # populate detail fields
    try:
        price = details_dict['PRICE RANGE']
    except KeyError:
        price = np.nan
    try:
        diets = details_dict['Special Diets']
    except KeyError:
        diets = np.nan
    try:
        meals = details_dict['Meals']
    except KeyError:
        meals = np.nan
    try: 
        cuisines = details_dict['CUISINES']
    except KeyError:
        cuisines = np.nan
    try:
        features = details_dict['FEATURES']
    except KeyError:
        features = np.nan

    
    # overall_rating
    try: 
        overall_rating = listing_details.find("span", {"class": "r2Cf69qf"})
        overall_rating = float(overall_rating.getText()[:3])
    except AttributeError:
        overall_rating = np.nan

    # more overall ratings (food, service, value, atmosphere)
    more_overall_rating_types = listing_details.findAll("span", {"class": "_2vS3p6SS"})
    more_overall_rating_types = [rating.getText() for rating in more_overall_rating_types]

    more_overall_ratings_raw = listing_details.findAll("span", {"class": "ui_bubble_rating"})[1:]
    more_overall_ratings = [int(str(rating)[37:39])/10 for rating in more_overall_ratings_raw]

    more_overall_ratings_dict = {}
    for i in range(len(more_overall_rating_types)):
        more_overall_ratings_dict[more_overall_rating_types[i]] = more_overall_ratings[i]

    try: 
        food_rating = more_overall_ratings_dict['Food']
    except KeyError:
        food_rating = np.nan
    try: 
        service_rating = more_overall_ratings_dict['Service']
    except KeyError:
        service_rating = np.nan
    try: 
        value_rating = more_overall_ratings_dict['Value']
    except KeyError:
        value_rating = np.nan
    try: 
        atmosphere_rating = more_overall_ratings_dict['Atmosphere']
    except KeyError:
        atmosphere_rating = np.nan
        
        
    # num_reviews
    try: 
        num_reviews = listing_details.find("a", {"class": "_10Iv7dOs"})
        num_reviews = int(num_reviews.getText().replace(',', '').split(' ')[0])
    except AttributeError:
        num_reviews = np.nan

    # ranking
    try:
        ranking = listing_details.findAll("div", {"class": "_3-W4EexF"})
        ranking = ranking[-1].getText()
    except IndexError:
        ranking = np.nan

    # location info
    location_info = listing_details.findAll("span", {"class": "_2saB_OSe"})

    # address
    try:
        address = location_info[0].getText()
    except IndexError:
        address = np.nan

    # location
    try: 
        location = location_info[1]
        location = location.findAll("div")[-1].getText()
    except IndexError:
        location = np.nan
        
    # image_url
    try: 
        image_urls = soup.find("div", {"class": "mosaic_photos"})
        image_url = image_urls.find("img", {"class": "basicImg"})['data-lazyurl']
    except TypeError:
        image_url = ''
    
    
    # Reviews
    reviews_container = soup.find("div", {"class": "listContainer"})
    reviews = reviews_container.findAll("div", {"class": "prw_rup prw_reviews_review_resp"})
    review_data = []
    for review in reviews:
        curr_review = dict()
        try:
            curr_review['user_name'] = review.find("div", {"class": "info_text pointer_cursor"}).getText()
        except AttributeError:
            curr_review['user_name'] = np.nan
        bubble_rating_raw = review.find("span", {"class": "ui_bubble_rating"})
        try:
            curr_review['bubble_rating'] = int(str(bubble_rating_raw)[37:39])/10
        except ValueError:
            curr_review['bubble_rating'] = np.nan
        try:
            curr_review['review_contents'] = review.find("p", {"class": "partial_entry"}).getText().replace('...More', '')
        except AttributeError:
            curr_review['review_contents'] = np.nan
        review_data.append(curr_review)

    return [restaurant_name, description, url, top_details, about, price, diets, 
            meals, cuisines, features,
            overall_rating, food_rating, service_rating, value_rating, atmosphere_rating, 
            num_reviews, ranking, address, location, image_url, review_data]

In [5]:
get_curr_page_info(soup = BeautifulSoup(pages.find_one()['html'], features="html.parser"))

['Hanks Cafe Honolulu',
 'Hanks Cafe Honolulu, Honolulu: See 5 unbiased reviews of Hanks Cafe Honolulu, rated 5 of 5 on Tripadvisor and ranked #1,269 of 2,220 restaurants in Honolulu.',
 'https://www.tripadvisor.com/Restaurant_Review-g60982-d4685440-Reviews-Hanks_Cafe_Honolulu-Honolulu_Oahu_Hawaii.html',
 '',
 nan,
 nan,
 nan,
 'Breakfast, Lunch, Dinner',
 nan,
 nan,
 5.0,
 nan,
 nan,
 nan,
 nan,
 5,
 '#943 of 1,581 Restaurants in Honolulu',
 '1038 Nuuanu Ave, Honolulu, Oahu, HI 96817-5117',
 '0.1 miles from Downtown Honolulu',
 '',
 [{'user_name': 'waynen95',
   'bubble_rating': 4.0,
   'review_contents': 'a small pub in Chinatown with great service and great live music upstairs. Service was friendly and efficient'},
  {'user_name': 'Maria H',
   'bubble_rating': 5.0,
   'review_contents': 'Friendly and a good selection of beers and. Liquors. Nice crowd.crowdSay sports bar.Say mahalo to Dave at the bar.'},
  {'user_name': 'RowerChicago',
   'bubble_rating': 5.0,
   'review_contents': 

In [6]:
info = []
for page in pages.find({}).limit(100):
    soup = BeautifulSoup(page['html'], features="html.parser")
    curr_page_info = get_curr_page_info(soup)
    info.append(curr_page_info)
info_df = pd.DataFrame(np.array(info),
                   columns=['restaurant_name', 'description', 'url', 'top_details', 'about', 
                            'price', 'diets', 
                            'meals', 'cuisines', 'features', 'overall_rating', 'food_rating', 
                            'service_rating', 
                            'value_rating', 'atmosphere_rating', 'num_reviews', 'ranking', 
                            'address', 'location', 'image_url', 'review_data'])

In [7]:
info_df.sample(10)

Unnamed: 0,restaurant_name,description,url,top_details,about,price,diets,meals,cuisines,features,...,food_rating,service_rating,value_rating,atmosphere_rating,num_reviews,ranking,address,location,image_url,review_data
69,"Maui Brewing Company, Honolulu","Reserve a table at Maui Brewing Company, Honol...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Bar | Pub,"Enjoy craft beers brewed locally on Maui, hand...",$10 - $35,"Vegetarian Friendly, Vegan Options, Gluten Fre...","Lunch, Dinner","American, Bar, Pub","Takeout, Television, Reservations, Outdoor Sea...",...,4.0,4.0,4.0,,1165,"#127 of 1,581 Restaurants in Honolulu","2300 Kalakaua Ave, Honolulu, Oahu, HI 96815-5049",0.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'DGW41-64-00-05', 'bubble_ratin..."
29,"The Cream Pot, Honolulu","The Cream Pot, Honolulu: See 468 unbiased revi...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Cafe | Vegetarian Friendly,,$5 - $10,"Vegetarian Friendly, Vegan Options",,"American, Cafe",,...,4.0,3.5,3.0,4.0,468,"#188 of 1,581 Restaurants in Honolulu","444 Niu St, Honolulu, Oahu, HI 96815-1830",1.1 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'Cherybabe', 'bubble_rating': 4..."
72,"Maui Brewing Company, Honolulu","Reserve a table at Maui Brewing Company, Honol...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Bar | Pub,"Enjoy craft beers brewed locally on Maui, hand...",$10 - $35,"Vegetarian Friendly, Vegan Options, Gluten Fre...","Lunch, Dinner","American, Bar, Pub","Takeout, Television, Reservations, Outdoor Sea...",...,4.0,4.0,4.0,,1165,"#127 of 1,581 Restaurants in Honolulu","2300 Kalakaua Ave, Honolulu, Oahu, HI 96815-5049",0.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'Betty H', 'bubble_rating': 5.0..."
95,"Maui Brewing Company, Honolulu","Reserve a table at Maui Brewing Company, Honol...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Bar | Pub,"Enjoy craft beers brewed locally on Maui, hand...",$10 - $35,"Vegetarian Friendly, Vegan Options, Gluten Fre...","Lunch, Dinner","American, Bar, Pub","Takeout, Television, Reservations, Outdoor Sea...",...,4.0,4.0,4.0,,1165,"#127 of 1,581 Restaurants in Honolulu","2300 Kalakaua Ave, Honolulu, Oahu, HI 96815-5049",0.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'yancy_baan', 'bubble_rating': ..."
55,"Maui Brewing Company, Honolulu","Reserve a table at Maui Brewing Company, Honol...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Bar | Pub,"Enjoy craft beers brewed locally on Maui, hand...",$10 - $35,"Vegetarian Friendly, Vegan Options, Gluten Fre...","Lunch, Dinner","American, Bar, Pub","Takeout, Television, Reservations, Outdoor Sea...",...,4.0,4.0,4.0,,1165,"#127 of 1,581 Restaurants in Honolulu","2300 Kalakaua Ave, Honolulu, Oahu, HI 96815-5049",0.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'ausanne', 'bubble_rating': 4.0..."
46,"Maui Brewing Company, Honolulu","Reserve a table at Maui Brewing Company, Honol...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Bar | Pub,"Enjoy craft beers brewed locally on Maui, hand...",$10 - $35,"Vegetarian Friendly, Vegan Options, Gluten Fre...","Lunch, Dinner","American, Bar, Pub","Takeout, Television, Reservations, Outdoor Sea...",...,4.0,4.0,4.0,,1165,"#127 of 1,581 Restaurants in Honolulu","2300 Kalakaua Ave, Honolulu, Oahu, HI 96815-5049",0.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'Judith M', 'bubble_rating': 5...."
88,"Maui Brewing Company, Honolulu","Reserve a table at Maui Brewing Company, Honol...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Bar | Pub,"Enjoy craft beers brewed locally on Maui, hand...",$10 - $35,"Vegetarian Friendly, Vegan Options, Gluten Fre...","Lunch, Dinner","American, Bar, Pub","Takeout, Television, Reservations, Outdoor Sea...",...,4.0,4.0,4.0,,1165,"#127 of 1,581 Restaurants in Honolulu","2300 Kalakaua Ave, Honolulu, Oahu, HI 96815-5049",0.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'Ausguy1605', 'bubble_rating': ..."
12,"The Cream Pot, Honolulu","The Cream Pot, Honolulu: See 468 unbiased revi...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Cafe | Vegetarian Friendly,,$5 - $10,"Vegetarian Friendly, Vegan Options",,"American, Cafe",,...,4.0,3.5,3.0,4.0,468,"#188 of 1,581 Restaurants in Honolulu","444 Niu St, Honolulu, Oahu, HI 96815-1830",1.1 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'joluvscards', 'bubble_rating':..."
78,"Maui Brewing Company, Honolulu","Reserve a table at Maui Brewing Company, Honol...",https://www.tripadvisor.com/Restaurant_Review-...,$$ - $$$ | American | Bar | Pub,"Enjoy craft beers brewed locally on Maui, hand...",$10 - $35,"Vegetarian Friendly, Vegan Options, Gluten Fre...","Lunch, Dinner","American, Bar, Pub","Takeout, Television, Reservations, Outdoor Sea...",...,4.0,4.0,4.0,,1165,"#127 of 1,581 Restaurants in Honolulu","2300 Kalakaua Ave, Honolulu, Oahu, HI 96815-5049",0.5 miles from Waikiki Beach,https://media-cdn.tripadvisor.com/media/photo-...,"[{'user_name': 'M8362WDdavek', 'bubble_rating'..."
0,Hanks Cafe Honolulu,"Hanks Cafe Honolulu, Honolulu: See 5 unbiased ...",https://www.tripadvisor.com/Restaurant_Review-...,,,,,"Breakfast, Lunch, Dinner",,,...,,,,,5,"#943 of 1,581 Restaurants in Honolulu","1038 Nuuanu Ave, Honolulu, Oahu, HI 96817-5117",0.1 miles from Downtown Honolulu,,"[{'user_name': 'waynen95', 'bubble_rating': 4...."


In [8]:
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
restaurant_name      100 non-null object
description          100 non-null object
url                  100 non-null object
top_details          100 non-null object
about                57 non-null object
price                81 non-null object
diets                89 non-null object
meals                72 non-null object
cuisines             93 non-null object
features             60 non-null object
overall_rating       98 non-null object
food_rating          91 non-null object
service_rating       91 non-null object
value_rating         91 non-null object
atmosphere_rating    32 non-null object
num_reviews          98 non-null object
ranking              98 non-null object
address              100 non-null object
location             100 non-null object
image_url            100 non-null object
review_data          100 non-null object
dtypes: object(21)
memory usage: 16.5+ KB


In [9]:
info_df.isnull().sum()

restaurant_name       0
description           0
url                   0
top_details           0
about                43
price                19
diets                11
meals                28
cuisines              7
features             40
overall_rating        2
food_rating           9
service_rating        9
value_rating          9
atmosphere_rating    68
num_reviews           2
ranking               2
address               0
location              0
image_url             0
review_data           0
dtype: int64