# Scraping Hotel Ratings on Booking # 

In this homework we will practice web scraping on the following [site](https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Cancún&rows=15). Let's get some basic information for each hotel in Boston.
On each hotel page, scrape the following information: 
1. Hotel Name
2. Class of Rating (Wonderful/Excellent/Very Good/Good)
3. Rating Score
4. Number of Reviews


** Save the data in "traveler_ratings.csv" in the following format: hotel_name, class_of_rating, rating, num_reviews **

**(10 pts)**

You can see an overview of the information as displayed:





![Information to be scraped](booking_sample.png)

In [1]:
from bs4 import BeautifulSoup
import sys
import time
import os
import requests
import json
import pandas as pd
from urllib.request import urlopen as uReq
import re

In [106]:
'''
This code block is just experimenting with the scraped data.
'''
page_url = "https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15"
response = requests.get(page_url)
html = response.text.encode('utf-8')
page_soup = BeautifulSoup(html, "lxml")

containers = page_soup.findAll('a', {'class':'hotel_name_link url'})
title = containers[0].findAll('span', {'class' : 'sr-hotel__name'})

paging = page_soup.find('a', {'class':re.compile('paging-next')}) #Find a tag which contains keyword paging-next

urls = page_soup.findAll('a', {'class':'hotel_name_link url'})  
urls[0]['href']

# containers = page_soup.findAll('div', {'data-hotelid': True })
# score = containers[0].findAll('span', {'class' : 'review-score-badge'})
# containers[0]
# hotel_name = containers[0].find('span', {'class':'sr-hotel__name'})
# rating = containers[0].findAll('span', {'class':'review-score-badge'})
# containers[0]


'\n/hotel/us/oakwood-boston.html\n#hotelTmpl'

In [111]:
'''
Given a webpage, extract hotel names and their corresponding urls; Recursively do 
this task on the next page if it exists. Otherwise exit.

'''
def get_hotel_name_and_url(source_page, hotel_names, hotel_urls):
    response = requests.get(source_page)
    html = response.text.encode('utf-8')
    page_soup = BeautifulSoup(html, "lxml") #raw html data
    
    urls = page_soup.findAll('a', {'class':'hotel_name_link url'})  
    names = page_soup.findAll('span', {'class' : 'sr-hotel__name'})
    
    for url in urls:
        hotel_urls.append('http://booking.com' + url['href'].replace("\n", "")) 
        
    for name in names:
        hotel_names.append(name.text.strip())
        
    #Deterimine if recurse
    paging = page_soup.find('a', {'class':re.compile('paging-next')})
    if paging is None: return  #This is the last page, so exit
    
    source_page = paging['href'] 
    time.sleep(0.5)
    get_hotel_name_and_url(source_page, hotel_names, hotel_urls)  #Recurse on the next page

#     print(paging['href'])
#     if paging is None: return 
#     source_page = 'http://booking.com' + paging['href']
#     get_hotel_name_and_url()

hotel_names = []
hotel_urls = []
source_page = "https://www.booking.com/searchresults.html?aid=304142&label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ&sid=28d97f630803f9d48b4a1f535cbdd33f&class_interval=1&dest_id=20061717&dest_type=city&group_adults=2&group_children=0&label_click=undef&no_rooms=1&raw_dest_type=city&room1=A%2CA&sb_price_type=total&src=index&src_elem=sb&ss=Boston&ssb=empty&ssne_untouched=Canc%C3%BAn&rows=15"

get_hotel_name_and_url(source_page, hotel_names, hotel_urls) #After this function call, two lists should be initialized


In [113]:
hotel_urls[0]

'http://booking.com/hotel/us/oakwood-boston.html#hotelTmpl'

In [118]:
rating_class = []
rating_score = []
review_number = []

'''
Retrieve info for individual hotel, using the hotel urls acquired previously
Cost 10min+
'''
def get_hotel_info(hotel_urls):
    
    for url in hotel_urls:
        response = requests.get(url)
        html = response.text.encode('utf-8')
        page_soup = BeautifulSoup(html, "lxml") 
        
        rclass = page_soup.find('span', {'class': 'review-score-widget__text'})
        rscore = page_soup.find('span', {'class': 'review-score-badge'})
        rnumber = page_soup.find('span', {'class': 'review-score-widget__subtext'})
        
        if(rclass is None): rating_class.append(None)
        else: rating_class.append(rclass.text.strip())
         
        if(rscore is None): rating_score.append(None)
        else: rating_score.append(rscore.text.strip())
        
        if(rnumber is None): review_number.append(None)
        else:review_number.append(rnumber.text.strip())
        
        time.sleep(0.02)
        
get_hotel_info(hotel_urls)

In [120]:
#Create a DataFrame using the four lists
df_data = pd.DataFrame(
    {'Name': hotel_names,
     'URL': hotel_urls,
     'Class of Rating': rating_class,
     'Rating Score': rating_score,
     'Review Number': review_number
    })

df_data

Unnamed: 0,Class of Rating,Name,Rating Score,Review Number,URL
0,Excellent,Oakwood Boston,8.7,85 reviews,http://booking.com/hotel/us/oakwood-boston.htm...
1,Very Good,14 Gloucester St #2B by Lyon Apartments,8.1,"2,805 reviews",http://booking.com/hotel/us/14-gloucester-st-2...
2,Very Good,14 Gloucester St #2A by Lyon Apartments,10,42 reviews,http://booking.com/hotel/us/14-gloucester-st-2...
3,Awesome,Seaport Boston Hotel,9.1,"1,921 reviews",http://booking.com/hotel/us/seaport.html#hotel...
4,Very Good,Longfellow Place Apartment by Stay Alfred,8.5,205 reviews,http://booking.com/hotel/us/longfellow-by-stay...
5,Good,Clearway Street by Boston Furnished Rooms,7.3,136 reviews,http://booking.com/hotel/us/clearway-street-by...
6,Very Good,The Copley House,8.1,"2,805 reviews",http://booking.com/hotel/us/the-c-house-boston...
7,Exceptional,Two Bedroom Boston Luxury Apartment,9.8,24 reviews,http://booking.com/hotel/us/two-bedroom-boston...
8,Very Good,Aloft Boston Seaport,8.5,765 reviews,http://booking.com/hotel/us/aloft-boston-seapo...
9,Very Good,112 Myrtle St #7 by Lyon Apartments,8.0,15 reviews,http://booking.com/hotel/us/112-myrtle-st-7.ht...


In [122]:
#Store hotel data in a csv file
df_data.to_csv('traveler_ratings.csv', encoding='utf-8')

Now let's scrape some reviews. For each review of each each hotel in Boston you are to scrape the following attributes: 
1. Reviewer name
2. Reviewer ethnicity
3. Number of reviews 
4. Number of helpful votes
5. Date
6. Rating
7. Negative Review
8. Positive Review

Note that you will also need the hotel's name!! Also, some reviews may not have all attributes. 

** Save the data in "review_ratings.csv" in the following format: hotel_name, reviewer_name, ethnicity, num_reviews, num_help_votes, date, rating, neg_review, pos_review **

**(25 pts)**

You can see an overview of the information as displayed:
![Information to be scraped](review_sample.png)

In [139]:
'''
Experiment on one page with reviews

'''

webUrl = "https://www.booking.com/hotel/us/loews-boston-hotel.html?aid=304142;label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ;sid=f7bbc22a8ecfb41b5a7bc29715b9f05e;dest_id=20061717;dest_type=city;dist=0;group_adults=2;hapos=1;hpos=1;room1=A%2CA;sb_price_type=total;srepoch=1509942585;srfid=4f87669b94b357cd9cfd533f9f2e96f49874620cX1;srpvid=80511f9cb82b0604;type=total;ucfs=1&#blockdisplay4"
response = requests.get(webUrl)
html = response.text.encode('utf-8')
page_soup = BeautifulSoup(html, "lxml")

country = page_soup.findAll('li', {'class':re.compile('review_item clearfix')})

name = country[1].find('h4').text.strip()
country_name = country[1].find('span', {'class' : 'reviewer_country'}).text.strip()
review_number = country[1].find('div', {'class' : 'review_item_user_review_count'})
helpful_review = country[1].find('div', {'class':'review_item_user_helpful_count'})
rating = country[1].find('span', {'class':'review-score-badge'})
date = country[1].find('p', {'class' : 'review_item_date'})
neg = country[1].find('p', {'class':'review_neg'})
positive = country[1].find('p', {'class':'review_pos'})

print(name)
if country_name is not None:print(country_name)
if review_number is not None:print(review_number)
if helpful_review is not None: print(helpful_review)
if rating is not None:print(rating)
if date is not None:print(date)
# if neg is not None:print(neg[0])
# if positive is not None:print(positive[0])

AsliDe
Turkey
<span class="review-score-badge">
9.2
</span>
<p class="review_item_date">
November 6, 2017
</p>


In [199]:
'''
Extract all reviews for one hotel

'''
def extract_hotel_info(hotel_name, hotel_url, info_list):
    response = requests.get(hotel_url)
    html = response.text.encode('utf-8')
    page_soup = BeautifulSoup(html, "lxml")
    
    if page_soup is None: return  #If cannot get webpage, skip

    review_page = page_soup.find('a', {'class':'show_all_reviews_btn'}) #Find review page's url
    if review_page is None: return  #If no reviews, skip
    
    base_page = "http://www.booking.com"
    review_page_url = base_page + review_page['href']
    
    
    #Get all reviews for this particula hotel if next page exists
    while(True):
        response = requests.get(review_page_url)
        html = response.text.encode('utf-8')
        page_soup = BeautifulSoup(html, 'lxml') #page_soup has html data for this page
        
        if page_soup is None: return  #If cannot get webpage, skip

        #Get all reviews on this page
        containers = page_soup.findAll('li', {'class':re.compile('review_item clearfix')})
        for item in containers:  #item has the data from one review
            tmp_list = []
            tmp_list.append(hotel_name)
            
            name = item.find('h4')
            if name is not None: tmp_list.append(name.text.strip())
            else: tmp_list.append(None)
                  
            country_name = item.find('span', {'class' : 'reviewer_country'})
            if country_name is not None: tmp_list.append(country_name.text.strip())
            else: tmp_list.append(None)
            
            review_number = item.find('div', {'class' : 'review_item_user_review_count'})
            if review_number is not None: tmp_list.append(review_number.text.strip())
            else: tmp_list.append(None)

            helpful_review = item.find('div', {'class':'review_item_user_helpful_count'})
            if helpful_review is not None: tmp_list.append(helpful_review.text.strip())
            else: tmp_list.append(None)
                
            rating = item.find('span', {'class':'review-score-badge'})
            if rating is not None: tmp_list.append(rating.text.strip())
            else: tmp_list.append(None)
                
            review_date = item.find('p', {'class' : 'review_item_date'})
            if review_date is not None: tmp_list.append(review_date.text.strip())
            else: tmp_list.append(None)
                
            neg = item.find('p', {'class':'review_neg'})
            if neg is not None:
                review_text = neg.find('span', {'itemprop': 'reviewBody'})
                if review_text is not None:
                    s = review_text.text.strip()
                    s = re.sub(r'[\n\r\t]*', '', s) #Get rit of special characters
                    tmp_list.append(s)  
                else: tmp_list.append(None)
            else: tmp_list.append(None)
            
            
            positive = item.find('p', {'class':'review_pos'})
            if positive is not None:
                review_text = positive.find('span', {'itemprop': 'reviewBody'})
                if review_text is not None:
                    s = review_text.text.strip()
                    s = re.sub(r'[\n\r\t]*', '', s) #Get rit of special characters
                    tmp_list.append(s)  
                else: tmp_list.append(None)
            else: tmp_list.append(None)
            
            info_list.append(tmp_list) #One entry added to the final result; For loop ends.
           
        
        
        #Try update url to the next page if exists
        next_url = page_soup.find('p', {'class':'page_link review_next_page'})
        if next_url is None: break  #Dealing with invalid page
        
        tag = next_url.find('a', {'href': True})
        if tag is None: break  #Dealing with invalid page

        else: 
            review_page_url = base_page + tag['href'].rstrip()
            time.sleep(0.02)
            
            

In [204]:
info_list = [] #Store all data here

#Given hotel names and their corresponding urls, extract all reviews for all hotel and store them in info_list

for i in range(len(hotel_urls)):
    extract_hotel_info(hotel_names[i], hotel_urls[i], info_list)
    

In [205]:
len(info_list)

56606

# excluded
url_1 = "https://www.booking.com/reviews/us/hotel/loews-boston-hotel.html?aid=304142;label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ;sid=f7bbc22a8ecfb41b5a7bc29715b9f05e"
url_2 = 'https://www.booking.com/reviews/us/hotel/oakwood-boston.html?aid=304142;label=gen173nr-1DCAEoggJCAlhYSDNiBW5vcmVmcgV1c19tYYgBAZgBMcIBA2FibsgBDdgBA-gBAfgBApICAXmoAgQ;sid=f7bbc22a8ecfb41b5a7bc29715b9f05e'
response_ = requests.get(url_1)
html_ = response_.text.encode('utf-8')
page_soup_ = BeautifulSoup(html_, "lxml")

next_url = page_soup_.find('p', {'class':'page_link review_next_page'}).find('a', {'href': True})['href'].rstrip()
next_url

In [206]:
df_reviews = pd.DataFrame(info_list)
df_reviews

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Oakwood Boston,Caroline,France,5 Reviews,,8.3,"November 3, 2017",Cleanliness in the common area is not very goo...,"Very nice staff, although not very present.The..."
1,Oakwood Boston,Anonymous,Australia,1 review,,9.6,"October 28, 2017",No concerns. The gym room was small but manage...,Convenient location. Helpful concierge after h...
2,Oakwood Boston,Dr,Qatar,4 Reviews,,10,"October 26, 2017",,Good locationFamily friendly2 bedrooms and 2 b...
3,Oakwood Boston,Anonymous,Switzerland,8 Reviews,,8.8,"October 17, 2017",,"FABULOUS location (historic sites, aquarium, Q..."
4,Oakwood Boston,Robin,United States of America,4 Reviews,,6.3,"September 15, 2017",The bed is miserably uncomfortable. The showe...,Property is in a great location. Central to ev...
5,Oakwood Boston,Overseeas,Saudi Arabia,2 Reviews,,7.5,"July 27, 2017",Carpet needs to be cleaner.,Two bedrooms and 2 bathrooms. Good space for s...
6,Oakwood Boston,Aj,Egypt,3 Reviews,,4.6,"July 2, 2017",The house keeping is sooo expensive and they d...,
7,Oakwood Boston,Susan,Australia,14 Reviews,,10,"May 28, 2017",There was no kettle and the fridge was really ...,Really good location in a beautiful building. ...
8,Oakwood Boston,Susan,United States of America,1 review,,9.6,"May 25, 2017",Booking this property is confusing. You recei...,The location is perfect. We traveled with our...
9,Oakwood Boston,Anonymous,Canada,1 review,,9.2,"May 24, 2017",It seems oakwood has some learning to do relat...,Excellent location and very suitable for our n...


In [207]:
df_reviews.to_csv('review_ratings.csv', encoding='utf-8')