In [86]:
# using selenium to scrape dynamic website
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# other libraries
import pandas as pd
import os
from bs4 import BeautifulSoup 
from requests_html import HTMLSession
from time import sleep
from pandas import DataFrame

In [87]:
def load_page(url:str, wait_time:int, batches:int,
              chromedriver_path:str=None, opts=None, max_reloads:int=10) -> list:
    '''
    Open the search page in Chrome, scroll down to load more hotel cards and return the cards.

    The browser sometimes returns different htmls, fortunately there is a small variation in patterns.
    As a workaround, the page is reloaded in a fresh browser every time a pattern different from the
    reference html is returned (at most max_reloads times).
    
    Parameters:
    -----------------------
    url :: str
        the url of the parent page
    
    wait_time :: int
        The pause time before the next scroll down is executed, allows the browser to load twelve more hotel cards
        
    batches :: int
        The number of batches loaded (12 per batch), each scroll adds 1 batch of hotel cards

    chromedriver_path :: str, optional
        Path to the chromedriver executable. When omitted, a module-level `chromedriver_path` is used
        if one exists, otherwise '<cwd>/chromedriver/chromedriver.exe'.

    opts :: selenium Options, optional
        Chrome options. When omitted, a module-level `opts` is used if one exists,
        otherwise a headless configuration is created.

    max_reloads :: int, optional
        How many extra attempts are made after the first one when the unexpected html is returned.
        
    Returns:
    -----------------------
    hotel_cardsBS :: list of BeautifulSoup Object
        the <li> hotel cards that are direct children of the listings container

    Raises:
    -----------------------
    RuntimeError
        when the reference html is not obtained after max_reloads reloads
    
    '''
    # the listings container only exists in the reference html; its absence means "wrong variant, reload"
    selector = ('body .resp-section .inner-section .resp-row .resp-col' + 
                ' .main-inner section .h-listing .listings')

    # the original code read these two names as globals; keep honouring them when they exist
    if chromedriver_path is None:
        chromedriver_path = (globals().get('chromedriver_path')
                             or os.path.join(os.getcwd(), 'chromedriver', 'chromedriver.exe'))
    if opts is None:
        opts = globals().get('opts')
    if opts is None:
        opts = Options()
        opts.add_argument('--headless')

    for _ in range(max_reloads + 1):
        # NOTE(review): passing the driver path positionally works on Selenium 3 / early 4;
        # newer releases expect a Service object -- confirm against the installed version.
        driver = webdriver.Chrome(chromedriver_path, options=opts)
        try:
            driver.maximize_window()
            driver.get(url)

            soup_init = BeautifulSoup(driver.page_source, 'html.parser')
            if soup_init.select_one(selector) is None:
                continue                                  # wrong html variant; `finally` closes this browser first

            for _ in range(batches):
                # each scroll to the bottom makes the page append 12 more hotel cards
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight,)')
                sleep(wait_time)                          # gives the new cards time to load, adjust according to net speed

            print('load_page done')
            soup     = BeautifulSoup(driver.page_source, 'html.parser')   # the resulting html after scroll downs
            listings = soup.select_one(selector)
            if listings is None:
                continue                                  # container vanished after scrolling; treat like a bad load

            hotel_cardsBS = listings.find_all('li', recursive=False)
            return hotel_cardsBS
        finally:
            driver.quit()                                 # never leave a Chrome process behind, success or not

    raise RuntimeError('load_page: reference html not returned after {0} reloads'.format(max_reloads))

def request(url:str, init_query, max_reloads:int=10) -> BeautifulSoup:
    '''
    Follows the same logic as the selenium page loader, but for vanilla soup objects.
    Reload the page while its html differs from the reference page, i.e. while
    init_query(soup) returns None.
    
    Parameter:
    --------------------------------------------------------------------------------
    url :: str
        the url of the parent page
    
    init_query :: bs4Object -> bs4Object
        query written against the reference html; it is called with the parsed page and must
        return None when the page is not in the reference format.
        For backward compatibility a non-callable, non-None value is accepted and treated as an
        already successful query, so the first page fetched is returned unvalidated.

    max_reloads :: int, optional
        How many extra downloads are attempted after the first one.
        
    Return:
    ---------------------------------------------------------------------------------
    soup_init :: bs4Object
        the parent page html, this is similar to the reference html

    Raises:
    ---------------------------------------------------------------------------------
    TypeError
        when init_query is None (there is nothing to validate the page with)
    RuntimeError
        when the reference html is not obtained after max_reloads reloads
        
    Note:
    ---------------------------------------------------------------------------------
        The GET randomly returns different html formats, fortunately the set of html formats returned is finite.
        The page is reloaded until the reference format is returned.
    '''
    if init_query is None:
        raise TypeError('init_query must be a callable that takes the parsed page')

    # legacy callers may pass a pre-computed (non-None) query result instead of a function
    matches_reference = init_query if callable(init_query) else (lambda soup: init_query)

    session = HTMLSession()
    try:
        for _ in range(max_reloads + 1):
            page      = session.get(url)
            soup_init = BeautifulSoup(page.content, 'html.parser')

            if matches_reference(soup_init) is not None:
                return soup_init          # this soup object is equivalent to reference html
    finally:
        session.close()

    raise RuntimeError('request: reference html not returned after {0} reloads'.format(max_reloads))

def init_df() -> DataFrame:
    '''
    Create the empty DataFrame that collects one row per scraped hotel.

    Returns:
    -----------------------
    hotel_dataFrame :: pandas.DataFrame
        empty frame whose columns are the hotel features
    '''
    # features must be defined in the same order as the definitions in Hotel class
    features = [ 'Hotel Name', 'Price', 'Rating', 'Barangay/Complex', 'City',
                 'Province','Adjacent Landmarks', 'Terminals and Stations',
                 'Rooms', 'Amenities', 'Reviews']

    # the feature names are the column labels (the original passed them as a non-existent keyword)
    hotel_dataFrame = pd.DataFrame(columns=features)
    return hotel_dataFrame

def url() -> str:
    '''
    Build the search url of the parent page.

    Returns:
    -----------------------
    str
        the hotels.com search url; destination, dates, rooms and guests are fixed in the query string
    '''
    # the query string is kept in two pieces only because it is too long for one line
    pieces = (
        'https://ph.hotels.com/search.do?resolved-location=COUNTRY%3A10233139%3AUNKNOWN%3AUNKNOWN&destination-',
        'id=10233139&q-destination=Philippines&q-check-in=2021-10-10&q-check-out=2021-10-11&q-rooms=1&q-room-0-adults=2&q-room-0-children=0',
    )
    return ''.join(pieces)

In [None]:
class Hotel:
    '''
    One hotel scraped from a search-result card.

    Creating an instance reads the card, downloads the hotel's own page and its review page,
    stores every feature as an attribute and appends the features as one row to the DataFrame
    passed to the constructor.
    '''

    _BASE_URL = 'https://ph.hotels.com'

    # attribute names in the order of the DataFrame feature columns (see add_hotel)
    _FEATURE_ATTRS = ('hotel_name', 'price', 'rating', 'loc', 'city', 'province',
                      'landmarks', 'terminals', 'rooms', 'ameneties', 'reviews')

    # container holding both the landmarks and the transport lists on the hotel page
    _AROUND_QUERY = ('body div.resp-section main.inner-section div#property-details '
                     'div#flexible-container-bottom div.whats-around-content-landmarks-transport')

    _session = None   # HTMLSession shared by all hotels, created on first use

    def __init__(self, hotel_card: 'BeautifulSoup', dataFrame: 'DataFrame') -> None:
        '''
        Initialize a hotel, where card is a bs4 Object, containing the hotel data

        Parameters
        -----------------------------
        hotel_card :: bs4 Object
            the <li> card of the hotel from the search page
        dataFrame :: Pandas.DataFrame
            frame the hotel is appended to (see add_hotel)
        '''
         
        # helper variables
        self.card = hotel_card             
        self.address = self.get_address()
        
        individualPage = self.get_hotelPageSoup()
        reviewPage     = self.get_reviewPageSoup(individualPage)
    
        # the features
        self.hotel_name = self.get_name()
        self.price      = self.get_price() 
        self.rating     = self.get_rating()
        self.loc        = self.address[0]
        self.city       = self.address[-4]
        self.province   = self.address[-2]
        self.landmarks  = self.get_landmarks(individualPage)
        self.terminals  = self.get_terminals(individualPage)
        self.rooms      = self.get_rooms(individualPage)
        self.ameneties  = self.get_amenities()        # attribute name kept as-is for existing readers
        self.reviews    = self.get_reviews(reviewPage)
        
        # add to DataFrame
        self.add_hotel(dataFrame)
        
    def get_name(self) -> str :
        '''
        Returns:
            hotel_name :: str, taken from the card's 'data-title' attribute
        '''
        return self.card['data-title']
    
    def get_price(self) -> int:
        '''
        Returns
            price :: int
            
        pricing(Php) for 2 adults, one night stay.
        '''
        price = self.card.find('a', class_='price-link').get_text()   # returns the likes of 'P6,221'
        return int(''.join(ch for ch in price if ch.isdigit()))       # keep the digits only: 'P6,221' -> 6221
    
    def get_rating(self) -> float:
        '''
        Returns
            rating :: float
        '''
        rating = self.card.find('strong', class_='guest-reviews-badge').get_text()
        return self._to_float(rating)
    
    def get_address(self) -> list:
        '''
        Returns
            address :: list
        
        full address of the hotel, with format [specific address, city, zipCode, province, country]

        The card text is split on ', '. The last four parts are city, zipCode, province and country;
        whatever comes before them is joined back into one specific address
        (an empty string when there is nothing before the city).
        ''' 
        parts = self.card.find('span', class_='address').get_text().split(', ')
        
        address_cp = parts[-4:]                            # city has index -4 when counted from the right
        address_sp = self.join_specificAdds(parts[:-4])    # concatenate the specific addresses
        
        # list.insert works in place and returns None, so build the result explicitly
        return [address_sp] + address_cp
    
    def get_landmarks(self, individualPage: 'BeautifulSoup') -> list:
        '''
        Returns the list of all landmarks 
        with format:
            landmark - distance

        Parameters
        -----------------------------
        individualPage :: bs4 Object
            the hotel's own page (see get_hotelPageSoup)
        '''
        around    = individualPage.select_one(self._AROUND_QUERY)
        landmarks = around.find(class_='whats-around-content landmarks')
        
        return [landmark.get_text() for landmark in
                landmarks.select('div.landmarks-expandable-wrapper ul.landmark-list li')]
    
    def get_terminals(self, individualPage: 'BeautifulSoup') -> dict:
        '''
        get dictionary of terminals where
        dict.key = transportation type
        dict.value = [Name of Terminal - distance from hotel]

        Parameters
        -----------------------------
        individualPage :: bs4 Object
            the hotel's own page (see get_hotelPageSoup)
        '''
        around     = individualPage.select_one(self._AROUND_QUERY)
        transports = around.find(class_='whats-around-content transport')

        # category['class'] is a list; its first entry is the transportation category
        # (airport, train-station, ...)
        transport_cat = [category['class'][0] for category in transports.select('ul')]
            
        return {category: [station.get_text() for station in
                           transports.select('ul.{0} li'.format(category))]
                for category in transport_cat}
        
    def get_rooms(self, individualPage: 'BeautifulSoup') -> int:
        '''
        Returns the number of rooms in the hotel

        Parameters
        -----------------------------
        individualPage :: bs4 Object
            the hotel's own page (see get_hotelPageSoup)
        '''
        hSize_q1   = '#at-a-glance div.cont-wrap '                              # query is too long for one line, divided in to two parts
        hSize_q2   = 'div.fact-sheet-columns div.col-8-24 div.info-box ul li'   # select_one takes the first matching li, which is the one we need
        hotel_size = individualPage.select_one(hSize_q1 + hSize_q2).get_text()  # text of the form 'This hotel has N rooms'

        # the first purely numeric word of the sentence is the room count;
        # IndexError is raised when the text contains no such word
        return [int(word) for word in hotel_size.split() if word.isdigit()][0]
    
    def get_amenities(self) ->  list:
        '''
        list of all amenities available to hotel
        '''
        amenities = self.card.select_one('.hmvt8258-amenities')   # a single bs4 Object, not a list
        return [amenity.get_text() for amenity in amenities.select('li')]
    
    def get_reviews(self, review_page: 'BeautifulSoup') -> list:
        '''
        Returns a list of the reviews for the hotel (the first 15 review cards at most)
        Each review is a dictionary and has the following items:
        
         'reviewer': name of reviewer
         'rating': rating given by reviewer, as float
         'trip type': type of the trip; how long did the reviewer stay
         'comment': what the reviewer had to say in particular

        Parameters
        -----------------------------
        review_page :: bs4 Object
            the review page of the hotel (see get_reviewPageSoup)
        '''
        def get_content(card, class_name):
            # text of the child of the review card that carries class_name
            return card.find(class_=class_name).get_text()

        def review(card):
            return {'reviewer': get_content(card, 'reviewer'),
                    'rating': self._to_float(get_content(card, 'rating-score')),
                    'trip type': get_content(card, 'trip-type-nights'),
                    'comment': get_content(card, 'expandable-content description')}
        
        review_cards = review_page.select('div.review-card')
        return [review(card) for card in review_cards[0:15]]
        
    def add_hotel(self, dataFrame: 'DataFrame') -> None:
        '''
        Adds the hotel to a dataFrame, in place, as a new last row.

        The features are matched to the columns by position, so the frame must have one column
        per feature in this order: name, price, rating, specific location, city, province,
        landmarks, terminals, rooms, amenities, reviews.
        The new row is labelled len(dataFrame), which assumes the default 0..n-1 index.
        
        Parameters
        -----------------------------
        dataFrame :: Pandas.DataFrame

        Raises
        -----------------------------
        ValueError
            when the number of columns differs from the number of features
        '''
        columns = list(dataFrame.columns)
        if len(columns) != len(self._FEATURE_ATTRS):
            raise ValueError('expected a DataFrame with {0} feature columns, got {1}'
                             .format(len(self._FEATURE_ATTRS), len(columns)))

        # read the attributes without touching self.__dict__, so card/address stay on the instance
        values = [getattr(self, attr) for attr in self._FEATURE_ATTRS]

        # dtype=object keeps list/dict features as single cells;
        # .loc with a new label enlarges the caller's frame in place
        dataFrame.loc[len(dataFrame)] = pd.Series(values, index=columns, dtype=object)
        
    # helper functions
    def get_hotelPageSoup(self) -> 'BeautifulSoup':
        '''
        Gets the individual page of the hotel, following the name link of the card
        '''
        href = self.card.find('a', class_='property-name-link')['href']   # link to the individual hotel information
        return self._fetch_soup(self._BASE_URL + href)
    
    def get_reviewPageSoup(self, idPage: 'BeautifulSoup') -> 'BeautifulSoup':
        '''
        Gets the review page, following the "see all reviews" link of the individual page

        Parameters
        -----------------------------
        idPage :: bs4 Object
            the hotel's own page (see get_hotelPageSoup); it must contain the reviews link
        '''
        revPage_linkQuery = idPage.select_one('div#property-reviews div.see-all-reviews a')
        return self._fetch_soup(self._BASE_URL + revPage_linkQuery['href'])

    def join_specificAdds(self, specific_address: list) -> str:
        '''
        there are instances where there are 3 separate specific address before city hence the need
        specific_address is a list of the addresses before the city

        Returns the addresses joined with ', ' (an empty string for an empty list)
        '''
        return ', '.join(specific_address)

    @staticmethod
    def _to_float(text: str) -> float:
        '''Parse the digits and decimal points found in text as one float.'''
        return float(''.join(ch for ch in text if ch.isdigit() or ch == '.'))

    @classmethod
    def _fetch_soup(cls, page_url: str) -> 'BeautifulSoup':
        '''Download page_url with the shared session and parse it with html.parser.'''
        # NOTE(review): the site sometimes answers with a different html variant;
        # no reload is attempted here.
        if cls._session is None:
            cls._session = HTMLSession()
        page = cls._session.get(page_url)
        return BeautifulSoup(page.content, 'html.parser')
    


In [None]:
def main():
    '''
    Scrape the first batch of hotel cards of the search page into a DataFrame.

    Returns:
    -----------------------
    hotels :: pandas.DataFrame
        the frame created by init_df(), handed to every Hotel so it can add its row
    '''
    # published at module scope: other code in this notebook refers to these names as globals
    # (`webdriver.Chrome(chromedriver_path, options=opts)`, `requests.get(...)`)
    global chromedriver_path, opts, requests

    chromedriver_path = os.path.join(os.getcwd(), 'chromedriver', 'chromedriver.exe')
    opts = Options()
    opts.add_argument('--headless')      # the flag is two plain hyphens; the original held a garbled dash
    requests = HTMLSession()

    hotels     = init_df()   # initialize dataframe
    search_url = url()       # a distinct local name, so the url() function is not shadowed
    
    hotel_cards = load_page(search_url, wait_time=10, batches=1)
    
    # constructing a Hotel scrapes it and adds it to `hotels`
    for hotel_card in hotel_cards:
        Hotel(hotel_card, hotels)

    return hotels

<h3> <b> Test case for querying on single hotel card </b> </h3>
<i> Used as basis for the code above </i>

In [89]:
# one bs4 object per hotel card: the direct <li> children of the listings container
# NOTE(review): hotel_cardsBS is not defined in any cell above -- it comes from earlier kernel state
hotel_cards = hotel_cardsBS.find_all('li', recursive=False)
print(f'we have {len(hotel_cards)} distinct hotels')

we have 26 distinct hotels


<h3> <b> Inspecting one of the hotel cards </b> </h3>
Upon inspection, the following were identified:
<li> an a tag with further data given by the attribute href
<li> address: in _2oHhXM contains address with format: specific, town/city, zip, province, Philippines 
<li> amenities: in ?
<li> price: in ? #rate is one day for 2 persons
<li> the href link has everything else; it must be followed to get the number of rooms
<blockquote>
    <li> number of rooms in:
    <li> customer reviews in:
    <li> add landmarks, and getting around: in 
</blockquote>

In [90]:
# proof of concept: try getting the information from one hotel card first
# the card at index 1 is used as the sample for all the queries below
hotel_sample = hotel_cards[1]

In [91]:
# the hotel name is read from the 'data-title' attribute of the card
hotel_name = hotel_sample['data-title']
hotel_name

'Conrad Manila'

In [9]:
# text of the guest-reviews badge of the card
rating = hotel_sample.find('strong', class_='guest-reviews-badge').get_text()
# keep the digits and the decimal point only, then parse what is left as a float
rating_decimal = float(''.join(ch for ch in rating if ch.isdigit() or ch == '.'))

In [10]:
# select() returns a list of matches; the first match holds the amenities as <li> items
amenities     = hotel_sample.select('.hmvt8258-amenities')
amenities_col = [amenity.get_text() for amenity in amenities[0].select('li')]
amenities_col

['Pool', 'Free parking', 'Airport transfer', 'Spa', 'Gym', 'Restaurant']

In [11]:
# raw address text of the card
address = hotel_sample.find('span', class_='address').get_text()
address       # can be split; address.split(', ') returns the list of location parts
              # index the parts from the end (address.split(', ')[-1] is the last one), i.e. go from back to front instead of front to back

In [12]:
# text of the price link, the likes of 'P6,221'
price = hotel_sample.find('a', class_='price-link').get_text()
# drop every character that is not a digit (currency sign, thousands separator), then convert to int
price_int = int(''.join(filter(str.isdigit, price)))
price_int

6561

In [34]:
# link to the individual hotel page, taken from the name link of the card
href     = hotel_sample.find('a', class_='property-name-link')['href']
href_url = 'https://ph.hotels.com'+ href                                             # this is the link to the individual hotel information

# open the page and get the following:
# number of rooms
# customer reviews
# landmarks

# NOTE: `requests` is an HTMLSession instance here, not the requests library
requests = HTMLSession()

# the site sometimes returns a different html; the same reload logic used for the selenium loading is needed here
page_href = requests.get(href_url) # TODO: reload until the reference html is returned
soup_href = BeautifulSoup(page_href.content, 'html.parser')

# the query is split in two parts because it gets too long for one line
landMarks_q1 = 'body div.resp-section main.inner-section div#property-details '
landMarks_q2 = 'div#flexible-container-bottom div.whats-around-content-landmarks-transport' 
landmarks_transport = soup_href.select_one(landMarks_q1 + landMarks_q2)                        # container of both the landmarks and the transport lists

# text of every landmark list item
landmarks = landmarks_transport.find(class_='whats-around-content landmarks')
landmarks_col = [landmark.get_text() for landmark in landmarks.select('div.landmarks-expandable-wrapper ul.landmark-list li')]

# one <ul> per transport category
transports = landmarks_transport.find(class_='whats-around-content transport')
transport_categories = [category['class'][0] for category in transports.select('ul')]                 # category['class'] returns a list; its first entry is the actual category

# maps every category to the texts of its list items
transpo = lambda cat: [station.get_text() for station in transports.select('ul.{0} li'.format(cat))]  # helper function, gets the list of all transports under category cat, cat :: str
transports_col = {cat:transpo(cat) for cat in transport_categories}

In [35]:
# display the landmark texts collected from the hotel page
landmarks_col

['In Barangay 76',
 'SM Mall of Asia - 8 min walk',
 'SMX Convention Center - 3 min walk',
 'Mall of Asia Arena - 6 min walk',
 'World Trade Center Manila - 33 min walk',
 'Cultural Center of the Philippines - 44 min walk',
 "Children's Museum (Museo Pambata) - 4.2 mi / 6.7 km",
 'Rizal Park - 4.4 mi / 7.1 km']

In [36]:
# display the transports collected from the hotel page, grouped by category
transports_col

{'airport': ['Ninoy Aquino Intl. Airport (MNL) - 20 min drive '],
 'train-station': ['Manila Buenidia Station - 6 min drive ',
  'Manila Vito Cruz Station - 7 min drive ',
  'Manila Paco Station - 7 min drive '],
 'shuttle': ['Airport shuttle (surcharge)']}

In [38]:
# the query is kept in two pieces only to keep the lines short
hSize_q1 = '#at-a-glance div.cont-wrap '
hSize_q2 = 'div.fact-sheet-columns div.col-8-24 div.info-box ul li'
# there are two ul li instances in the html; select_one always picks the first one, which is the one needed here
hotel_size = soup_href.select_one(f'{hSize_q1}{hSize_q2}').get_text()

# convert every purely numeric word of the text to int and keep the first one
room_count = list(map(int, filter(str.isdigit, hotel_size.split())))[0]
room_count

In [43]:
# open another page for the reviews
# starting from the individual hotel card
see_allRevs = soup_href.select_one('div#property-reviews div.see-all-reviews a')
allRevs_url = 'https://ph.hotels.com' + see_allRevs['href']       # to review


reviews_page = requests.get(allRevs_url)
reviews_soup = BeautifulSoup(reviews_page.content, 'html.parser')

review_cards = reviews_soup.select('div.review-card')

# needs another recursion function see first implementation; addresses problem when browser loads different html, keep reloading
# until the right html -- review_cards is not NoneType is loaded.

review_cards
# get only 15

# sample query for one card
# get the name, rating, trip type and the review , delete this later
name = review_cards[0].find(class_='reviewer').get_text()
rating = review_cards[0].find(class_='rating-score').get_text()
trip_type = review_cards[0].find(class_='trip-type-nights').get_text()
comment = review_cards[0].find(class_='expandable-content description').get_text()



# implementation
# the logic of the ff follows from the logic of the ff query :: name = review_cards[0].find(class_='reviewer').get_text()
class_names = ['reviewer', 'rating-score', 'trip-type-nights', 'expandable-content description'] # this are the names of the classes
get_content = lambda card, class_name: card.find(class_= class_name).get_text()
review      = lambda card: {'reviewer': get_content(card, class_names[0]), 
                            'rating': get_content(card, class_names[1]),
                            'trip type': get_content(card, class_names[2]),
                            'comment': get_content(card, class_names[3])
                            }

reviews = [review(card) for card in review_cards[0:15]]   # get formatted card reviews