In [1]:
import pandas as pd
import os

from bs4 import BeautifulSoup
from pandas import DataFrame
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep

def load_page(url: str, wait_time: int, batches: int, chromedriver_path: str,
              opts: Options, max_retries: int = 10) -> list:
    '''
    Load the search page in Chrome, scroll it to pull in more hotel cards and
    return the cards as BeautifulSoup tags.

    The site sometimes returns an html layout that differs from the reference
    layout. When the listing container cannot be found with the reference
    selector, the browser is closed and the page is loaded again in a new
    browser, up to `max_retries` times. The browser is always closed before
    this function returns or raises.

    Parameters:
    -----------------------
    url :: str
        the url of the parent page

    wait_time :: int
        The pause time (seconds) after each scroll down, gives the browser
        time to load the next batch of hotel cards

    batches :: int
        The number of batches wanted. The first batch is the one present on
        the initial page, so `batches - 1` scroll downs are executed.

    chromedriver_path :: str
        path of the chromedriver executable, passed to webdriver.Chrome

    opts :: Options
        Chrome options, passed to webdriver.Chrome

    max_retries :: int
        maximum number of page loads attempted before giving up

    Returns:
    -----------------------
    hotel_cards :: list of bs4 tags
        the direct `li` children of the listing container

    Raises:
    -----------------------
    RuntimeError
        if the reference layout was not returned in `max_retries` attempts
    '''
    selector = ('body .resp-section .inner-section .resp-row .resp-col '
                + '.main-inner section .h-listing .listings')
    # scroll down to the bottom of the page
    scroll_script = 'window.scrollTo(0, document.body.scrollHeight,)'

    for attempt in range(max_retries):
        driver = webdriver.Chrome(chromedriver_path, options=opts)
        try:
            driver.maximize_window()
            driver.get(url)

            # the container is only missing when the html loaded is different
            # from the reference html; in that case start over
            soup_init = BeautifulSoup(driver.page_source, 'html.parser')
            if soup_init.select_one(selector) is None:
                continue

            # one scroll per extra batch, then wait for the cards to load;
            # adjust wait_time according to net speed
            for _ in range(1, batches):
                driver.execute_script(scroll_script)
                sleep(wait_time)

            print('load_page done')
            # the resulting html after the scroll downs
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            hotel_cards_container = soup.select_one(selector)
            return hotel_cards_container.find_all('li', recursive=False)
        finally:
            # runs on return, on `continue` and on errors: never leak a browser
            driver.quit()

    raise RuntimeError('load_page: reference html layout was not returned '
                       'after {0} attempts'.format(max_retries))


def request(url: str, init_query: str, requests: HTMLSession,
            max_retries: int = 10) -> BeautifulSoup:
    '''
    Fetch `url` and return its parsed html once it matches the reference
    layout.

    Follows the same logic as the selenium page loader, but for plain soup
    objects: the page is fetched again whenever `init_query` selects nothing,
    up to `max_retries` times.

    Parameter:
    ----------------------------------------------------------------------------
    url :: str
        the url of the page to fetch

    init_query :: str
        css selector built from attribute names of the reference html; a page
        on which it selects nothing is treated as a different layout

    requests :: HTMLSession
        the session used to perform the GET requests

    max_retries :: int
        maximum number of fetches attempted before giving up

    Return:
    ----------------------------------------------------------------------------
    soup_init :: bs4Object
        the parsed page on which `init_query` matched

    Raises:
    ----------------------------------------------------------------------------
    RuntimeError
        if no fetched page matched `init_query` in `max_retries` attempts

    Note:
    ----------------------------------------------------------------------------
        requests.get(url) randomly returns different html formats, fortunately
        the set of html formats returned is finite. Reload the page until the
        reference format is returned.
    '''
    for _ in range(max_retries):
        page = requests.get(url)
        soup_init = BeautifulSoup(page.content, 'html.parser')

        # if this is None, the html loaded is different from the reference
        if soup_init.select_one(init_query) is not None:
            return soup_init

    raise RuntimeError('request: reference html layout was not returned '
                       'after {0} attempts for {1}'.format(max_retries, url))
   

def init_df() -> DataFrame:
    '''
    Build the empty hotel table.

    Returns:
    -----------------------
    DataFrame with no rows and one column per hotel feature. The column order
    follows the order in which the Hotel class defines its features.
    '''
    feature_columns = (
        'hotel_name', 'price', 'rating',
        'loc', 'city', 'province',
        'landmarks', 'terminals', 'rooms',
        'ameneties', 'reviews',
    )
    return pd.DataFrame(columns=list(feature_columns))


def url_req() -> str:
    '''
    Returns the hotels.com search url used as the parent page: destination
    Philippines, check-in 2021-10-10, check-out 2021-10-11, one room for two
    adults and no children.
    '''
    endpoint = 'https://ph.hotels.com/search.do'
    query_parts = [
        'resolved-location=COUNTRY%3A10233139%3AUNKNOWN%3AUNKNOWN',
        'destination-id=10233139',
        'q-destination=Philippines',
        'q-check-in=2021-10-10',
        'q-check-out=2021-10-11',
        'q-rooms=1',
        'q-room-0-adults=2',
        'q-room-0-children=0',
    ]
    return endpoint + '?' + '&'.join(query_parts)

def add_hotels(cards: list, df: DataFrame):
    '''
    Build one Hotel per hotel card.

    Each card is handed to the Hotel constructor together with `df`; the
    constructor is responsible for extracting the features and adding them
    to the dataFrame.
    '''
    remaining = iter(cards)
    for hotel_card in remaining:
        Hotel(hotel_card, df)


def save_hotels(df: DataFrame):
    '''
    Write the hotel dataFrame to 'data/Hotel.csv' (relative to the current
    working directory), creating the 'data' directory when it is missing.
    '''
    file = 'data/Hotel.csv'
    # to_csv does not create missing directories, so make sure it exists
    os.makedirs(os.path.dirname(file), exist_ok=True)
    df.to_csv(file)


def extract_data() -> list:
    '''
    Run the whole scrape: load the search page, build one Hotel per hotel
    card, save the hotel dataFrame to csv.

    Returns:
    -----------------------
    hotel_cards :: list
        the hotel cards returned by load_page, so they can be inspected
        after the run
    '''
    # chromedriver is expected in <cwd>/chromedriver/chromedriver.exe
    chromedriver_path = os.path.join(os.getcwd(), 'chromedriver',
                                     'chromedriver.exe')
    opts = Options()
    # the switch is two ASCII hyphens with no spaces
    opts.add_argument('--headless')

    # initialize dataframe
    hotels = init_df()

    # list of BeautifulSoup hotel cards
    hotel_cards = load_page(url=url_req(), wait_time=10,
                            batches=1,
                            chromedriver_path=chromedriver_path,
                            opts=opts
                            )
    add_hotels(hotel_cards, hotels)      # add hotels
    save_hotels(hotels)                  # save hotels dataFrame to csv

    return hotel_cards


In [None]:
class Hotel:
    '''
    One hotel, built from a search-result card.

    Constructing a Hotel extracts the features from the card, from the
    hotel's own page and from its review page, then adds them as one row to
    the dataFrame passed to the constructor.
    '''

    # container of the "what's around" section (landmarks and transport) on
    # the individual hotel page; also used to recognise the reference layout
    _AROUND_SELECTOR = ('body div.resp-section main.inner-section '
                        + 'div#property-details '
                        + 'div#flexible-container-bottom '
                        + 'div.whats-around-content-landmarks-transport')

    # attributes that only help the extraction and are not hotel features
    _HELPER_ATTRIBUTES = ('card', 'address')

    def __init__(self, hotel_card: BeautifulSoup, dataFrame: DataFrame):
        '''
        Initialize a hotel, where hotel_card is a bs4 Object containing the
        hotel data, and add the hotel to dataFrame.

        The individual page and the review page are fetched here through the
        module-level `request` function.
        '''

        # helper variables
        self.card = hotel_card
        self.address = self.get_address()

        individualPage = self.get_hotelPageSoup()
        reviewPage = self.get_reviewPageSoup(individualPage)

        # the features
        self.hotel_name = self.get_name()
        self.price = self.get_price()
        self.rating = self.get_rating()
        # address is [specific address, city, zipCode, province, country]
        self.loc = self.address[0]
        self.city = self.address[-4]
        self.province = self.address[-2]
        self.landmarks = self.get_landmarks(individualPage)
        self.terminals = self.get_terminals(individualPage)
        self.rooms = self.get_rooms(individualPage)
        self.ameneties = self.get_amenities()
        self.reviews = self.get_reviews(reviewPage)

        # add to DataFrame
        self.add_hotel(dataFrame)

    def get_name(self) -> str:
        '''
        Returns:
            hotel_name :: str, the card's 'data-title' attribute
        '''
        return self.card['data-title']

    def get_price(self) -> int:
        '''
        Returns
            price :: int

        pricing(Php) for 2 adults, one night stay.
        '''
        # returns the likes of 'P6,221'
        price = self.card.find('a', class_='price-link').get_text()

        # keep only the digit characters and read them as one integer
        return int(''.join(char for char in price if char.isdigit()))

    def get_rating(self) -> float:
        '''
        Returns
            rating :: float
        '''
        rating = (self.card.find('strong',
                                 class_='guest-reviews-badge').get_text())
        return self._to_float(rating)

    def get_address(self) -> list:
        '''
        Returns
            address :: list

        full address of the hotel, as a list with format
        [specific address, city, zipCode, province, country]
        '''

        address_ = self.card.find('span', class_='address').get_text()
        address = address_.split(', ')                     # to get the list

        # city has index -4 when counting from the right, this is the list ...
        # ... of locs from city -> country
        address_cp = address[-4:]

        # everything before the city is merged into one specific address
        address_sp = self.join_specificAdds(address[:-4])
        address_cp.insert(0, address_sp)
        return address_cp

    def get_landmarks(self, individualPage: BeautifulSoup) -> list:
        '''
        Returns the list of all landmarks
        with format:
            landmark - distance
        '''
        around = individualPage.select_one(self._AROUND_SELECTOR)
        landmarks = around.find(class_='whats-around-content landmarks')

        landmarks_col = [landmark.get_text() for landmark in
                         landmarks.select('div.landmarks-expandable-wrapper '
                                          + 'ul.landmark-list li')]

        return landmarks_col

    def get_terminals(self, individualPage: BeautifulSoup) -> dict:
        '''
        get dictionary of terminals where
        dict.key = transportation type
        dict.value = [Name of Terminal - distance from hotel]
        '''
        transpo_soup = individualPage.select_one(self._AROUND_SELECTOR)
        transports = transpo_soup.find(class_='whats-around-content transport')

        # category['class'] is a list, its first entry is used as the ...
        # ... category of transportation (airport, train-station, etc)
        transport_cat = [category['class'][0] for category in
                         transports.select('ul')]

        # helper function, returns the list of all terminals with category cat
        def terminal_query(cat):
            return [station.get_text() for station in
                    transports.select('ul.{0} li'.format(cat))]

        return {category: terminal_query(category)
                for category in transport_cat}

    def get_rooms(self, individualPage: BeautifulSoup) -> int:
        '''
        Returns the number of rooms in the hotel
        '''
        hotelSize_query = ('#at-a-glance div.cont-wrap '
                           + 'div.fact-sheet-columns div.col-8-24 '
                           + 'div.info-box ul li')

        # several elements match the query, select_one returns the first ...
        # ... one, which is the one we need
        hotel_size = individualPage.select_one(hotelSize_query).get_text()

        # hotel_size looks like 'This hotel has N rooms'; the first word ...
        # ... made of digits only is taken as the room count.
        # Raises IndexError when the text holds no such word.
        room_count = [int(word) for word in hotel_size.split()
                      if word.isdigit()][0]

        return room_count

    def get_amenities(self) -> list:
        '''
        list of all amenities available to hotel
        '''
        amenities = self.card.select_one('.hmvt8258-amenities')
        return [amenity.get_text() for amenity in amenities.select('li')]

    def get_reviews(self, review_page: BeautifulSoup) -> list:
        '''
        Returns a list of the reviews for the hotel (at most the first 15
        review cards of the page).
        Each review is a dictionary and has the following items:

         'reviewer': name of reviewer
         'rating': rating given by reviewer
         'trip type': type of the trip; how long did the reviewer stay
         'comment': what the reviewer had to say in particular
        '''

        # text of the first element of the card with the given class
        def get_content(card, class_name):
            return card.find(class_=class_name).get_text()

        review_cards = review_page.select('div.review-card')

        return [
            {
                'reviewer': get_content(card, 'reviewer'),
                'rating': self._to_float(get_content(card, 'rating-score')),
                'trip type': get_content(card, 'trip-type-nights'),
                'comment': get_content(card,
                                       'expandable-content description'),
            }
            for card in review_cards[0:15]
        ]

    def add_hotel(self, dataFrame):
        '''
        Adds the hotel as a new last row of dataFrame, in place.

        Only the features are added: the helper attributes (card, address)
        are left out, and values are matched to the dataFrame columns by name.
        The row label is len(dataFrame), which assumes the default 0..n-1
        index.

        Parameters
        -----------------------------
        dataFrame :: Pandas.DataFrame
        '''
        # work on a copy so the instance keeps all of its attributes
        hotel = {name: value for name, value in vars(self).items()
                 if name not in self._HELPER_ATTRIBUTES}

        # DataFrame.append returned a new frame (and no longer exists in
        # pandas 2), so the row is written through .loc, which modifies the
        # caller's dataFrame. dtype=object keeps list/dict features intact.
        dataFrame.loc[len(dataFrame)] = pd.Series(hotel, dtype=object)

    # helper functions
    @staticmethod
    def _to_float(text: str) -> float:
        '''
        Keep the digits and '.' characters of text and read them as a float
        '''
        return float(''.join(char for char in text
                             if char.isdigit() or char == '.'))

    def get_hotelPageSoup(self) -> BeautifulSoup:
        '''
        Gets the individual hotel page linked from the hotel card
        '''

        # get the text in the attribute href
        href = self.card.find('a', class_='property-name-link')['href']
        # this is the link to the individual hotel information
        href_url = 'https://ph.hotels.com' + href

        # the session is closed as soon as the page has been parsed
        with HTMLSession() as session:
            return request(href_url, self._AROUND_SELECTOR, session)

    def get_reviewPageSoup(self, idPage: BeautifulSoup) -> BeautifulSoup:
        '''
        Gets the review page linked from the individual hotel page
        '''

        revPage_linkQuery = idPage.select_one(
            'div#property-reviews div.see-all-reviews a')

        reviewPage_link = 'https://ph.hotels.com' + revPage_linkQuery['href']
        init_query = 'div.review-card'

        # the session is closed as soon as the page has been parsed
        with HTMLSession() as session:
            return request(reviewPage_link, init_query, session)

    def join_specificAdds(self, specific_address: list) -> str:
        '''
        Joins the addresses that come before the city into one string,
        separated by ', '. There are instances with up to 3 such addresses.

        specific_address is the list of the addresses before the city; an
        empty list gives an empty string.
        '''
        return ', '.join(specific_address)

In [None]:
# Run the full pipeline: load the search page, build the Hotel rows and
# save the dataFrame to csv (see extract_data).
extract_data()

## Test for individual data

In [3]:
# Keep whatever extract_data() returns so it can be inspected in the
# cells below.
p = extract_data()

load_page done


In [4]:
# Display p as is. NOTE(review): this dumps the raw html of every hotel
# card, so the output is very long.
p

[<li class="hotel sponsored" data-accommodation-type="_ACC_TYPE_NAME@Hotel" data-hotel-id="113079" data-image-id="7151684" data-image-tracking="DataScience|$#|herobandit.v20201116-contextual-heroaesthetics.v20200130-hsc.v20190223|$#|20210409" data-info="1|113079|EC||LOCAL|4250" data-os-property-id="113079" data-os-property-image-id="7151684" data-os-property-rank="0" data-pimms="DoubleStamps|D13|TESCO" data-provider-type="LOCAL" data-title="The Heritage Hotel Manila"><article><section class="hotel-wrap"><div class="description resp-module"><h3 class="p-name"><span class="sponsored-badging">Ad</span><a class="property-name-link" href="/travelads/trackredirect.html?trackingUrl=H4sIAAAAAAAAACVUya6rSgz8m7O65NANBPpK0RNTIBOBhClsUEMDgTCEmeTrb855XliqcpUte-H7MDz7v9_f8zyvkgyTKemGvM_rbBU31feAn_33BL7l8r8h6Qc36fq8qc8fUZeTZMOiNVjRfxDNfjLP_QIeAm4F_ggQCB8k8ACs4B8BQWHFfIT0L8mw_K_t1ycgDv74EIK_yo_9U-Mh-nRBPFj_IGH9o0SI_akBGnA_I5AgsP9D4UN_dbh-bMBXjGuSEzwkeqWNOdlESUwnDI0ozMSIYuOUp3BKEMUQBAGNWT4h9NfQ4fjxWVrB

In [5]:
# Fresh, empty dataFrame with the hotel feature columns (see init_df).
df = init_df()

In [6]:
# The 'data-title' attribute of the first hotel card.
p[0]['data-title']

'The Heritage Hotel Manila'

In [None]:
# Build a single Hotel from the first card, then show its attributes.
# NOTE(review): the original trailing comment was Tagalog ("sa internet
# ata"), roughly "probably the internet" - presumably this cell is slow or
# fails because of the network connection; confirm with the author.
hotel = Hotel(p[0],df)    # probably the internet (translated, see note above)
hotel.__dict__

In [1]:
# Display the hotel dataFrame. NOTE(review): the recorded output is a
# NameError and the execution count restarted at 1, so this cell appears to
# have been run in a fresh kernel before `df` was defined - re-run the cells
# above first.
df

NameError: name 'df' is not defined