In [2]:
from lxml import etree
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import numpy as np
import requests


options = Options()
options.add_argument("--headless")
#options.add_argument("--start-maximized")
#options.add_argument("--disable-notifications")
options.add_argument("--incognito")


# Functions

In [18]:
def page_source_from_selenium(url):
    driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)


    total_height = int(driver.execute_script("return document.body.scrollHeight"))

    for i in range(1, total_height, 5):
        driver.execute_script("window.scrollTo(0, {});".format(i))

    page_source=driver.page_source


    print("Done Exporting Data")
    driver.close()
    driver.quit()
    print("Closed Driver")
    return page_source

def scrape_features_from_page(url):
    page_source=page_source_from_selenium(url)
    soup=bs4.BeautifulSoup(page_source,'lxml')
    dom = etree.HTML(str(soup))
    
    
    property_type=[i.text for i in dom.xpath("//span[@class='e2f34d59b1']") ][0]
    if property_type!="Hotel":
        print ("Non-Hotel- Skipping Link")
        return pd.DataFrame()
    
    else:
        hotel_name=[i.text for i in dom.xpath("//h2[@class='d2fee87262 pp-header__title']") ][0]
        _stars=[i for i in dom.xpath("//span[@data-testid='rating-circles']") ]
        stars=len(_stars[0].findall("span[@aria-hidden]"))
        location=[i.text.replace("\n","") for i in dom.xpath("//span[contains(@class, 'address')]")][0]
        review_rating=[i.text.replace("\n","") for i in dom.xpath("//div[contains(@aria-label, 'Scored')]")][0]
        location_score=[i.text.replace("\n","") for i in dom.xpath("//span[@class='review-score-badge']")][0]
        descriptions=[i.findall('p') for i in dom.xpath("//div[@id='property_description_content']")]
        description=" ".join([i.text for i in descriptions[0]])

        main_facilities=[]
        facilities_=[i for i in dom.xpath("//div[@class='hp_desc_important_facilities clearfix hp_desc_important_facilities--bui']")]

        for i in facilities_:
            #print(i.findall("div"))
            for ii in i.findall("div"):
                main_facilities.append(ii.get("data-name-en"))

        total_reviews=[i.text for i in dom.xpath("//div[@class='d8eab2cf7f c90c0a70d3 db63693c62']")][0].split(" ")[0]
        sub_ratings=[i.text for i in dom.xpath("//div[@class='ee746850b6 b8eef6afe1']")][:7]
        sub_ratings_categories=[i.text for i in dom.xpath("//span[@class='d6d4671780']")][:7]
        print(len(sub_ratings),len(sub_ratings_categories))

        sub_ratings_dict={}
        for i in range(len(sub_ratings)):
            #print(sub_ratings[i])
            #print(sub_ratings_categories[i])
            sub_ratings_dict[sub_ratings_categories[i]] = sub_ratings[i]

        #-----------------------------------------------    
        hotel_surroundings=[i.text for i in dom.xpath("//div[@class='b1e6dd8416 aacd9d0b0a']|//span[@class='b6f930dcc9']") if i.text is not None]
        hotel_surroundings_distance=[i.text for i in dom.xpath("//div[@class='db29ecfbe2 c90c0a70d3']") if i.text is not None]

        print("surroundings qc")
        print(len(hotel_surroundings), len(hotel_surroundings_distance))
        surroundings_dict = {hotel_surroundings[i]: hotel_surroundings_distance[i] for i in range(len(hotel_surroundings))}


        price_list=[float(i.text.replace("\n","").strip().replace(u'₱\xa0', '').replace(",",""))
         for i in dom.xpath("//span[@class='prco-valign-middle-helper']")]

        cheapest_price=min(price_list)

        facilities_groups=[i.replace("\n","") for i in dom.xpath("//div[@class='bui-title__text hotel-facilities-group__title-text']//text()") if (i!='\n') & (i.replace("\n","")!="Internet")]
        hfg=dom.xpath("//div[@class='bui-spacer--large']")
        all_facilities=[]
        for i in hfg:
            data=i.findall("div//div//div")
            facilities_per_group=[i.text.replace("\n","") for i in data if (i.text!="\n") & (i.text!=None)]
            if facilities_per_group !=[]:
                all_facilities.append(facilities_per_group)
            #print("")

        print("facilities qc")
        print(len(facilities_groups), len(all_facilities))
        facilities_dict={facilities_groups[i]: all_facilities[i] for i in range(len(facilities_groups))}
        internet_desc=[i.text.replace("\n","") for i in dom.xpath("//div[@class='bui-spacer--medium hotel-facilities-group__policy']") if ("wifi" in (i.text.lower()))]
        facilities_dict["Internet"]=internet_desc



        #-----------------------------------------------   
        all_features={"hotel_name_":hotel_name,"stars":stars,
                  "location":location , "location_score":location_score, 
                  "review_rating":review_rating, "description":description,
                  "main_facilities":main_facilities, "total_reviews":total_reviews,
                  "sub_ratings":sub_ratings, "sub_ratings_categories":sub_ratings_categories,"sub_ratings_dict":sub_ratings_dict,
                  "hotel_surroundings":hotel_surroundings, "hotel_surroundings_distance":hotel_surroundings_distance,"surroundings_dict":surroundings_dict,
                  "price_list":price_list,"cheapeast_price":cheapest_price,
                  "facilities_groups":facilities_groups,"all_facilities":all_facilities,"facilities_dict":facilities_dict         
    }


        df_all_features=pd.DataFrame({k: pd.Series([v]) for k,v in all_features.items()})
        return df_all_features


# Webscrape per page

In [30]:
from tqdm import tqdm
df=pd.read_csv("data/all_manila_hotels.csv")
links=df.links

df_hotels_consolidated=pd.DataFrame()

start_index=0
end_index=10

for count,i in tqdm(enumerate(links[start_index:end_index])):
    print("Opening link- "+ str(count) + " " + str(i))
    print(df.hotel_name[count+start_index])
    try:
        df_to_append=scrape_features_from_page(i)
        df_to_append["link"]=links[count+start_index]
        df_to_append["hotel_name_from_all_urls"]=df.hotel_name[count+start_index]
        df_to_append["location_from_all_urls"]=df.location[count+start_index]
        df_to_append["distance_from_centre"]=df.distance[count+start_index]
        df_hotels_consolidated=pd.concat([df_hotels_consolidated,df_to_append])
    except Exception as e: 
        print("")
        print("Error:")
        print(str(e))
        print("Skipping due to error")
        continue
        
    print(" ")

0it [00:00, ?it/s]

Opening link- 0 https://www.booking.com/hotel/ph/diamond-philippines.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=1&hapos=1&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=29318601_0_2_1_0&highlighted_blocks=29318601_0_2_1_0&matching_block_id=29318601_0_2_1_0&sr_pri_blocks=29318601_0_2_1_0__15305&tpi_r=1&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


1it [00:38, 38.78s/it]

7 7
surroundings qc
30 30
facilities qc
19 19
 
Opening link- 1 https://www.booking.com/hotel/ph/peaceful-and-beachy-staycation-azure.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=2&hapos=2&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=399263001_202857528_4_0_0&highlighted_blocks=399263001_202857528_4_0_0&matching_block_id=399263001_202857528_4_0_0&sr_pri_blocks=399263001_202857528_4_0_0__185980&tpi_r=2&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


2it [01:15, 37.47s/it]

Non-Hotel- Skipping Link
 
Opening link- 2 https://www.booking.com/hotel/ph/airobedz-makati-makati1.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=3&hapos=3&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=817229002_0_2_0_0&highlighted_blocks=817229002_0_2_0_0&matching_block_id=817229002_0_2_0_0&sr_pri_blocks=817229002_0_2_0_0__1575&tpi_r=1&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


3it [01:51, 36.98s/it]

7 7
surroundings qc
34 34
facilities qc
12 12
 
Opening link- 3 https://www.booking.com/hotel/ph/oyo-842-city-smiles-apartelle.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=4&hapos=4&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=804067804_357105001_2_0_0&highlighted_blocks=804067804_357105001_2_0_0&matching_block_id=804067804_357105001_2_0_0&sr_pri_blocks=804067804_357105001_2_0_0__84424&tpi_r=2&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


4it [02:24, 35.46s/it]

7 7
surroundings qc
35 35
facilities qc
5 5
 
Opening link- 4 https://www.booking.com/hotel/ph/alicia-apartment.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=5&hapos=5&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=258460202_346394998_2_41_0&highlighted_blocks=258460202_346394998_2_41_0&matching_block_id=258460202_346394998_2_41_0&sr_pri_blocks=258460202_346394998_2_41_0__180000&tpi_r=2&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


5it [03:02, 36.13s/it]

7 7
surroundings qc
35 35
facilities qc
12 11
list index out of range
Skipping due to error
Opening link- 5 https://www.booking.com/hotel/ph/anex-manila.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=6&hapos=6&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=691307804_0_2_0_0&highlighted_blocks=691307804_0_2_0_0&matching_block_id=691307804_0_2_0_0&sr_pri_blocks=691307804_0_2_0_0__3399&tpi_r=1&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


6it [03:39, 36.39s/it]

7 7
surroundings qc
33 33
facilities qc
13 13
 
Opening link- 6 https://www.booking.com/hotel/ph/budget-room-for-you-kassel-residences.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=7&hapos=7&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=878677803_360657628_2_0_0&highlighted_blocks=878677803_360657628_2_0_0&matching_block_id=878677803_360657628_2_0_0&sr_pri_blocks=878677803_360657628_2_0_0__108000&tpi_r=2&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


7it [04:14, 35.91s/it]

Non-Hotel- Skipping Link
 
Opening link- 7 https://www.booking.com/hotel/ph/airobedz-mckinley.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=8&hapos=8&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=818861401_0_2_0_0&highlighted_blocks=818861401_0_2_0_0&matching_block_id=818861401_0_2_0_0&sr_pri_blocks=818861401_0_2_0_0__1912&tpi_r=1&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


8it [04:49, 35.75s/it]

7 7
surroundings qc
32 32
facilities qc
9 9
 
Opening link- 8 https://www.booking.com/hotel/ph/rooms-r-us-evangelista.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=9&hapos=9&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=899259602_360632428_2_0_0&highlighted_blocks=899259602_360632428_2_0_0&matching_block_id=899259602_360632428_2_0_0&sr_pri_blocks=899259602_360632428_2_0_0__95000&tpi_r=2&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


9it [05:27, 36.43s/it]

Non-Hotel- Skipping Link
 
Opening link- 9 https://www.booking.com/hotel/ph/house-of-b-amp-y-across-naia-t3.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaLQBiAEBmAExuAEXyAEM2AEB6AEB-AECiAIBqAIDuALP2NGbBsACAdICJGRiYTg5ODViLTY0OWYtNDJmMC05MmQ2LTQzYTNhOTM0OGMzNtgCBeACAQ&sid=62995f74fed87079ecbbad21978f8cb9&aid=304142&ucfs=1&arphpl=1&checkin=2022-11-25&checkout=2022-11-26&dest_id=-2437894&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=10&hapos=10&sr_order=popularity&srpvid=e9da55d35eaa04a2&srepoch=1668773545&all_sr_blocks=925992301_364395165_2_0_0&highlighted_blocks=925992301_364395165_2_0_0&matching_block_id=925992301_364395165_2_0_0&sr_pri_blocks=925992301_364395165_2_0_0__134640&tpi_r=2&from=searchresults#hotelTmpl
Done Exporting Data
Closed Driver


10it [06:01, 36.12s/it]

Non-Hotel- Skipping Link
 





In [31]:
df_hotels_consolidated

Unnamed: 0,hotel_name,stars,location,location_score,review_rating,description,main_facilities,total_reviews,sub_ratings,sub_ratings_categories,...,surroundings_dict,price_list,cheapeast_price,facilities_groups,all_facilities,facilities_dict,link,hotel_name_from_all_urls,location_from_all_urls,distance_from_centre
0,Diamond Hotel Philippines - Multiple Use Hotel,5.0,"Dr. J. Quintos cor. Roxas Blvd Malate, 1000 Ma...",8.5,7.9,"Located along Manila Bay, a 30-minute drive fr...","[Swimming pool, Non Smoking Rooms, Fitness Roo...",209,"[8.3, 8.0, 8.1, 8.3, 7.4, 8.5, 7.3]","[Staff, Facilities, Cleanliness, Comfort, Valu...",...,"{'Pedro Gil Park': '150 m', 'Manila Baywalk': ...","[6289.0, 12512.0, 11960.0, 12512.0, 11960.0]",6289.0,"[Bathroom, Bedroom, View, Outdoors, Kitchen, R...","[[Toilet paper, Towels, Bidet, Additional toil...","{'Bathroom': ['Toilet paper', 'Towels', 'Bidet...",https://www.booking.com/hotel/ph/diamond-phili...,Diamond Hotel Philippines - Multiple Use Hotel,"Manila Bay, Manila",2.7 km from centre
0,airobedz MAKATI - near Cash & Carry Mall,3.0,"5487 Boyle Street, Palanan, Makati, 1235 Manil...",9.0,8.5,You're eligible for a Genius discount at airob...,"[Non Smoking Rooms, Free WiFi Internet Access ...",690,"[9.0, 8.6, 9.0, 8.9, 8.9, 8.2, 8.0]","[Staff, Facilities, Cleanliness, Comfort, Valu...",...,"{'Marian Quadrangle': '1,000 m', 'Amphitheater...","[1485.0, 1800.0, 2115.0, 1188.0]",1188.0,"[Bathroom, Bedroom, Kitchen, Room Amenities, L...","[[Toilet paper, Towels, Bidet, Bath or shower,...","{'Bathroom': ['Toilet paper', 'Towels', 'Bidet...",https://www.booking.com/hotel/ph/airobedz-maka...,airobedz MAKATI - near Cash & Carry Mall,"Makati, Manila",4.1 km from centre
0,OYO 842 City Smiles Apartelle,3.0,"85, 85 10th Avenue, Cubao, Quezon City, Quezon...",8.4,7.2,You're eligible for a Genius discount at OYO 8...,"[Non Smoking Rooms, Free WiFi Internet Access ...",69,"[8.3, 7.4, 7.6, 7.5, 7.9, 8.3, 8.4]","[Staff, Facilities, Cleanliness, Comfort, Valu...",...,{'San Martin de Porres Barangay Covered Court'...,"[676.0, 878.0]",676.0,"[Parking, Services, Safety & security, General...","[[Accessible parking], [Daily housekeeping], [...","{'Parking': ['Accessible parking'], 'Services'...",https://www.booking.com/hotel/ph/oyo-842-city-...,OYO 842 City Smiles Apartelle,"Quezon City, Manila",8.2 km from centre
0,Anex Hotel near US Embassy,3.0,"1125 M. H. Del Pilar Street, 1000 Manila, Phil...",8.8,7.4,You're eligible for a Genius discount at Anex ...,"[Airport Shuttle, Non Smoking Rooms, Room-serv...",278,"[8.8, 7.3, 7.6, 7.6, 7.4, 8.6, 7.6]","[Staff, Facilities, Cleanliness, Comfort, Valu...",...,"{'Plaza Nuestra Señora de Guia': '50 m', 'Muse...","[1620.0, 2160.0, 1890.0, 2430.0, 2250.0, 2790....",1350.0,"[Bathroom, Bedroom, Kitchen, Living Area, Medi...","[[Toilet paper, Towels, Bidet, Private bathroo...","{'Bathroom': ['Toilet paper', 'Towels', 'Bidet...",https://www.booking.com/hotel/ph/anex-manila.e...,Anex Hotel near US Embassy,"Manila Bay, Manila",2 km from centre
0,airobedz MCKINLEY - near SM Aura,2.0,"212 Sampaguita Street, Pembo (near SM Aura), M...",9.0,8.5,You're eligible for a Genius discount at airob...,"[Non Smoking Rooms, Free WiFi Internet Access ...",370,"[9.0, 8.4, 8.9, 8.8, 8.9, 8.3, 7.4]","[Staff, Facilities, Cleanliness, Comfort, Valu...",...,"{'Sky Park': '600 m', 'Pateros Park - Plaza de...","[1350.0, 1575.0, 1170.0]",1170.0,"[Bathroom, Bedroom, Kitchen, Media & Technolog...","[[Toilet paper, Towels, Bidet, Bath or shower,...","{'Bathroom': ['Toilet paper', 'Towels', 'Bidet...",https://www.booking.com/hotel/ph/airobedz-mcki...,airobedz MCKINLEY - near SM Aura,"Makati, Manila",10 km from centre
