In [1]:
import random
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import logging 
from datetime import datetime
import asyncio
from os import listdir
import aiofiles
import nest_asyncio
nest_asyncio.apply()

In [2]:
# Configure the root logger to write DEBUG-and-above records to a per-day log
# file named '<MM_DD>scrapelocal.log' in the current working directory.
logger = logging.getLogger()
dstamp = datetime.now().strftime('%m_%d')
# Re-running this cell used to stack one more FileHandler on the root logger
# each time (duplicated log lines, several writers truncating the same file).
# Drop and close any handler this cell installed earlier so it is idempotent.
for _stale in [h for h in logger.handlers if h.get_name() == 'scrapelocal']:
    logger.removeHandler(_stale)
    _stale.close()
fhandler = logging.FileHandler(filename= dstamp + 'scrapelocal.log', mode='w')
# Tag the handler so a later re-run of this cell can recognise and replace it.
fhandler.set_name('scrapelocal')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

#  Constants

# Synchronous code

In [3]:
def getAllReviews(bsHotel):
    """Return exactly five review-count strings scraped from a hotel page.

    Collects the text of every element carrying the review-rating-filter
    row-count class. The caller unpacks the result into five values
    (excellent, very good, average, poor, terrible), so the result is always
    a 5-tuple: missing trailing entries are filled with '' and anything
    beyond the fifth match is dropped.

    Args:
        bsHotel: parsed page exposing ``find_all(class_=...)``
            (a BeautifulSoup object in this notebook).

    Returns:
        tuple of five strings.
    """
    rowClass = 'location-review-review-list-parts-ReviewRatingFilter__row_num--3cSP7'
    counts = [row.get_text() for row in bsHotel.find_all(class_=rowClass)]
    # Previously only the "no matches" case was padded; a page with 1-4 (or
    # more than 5) matches produced a tuple the caller could not unpack.
    counts = (counts + [''] * 5)[:5]
    return tuple(counts)

In [4]:
def getHotelclass(bsHotel):
    """Return the numeric hotel class indicated by a marker CSS class.

    The page is probed for each known marker class in a fixed order and the
    value paired with the first one found is returned. When none of the
    markers is present the result is 0.

    Args:
        bsHotel: parsed page exposing ``find(class_=...)``.

    Returns:
        2, 3, 4, 4.5, 5, or 0 when no marker matched.
    """
    # Order matters: the first marker present on the page wins.
    markerToClass = (
        ('_2MgVjxWG', 2),
        ('_3RprXHxE', 3),
        ('_30WZSV_9', 4),
        ('_2LYcDtDf', 4.5),
        ('f33bWmtw', 5),
    )
    for marker, value in markerToClass:
        if bsHotel.find(class_=marker):
            return value
    return 0

In [5]:
def getNearby(bsHotel):
    """Return the text of the orange and blue location-highlight elements.

    The caller stores the two values as ``nearbyRestaurant`` and
    ``nearbyAttractions`` respectively. Each element is looked up once
    (the earlier version searched the document twice per element); a missing
    element yields ''.

    Args:
        bsHotel: parsed page exposing ``find(class_=...)``.

    Returns:
        (nearbyResto, nearbyAttractions) as a tuple of strings.
    """
    logging.info('getNearby')
    restoNode = bsHotel.find(
        class_='hotels-hotel-review-location-layout-Highlight__orange--1N-BP')
    attractionsNode = bsHotel.find(
        class_='hotels-hotel-review-location-layout-Highlight__blue--2qc3K')
    nearbyResto = restoNode.get_text() if restoNode else ''
    nearbyAttractions = attractionsNode.get_text() if attractionsNode else ''
    return nearbyResto, nearbyAttractions

In [6]:
def getRating(bsHotel):
    """Return the text of the overall-rating element, or '' when it is absent.

    The element is looked up once; the earlier version searched the document
    a second time to read the text after the existence check.

    Args:
        bsHotel: parsed page exposing ``find(class_=...)``.

    Returns:
        The element's text as a string, or ''.
    """
    ratingNode = bsHotel.find(
        class_='hotels-hotel-review-about-with-photos-Reviews__overallRating--vElGA')
    return ratingNode.get_text() if ratingNode else ''

In [7]:
def getTotalRev(bsHotel):
    """Return the text of the review tab-count element, or '' when it is absent.

    The element is looked up once; the earlier version searched the document
    a second time to read the text after the existence check.

    Args:
        bsHotel: parsed page exposing ``find(class_=...)``.

    Returns:
        The element's text as a string, or ''.
    """
    countNode = bsHotel.find(
        class_='hotels-community-content-common-TabAboveHeader__tabCount--26Tct')
    return countNode.get_text() if countNode else ''

In [8]:
def getprice(bsHotel):
    """Return the price text from the first matching offer element, or ''.

    Two offer classes are tried in order: the detail-chevron offer first,
    then the dominant offer. Each class is searched at most once (the earlier
    version repeated the search to read the text). The function name and the
    resulting price are both logged at INFO level.

    Args:
        bsHotel: parsed page exposing ``find(class_=...)``.

    Returns:
        The matched element's text as a string, or '' when neither is present.
    """
    logging.info('getprice')
    priceClasses = (
        'hotels-hotel-offers-DetailChevronOffer__price--py2LH',
        'hotels-hotel-offers-DominantOffer__price--D-ycN',
    )
    for priceClass in priceClasses:
        priceNode = bsHotel.find(class_=priceClass)
        if priceNode:
            price = priceNode.get_text()
            break
    else:
        # Neither offer element exists on this page.
        price = ''
    logging.info(price)
    return price

In [9]:
def getHotelDetails(bsHotel):
    """Assemble one record of scraped fields for a single hotel page.

    Delegates to the individual getters in this notebook and returns their
    results in a flat dict. A page without an element with id 'HEADING' now
    yields an empty title (and a warning in the log) instead of raising
    AttributeError, matching the ''-on-missing behaviour of the other fields.

    Args:
        bsHotel: parsed page exposing ``find`` / ``find_all``
            (a BeautifulSoup object in this notebook).

    Returns:
        dict with keys title, price, hotelClass, rating, totalRev,
        excellentRev, verygoodRev, averageRev, poorRev, terribleRev,
        nearbyRestaurant, nearbyAttractions.
    """
    heading = bsHotel.find(id='HEADING')
    if heading is None:
        # e.g. a saved error page; keep the row rather than abort the task.
        logging.warning('HEADING element not found; title left empty')
        title = ''
    else:
        title = heading.get_text()
    price = getprice(bsHotel)
    hotelClass = getHotelclass(bsHotel)
    rating = getRating(bsHotel)
    totalRev = getTotalRev(bsHotel)
    excRev, vgRev, aveRev, poorRev, terRev = getAllReviews(bsHotel)
    nearbyResto, nearbyAttractions = getNearby(bsHotel)

    return {'title': title
            , 'price': price
            , 'hotelClass': hotelClass
            , 'rating' : rating
            , 'totalRev': totalRev
            , 'excellentRev':excRev
            , 'verygoodRev':vgRev
            , 'averageRev':aveRev
            , 'poorRev':poorRev
            , 'terribleRev':terRev
            , 'nearbyRestaurant':nearbyResto
            , 'nearbyAttractions':nearbyAttractions
           }

### Asynchronous code

In [10]:
async def crReadContents(fpath, encoding=None):
    """Read a text file asynchronously and return its full contents.

    Args:
        fpath: path of the file to read.
        encoding: optional text encoding forwarded to ``aiofiles.open``.
            The default ``None`` keeps the previous behaviour (platform
            default encoding); pass e.g. 'utf-8' to make reads independent
            of the machine's locale.

    Returns:
        The file contents as a string.
    """
    # The ``async with`` block closes the file on exit, so the explicit
    # ``await f.close()`` that used to sit inside it was redundant.
    async with aiofiles.open(fpath, mode='r', encoding=encoding) as f:
        logging.info(fpath)
        contents = await f.read()
    return contents

In [11]:
async def crScrapeDetails(content,fpath):
    """Parse one page's HTML and return its extracted hotel details.

    Args:
        content: HTML text of the page.
        fpath: path the HTML came from; used only in log messages.

    Returns:
        The dict produced by ``getHotelDetails`` for the parsed page.
    """
    logging.info(f'Scrape on {fpath}')
    soup = BeautifulSoup(content, 'html.parser')
    logging.info(f'bs loaded on {fpath}')
    hotelRecord = getHotelDetails(soup)
    logging.info(hotelRecord)
    return hotelRecord
    

In [12]:
# Task order: read the file first, then scrape its contents
async def crtaskReadScrape(fpath):
    """Read one saved page and scrape it, returning the details dict.

    Args:
        fpath: path of the saved HTML file.

    Returns:
        The dict returned by ``crScrapeDetails`` for that file; it is also
        logged at INFO level before being returned.
    """
    html = await crReadContents(fpath)
    hotelRecord = await crScrapeDetails(html, fpath)
    logging.info(hotelRecord)
    return hotelRecord

In [13]:
# Build the task list: one read-and-scrape task per file path; each task awaits the coroutines above.
async def crMain(fpathlt):
    """Run one read-and-scrape task per path and wait for all of them.

    Args:
        fpathlt: iterable of file paths to process.

    Returns:
        A ``(done, pending)`` pair of sets of tasks, as returned by
        ``asyncio.wait``. For an empty ``fpathlt`` both sets are empty.
        Because ``done`` is a set, its iteration order is not the order
        of ``fpathlt``.
    """
    # asyncio.wait no longer accepts bare coroutine objects (deprecated in
    # Python 3.8, rejected from 3.11), so wrap each one in a task first.
    tasklist = [asyncio.ensure_future(crtaskReadScrape(fpath)) for fpath in fpathlt]
    if not tasklist:
        # asyncio.wait raises ValueError on an empty collection.
        return set(), set()
    return await asyncio.wait(tasklist)

In [14]:
# List every entry in the local 'dl' directory and prefix each name with
# 'dl/' to get a path relative to the working directory.
# NOTE(review): no filtering is applied, so any non-HTML entry in 'dl' is
# scraped as well — confirm the directory only holds saved pages.
pageslt = listdir('dl')
fpathlt = ['dl/' + page for page in pageslt ]
# NOTE(review): bare expression that is not the last statement of the cell;
# it looks like a leftover inspection line with no effect.
fpathlt
# Drive crMain to completion on the event loop; it returns the pair produced
# by asyncio.wait, unpacked here as (finished, unfinished).
loop = asyncio.get_event_loop()
finished, unfinished = loop.run_until_complete(crMain(fpathlt))

In [15]:
# Collect the value returned by each finished task (one details dict per page)
# and build a DataFrame with one row per dict.
# NOTE(review): result() re-raises if a task ended with an exception, and
# `finished` comes from asyncio.wait, so row order presumably does not follow
# fpathlt — confirm before relying on positions.
resListDict =  [res.result() for res in finished ]
dfRes = pd.DataFrame(resListDict)

In [16]:
# Display the assembled results table as the cell output.
dfRes

Unnamed: 0,title,price,hotelClass,rating,totalRev,excellentRev,verygoodRev,averageRev,poorRev,terribleRev,nearbyRestaurant,nearbyAttractions
0,The Tides Hotel Boracay,"₱4,180",4.0,3.5,292,73,121,59,26,13,111,5
1,Gran Prix Boracay Hideaway,,0.0,3.5,32,6,11,11,2,2,,
2,Gracia's Inn,,0.0,,,,,,,,33,3
3,John's Apartelle,,2.0,,,,,,,,101,5
4,Misueno Boracay,,3.0,4.5,52,40,7,3,0,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Tonglen Eco Resort,,2.0,4.0,160,65,63,26,4,2,44,4
610,MGB Cottages,,2.0,4.5,3,2,1,0,0,0,98,4
611,Casa Camilla Boracay Apartments,,0.0,3.5,29,8,10,3,5,3,23,3
612,Guesthouse in Boracay,,4.0,4.0,1,0,1,0,0,0,278,43
