In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

### Extract every hotel url from an initial search result,for example : 台東

In [2]:
def hotels(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text,"lxml")
    hotel_list=soup.find_all('div','ui_column is-8 main_col allowEllipsis')
    return hotel_list

### When you got those links, you also need a soup for extracting data

In [3]:
def create_soup(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text,"lxml")
    return soup

###  Using the following function can help to get hotel name, score, address, price, etc.

In [4]:
def grades(score):
    grade=int(score.find('span').get('class')[1].split('_')[1])/10
    return grade

In [5]:
def hotel_name(soup):
    Name=soup.find('h1','_1mTlpMC3').text
    return Name

In [6]:
def get_Category(soup):
    try:
        Category=soup.find('div','_1vpp5J_x').text[-2:]
    except:
        Category='No data'
    return Category

In [7]:
def get_Rank(soup):
    try:
        match = re.findall('[0-9]+',soup.find('div','_1vpp5J_x').text)
        Rank=str(match[0])+'/'+str(match[1])
    except:
        Rank='No data'
    return Rank
    
    

In [8]:
def get_address(soup):
    add=soup.find('div','_1sPw_t0w _3sCS_WGO').text
    return add

In [9]:
def get_score(soup):
    #total score     
    try:
        score=soup.find('span',"_3cjYfwwQ").text
    except:
        score="No data"
    return score

In [10]:
def get_comment(soup):
    try:
        comment=soup.find('span',"_33O9dg0j").text.split('則')[0]
    except:
        comment="No data"
    return comment

In [11]:
def get_price(soup):
    try:
        try:
            price=soup.find('div','_1mV0AtdP offer hVx3qimM _1p50Ckoi bookableOffer _2Ngw5d8g').get('data-pernight')
        except:
            price=soup.find('div',"premium_offers_area offers").find('a').get('data-pernight')
    except:
        price='No data'
    return price

In [12]:
def creat_subscore(soup):
    # List of indexes
    list_index = ["location", "cleanness" , "service" , "CP"]
    
    findgd=soup.find_all('div','_1krg1t5y')
    index_score=list(map(grades,findgd))
    if not index_score:
        subscore={"location":"No data", "cleanness":"No data" , "service":"No data" , "CP":"No data"}
        return subscore
    # Create a zip object from two lists
    else:
        subscore = dict(zip(list_index, index_score))
        return subscore

### Make urls 

In [13]:
websites=['https://www.tripadvisor.com.tw/Hotels-g304163-Taitung-Hotels.html']
for i in range(30,1530,30):
    websites.append('https://www.tripadvisor.com.tw/Hotels-g304163-oa'+str(i)+'-Taitung-Hotels.html')


### Get all Hotel's urls by  anonymous function

In [14]:
urls=[]
for web in websites:
    hotel_list=hotels(web)
    urls.extend(list(map(lambda x:'https://www.tripadvisor.com.tw/'+x.find('a','property_title prominent').get('href'),hotel_list)))

###  Start to scrape information. Here I only demo first 10 data

In [15]:
#Creat an empty dataframe with column names
df=pd.DataFrame(columns=["Name","Price","address","Rank","Category","Score","Comment","location", "cleanness" , "service" , "CP"])

for url in urls[:10]:
    #soup
    soup_temp = create_soup(url)
    #Hotel Name
    hotel_Name=hotel_name(soup_temp)
    #price
    hotel_price=get_price(soup_temp)
    #address
    hotel_address=get_address(soup_temp)
    #Category
    hotel_Category=get_Category(soup_temp)
    #Rank
    hotel_Rank=get_Rank(soup_temp)
    #score
    hotel_score=get_score(soup_temp)
    #comment
    hotel_comment=get_comment(soup_temp)
    #info dictionary
    info={"Name":hotel_Name,"Price":hotel_price,"address":hotel_address,"Rank":hotel_Rank,"Category":hotel_Category,"Score":hotel_score,"Comment":hotel_comment,"urls":url}
    #Combine two dictionaries
    subscore=creat_subscore(soup_temp)
    info.update(subscore)
    print("----------------------------------------------------------Done----------------------------------------------------------")
    # add hotel information
    df=df.append(info, ignore_index=True)
df

----------------------------------------------------------Done----------------------------------------------------------
----------------------------------------------------------Done----------------------------------------------------------
----------------------------------------------------------Done----------------------------------------------------------
----------------------------------------------------------Done----------------------------------------------------------
----------------------------------------------------------Done----------------------------------------------------------
----------------------------------------------------------Done----------------------------------------------------------
----------------------------------------------------------Done----------------------------------------------------------
----------------------------------------------------------Done----------------------------------------------------------
--------------------------------

Unnamed: 0,Name,Price,address,Rank,Category,Score,Comment,location,cleanness,service,CP,urls
0,康橋大飯店 (臺東市) Kindness Hotel Taitung,2359,台灣 臺東市 台東市中興路209巷16號,1/59,飯店,5.0,736,4.0,4.5,5.0,4.5,https://www.tripadvisor.com.tw//Hotel_Review-g...
1,台東桂田喜來登酒店 (臺東市) Sheraton Taitung Hotel,5346,造訪飯店網站,4/59,飯店,4.5,668,4.5,4.5,4.5,4.0,https://www.tripadvisor.com.tw//Hotel_Review-g...
2,日暉國際渡假村 (池上) Papago International Resort,7842,95844 台灣 池上鄉新興村新興107號,1/4,飯店,4.0,229,4.0,4.0,4.0,3.5,https://www.tripadvisor.com.tw//Hotel_Review-g...
3,知本金聯世紀酒店 (太麻里) Chihpen Century Hotel,3628,954 台灣 卑南鄉龍泉路30號,1/14,飯店,4.0,470,4.0,4.5,4.5,4.0,https://www.tripadvisor.com.tw//Hotel_Review-g...
4,知本老爺酒店 (太麻里) Hotel Royal Chihpen,No data,954 台灣 卑南鄉溫泉村龍泉路113巷23號,2/14,飯店,4.0,458,4.0,4.5,4.0,3.5,https://www.tripadvisor.com.tw//Hotel_Review-g...
5,台東南豐鐵花棧 (臺東市) Inn by the Village,1950,95002 台灣 臺東市 台東市中華路一段585號,3/59,飯店,4.5,144,5.0,4.5,4.5,4.5,https://www.tripadvisor.com.tw//Hotel_Review-g...
6,娜路彎大酒店 (臺東市) Formosan Naruwan Hotel & Resort T...,7223,950 台灣 臺東市 台東市連航路66號,10/59,飯店,4.0,156,4.0,4.0,4.0,3.5,https://www.tripadvisor.com.tw//Hotel_Review-g...
7,幸福旅行舍 (卑南) Xin Fu Traveler Inn,No data,954 台灣 太平路630巷45弄12號,1/57,民宿,5.0,373,4.0,5.0,5.0,4.5,https://www.tripadvisor.com.tw//Hotel_Review-g...
8,地景泽行馆 (臺東市) The Suites Taitung,2206,95043 台灣 臺東市 台东市博爱路362巷18号,7/59,飯店,4.0,70,4.5,4.5,4.5,4.0,https://www.tripadvisor.com.tw//Hotel_Review-g...
9,V-Hotel 假期商旅 (臺東市) V-Hotel,No data,95049 台灣 臺東市 台東市四維路三段88號,19/59,飯店,4.0,19,4.5,4.5,3.5,2.5,https://www.tripadvisor.com.tw//Hotel_Review-g...


###  Save as EXCEL

In [16]:
#df.to_excel('Taitung.xlsx')

####  After coverting 500 addresses to XYcoordinate by TGOS, I can creat an informative heatmap for displaying hotel location on the map

In [17]:
import folium
from folium.plugins import HeatMap
hotel_XY=pd.read_excel(r"C:\Users\YiJhe\FB Project\Taitung_XY.xlsx")
XY=[]
for i in range(len(hotel_XY)):
    XY.append([hotel_XY.Response_Y[i],hotel_XY.Response_X[i]])


### You can try to scroll up and down

In [22]:
fmap = folium.Map(location=[22.755563, 121.115492], zoom_start=10)
fmap.add_child(HeatMap(data=XY))
fmap