# **Scraping Hotel Rooms**

### Initialisation

In [1]:
import pandas as pd
import numpy as np
import requests, json, math, datetime
from tqdm import tqdm
from datetime import datetime as dt
from datetime import date
from urllib.parse import quote

# every city to scrape, mapped to the numeric geocode sent to the search API
city_geocodes = dict(
    Badung=102758,
    Denpasar=102747,
    Gianyar=102770,
)


# endpoint the hotel search requests are posted to
api_url = 'https://www.traveloka.com/api/v2/hotel/searchList'

# accumulator for the records collected from every city
_data_ = list()

### Functions

In [2]:
# left-pad the textual form of `num` with '0' so it is at least two characters wide
def padNumber(num):
  """Return `str(num)` right-aligned in a field of width 2, filled with '0'."""
  text = str(num)
  return f'{text:0>2}'


# errors that mean "this field is missing or malformed in the API response";
# anything else (KeyboardInterrupt, NameError, ...) is a real problem and must propagate
_FIELD_ERRORS = (LookupError, TypeError, ValueError, ArithmeticError)


# flatten one rate display into `<prefix>_allNights_*` and `<prefix>_perNight_*` columns
def _flattenRate(display, prefix, numnights):
  """Return the all-nights amounts of `display` plus their per-night averages.

  `display` is indexed as `display[part]['amount']` for the parts baseFare, fees,
  taxes and totalFare. The all-nights values are copied unchanged; the per-night
  values are the amounts converted with `float` and divided by `numnights`
  (base fare truncated to int, fees/taxes rounded to 2 decimals, total fare
  rounded to the nearest integer).
  """
  amounts = {part: display[part]['amount'] for part in ('baseFare', 'fees', 'taxes', 'totalFare')}
  flat = {f'{prefix}_allNights_{part}': amount for part, amount in amounts.items()}
  flat[f'{prefix}_perNight_baseFare'] = int(float(amounts['baseFare']) / numnights)
  flat[f'{prefix}_perNight_fees'] = round(float(amounts['fees']) / numnights, 2)
  flat[f'{prefix}_perNight_taxes'] = round(float(amounts['taxes']) / numnights, 2)
  flat[f'{prefix}_perNight_totalFare'] = round(float(amounts['totalFare']) / numnights)
  return flat


# collect one flat record per entry of a search response for the current city and reservation
def collectData(response):
  """Turn a search API response into a list of flat record dicts.

  Reads the module-level names `columns`, `city`, `startdate`, `enddate` and
  `checkin_numnights`, so those must be set before calling.

  Args:
    response: decoded JSON response; `response['data']['entries']` is iterated.

  Returns:
    A list with one dict per entry. Each dict holds the reservation context
    (timestamp, city, checkin, checkout, num_staying_nights) followed by one
    value per name in `columns`. A column that is missing or malformed is
    stored as '-'; for 'hotelInventorySummary' the flattened rate columns are
    written only when every rate field could be read.
  """
  hotels = []

  for entry in tqdm(response['data']['entries']):
    # prefer the first item of the inventory list; fall back to the entry data itself
    try:
      sample = entry['data']['inventoryList'][0]
    except (LookupError, TypeError):
      sample = entry['data']

    record = {'timestamp': dt.now(), 'city': city,'checkin': startdate,'checkout': enddate,'num_staying_nights':checkin_numnights}

    for c in columns:
      try:
        if c == 'hotelFeatures':
          # list of feature dicts -> one comma-separated string
          record[c] = ', '.join([c_['text'] for c_ in sample[c]])
        elif c == 'hotelInventorySummary':
          # build every rate column first so a partial failure adds none of them
          rates = _flattenRate(sample[c]['cheapestRateDisplay'], 'cheapestRate', checkin_numnights)
          rates.update(_flattenRate(sample[c]['originalRateDisplay'], 'originalRate', checkin_numnights))
          record.update(rates)
        else:
          record[c] = sample[c]
      except _FIELD_ERRORS:
        record[c] = '-'
    hotels.append(record)
  return hotels

### Scraping

In [3]:
# reservation
curyear = date.today().year # current year
startdd, startmm = 27, 3 # check in date and month
enddd, endmm = 28, 3 # check out date and month

# staying range as real dates; a checkout that falls earlier in the calendar than
# the check-in can only mean the stay crosses New Year, so it moves to next year
checkin_date = date(curyear, startmm, startdd)
checkout_year = curyear + 1 if (endmm, enddd) < (startmm, startdd) else curyear
checkout_date = date(checkout_year, endmm, enddd)

# dd-mm-yyyy strings used in the referer URL, the payload and the output records
startdate = checkin_date.strftime('%d-%m-%Y')
enddate = checkout_date.strftime('%d-%m-%Y')

# number of staying night(s); per-night rates divide by this, so it must be positive
checkin_numnights = (checkout_date - checkin_date).days
if checkin_numnights < 1:
  raise ValueError(f'checkout {enddate} must be at least one night after checkin {startdate}')


# data columns to be extracted
columns = ['id','name', 'displayName', 'accomPropertyType', 'region', 'starRating', 'userRating','showedFacilityTypes','numReviews','userRatingInfo',
           'latitude','longitude','lowRate','highRate',
           'hotelFeatures','hotelSeoUrl','hotelInventorySummary']

print(f'Scraping All Available Hotel Rooms, Checkin {startdate}, Checkout:{enddate}, Staying nights: {checkin_numnights}:')
print()

# seconds to wait for the API before giving up instead of hanging the notebook
REQUEST_TIMEOUT = 30

# query the search API once per city and append the flattened records to `_data_`
for city, geocode in city_geocodes.items():
  print()
  print(city.upper())

  # referer url
  referer_url = f'https://www.traveloka.com/id-id/hotel/search?spec={startdate}.{enddate}.1.1.HOTEL_GEO.{geocode}.{quote(city)}.2'

  # headers
  # NOTE(review): the cookie below is a hard-coded browser session captured at one point in time;
  # it should not live in the notebook - load it from an environment variable or local config.
  headers = {
      'content-type': 'application/json',
      'cookie': '_gcl_au=1.1.141782728.1670394296; _gac_UA-29776811-12=1.1670394296.Cj0KCQiA7bucBhCeARIsAIOwr-9NHNVdQqtQnRrovoGHLpACWxlx50k6KyMR6yMRUacNvmsKp_P3V7EaAsihEALw_wcB; tv-repeat-visit=true; _gid=GA1.2.137383459.1672284459; g_state={"i_l":1,"i_p":1672291669115}; _ga=GA1.1.1144801166.1670394296; amp_1a5adb=lmXek5GnRbmDVMBIye772l...1gldunjc7.1gldup1ge.c.0.c; tvl=qgdHX7GvehrD9XH5a3S4PdE8AYpuF3hYPaT5bxhY7ZYlTfL+WyvcrSI/VxnhD+GdIauD2fuQAp48xn5SOy61CcGKsORldom9dTN23+66MAYIHEn0dmPxsmhM3nEpAG8sgD1ega4KxIBCDYlQGDuVKUzPw3pNExw5Cd1OxjjrNg3vlyHfFnPptZUxAgMVwRNSCMYWUJplNNMY2P4/83O9X+8GNrPf8Ng75ZieUaJama8=; tvs=qgdHX7GvehrD9XH5a3S4PWL3Nd74xArIuT+JzcRMbKddQHovERAJ9HWRLrAaZ0jPhWj5HSxm0ZKiRbldET1ham2PeYg1sQr2h/wIBjIyPQ1JQfOnq9PrXiJXCb7pG+GuL55zGx9BHnW6AktSohrCEcVZJJEBlMy+/xGmAFjHYdanG44/La0X6wsaDJDc5dQI3jW7f6f85zK7XA1xLrLbn3wpMY91AYFzJ6h8za/vSrng40uUoDT+qJIv0oQGNB1A; _fbp=fb.1.1672284509886.851897136; cto_bundle=LLTD719vOXhjJTJCWjlPeGhpNUJ1VEVFdHpLdUVraHZLa1Fkem44UDZ4R3FlNnRIcGhuaG8zSWhlVFd4TyUyRm5JOVUlMkZTSGJVZ2UyaDVsVWVzWm9ETlBRSTRweTlGRDg2eE81WnVHNXhzJTJGNkZsaVRlTzRHUGVzQ2llQTFXbVN0a3B2OXBSdWpXYWR5cXU5SGpjZTQ5SFk5dzRqR2NwUSUzRCUzRA; _ga_RSRSMMBH0X=GS1.1.1672284458.2.1.1672284554.60.0.0',
      'origin': 'https://www.traveloka.com',
      'referer': referer_url,
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
      'x-domain': 'accomSearch',
      'x-route-prefix': 'id-id'
  }

  # payload, built as a dict so that requests does the JSON encoding and
  # quotes/backslashes in a city name or URL cannot corrupt the request body
  payload = {
      'fields': [],
      'data': {
          'checkInDate': {'year': startdate.split('-')[-1], 'month': str(startmm), 'day': str(startdd)},
          'checkOutDate': {'year': enddate.split('-')[-1], 'month': str(endmm), 'day': str(enddd)},
          'numOfNights': checkin_numnights,
          'currency': 'IDR',
          'numAdults': 1,
          'numChildren': 0,
          'childAges': [],
          'numInfants': 0,
          'numRooms': 1,
          'ccGuaranteeOptions': {
              'ccInfoPreferences': ['CC_TOKEN', 'CC_FULL_INFO'],
              'ccGuaranteeRequirementOptions': ['CC_GUARANTEE'],
          },
          'rateTypes': ['PAY_NOW', 'PAY_AT_PROPERTY'],
          'isJustLogin': False,
          'backdate': False,
          'geoId': str(geocode),
          'monitoringSpec': {
              'lastKeyword': city,
              'referrer': referer_url,
              'searchId': None,
              'searchFunnelType': None,
              'isPriceFinderActive': None,
              'dateIndicator': None,
              'bannerMessage': '',
              'displayPrice': None,
          },
          'showHidden': False,
          'locationName': city,
          'sourceType': 'HOTEL_GEO',
          'isExtraBedIncluded': True,
          'isUseHotelSearchListAPI': True,
          'supportedDisplayTypes': ['INVENTORY', 'INVENTORY_LIST', 'HEADER', 'INVENTORY_WITH_HEADER'],
          'userSearchPreferences': [],
          'uniqueSearchId': None,
          'basicFilterSortSpec': {
              'basicSortType': 'POPULARITY',
              'ascending': False,
              'criteriaFilterSortSpec': None,
              'starRatingFilter': [True, True, False, False, False],
              'facilityFilter': [],
              'showedFacilityTypes': [],
              'hasFreeCancellationRooms': False,
              'minPriceFilter': None,
              'maxPriceFilter': None,
              'quickFilterId': None,
              'skip': 0,
              'top': 100,
          },
          'criteriaFilterSortSpec': None,
          'boundaries': None,
          'contexts': {'isFamilyCheckbox': False},
      },
      'clientInterface': 'desktop',
  }

  print('preparing post ...')
  page = requests.post(api_url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
  # fail loudly on an HTTP error instead of trying to parse an error page as data
  page.raise_for_status()
  print('preparing data collection ...')
  response = page.json()
  _data_ += collectData(response)
  print()


print()
print(f'Num of obtained records: {len(_data_)}')
print()


Scraping All Available Hotel Rooms, Checkin 27-03-2023, Checkout:28-03-2023, Staying nights: 1:


BADUNG
preparing post ...
preparing data collection ...


100%|██████████| 76/76 [00:00<00:00, 37484.37it/s]



DENPASAR
preparing post ...





preparing data collection ...


100%|██████████| 67/67 [00:00<00:00, 33582.50it/s]



GIANYAR
preparing post ...





preparing data collection ...


100%|██████████| 2/2 [00:00<?, ?it/s]



Num of obtained records: 145






### Scraping Result Checks

In [4]:
# build a DataFrame from the collected record dicts and preview the first rows
df = pd.DataFrame(_data_)
df.head()

Unnamed: 0,timestamp,city,checkin,checkout,num_staying_nights,id,name,displayName,accomPropertyType,region,...,cheapestRate_perNight_taxes,cheapestRate_perNight_totalFare,originalRate_allNights_baseFare,originalRate_allNights_fees,originalRate_allNights_taxes,originalRate_allNights_totalFare,originalRate_perNight_baseFare,originalRate_perNight_fees,originalRate_perNight_taxes,originalRate_perNight_totalFare
0,2023-03-27 11:15:55.436196,Badung,27-03-2023,28-03-2023,1,9000001049559,Taman Dharmawangsa Suites,Taman Dharmawangsa Suites,Vila,"Nusa Dua, Badung",...,0.0,1572873,2975207,263782,0,3238989,2975207,263782.0,0.0,3238989
1,2023-03-27 11:15:55.437227,Badung,27-03-2023,28-03-2023,1,338572,POP! Hotel Kuta Beach,POP! Hotel Kuta Beach,Hotel,"Kuta, Badung",...,0.0,207431,247934,33877,0,281811,247934,33877.0,0.0,281811
2,2023-03-27 11:15:55.437227,Badung,27-03-2023,28-03-2023,1,9000001071572,Horison Le Aman,Horison Le Aman,Hotel,"Jimbaran, Badung",...,46803.0,269680,284281,0,59699,343980,284281,0.0,59699.0,343980
3,2023-03-27 11:15:55.437227,Badung,27-03-2023,28-03-2023,1,461589,Amaris Hotel Pratama Nusa Dua,Amaris Hotel Pratama Nusa Dua,Hotel,"Nusa Dua, Badung",...,62479.0,360000,371901,0,78099,450000,371901,0.0,78099.0,450000
4,2023-03-27 11:15:55.437227,Badung,27-03-2023,28-03-2023,1,1000000514793,Hotel Amaris Kuta - Bali,Hotel Amaris Kuta - Bali,Hotel,"Legian, Badung",...,0.0,248621,270542,41715,0,312257,270542,41715.0,0.0,312257


In [5]:
# number of collected records for each (check-in date, city) pair
df.groupby(['checkin','city'])['city'].agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
checkin,city,Unnamed: 2_level_1
27-03-2023,Badung,76
27-03-2023,Denpasar,67
27-03-2023,Gianyar,2


In [6]:
# total number of rows in the result frame
len(df)

145

### Export Result

In [7]:
# write all rows to a comma-separated file, leaving out the DataFrame index column
df.to_csv('traveloka_scrap_listing.csv', sep=',', index=False)