# Clean Hotel Listing Dataset

In [4]:
from selenium import webdriver      # conda install -c conda-forge selenium
from time import sleep
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


import pandas as pd
import re
import os
import json

# Geospatial
import geopandas as gpd
import geojson
from shapely.geometry import Polygon, MultiPolygon, shape

# Utilities
from utility import retrieve_from_onemap, convert_geojson_to_geometry, export_df_to_shapefile, retrieve_onemap_population_data

# Load the OneMap API token from the environment (after load_dotenv() has run).
from dotenv import load_dotenv
load_dotenv()
# os.getenv returns None when 'onemap_token' is not set, so onemap_token may be
# None here; nothing in this cell checks for that.
onemap_token = os.getenv('onemap_token')


import warnings
# Silences every warning for the rest of the session, pandas warnings included.
warnings.filterwarnings("ignore")

## Hotel Listing

In [16]:
# Read the raw hotel listing. "Postal Code" is loaded as text so that codes with
# a leading zero are not collapsed into shorter integers by type inference.
df_hotels = pd.read_csv("../data/hotel_listing.csv", dtype={"Postal Code": str})

In [17]:
# Raw column header -> snake_case name used in the rest of the notebook.
# NOTE: "Accomodation Name" (sic) is used verbatim as the lookup key below.
column_renames = {
    "Accomodation Name": "hotel_names",
    "Postal Code": "postal_code",
    "Address": "address",
    "Status": "status",
}

# The raw headers to retain, in output order.
col = list(column_renames)

# Keep only those columns and rename them in a single pass.
df_hotels = df_hotels[col].rename(columns=column_renames)

In [18]:
# Keep only the listings whose status is 'Active', then renumber the rows from 0.
df_hotels = df_hotels.query("status == 'Active'").reset_index(drop=True)

In [19]:
# Preview the active hotels before the postal codes are normalised.
df_hotels

Unnamed: 0,hotel_names,postal_code,address,status
0,30 BENCOOLEN,189621,30 BENCOOLEN STREET,Active
1,7 Wonders Hostel,208930,257 Jalan Besar,Active
2,7 Wonders Hostel @ Boat Quay,58695,65 SOUTH BRIDGE ROAD,Active
3,7 Wonders Hostel @ Upper Dickson,207472,12A UPPER DICKSON ROAD,Active
4,ABC HOSTEL,199201,3 Jalan Kubor (1st Storey),Active
...,...,...,...,...
420,Yew Lian Hotel,389504,549A Geylang Road,Active
421,York Hotel,228516,21 Mount Elizabeth,Active
422,YOTEL Singapore Orchard Road,238904,366 ORCHARD ROAD,Active
423,YotelAir Singapore Changi Airport,819666,78 AIRPORT BOULEVARD,Active


In [20]:
# Normalise postal codes to six-character strings. Codes that were parsed as
# numbers lose their leading zeros (58695 instead of 058695), so every code
# shorter than six characters is left-padded with zeros. Missing values stay
# missing instead of being turned into text.
postal_code_raw = df_hotels["postal_code"]
df_hotels["postal_code"] = (
    postal_code_raw.astype(str)
    .str.zfill(6)
    .where(postal_code_raw.notna())
)

In [21]:
# Re-inspect the frame after the postal codes were converted to zero-padded strings.
df_hotels

Unnamed: 0,hotel_names,postal_code,address,status
0,30 BENCOOLEN,189621,30 BENCOOLEN STREET,Active
1,7 Wonders Hostel,208930,257 Jalan Besar,Active
2,7 Wonders Hostel @ Boat Quay,058695,65 SOUTH BRIDGE ROAD,Active
3,7 Wonders Hostel @ Upper Dickson,207472,12A UPPER DICKSON ROAD,Active
4,ABC HOSTEL,199201,3 Jalan Kubor (1st Storey),Active
...,...,...,...,...
420,Yew Lian Hotel,389504,549A Geylang Road,Active
421,York Hotel,228516,21 Mount Elizabeth,Active
422,YOTEL Singapore Orchard Road,238904,366 ORCHARD ROAD,Active
423,YotelAir Singapore Changi Airport,819666,78 AIRPORT BOULEVARD,Active


In [22]:
# Look up every hotel in OneMap by postal code and copy selected fields of the
# returned entry at index label 0 onto the hotel's row.
# Output column -> key read from the OneMap result.
result_fields = {
    "search_result_building": "BUILDING",
    "search_result_postal_code": "POSTAL",
    "latitude": "LATITUDE",
    "longitude": "LONGITUDE",
}

for idx, row in df_hotels.iterrows():
    search_payload = {
        "searchVal": f"Singapore {row['postal_code']}",
        "returnGeom": "Y",
        "getAddrDetails": "Y",
    }
    query = retrieve_from_onemap(onemap_token, "commonapi/search", payload=search_payload)

    # Nothing came back for this postal code: leave the row untouched.
    if len(query) == 0:
        continue

    results = query.loc[0, "results"]
    for column, field in result_fields.items():
        df_hotels.loc[idx, column] = results[field]

In [24]:
# Keep only the columns that go into the cleaned listing (status and the
# search_result_* columns are left out) and take an independent copy.
output_columns = ["hotel_names", "postal_code", "address", "latitude", "longitude"]
df_hotels = df_hotels.loc[:, output_columns].copy()

In [None]:
# Persist the cleaned listing without the index column.
# NOTE(review): this is the same path the raw listing is read from at the top of
# this section, so running this cell replaces the raw file. A later fresh run
# would no longer find the raw "Accomodation Name" / "Postal Code" / "Address" /
# "Status" headers — confirm that overwriting the input is intended.
hotel_listing_path = "../data/hotel_listing.csv"
df_hotels.to_csv(hotel_listing_path, index=False)

## Clean BCA Energy Performance Report

In [5]:
# Load the building energy performance listings for 2019 and 2020.
df_2019, df_2020 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/listing-of-building-energy-performance-data-2019.csv",
        "../data/listing-of-building-energy-performance-data-2020.csv",
    )
)

In [8]:
def _address_tail(address):
    """Return the last six characters of ``address``; missing values pass through unchanged."""
    if pd.isna(address):
        return address
    return address[-6:]


# Derive postal_code from the tail of buildingaddress for both years.
# NOTE(review): assumes every address ends with its six-digit postal code — confirm.
for df_energy in (df_2019, df_2020):
    df_energy["postal_code"] = df_energy["buildingaddress"].apply(_address_tail)

In [10]:
# Geocode each 2019 building through the OneMap search endpoint, using its
# postal code, and copy selected fields of the returned entry at index label 0
# onto the building's row.
for idx, row in df_2019.iterrows():
    # Rows without a building address have no postal code. Formatting the
    # missing value into the query would send the literal text "Singapore nan",
    # so skip them; those rows are removed later by the dropna on buildingaddress.
    if pd.isna(row["postal_code"]):
        continue

    query = retrieve_from_onemap(
        onemap_token,
        "commonapi/search",
        payload={"searchVal": f"Singapore {row['postal_code']}", "returnGeom": "Y", "getAddrDetails": "Y"},
    )

    # Only rows with at least one returned entry get the geocoding columns filled.
    if len(query) > 0:
        results = query.loc[0, "results"]
        df_2019.loc[idx, "search_result_building"] = results["BUILDING"]
        df_2019.loc[idx, "search_result_postal_code"] = results["POSTAL"]
        df_2019.loc[idx, "latitude"] = results["LATITUDE"]
        df_2019.loc[idx, "longitude"] = results["LONGITUDE"]

In [12]:
# Geocode each 2020 building through the OneMap search endpoint, using its
# postal code, and copy selected fields of the returned entry at index label 0
# onto the building's row.
for idx, row in df_2020.iterrows():
    # Rows without a building address have no postal code. Formatting the
    # missing value into the query would send the literal text "Singapore nan",
    # so skip them; those rows are removed later by the dropna on buildingaddress.
    if pd.isna(row["postal_code"]):
        continue

    query = retrieve_from_onemap(
        onemap_token,
        "commonapi/search",
        payload={"searchVal": f"Singapore {row['postal_code']}", "returnGeom": "Y", "getAddrDetails": "Y"},
    )

    # Only rows with at least one returned entry get the geocoding columns filled.
    if len(query) > 0:
        results = query.loc[0, "results"]
        df_2020.loc[idx, "search_result_building"] = results["BUILDING"]
        df_2020.loc[idx, "search_result_postal_code"] = results["POSTAL"]
        df_2020.loc[idx, "latitude"] = results["LATITUDE"]
        df_2020.loc[idx, "longitude"] = results["LONGITUDE"]

In [14]:
# Remove the search_result_* helper columns from both years.
scratch_columns = ["search_result_building", "search_result_postal_code"]
df_2019 = df_2019.drop(columns=scratch_columns)
df_2020 = df_2020.drop(columns=scratch_columns)

In [16]:
# Discard buildings that have no address recorded (the index is not renumbered).
address_column = ["buildingaddress"]
df_2019 = df_2019.dropna(subset=address_column)
df_2020 = df_2020.dropna(subset=address_column)

In [18]:
# Write both enriched tables back to disk without the index column.
# NOTE(review): these are the same paths the raw listings were read from, so the
# source files are overwritten in place — confirm that this is intended.
for df_energy, csv_path in (
    (df_2019, "../data/listing-of-building-energy-performance-data-2019.csv"),
    (df_2020, "../data/listing-of-building-energy-performance-data-2020.csv"),
):
    df_energy.to_csv(csv_path, index=False)