# Clean Shopping Mall Listing Dataset

In [1]:
from selenium import webdriver      # conda install -c conda-forge selenium
from time import sleep
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import pandas as pd
import re
import os
import json

# Geospatial
import geopandas as gpd
import geojson
from shapely.geometry import Polygon, MultiPolygon, shape

# Utilities
from utility import retrieve_from_onemap, convert_geojson_to_geometry, export_df_to_shapefile, retrieve_onemap_population_data

# Read the OneMap API token from the environment; load_dotenv() is called first
# so that variables declared in a local .env file are available to os.environ.
from dotenv import load_dotenv

load_dotenv()
onemap_token = os.environ.get("onemap_token")


# Suppress all warnings for the rest of the notebook session
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Chrome configuration object. It is handed to webdriver.Chrome below, so any
# argument added to it here actually takes effect on the launched browser.
options = Options()

# windows
# https://www.youtube.com/watch?v=Xjv1sY630Uc
# PATH = r"C:\Program Files\chromedriver.exe"
# driver = webdriver.Chrome(PATH)

# macOS
# https://www.edureka.co/community/52315/how-to-setup-chrome-driver-with-selenium-on-macos
# need to put the chromedriver binary into /usr/local/bin
# and type in terminal (after cd into /usr/local/bin)
#  xattr -d com.apple.quarantine chromedriver
driver = webdriver.Chrome(options=options)

In [None]:
# Point the browser session at Google Maps; the search loops further down type
# their queries into this page's search box.
google_maps_url = "https://www.google.com/maps"
driver.get(google_maps_url)

## Scrape Google Review Data for Shopping Malls

In [None]:
# Load the mall listing and normalise postal codes to strings. Codes that come
# out exactly 5 characters long get a single "0" prepended (presumably the
# leading zero was lost when the column was stored as a number - verify
# against the CSV); every other value is left as-is.
df_shopping_malls = pd.read_csv("../data/shopping_mall_listing.csv")
postal_codes_as_text = df_shopping_malls["postal_code"].astype(str)
df_shopping_malls["postal_code"] = postal_codes_as_text.apply(
    lambda code: "0" + code if len(code) == 5 else code
)

In [None]:
# First pass: search for mall names + "mall Singapore" and store the review
# count found for each star level in a "<n> stars" column of df_shopping_malls.

# Strips, in one pass, commas sitting between two digits (thousands
# separators), lowercase letters and whitespace from an aria-label.
# Presumed label shape: "5 stars, 1,234 reviews" -> "5,1234" (TODO confirm
# against the live page). Raw string so the regex escapes are not treated as
# (invalid) Python string escapes; compiled once instead of on every tag.
rating_label_noise = re.compile(r"(?<=\d),(?=\d)|[a-z]|\s+")

for idx, row in df_shopping_malls.iterrows():
    searchbox = driver.find_element(By.ID, "searchboxinput")
    searchbox.clear()
    searchbox.send_keys(f'{row["mall_names"]} mall Singapore')
    print(row["mall_names"])
    searchbox.send_keys(Keys.ENTER)

    # Fixed pause so the results have time to render before they are read
    sleep(3)

    new_tags = driver.find_elements(By.CLASS_NAME, 'BHOKXe')

    for tag in new_tags:
        info = tag.get_attribute("aria-label")
        print(info)
        if info is None:
            # Element has no aria-label, so there is nothing to parse
            continue

        cleaned_info = rating_label_noise.sub("", info)
        print(cleaned_info)
        print("-"*30)

        # Expect "<star level>,<review count>"; skip anything else rather than
        # aborting the whole scrape on a single unexpected label.
        parts = cleaned_info.split(",")
        try:
            star_level, review_count = parts[0], int(parts[1])
        except (IndexError, ValueError):
            print(f"Skipping unparseable rating label: {info!r}")
            continue
        df_shopping_malls.loc[idx, f'{star_level} stars'] = review_count

In [None]:
# Second pass: for malls that still have no "5 stars" value, search again
# using the mall name only (without the "mall Singapore" suffix).

# Strips, in one pass, commas sitting between two digits (thousands
# separators), lowercase letters and whitespace from an aria-label.
# Presumed label shape: "5 stars, 1,234 reviews" -> "5,1234" (TODO confirm
# against the live page). Raw string so the regex escapes are not treated as
# (invalid) Python string escapes; compiled once instead of on every tag.
rating_label_noise = re.compile(r"(?<=\d),(?=\d)|[a-z]|\s+")

for idx, row in df_shopping_malls[df_shopping_malls["5 stars"].isna()].iterrows():
    searchbox = driver.find_element(By.ID, "searchboxinput")
    searchbox.clear()
    searchbox.send_keys(f'{row["mall_names"]}')
    print(row["mall_names"])
    searchbox.send_keys(Keys.ENTER)

    # Fixed pause so the results have time to render before they are read
    sleep(3)

    new_tags = driver.find_elements(By.CLASS_NAME, 'BHOKXe')

    # No rating elements found: print the name again to flag the mall
    if not new_tags:
        print(row["mall_names"])

    for tag in new_tags:
        info = tag.get_attribute("aria-label")
        if info is None:
            # Element has no aria-label, so there is nothing to parse
            continue

        cleaned_info = rating_label_noise.sub("", info)
        print(cleaned_info)

        # Expect "<star level>,<review count>"; skip anything else rather than
        # aborting the whole scrape on a single unexpected label.
        parts = cleaned_info.split(",")
        try:
            star_level, review_count = parts[0], int(parts[1])
        except (IndexError, ValueError):
            print(f"Skipping unparseable rating label: {info!r}")
            continue
        df_shopping_malls.loc[idx, f'{star_level} stars'] = review_count

In [None]:
# Display the malls that still have no "5 stars" count
missing_rating_mask = df_shopping_malls["5 stars"].isna()
df_shopping_malls[missing_rating_mask]

In [None]:
# Review counts for shopping malls that could not be web-scraped, keyed by the
# mall's row label in df_shopping_malls. Each tuple lists the counts in the
# same order as star_columns (5 stars first, 1 stars last).
star_columns = ("5 stars", "4 stars", "3 stars", "2 stars", "1 stars")

manual_review_counts = {
    5: (6792, 4402, 1286, 121, 151),    # Bugis Junction
    8: (481, 293, 127, 28, 46),         # Cathay Cineleisure Orchard
    19: (216, 183, 113, 26, 20),        # Holland V Shopping Mall
    38: (842, 652, 436, 81, 64),        # People's Park Complex
    68: (4761, 2842, 1030, 135, 149),   # Downtown East
    118: (1102, 833, 702, 137, 127),    # Rivervale Mall
    124: (96, 72, 76, 28, 14),          # Beauty World Plaza
    140: (23, 12, 7, 1, 2),             # Yew Tee Square
    153: (386, 211, 96, 24, 19),        # Boon Lay Shopping Centre
}

# Write one cell at a time, mall by mall, from "5 stars" down to "1 stars"
for mall_idx, counts in manual_review_counts.items():
    for column, count in zip(star_columns, counts):
        df_shopping_malls.loc[mall_idx, column] = count

In [None]:
# Remove, in place, every mall that still has no "5 stars" value; these are
# treated as duplicated listings or malls that have ceased operations.
required_columns = ["5 stars"]
df_shopping_malls.dropna(subset=required_columns, inplace=True)

In [None]:
# Use quit() rather than close(): close() only closes the current browser
# window, while quit() ends the whole WebDriver session and shuts down the
# driver process, so nothing is left running once scraping is finished.
driver.quit()

## Redoing Geocoding of Shopping Malls

In [None]:
# Geocode every mall through the OneMap search endpoint using
# "<mall name> Singapore <postal code>" as the search value. The first result
# supplies the matched building, postal code and coordinates; malls for which
# the query comes back empty have their name printed.
onemap_result_fields = {
    "search_result_building": "BUILDING",
    "search_result_postal_code": "POSTAL",
    "latitude": "LATITUDE",
    "longitude": "LONGITUDE",
}

for idx, row in df_shopping_malls.iterrows():
    search_payload = {
        "searchVal": f"{row['mall_names']} Singapore {row['postal_code']}",
        "returnGeom": "Y",
        "getAddrDetails": "Y",
    }
    query = retrieve_from_onemap(onemap_token, "commonapi/search", payload=search_payload)

    if len(query) == 0:
        print(row['mall_names'])
        continue

    # Only the first entry of the response is used
    results = query.loc[0, "results"]
    for column, result_key in onemap_result_fields.items():
        df_shopping_malls.loc[idx, column] = results[result_key]

In [123]:
# Retry the OneMap search for malls that still have no matched building, this
# time using only "Singapore <postal code>" as the search value. The first
# result supplies the matched building, postal code and coordinates; malls for
# which the query comes back empty have their name printed.
onemap_result_fields = {
    "search_result_building": "BUILDING",
    "search_result_postal_code": "POSTAL",
    "latitude": "LATITUDE",
    "longitude": "LONGITUDE",
}

ungeocoded_malls = df_shopping_malls[df_shopping_malls["search_result_building"].isna()]

for idx, row in ungeocoded_malls.iterrows():
    search_payload = {
        "searchVal": f"Singapore {row['postal_code']}",
        "returnGeom": "Y",
        "getAddrDetails": "Y",
    }
    query = retrieve_from_onemap(onemap_token, "commonapi/search", payload=search_payload)

    if len(query) == 0:
        print(row['mall_names'])
        continue

    # Only the first entry of the response is used
    results = query.loc[0, "results"]
    for column, result_key in onemap_result_fields.items():
        df_shopping_malls.loc[idx, column] = results[result_key]

In [127]:
# Convert each per-star review count column to integers, one column at a time
for star_column in ("1 stars", "2 stars", "3 stars", "4 stars", "5 stars"):
    df_shopping_malls[star_column] = df_shopping_malls[star_column].astype(int)

In [130]:
# Derive two columns from the per-star review counts:
#   total_reviews - sum of the five per-star counts
#   rating        - review-weighted mean star value (weighted sum / total_reviews)
# Computed with whole-column arithmetic instead of a row-wise apply, so an empty
# frame or a mall whose total is zero does not raise (the latter gives NaN).
star_weights = {
    "1 stars": 1,
    "2 stars": 2,
    "3 stars": 3,
    "4 stars": 4,
    "5 stars": 5,
}

df_shopping_malls["total_reviews"] = sum(
    df_shopping_malls[column] for column in star_weights
)
weighted_star_sum = sum(
    df_shopping_malls[column] * weight for column, weight in star_weights.items()
)
df_shopping_malls["rating"] = weighted_star_sum / df_shopping_malls["total_reviews"]

In [132]:
# Remove, in place, the intermediate columns that should not be written back to
# the listing file. "cleaned_mall_names" is not created anywhere in the visible
# cells - presumably it comes from the input CSV (verify).
# Each column is listed exactly once.
helper_columns = [
    "cleaned_mall_names",
    "search_result_building",
    "search_result_postal_code",
]
df_shopping_malls.drop(columns=helper_columns, inplace=True)

In [135]:
# Write the cleaned table to the listing CSV (the same path it was loaded
# from, so the file is overwritten), leaving out the index column.
output_csv_path = "../data/shopping_mall_listing.csv"
df_shopping_malls.to_csv(output_csv_path, index=False)