# Finding the Restaurants that are within Shopping Malls
This notebook scrapes the list of shopping malls in Singapore, geocodes each mall, and then checks whether each restaurant is located inside one of those malls.

In [2]:
# conda install -c anaconda beautifulsoup4
from bs4 import BeautifulSoup, SoupStrainer
import requests
import re

import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np

from selenium import webdriver      # conda install -c conda-forge selenium
from time import sleep
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

import warnings
warnings.filterwarnings("ignore")

## 1. Web Scraping the list of Shopping Mall names from Wikipedia (using BeautifulSoup)

In [77]:
# Download the Wikipedia list page and parse only its <div class="div-col">
# sections, which is where the <li> scraping loop further down looks for names.
url = 'https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore'
# A timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
# Despite its name this is a parse filter (SoupStrainer), not a list of names.
list_of_mall_names = SoupStrainer('div', class_='div-col')
soup = BeautifulSoup(resp.text, 'html.parser', parse_only=list_of_mall_names)

In [78]:
# Holds the mall names scraped from the parsed Wikipedia page.
list_of_scraped_names = []

In [79]:
# Collect the text of every <li> in the parsed columns as a mall name.
# - Citation markers such as "[12]" are removed with re.sub, so every marker in
#   an entry is dropped, not only the first one found.
# - Surrounding whitespace is stripped for every entry, with or without a marker.
# - The list is rebuilt from scratch, so re-running this cell cannot append the
#   same names a second time.
list_of_scraped_names = [
    re.sub(r'\[\d+\]', '', li_tag.text).strip()
    for li_tag in soup.find_all('li')
]

In [80]:
# One row per distinct scraped name: the first occurrence is kept and the
# surviving rows keep their original row labels.
df_malls = pd.DataFrame({'mall_names': list_of_scraped_names}).drop_duplicates()

In [81]:
# Preview the de-duplicated mall names.
df_malls

Unnamed: 0,mall_names
0,100 AM
1,313@Somerset
2,Aperia
3,Balestier Hill Shopping Centre
4,Bugis Cube
...,...
157,Gek Poh Shopping Centre
158,Rochester Mall
159,Taman Jurong Shopping Centre
160,West Coast Plaza


## 2. Geocoding for Shopping Mall addresses

In [109]:
# Look up every mall on Google Maps with Selenium and store
# [latitude, longitude, address] per mall name in dct_info.
# NOTE(review): the find_element(s)_by_* helpers were removed in Selenium 4.3 and
# the CSS class names below are tied to the Google Maps markup at the time of
# writing -- confirm both still resolve before re-running this cell.
driver = webdriver.Chrome()
dct_info = {}

try:
    driver.get("https://www.google.com/maps")

    for mall in df_malls['mall_names']:
        input_element = driver.find_element_by_xpath('//*[@id="searchboxinput"]')
        input_element.clear()
        input_element.send_keys(f"{mall} location")
        input_element.send_keys(Keys.ENTER)
        sleep(5)  # wait for the search result to load

        try:
            # If this element is present, click it and wait again.
            # Presumably it is the first entry of a multi-result list -- verify.
            first_reco = driver.find_element_by_class_name("a4gq8e-aVTXAb-haAclf-jRmmHf-hSRGPd")
            first_reco.click()
            sleep(5)
            # Only malls for which that element was clicked are printed.
            print(mall)
        except Exception:
            # Best effort: the element may be absent or not clickable. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
            pass

        address_elements = driver.find_elements_by_class_name("QSFF4-text")
        address = address_elements[0].text if address_elements else ""

        # The coordinates are read from the "@<lat>,<lon>" part of the current URL.
        url_current = driver.current_url
        match = re.search(r'@(-?\d+\.?\d*),(-?\d+\.?\d*)', url_current)
        if match:
            lat = round(float(match.group(1)), 6)
            lon = round(float(match.group(2)), 6)
        else:
            # No coordinates in the URL: keep the mall with NaN coordinates so
            # later lookups by name still succeed and it can be fixed by hand.
            lat = lon = np.nan
            print(f"No coordinates found in URL for {mall!r}")
        dct_info[mall] = [lat, lon, address]
finally:
    # Always close the browser, even when the loop fails or is interrupted.
    driver.quit()

To manually verify:

Bugis Junction
Cathay Cineleisure Orchard
Duo
Holland Village Shopping Mall
ION Orchard
Knightsbridge
Lucky Plaza
Marina Bay Sands
Millenia Walk
People's Park Centre
Shaw House and Centre
Suntec City
The Poiz
United Square
Zhongshan Mall
Our Tampines Hub
Downtown East
Paya Lebar Quarter (PLQ)
Roxy Square
Loyang Point
Admiralty Place
Djitsun Mall
Sun Plaza
Vista Point
Rivervale Mall
Upper Serangoon Shopping Centre
Beauty World Plaza
Yew Tee Square
Boon Lay Shopping Centre
Fairprice Hub


In [127]:
# Hand-entered [latitude, longitude, address] values; each one replaces the
# existing entry for that mall name or adds it if it is not present yet.
dct_info.update({
    'Marina Bay Sands': [1.2840501, 103.858707, '10 Bayfront Ave, Singapore 018956'],
    'The South Beach': [1.2957395, 103.8544249, '38 Beach Rd, Singapore 189767'],
    'Northshore Plaza': [1.4168672, 103.8996155, '407 Northshore Dr, Singapore 820407'],
    'Bedok Point': [1.3249305, 103.9301924, 'SG, New Upper Changi Rd, 799, 467351'],
    'Jurong Point': [1.3397443, 103.704541, '1 Jurong West Central 2, Singapore 648886'],
    'Anchorpoint': [1.2887706, 103.8028875, '370 Alexandra Rd, Singapore 159953'],
    'Pioneer Mall': [1.3418699, 103.6951971, '638 Jurong West Street 61, Singapore 640638'],
    "People's Park Complex": [1.2841977, 103.8403601, '1 Park Rd, Singapore 059108'],
    'Orchard Plaza': [1.301077, 103.8387947, '150 Orchard Rd, Orchard Plaza, Singapore 238841'],
    'Tekka Centre': [1.306256, 103.848471, '665 Buffalo Road, Singapore 210665'],
})

In [128]:
# Every dct_info value is laid out as [latitude, longitude, address] (that is how
# both the geocoding loop and the manual overrides fill it), so each column must
# read its own slot: 0 -> latitude, 1 -> longitude, 2 -> address.
# Columns are created in the order latitude, longitude, address.
df_malls['latitude'] = df_malls['mall_names'].apply(lambda x: dct_info[x][0])
df_malls['longitude'] = df_malls['mall_names'].apply(lambda x: dct_info[x][1])
df_malls['address'] = df_malls['mall_names'].apply(lambda x: dct_info[x][2])

In [34]:
# Save the mall table; 'utf-8-sig' writes a UTF-8 byte-order mark at the start of
# the file and index=False leaves the row labels out.
df_malls.to_csv('data/singapore_mall_locations.csv', encoding='utf-8-sig', index=False)

## 3. Use Regular Expressions to identify Restaurants in Malls
Based on address and postal code

In [3]:
# Reload the mall table from the CSV path written in the previous section;
# this replaces the in-memory df_malls.
df_malls = pd.read_csv('data/singapore_mall_locations.csv')

In [6]:
# Preview the mall table.
# NOTE(review): the saved output of this cell lists postal_code and
# cleaned_mall_names, which are only created by later cells -- it was presumably
# captured after those cells had run; a fresh top-to-bottom run will not show
# them here.
df_malls

Unnamed: 0,mall_names,latitude,longitude,address,postal_code,cleaned_mall_names
0,100 AM,1.275012,103.841382,"100 Tras St, Singapore 079027",079027,100 am
1,313@Somerset,1.300903,103.836178,"313 Orchard Rd, Singapore 238895",238895,313@somerset
2,Aperia,1.310463,103.861984,"12 Kallang Ave, Singapore 339511",339511,aperia
3,Balestier Hill Shopping Centre,1.325860,103.840678,"2 Balestier Rd, Singapore 320002",320002,balestier hill shopping centre
4,Bugis Cube,1.298197,103.853474,"470 North Bridge Rd, Singapore 188735",188735,bugis cube
...,...,...,...,...,...,...
156,Gek Poh Shopping Centre,1.348765,103.695288,"762 Jurong West Street 75, Singapore 640762",640762,gek poh shopping centre
157,Rochester Mall,1.305584,103.786017,"35 Rochester Dr, Singapore 138639",138639,rochester mall
158,Taman Jurong Shopping Centre,1.334648,103.718311,"399 Yung Sheng Rd, Singapore 610399",610399,taman jurong shopping centre
159,West Coast Plaza,1.303707,103.763748,"154 West Coast Rd, Singapore 127371",127371,west coast plaza


In [4]:
# Extract the postal code: the first run of six digits in the address, optionally
# written as two groups of three separated by whitespace. A single vectorised
# regex pass replaces two re.search calls per row, and an address that is missing
# or contains no such run yields NaN instead of raising.
df_malls['postal_code'] = df_malls['address'].str.extract(r'(\d{3}\s*\d{3})', expand=False)

In [5]:
def split_sticked_postal_code(x):
    """Insert a space in the middle of the first 6-digit run found in ``x``.

    For example ``'singapore 238895'`` becomes ``'singapore 238 895'``. The first
    token containing at least six consecutive digits is located and split before
    its last three digits; every occurrence of that token in ``x`` is replaced.

    Args:
        x: Text to normalise. Non-string values (e.g. NaN or None for a missing
            address or postal code) are returned unchanged.

    Returns:
        The text with the token split, or ``x`` itself when nothing matches.
    """
    if not isinstance(x, str):
        return x

    # \d{3} must sit outside a character class: '[\d{3}]' matches one character
    # only, which split any 4-digit number (such as a block number) and then
    # left the real postal code untouched.
    match = re.search(r'(\w*\d{3})(\d{3}\w*)', x)
    if match is None:
        return x

    first, second = match.groups()
    return x.replace(first + second, f'{first} {second}')

In [6]:
# Restaurant records; the cells below read its 'address' and 'url' columns.
df_restaurant = pd.read_csv("data/full_restaurant_rating_data.csv")

### Check if a Mall Name Exists in the Restaurant Address

In [7]:
# Restaurant side: lower-case the address, then space out a joined postal code.
df_restaurant['cleaned_address'] = (
    df_restaurant['address'].str.lower().apply(split_sticked_postal_code)
)

# Mall side: lower-cased names for name matching, and postal codes normalised
# with the same helper so both sides use the same spaced form.
df_malls['cleaned_mall_names'] = df_malls['mall_names'].str.lower()
df_malls['postal_code'] = df_malls['postal_code'].apply(split_sticked_postal_code)

In [8]:
def create_dict_from_df(df, col_key, col_value):
    """Build a ``{key: value}`` dict from two columns of ``df``.

    Duplicate (key, value) rows are collapsed first. If one key still appears
    with several different values, the value from the last such row is kept.
    The input frame is not modified.
    """
    unique_pairs = df[[col_key, col_value]].drop_duplicates()
    lookup = unique_pairs.set_index([col_key])[col_value]
    return lookup.to_dict()

In [9]:
# Lookup from mall postal code to mall name, used for postal-code matching below.
dct_postal_code_mall = create_dict_from_df(df_malls, 'postal_code', 'mall_names')

In [10]:
def matches_shopping_mall(mall, restaurant):
    """Return True when the restaurant address appears to mention the mall.

    Args:
        mall: Lower-cased mall name.
        restaurant: Lower-cased restaurant address. A non-string value (such as
            NaN for a missing address) never matches.

    Returns:
        bool: result of the matching rule that applies to ``mall``:
        * 'nex', 'imm', 'duo' must appear as a whole word;
        * 'jurong point', 'singpost centre' and any name containing a digit
          must appear verbatim as a substring;
        * every other name must score above 90 with FuzzyWuzzy's partial ratio.
    """
    if not isinstance(restaurant, str):
        return False

    if mall in ('nex', 'imm', 'duo'):
        # Short names must be a standalone word, not part of a longer one such
        # as 'next'. Word boundaries (rather than splitting on spaces) also
        # accept the name when it is followed by punctuation, e.g. 'nex,'.
        return re.search(rf'\b{re.escape(mall)}\b', restaurant) is not None

    if mall in ('jurong point', 'singpost centre') or re.search(r'\d', mall):
        # Mall names that are harder for FuzzyWuzzy to identify: exact substring.
        return mall in restaurant

    # FuzzyWuzzy's partial ratio returns a string-matching score, which accounts
    # for different variations of a mall name.
    return fuzz.partial_ratio(mall, restaurant) > 90

In [11]:
# Flag each restaurant whose address mentions a mall, by mall name or by the
# mall's postal code, and record the matched mall (a name match takes priority).
#
# Results are collected in lists and assigned once, so both columns always exist
# (even when nothing matches or the frame is empty) and no per-cell .loc writes
# happen inside the loop.

# Missing mall names / postal codes are NaN (floats); `nan in "text"` raises
# TypeError, so only string candidates are considered.
mall_name_candidates = [k for k in df_malls['cleaned_mall_names'] if isinstance(k, str)]
postal_code_candidates = [k for k in df_malls['postal_code'] if isinstance(k, str)]

is_in_mall_flags = []
matched_malls = []

for x in df_restaurant['cleaned_address']:
    if isinstance(x, str):
        name_match = [k for k in mall_name_candidates if matches_shopping_mall(k, x)]
        postal_code_match = [dct_postal_code_mall[k] for k in postal_code_candidates if k in x]
    else:
        # A missing address cannot match any mall.
        name_match, postal_code_match = [], []

    # Stored as 0.0 / 1.0 (float), the same form the saved output below displays.
    is_in_mall_flags.append(1.0 if (name_match or postal_code_match) else 0.0)

    if name_match:
        matched_malls.append(name_match[0].title())
    elif postal_code_match:
        matched_malls.append(postal_code_match[0].title())
    else:
        matched_malls.append(np.nan)

df_restaurant['is_in_mall'] = is_in_mall_flags
df_restaurant['mall'] = matched_malls

## 4. Inspect New Features Generated

In [18]:
# Keep the restaurant URL together with the two new mall columns.
df_subset = df_restaurant[['url', 'is_in_mall', 'mall']]

In [19]:
# Preview the generated features.
df_subset

Unnamed: 0,url,is_in_mall,mall
0,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,
1,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,
2,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,
3,https://www.tripadvisor.com.sg/Restaurant_Revi...,1.0,Marina Square
4,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,
...,...,...,...
11156,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,
11157,https://www.tripadvisor.com.sg/Restaurant_Revi...,1.0,Jcube
11158,https://www.tripadvisor.com.sg/Restaurant_Revi...,1.0,Yew Tee Point
11159,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,


In [None]:
# Export the mall features; 'utf-8-sig' writes a UTF-8 byte-order mark at the
# start of the file and index=False leaves the row labels out.
df_subset.to_csv('data/restaurant_is_in_mall.csv', index=False, encoding='utf-8-sig')