# Finding the Restaurants that are within Shopping Malls

In [1]:
# conda install -c anaconda beautifulsoup4
from bs4 import BeautifulSoup, SoupStrainer
import requests
import re

import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np

## 1. Web Scraping the list of Shopping Mall names from Wikipedia (using BeautifulSoup)

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore'
resp = requests.get(url)
list_of_mall_names = SoupStrainer('div', class_='div-col')
soup = BeautifulSoup(resp.text, 'html.parser', parse_only=list_of_mall_names)

In [3]:
list_of_scraped_names = []

In [4]:
for li_tag in soup.find_all('li'):
    li_tag_text = li_tag.text
    extra_bracket_match = re.search('\[\d*\d]', li_tag_text)
    if extra_bracket_match:
        extra_bracket = extra_bracket_match.group()
        li_tag_text = li_tag_text.replace(extra_bracket, '').strip()
    list_of_scraped_names.append(li_tag_text)

In [5]:
df_malls = pd.DataFrame({'mall_names': list_of_scraped_names})
df_malls.drop_duplicates(inplace=True)

In [6]:
df_malls

Unnamed: 0,mall_names
0,100 AM
1,313@Somerset
2,Aperia
3,Balestier Hill Shopping Centre
4,Bugis Cube
...,...
157,Gek Poh Shopping Centre
158,Rochester Mall
159,Taman Jurong Shopping Centre
160,West Coast Plaza


## 2. Use Regular Expressions to identify Restaurants in Malls

In [7]:
# function keep here in case need (for now useless)
def split_sticked_words(x):
    match = re.search('(\w*[a-z])([A-Z]\w*)', x)
    if match:
        print(match)
        first = match.groups()[0]
        second = match.groups()[1]
        new = first + ' ' + second
        x = x.replace(first+second, new)
        return x
    return x

In [8]:
df_restaurant = pd.read_csv("data/full_restaurant_rating_data.csv")

### Check if Mall Name Exist in Restaurant

In [9]:
def matches_shopping_mall(mall, restaurant):
    malls_to_fully_match = ['nex', 'imm', 'duo', 'jurong point', 'singpost centre']
    if mall in ['nex', 'imm', 'duo']:
        ### for these mall names, require them to be an individual word (not part of a bigger word like 'next' containing 'nex')
        tokens = restaurant.split(' ')
        return any([k == mall for k in tokens])
    elif mall in ['jurong point', 'singpost centre'] or re.search('\d+', mall):
        # mall names that is harder to be identified by FuzzyWuzzy
        return mall in restaurant
    else:
        # FuzzyWuzzy partio ratio returns String Matching scores (account for different variations of a mall name)
        return fuzz.partial_ratio(mall, restaurant) > 90

In [10]:
df_restaurant['cleaned_address'] = df_restaurant['address'].str.lower()
df_malls['cleaned_mall_names'] = df_malls['mall_names'].str.lower()
df_restaurant['is_in_mall'] = df_restaurant['cleaned_address'].apply(lambda x: any([k in x for k in df_malls['cleaned_mall_names'] if matches_shopping_mall(k, x)]))
df_restaurant['potential_malls'] = df_restaurant['cleaned_address'].apply(lambda x: [k for k in df_malls['cleaned_mall_names'] if matches_shopping_mall(k, x)])

In [11]:
df_restaurant['mall'] = df_restaurant['potential_malls'].apply(
    lambda x: x[0].title() if len(x) > 0 else np.nan
)

In [12]:
df_restaurant[['address', 'url', 'is_in_mall', 'mall']]

Unnamed: 0,address,url,is_in_mall,mall
0,"60 Robertson Quay The Quayside 01-05, Singapor...",https://www.tripadvisor.com.sg/Restaurant_Revi...,False,
1,"27 Seah Street # 01-01, Singapore 188383 Singa...",https://www.tripadvisor.com.sg/Restaurant_Revi...,False,
2,1 Fullerton Square Fullerton Hotel The Fullert...,https://www.tripadvisor.com.sg/Restaurant_Revi...,False,
3,6 Raffles Boulevard Marina Square Level 4 At P...,https://www.tripadvisor.com.sg/Restaurant_Revi...,True,Marina Square
4,"390 Havelock Road King's Centre, Singapore 169...",https://www.tripadvisor.com.sg/Restaurant_Revi...,False,
...,...,...,...,...
11156,"529 Ang Mo Kio Ave10 #01-2359, Singapore 56052...",https://www.tripadvisor.com.sg/Restaurant_Revi...,False,
11157,"#02-65 Jcube, 2 Jurong East Central Singapore,...",https://www.tripadvisor.com.sg/Restaurant_Revi...,True,Jcube
11158,"#B1-41, 21 Choa Chu Kang North 6 (Yew Tee Poin...",https://www.tripadvisor.com.sg/Restaurant_Revi...,True,Yew Tee Point
11159,"Blk 29, 378 Alexandra Road, Singapore 159964 S...",https://www.tripadvisor.com.sg/Restaurant_Revi...,False,


In [31]:
df_restaurant[['address', 'url', 'is_in_mall', 'mall']].to_csv('data/restaurant_is_in_mall.csv', index=False, encoding='utf-8-sig')