# Capstone: SG Food Recommender
Author: Choo Wende

---

In [1]:
# pip install webdriver-manager

## Libraries Import

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import json
import requests
import re

from bs4 import BeautifulSoup
from datetime import datetime

import time
import re
# from collections import defaultdict
# import pickle
import random

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

## Web Scraping Yelp

### Restaurant Info

In [3]:
# Web page used is from yelp.com that is filtered by 99 neighbourhoods.
# Target web page:
url='https://www.yelp.com/search?find_desc=Restaurants&find_loc=Singapore&l=p%3ASG-SG%3ASingapore%3A%3A%5BAlexandra%2CAng_Mo_Kio%2CAnn_Siang_Hill%2CArab_Street%2CBayfront%2CBedok_North%2CBedok_Reservoir%2CBedok_South%2CBencoolen%2CBishan%2CBoat_Quay%2CBoon_Keng%2CBoon_Lay%2CBras_Brasah%2CBuangkok%2CBugis%2CBukit_Batok%2CBukit_Panjang%2CBukit_Timah%2CChangi%2CChinatown%2CChoa_Chu_Kang%2CCity_Hall%2CClarke_Quay%2CClementi%2CDempsey_Hill%2CDhoby_Ghaut%2CDover%2CDuxton_Hill%2CEunos%2CFarrer_Park%2CGeylang%2CGhim_Moh%2CHarbourfront%2CHolland_Hill%2CHolland_Village%2CHougang%2CJoo_Chiat%2CJurong%2CJurong_Island%2CKallang%2CKatong%2CKembangan%2CKent_Ridge%2CKeppel%2CLabrador_Park%2CLavender%2CLim_Chu_Kang%2CLittle_India%2CMacpherson%2CMandai%2CMarine_Parade%2CMount_Sophia%2CMountbatten%2CNewton%2CNovena%2COrchard%2COutram%2CPasir_Panjang%2CPasir_Ris%2CPaya_Lebar%2CPotong_Pasir%2CPulau_Ubin%2CPunggol%2CQueenstown%2CRaffles_Place%2CRedhill%2CRiver_Valley%2CRobertson_Quay%2CSeletar%2CSembawang%2CSengkang%2CSentosa%2CSerangoon%2CSerangoon_Gardens%2CSiglap%2CSimei%2CSixth_Avenue%2CSomerset%2CTampines%2CTanglin%2CTanglin_Halt%2CTanjong_Pagar%2CTanjong_Rhu%2CTelok_Blangah%2CTelok_Kurau%2CThomson%2CTiong_Bahru%2CToa_Payoh%2CTuas%2CUbi%2CUlu_Pandan%2CUpper_Bukit_Timah%2CWessex_Estate%2CWest_Coast%2CWoodlands%2CYio_Chu_Kang%2CYishun%2Cone-north%5D'

In [4]:
# Establish the connection to the web page.
# A timeout is set so that a stalled server cannot hang the kernel indefinitely
# (requests waits forever when no timeout is given).
res = requests.get(url, timeout=30)
res.status_code

200

In [5]:
res.text[:500]

'<!DOCTYPE html><html lang="en-US" prefix="og: http://ogp.me/ns#" style="margin: 0;padding: 0; border: 0; font-size: 100%; font: inherit; vertical-align: baseline;"><head><script>document.documentElement.className=document.documentElement.className.replace(/\x08no-js\x08/,"js");</script><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="Content-Language" content="en-US" /><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link rel='

In [6]:
soup = BeautifulSoup(res.content, 'lxml')

**Extract Neighbourhood**

In [7]:
# Collect the Singapore neighbourhood names listed on the Yelp page,
# skipping the ' • ' entries that sit between them.
hood_container = soup.find('div', {'class':'transitionGroup__09f24__nJIf6'})
neighbourhoods = [child.text for child in hood_container if child.text != ' • ']


# Show the last five entries collected
neighbourhoods[-5:]

['Newton', 'Seletar', 'Paya Lebar', 'Macpherson', "Bird's-eye View"]

In [8]:
# Drop the "Bird's-eye View" entry, which is not a neighbourhood.
# Filtering (rather than list.remove) keeps this cell safe to re-run:
# remove() raises ValueError once the entry is already gone.
neighbourhoods = [hood for hood in neighbourhoods if hood != "Bird's-eye View"]

In [9]:
# Map every neighbourhood (in alphabetical order) to its list of Yelp search-result
# urls: the first results page followed by the pages at start=10, 20, ..., 230.
neighbourhood_urls = {}

for hood in sorted(neighbourhoods):
    text = hood.replace(' ', '+')

    first_page = f'https://www.yelp.com/search?find_desc=Restaurants&find_loc={text}%2C+Singapore'
    later_pages = [
        f'https://www.yelp.com/search?find_desc=Restaurants&find_loc={text}%2C+Singapore&start={offset}'
        for offset in range(10, 240, 10)
    ]

    neighbourhood_urls[hood] = [first_page] + later_pages

In [10]:
# Display first 5 urls for each neighbourhood
pd.DataFrame(neighbourhood_urls).head()

Unnamed: 0,Alexandra,Ang Mo Kio,Ann Siang Hill,Arab Street,Bayfront,Bedok North,Bedok Reservoir,Bedok South,Bencoolen,Bishan,...,Tuas,Ubi,Ulu Pandan,Upper Bukit Timah,Wessex Estate,West Coast,Woodlands,Yio Chu Kang,Yishun,one-north
0,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...
1,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...
2,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...
3,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...
4,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...,https://www.yelp.com/search?find_desc=Restaura...


**Extract Restaurant Info**

In [27]:
def get_soup(url, timeout=30):
    """Fetch ``url`` and parse the response body with BeautifulSoup's lxml parser.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error (default 30). Without it a stalled server blocks the scrape forever.

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed document when the server answers with status 200.
        For any other status the code is printed and ``None`` is returned,
        so callers must check the result before using it.
    """
    # The body is read in full straight away, so the response is not streamed:
    # a streamed response whose body is never read (the non-200 case) would keep
    # its connection checked out of the pool.
    res = requests.get(url, timeout=timeout)

    if res.status_code == 200:
        return BeautifulSoup(res.content, 'lxml')

    print(f'Res Status Code: {res.status_code}')
    return None


In [13]:
def get_data(hood, url):
    """Scrape one Yelp search-result page into a DataFrame of restaurants.

    Parameters
    ----------
    hood : str
        Neighbourhood label stored in the ``neighbourhood`` column of every row.
    url : str
        Search-result page to scrape (printed for progress tracking).

    Returns
    -------
    pandas.DataFrame
        One row per listing found on the page, with the columns
        ``neighbourhood``, ``name``, ``href``, ``img``, ``star_rating``,
        ``review_count``, ``location``, ``category`` and ``price_range``.
        ``category`` is None when the listing has no category element and
        ``price_range`` is None when it has no price element.

    Raises
    ------
    RuntimeError
        If the page could not be fetched (``get_soup`` returned None).
    """
    print(url)
    soup = get_soup(url)
    if soup is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise RuntimeError(f'Could not fetch search page: {url}')

    # Every element list is queried once, up front; entry i of each list is
    # taken to describe the i-th listing on the page.

    # restaurants details
    biz = soup.find_all('span', {'class':'css-1egxyvc'})

    # image
    img = soup.find_all('img', {'class':'css-xlzvdl'})

    # rating
    rating_class = 'attribute__09f24__hqUj7 display--inline-block__09f24__fEDiJ margin-r1__09f24__rN_ga border-color--default__09f24__NPAKY'
    ratings = soup.find_all('div', {'class':rating_class})

    # review count
    review_count = soup.find_all('span', {'class':'reviewCount__09f24__tnBk4 css-chan6m'})

    # The same <p> elements are searched for both the location and the price range.
    info = soup.find_all('p', {'class':'css-dzq7l1'})

    # category
    category = soup.find_all('span', {'class':'css-epvm6 display--inline__09f24__c6N_k border-color--default__09f24__NPAKY'})

    restaurants = []

    for i in range(len(biz)):

        restaurant = {}

        # biz info
        restaurant['neighbourhood'] = hood
        restaurant['name'] = biz[i].a['name']
        href = biz[i].a['href']
        restaurant['href'] = f'www.yelp.com{href}'

        # image info
        restaurant['img'] = img[i]['src']

        # rating info
        restaurant['star_rating'] = ratings[i].div['aria-label'].replace(' star rating','')

        # review count info
        restaurant['review_count'] = review_count[i].text

        # location info
        restaurant['location'] = info[i].find('span',{'class':'css-chan6m'}).text

        # category info: None when there is no category element for this listing
        if i < len(category):
            cats = category[i].find_all('span', {'class':'css-11bijt4'})
            restaurant['category'] = [cat.text for cat in cats]
        else:
            restaurant['category'] = None

        # price range info: the number of characters in the price text,
        # or None when the listing shows no price element
        price = info[i].find('span', {'class':'priceRange__09f24__mmOuH'})
        restaurant['price_range'] = len(price.text) if price is not None else None

        restaurants.append(restaurant)

    return pd.DataFrame(restaurants)

```python
# The following code scrapes Yelp for restaurant info, one neighbourhood search at a time.
# A total of 99 neighbourhoods were available to search on Yelp.
# The code is kept in a Markdown cell (not executed) to prevent re-running it, as the scrape takes a long time.

# Create restaurant dataframe. 
rest_data = pd.DataFrame()

for hood, urls in list(neighbourhood_urls.items()):
    print(f'\nScraping from {hood}:')
    for i in range(len(urls)):
        hood_data = get_data(hood, urls[i])
        rest_data=pd.concat([rest_data, hood_data]).reset_index(drop=True)
        
        if i%np.random.randint(8,11)==0:
            # generate a random sleep duration to manage number of requests per second
            sleep_duration = np.random.randint(5,12)
            print(f'Sleep: {sleep_duration}s')
            time.sleep(sleep_duration)
    text=hood.replace(' ', '_')
    rest_data.to_csv(f'../data/neighbourhoods/{text}.csv', index=False)
    rest_data = pd.DataFrame()
    
    # generate a random sleep duration to manage number of requests per second.
    # Sleep duration is set at such long interval 
    # because shorter intervals will face interruption by the server.
    sleep_duration = np.random.randint(60,80)
    print(f'Sleep: {sleep_duration}s')
    time.sleep(sleep_duration)
```

In [14]:
# Merge the per-neighbourhood CSV files into one restaurant dataframe.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop copies all rows accumulated so far on every iteration.
hood_frames = []

for hood in neighbourhood_urls:

    text = hood.replace(' ', '_')
    hood_frames.append(pd.read_csv(f'../data/neighbourhoods/{text}.csv'))

# pd.concat raises on an empty list, so fall back to an empty frame.
rest_df = pd.concat(hood_frames, ignore_index=True) if hood_frames else pd.DataFrame()

# Display the restaurant dataframe (the notebook shows a truncated view of it).
rest_df

Unnamed: 0,neighbourhood,name,href,img,star_rating,review_count,location,category,price_range
0,Alexandra,The Naked Finn,www.yelp.com/biz/the-naked-finn-singapore?osq=...,https://s3-media0.fl.yelpcdn.com/bphoto/ScSHvC...,4.5,19,Alexandra,['Seafood'],3.0
1,Alexandra,Soi Thai Soi Nice,www.yelp.com/biz/soi-thai-soi-nice-singapore?o...,https://s3-media0.fl.yelpcdn.com/bphoto/qmndjm...,4.5,2,Alexandra,['Thai'],
2,Alexandra,The Gogi,www.yelp.com/biz/the-gogi-singapore?osq=Restau...,https://s3-media0.fl.yelpcdn.com/bphoto/srghHe...,5.0,1,Alexandra,"['Korean', 'Barbeque']",
3,Alexandra,Keng Eng Kee Seafood,www.yelp.com/biz/keng-eng-kee-seafood-singapor...,https://s3-media0.fl.yelpcdn.com/bphoto/1TziiZ...,4.0,22,Alexandra,['Seafood'],2.0
4,Alexandra,Burger Barn,www.yelp.com/biz/burger-barn-singapore?osq=Res...,https://s3-media0.fl.yelpcdn.com/bphoto/ABw7yU...,4.0,4,Alexandra,['Burgers'],1.0
...,...,...,...,...,...,...,...,...,...
14946,one-north,The World is Flat,www.yelp.com/biz/the-world-is-flat-singapore?o...,https://s3-media0.fl.yelpcdn.com/bphoto/JZgM6o...,3.0,1,one-north,"['Pizza', 'Asian Fusion', 'Sandwiches']",
14947,one-north,Food Canopy,www.yelp.com/biz/food-canopy-singapore?osq=Res...,https://s3-media0.fl.yelpcdn.com/bphoto/Z12m7m...,3.0,1,Bukit Timah,['Food Court'],
14948,one-north,Thaksin Beef Noodle,www.yelp.com/biz/thaksin-beef-noodle-singapore...,https://s3-media0.fl.yelpcdn.com/bphoto/wvQ5wj...,4.5,2,West Coast,['Chinese'],1.0
14949,one-north,The Bakery By Woodlands Sourdough,www.yelp.com/biz/the-bakery-by-woodlands-sourd...,https://s3-media0.fl.yelpcdn.com/bphoto/q0HOoI...,5.0,1,Bukit Timah,"['Bakeries', 'Pizza']",


In [15]:
# Display summary info.
rest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14951 entries, 0 to 14950
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   neighbourhood  14951 non-null  object 
 1   name           14951 non-null  object 
 2   href           14951 non-null  object 
 3   img            14951 non-null  object 
 4   star_rating    14951 non-null  float64
 5   review_count   14951 non-null  int64  
 6   location       14523 non-null  object 
 7   category       14939 non-null  object 
 8   price_range    7959 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 1.0+ MB


In [16]:
# Display numeric columns summary info
rest_df.describe()

Unnamed: 0,star_rating,review_count,price_range
count,14951.0,14951.0,7959.0
mean,4.021972,8.622768,1.932529
std,0.717602,21.443366,0.889946
min,1.0,1.0,1.0
25%,3.5,1.0,1.0
50%,4.0,3.0,2.0
75%,4.5,8.0,2.0
max,5.0,385.0,4.0


In [17]:
# Number of unique restaurants
len(rest_df['href'].unique())

4026

In [18]:
# show number of duplicates
rest_df.duplicated().value_counts()

False    14411
True       540
dtype: int64

### Restaurant Review

In [19]:
# Build the table of unique restaurants: drop the 'neighbourhood' and 'location'
# columns, then keep the first row seen for each 'href'.
rest_url = (
    rest_df
    .drop(columns=['neighbourhood', 'location'])
    .drop_duplicates(subset=['href'])
    .reset_index(drop=True)
)

# Preview the first 5 rows
rest_url.head()

Unnamed: 0,name,href,img,star_rating,review_count,category,price_range
0,The Naked Finn,www.yelp.com/biz/the-naked-finn-singapore?osq=...,https://s3-media0.fl.yelpcdn.com/bphoto/ScSHvC...,4.5,19,['Seafood'],3.0
1,Soi Thai Soi Nice,www.yelp.com/biz/soi-thai-soi-nice-singapore?o...,https://s3-media0.fl.yelpcdn.com/bphoto/qmndjm...,4.5,2,['Thai'],
2,The Gogi,www.yelp.com/biz/the-gogi-singapore?osq=Restau...,https://s3-media0.fl.yelpcdn.com/bphoto/srghHe...,5.0,1,"['Korean', 'Barbeque']",
3,Keng Eng Kee Seafood,www.yelp.com/biz/keng-eng-kee-seafood-singapor...,https://s3-media0.fl.yelpcdn.com/bphoto/1TziiZ...,4.0,22,['Seafood'],2.0
4,Burger Barn,www.yelp.com/biz/burger-barn-singapore?osq=Res...,https://s3-media0.fl.yelpcdn.com/bphoto/ABw7yU...,4.0,4,['Burgers'],1.0


In [20]:
# Prefix 'http://' to every href that does not already start with a scheme.
# The check keeps the cell idempotent: re-running it no longer produces
# 'http://http://www.yelp.com/...'.
has_scheme = rest_url['href'].str.match(r'https?://', na=False)
rest_url.loc[~has_scheme, 'href'] = 'http://' + rest_url.loc[~has_scheme, 'href']

In [21]:
# Display restaurants with high review count
rest_url[rest_url['review_count']>100]

Unnamed: 0,name,href,img,star_rating,review_count,category,price_range
454,Tian Tian Hainanese Chicken Rice,http://www.yelp.com/biz/tian-tian-hainanese-ch...,https://s3-media0.fl.yelpcdn.com/bphoto/UJ5Kp3...,4.0,385,"['Hainan', 'Chicken Shop']",1.0
515,Jumbo Seafood,http://www.yelp.com/biz/jumbo-seafood-singapor...,https://s3-media0.fl.yelpcdn.com/bphoto/85hCRe...,4.0,188,['Seafood'],3.0
557,Song Fa Bak Kut Teh,http://www.yelp.com/biz/song-fa-bak-kut-teh-si...,https://s3-media0.fl.yelpcdn.com/bphoto/cgE6mZ...,4.0,136,"['Singaporean', 'Chinese']",1.0
667,Liao Fan Hawker Chan,http://www.yelp.com/biz/liao-fan-hawker-chan-s...,https://s3-media0.fl.yelpcdn.com/bphoto/LMpD02...,4.0,140,"['Noodles', 'Chicken Shop']",1.0
672,CÉ LA VI Restaurant,http://www.yelp.com/biz/c%C3%A9-la-vi-restaura...,https://s3-media0.fl.yelpcdn.com/bphoto/Y9T9Iu...,3.0,134,"['Bars', 'Asian Fusion']",3.0
750,Din Tai Fung,http://www.yelp.com/biz/din-tai-fung-singapore...,https://s3-media0.fl.yelpcdn.com/bphoto/fxsbP8...,4.5,132,"['Taiwanese', 'Dim Sum', 'Dumplings']",2.0


In [22]:
# Generate the urls used to extract reviews.

def build_review_urls(href, review_count, page_size=10):
    """Return the review-page urls of one restaurant as a space-separated string.

    Parameters
    ----------
    href : str
        Url of the restaurant page; it is also the first review page.
    review_count : int
        Number of reviews the restaurant has.
    page_size : int, optional
        Step of the ``start`` offset between consecutive pages (default 10).

    Returns
    -------
    str
        ``href`` followed by ``href&start=<page_size>``, ``href&start=<2*page_size>``, ...
        for as many pages as ``review_count`` needs. With ``page_size`` or fewer
        reviews the result is just ``href``.
    """
    # Ceiling division: number of pages needed to hold review_count reviews.
    pages = -(-int(review_count) // page_size)

    page_urls = [href]
    page_urls += [f'{href}&start={start}' for start in range(page_size, pages * page_size, page_size)]
    return ' '.join(page_urls)


# Build the column in one pass so it is created with string values from the start
# (no float NaN placeholder column that is overwritten row by row).
rest_url['review_url'] = [
    build_review_urls(href, count)
    for href, count in zip(rest_url['href'], rest_url['review_count'])
]

In [23]:
# Check dataframe and column created.
rest_url.head()

Unnamed: 0,name,href,img,star_rating,review_count,category,price_range,review_url
0,The Naked Finn,http://www.yelp.com/biz/the-naked-finn-singapo...,https://s3-media0.fl.yelpcdn.com/bphoto/ScSHvC...,4.5,19,['Seafood'],3.0,http://www.yelp.com/biz/the-naked-finn-singapo...
1,Soi Thai Soi Nice,http://www.yelp.com/biz/soi-thai-soi-nice-sing...,https://s3-media0.fl.yelpcdn.com/bphoto/qmndjm...,4.5,2,['Thai'],,http://www.yelp.com/biz/soi-thai-soi-nice-sing...
2,The Gogi,http://www.yelp.com/biz/the-gogi-singapore?osq...,https://s3-media0.fl.yelpcdn.com/bphoto/srghHe...,5.0,1,"['Korean', 'Barbeque']",,http://www.yelp.com/biz/the-gogi-singapore?osq...
3,Keng Eng Kee Seafood,http://www.yelp.com/biz/keng-eng-kee-seafood-s...,https://s3-media0.fl.yelpcdn.com/bphoto/1TziiZ...,4.0,22,['Seafood'],2.0,http://www.yelp.com/biz/keng-eng-kee-seafood-s...
4,Burger Barn,http://www.yelp.com/biz/burger-barn-singapore?...,https://s3-media0.fl.yelpcdn.com/bphoto/ABw7yU...,4.0,4,['Burgers'],1.0,http://www.yelp.com/biz/burger-barn-singapore?...


In [24]:
# Check column created for high review counts.
rest_url.loc[3]['review_url']

'http://www.yelp.com/biz/keng-eng-kee-seafood-singapore?osq=Restaurants http://www.yelp.com/biz/keng-eng-kee-seafood-singapore?osq=Restaurants&start=10 http://www.yelp.com/biz/keng-eng-kee-seafood-singapore?osq=Restaurants&start=20'

In [None]:
# Define function (and its helpers) to scrape reviews.
def _onemap_search_url(search_val):
    """Return the OneMap search url for ``search_val``, percent-encoded for the query string."""
    from urllib.parse import quote

    # Addresses can contain characters such as '#' or '&' that would otherwise
    # cut the query string short.
    return f'https://developers.onemap.sg/commonapi/search?searchVal={quote(search_val)}&returnGeom=Y&getAddrDetails=Y&pageNum=1'


def _first_onemap_result(onemap):
    """Return the first entry of 'results' from a OneMap response fetched with get_soup.

    The JSON text is read from the first <p> tag of the parsed document.
    Raises if the response is missing, is not valid JSON or has no results.
    """
    return json.loads(onemap.p.text)['results'][0]


def get_reviews(rest_name, url):
    """Scrape one Yelp review page of a restaurant into a DataFrame.

    Parameters
    ----------
    rest_name : str
        Restaurant name, stored in the ``rest_name`` column of every row.
    url : str
        Review page to scrape (printed for progress tracking).

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``rest_name``, ``address``,
        ``postal_code``, ``latitude``, ``longitude``, ``opening``, ``username``,
        ``userid``, ``user_location``, ``user_rating``, ``review_date`` and
        ``comment``. The restaurant-level values are repeated on every row.
        ``postal_code``, ``latitude`` and ``longitude`` are NaN when they
        cannot be determined; ``opening`` is an empty list when the page has
        no opening-hours table.

    Raises
    ------
    Exception
        Errors caused by a page that could not be fetched or that lacks the
        address or review elements are not handled here and propagate to the caller.
    """
    print(url)
    soup = get_soup(url)

    # restaurant address info
    address_spans = soup.find('address').find_all('span', {'class':'raw__09f24__T4Ezm'})
    add = ''.join(span.text + ' ' for span in address_spans)
    alt_add = address_spans[0].text

    # Get postal code and latitude, longitude via onemap api.
    # Start from NaN so each value is defined whichever lookup succeeds or fails.
    postal_code = np.nan
    latitude = np.nan
    longitude = np.nan

    try:
        # First choice: search OneMap by the 6-digit postal code found in the address.
        postal_code = re.search(r'\d{6}', add).group()
        result = _first_onemap_result(get_soup(_onemap_search_url(postal_code)))
        latitude = result['LATITUDE']
        longitude = result['LONGTITUDE']

    except Exception:
        # Fallback: search OneMap by the first address line instead.
        onemap = get_soup(_onemap_search_url(alt_add))

        try:
            result = _first_onemap_result(onemap)
            postal_code = result['POSTAL']
            latitude = result['LATITUDE']
            longitude = result['LONGTITUDE']
        except Exception:
            # No coordinates available; postal_code keeps the value matched in
            # the address, if any, and is NaN otherwise.
            latitude = np.nan
            longitude = np.nan

    # opening hours info
    opening_table = soup.find('table', {'class':'hours-table__09f24__KR8wh table__09f24__J2OBP table--simple__09f24__vy16f'})

    if opening_table is None:
        # A page without an hours table should not discard its reviews.
        opening = []
    else:
        day = [cell.text for cell in opening_table.find_all('th', {'class':"table-header-cell__09f24__y32Xb"})]
        hrs = [
            [item.text for item in hours_list.find_all('li')]
            for hours_list in opening_table.find_all('ul', {'class':"undefined list__09f24__ynIEd"})
        ]
        opening = list(zip(day, hrs))

    # Review info
    review_div = soup.find('div', {'class':'css-79elbk border-color--default__09f24__NPAKY'})
    review_li = review_div.find_all('li', {'class':'margin-b5__09f24__pTvws border-color--default__09f24__NPAKY'})

    reviews = []

    # scrape through every review.
    for row in review_li:

        user_info = row.find('div', {'class':'user-passport-info border-color--default__09f24__NPAKY'})
        if user_info is None or user_info.a is None:
            # The user might be blocked, so the comment may not be credible.
            # Skip only this entry (continue, not break) so that the remaining
            # reviews on the page are still collected.
            continue

        # rating
        rating_div = row.find('div', {'class':'margin-t1__09f24__w96jn margin-b1-5__09f24__NHcQi border-color--default__09f24__NPAKY'})

        reviews.append({
            # restaurant-level information, repeated on every review row
            'rest_name': rest_name,
            'address': add,
            'postal_code': postal_code,
            'latitude': latitude,
            'longitude': longitude,
            'opening': opening,
            # reviewer information
            'username': user_info.a.text,
            'userid': user_info.a['href'].replace('/user_details?userid=',''),
            'user_location': user_info.find('span', {'class':'css-qgunke'}).text,
            # review content
            'user_rating': rating_div.span.div['aria-label'].replace(' star rating',''),
            'review_date': row.find('span', {'class':'css-chan6m'}).text,
            'comment': row.find('span', {'class':'raw__09f24__T4Ezm'}).text,
        })

    return pd.DataFrame(reviews)

In [None]:
# Scrape the Yelp reviews of every restaurant in rest_url, from START_INDEX onwards.
# This takes a long time to run. Progress is written to ../data/reviews_<i>.csv,
# where each file holds the restaurants scraped since the previous file.

START_INDEX = 940   # row of rest_url to (re)start from
SAVE_EVERY = 20     # write a checkpoint file after this many restaurants
MAX_ATTEMPTS = 3    # attempts per restaurant before it is skipped

reviews_data = pd.DataFrame()   # contents of the most recent checkpoint
pending_frames = []             # review frames collected since the last checkpoint
failed_restaurants = []         # names of restaurants skipped after MAX_ATTEMPTS
count_url = 0
attempts = 0
i = START_INDEX

while i < len(rest_url):

    # Display info on scraping status
    rest_name = rest_url.loc[i, 'name']
    print(f'\nScraping from {rest_name}:')
    print(i)

    urls = rest_url.loc[i, 'review_url'].split()

    # Pages of this restaurant are held separately and only kept once the
    # restaurant is finished, so a retry does not add the same pages twice.
    page_frames = []
    to_break = False

    for url in urls:

        try:
            page_frames.append(get_reviews(rest_name, url))

        except Exception as err:
            # "except Exception" (not a bare except) so the run can still be interrupted.
            print(f'Scrape failed: {err!r}')

            # sleep before trying again, to manage number of requests per second
            sleep_duration = np.random.randint(60,80)
            print(f'Sleep: {sleep_duration}s')
            time.sleep(sleep_duration)

            # leave the url loop; the restaurant is retried from its first url
            to_break = True
            break

        count_url += 1

        # short pause at a random interval of 7 to 9 scraped urls
        if count_url%np.random.randint(7,10)==0:
            sleep_duration = np.random.randint(5,8)
            print(f'Sleep: {sleep_duration}s')
            time.sleep(sleep_duration)

        # longer pause after every 20 scraped urls
        if (count_url+1)%20 == 0:
            sleep_duration = np.random.randint(10,15)
            print(f'Sleep: {sleep_duration}s')
            time.sleep(sleep_duration)

    if to_break:
        attempts += 1
        if attempts < MAX_ATTEMPTS:
            print(f'Restart scrapping from last url: i is {i}, {rest_name}')
            continue

        # Give up on this restaurant instead of retrying forever; the pages
        # scraped in the last attempt are still kept.
        print(f'Skipping {rest_name} (i is {i}) after {attempts} failed attempts')
        failed_restaurants.append(rest_name)

    pending_frames.extend(page_frames)
    attempts = 0
    i += 1

    # Save after every SAVE_EVERY restaurants, and once more at the very end so
    # the last partial batch is not lost. Resetting the pending list keeps each
    # file limited to the restaurants scraped since the previous file.
    if (i % SAVE_EVERY == 0 or i == len(rest_url)) and pending_frames:
        reviews_data = pd.concat(pending_frames, ignore_index=True)
        reviews_data.to_csv(f'../data/reviews_{i}.csv', index=False)
        print(f'Saved to reviews_{i}.csv')
        pending_frames = []


Scraping from HG 106 Coffeeshop:
940
http://www.yelp.com/biz/hg-106-coffeeshop-singapore?osq=Restaurants

Scraping from Werner’s Oven:
941
http://www.yelp.com/biz/werners-oven-singapore-2?osq=Restaurants

Scraping from Tian Wai Tian Fishhead Steamboat:
942
http://www.yelp.com/biz/tian-wai-tian-fishhead-steamboat-singapore?osq=Restaurants

Scraping from Golden Jade Restaurant Culinary Group:
943
http://www.yelp.com/biz/golden-jade-restaurant-culinary-group-singapore?osq=Restaurants

Scraping from Wine Connection Bistro:
944
http://www.yelp.com/biz/wine-connection-bistro-singapore-3?osq=Restaurants

Scraping from Five Spice:
945
http://www.yelp.com/biz/five-spice-singapore?osq=Restaurants

Scraping from Saboten:
946
http://www.yelp.com/biz/saboten-singapore-2?osq=Restaurants
Sleep: 7s
http://www.yelp.com/biz/saboten-singapore-2?osq=Restaurants&start=10

Scraping from Bedok Corner Hokkien Prawn Mee:
947
http://www.yelp.com/biz/bedok-corner-hokkien-prawn-mee-singapore?osq=Restaurants

Scr

In [None]:
rest_url['href'][0]

In [65]:
soup=get_soup(rest_url['href'][0])

In [70]:
soup.find_all('meta')

[<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>,
 <meta content="en-US" http-equiv="Content-Language"/>,
 <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>,
 <meta id="emotion-container"/>,
 <meta name="critical_css_middleware"/>,
 <meta content="19 reviews of The Naked Finn &quot;They serve the BEST cocktails you can ever find in Singapore!
 
 I love the fact that they live up to their name of being &quot;naked&quot;, for the fact that the ingredients used in the choice of seafood are simply salt, pepper and olive oil to bring out the freshness in their seafood!
 The cocktails, served at -12°C, taste FRESH!
 
 Made my first visit there from friend's recommendation and tried all of their 3 signature cocktails, Strawberry, Coconut and Kyoho grapes.
 The ingredients are freshly pacotized (according to their website), and it seriously brought me to heaven!
 Ordered a portion of baby squids for sharing (as it was rather late that 

In [None]:
# Address info
address_spans = soup.find('address').find_all('span', {'class':'raw__09f24__T4Ezm'})
add = ''.join(span.text + ' ' for span in address_spans)

print(add)

# Raw string so that \d reaches the regex engine as a digit class rather than
# being an invalid string escape. The match is checked before use because
# re.search returns None when the address holds no 6-digit postal code.
postal_match = re.search(r'\d{6}', add)
postal_code = postal_match.group() if postal_match else None
print(postal_code)

In [None]:
# Look the postal code up on OneMap and read the coordinates of the first result.
onemap = get_soup(f'https://developers.onemap.sg/commonapi/search?searchVal={postal_code}&returnGeom=Y&getAddrDetails=Y&pageNum=1')
first_result = json.loads(onemap.p.text)['results'][0]
latitude = first_result['LATITUDE']
longitude = first_result['LONGTITUDE']

In [None]:
# Convert postal code to lat long
# https://developers.onemap.sg/commonapi/search?searchVal=revenue&returnGeom=Y&getAddrDetails=Y&pageNum=1

In [None]:
# opening hours info
hours_table = soup.find('table', {'class':'hours-table__09f24__KR8wh table__09f24__J2OBP table--simple__09f24__vy16f'})

# header cells of the table
day = [cell.text for cell in hours_table.find_all('th', {'class':"table-header-cell__09f24__y32Xb"})]

# one list of <li> texts per <ul> in the table
hrs = [
    [item.text for item in hours_list.find_all('li')]
    for hours_list in hours_table.find_all('ul', {'class':"undefined list__09f24__ynIEd"})
]

list(zip(day, hrs))

In [None]:
# Location text of the second review's author.
# NOTE: `review_li` is only assigned in the "Review info" cell further down, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
review_li[1].find('div', {'class':'user-passport-info border-color--default__09f24__NPAKY'}).find('span', {'class':'css-qgunke'}).text

In [None]:
# Review info
review_div = soup.find('div', {'class':'css-79elbk border-color--default__09f24__NPAKY'})
review_li = review_div.find_all('li', {'class':'margin-b5__09f24__pTvws border-color--default__09f24__NPAKY'})

for row in review_li:
    # author block of the review, looked up once and reused below
    user_info = row.find('div', {'class':'user-passport-info border-color--default__09f24__NPAKY'})

    # username
    print(user_info.a.text)
    # userid
    print(user_info.a['href'].replace('/user_details?userid=',''))
    # location
    print(user_info.find('span', {'class':'css-qgunke'}).text)
    # rating
    rating_div = row.find('div', {'class':'margin-t1__09f24__w96jn margin-b1-5__09f24__NHcQi border-color--default__09f24__NPAKY'})
    print(rating_div.span.div['aria-label'].replace(' star rating',''))
    # date of post
    print(row.find('span', {'class':'css-chan6m'}).text)
    # comments
    print(row.find('span', {'class':'raw__09f24__T4Ezm'}).text)

In [None]:
# find_list = soup.find('main', {"id": "main-content"}).find_all('a')

# restaurants = []

# for row in find_list:
#     try:
#         restaurant ={}
    
#         restaurant['name']=row['name']
#         href=row['href']
#         restaurant['href']=f'www.yelp.com{href}'
        
#         restaurants.append(restaurant)

#     except:
#         pass
        
# pd.DataFrame(restaurants)