In [None]:
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import re
import warnings
warnings.filterwarnings('ignore', message='A NumPy version.*"')

base_url = 'https://www.yelp.com/biz/'
url = "https://www.yelp.com/biz/ryptic-room-escape-san-mateo-3"
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')

In [None]:
html_otc = requests.get('https://www.yelp.com/biz/off-the-couch-fremont')
soup_otc = BeautifulSoup(html_otc.text, 'html.parser')
print(soup_otc.prettify())

In [None]:
print(soup.prettify())

In [None]:
# Returns the upper bound for looping through the review pages.
# Example: If the review page consists of 20 pages, then the search query will
# append '?start=190' to the URL to get to the last review page. This function will
# return 191, the upper bound needed for a for-loop to go through all the review pages.
def get_reviews_loop_upper_bound(soup_obj):
    upper_bound = 0
    # 'css-chan6m' elements contain a "# of #", with the former being the current review page
    # and the latter being the last review page.
    chan6m_elems = soup_obj.find_all(class_='css-chan6m')
    for chan6m_elem in chan6m_elems:
        if 'of' in chan6m_elem.text:
            upper_bound = (int(chan6m_elem.text.split()[-1]) - 1) * 10 + 1 # get last review page
            return upper_bound
    return upper_bound

get_reviews_loop_upper_bound(soup)

In [None]:
# Takes in a soup object and retuns the escape room's name and the city the room is in
def get_room_info(soup_obj):
    room_title_info = soup_obj.title.text.split('-')
    
    # Title is the 0th element in the list with trailing white spaces
    room_name = room_title_info[0].strip().lower()

    # Address is the 2nd elemnt in the list, city is the second to last element in list with trailing white spaces
    for title_element in room_title_info:
        if 'California' in title_element:
            room_city = title_element.split(',')[-2].strip()

    # Get the overall rating of the escape room
    overall_rating_pattern = re.compile(r'(?:[1-4](?:\.5)?|5(?:\.0)?) star rating')
    # seems like 2 of the same gets extracted, pick the 0th element
    room_overall_element = soup_obj.find_all('div', {'aria-label': overall_rating_pattern, 'class': 'css-1v6kfrx'})[0]
    room_overall_rating = float(room_overall_element['aria-label'].split()[0])

    return room_name, room_city, room_overall_rating

room_name, room_city, room_overall_rating = get_room_info(soup)
room_name, room_city, room_overall_rating


## get all page reviews and ratings

Yelp is pretty clever. For each page_rating get, we want to only get the first 10, since yelp also offers 10 "people also viewed". BS4 will also capture those ratings and will be the latter 10 ratings. (number of ratings and reviews is always double)


In [None]:
base_url = 'https://www.yelp.com/biz/'
room_url = 'ryptic-room-escape-san-mateo-3'
upper_bound = 171

# Gets every review of an escape room and returns it in a list
def get_all_page_reviews(base_url, room_url, upper_bound):

    # Helper function that takes in a soup object and returns the reviews of
    # that "page" as a list of reviews
    def get_page_reviews(soup_obj):
        page_reviews = []
        # Class='raw__09f24__T4Ezm' and lang='en' tags specifies reviews
        review_elements = soup_obj.find_all(class_='raw__09f24__T4Ezm', lang='en')
        for review_element in review_elements:
            review = review_element.text
            page_reviews.append(review)
        return page_reviews
    
    # DEPRECATED #
    # Helper function that takes in a soup object and returns the rating of
    # that "page" as a list of ratings
    # def get_page_ratings(soup_obj):        
    #    # regex pattern to capture review ratings
    #    user_rating_pattern = re.compile(r'(?:[1-4](?:\.5)?|5(?:\.0)?) star rating')
    #    page_ratings = []
    #    rating_elements = soup_obj.find_all('div', {'aria-label': user_rating_pattern, 'class': 'css-14g69b3'})
    #    for rating_element in rating_elements:
    #        rating_value = rating_element['aria-label'].split()[0] # just want the number
    #        page_ratings.append(rating_value)

    #    # list comp to convert strings to ints (user ratings have no decimals)
    #    return [int(page_rating) for page_rating in page_ratings[:-10]]

    ### FUNCTION BEGIN ###
    all_reviews = []
    # all_ratings = []
    for i in range(0, upper_bound, 10): # loop through all review pages
        print(f'appending reviews for page {int(i/10) + 1}...')
        full_url = base_url + room_url
        if i != 0: # append appropriate search query for review page
            full_url += f'?start={i}' 
        
        # Make html request on full_url and create soup object
        html = requests.get(full_url)
        soup_obj = BeautifulSoup(html.text, 'html.parser')

        # .extend instead of .append because get_page_reviews returns a list
        all_reviews.extend(get_page_reviews(soup_obj))
        # print(f'length of reviews: {len(all_reviews)}')

        # # repeat for ratings
        # all_ratings.extend(get_page_ratings(soup_obj))
        # print(f'length of ratings: {len(all_ratings)}')
    print('all reviews appended')
    return all_reviews # , all_ratings

test_reviews = get_all_page_reviews(base_url, room_url, 21) 

In [None]:
# Takes in a list of reviews and returns a list of tuples with the 
# 0th element being the sentiment score and the 1st element being the
# corresponding review.
def compute_sentiments(reviews, decimals=4):
    sentiment_scores = []
    for review in reviews:
        review_blob = TextBlob(review)
        sentiment_score = review_blob.sentiment.polarity
        sentiment_scores.append((round(sentiment_score, decimals)))
    return sentiment_scores

test_sentiment_scores = compute_sentiments(test_reviews)

In [None]:
### TESTING MYSQL CONNECTION ###

import mysql.connector
import configparser

# Read configuration from config.ini
config = configparser.ConfigParser()
config.read('config.ini')

# Retrieve database settings from the configuration
host = config['mysql']['host']
user = config['mysql']['user']
password = config['mysql']['password']
database = config['mysql']['database']


# Establish a connection to the MySQL database
cnx = mysql.connector.connect(
    host=host,
    user=user,
    password=password,
    database=database
)

cursor = cnx.cursor()

sql_query = ("SELECT * FROM customers")

cursor.execute(sql_query)

for (id, first_name, last_name, email) in cursor:
  print(f'{id}, {first_name}, {last_name}, {email}')
cursor.close()
cnx.close()


Too lazy to copy-pasta

In [None]:
from scraper import EscapeRoom
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import re
import warnings

ESCAPE_ROOM_URLS = [
    'trivium-games-emeryville',
    'omescape-san-jose-san-jose',
    'paniq-escape-room-san-jose-san-jose',
    'breakout-studios-san-jose',
    'the-escape-game-santa-clara',
    'edscapade-games-san-jose-2',
    'omescape-sunnyvale-sunnyvale-3',
    'ryptic-room-escape-mountain-view',
    'limitless-escape-games-pleasanton',
    'clockwise-escape-room-pleasanton-pleasanton',
    'heist-escape-room-fremont',
    'off-the-couch-fremont',
    'xcapade-immersive-escape-room-newark',
    'quantum-escapes-danville',
    'zscape-games-san-ramon',
    'red-door-escape-room-san-mateo',
    'ryptic-room-escape-b-street-san-mateo',
    'diablo-escapes-walnut-creek',
    'ryptic-room-escape-san-mateo-3',
    'palace-games-san-francisco',
    'escapology-sf-san-francisco-2',
    'escapesf-san-francisco',
    'the-escape-game-san-francisco-san-francisco-5',
    'reason-san-francisco',
    'pacifica-escape-zone-pacifica'
]

escape_rooms = []
for escape_room_url in ESCAPE_ROOM_URLS:
    print(f'scraping : {escape_room_url}')
    escape_room = EscapeRoom(escape_room_url, set_reviews_and_scores=True)
    escape_rooms.append(escape_room)
    print('---------------')

escape_rooms

In [1]:
from scraper import EscapeRoom
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import re
import warnings

ESCAPE_ROOM_URLS = [
    'trivium-games-emeryville',
    'omescape-san-jose-san-jose',
    'paniq-escape-room-san-jose-san-jose',
    'breakout-studios-san-jose',
    'the-escape-game-santa-clara',
    'edscapade-games-san-jose-2',
    'omescape-sunnyvale-sunnyvale-3',
    'ryptic-room-escape-mountain-view',
    'limitless-escape-games-pleasanton',
    'clockwise-escape-room-pleasanton-pleasanton',
    'heist-escape-room-fremont',
    'off-the-couch-fremont',
    'xcapade-immersive-escape-room-newark',
    'quantum-escapes-danville',
    'zscape-games-san-ramon',
    'red-door-escape-room-san-mateo',
    'ryptic-room-escape-b-street-san-mateo',
    'diablo-escapes-walnut-creek',
    'ryptic-room-escape-san-mateo-3',
    'palace-games-san-francisco',
    'escapology-sf-san-francisco-2',
    'escapesf-san-francisco',
    'the-escape-game-san-francisco-san-francisco-5',
    'reason-san-francisco',
    'pacifica-escape-zone-pacifica'
]

escape_rooms_with_reviews = []
for escape_room_url in ESCAPE_ROOM_URLS[:11]:
    print(f'scraping : {escape_room_url}')
    escape_room_with_reviews = EscapeRoom(escape_room_url, set_reviews_and_scores=True)
    escape_rooms_with_reviews.append(escape_room_with_reviews)
    print('---------------')
escape_rooms_with_reviews



scraping : trivium-games-emeryville
Creating EscapeRoom trivium games in Emeryville with overall rating 5.0
total pages: 20
appending reviews for page 1...
appending reviews for page 2...
appending reviews for page 3...
appending reviews for page 4...
appending reviews for page 5...
appending reviews for page 6...
appending reviews for page 7...
appending reviews for page 8...
appending reviews for page 9...
appending reviews for page 10...
appending reviews for page 11...
appending reviews for page 12...
appending reviews for page 13...
appending reviews for page 14...
appending reviews for page 15...
appending reviews for page 16...
appending reviews for page 17...
appending reviews for page 18...
appending reviews for page 19...
appending reviews for page 20...
---------------
scraping : omescape-san-jose-san-jose
Creating EscapeRoom omescape in San Jose with overall rating 4.5
total pages: 37
appending reviews for page 1...
appending reviews for page 2...
appending reviews for page

IndexError: list index out of range

In [16]:
escape_rooms_with_reviews_2 = []
for escape_room_url in ESCAPE_ROOM_URLS[11:-6]:
    print(f'scraping : {escape_room_url}')
    escape_room_with_reviews = EscapeRoom(escape_room_url, set_reviews_and_scores=True)
    escape_rooms_with_reviews_2.append(escape_room_with_reviews)
    print('---------------')
escape_rooms_with_reviews_2

scraping : off-the-couch-fremont
Creating EscapeRoom off the couch in Fremont with overall rating 5.0
total pages: 18
appending reviews for page 1...
appending reviews for page 2...
appending reviews for page 3...
appending reviews for page 4...
appending reviews for page 5...
appending reviews for page 6...
appending reviews for page 7...
appending reviews for page 8...
appending reviews for page 9...
appending reviews for page 10...
appending reviews for page 11...
appending reviews for page 12...
appending reviews for page 13...
appending reviews for page 14...
appending reviews for page 15...
appending reviews for page 16...
appending reviews for page 17...
appending reviews for page 18...
---------------
scraping : xcapade-immersive-escape-room-newark
Creating EscapeRoom xcapade immersive escape room in Newark with overall rating 4.5
total pages: 1
appending reviews for page 1...
---------------
scraping : quantum-escapes-danville
Creating EscapeRoom quantum escapes in Danville wi

AttributeError: 'NoneType' object has no attribute 'text'

In [20]:
escape_rooms_with_reviews_3 = []
for escape_room_url in ESCAPE_ROOM_URLS[-6:-2]:
    print(f'scraping : {escape_room_url}')
    escape_room_with_reviews = EscapeRoom(escape_room_url, set_reviews_and_scores=True)
    escape_rooms_with_reviews_3.append(escape_room_with_reviews)
    print('---------------')
escape_rooms_with_reviews_3

scraping : palace-games-san-francisco
Creating EscapeRoom palace games in San Francisco with overall rating 5.0
total pages: 47
appending reviews for page 1...
appending reviews for page 2...
appending reviews for page 3...
appending reviews for page 4...
appending reviews for page 5...
appending reviews for page 6...
appending reviews for page 7...
appending reviews for page 8...
appending reviews for page 9...
appending reviews for page 10...
appending reviews for page 11...
appending reviews for page 12...
appending reviews for page 13...
appending reviews for page 14...
appending reviews for page 15...
appending reviews for page 16...
appending reviews for page 17...
appending reviews for page 18...
appending reviews for page 19...
appending reviews for page 20...
appending reviews for page 21...
appending reviews for page 22...
appending reviews for page 23...
appending reviews for page 24...
appending reviews for page 25...
appending reviews for page 26...
appending reviews for p

IndexError: list index out of range

In [25]:
escape_rooms_with_reviews_4 = []
for escape_room_url in ESCAPE_ROOM_URLS[-2:]:
    print(f'scraping : {escape_room_url}')
    escape_room_with_reviews = EscapeRoom(escape_room_url, set_reviews_and_scores=True)
    escape_rooms_with_reviews_4.append(escape_room_with_reviews)
    print('---------------')
escape_rooms_with_reviews_4

scraping : reason-san-francisco
Creating EscapeRoom reason in San Francisco with overall rating 5.0
total pages: 25
appending reviews for page 1...
appending reviews for page 2...
appending reviews for page 3...
appending reviews for page 4...
appending reviews for page 5...
appending reviews for page 6...
appending reviews for page 7...
appending reviews for page 8...
appending reviews for page 9...
appending reviews for page 10...
appending reviews for page 11...
appending reviews for page 12...
appending reviews for page 13...
appending reviews for page 14...
appending reviews for page 15...
appending reviews for page 16...
appending reviews for page 17...
appending reviews for page 18...
appending reviews for page 19...
appending reviews for page 20...
appending reviews for page 21...
appending reviews for page 22...
appending reviews for page 23...
appending reviews for page 24...
appending reviews for page 25...
---------------
scraping : pacifica-escape-zone-pacifica
Creating Es

[<scraper.EscapeRoom at 0x7fe8a3f35160>,
 <scraper.EscapeRoom at 0x7fe862f87af0>]

In [None]:
# Takes in a soup object and retuns the escape room's name and the city the room is in
def get_room_info(soup_obj):
    room_title_info = soup_obj.title.text.split('-')
    
    # Title is the 0th element in the list with trailing white spaces
    room_name = room_title_info[0].strip().lower()

    # Address is the 2nd elemnt in the list, city is the second to last element in list with trailing white spaces
    for title_element in room_title_info:
        if 'California' in title_element:
            room_city = title_element.split(',')[-2].strip()

    # Get the overall rating of the escape room
    overall_rating_pattern = re.compile(r'(?:[1-4](?:\.5)?|5(?:\.0)?) star rating')
    # seems like 2 of the same gets extracted, pick the 0th element
    room_overall_element = soup_obj.find_all('div', {'aria-label': overall_rating_pattern, 'class': 'css-1v6kfrx'})[0]
    room_overall_rating = float(room_overall_element['aria-label'].split()[0])

    return room_name, room_city, room_overall_rating

room_name, room_city, room_overall_rating = get_room_info(soup)
room_name, room_city, room_overall_rating



In [19]:
ESCAPE_ROOM_URLS[-6]

'palace-games-san-francisco'

In [14]:
for x in escape_rooms_with_reviews:
    print(x.name)

trivium games
omescape
paniq escape room san jose
breakout studios
the escape game san jose
edscapade games
omescape
ryptic room escape
limitless escape games
clockwise escape room pleasanton
heist escape room


In [31]:
ALL_ESCAPE_ROOMS = []

all_scraped_escape_roms = [
    escape_rooms_with_reviews,
    escape_rooms_with_reviews_2,
    escape_rooms_with_reviews_3,
    escape_rooms_with_reviews_4
]

for scraped_escape_rooms in all_scraped_escape_roms:
    ALL_ESCAPE_ROOMS.extend(scraped_escape_rooms)


In [36]:
for x in ALL_ESCAPE_ROOMS:
    print(f'{x.name} in {x.city} with overall rating {x.rating}')

trivium games in Emeryville with overall rating 5.0
omescape in San Jose with overall rating 4.5
paniq escape room san jose in San Jose with overall rating 4.4
breakout studios in San Jose with overall rating 5.0
the escape game san jose in Santa Clara with overall rating 4.0
edscapade games in San Jose with overall rating 5.0
omescape in Sunnyvale with overall rating 5.0
ryptic room escape in Mountain View with overall rating 4.4
limitless escape games in Pleasanton with overall rating 4.1
clockwise escape room pleasanton in Pleasanton with overall rating 5.0
heist escape room in Fremont with overall rating 5.0
off the couch in Fremont with overall rating 5.0
xcapade immersive escape room in Newark with overall rating 4.5
quantum escapes in Danville with overall rating 4.3
zscape games in San Ramon with overall rating 4.5
red door escape room in San Mateo with overall rating 4.2
ryptic room escape in San Mateo with overall rating 4.5
diablo escapes in Walnut Creek with overall rating 