## **01 Web Scrape Reviews**
This script takes a list of breweries and scrapes the corresponding Tripadvisor review data. The brewery list is sourced from Open Brewery DB.

### **Notebook Objectives**
1. Import a CSV into pandas and extract a list of breweries to scrape
2. For each brewery in the list, use the requests package to query DuckDuckGo and find the corresponding Tripadvisor page (if exists)
3. Scrape and save the html page using dill in case the scraped content is needed at a later date
4. Save the scraped review data into txt files for later processing in the next notebook

In [42]:
import os
from pathlib import Path
from dotenv import dotenv_values
import requests
from bs4 import BeautifulSoup
import time
import random
import dill
import pandas as pd
import re
from sqlalchemy import create_engine

In [58]:
config = dotenv_values(dotenv_path=Path('../.env'))

def get_soup(url):
    headers = {'User-Agent': config['USER_AGENT']}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup, response

def get_brew_id(id, url, review_type):
    # check tripadviser page type
    if review_type == 'Attraction':
        brew_id = re.sub('https://www.tripadvisor.+/Attraction_Review', '-Attraction', url)
    elif review_type == 'Restaurant':
        brew_id = re.sub('https://www.tripadvisor.+/Restaurant_Review', '-Restaurant', url)
    else:
        raise TypeError('Unsupported review type')
    brew_id = id + brew_id.strip('.html')
    return brew_id

def save_page(response, page, id):
    name = '../assets/html/' + id + page + '.dill'
    with open(name, 'wb') as f:
        dill.dump(response, f)
    return

def save_text(text, brew_id):
    name = '../assets/text/' + brew_id + '.txt'
    with open(name, 'a+') as f:
        print(f'Writing to {name}')
        f.write(text)
    return

def parse_soup(soup, brew_id, type='Attraction'):
    # HTML container depends on tripadviser classification
    if type == 'Attraction':
        reviews = soup.find_all('div', {'data-automation': 'reviewCard'})
        if len(reviews) == 0:
            print('No more reviews to parse.')
            return False
        # Iterate through the reviews and extract the date, rating, and text
        # TODO: store review data in a dictionary and json for easier parsing later
        for review in reviews:
            # check if tags present
            if review.find('div', class_='RpeCd') is not None:
                review_data = []
                review_data.append(review.find('div', class_='RpeCd').text)
                review_data.append(review.find('svg', class_='UctUV d H0').get('aria-label'))
                review_data.append(review.find('div', class_='biGQs _P fiohW qWPrE ncFvv fOtGX').text)
                # get review text and strip any newlines
                text_extract = review.find('div', class_='biGQs _P pZUbB KxBGd').text
                text_extract = text_extract.replace('\n', '')
                review_data.append(text_extract + '\n')
                text = '\n'.join(review_data)
                save_text(text, brew_id)
        return True

    elif type =='Restaurant':
        reviews = soup.find_all('div', class_='review-container')
        if len(reviews) == 0:
            print('No more reviews to parse.')
            return False
        # Iterate through the reviews and extract the date, rating, and text
        for review in reviews:
            # check if tags present
            if review.find('span', class_='ratingDate') is not None:
                review_data = []
                review_data.append(review.find('span', class_='ratingDate').get('title'))
                review_data.append(review.find('span', class_="ui_bubble_rating").get('class')[1])
                review_data.append(review.find('span', class_='noQuotes').text)
                # get review text and strip any newlines
                text_extract = review.find('p', class_='partial_entry').text
                text_extract = text_extract.replace('\n', '')
                review_data.append(text_extract + '\n')
                text = '\n'.join(review_data)
                save_text(text, brew_id)
            else:
                print('Review instance is empty')
                return False
        return True
    else:
        raise TypeError('Unsupported review type')

def increment_url(url, page):
    url = url.replace('-Reviews', '-Reviews' + page)
    return url

def get_review_type(url):
    if 'Attraction_Review' in url:
        # scrape up to 30 reviews
        pages = ['', '-or10', '-or20']
        return 'Attraction', pages
    elif 'Restaurant_Review' in url:
        # scrape up to 45 reviews since the lengths are truncated in Restaurant case
        pages = ['', '-or15', '-or30']
        return 'Restaurant', pages
    else:
        print('URL does not contain proper format')
        return None, None

def scrape(id, url_base):
    """
    Inputs: base url (str), brewery name (str)
    """
    # determine if tripadvisor url classifies it as attraction or restaurant
    review_type, pages = get_review_type(url_base)
    # skip if url does not follow format
    if review_type is not None:
        # get unique brewery identifier
        brew_id = get_brew_id(id, url_base, review_type)
        # loop through a couple pages of reviews
        for page in pages:
            url = increment_url(url_base, page)
            print(f'Scraping: {url}')
            soup, response = get_soup(url)
            save_page(response, page, brew_id)
            reviews = parse_soup(soup, brew_id, review_type)
            # conservative delay time between requests
            time.sleep(random.randint(4, 6))
            if not reviews:
                return
    return

def get_url_base(name, state, city, verbose=False):
    """
    Inputs: brewery name, state
    """
    base = 'https://duckduckgo.com/html/?q='
    name = name.replace(' ', '+')
    url = f'{base}+tripadvisor+{name}+{state}+{city}'
    if verbose:
        print(url)
    soup, response = get_soup(url)
    links = soup.find_all("a", class_="result__url", href=True)
    return links[0]['href']

In [59]:
# Get list of breweries
df = pd.read_csv('../assets/breweries.csv')
df.head()

Unnamed: 0,obdb_id,name,brewery_type,street,address_2,address_3,city,state,county_province,postal_code,website_url,phone,country,longitude,latitude,tags
0,10-56-brewing-company-knox,10-56 Brewing Company,micro,400 Brown Cir,,,Knox,Indiana,,46534,,6308165790,United States,-86.627954,41.289715,
1,10-barrel-brewing-co-bend-1,10 Barrel Brewing Co,large,62970 18th St,,,Bend,Oregon,,97701-9847,http://www.10barrel.com,5415851007,United States,-121.281706,44.086835,
2,10-barrel-brewing-co-bend-2,10 Barrel Brewing Co,large,1135 NW Galveston Ave Ste B,,,Bend,Oregon,,97703-2465,,5415851007,United States,-121.328802,44.057565,
3,10-barrel-brewing-co-bend-pub-bend,10 Barrel Brewing Co - Bend Pub,large,62950 NE 18th St,,,Bend,Oregon,,97701,,5415851007,United States,-121.280954,44.091211,
4,10-barrel-brewing-co-boise-boise,10 Barrel Brewing Co - Boise,large,826 W Bannock St,,,Boise,Idaho,,83702-5857,http://www.10barrel.com,2083445870,United States,-116.202929,43.618516,


In [60]:
# Get subset of breweries in MA
df_ma = df.loc[df['state'] == 'Massachusetts']
breweries_ma = df_ma[['obdb_id', 'name', 'state', 'city', 'street', 'longitude', 'latitude']]
# Explore size of subset
print(f'Brewery df shape: {breweries_ma.shape}')
# Save list of MA breweries
filepath = Path('../assets/ma_breweries.csv')
breweries_ma.to_csv(filepath, index=False)
breweries_ma.head(10)

Brewery df shape: (163, 7)


Unnamed: 0,obdb_id,name,state,city,street,longitude,latitude
13,10th-district-brewing-company-abington,10th District Brewing Company,Massachusetts,Abington,491 Washington St,-70.945941,42.105918
76,3-beards-beer-company-williamsburg,3 Beards Beer Company,Massachusetts,Williamsburg,4 Main St,-72.730506,42.392366
98,3cross-fermentation-cooperative-worcester,3cross Fermentation Cooperative,Massachusetts,Worcester,4 Knowlton Ave,-71.830576,42.243649
166,7th-wave-brewing-medfield,7th Wave Brewing,Massachusetts,Medfield,120 N Meadows Rd Ste 8,,
188,abandoned-building-brewery-easthampton,Abandoned Building Brewery,Massachusetts,Easthampton,142 Pleasant St Unit 103A,,
224,aeronaut-brewing-company-somerville,Aeronaut Brewing Company,Massachusetts,Somerville,14 Tyler St,-71.106268,42.381972
308,altruist-brewing-company-sturbridge,Altruist Brewing Company,Massachusetts,Sturbridge,559 Main St Unit 105,,
331,amherst-brewing-co-hangar-pub-and-grill-amherst,Amherst Brewing Co / Hangar Pub and Grill,Massachusetts,Amherst,10 University Dr,-72.531842,42.372864
333,amorys-tomb-brewing-co-maynard,Amorys Tomb Brewing Co,Massachusetts,Maynard,76 Main St,-71.453156,42.432038
339,anawan-brewing-company-dighton,Anawan Brewing Company,Massachusetts,Dighton,,,


In [61]:
# Inspect missing lat/lon data for future notebooks that will plot locations
print(f'Fraction of all breweries containing nulls:\n{df.isnull().mean()}')
print(f'Fraction of MA breweries containing nulls:\n{breweries_ma.isnull().mean()}')

Fraction of all breweries containing nulls:
obdb_id            0.000000
name               0.000000
brewery_type       0.000000
street             0.095921
address_2          0.989587
address_3          0.997060
city               0.000000
state              0.021561
county_province    0.978317
postal_code        0.000000
website_url        0.146637
phone              0.102291
country            0.000000
longitude          0.291682
latitude           0.291682
tags               1.000000
dtype: float64
Fraction of MA breweries containing nulls:
obdb_id      0.000000
name         0.000000
state        0.000000
city         0.000000
street       0.134969
longitude    0.386503
latitude     0.386503
dtype: float64


In [27]:
# Test query for brewery's Tripadvisor page
test_brewery = breweries_ma.iloc[14]
name = test_brewery['name']
state = test_brewery['state']
city = test_brewery['city']
link = get_url_base(name, state, city, verbose=True)
print(test_brewery['name'], link)

https://duckduckgo.com/html/?q=+tripadvisor+BareWolf+Brewing+Massachusetts+Amesbury
BareWolf Brewing https://www.tripadvisor.com/Attraction_Review-g29509-d14957575-Reviews-BareWolf_Brewing-Amesbury_Massachusetts.html


In [62]:
# Test iterating through brewery list
for index, row in breweries_ma[100:102].iterrows():
    print(row['obdb_id'], row['state'])

navigation-brewing-co-lowell Massachusetts
new-city-brewery-easthampton Massachusetts


In [57]:
# Scrape and save html and reviews for breweries in list
# Will take 30+ minutes for all breweries in MA
# TODO add logger rather than printing to screen
for index, brewery in breweries_ma.iterrows():
    id = brewery['obdb_id']
    name = brewery['name']
    city = brewery['city']
    name_token = re.findall(r'^[A-Za-z\d]+', name)[0]
    state = brewery['state']
    url_base = get_url_base(name, state, city)
    print(url_base)
    if 'tripadvisor.' in url_base and name_token in url_base:
        scrape(id, url_base)
    elif 'tripadvisor.' in url_base:
        name_token = name_token.strip('s')
        if 'tripadvisor.' in url_base and name_token in url_base:
            scrape(id, url_base)
    else:
        print(f'No trip advisor result for {name}')

https://www.tripadvisor.com/Attraction_Review-g29501-d12236051-Reviews-10th_District_Brewing_Company-Abington_Massachusetts.html
10th
Scraping: https://www.tripadvisor.com/Attraction_Review-g29501-d12236051-Reviews-10th_District_Brewing_Company-Abington_Massachusetts.html
Writing to ../assets/text/10th-district-brewing-company-abington-Attraction-g29501-d12236051-Reviews-10th_District_Brewing_Company-Abington_Massachusetts.txt
Writing to ../assets/text/10th-district-brewing-company-abington-Attraction-g29501-d12236051-Reviews-10th_District_Brewing_Company-Abington_Massachusetts.txt
Writing to ../assets/text/10th-district-brewing-company-abington-Attraction-g29501-d12236051-Reviews-10th_District_Brewing_Company-Abington_Massachusetts.txt
Writing to ../assets/text/10th-district-brewing-company-abington-Attraction-g29501-d12236051-Reviews-10th_District_Brewing_Company-Abington_Massachusetts.txt
Writing to ../assets/text/10th-district-brewing-company-abington-Attraction-g29501-d12236051-Re

In [None]:
# TODO
# If scraping approach needs to be updated, re-scrape from the saved html pages
# Extract review data from all txt files in directory
dir_path = '../assets/html/'
paths = Path(dir_path).glob('**/*.dill')
for path in paths:
    with open(path, 'rb') as f:
        page = dill.load(f)

In [39]:
# DEBUG
# Cell to debug unexpected behavior
# TODO running parse_soup() will append duplicate reviews to the txt files if they already exist

def debug_reviews(path, filename, url_base):
    with open(path + filename, 'rb') as f:
        print(f'Opening: {path + filename}')
        debug_page = dill.load(f)
    debug_soup = BeautifulSoup(debug_page.content, 'html.parser')
    review_type, pages = get_review_type(url_base)
    # extract brewery name
    brewery = re.findall(r'^[A-Za-z\d_]+', filename)[0]
    print(f'Brewery name: {brewery}')
    brew_id = get_brew_id(brewery, url_base, review_type)
    print(f'Review type: {review_type}')
    soup_output = debug_soup.find_all('div', class_='_c')
    print(f'Soup result: {soup_output}')
    # print(len(soup_output[0]))
    reviews = parse_soup(debug_soup, brew_id, review_type)
    # print(reviews)
    return

path = '../assets/html/'

## Case
# filename = 'Amorys_Tomb_Brewing_Co-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts.dill'
# url_base = 'https://www.tripadvisor.com/Attraction_Review-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts.html'
# debug_reviews(path, filename, url_base)

## Case
# filename = 'Amherst_Brewing_Co__Hangar_Pub_and_Grill-g29510-d1067968-Reviews-Hangar_Pub_Grill-Amherst_Hampshire_County_Massachusetts.dill'
# url_base = 'https://www.tripadvisor.com/Restaurant_Review-g29510-d1067968-Reviews-Hangar_Pub_Grill-Amherst_Hampshire_County_Massachusetts.html'
# debug_reviews(path, filename, url_base)

## Case
# filename = 'Aquatic_Brewing_LLC-g41565-d22754735-Reviews-Aquatic_Brewing-Falmouth_Cape_Cod_Massachusetts.dill'
# url_base = 'https://www.tripadvisor.com/Attraction_Review-g41565-d22754735-Reviews-Aquatic_Brewing-Falmouth_Cape_Cod_Massachusetts.html'
# debug_reviews(path, filename, url_base)

## CASE
# filename = 'Honest_Weight_Artisan_Beer-g41754-d10342647-Reviews-Honest_Weight_Artisan_Beer-Orange_Massachusetts-or10.dill'
# url_base = 'https://www.tripadvisor.com/Attraction_Review-g41754-d10342647-Reviews-Honest_Weight_Artisan_Beer-Orange_Massachusetts.html'
# debug_reviews(path, filename, url_base)

## CASE
filename = 'amorys-tomb-brewing-co-maynard-Attraction-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts-or10.dill'
url_base = 'https://www.tripadvisor.com/Attraction_Review-g41669-d16656179-Reviews-or10-Amory_s_Tomb_Brewing-Maynard_Massachusetts.html'
debug_reviews(path, filename, url_base)

## CASE
filename = 'amorys-tomb-brewing-co-maynard-Attraction-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts.dill'
url_base = 'https://www.tripadvisor.com/Attraction_Review-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts.html'
debug_reviews(path, filename, url_base)

Opening: ../assets/html/amorys-tomb-brewing-co-maynard-Attraction-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts-or10.dill
Brewery name: amorys
Review type: Attraction
Soup result: [<div class="_c">&lt; 1 hour</div>]
No more reviews to parse.
Opening: ../assets/html/amorys-tomb-brewing-co-maynard-Attraction-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts.dill
Brewery name: amorys
Review type: Attraction
Soup result: [<div class="_c">&lt; 1 hour</div>, <div class="_c" data-automation="reviewCard"><div class="mwPje f M k"><div class="XExLl f u o"><div class="hzzSG"><div class="MLvbw f u"><div class="AYlYS" style="z-index:0"><div class=""><div class="tknvo ccudK Rb I o"><div class=""><a aria-hidden="true" class="BMQDV _F G- wSSLS SwZTJ" href="/Profile/Petitrobert54" tabindex="-1" target="_self"><div class="FGwzt PaRlG"><div aria-label="Petitrobert54" class="NhWcC _R" role="img" style="width:32px;height:32px"></div></div></a></div></div></div></div>

In [63]:
from sqlalchemy import Column, Date, Integer, String
from sqlalchemy.orm import declarative_base, sessionmaker

# Practice storing the scraped data in a SQL database
engine = create_engine("sqlite:///../assets/foo.db")
Base = declarative_base()

class City(Base):

    __tablename__ = "cities"

    id = Column(Integer, primary_key=True)
    name = Column(String)  

    # def __init__(self, name):
    #     self.name = name    

Base.metadata.create_all(engine)

In [64]:
# Set up the Session
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

In [66]:
print(f'Count: {session.query(City).count()}')

names = ['A', 'B', 'C']
ids = ['1', '2', '3']
for name, id in zip(names, ids):
    if session.query(City).filter(City.id==id).first() is None:
        new_city = City(name=name, id=id)
        session.add(new_city)

# Write to DB
session.commit()

print(f'New Count: {session.query(City).count()}')

Count: 3
New Count: 3


In [67]:
# Inspect table
for city in session.query(City).limit(10):
    print(city.id, city.name)

1 A
2 B
3 C
