# Beer Advocate Scraper

## Purpose

This notebook will go through scraping the Beer Advocate site for breweries and reviews.

## Method

1. Scrape Place index for cities (86 cities)
    a. Store as CSV
2. Loop through the cities
    a. Store Breweries in DB (?? breweries)
    b. How to determine if already stored?
3. Loop through breweries
    a. Store Reviews in DB

## Schema

* Cities
    * Name
    * URL
    * BA City ID
* Breweries
    * Name
    * Address
    * City
    * State / Province
    * Postal Code
    * Rating
    * Total Ratings
    * URL
    * BA Brewery ID
* Reviews
    * BA Brewery ID
    * Text
    * Date


In [99]:
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [100]:
from sqlalchemy import Column, Date, Integer, Float, String, UnicodeText, Table
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class City(Base):
    __tablename__ = 'cities'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    url = Column(String)
    ba_city_id = Column(Integer)
    
class Brewery(Base):
    __tablename__ = 'brewery'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    address = Column(String)
    city = Column(String)
    state_province = Column(String)
    postal_code = Column(String)
    url = Column(String)
    rating = Column(Float)
    total_ratings = Column(Integer)
    ba_brewery_id = Column(Integer)
    
class Review(Base):
    __tablename__ = 'reviews'
    id = Column(Integer, primary_key=True)
    rating = Column(Float)
    text = Column(UnicodeText)
    date = Column(Date)
    ba_brewery_id = Column(Integer)
    
engine = create_engine(f"sqlite:///brewery_reviews.db")
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

Base.metadata.create_all(engine)

In [101]:
# Get the page and parse
ba_domain = "https://www.beeradvocate.com"
places_url = "https://www.beeradvocate.com/place/"
request_headers = {'User-agent': 'Springboard Capstone Project Research Tool by Chris Mears <chris.mears@gmail.com>'}

# TODO: Add User-Agent for ethical scraping
places_page = requests.get(places_url, headers=request_headers)
places_soup = BeautifulSoup(places_page.text)

In [102]:
# Select all of the brewery items
# <a href="/place/city/73/">Anchorage</a>
cities = places_soup.find_all(href=re.compile("\/place\/city\/"))
print(f"Number of Cities: {len(cities)}")

Number of Cities: 86


In [103]:
# Clean-up: Delete all rows in City
# session.query(City).delete()

In [104]:
# Add cities to DB
for city in cities:
    name = city.string
    url = f"{ba_domain}{city['href']}"
    ba_city_id = re.match(r"\/place\/city\/(\d+)\/", city['href']).group(1)
    
    # Skip if city already exists
    if session.query(City).filter(City.ba_city_id==ba_city_id).first() is None:
        new_city = City(name=name, url=url, ba_city_id=ba_city_id)
        session.add(new_city)

# Write to DB
session.commit()

In [105]:
# Inspect
for city in session.query(City).limit(10):
    print(city.id, city.name, city.url, city.ba_city_id)

1 Birmingham https://www.beeradvocate.com/place/city/83/ 83
2 Anchorage https://www.beeradvocate.com/place/city/73/ 73
3 Phoenix https://www.beeradvocate.com/place/city/29/ 29
4 Los Angeles https://www.beeradvocate.com/place/city/9/ 9
5 San Diego https://www.beeradvocate.com/place/city/28/ 28
6 San Francisco https://www.beeradvocate.com/place/city/17/ 17
7 Boulder https://www.beeradvocate.com/place/city/79/ 79
8 Denver https://www.beeradvocate.com/place/city/7/ 7
9 Fort Collins https://www.beeradvocate.com/place/city/53/ 53
10 Hartford https://www.beeradvocate.com/place/city/38/ 38


In [106]:
# Get first city
city = session.query(City).all()[0]

# Read city page
city_page = requests.get(city.url, headers=request_headers)
city_soup = BeautifulSoup(city_page.text)

In [107]:
breweries = soup.find_all(href=re.compile("\/beer\/profile\/"))
print(f"Number of Breweries: {len(breweries)}")

No. Items: 28


In [108]:
# Clean-up: Delete all rows in Brewery
# session.query(Brewery).delete()

In [109]:
# Add breweries to DB
for brewery in breweries:
    name = brewery.string
    url = f"{ba_domain}{brewery['href']}"
    ba_brewery_id = re.match(r"\/beer\/profile\/(\d+)\/", brewery['href']).group(1)
    
    # Skip if it already exists
    if session.query(Brewery).filter(Brewery.ba_brewery_id==ba_brewery_id).first() is None:
        new_brewery = Brewery(name=name, url=url, ba_brewery_id=ba_brewery_id)
        session.add(new_brewery)

# Write to DB
session.commit()

In [110]:
# Inspect
for brewery in session.query(Brewery).limit(10):
    print(brewery.id, brewery.name, brewery.url, brewery.ba_brewery_id)

1 Birmingham District Brewing https://www.beeradvocate.com/beer/profile/54932/ 54932
2 Trimtab Brewing Company https://www.beeradvocate.com/beer/profile/33996/ 33996
3 Good People Brewing Company https://www.beeradvocate.com/beer/profile/17282/ 17282
4 Cahaba Brewing Company https://www.beeradvocate.com/beer/profile/27947/ 27947
5 Avondale Brewing Co. https://www.beeradvocate.com/beer/profile/25916/ 25916
6 Ghost Train Brewing Company https://www.beeradvocate.com/beer/profile/43120/ 43120
7 Highland Package Store https://www.beeradvocate.com/beer/profile/23845/?view=ratings 23845
8 Slice https://www.beeradvocate.com/beer/profile/27347/?view=ratings 27347
9 Garage Cafe https://www.beeradvocate.com/beer/profile/14373/?view=ratings 14373
10 Good People Brewing Company - Birmingham Airport https://www.beeradvocate.com/beer/profile/31770/?view=ratings 31770


In [112]:
brewery = session.query(Brewery).limit(1)[0]
brewery_url = f"{ba_domain}/beer/profile/{brewery.ba_brewery_id}/?view=ratings"

# Read city page
brewery_page = requests.get(brewery_url, headers=request_headers)
brewery_soup = BeautifulSoup(brewery_page.text)

In [114]:
reviews = brewery_soup.find_all(id="rating_fullview_content_2")
print(f"Number of Reviews: {len(reviews)}")

Number of Reviews: 1


In [115]:
reviews[0].select('.muted')

[<span class="muted"><b><a class="username" href="/community/members/wickedbeer.1046282/">WickedBeer</a></b> from Alabama</span>,
 <span class="muted">vibe: 4.5 | quality: 4 | service: 4 | selection: 4.25</span>,
 <span class="muted"><a href="/beer/profile/54932/?view=ratings&amp;ba=WickedBeer#review">Jul 25, 2019</a></span>]