## **01 Web Scrape Reviews**
This script takes a list of breweries and scrapes the corresponding Tripadvisor review data. The brewery list is sourced from Open Brewery DB.

### **Notebook Objectives**
1. Import a CSV into pandas and extract a list of breweries to scrape
2. For each brewery in the list, use the requests package to query DuckDuckGo and find the corresponding Tripadvisor page (if one exists)
3. Scrape and save the html page using dill in case the scraped content is needed at a later date
4. Save the scraped review data into txt files for later processing in the next notebook

In [224]:
import os
from pathlib import Path
from dotenv import dotenv_values
import requests
from bs4 import BeautifulSoup
import time
import random
import dill
import pandas as pd
import re

In [225]:
# Settings loaded from the .env file one directory above the notebook;
# get_soup() reads the USER_AGENT entry from this mapping.
config = dotenv_values(dotenv_path=Path('../.env'))

def get_soup(url, timeout=30):
    """
    Fetch a page and parse its body as HTML.

    Inputs: url (str), timeout in seconds passed to requests (default 30)
    Returns: (BeautifulSoup, requests.Response) tuple

    The HTTP status code is not checked; the raw response is returned so the
    caller can save it regardless of what came back.
    """
    headers = {'User-Agent': config['USER_AGENT']}
    # requests applies no timeout by default, so a stalled server would hang
    # the whole scraping run; with a timeout it raises instead
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup, response

def get_brew_id(brewery, url, review_type):
    """
    Build the identifier used to name a brewery's saved html and text files.

    Inputs: brewery name (str), Tripadvisor url (str),
            review type ('Attraction' or 'Restaurant')
    Returns: cleaned brewery name followed by the url with its
             Tripadvisor prefix and '.html' suffix removed
    Raises: TypeError if the review type is not supported
    """
    # clean brewery name
    brewery = brewery.replace('/', '').replace(' ', '_')
    # check tripadvisor page type
    if review_type == 'Attraction':
        prefix = 'https://www.tripadvisor.com/Attraction_Review'
    elif review_type == 'Restaurant':
        prefix = 'https://www.tripadvisor.com/Restaurant_Review'
    else:
        raise TypeError('Unsupported review type')
    brew_id = url.replace(prefix, '')
    # Remove the extension as a suffix. str.strip('.html') treats its argument
    # as a set of characters and would also eat trailing letters of the town
    # name (e.g. 'Falmouth.html' -> 'Falmou').
    suffix = '.html'
    if brew_id.endswith(suffix):
        brew_id = brew_id[:-len(suffix)]
    return brewery + brew_id

def save_page(url, response, page, id):
    """
    Serialize the raw response with dill so the page can be re-parsed later.

    Inputs: url (unused), response object, page suffix (str), brewery id (str)
    The file is written to ../assets/html/<id><page>.dill
    """
    target = ''.join(('../assets/html/', id, page, '.dill'))
    with open(target, 'wb') as handle:
        dill.dump(response, handle)
    return None

def save_text(text, id):
    """
    Append review text to the brewery's txt file.

    Inputs: text to append (str), brewery id (str)
    The file is ../assets/text/<id>.txt and is created if it does not exist.
    """
    name = '../assets/text/' + id + '.txt'
    # Pin the encoding: scraped review text is not limited to ASCII, and the
    # platform default encoding can raise UnicodeEncodeError on such input.
    with open(name, 'a', encoding='utf-8') as f:
        print(f'Writing to {name}')
        f.write(text)
    return

def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag was not found."""
    return None if tag is None else tag.text


def _save_review(fields, id):
    """
    Append one review (date, rating, title, body) to the brewery's txt file.

    Returns False without writing anything when any of the fields is missing.
    """
    if any(field is None for field in fields):
        print('Review instance is incomplete')
        return False
    *header, body = fields
    # strip any newlines from the review body so each review is four lines
    body = body.replace('\n', '')
    save_text('\n'.join(header + [body + '\n']), id)
    return True


def parse_soup(soup, id, type='Attraction'):
    """
    Extract the reviews from a parsed Tripadvisor page and save them as text.

    Inputs: parsed page (BeautifulSoup), brewery id (str),
            review type ('Attraction' or 'Restaurant')
    Returns: True once the page has been processed; False for a Restaurant
             page as soon as a review container without a date is found
    Raises: TypeError if the review type is not supported

    Each complete review is appended through save_text() as four lines:
    date, rating, title and body. A review that lacks one of the expected
    tags is reported and skipped instead of raising AttributeError.
    """
    # HTML container depends on tripadvisor classification
    if type == 'Attraction':
        for review in soup.find_all('div', class_='_c'):
            date_tag = review.find('div', class_='RpeCd')
            # skip containers that carry no date tag
            if date_tag is None:
                continue
            rating_tag = review.find('svg', class_='UctUV d H0')
            fields = [
                date_tag.text,
                None if rating_tag is None else rating_tag.get('aria-label'),
                _tag_text(review.find('div', class_='biGQs _P fiohW qWPrE ncFvv fOtGX')),
                _tag_text(review.find('div', class_='biGQs _P pZUbB KxBGd')),
            ]
            _save_review(fields, id)
        return True

    elif type == 'Restaurant':
        for review in soup.find_all('div', class_='review-container'):
            date_tag = review.find('span', class_='ratingDate')
            if date_tag is None:
                print('Review instance is empty')
                return False
            rating_tag = review.find('span', class_='ui_bubble_rating')
            rating_classes = [] if rating_tag is None else (rating_tag.get('class') or [])
            fields = [
                date_tag.get('title'),
                # the rating is taken from the second CSS class of the span
                rating_classes[1] if len(rating_classes) > 1 else None,
                _tag_text(review.find('span', class_='noQuotes')),
                _tag_text(review.find('p', class_='partial_entry')),
            ]
            _save_review(fields, id)
        return True
    else:
        raise TypeError('Unsupported review type')

def increment_url(url, page):
    """
    Insert a page suffix after the '-Reviews' marker of a Tripadvisor url.

    Inputs: url (str), page suffix such as '' or '-or10' (str)
    Returns: the url with the suffix inserted; unchanged if the marker is absent
    """
    marker = '-Reviews'
    paged_marker = marker + page
    return url.replace(marker, paged_marker)

def get_review_type(url):
    """
    Classify a Tripadvisor url and list the page suffixes to scrape for it.

    Inputs: url (str)
    Returns: (review type, list of page suffixes), or (None, None) when the
             url matches neither known format
    """
    # (url marker, review type, page suffixes)
    known_types = (
        # scrape up to 30 reviews
        ('Attraction_Review', 'Attraction', ['', '-or10', '-or20']),
        # scrape up to 45 reviews since the lengths are truncated in this case
        ('Restaurant_Review', 'Restaurant', ['', '-or15', '-or30']),
    )
    for marker, label, suffixes in known_types:
        if marker in url:
            return label, suffixes
    print('URL does not contain proper format')
    return None, None

def scrape(url_base, brewery):
    """
    Inputs: base url (str), brewery name (str)

    Downloads the review pages for one brewery, saves each raw response and
    writes the parsed reviews to the brewery's txt file. Does nothing when
    the url is neither an attraction nor a restaurant page.
    """
    # determine if tripadvisor url classifies it as attraction or restaurant
    review_type, pages = get_review_type(url_base)
    # skip if url does not follow format
    if review_type is None:
        return
    # unique brewery identifier shared by the html and txt files
    brew_id = get_brew_id(brewery, url_base, review_type)
    # loop through a couple pages of reviews
    for suffix in pages:
        page_url = increment_url(url_base, suffix)
        print(f'Scraping: {page_url}')
        soup, response = get_soup(page_url)
        save_page(page_url, response, suffix, brew_id)
        keep_going = parse_soup(soup, brew_id, review_type)
        # conservative delay time between requests
        time.sleep(random.randint(9, 12))
        if not keep_going:
            break
    return

def get_url_base(keyword, state):
    """
    Inputs: brewery name, state
    Returns: href of the first DuckDuckGo result for
             'trip advisor <brewery name> <state>', or '' when the search
             returned no result links
    """
    from urllib.parse import quote_plus

    base = 'https://duckduckgo.com/html/?q='
    # URL-encode the whole query: spaces become '+', and characters such as
    # '&' or '#' in a brewery name no longer cut the query short
    query = quote_plus(f'trip advisor {keyword} {state}')
    soup, _ = get_soup(base + query)
    links = soup.find_all("a", class_="result__url", href=True)
    if not links:
        # an empty string fails the caller's 'tripadvisor.com' in url check
        # instead of raising IndexError here
        return ''
    return links[0]['href']

In [226]:
# Load the full brewery list
df = pd.read_csv('../assets/breweries.csv')
# Keep the Massachusetts rows, then only the columns needed for searching
df_ma = df[df['state'] == 'Massachusetts']
breweries_ma = df_ma.loc[:, ['name', 'state']]
# Report the size of the subset and preview it
print(f'Brewery df shape: {breweries_ma.shape}')
breweries_ma.head(10)

Brewery df shape: (163, 2)


Unnamed: 0,name,state
13,10th District Brewing Company,Massachusetts
76,3 Beards Beer Company,Massachusetts
98,3cross Fermentation Cooperative,Massachusetts
166,7th Wave Brewing,Massachusetts
188,Abandoned Building Brewery,Massachusetts
224,Aeronaut Brewing Company,Massachusetts
308,Altruist Brewing Company,Massachusetts
331,Amherst Brewing Co / Hangar Pub and Grill,Massachusetts
333,Amorys Tomb Brewing Co,Massachusetts
339,Anawan Brewing Company,Massachusetts


In [222]:
# Look up the Tripadvisor page of a single brewery as a smoke test
test_brewery = breweries_ma.iloc[14]
name, state = test_brewery['name'], test_brewery['state']
link = get_url_base(name, state)
print(name, link)

BareWolf Brewing https://www.tripadvisor.com/Attraction_Review-g29509-d14957575-Reviews-BareWolf_Brewing-Amesbury_Massachusetts.html


In [None]:
# Test scrape Tripadvisor page.
# Successful output will be a txt file containing review data
# url_base = get_url_base(name, state)
# scrape(url_base, name)

In [205]:
# Check iteration over a small slice of the brewery list
sample = breweries_ma[100:102]
for brewery_name, brewery_state in zip(sample['name'], sample['state']):
    print(brewery_name, brewery_state)

Navigation Brewing Co Massachusetts
New City Brewery Massachusetts


In [None]:
# Scrape and save html and reviews for breweries in list
# Will take ~1 hour for all breweries in MA
for index, brewery in breweries_ma[108:].iterrows():
    name = brewery['name']
    state = brewery['state']
    # The leading alphanumeric token of the name must appear in the result url
    # before the result is accepted as this brewery's page
    token_match = re.match(r'[A-Za-z\d]+', name)
    if token_match is None:
        # no usable token to validate a result against; skip instead of
        # raising IndexError and aborting the run
        print(f'No trip advisor result for {name}')
        continue
    name_token = token_match.group(0)
    try:
        url_base = get_url_base(name, state)
    except IndexError:
        # the search returned no result links; carry on with the next brewery
        print(f'No trip advisor result for {name}')
        continue
    print(url_base)
    if 'tripadvisor.com' in url_base and name_token in url_base:
        scrape(url_base, name)
    else:
        print(f'No trip advisor result for {name}')

In [208]:
# DEBUG
# Cell to debug unexpected behavior
# TODO running parse_soup() will append duplicate reviews to the txt files if they already exist

def debug_reviews(path, filename, url_base):
    """
    Re-parse a previously saved page to debug unexpected scraper behavior.

    Inputs: directory of the saved pages (str, with trailing slash),
            .dill filename (str), url the page was scraped from (str)

    Note that parse_soup() appends to the brewery's txt file, so running this
    for a brewery that was already scraped duplicates its reviews.
    """
    # NOTE: dill.load, like pickle, can execute code embedded in the file;
    # only open .dill files that were written by this notebook
    with open(path + filename, 'rb') as f:
        print(f'Opening: {path + filename}')
        debug_page = dill.load(f)
    debug_soup = BeautifulSoup(debug_page.content, 'html.parser')
    review_type, _ = get_review_type(url_base)
    # extract brewery name
    brewery = re.findall(r'^[A-Za-z\d_]+', filename)[0]
    print(f'Brewery name: {brewery}')
    brew_id = get_brew_id(brewery, url_base, review_type)
    print(f'Review type: {review_type}')
    soup_output = debug_soup.find_all('div', class_='_c')
    print(f'Soup result: {soup_output}')
    # '_c' is the container parse_soup() looks for on Attraction pages only,
    # so the list can be empty; guard the index instead of raising IndexError
    if soup_output:
        print(len(soup_output[0]))
    parse_soup(debug_soup, brew_id, review_type)
    return


# Directory holding the saved .dill pages used by the debug cases below
path = '../assets/html/'

## Case
# filename = 'Amorys_Tomb_Brewing_Co-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts.dill'
# url_base = 'https://www.tripadvisor.com/Attraction_Review-g41669-d16656179-Reviews-Amory_s_Tomb_Brewing-Maynard_Massachusetts.html'
# debug_reviews(path, filename, url_base)

## Case
# filename = 'Amherst_Brewing_Co__Hangar_Pub_and_Grill-g29510-d1067968-Reviews-Hangar_Pub_Grill-Amherst_Hampshire_County_Massachusetts.dill'
# url_base = 'https://www.tripadvisor.com/Restaurant_Review-g29510-d1067968-Reviews-Hangar_Pub_Grill-Amherst_Hampshire_County_Massachusetts.html'
# debug_reviews(path, filename, url_base)

## Case
# filename = 'Aquatic_Brewing_LLC-g41565-d22754735-Reviews-Aquatic_Brewing-Falmouth_Cape_Cod_Massachusetts.dill'
# url_base = 'https://www.tripadvisor.com/Attraction_Review-g41565-d22754735-Reviews-Aquatic_Brewing-Falmouth_Cape_Cod_Massachusetts.html'
# debug_reviews(path, filename, url_base)

## CASE
# filename = 'Honest_Weight_Artisan_Beer-g41754-d10342647-Reviews-Honest_Weight_Artisan_Beer-Orange_Massachusetts-or10.dill'
# url_base = 'https://www.tripadvisor.com/Attraction_Review-g41754-d10342647-Reviews-Honest_Weight_Artisan_Beer-Orange_Massachusetts.html'
# debug_reviews(path, filename, url_base)