In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import csv

## Step 1
Use a list comprehension to produce a list of the 100 URLs for the most recent beers. There are 100 pages of recent beers in total.

In [2]:
# Build the listing-page URLs for pages 1 through 100 (inclusive).
results_page_urls = [
    'https://www.ratebeer.com/beer-ratings/0/{}/'.format(page_number)
    for page_number in range(1, 101)
]
# Show the first five URLs as the cell's output.
results_page_urls[:5]

['https://www.ratebeer.com/beer-ratings/0/1/',
 'https://www.ratebeer.com/beer-ratings/0/2/',
 'https://www.ratebeer.com/beer-ratings/0/3/',
 'https://www.ratebeer.com/beer-ratings/0/4/',
 'https://www.ratebeer.com/beer-ratings/0/5/']

## Step 2
Iterate over all results pages and collect the href of each beer detail page.

In [7]:
# Collect the detail-page URL of every beer linked from the results pages.
total_urls = []
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36'}

# Upper bound (seconds) on a single request so one stalled connection cannot hang the crawl.
REQUEST_TIMEOUT_S = 30

for url in results_page_urls:
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as exc:
        # Network-level failure (timeout, DNS, reset, ...): report it and move on to the next page.
        print(f"Oops! Request failed for page {url}: {exc}")
    else:
        status = response.status_code
        if status == 200:
            page = response.text
            # Name the parser explicitly so results do not depend on which parsers are installed.
            soup = bs(page, 'html.parser')

            # Beer links are located by their inline style attribute on the results page.
            a_tags = soup.find_all('a', attrs={'style':'font-size:20px; font-weight:bold;'})
            # Skip anchors without an href; they would otherwise produce ".../ratebeer.comNone".
            urls = [f"https://www.ratebeer.com{a_tag.get('href')}" for a_tag in a_tags if a_tag.get('href')]

            total_urls.extend(urls)
        else:
            print(f"Oops! Received status code {status} for page {url}!")

    # Pause between requests on every path, including failures, to stay polite to the server.
    time.sleep(1)

Oops! Received status code 500 for page https://www.ratebeer.com/beer-ratings/0/54/!


## Step 3
Use Selenium to visit each beer detail page to scrape field data.

In [8]:
# Local chromedriver binary, output file, and the CSV column order.
CHROMEDRIVER_PATH = r"C:\Program Files\chromedriver.exe"
OUTPUT_CSV_PATH = 'beer_data.csv'
CSV_COLUMNS = ['brewery', 'beer_name', 'overall_score', 'style_score', 'rating',
               'num_ratings', 'num_reviews', 'location', 'abv_percent', 'beer_style',
               'description', 'served_in_list', 'cal_per_355_ml', 'date_added', 'url']


def _element_text(driver, xpath):
    """Return the text of the first element matching ``xpath``.

    Propagates whatever ``driver.find_element`` raises when nothing matches.
    """
    return driver.find_element('xpath', xpath).text


def _optional_field(driver, xpath, convert, url, label=None):
    """Scrape a field that may be absent from the page.

    Parameters:
        driver: Selenium driver positioned on the beer detail page.
        xpath: XPath of the element whose text holds the value.
        convert: callable applied to the element text (e.g. ``int``).
        url: page URL, used only in the warning message.
        label: field name for the warning; ``None`` suppresses the warning.

    Returns:
        The converted value, or ``None`` if the element is missing or the
        conversion fails.
    """
    try:
        return convert(_element_text(driver, xpath))
    except Exception:
        # Best-effort field: record None instead of failing the whole page.
        if label is not None:
            print(f"Oops! {label} missing for page: {url}")
        return None


def _count_token(element, position, label, url):
    """Parse the whitespace-separated token at ``position`` of ``element.text`` as an int.

    Thousands separators (commas) are stripped before conversion. Returns
    ``None`` and prints a warning when the token is missing or not numeric.
    """
    try:
        return int(element.text.split()[position].replace(',', ''))
    except Exception:
        print(f"Oops! {label} missing for page: {url}")
        return None


def _scrape_beer_page(driver, url):
    """Load one beer detail page and return its fields as a dict keyed by CSV_COLUMNS.

    Required elements (show-more button, brewery, beer name, rating, counts
    element, beer style, "More statistics" toggle, date added) raise if they
    are missing; optional fields are stored as ``None``.
    """
    driver.get(url)

    beer_dict = {}

    # Waiting for site to load before clicking "Accept Cookies"
    time.sleep(1)

    # Click "Accept Cookies" if popup exists
    try:
        driver.find_element('xpath', '//button[@id="onetrust-accept-btn-handler"]').click()
    except Exception:
        pass

    # Click the page's "show more" button before reading the fields
    # (presumably expands the description -- TODO confirm).
    show_more = driver.find_element('xpath', '//button[@class="MuiButtonBase-root MuiButton-root MuiButton-text Button___StyledMaterialButton-FZwYh bGOCJz colorized__WrappedComponent-apsCh kAVjHC -ml-3 mt-3 MuiButton-textPrimary"]')
    show_more.click()

    # Scrape brewery name
    beer_dict['brewery'] = _element_text(driver, '//a[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn pzIrn colorized__WrappedComponent-hrwcZr liJcHu Anchor___StyledText-uWnSM eseQug MuiTypography-subtitle1"]')

    # Scrape beer name
    beer_dict['beer_name'] = _element_text(driver, '//div[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn pzIrn text-500 colorized__WrappedComponent-hrwcZr hwjOn mt-3 MuiTypography-h4"]')

    # Scrape overall and style scores (silently None when absent)
    beer_dict['overall_score'] = _optional_field(
        driver, '//div[@class="BeerRatingsWidget___StyledDiv3-eOHJZv bulLND fa-c"]/div[1]/div', int, url)
    beer_dict['style_score'] = _optional_field(
        driver, '//div[@class="BeerRatingsWidget___StyledDiv3-eOHJZv bulLND fa-c"]/div[2]/div', int, url)

    # Scrape rating
    beer_dict['rating'] = float(_element_text(driver, '//div[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn pzIrn text-500 colorized__WrappedComponent-hrwcZr hwjOn mr-2 MuiTypography-body2"]'))

    # Scrape number of ratings and reviews: both come from one element's text,
    # token 0 holding the ratings count and token 2 the reviews count.
    num_ratings_reviews = driver.find_element('xpath', '//span[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn kbrPIo colorized__WrappedComponent-hrwcZr gRvDpm mr-3 MuiTypography-caption"]')
    beer_dict['num_ratings'] = _count_token(num_ratings_reviews, 0, 'Number of ratings', url)
    beer_dict['num_reviews'] = _count_token(num_ratings_reviews, 2, 'Number of reviews', url)

    # Scrape location of brewery. The raw text is stored as-is; splitting it
    # into city / state-region / country is left to downstream cleaning.
    beer_dict['location'] = _optional_field(
        driver,
        '//div[@class="BeerCard___StyledDiv2-ieYeaq eRqQUm mb-3"]/div[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn pzIrn colorized__WrappedComponent-hrwcZr hwjOn MuiTypography-body2"]',
        lambda text: text, url, label='location')

    # Scrape alcohol by volume percent
    beer_dict['abv_percent'] = _optional_field(
        driver,
        '//div[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn kbrPIo colorized__WrappedComponent-hrwcZr bRPQdN MuiTypography-caption"]',
        lambda text: float(text.replace('%','')), url, label='ABV')

    # Scrape beer style
    beer_dict['beer_style'] = _element_text(driver, '//div[@class="fj-s"]/a[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn kbrPIo colorized__WrappedComponent-hrwcZr liJcHu Anchor___StyledText-uWnSM eseQug MuiTypography-caption"]')

    # Scrape beer description
    beer_dict['description'] = _optional_field(
        driver,
        '//div[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn pzIrn colorized__WrappedComponent-hrwcZr hwjOn pre-wrap MuiTypography-body2"]',
        lambda text: text, url, label='Description')

    # Scrape type of serving glass (zero or more tags; an empty list when none match)
    served_in = driver.find_elements('xpath', '//span[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn kbrPIo colorized__WrappedComponent-hrwcZr hwjOn Tag___StyledText-kDDuQR kZyKXJ cursor-pointer mr-3 my-2 MuiTypography-caption"]')
    beer_dict['served_in_list'] = [element.text for element in served_in]

    # Scrape calories per 355 ml (first whitespace-separated token of the label text)
    beer_dict['cal_per_355_ml'] = _optional_field(
        driver,
        '//div[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn pzIrn colorized__WrappedComponent-hrwcZr hwjOn LabelInfo___StyledText-eCbuRi hJPZRX fa-c MuiTypography-body2"]',
        lambda text: int(text.split()[0]), url, label='Calories')

    # Expanding "More statistics"
    more_stats = driver.find_element('xpath', '//div[@class="MuiTypography-root Text___StyledTypographyTypeless-bukSfn dLSnYi colorized__WrappedComponent-hrwcZr gRvDpm cursor-pointer MuiTypography-caption"]')
    more_stats.click()

    # Scrape date added to website from "More statistics"
    beer_dict['date_added'] = _element_text(driver, '//div[@class="p-4"]/div[@class="fj-sb mb-2"]/div[2]')

    beer_dict['url'] = url

    return beer_dict


ser = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=ser)

try:
    # The context manager guarantees the CSV is flushed and closed even if scraping is interrupted.
    with open(OUTPUT_CSV_PATH, 'w', newline='', encoding='utf8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(CSV_COLUMNS)

        for url in total_urls:
            try:
                beer_dict = _scrape_beer_page(driver, url)
            except Exception as exc:
                # A page missing a required element should not discard the rest of the crawl.
                print(f"Oops! Skipping page {url} ({type(exc).__name__})")
            else:
                # Emit values by column name so the row always lines up with the header.
                csv_writer.writerow([beer_dict[column] for column in CSV_COLUMNS])

            time.sleep(1)
finally:
    # quit() ends the whole WebDriver session; close() would only close the current window.
    driver.quit()

Oops! ABV missing for page: https://www.ratebeer.com/beer/santa-clausthaler/18656/272845/
Oops! Calories missing for page: https://www.ratebeer.com/beer/santa-clausthaler/18656/272845/
Oops! ABV missing for page: https://www.ratebeer.com/beer/a-la-derive-boathouse-2/825125/229832/
Oops! Calories missing for page: https://www.ratebeer.com/beer/a-la-derive-boathouse-2/825125/229832/
Oops! ABV missing for page: https://www.ratebeer.com/beer/rrey-american-ipa/655051/128160/
Oops! Calories missing for page: https://www.ratebeer.com/beer/rrey-american-ipa/655051/128160/
Oops! ABV missing for page: https://www.ratebeer.com/beer/fayrouz-pineapple-egypt/990930/240051/
Oops! Calories missing for page: https://www.ratebeer.com/beer/fayrouz-pineapple-egypt/990930/240051/
Oops! ABV missing for page: https://www.ratebeer.com/beer/guinness-draught-00/869291/90466/
Oops! Calories missing for page: https://www.ratebeer.com/beer/guinness-draught-00/869291/90466/
Oops! ABV missing for page: https://www.r