In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote import webelement

from pathlib import Path
import csv, time
from typing import TextIO, List, Tuple, Optional
import logging


In [7]:
class BeeerAdvocateScraper:
    """Class that scrapes beer advocate reviews. """
    website: str = "https://www.beeradvocate.com/community/login/"
    username: str = "jojoshulktest@gmail.com"
    password: str = "Xontros.beeradvocate1"
    logger = logging.getLogger("BeeerAdvocateScraper")

    def __init__(self,
                 driver: WebDriver = None,
                 output_path: str = None,
                 wait: int = 5
                 ):
        self.driver = driver
        self.wait = wait
        self.logger = self._setup_logger()
        self.writer: Optional[csv.writer] = None
        self.fw: Optional[TextIO] = None
        # self.info_writer: Optional[csv.writer] = None
        # self.info_fw: Optional[TextIO] = None
        self.output_dir: Path = Path()


    def _setup_logger(self) -> logging.Logger:
        """Setup up logger"""

        # Create logger
        logger: logging.Logger = logging.getLogger(self.__class__.__name__)
        logger.setLevel(logging.INFO)

        if not logger.handlers:
            # Create console handler and set level to debug
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)

            # Create formatter
            formatter = logging.Formatter('[%(asctime)s] %(levelname)s [%(name)s] - %(message)s')

            # Add formatter to ch
            ch.setFormatter(formatter)

            # Add ch to logger
            logger.addHandler(ch)

        return logger

    def _writer(self, brewery_id: str, beer_id: str) -> Tuple[csv.writer, TextIO]:
        """Initiates a csv.writer method and returns it"""

        # open a new csv writer
        output_path: Path = self.output_dir / "reviews" / f"{brewery_id}_{beer_id}_reviews.csv"
        fw: TextIO = output_path.open(mode="w",encoding="utf8")
        writer = csv.writer(fw,lineterminator="\n")
        writer.writerow(["brewery_id",
                        "beer_id",
                        "username",
                        "rating",
                        "text",
                        "brewery_name",
                        "Country",
                        "Style",
                        "Score"])
        self.logger.info(f"Opened file at {output_path}")
        return writer, fw

    def _setup_driver(self) -> WebDriver:
        if self.driver:
            self.logger.info(f"Driver already present.")
            pass
        else:
            self.logger.info(f"Installing Driver.")
            self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

        return self.driver

    def _load_main_page(self) -> WebDriver:
        # Set up the driver and navigate to the login page
        self.logger.info(f"Initialize website: {self.website}.")
        self.driver.maximize_window()
        self.driver.get(self.website)
        time.sleep(self.wait*3)

    def _accept_cookies(self) -> None:
        """Try to accept cookies"""
        _ = WebDriverWait(WebDriver, self.wait)
        try:
            accept_button = self.driver.find_element(By.CSS_SELECTOR, "button[class='fc-button fc-cta-consent fc-primary-button']")
            accept_button.click()
            self.logger.info("Cookies accepted")
        except NoSuchElementException:
            self.logger.warning("Cookies element not found.")
        time.sleep(self.wait)

    def _close_banner(self) -> None:
        """Sometimes an ad banner is placed on top of the loging button. Try to close it first"""
        try:
            banner = self.driver.find_element(By.CSS_SELECTOR, "#AdvallyAdhesion > div.__AdvallyClose > img")
            banner.click()
            self.logger.info("Ad banner Clicked.")
        except NoSuchElementException:
            self.logger.warning("Banner not found.")
        time.sleep(self.wait)

    def _login(self) -> None:
        """login to the site"""

        WebDriverWait(self.driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR,'form#pageLogin input#ctrl_pageLogin_login'))).send_keys(self.username)

        #for password
        self.driver.find_element(by=By.CSS_SELECTOR, value='form#pageLogin input#ctrl_pageLogin_password').send_keys(self.password)

        #for submit
        self.driver.find_element(by=By.CSS_SELECTOR, value='form#pageLogin input[type=submit]').click()

        self.logger.info("Log in successful.")
        time.sleep(self.wait)


    def _load_page(self, brewery_id, beer_id) -> None:
        """load a page based on the brewery_id and beer_id"""
        # Navigate to the beer page
        page: str = f"https://www.beeradvocate.com/beer/profile/{brewery_id}/{beer_id}/"
        self.logger.info(f"Loading Page {page}")
        self.driver.get(page)

    def _iterate_pages(self) -> None:
        """Iterates through all pages of reviews and writes them to the output file."""
        self.logger.info("Starting page iteration.")
        last = False
        count = 0

        info: dict = self._get_basic_info()
        while not last:

            # Write reviews for current page
            self._write_page_reviews(info=info)

            # Find and click the "next" button to navigate to the next page
            _next = False
            button = WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="ba-content"]//div//span//a')))
            for b in button:
                if b.text == 'next':
                    b.click()
                    _next = True
                    scraper.driver.execute_script('window,scrollTo(0,document.body.scrollHeight)')
                    break

            # If there is no "next" button, we have reached the last page
            if not _next:
                check_last = WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="ba-content"]//div//span//span')))
                for element in check_last:
                    if 'muted' in element.get_attribute("class"):
                        self.logger.info("Reached Last review Page.")
                        last = True
                        break

            if count % 20 == 0 and count > 0:
                self.logger.debug(f"{count} review pages loaded")
            count += 1


    def _write_review(self, review: dict) -> None:
        """Writes a review into the open file"""
        # write a new row
        self.writer.writerow([review["brewery_id"],
                              review["beer_id"],
                              review["username"],
                              review["rating"],
                              review["text"],
                              review["brewery_name"],
                              review["Country"],
                              review["Style"],
                              review["Score"]])

    def _get_basic_info(self) -> dict:
        """
        Gets the basic information about the beer from the "info_box" section of the webpage.

        Returns:
            A dictionary containing the basic information about the beer.
        """

        # Find all the "dd" elements under "div" elements that do not have "muted" in their class name
        info_elements = WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="info_box"]//div[not(contains(@class, "muted"))]//dl//dd')))

        # Find all the "dt" elements under "div" elements in the "info_box" section
        info_elements_text = WebDriverWait(self.driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="info_box"]//div//dl//dt')))

        _dict = {}

        # Iterate over each piece of information and add it to the dictionary
        for info in zip(info_elements, info_elements_text):
            # If there is text in the "dt" element, use it as the key in the dictionary
            if info[1].text != ' ':
                _dict[info[1].text.replace(":", "").strip()] = info[0].text.split("\n")[0].strip()
            # If there is no text in the "dt" element, assume it is the country and use "country" as the key
            else:
                _dict["Country"] = info[0].text

        self.logger.info(f"Beer Info retrieved.")
        # Return the dictionary of basic information
        return _dict

    def _write_page_reviews(self, info: dict) -> None:
        """get all reviews in the currenlty open page"""

        # Scrape the reviews
        review_divs = WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".user-comment")))

        for review_div in review_divs:
            elements = review_div.find_element(by=By.CSS_SELECTOR, value="[id='rating_fullview_content_2']")

            text_elements = elements.find_elements(By.TAG_NAME, "div")
            for text_element in text_elements:
                review = {
                    "username": elements.find_element(by=By.CSS_SELECTOR, value="[class='username']").text,
                    "rating": elements.find_element(by=By.CSS_SELECTOR, value="[class='BAscore_norm']").text,
                    "brewery_id": brewery_id,
                    "beer_id": beer_id,
                    "text": text_element.text.replace("\n", " "),
                    "brewery_name":  info.get("From"),
                    "Country":  info.get("Country"),
                    "Style":  info.get("Style"),
                    "Score":  info.get("Score")
                }
                self._write_review(review)

    def setup(self) -> None:
        """Main method that performs needed steps to get the reviews"""
        self._setup_driver()
        self._load_main_page()
        self._accept_cookies()
        self._close_banner()
        self._login()

    def get_reviews(self, brewery_id: int, beer_id: int) -> None:
        """Get all reviews for a given brewery_id and beer_id"""
        self.writer, self.fw = self._writer(brewery_id=str(brewery_id), beer_id=str(beer_id))
        self._load_page(brewery_id=brewery_id, beer_id=beer_id)
        try:
            self._iterate_pages()
        except KeyboardInterrupt:
            self.fw.close()
        self.logger.info(f"All reviews Loaded.")
        self.fw.close()


    def shutdown(self) -> None:
        """close driver and csv writer"""
        self.logger.info("Closing Driver.")
        self.driver.quit()
        self.fw.close()


In [3]:
import pandas as pd
path = r"C:\Users\jojoshulk\PycharmProjects\MscDataScience\Recommender_Systems\Project_1\beer_reviews.csv"
df = pd.read_csv(path)


In [9]:
scraper = BeeerAdvocateScraper(wait=2)
scraper.setup()

[2023-03-20 00:44:51,184] INFO [BeeerAdvocateScraper] - Installing Driver.
[2023-03-20 00:44:54,283] INFO [BeeerAdvocateScraper] - Initialize website: https://www.beeradvocate.com/community/login/.
[2023-03-20 00:45:03,693] INFO [BeeerAdvocateScraper] - Cookies accepted
[2023-03-20 00:45:05,763] INFO [BeeerAdvocateScraper] - Ad banner Clicked.
[2023-03-20 00:45:11,044] INFO [BeeerAdvocateScraper] - Log in successful.


In [12]:
count_df: pd.DataFrame = df.groupby(['brewery_id', 'beer_beerid'])['index'].agg('count').reset_index().sort_values(by="index", ascending=False)

for i in range(60,20000):
    temp = count_df.iloc[i]
    brewery_id = temp.brewery_id
    beer_id = temp.beer_beerid

    scraper.get_reviews(brewery_id=brewery_id, beer_id=beer_id)

[2023-03-20 18:45:18,748] INFO [BeeerAdvocateScraper] - Opened file at reviews\113_782_reviews.csv
[2023-03-20 18:45:18,748] INFO [BeeerAdvocateScraper] - Loading Page https://www.beeradvocate.com/beer/profile/113/782/
[2023-03-20 18:45:21,386] INFO [BeeerAdvocateScraper] - Starting page iteration.
[2023-03-20 18:45:23,214] INFO [BeeerAdvocateScraper] - Beer Info retrieved.
[2023-03-20 18:54:29,772] INFO [BeeerAdvocateScraper] - Reached Last review Page.
[2023-03-20 18:54:29,772] INFO [BeeerAdvocateScraper] - All reviews Loaded.
[2023-03-20 18:54:29,775] INFO [BeeerAdvocateScraper] - Opened file at reviews\147_38470_reviews.csv
[2023-03-20 18:54:29,776] INFO [BeeerAdvocateScraper] - Loading Page https://www.beeradvocate.com/beer/profile/147/38470/
[2023-03-20 18:54:31,253] INFO [BeeerAdvocateScraper] - Starting page iteration.
[2023-03-20 18:54:33,148] INFO [BeeerAdvocateScraper] - Beer Info retrieved.
[2023-03-20 19:06:49,449] INFO [BeeerAdvocateScraper] - Reached Last review Page.
[2

ElementClickInterceptedException: Message: element click intercepted: Element <a href="/beer/profile/30/22352/?view=beer&amp;show=recent&amp;start=2440#lists">...</a> is not clickable at point (1145, 8). Other element would receive the click: <div class="navTabs">...</div>
  (Session info: chrome=111.0.5563.65)
Stacktrace:
Backtrace:
	(No symbol) [0x0044DCE3]
	(No symbol) [0x003E39D1]
	(No symbol) [0x002F4DA8]
	(No symbol) [0x003252EA]
	(No symbol) [0x00323BBA]
	(No symbol) [0x00321EFB]
	(No symbol) [0x00321027]
	(No symbol) [0x00318D05]
	(No symbol) [0x0033AECC]
	(No symbol) [0x00318726]
	(No symbol) [0x0033B224]
	(No symbol) [0x0034D57C]
	(No symbol) [0x0033ACC6]
	(No symbol) [0x00316F68]
	(No symbol) [0x003180CD]
	GetHandleVerifier [0x006C3832+2506274]
	GetHandleVerifier [0x006F9794+2727300]
	GetHandleVerifier [0x006FE36C+2746716]
	GetHandleVerifier [0x004F6690+617600]
	(No symbol) [0x003EC712]
	(No symbol) [0x003F1FF8]
	(No symbol) [0x003F20DB]
	(No symbol) [0x003FC63B]
	BaseThreadInitThunk [0x758F00F9+25]
	RtlGetAppContainerNamedObjectPath [0x779A7BBE+286]
	RtlGetAppContainerNamedObjectPath [0x779A7B8E+238]


In [13]:
i

65

In [8]:
scraper.shutdown()

[2023-03-20 00:44:42,274] INFO [BeeerAdvocateScraper] - Closing Driver.
