In [None]:
import re
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time

In [None]:
# Command-line switches handed to every Chrome instance started by this notebook.
# They are applied in the order listed.
options = webdriver.ChromeOptions()
for chrome_flag in (
    'headless',
    '--disable-infobars',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--remote-debugging-port=9222',
):
    options.add_argument(chrome_flag)

In [None]:
# Functions to get beer data from the soup

def get_beer(soup, n):
    """Return the text of the first cell of row ``n`` in the ratings table.

    The table is located as the first ``.table`` element inside the second
    ``.row`` element of ``soup``.

    Parameters
    ----------
    soup : parsed page exposing a CSS ``select`` method (e.g. BeautifulSoup).
    n : int
        Index of the ``tr`` row to read.

    Returns
    -------
    str
        The cell text, or the string ``'NaN'`` when any element along the
        path is missing.
    """
    try:
        return soup.select('.row')[1].select('.table')[0].select('tr')[n].select('td')[0].text
    # Only "element not found" style failures are treated as missing data;
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except (LookupError, AttributeError, TypeError):
        return 'NaN'

def get_beer_link(soup, n):
    """Return the ``href`` of the first link in the first cell of row ``n``.

    The table is located as the first ``.table`` element inside the second
    ``.row`` element of ``soup``.

    Parameters
    ----------
    soup : parsed page exposing a CSS ``select`` method (e.g. BeautifulSoup).
    n : int
        Index of the ``tr`` row to read.

    Returns
    -------
    str
        The ``href`` attribute value, or the string ``'NaN'`` when the row,
        cell, link or attribute is missing.
    """
    try:
        return soup.select('.row')[1].select('.table')[0].select('tr')[n].select('td')[0].select('a')[0]['href']
    # Only "element/attribute not found" style failures are treated as missing
    # data; KeyboardInterrupt / SystemExit are no longer swallowed.
    except (LookupError, AttributeError, TypeError):
        return 'NaN'

def get_user_rating(soup, n):
    """Return the text of the second cell of row ``n`` in the ratings table.

    The table is located as the first ``.table`` element inside the second
    ``.row`` element of ``soup``.

    Parameters
    ----------
    soup : parsed page exposing a CSS ``select`` method (e.g. BeautifulSoup).
    n : int
        Index of the ``tr`` row to read.

    Returns
    -------
    str
        The cell text, or the string ``'NaN'`` when any element along the
        path is missing.
    """
    try:
        return soup.select('.row')[1].select('.table')[0].select('tr')[n].select('td')[1].text
    # Only "element not found" style failures are treated as missing data;
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except (LookupError, AttributeError, TypeError):
        return 'NaN'

def get_avg_rating(soup, n):
    """Return the text of the fourth cell of row ``n`` in the ratings table.

    The table is located as the first ``.table`` element inside the second
    ``.row`` element of ``soup``.

    Parameters
    ----------
    soup : parsed page exposing a CSS ``select`` method (e.g. BeautifulSoup).
    n : int
        Index of the ``tr`` row to read.

    Returns
    -------
    str
        The cell text, or the string ``'NaN'`` when any element along the
        path is missing.
    """
    try:
        return soup.select('.row')[1].select('.table')[0].select('tr')[n].select('td')[3].text
    # Only "element not found" style failures are treated as missing data;
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except (LookupError, AttributeError, TypeError):
        return 'NaN'

In [None]:
# Scraping user ratings

# A number of user ids for more active users (i.e. those with more ratings and
# reviews) were scraped from ratebeer.com and saved as a .txt file.
with open('./raw data/users.txt', 'r') as file:
    raw_users = file.read()

# The file used to be kept as one long string (newlines removed), so the loop
# below walked over single characters instead of user ids. Split it into ids.
# NOTE(review): the exact layout of users.txt is not visible from this notebook;
# ids are assumed to be separated by whitespace and/or commas, optionally
# wrapped in list brackets/quotes -- confirm against the file.
diff_users = [
    token for token in
    (piece.strip("[]'\"") for piece in re.split(r'[\s,]+', raw_users))
    if token
]

BASE_URL = "https://www.ratebeer.com"
PAGE_LOAD_WAIT = 5     # seconds to wait after loading a page before parsing it
FULL_PAGE_ROWS = 101   # row count treated as a full page (per the original comment, 101 is the per-page limit)
MIN_ROWS = 50          # users whose first page has this many rows or fewer are skipped
MAX_PAGES = 20         # 2000 beer ratings per user limit

ids = []
col = ['beer', 'link', 'user_rating', 'avg_rating', 'id']
d = {i: [] for i in col}


def _fetch_soup(url):
    """Load ``url`` in a fresh Chrome instance and return the parsed page.

    The browser is always shut down, even if loading fails, so no Chrome
    process is left behind.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.implicitly_wait(2)
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT)
        return bs(driver.page_source, "html.parser")
    finally:
        driver.quit()


def _rating_rows(soup):
    """Return the ``tr`` rows of the ratings table, or None if the table is absent."""
    try:
        return soup.select('.row')[1].select('.table')[0].select('tr')
    except IndexError:
        return None


def _record_page(soup, n_rows, user_id):
    """Append rows 1..n_rows-1 of the page to ``d`` and return the links recorded."""
    page_links = []
    for row in tqdm(range(1, n_rows)):
        link = get_beer_link(soup, row)
        d['beer'].append(get_beer(soup, row))
        d['link'].append(link)
        d['user_rating'].append(get_user_rating(soup, row))
        d['avg_rating'].append(get_avg_rating(soup, row))
        d['id'].append(user_id)
        page_links.append(link)
    return page_links


for user_id in tqdm(diff_users):

    # to avoid duplicates
    if user_id in ids:
        print(f'Already have data on {user_id}.')
        continue

    print(f'Scraping beer ratings for {user_id}')
    soup = _fetch_soup(f"{BASE_URL}{user_id}country/39/5/1/")
    beers = _rating_rows(soup)
    if beers is None:
        # no ratings table on the page: nothing to record for this user
        continue

    # ignoring users with < 50 ratings; 101 is the limit per page so this
    # scenario is a user with 1 page of beer ratings to scrape
    if MIN_ROWS < len(beers) < FULL_PAGE_ROWS:
        print(f'{user_id} has rated between 50 and 100 Canadian beers.')
        _record_page(soup, len(beers), user_id)
        ids.append(user_id)
        print(f'{user_id} done.')

    # in this scenario a user has more than 100 ratings (several pages) to scrape
    elif len(beers) >= FULL_PAGE_ROWS:
        print(f'{user_id} has rated over 100 Canadian beers.')
        # Links already recorded for THIS user only. Checking against every
        # user's links would wrongly stop as soon as the first beer on a page
        # had also been rated by a previously scraped user.
        seen_links = set()
        page = 1

        while True:
            # page 1 was already downloaded above, so reuse it
            if page > 1:
                soup = _fetch_soup(f"{BASE_URL}{user_id}country/39/5/{page}/")
            print(f'Scraping data from page {page} for {user_id}')

            # the first beer was already recorded for this user: treat it as
            # a repeated page and stop
            if get_beer_link(soup, 1) in seen_links:
                print(f'{user_id} done.')
                ids.append(user_id)
                break

            # 2000 beer ratings per user limit
            if page > MAX_PAGES:
                print(f'{user_id} has rated a lot of Canadian beers; stopped at 2000.')
                ids.append(user_id)
                break

            beers = _rating_rows(soup)
            if beers is None:
                # table missing on this page: give up on this user without
                # marking them as done
                break

            # as long as there are 100 beer ratings on the page, scrape all and
            # move on to the next page; if there are < 100 ratings, scrape all
            # and exit the loop
            seen_links.update(_record_page(soup, len(beers), user_id))
            print(f'{len(beers)-1} beers on this page.')
            page += 1
            if len(beers) != FULL_PAGE_ROWS:
                ids.append(user_id)
                print(f'{user_id} done.')
                break

    else:
        print(f'{user_id} has rated fewer than 50 Canadian beers so will be skipped.')

    # checkpoint everything collected so far after each user
    df = pd.DataFrame(data=d, columns=col)
    df.to_csv('beer_ratings.csv', index=False)
    print(f'Ratings of {user_id} saved.')