In [2]:
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time

In [4]:
# Command-line switches handed to Chrome; 'headless' runs it without a visible window.
CHROME_ARGUMENTS = (
    'headless',
    '--disable-infobars',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--remote-debugging-port=9222',
)

# Build the ChromeOptions object that is later passed to webdriver.Chrome(options=...).
options = webdriver.ChromeOptions()
for chrome_argument in CHROME_ARGUMENTS:
    options.add_argument(chrome_argument)

In [2]:
# Load the ratings table; the scraping cell reads its 'link' and 'beer' columns.
beers = pd.read_csv('beer_ratings.csv')
# Preview only the first rows instead of rendering the whole frame as cell output.
beers.head()

In [7]:
# Function to get beer info from the soup
def get_beer_info(soup):
    """
    Extracts the raw info strings for one beer from a parsed page.

    Takes the sixth element matching '.MuiPaper-root' (index 5) and collects
    the text of every '.MuiTypography-root' node inside it.

    Parameters
    ----------
    soup : object exposing ``select(css_selector)``, e.g. a BeautifulSoup document.

    Returns
    -------
    list of str
        Text of each matching node, in document order.
    str
        The sentinel string 'NaN' when extraction fails for any ordinary
        reason (fewer than six panels, ``soup`` is None, ...).
    """
    try:
        panel = soup.select('.MuiPaper-root')[5]
        return [node.text for node in panel.select('.MuiTypography-root')]
    except Exception:
        # Best-effort scraping: any ordinary failure yields the sentinel.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate so a long scraping run can still be stopped.
        return 'NaN'

In [None]:
# Scraping info on beers most commonly rated by users
ids = []  # not used by any code visible in this notebook; kept so the name stays defined
col = ['beer', 'info']
d = {i: [] for i in col}

# Data was scraped and processed in batches of 1000 or 2000; the last run stopped at 6433.
N_LINKS = 6433
PAGE_LOAD_WAIT_S = 5  # fixed pause after driver.get() before page_source is read

# One lookup per link instead of re-filtering the whole ratings frame on every
# iteration. drop_duplicates keeps the first row per link, which is the same row
# the former `beers[beers["link"] == link]["beer"].iloc[0]` selected.
beer_by_link = beers.drop_duplicates(subset='link').set_index('link')['beer']

for link in tqdm(beers['link'].value_counts().index[:N_LINKS]):
    beer = beer_by_link.loc[link]
    print(f'Scraping data for {beer}.')

    # Start from the same 'NaN' sentinel get_beer_info returns on failure, so that
    # 'beer' and 'info' always receive exactly one entry per link and a failed page
    # can never pick up the soup left over from the previous iteration.
    info = 'NaN'
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://www.ratebeer.com" + str(link))
        time.sleep(PAGE_LOAD_WAIT_S)
        soup = bs(driver.page_source, "html.parser")
        info = get_beer_info(soup)
    except Exception as exc:
        print('Could not load the page.')
        print(f'{type(exc).__name__}: {exc}')
    finally:
        # Always shut the browser down, even when loading or parsing failed.
        driver.quit()

    d['beer'].append(beer)
    d['info'].append(info)
    print(f'{beer} done!')

    # Rewrite the CSV after every beer, so the file reflects progress if the run stops.
    df = pd.DataFrame(data=d, columns=col)
    df.to_csv('beer_info.csv', index=False)
    print(f'Info on {beer} saved.')

In [None]:
# Functions to clean up beer info
def clean_up(x):
    """
    Cleans up beer info scraped from ratebeer.com

    Drops, in a single pass, every element that:
      * contains one of the markers 'VERIFIED', 'TOP 50', 'Reviews',
        'Ratings' or the Canadian flag emoji;
      * mentions 'Canada' without also containing 'Bas-Canada' or '(Canada)';
      * is, after stripping whitespace, exactly a province/territory name;
      * contains no alphabetic character at all.

    Parameters
    ----------
    x : iterable of str
        Raw info strings for one beer.

    Returns
    -------
    list of str
        At most the first seven surviving elements, in their original order.
    """
    # A missing comma after 'Newfoundland' used to fuse it with 'Manitoba' into
    # one bogus entry, so neither name was ever filtered out.
    provinces = {'Quebec', 'Ontario', 'British Columbia', 'New Brunswick', 'Nova Scotia',
                 'Saskatchewan', 'Alberta', 'Newfoundland', 'Manitoba',
                 'Prince Edward Island', 'Yukon', 'Nunavut', 'Northwest Territories'}
    noise_markers = ('VERIFIED', 'TOP 50', 'Reviews', 'Ratings', "🇨🇦")

    def _keep(element):
        if any(marker in element for marker in noise_markers):
            return False
        # Plain country mentions are dropped; 'Bas-Canada' and '(Canada)' are kept.
        if 'Canada' in element and 'Bas-Canada' not in element and "(Canada)" not in element:
            return False
        if element.strip() in provinces:
            return False
        return any(char.isalpha() for char in element)

    return [element for element in x if _keep(element)][:7]

def beer_status(x):
    """
    Returns the production status read from the third element of `x`.

    If x[2] contains 'seasonal' or 'Production' it is returned unchanged;
    otherwise the beer is reported as 'regular'.
    """
    status_field = x[2]
    has_marker = 'seasonal' in status_field or 'Production' in status_field
    return status_field if has_marker else 'regular'

def beer_style(x):
    """
    Returns the style entry of `x`, skipping over status entries.

    x[2] is returned unless it is a status marker (contains 'seasonal' or
    'Production'); in that case x[4] is tried under the same rule, and x[6]
    is returned when x[4] is a status marker as well.
    """
    def _is_status(field):
        return 'seasonal' in field or 'Production' in field

    if not _is_status(x[2]):
        return x[2]
    if not _is_status(x[4]):
        return x[4]
    return x[6]
    
def string_cleanup(x):
    """
    Strips surrounding whitespace and one pair of enclosing single quotes.

    Parameters
    ----------
    x : str

    Returns
    -------
    str
        `x` stripped of whitespace; if the stripped text both starts and ends
        with a single quote, those two characters are removed as well. Empty
        or whitespace-only input yields ''.
    """
    stripped = x.strip()
    # Slices instead of [0] / [-1] so that empty input no longer raises IndexError.
    if stripped[:1] == "'" and stripped[-1:] == "'":
        return stripped[1:-1]
    return stripped

In [None]:
# Normalise the scraped info lists, then derive one column per field of interest.
df['info'] = df['info'].apply(clean_up)
df['brewery'] = df['info'].map(lambda fields: string_cleanup(fields[0].replace('"', '').strip()))
df['style'] = df['info'].map(lambda fields: string_cleanup(beer_style(fields)))
df['status'] = df['info'].map(lambda fields: string_cleanup(beer_status(fields)))

# Build a single row mask: start with beers that are not out of production ...
rows_to_keep = df['status'] != 'Out of Production'

# ... and also drop ciders, meads and sakes, some of which were scraped along with the beers.
drinks_to_exclude = ['Cider', 'Mead', 'Saké']
for drink in drinks_to_exclude:
    rows_to_keep &= ~df['style'].str.contains(drink)

# The raw 'info' lists are not exported. Note that this writes to the same
# 'beer_info.csv' path the scraping cell writes to.
df = df[rows_to_keep].drop(columns=['info'])
df.to_csv('beer_info.csv', index=False)

# if done in batches, after cleanup df from each batch can be concatenated with the main df
# main = pd.read_csv('')
# result = pd.concat([main, df], ignore_index=True)