In [None]:
# scraping
import requests
from lxml import html
import ast

# functions


def get_html(link, timeout=30):
    """Download a page and parse it into an lxml HTML tree.

    Parameters
    ----------
    link : str
        URL of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the scraping loops indefinitely.

    Returns
    -------
    lxml.html.HtmlElement
        Root element of the parsed document.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the timeout is exceeded.
    """
    page = requests.get(link, timeout=timeout)
    # The HTTP status is deliberately not checked: error pages are parsed
    # like any other page, and the callers then simply find no matching nodes.
    return html.fromstring(page.content)


def parse_beer_info(link, min_tr=4, max_tr=54):
    """Parse the beer listing table of a style page.

    Parameters
    ----------
    link : str
        URL of the style listing page.
    min_tr : int, optional
        1-based position of the first table row to read.
    max_tr : int, optional
        Position one past the last table row to read (half-open, like
        ``range``). The defaults read rows 4..53, i.e. at most 50 beers.

    Returns
    -------
    tuple
        ``(beer_id, beer_name, brewery_id, brew_name)``: four lists of equal
        length, one entry per table row that could be read. The ids are the
        strings taken from the row's first link.

    Notes
    -----
    Reading stops at the first row that is missing or incomplete, so a style
    with fewer rows than requested yields a shorter result instead of an
    ``IndexError`` that would discard the whole style.
    """
    # reuse the shared loader instead of duplicating the request/parse code
    tree = get_html(link)

    brew_name, beer_name, hrefs = [], [], []

    for i in range(min_tr, max_tr):
        row = '//*[@id="ba-content"]/table/tr[' + str(i) + ']'

        texts = tree.xpath(row + '//text()')
        links = tree.xpath(row + '//a/@href')

        # a row needs two text nodes (beer, brewery) and at least one link
        if len(texts) < 2 or not links:
            break

        beer_name.append(texts[0])
        brew_name.append(texts[1])
        hrefs.append(links[0])

    # Path segments 3 and 4 of the link hold the brewery and beer id.
    # NOTE(review): assumes hrefs look like '/beer/profile/<brewery>/<beer>/'.
    brewery_id = [href.split('/')[3] for href in hrefs]
    beer_id = [href.split('/')[4] for href in hrefs]

    return (beer_id, beer_name, brewery_id, brew_name)

def beer_link(page_id, brew_id, beer_id):
    """Build the absolute URL of one review page of a beer.

    ``page_id`` is 1-based; it is converted to the ``start`` offset used in
    the URL, which advances by 25 per page (page 1 -> 0, page 2 -> 25, ...).
    """
    offset = page_id * 25 - 25
    template = ('https://www.beeradvocate.com/beer/profile/'
                + '%(brewery)s/%(beer)s/?view=beer&sort=&start=%(page)d')

    return template % dict(brewery=brew_id, beer=beer_id, page=offset)


def beer_slink(brew_id, beer_id):
    """Build the site-relative review URL of a beer, without a start offset.

    The result ends in ``start=`` so that a page offset can follow it.
    """
    template = '/beer/profile/' + '%(brewery)s/%(beer)s/?view=beer&sort=&start='

    return template % dict(brewery=brew_id, beer=beer_id)


def get_max_page(max_page, tree, brew_id, beer_id):
    """Determine how many review pages to load for a beer.

    Parameters
    ----------
    max_page : int
        Upper limit on the number of pages to load.
    tree : lxml.html.HtmlElement
        Parsed first review page of the beer.
    brew_id, beer_id
        Identifiers used to recognise the beer's own pagination links.

    Returns
    -------
    int
        The number of review pages found in the pagination links, capped at
        ``max_page``; 1 if no pagination link is found.
    """
    per_page = 25  # offset step between pages, same as in beer_link
    link_s = beer_slink(brew_id, beer_id)

    offsets = []
    for href in tree.xpath('//a/@href'):
        if link_s not in href:
            continue
        # Keep only what follows the link prefix and drop any '#fragment',
        # so absolute URLs and anchors such as '0#XenForo' are handled
        # without aborting the whole scan.
        tail = href.split(link_s, 1)[1].split('#', 1)[0]
        if tail.isdecimal():
            offsets.append(int(tail))

    # The largest offset belongs to the last page. Offsets start at 0, so the
    # page count is offset // per_page + 1 (offset 50 -> pages 1, 2 and 3).
    page_count = max(offsets) // per_page + 1 if offsets else 1

    return min(page_count, max_page)


def extr_reviews(tree):
    """Collect the review texts found in a parsed review page.

    All text nodes of the review container are joined with spaces and then
    cut at the '\\xa0\\xa0rDev' marker; empty pieces are dropped and the
    remaining ones are stripped of non-breaking spaces and newlines.
    """
    text_nodes = tree.xpath('//div[@id="rating_fullview_content_2"][1]/text()')

    # one long string, cut wherever the marker occurs
    pieces = ' '.join(text_nodes).split('\xa0\xa0rDev')

    cleaned = []
    for piece in pieces:
        if not piece:
            continue
        cleaned.append(piece.replace('\xa0', '').replace('\n', ''))

    return cleaned


def extr_ratings(tree):
    """Collect the numeric ratings found in a parsed review page.

    Reads the text of the first ``span`` of every rating container and
    converts each value to ``float``.
    """
    score_texts = tree.xpath('//*[@id="rating_fullview_content_2"]/span[1]/text()')

    return list(map(float, score_texts))


def load_reviews(max_page, brew_id, beer_id):
    """Download review pages 1..max_page of a beer and gather their content.

    Returns a tuple ``(reviews, ratings)`` of two lists holding the review
    texts and the ratings of all visited pages, in page order.
    """
    all_reviews = []
    all_ratings = []

    for page_no in range(1, max_page + 1):
        page_tree = get_html(beer_link(page_no, brew_id, beer_id))

        all_reviews += extr_reviews(page_tree)
        all_ratings += extr_ratings(page_tree)

    return (all_reviews, all_ratings)

In [None]:
# (style id, style name) pairs to scrape. Each id is written next to its name
# so that disabling a style cannot shift the pairing: zipping the full id list
# with a partly commented-out name list paired the first three ids with the
# last three names.
# NOTE(review): the pairing follows the positional order of the original id
# and name lists; confirm the ids against the site.
bstyles = [
    # (155, 'American Pale Lager'),
    # (159, 'American Porter'),
    # (73, 'American Brown Ale'),
    # (175, 'American Black Ale'),
    # (116, 'American IPA'),
    (150, 'English India Pale Ale (IPA)'),
    (101, 'English Porter'),
    (158, 'American Stout'),
]

print('Extract all beers for styles')

# style name -> list of beer records
beer_result = {}

# loop over beer styles and get the beers with the highest amount of reviews
for style in bstyles:
    print(style)
    style_id, style_name = style
    link = 'https://www.beeradvocate.com/beer/style/%(style)d/?sort=revsD' % {
           'style': style_id}

    try:
        beer_ids, beer_names, brew_ids, brew_names = parse_beer_info(link)

        # one record (dict) per beer
        beers = [{'beer_id': int(beer_id),
                  'brew_id': int(brew_id),
                  'style_id': style_id,
                  'beer_name': beer_name,
                  'brew_name': brew_name,
                  'style_name': style_name}
                 for beer_id, brew_id, beer_name, brew_name
                 in zip(beer_ids, brew_ids, beer_names, brew_names)]

        beer_result[style_name] = beers
    except Exception as err:
        # Best effort: skip this style and continue with the next one, but
        # show what went wrong (e.g. fewer than 50 beers in the style, or a
        # network error) instead of hiding it.
        print('Some Problem here.', repr(err))

In [None]:
# upper limit of review pages loaded per beer
max_page = 30

# style name -> list of {'text': ..., 'rating': ...} records
beer_reviews = {key: [] for key in beer_result.keys()}

for style_name in beer_reviews:
    print(style_name)
    for beer in beer_result[style_name]:
        beer_id = beer['beer_id']
        brew_id = beer['brew_id']

        # the first page is needed to find out how many pages exist
        link = beer_link(1, brew_id, beer_id)
        tree = get_html(link)
        current_max_page = get_max_page(max_page, tree, brew_id, beer_id)

        # get reviews
        reviews, ratings = load_reviews(current_max_page, brew_id, beer_id)

        # Keep reviews longer than 100 characters and drop exact duplicates
        # (same text and rating) within this beer. First occurrences are kept
        # in page order, so repeated runs produce the same output order.
        seen = set()
        unique_reviews = []
        for text, rating in zip(reviews, ratings):
            if len(text) <= 100:
                continue
            key = (text, rating)
            if key in seen:
                continue
            seen.add(key)
            unique_reviews.append({'text': text, 'rating': rating})

        beer_reviews[style_name].extend(unique_reviews)

In [None]:
import json

# Write the collected reviews (style name -> list of review records) to disk
# as a single JSON document.
target = '/Users/eb/Google Drive/Research/text_words_meaning/twm-slides/notebooks/data/data.json'
with open(target, mode='w') as outfile:
    json.dump(beer_reviews, fp=outfile)