# Parser for the BeerAdvocate websites

In this notebook, we parse the HTML pages that were crawled by the other notebook.

In [1]:
import os

# Move up one level so that the `classes` package imported in the next cell
# resolves. The guard makes this cell idempotent: a bare os.chdir('..') climbs
# one more directory every time the cell is re-run, which breaks the imports.
# NOTE(review): assumes `classes/` lives in the parent of the notebook's
# start-up directory and not next to the notebook itself — confirm.
if not os.path.isdir('classes'):
    os.chdir('..')

In [2]:
from classes.helpers import *
from classes.parser import *
import pandas as pd
import numpy as np
import datetime
import time
import gzip

# IPython magics: these lines only work inside a notebook / IPython kernel.
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension (reload_ext is safe to run repeatedly).
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell executes, so edits to
# the project modules are picked up without restarting the kernel.
%autoreload 2

In [3]:
# Single Parser instance used by every parsing step in the cells below.
# NOTE(review): Parser is not imported by name; presumably it is provided by
# the wildcard import of classes.parser — confirm.
parser = Parser()

# Parse the breweries from the places

In [None]:
# Nothing is assigned here, so any results are presumably persisted by the
# Parser itself — TODO confirm in classes.parser.
parser.parse_breweries_from_places()

# Add missing breweries

In [None]:
# Return value is not captured; presumably this step extends the brewery data
# produced by an earlier step — TODO confirm in classes.parser.
parser.parse_missing_breweries()

# Parse the breweries page to get the number of beers

In [None]:
# Return value is not captured; where the per-brewery beer counts end up is
# not visible from this notebook — TODO confirm in classes.parser.
parser.parse_breweries_files_for_number()

# Parse the breweries page to get the beers

In [None]:
# Return value is not captured; presumably this produces the beers table that
# is inspected further down (data/parsed/beers.csv) — TODO confirm.
parser.parse_breweries_files_for_beers()

# Parse the beer files for information

In [None]:
# Return value is not captured; presumably this adds the per-beer details
# (ratings counts, scores, abv) seen in the beers table — TODO confirm.
parser.parse_beer_files_for_information()

# Parse the beer files for the reviews

In [None]:
# Return value is not captured; the output location of the parsed reviews is
# not visible from this notebook — TODO confirm in classes.parser.
parser.parse_beer_files_for_reviews()

# Get the users from the ratings

In [None]:
# Return value is not captured; presumably relies on the ratings/reviews
# parsed by an earlier step having been written already — TODO confirm.
parser.get_users_from_ratings()

# Parse the user files for information

In [None]:
# Return value is not captured; presumably requires the user list from the
# previous step — TODO confirm in classes.parser.
parser.parse_all_users()

In [30]:
# Load the parsed beers table into `df` for inspection.
# NOTE(review): the path is relative to the current working directory, which
# the first cell changes with os.chdir('..') — confirm that '../data' still
# resolves to the intended folder after that change.
df = pd.read_csv('../data/parsed/beers.csv')

In [31]:
# Preview only the first rows: the table has several hundred thousand rows, so
# displaying the bare frame relies on pandas' truncation and bloats the output.
df.head(10)

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv
0,166064,Nashe Moskovskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0.0,0.0,,,,4.70
1,166065,Nashe Pivovskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0.0,0.0,,,,3.80
2,166066,Nashe Shakhterskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0.0,0.0,,,,4.80
3,166067,Nashe Zhigulevskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0.0,0.0,,,,4.00
4,166063,Zhivoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0.0,0.0,,,,4.50
5,166068,Arpa,39913,Arpa (АРПА),Euro Pale Lager,0.0,0.0,,,,4.00
6,166071,Eles,39914,Bear Beer,Euro Pale Lager,0.0,0.0,,,,4.00
7,166072,Eles Light,39914,Bear Beer,Euro Pale Lager,0.0,0.0,,,,3.20
8,166074,Toroz Svetloye,39914,Bear Beer,American Pale Lager,0.0,0.0,,,,4.50
9,166076,Toroz Temnoye,39914,Bear Beer,Euro Dark Lager,0.0,0.0,,,,4.10


In [27]:
# Check that nbr_ratings casts to int (read_csv loaded it as float, e.g. 0.0).
# The converted Series is only displayed, not assigned, so `df` keeps its
# original float column.
df['nbr_ratings'].astype(int)

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         1
24         4
25        75
26         8
27        64
28         1
29         1
          ..
352816    18
352817     1
352818     0
352819    55
352820     1
352821     1
352822    40
352823     3
352824    51
352825     0
352826     1
352827     1
352828     1
352829     1
352830    18
352831    50
352832     1
352833     0
352834     0
352835     0
352836     2
352837     3
352838     0
352839     2
352840     3
352841     3
352842     2
352843     0
352844     0
352845     0
Name: nbr_ratings, dtype: int64

In [25]:
# Show only the beers that have at least one rating.
df.loc[df['nbr_ratings'].gt(0)]

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv
23,142544,Régab,37262,Societe des Brasseries du Gabon (SOBRAGA),Euro Pale Lager,1.0,1.0,2.88,,,4.50
24,19590,Barelegs Brew,10093,Strangford Lough Brewing Company Ltd,English Pale Ale,4.0,4.0,3.85,,,4.50
25,19827,Legbiter,10093,Strangford Lough Brewing Company Ltd,English Pale Ale,75.0,59.0,3.45,80.0,80.0,4.80
26,20841,St. Patrick's Ale,10093,Strangford Lough Brewing Company Ltd,English Pale Ale,8.0,6.0,3.86,,,6.00
27,20842,St. Patrick's Best,10093,Strangford Lough Brewing Company Ltd,English Bitter,64.0,48.0,3.56,82.0,90.0,4.20
28,22659,St. Patrick's Gold,10093,Strangford Lough Brewing Company Ltd,American Pale Wheat Ale,1.0,1.0,3.96,,,4.80
29,153718,Sheelin Blonde Ale,32848,The Sheelin Brewery,American Blonde Ale,1.0,0.0,4.53,,,
33,825,Caffrey's Irish Ale,297,Thomas Caffrey Brewing Co.,Irish Red Ale,131.0,82.0,3.23,77.0,85.0,3.80
34,7331,Stout,297,Thomas Caffrey Brewing Co.,American Stout,6.0,4.0,3.46,,,4.70
35,178689,Boom,40360,Walled City Brewing Company,American Pale Ale (APA),1.0,0.0,3.95,,,4.20


In [19]:
# Load tmp/beers_old.csv (judging by the name, an earlier export of the beers
# table — confirm).
# NOTE: this rebinds `df`, replacing the frame read from
# '../data/parsed/beers.csv'; any earlier cell re-run after this one will see
# this frame, which has no nbr_ratings column in the output shown.
df = pd.read_csv('../tmp/beers_old.csv')

In [20]:
# Preview only the first rows instead of displaying the whole frame.
df.head(10)

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style
0,166064,Nashe Moskovskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager
1,166065,Nashe Pivovskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager
2,166066,Nashe Shakhterskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager
3,166067,Nashe Zhigulevskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager
4,166063,Zhivoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager
5,166068,Arpa,39913,Arpa (АРПА),Euro Pale Lager
6,166071,Eles,39914,Bear Beer,Euro Pale Lager
7,166072,Eles Light,39914,Bear Beer,Euro Pale Lager
8,166074,Toroz Svetloye,39914,Bear Beer,American Pale Lager
9,166076,Toroz Temnoye,39914,Bear Beer,Euro Dark Lager
