# Parser for RateBeer

This notebook parses the locally stored HTML pages of the RateBeer website (breweries, beers, and reviews) and writes the extracted information to the `parsed/` data folder.

In [2]:
import os
# Move the working directory one level up; presumably this makes the `classes` package
# importable from the notebook's folder -- TODO confirm.
# NOTE(review): the path is relative, so this cell is not idempotent: every re-run without a
# kernel restart moves one more level up and changes what the relative data paths point to.
os.chdir('..')

In [3]:
from classes.parser import *

import pandas as pd
import numpy as np
import datetime
import time
import html
import gzip
import re
import os

data_folder = '../data/'

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [4]:
parser = Parser(data_folder)

# Parse the breweries from the places

In [4]:
parser.parse_breweries_from_places()

# Parse the beers from the breweries

In [18]:
parser.parse_brewery_files()

# Parse the beer files for information

In [172]:
parser.parse_beer_files_for_information()

# Parse the beer files for the reviews

In [9]:
parser.parse_beer_files_for_reviews()

In [12]:
df

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,overall_score,style_score,avg,abv
0,339174,Barbican Apple Malt Beverage,8795,Aujan Industries Co.,Fruit Beer,0,,,,
1,307184,Barbican Lemon Malt Beverage,8795,Aujan Industries Co.,Fruit Beer,0,,,,
2,399610,Barbican Mango & Passion Fruit Malt Beverage,8795,Aujan Industries Co.,Low Alcohol,0,,,,
3,247462,Barbican Peach Malt Beverage,8795,Aujan Industries Co.,Low Alcohol,0,,,,
4,226285,Barbican Pineapple Malt Beverage,8795,Aujan Industries Co.,Low Alcohol,0,,,,
5,226287,Barbican Pomegranate Malt Beverage,8795,Aujan Industries Co.,Low Alcohol,0,,,,
6,76211,Barbican Premium Malt,8795,Aujan Industries Co.,Low Alcohol,177,0.0,5.0,1.50,
7,226286,Barbican Raspberry Malt Beverage,8795,Aujan Industries Co.,Low Alcohol,0,,,,
8,462079,Barbican Special Edition,8795,Aujan Industries Co.,Low Alcohol,7,,,1.86,
9,176277,Barbican Strawberry Malt Beverage,8795,Aujan Industries Co.,Low Alcohol,0,,,,


In [21]:
# Load the DF
df = pd.read_csv(parser.data_folder + 'parsed/beers2.csv')

# Optional debugging hook: `folder_to_watch` is only assigned by a later cell, so it is read
# defensively here. On a fresh kernel (Restart & Run All) it is simply absent and nothing is printed.
watched_folder = globals().get('folder_to_watch')

# Pattern matching one review block. It is compiled once instead of once per file.
# Groups: 1 rating, 2 aroma, 3 appearance, 4 taste, 5 palate, 6 overall, 8 user id,
# 9 user name, 12 date string, 13 review text (groups 7, 10 and 11 are not used below).
review_pattern = re.compile(
    r'<div style="display:inline; padding: 0px 0px; font-size: 24px; font-weight: bold; color: '
    r'#036;" title="(.+?) out of 5.0<br /><small>Aroma (\d+)/10<br />Appearance (\d+)/5<br />'
    r'Taste (\d+)/10<br />Palate (\d+)/5<br />Overall (\d+)/20<br /></small>">(.+?)</div></div>'
    r'<small style="color: #666666; font-size: 12px; font-weight: bold;"><A HREF="/user/(\d+)/">'
    r'(.+?)\xa0\((\d+)\)</A></I> -(.+?)- (.+?)</small><BR><div style="padding: 20px 10px 20px '
    r'0px; border-bottom: 1px solid #e0e0e0; line-height: 1.5;">(.+?)</div><br>'
)

# Pattern matching the "UPDATED" marker that prefixes the text of an edited review.
# Group 1 is the date of the update, group 2 the actual review text.
updated_pattern = re.compile(r'<small style="color: #666666">UPDATED: (.+?)</i></small> (.+)')

# Open the GZIP file; the context manager closes it even if parsing fails halfway through
with gzip.open(parser.data_folder + 'parsed/ratings.txt.gz', 'wb') as f:
    # Go through all beers
    for i in df.index:
        row = df.loc[i]

        nbr_rat = row['nbr_ratings']
        count = 0

        # Only beers with at least 1 rating have review pages to parse.
        # (`not (x > 0)` rather than `x <= 0` so that a NaN count is skipped as well.)
        if not (nbr_rat > 0):
            continue

        folder = parser.data_folder + 'beers/{}/{}/'.format(row['brewery_id'], row['beer_id'])

        for file in sorted(os.listdir(folder)):

            if folder == watched_folder:
                print()
                print(file)

            # Read the page and close the file handle right away
            with open(folder + file, 'rb') as html_file:
                html_txt = html_file.read().decode('ISO-8859-1')

            # Unescape the HTML characters
            html_txt = html.unescape(html_txt)

            # Remove the \n, \r and \t characters so that each review fits on a single line
            html_txt = html_txt.replace('\r', '').replace('\n', '').replace('\t', '')

            # Search for all the reviews of the page
            for g in review_pattern.finditer(html_txt):
                count += 1
                rating = float(g.group(1))

                aroma = int(g.group(2))
                appearance = int(g.group(3))
                taste = int(g.group(4))
                palate = int(g.group(5))
                overall = int(g.group(6))

                user_id = int(g.group(8))
                user_name = g.group(9)

                if folder == watched_folder:
                    print(user_name)

                text = g.group(13)

                # An edited review carries its own date inside the text. If the word
                # "UPDATED" merely appears in the review without the expected markup,
                # the pattern does not match and the header date is used instead.
                update_match = updated_pattern.search(text)
                if update_match:
                    str_date = update_match.group(1)
                    text = update_match.group(2)
                else:
                    # Sometimes, the user will add a second position (or a job, not sure)
                    # Therefore, we simply split the header and keep its last part
                    str_date = g.group(12).split(' - ')[-1]

                # The date looks like "<Mon> <day>, <year>". The year may be followed by markup
                # explaining why the rating doesn't count, hence the split on '<'.
                date_parts = str_date.split(",")
                year = int(date_parts[1].split('<')[0])
                month = time.strptime(str_date[0:3], '%b').tm_mon
                day = int(date_parts[0][4:])

                # Timestamp at noon; the naive datetime is interpreted in the local timezone
                date = int(datetime.datetime(year, month, day, 12, 0).timestamp())

                # Write the entry to file: one "key: value" line per field, then an empty line
                entry_lines = [
                    'beer_name: {}\n'.format(row['beer_name']),
                    'beer_id: {:d}\n'.format(row['beer_id']),
                    'brewery_name: {}\n'.format(row['brewery_name']),
                    'brewery_id: {:d}\n'.format(row['brewery_id']),
                    'style: {}\n'.format(row['style']),
                    'abv: {}\n'.format(row['abv']),
                    'date: {:d}\n'.format(date),
                    'user_name: {}\n'.format(user_name),
                    'user_id: {:d}\n'.format(user_id),
                    'appearance: {:d}\n'.format(appearance),
                    'aroma: {:d}\n'.format(aroma),
                    'palate: {:d}\n'.format(palate),
                    'taste: {:d}\n'.format(taste),
                    'overall: {:d}\n'.format(overall),
                    'rating: {:.2f}\n'.format(rating),
                    'text: {}\n'.format(text),
                    '\n',
                ]
                f.write(''.join(entry_lines).encode('utf-8'))

        if count < nbr_rat:
            # If there's a problem in the HTML file, we replace the count of ratings with the number we have now.
            df.at[i, 'nbr_ratings'] = count


1.html
terefere
FunkyF

2.html
JanLaursen
joergen
Theis
Pinball
yespr
madsberg
Ungstrup
saxo
Papsoe


NameError: name 'asd' is not defined

In [28]:
# Inspect the row of the beer the ratings loop handled last (`i` is left over from that loop).
# `.loc` replaces `.ix`, which no longer exists in current pandas.
df.loc[i]

beer_id                                                    158706
beer_name                                   Snow Ice Pure Beer 9°
brewery_id                                                   5227
brewery_name     China Resources Snow Breweries (CRB) (SABMiller)
style                                                  Pale Lager
nbr_ratings                                                    11
overall_score                                                   1
style_score                                                    20
avg                                                           1.8
abv                                                           3.3
Name: 314, dtype: object

In [15]:
# NOTE(review): `rat` is not assigned in any cell visible here (the ratings cell uses `nbr_rat`),
# so this line presumably relies on leftover kernel state -- TODO confirm or remove.
print(count, rat)

11 19


In [16]:
# Debugging aid: remember the folder the ratings loop handled last. The ratings cell compares
# each beer folder with `folder_to_watch` and prints the file and user names on a match.
# NOTE(review): `folder` only exists after the ratings cell has run at least once.
folder_to_watch = folder

In [14]:
folder + file

'../data/beers/5227/158706/2.html'

In [7]:
df['nbr_ratings'].sum()

5797

In [8]:
def parse(filename):
    """
    Parse a txt.gz file and return a generator over its entries.

    An entry is a run of ``key: value`` lines. Any line without a colon (such as the
    empty separator line) ends the current entry, which is then yielded. The entry
    still pending when the file ends is yielded too, so a file without a trailing
    separator line does not lose its last entry.

    :param filename: name of the gzip-compressed, UTF-8 encoded text file
    :return: Generator yielding one dict per entry (keys and values are strings)
    """
    entry = {}
    # The context manager closes the file once the generator is exhausted or closed
    with gzip.open(filename, 'rb') as file:
        # Go through all the lines
        for line in file:
            # Transform the string-bytes into a string
            line = line.decode("utf-8").strip()

            # We check for a colon in each line
            colon_pos = line.find(":")
            if colon_pos == -1:
                # if no, we yield the entry
                yield entry
                entry = {}
                continue
            # otherwise, we add the key-value pair to the entry;
            # only the first colon splits, and the value starts after ": "
            key = line[:colon_pos]
            value = line[colon_pos + 2:]
            entry[key] = value

    # Yield the entry left over when the file does not end with a separator line
    if entry:
        yield entry
    

In [9]:
# Count how many entries the parsed ratings archive contains
iter_ = parse('../data/parsed/ratings.txt.gz')
i = sum(1 for _ in iter_)
print(i)

5789
