# Parser for the BeerAdvocate website

In this notebook, we parse the HTML pages that were downloaded by the companion crawler notebook.

In [1]:
import os

# Move one level up so that the `classes` package imported in the next cell is
# found in the working directory. The guard makes this cell idempotent:
# re-running it must not climb one more directory each time.
if not os.path.isdir('classes'):
    os.chdir('..')

In [2]:
# Project-local star imports stay first so that the explicit imports below
# win on any name clash.
from classes.helpers import *
from classes.parser import *

import datetime
import gzip
import re
import time

import numpy as np
import pandas as pd

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
# Project parser object (presumably the Parser class from classes.parser).
# Its `data_folder` and `day_to_nbr` attributes are used by the cells below.
parser = Parser()

# Parse the breweries from the places

In [5]:
# Implemented in the project's Parser class (not visible in this notebook);
# presumably extracts the breweries from the crawled "places" pages - verify
# in classes.parser.
parser.parse_breweries_from_places()

# Add missing breweries

In [4]:
# Implemented in the project's Parser class (not visible in this notebook);
# presumably adds the breweries that the previous step did not find - verify
# in classes.parser.
parser.parse_missing_breweries()

# Parse the breweries page to get the number of beers

In [5]:
# Implemented in the project's Parser class (not visible in this notebook);
# presumably reads each brewery page to get its number of beers - verify in
# classes.parser.
parser.parse_breweries_files_for_number()

# Parse the breweries page to get the beers

In [4]:
# Implemented in the project's Parser class (not visible in this notebook);
# presumably reads each brewery page to list its beers - verify in
# classes.parser.
parser.parse_breweries_files_for_beers()

# Parse the beer files for information

In [43]:
# Implemented in the project's Parser class (not visible in this notebook);
# presumably extracts the per-beer information from the crawled beer pages -
# verify in classes.parser.
parser.parse_beer_files_for_information()

# Parse the beer files for the reviews

In [4]:
# Implemented in the project's Parser class (not visible in this notebook);
# presumably extracts the reviews from the crawled beer pages - verify in
# classes.parser. NOTE(review): the cells below perform a similar extraction
# inline; confirm which of the two is the reference implementation.
parser.parse_beer_files_for_reviews()

In [7]:
# Table of parsed beers; the cells below read its beer_name, beer_id,
# brewery_name, brewery_id, style, abv, nbr_ratings and nbr_reviews columns.
# os.path.join builds the same path whether or not `data_folder` ends with a
# path separator (the other cells append 'parsed/...' without a leading slash).
df = pd.read_csv(os.path.join(parser.data_folder, 'parsed', 'beers2.csv'))

In [73]:
# Build parsed/ratings.txt.gz and parsed/reviews.txt.gz from the crawled beer
# pages.
#
# Every rating found on a beer page becomes one record in ratings.txt.gz; a
# rating whose text has at least MIN_REVIEW_CHARS characters is additionally
# written to reviews.txt.gz. A record is a run of "key: value" lines followed
# by one empty line.

# Minimum number of characters (as reported by the page) for a rating to be
# written to the reviews file as well
MIN_REVIEW_CHARS = 150

# One rating block of a beer page, matched after the \r, \n and \t characters
# have been removed. Groups used below: 2 = final score, 3 = HTML that may
# contain the aspect scores, 4 = HTML that may contain the review text,
# 5 = user id, 6 = user name, 10 = date string.
RATING_RE = re.compile(
    r'alt="Photo of ([^<]*)"></a></div></div><div id="rating_fullview_content_2">'
    r'<span class="BAscore_norm">([^<]*)</span><span class="rAvg_norm">/5</span>&nbsp;&nbsp;(.+?)'
    r'<br><br>(.+?)<span class="muted"><a href="/community/members/(.+?)/" class="username">([^<]*)</a>, '
    r'<a href="/beer/profile/(\d+)/(\d+)/\?ba=([^#]*)\#review">(.+?)</a></span>'
)

# Scores of the individual aspects (look, smell, taste, feel, overall)
ASPECTS_RE = re.compile(
    r'<span class="muted">look: (.+?) \| smell: (.+?) \| taste: (.+?) \| feel: (.+?) \|  overall: (.+?)</span>'
)

# Review text (group 1) and its number of characters (group 3)
TEXT_RE = re.compile(r'(.+?)<br>(.+?)<span class="muted">(.+?) characters</span><br><br><div>')


def _parse_aspects(html):
    """
    Get the ratings of the different aspects

    :param html: part of the rating block that may contain the aspect scores
    :return: tuple (appearance, aroma, taste, palate, overall) of floats,
             all nan when the block has no aspect scores
    """
    if 'overall' not in html:
        return (np.nan,) * 5
    found = ASPECTS_RE.search(html)
    # look -> appearance, smell -> aroma, taste -> taste, feel -> palate
    return tuple(float(found.group(k)) for k in range(1, 6))


def _parse_text(html):
    """
    Get the text of the review and its length

    :param html: part of the rating block that may contain the review text
    :return: tuple (text, nbr_char), both nan when there is no text
    """
    if 'characters' not in html:
        return np.nan, np.nan
    found = TEXT_RE.search(html)
    return found.group(1).replace('<br />', ''), int(found.group(3).replace(',', ''))


def _parse_date(str_date, file_path, day_to_nbr):
    """
    Transform the date string of a rating into a timestamp (at noon)

    :param str_date: date as written on the page, either absolute
                     (ex: 'Jul 12, 2015') or relative (ex: 'Yesterday at XX pm',
                     'Tuesday at XX pm')
    :param file_path: path of the HTML file; its last modification time is the
                      reference day for the relative dates
    :param day_to_nbr: mapping from the name of a weekday to its number
                       (assumed to follow datetime's weekday() numbering -
                       TODO confirm)
    :return: timestamp (int) of the day of the rating at 12:00
    """
    date_parts = str_date.split(',')
    if len(date_parts) > 1:
        # Absolute date: '<Mon> <day>, <year>'
        year = int(date_parts[1])
        month = time.strptime(str_date[0:3], '%b').tm_mon
        day = int(date_parts[0][4:])
    else:
        # Date written in a different way (ex: Tuesday at XX pm).
        # The reference day is needed for 'Yesterday' AND for the weekdays, so
        # it is computed before branching.
        reference = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
        weekday = str_date.split(' at ')[0]

        if weekday == 'Yesterday':
            delta = 1
        else:
            # Number of days back to the given weekday (modulo 7 days)
            delta = (reference.weekday() - day_to_nbr[weekday]) % 7

        day_posted = reference - datetime.timedelta(days=delta)
        year, month, day = day_posted.year, day_posted.month, day_posted.day

    return int(datetime.datetime(year, month, day, 12, 0).timestamp())


def _format_record(row, date, user_name, user_id, aspects, rating, text):
    """
    Build the bytes of one record, shared by the ratings and the reviews files

    :param row: row of the beers DataFrame
    :param date: timestamp of the rating
    :param user_name: name of the user
    :param user_id: id of the user
    :param aspects: tuple (appearance, aroma, taste, palate, overall)
    :param rating: final rating
    :param text: text of the review (or nan)
    :return: utf-8 encoded record, terminated by an empty line
    """
    appearance, aroma, taste, palate, overall = aspects
    lines = [
        'beer_name: {}\n'.format(row['beer_name']),
        'beer_id: {:d}\n'.format(row['beer_id']),
        'brewery_name: {}\n'.format(row['brewery_name']),
        'brewery_id: {:d}\n'.format(row['brewery_id']),
        'style: {}\n'.format(row['style']),
        'abv: {}\n'.format(row['abv']),
        'date: {:d}\n'.format(date),
        'user_name: {}\n'.format(user_name),
        'user_id: {}\n'.format(user_id),
        'appearance: {}\n'.format(appearance),
        'aroma: {}\n'.format(aroma),
        'palate: {}\n'.format(palate),
        'taste: {}\n'.format(taste),
        'overall: {}\n'.format(overall),
        'rating: {:.2f}\n'.format(rating),
        'text: {}\n'.format(text),
        '\n',
    ]
    return ''.join(lines).encode('utf-8')


# Number of rating blocks matched (deleted users included)
count = 0

ratings_path = os.path.join(parser.data_folder, 'parsed', 'ratings.txt.gz')
reviews_path = os.path.join(parser.data_folder, 'parsed', 'reviews.txt.gz')

# The context managers close both GZIP files even if the parsing fails
with gzip.open(ratings_path, 'wb') as f_ratings, gzip.open(reviews_path, 'wb') as f_reviews:

    for _, row in df.iterrows():

        # Check that this beer has at least 1 rating
        if not row['nbr_ratings'] > 0:
            continue

        folder = os.path.join(parser.data_folder, 'beers',
                              str(row['brewery_id']), str(row['beer_id']))

        for file_name in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, file_name)

            # Read the file and remove the \n, \r and \t characters
            with open(file_path, 'rb') as html_file:
                html_txt = html_file.read().decode('utf-8')
            html_txt = html_txt.replace('\r', '').replace('\n', '').replace('\t', '')

            for g in RATING_RE.finditer(html_txt):
                count += 1

                # Get username and userid
                user_name = g.group(6)
                user_id = g.group(5)

                # Some user have been deleted and leave a weird trace
                if user_name == '':
                    continue

                # Get the "final" rating and the ratings of the aspects
                rating = float(g.group(2))
                aspects = _parse_aspects(g.group(3))

                # Get the date
                str_date = g.group(10)
                date = _parse_date(str_date, file_path, parser.day_to_nbr)

                # Check if there's some text
                text, nbr_char = _parse_text(g.group(4))

                record = _format_record(row, date, user_name, user_id, aspects, rating, text)

                # Write in the file ratings.txt.gz
                f_ratings.write(record)

                # Write in the file reviews.txt.gz (nan >= 150 is False, so
                # ratings without text are never written here)
                if nbr_char >= MIN_REVIEW_CHARS:
                    f_reviews.write(record)

In [78]:
# Debug inspection: last date string handled by the extraction cell above
str_date

'Jul 12, 2015'

In [8]:
# Total of the 'nbr_reviews' column of the beers table
df['nbr_reviews'].sum()

7341

In [9]:
# Total of the 'nbr_ratings' column of the beers table; presumably the number
# of entries expected in ratings.txt.gz - compare with the count computed below
df['nbr_ratings'].sum()

16941

In [10]:
def parse(filename):
    """
    Parse a txt.gz file and return a generator for it

    The file is a sequence of "key: value" lines; a line without a colon
    (typically an empty line) ends the current entry. The last entry is also
    returned when the file does not end with such a separator line.

    Copyright © 2017 Gael Lederrey <gael.lederrey@epfl.ch>

    :param filename: name of the file
    :return: Generator to go through the file, one dict (key -> value, both
             strings) per entry
    """
    # The context manager closes the file when the generator is exhausted,
    # closed or garbage collected
    with gzip.open(filename, 'rb') as file:
        entry = {}
        # Go through all the lines
        for line in file:
            # Transform the string-bytes into a string
            line = line.decode("utf-8").strip()

            # We check for a colon in each line
            colon_pos = line.find(":")
            if colon_pos == -1:
                # if no, we yield the entry
                yield entry
                entry = {}
                continue
            # otherwise, we add the key-value pair to the entry
            # (the value starts after the colon and the space following it)
            key = line[:colon_pos]
            value = line[colon_pos + 2:]
            entry[key] = value

    # Entry not followed by a separator line (ex: truncated file)
    if entry:
        yield entry

In [12]:
# Count the entries stored in the ratings file by exhausting the generator
grp = parse('../data/parsed/ratings.txt.gz')
i = sum(1 for _ in grp)
print(i)

16932
