In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw reviews table from disk and preview its first rows.
reviews_path = 'data/raw/reviews.csv'
reviews_df = pd.read_csv(reviews_path)
reviews_df.head()

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31


In [3]:
# Read the raw beers table from disk and preview its first rows.
beers_path = 'data/raw/beers.csv'
beers_df = pd.read_csv(beers_path)
beers_df.head()

Unnamed: 0,id,name,brewery_id,state,country,style,availability,abv,notes,retired
0,202522,Olde Cogitator,2199,CA,US,English Oatmeal Stout,Rotating,7.3,No notes at this time.,f
1,82352,Konrads Stout Russian Imperial Stout,18604,,NO,Russian Imperial Stout,Rotating,10.4,No notes at this time.,f
2,214879,Scottish Right,44306,IN,US,Scottish Ale,Year-round,4.0,No notes at this time.,t
3,320009,MegaMeow Imperial Stout,4378,WA,US,American Imperial Stout,Winter,8.7,Every time this year,f
4,246438,Peaches-N-Cream,44617,PA,US,American Cream Ale,Rotating,5.1,No notes at this time.,f


In [4]:
# Join every review to its beer (reviews.beer_id == beers.id) and keep only the
# review date, the review score, the beer style and the beer id.
# Both frames are trimmed to the columns the join needs *before* merging, so the
# wide review columns (e.g. the free-text `text`) are never copied into the
# merged frame. The resulting rows, row order and columns are the same as
# merging first and selecting afterwards.
review_columns = reviews_df[['beer_id', 'date', 'score']]
beer_columns = beers_df[['id', 'style']]
reviews_beers = pd.merge(
    review_columns,
    beer_columns,
    left_on='beer_id',
    right_on='id',
    how='inner',
)[['date', 'score', 'style', 'beer_id']]
reviews_beers.head()

Unnamed: 0,date,score,style,beer_id
0,2017-03-17,4.03,American Imperial Stout,271781
1,2018-08-04,4.0,American Imperial Stout,271781
2,2018-04-14,2.9,American Imperial Stout,271781
3,2017-08-16,2.82,American Imperial Stout,271781
4,2017-07-12,4.0,American Imperial Stout,271781


In [5]:
# Sanity check: list the columns kept after the merge.
reviews_beers.columns

Index(['date', 'score', 'style', 'beer_id'], dtype='object')

In [6]:
# Group similar beer styles together under a coarser "meta style".
#
# Each meta style maps to the keywords that identify it inside a specific style
# name. Matching is by substring and the FIRST meta style (in the order written
# here) with a matching keyword wins, so the order of the entries matters:
# e.g. 'scottish ale' is classified as 'Ale' because 'Ale' is checked before
# 'Boozy'. 'DDHIPA' and 'NEIPA' are redundant with 'IPA' (both contain it) but
# are kept for readability.
# NOTE(review): confirm the ordering above is intentional before relying on the
# less specific groups ('Ale') versus the later ones ('Belgian Blonde', 'Boozy', ...).
metastyle_beer_dict = {
    'IPA': ['IPA', 'DDHIPA', 'NEIPA'],
    'Ale': ['Ale'],
    'Sour': ['Lambic', 'Sour', 'Kvass', 'Gueuze', 'Flanders', 'Gose', 'Sahti', 'Brett', 'Saison', 'Fruit'],
    'Lager': ['Lager', 'Pilsner', 'Bock', 'Chile', 'Happoshu', 'Pilsener', 'Helles', 'Oktoberfest', 'kölsch'],
    'Stout': ['Stout', 'Porter'],
    'Smoked Beer': ['Smoked', 'Smoke', 'Rauchbier'],
    'Winter Beer': ['Winter', 'Christmas', 'Pumpkin', 'spice'],
    'Alcohol-free': ['Low Alcohol Beer'],
    'Belgian Blonde': ['Dubbel', 'Quadrupel', 'Tripel', 'Belgian'],
    'Wheat Beer': ['Wheat', 'Weissbier', 'Witbier', 'Hefeweizen', 'Berliner'],
    'Ambree': ['Altbier', 'Rye'],
    'Boozy': ['Barleywine', 'Scotch', 'Scottish', 'Champagne', 'Braggot', 'Liquor'],
}

# Lower-cased copy of the keyword table, used for case-insensitive matching.
metastyle_beer_dict_lower = {
    metastyle: [keyword.lower() for keyword in keywords]
    for metastyle, keywords in metastyle_beer_dict.items()
}


def find_metastyle(specific_style):
    """Return the meta style of a specific beer style name.

    The name is converted to a lower-cased string and compared against the
    keywords of ``metastyle_beer_dict_lower``; the first meta style (in
    dictionary order) having a keyword that occurs as a substring of the name
    is returned.

    Parameters
    ----------
    specific_style : str or any
        Specific style name, in any letter case. Non-string values (e.g. a NaN
        for a missing style) are converted with ``str()`` first, so they do not
        raise.

    Returns
    -------
    str
        The matching meta style, or ``'Other'`` when no keyword matches.
    """
    # Normalise here rather than trusting the caller: the keywords are
    # lower-cased, so un-normalised input would silently fall into 'Other'.
    normalized_style = str(specific_style).lower()
    for metastyle, beer_substyles in metastyle_beer_dict_lower.items():
        if any(keyword in normalized_style for keyword in beer_substyles):
            return metastyle

    return 'Other'

In [7]:
# Normalise the style names (string type, lower case) so they can be compared
# with the lower-cased keywords used by find_metastyle.
reviews_beers['style'] = reviews_beers['style'].astype(str).str.lower()

# Classify each distinct style once and broadcast the result with map(),
# instead of calling find_metastyle once per review row. The resulting
# 'meta_style' column is the same as applying the function row by row.
style_to_metastyle = {
    specific_style: find_metastyle(specific_style)
    for specific_style in reviews_beers['style'].unique()
}
reviews_beers['meta_style'] = reviews_beers['style'].map(style_to_metastyle)
reviews_beers.sample(5)

Unnamed: 0,date,score,style,beer_id,meta_style
8933531,2016-02-01,4.54,american imperial stout,102122,Stout
5147645,2014-12-15,4.5,english old ale,144695,Ale
957593,2017-06-11,3.91,german dunkelweizen,281133,Other
1581851,2014-05-28,3.75,american pale ale (apa),92885,Ale
8998180,2014-06-21,4.0,american black ale,103528,Ale


In [8]:
# Mean review score per meta style, over all reviews.
scores_by_meta_style = reviews_beers.groupby('meta_style')['score']
avg_beer_style = scores_by_meta_style.mean()
avg_beer_style.head()

meta_style
Alcohol-free      2.508786
Ale               3.844813
Ambree            3.808414
Belgian Blonde    3.904772
Boozy             3.940979
Name: score, dtype: float64

In [9]:
# Parse the review dates, then derive the ISO week number of each review.
reviews_beers['date'] = pd.to_datetime(reviews_beers['date'])
# Vectorised .dt accessor instead of a per-row Python lambda. The accessor
# returns a nullable unsigned dtype, so cast back to int64, which is what the
# row-wise version produced.
# NOTE(review): the cast assumes there are no missing dates - confirm.
reviews_beers['week'] = reviews_beers['date'].dt.isocalendar().week.astype('int64')
reviews_beers.sample(5)

Unnamed: 0,date,score,style,beer_id,meta_style,week
994775,2011-12-31,4.0,american strong ale,92,Ale,52
5506771,2014-07-27,4.0,french bière de garde,18975,Other,30
2422924,2018-09-06,4.49,american imperial stout,358705,Stout,36
1014935,2013-06-11,4.5,german bock,101,Lager,24
8210675,2015-06-28,4.0,belgian saison,50570,Sour,26


In [10]:
# Mean review score for every (meta style, week of year) pair.
# The result is a Series indexed by a (meta_style, week) MultiIndex.
scores_by_style_and_week = reviews_beers.groupby(['meta_style', 'week'])['score']
style_week_grade_df = scores_by_style_and_week.mean()
style_week_grade_df.sample(10)

meta_style   week
Lager        48      3.426223
Stout        9       4.072437
IPA          29      4.006535
Other        21      3.717636
IPA          22      4.023764
Other        27      3.715373
Boozy        19      3.939178
Smoked Beer  33      3.701244
Winter Beer  17      3.687874
Wheat Beer   46      3.930825
Name: score, dtype: float64

In [11]:
# Centre each weekly average on its meta style: subtract, per meta style, the
# mean of that style's weekly averages.
# NOTE(review): this is the unweighted mean of the weekly means, not the
# per-review average held in avg_beer_style - confirm which one is intended.
weekly_mean_per_style = style_week_grade_df.groupby('meta_style').transform('mean')
centered_scores = style_week_grade_df - weekly_mean_per_style

# Scale by the per-style standard deviation of the centred weekly values.
weekly_std_per_style = centered_scores.groupby('meta_style').transform('std')
style_week_grade_df = centered_scores / weekly_std_per_style
style_week_grade_df.describe()

count    6.890000e+02
mean    -3.874985e-15
std      9.912407e-01
min     -2.285559e+00
25%     -6.741709e-01
50%     -9.414919e-02
75%      5.298633e-01
max      5.480337e+00
Name: score, dtype: float64

In [12]:
# Flatten the (meta_style, week) index into ordinary columns, then export.
# The guard keeps the cell safe to re-run: once the Series has become a
# DataFrame, a second reset_index() would add a spurious 'index' column that
# would end up in the CSV.
if isinstance(style_week_grade_df, pd.Series):
    style_week_grade_df = style_week_grade_df.reset_index()
style_week_grade_df.to_csv('data/processed/style_week_grade.csv', index=False)

In [16]:
# Spot-check ten random rows of the exported table (no random_state is set, so the rows differ between runs).
style_week_grade_df.sample(10)

Unnamed: 0,meta_style,week,score
318,Lager,1,-1.557061
266,IPA,2,0.178274
8,Alcohol-free,9,-0.234769
12,Alcohol-free,13,1.782355
467,Smoked Beer,44,0.227821
628,Wheat Beer,46,2.647914
639,Winter Beer,4,-0.0984
570,Stout,41,-1.174646
328,Lager,11,0.122505
481,Sour,5,0.503588


In [17]:
# List the distinct meta styles present in the table.
style_week_grade_df['meta_style'].unique()

array(['Alcohol-free', 'Ale', 'Ambree', 'Belgian Blonde', 'Boozy', 'IPA',
       'Lager', 'Other', 'Smoked Beer', 'Sour', 'Stout', 'Wheat Beer',
       'Winter Beer'], dtype=object)

array(['Alcohol-free', 'Ale', 'Ambree', 'Belgian Blonde', 'Boozy', 'IPA',
       'Lager', 'Other', 'Smoked Beer', 'Sour', 'Stout', 'Wheat Beer',
       'Winter Beer'], dtype=object)

In [23]:
# Write one CSV per meta style, holding only that style's (week, score) rows.
# The meta style name is used verbatim in the file name.
style_keys = style_week_grade_df['meta_style'].unique()
output_prefix = 'data/website_preparation/bubble_chart/style_week_grade_'
for style in style_keys:
    is_current_style = style_week_grade_df['meta_style'] == style
    weekly_scores = style_week_grade_df[is_current_style][['week', 'score']]
    weekly_scores.to_csv(output_prefix + style + '.csv', index=False)