# Preprocessing + NN Playing for Beer Reviews

In [119]:
pip install liac-arff

Note: you may need to restart the kernel to use updated packages.


In [120]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

import arff

## Load Dataset and create Dataframe

In [121]:
# Parse the ARFF file inside a context manager so the file handle is closed
# as soon as loading finishes (the bare open() call never closed it).
with open('../data/beer_reviews.arff', 'r') as arff_file:
    data = arff.load(arff_file)

# Each entry of data['attributes'] is read as a (name, type) pair.
attr = np.array(data['attributes'])
numericals = [name for name, kind in attr if kind in ('INTEGER', 'REAL')]
df = pd.DataFrame(data['data'], columns=attr[:, 0])
df.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [122]:
# Convert the epoch-second review timestamps in a single vectorized call
# rather than building one pd.Timestamp per row through a Python lambda.
df['review_time'] = pd.to_datetime(df['review_time'], unit='s')
display(df.head())

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,2009-02-16 20:57:03,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,2009-03-01 13:44:57,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,2009-03-01 14:10:04,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,2009-02-15 19:12:25,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,2010-12-30 18:53:26,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


## Fix Missing Values

The brewery name missing for brewery id 1193 was found through a quick Google search and filled in. For the rows with brewery id 27, the beers that already exist in the dataset under the correct brewery are assigned to that brewery based on those entries; for the others, the brewery was looked up on Google using the provided data.

Missing review profile names are set to "Anonymous"; the rows are otherwise kept, because the review itself is still valid.

In [123]:
# Show every row with at least one missing value, then only the rows
# that lack a brewery name.
has_gap = df.isna().any(axis=1)
display(df[has_gap])
lacks_brewery_name = df['brewery_name'].isna()
display(df[lacks_brewery_name])

# Brewery 1193: fill in the brewery name and cut the beer name off at the
# ' WRONG' marker that was appended to it. Display before and after.
is_1193 = df['brewery_id'] == 1193
display(df[is_1193])
df.loc[is_1193, 'brewery_name'] = 'Crailsheimer Engel-Bräu'
df.loc[is_1193, 'beer_name'] = df.loc[is_1193, 'beer_name'].map(
    lambda beer: beer.partition(' WRONG')[0]
)
display(df[is_1193])

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
273,1075,Caldera Brewing Company,2004-12-21 22:29:55,3.0,3.0,3.0,RedDiamond,American Stout,4.0,3.0,Cauldron Espresso Stout,,21241
430,850,Moon River Brewing Company,2005-03-13 17:48:30,3.5,4.0,4.5,cMonkey,Scotch Ale / Wee Heavy,3.5,3.5,The Highland Stagger,,20689
603,850,Moon River Brewing Company,2004-11-09 22:20:19,4.0,3.5,4.0,aracauna,Scotch Ale / Wee Heavy,3.5,3.5,The Highland Stagger,,20689
733,1075,Caldera Brewing Company,2009-12-13 03:12:01,4.0,4.0,4.0,plaid75,American IPA,4.0,4.0,Alpha Beta,,54723
798,1075,Caldera Brewing Company,2008-05-31 02:34:28,4.5,4.5,4.0,grumpy,American Double / Imperial Stout,4.0,4.5,Imperial Stout,,42964
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586568,14359,The Defiant Brewing Company,2007-08-14 00:49:27,4.0,3.5,4.0,maddogruss,Bock,4.0,4.0,Bock,,36424
1586587,14359,The Defiant Brewing Company,2007-04-29 10:22:48,3.5,4.5,4.0,BBM,Maibock / Helles Bock,4.5,4.0,Maibock,,36555
1586596,14359,The Defiant Brewing Company,2010-10-24 20:11:07,4.0,3.0,5.0,hoppymcgee,Belgian Strong Pale Ale,4.0,3.5,Resolution #2,,48360
1586597,14359,The Defiant Brewing Company,2009-05-09 21:57:03,4.5,4.5,4.0,WesWes,Belgian Strong Pale Ale,4.0,4.0,Resolution #2,,48360


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
651565,1193,,2011-03-25 03:01:06,2.0,2.5,2.5,Knapp85,Vienna Lager,1.5,1.5,Engel Tyrolian Bräu WRONG BREWERY SEE SCHWABIS...,5.0,67503
659293,1193,,2010-11-18 19:14:58,4.0,4.5,3.5,dqrull,Bock,4.0,3.5,Engel Bock Dunkel WRONG BREWERY SEE CRAILSHEIMER,7.2,63658
659299,1193,,2010-11-06 20:56:41,3.5,3.0,3.0,dqrull,Dortmunder / Export Lager,4.0,4.0,Engel Gold WRONG BREWERY SEE CRAILSHEIMER,5.4,63215
659300,1193,,2010-11-15 19:57:13,3.5,4.0,3.5,dqrull,Munich Helles Lager,3.5,3.0,Engel Landbier WRONG BREWERY SEE CRAILSHEIMER,4.8,63557
659301,1193,,2010-11-07 19:37:12,3.5,4.0,4.0,dqrull,Keller Bier / Zwickel Bier,4.0,3.5,Engel Keller Hell WRONG BREWERY SEE CRAILSHEIMER,5.4,63256
659302,1193,,2010-11-12 19:27:45,3.5,4.0,3.0,dqrull,Vienna Lager,3.0,3.5,Engel Aloisius - WRONG BREWERY SEE CRAILSHEIMER,5.9,63459
659303,1193,,2011-02-19 01:28:46,3.0,3.0,3.0,Ochsenblut,Keller Bier / Zwickel Bier,2.0,3.0,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...,5.3,63324
659304,1193,,2010-12-15 14:27:51,4.5,4.0,4.0,Dentist666,Keller Bier / Zwickel Bier,4.0,4.5,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...,5.3,63324
659305,1193,,2010-11-09 19:12:42,3.5,3.5,4.0,dqrull,Keller Bier / Zwickel Bier,4.0,4.0,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...,5.3,63324
1391043,27,,2002-05-01 09:20:56,3.5,3.5,3.5,Jason,American Adjunct Lager,4.0,3.5,Hard Hat American Beer,3.8,60


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
651565,1193,,2011-03-25 03:01:06,2.0,2.5,2.5,Knapp85,Vienna Lager,1.5,1.5,Engel Tyrolian Bräu WRONG BREWERY SEE SCHWABIS...,5.0,67503
659293,1193,,2010-11-18 19:14:58,4.0,4.5,3.5,dqrull,Bock,4.0,3.5,Engel Bock Dunkel WRONG BREWERY SEE CRAILSHEIMER,7.2,63658
659299,1193,,2010-11-06 20:56:41,3.5,3.0,3.0,dqrull,Dortmunder / Export Lager,4.0,4.0,Engel Gold WRONG BREWERY SEE CRAILSHEIMER,5.4,63215
659300,1193,,2010-11-15 19:57:13,3.5,4.0,3.5,dqrull,Munich Helles Lager,3.5,3.0,Engel Landbier WRONG BREWERY SEE CRAILSHEIMER,4.8,63557
659301,1193,,2010-11-07 19:37:12,3.5,4.0,4.0,dqrull,Keller Bier / Zwickel Bier,4.0,3.5,Engel Keller Hell WRONG BREWERY SEE CRAILSHEIMER,5.4,63256
659302,1193,,2010-11-12 19:27:45,3.5,4.0,3.0,dqrull,Vienna Lager,3.0,3.5,Engel Aloisius - WRONG BREWERY SEE CRAILSHEIMER,5.9,63459
659303,1193,,2011-02-19 01:28:46,3.0,3.0,3.0,Ochsenblut,Keller Bier / Zwickel Bier,2.0,3.0,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...,5.3,63324
659304,1193,,2010-12-15 14:27:51,4.5,4.0,4.0,Dentist666,Keller Bier / Zwickel Bier,4.0,4.5,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...,5.3,63324
659305,1193,,2010-11-09 19:12:42,3.5,3.5,4.0,dqrull,Keller Bier / Zwickel Bier,4.0,4.0,Engel Keller Dunkel WRONG BREWERY SEE CRAILSH...,5.3,63324


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
651565,1193,Crailsheimer Engel-Bräu,2011-03-25 03:01:06,2.0,2.5,2.5,Knapp85,Vienna Lager,1.5,1.5,Engel Tyrolian Bräu,5.0,67503
659293,1193,Crailsheimer Engel-Bräu,2010-11-18 19:14:58,4.0,4.5,3.5,dqrull,Bock,4.0,3.5,Engel Bock Dunkel,7.2,63658
659299,1193,Crailsheimer Engel-Bräu,2010-11-06 20:56:41,3.5,3.0,3.0,dqrull,Dortmunder / Export Lager,4.0,4.0,Engel Gold,5.4,63215
659300,1193,Crailsheimer Engel-Bräu,2010-11-15 19:57:13,3.5,4.0,3.5,dqrull,Munich Helles Lager,3.5,3.0,Engel Landbier,4.8,63557
659301,1193,Crailsheimer Engel-Bräu,2010-11-07 19:37:12,3.5,4.0,4.0,dqrull,Keller Bier / Zwickel Bier,4.0,3.5,Engel Keller Hell,5.4,63256
659302,1193,Crailsheimer Engel-Bräu,2010-11-12 19:27:45,3.5,4.0,3.0,dqrull,Vienna Lager,3.0,3.5,Engel Aloisius -,5.9,63459
659303,1193,Crailsheimer Engel-Bräu,2011-02-19 01:28:46,3.0,3.0,3.0,Ochsenblut,Keller Bier / Zwickel Bier,2.0,3.0,Engel Keller Dunkel,5.3,63324
659304,1193,Crailsheimer Engel-Bräu,2010-12-15 14:27:51,4.5,4.0,4.0,Dentist666,Keller Bier / Zwickel Bier,4.0,4.5,Engel Keller Dunkel,5.3,63324
659305,1193,Crailsheimer Engel-Bräu,2010-11-09 19:12:42,3.5,3.5,4.0,dqrull,Keller Bier / Zwickel Bier,4.0,4.0,Engel Keller Dunkel,5.3,63324


In [124]:
display(df[df['brewery_id'] == 27])

# Corrected column values for the rows listed under brewery_id 27 without a
# brewery name, keyed by row label.
american_brewing = {
    'brewery_id': 24831,
    'brewery_name': 'American Brewing Company',
}
city_brewing = {
    'brewery_id': 782,
    'brewery_name': 'City Brewing Company, LLC',
    'beer_name': 'Side Pocket High Gravity Ale',
}
brewery_27_fixes = {
    1391053: american_brewing,
    1391051: american_brewing,
    1391052: american_brewing,
    1391049: city_brewing,
    1391050: city_brewing,
}

# Write the values one cell at a time so each column keeps its own dtype.
for row_label, corrections in brewery_27_fixes.items():
    for column, value in corrections.items():
        df.loc[row_label, column] = value

# Row 1391043 is removed rather than repaired. errors='ignore' keeps the cell
# re-runnable: without it a second execution raises KeyError because the
# label is already gone.
df.drop(1391043, inplace=True, errors='ignore')

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
1391043,27,,2002-05-01 09:20:56,3.5,3.5,3.5,Jason,American Adjunct Lager,4.0,3.5,Hard Hat American Beer,3.8,60
1391049,27,,2006-03-24 06:49:16,2.5,2.0,2.0,BeerBob,American Malt Liquor,2.0,2.0,Side Pocket,8.5,3394
1391050,27,,2005-06-05 09:20:32,1.5,1.5,2.0,feloniousmonk,American Malt Liquor,1.5,1.5,Side Pocket,8.5,3394
1391051,27,,2011-12-08 03:24:34,4.5,4.5,3.5,ccrida,American IPA,4.5,4.5,Breakaway IPA,7.2,75135
1391052,27,,2011-11-11 05:36:14,3.5,4.0,3.5,Docer,American IPA,4.0,4.0,Breakaway IPA,7.2,75135
1391053,27,,2011-11-11 07:10:08,5.0,4.5,4.0,Docer,American Stout,4.5,4.5,Caboose Oatmeal Stout,7.0,75137


In [125]:
# Reviews without a profile name are kept and attributed to 'Anonymous'.
df['review_profilename'] = df['review_profilename'].fillna('Anonymous')

In [126]:
# Number of distinct beer styles (a missing value would count as its own entry).
df['beer_style'].nunique(dropna=False)

104

In [127]:
# How many distinct beer names appear on rows with a missing ABV.
abv_missing = df['beer_abv'].isna()
display(df.loc[abv_missing, 'beer_name'].nunique(dropna=False))

def create_mean(df):
    """Return the mean ABV of every beer style, rounded to one decimal.

    The means are computed in a single grouped pass over the frame instead of
    one full boolean scan per style.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'beer_style' column and a numeric 'beer_abv' column.
        Missing ABV values are ignored when averaging.

    Returns
    -------
    dict
        Maps each style to its rounded mean ABV, in order of first appearance.
        A style whose ABV values are all missing maps to NaN. Rows whose style
        itself is missing are not represented.
    """
    # sort=False keeps the styles in order of first appearance.
    style_means = df.groupby('beer_style', sort=False)['beer_abv'].mean().round(1)
    return style_means.to_dict()

def fill_mean(means, row):
    """Look up the mean ABV for the beer style of a single review row.

    Parameters
    ----------
    means : dict
        Maps a beer style to its mean ABV.
    row : mapping
        One review record; only its 'beer_style' entry is read.

    Returns
    -------
    The value stored in ``means`` for the row's style.

    Raises
    ------
    KeyError
        If the row's style has no entry in ``means``.
    """
    return means[row['beer_style']]

means = create_mean(df)
print(means)

# Impute each missing ABV with the mean of its beer style. Mapping the style
# column through the dict is vectorized, and fillna only touches the gaps,
# so re-running the cell when nothing is missing leaves the column unchanged.
df['beer_abv'] = df['beer_abv'].fillna(df['beer_style'].map(means))

14110

{'Hefeweizen': 5.3, 'English Strong Ale': 7.3, 'Foreign / Export Stout': 7.7, 'German Pilsener': 5.0, 'American Double / Imperial IPA': 9.4, 'Herbed / Spiced Beer': 6.6, 'Light Lager': 4.0, 'Oatmeal Stout': 5.8, 'American Pale Lager': 5.0, 'Rauchbier': 5.7, 'American Pale Ale (APA)': 5.5, 'American Porter': 6.2, 'Belgian Strong Dark Ale': 9.4, 'American IPA': 6.6, 'American Stout': 6.3, 'Russian Imperial Stout': 10.0, 'American Amber / Red Ale': 6.0, 'American Strong Ale': 9.8, 'Märzen / Oktoberfest': 5.9, 'American Adjunct Lager': 4.9, 'American Blonde Ale': 5.0, 'Euro Pale Lager': 5.1, 'English Brown Ale': 5.2, 'Scotch Ale / Wee Heavy': 8.2, 'Fruit / Vegetable Beer': 5.8, 'American Double / Imperial Stout': 10.6, 'Belgian Pale Ale': 6.1, 'English Bitter': 4.3, 'English Porter': 5.7, 'Irish Dry Stout': 4.9, 'American Barleywine': 10.7, 'Belgian Strong Pale Ale': 8.7, 'Doppelbock': 8.3, 'Maibock / Helles Bock': 6.8, 'Pumpkin Ale': 6.6, 'Dortmunder / Export Lager': 5.5, 'Euro Strong Lag

In [128]:
# Count the non-null cells per column among rows that still hold a missing
# value; zeros everywhere mean that no such rows remain.
rows_with_gaps = df.loc[df.isna().any(axis=1)]
print(rows_with_gaps.count())

brewery_id            0
brewery_name          0
review_time           0
review_overall        0
review_aroma          0
review_appearance     0
review_profilename    0
beer_style            0
review_palate         0
review_taste          0
beer_name             0
beer_abv              0
beer_beerid           0
dtype: int64


## Encode Labels and Split

In [130]:
from sklearn.preprocessing import LabelEncoder

In [131]:
# Encode the target (beer style) as integer class codes. The fitted encoder is
# kept under its own name because `le` is rebound to a different encoder in a
# later cell, and this one is needed to turn class codes back into style names.
style_encoder = LabelEncoder()
df['class'] = style_encoder.fit_transform(df['beer_style'])
le = style_encoder  # original name kept for backward compatibility

In [135]:
# Integer-code the reviewer profile names into a new 'review_profilecode' column.
le = LabelEncoder()
df['review_profilecode'] = le.fit_transform(df['review_profilename'])

In [136]:
# Features: a new frame without the free-text columns and without the target.
text_columns = ['brewery_name', 'beer_name', 'beer_style', 'review_profilename']
X = df.drop(columns=text_columns + ['class'])

# Target: the integer-encoded beer style.
y = df['class']
X

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid,review_profilecode
0,10325,2009-02-16 20:57:03,1.5,2.0,2.5,1.5,1.5,5.0,47986,30567
1,10325,2009-03-01 13:44:57,3.0,2.5,3.0,3.0,3.0,6.2,48213,30567
2,10325,2009-03-01 14:10:04,3.0,2.5,3.0,3.0,3.0,6.5,48215,30567
3,10325,2009-02-15 19:12:25,3.0,3.0,3.5,2.5,3.0,5.0,47969,30567
4,1075,2010-12-30 18:53:26,4.0,4.5,4.0,4.0,4.5,7.7,64883,23009
...,...,...,...,...,...,...,...,...,...,...
1586609,14359,2006-11-05 00:01:32,5.0,4.0,3.5,4.0,4.0,5.2,33061,24776
1586610,14359,2006-10-17 01:29:26,4.0,5.0,2.5,2.0,4.0,5.2,33061,33152
1586611,14359,2006-10-13 01:21:53,4.5,3.5,3.0,3.5,4.0,5.2,33061,12504
1586612,14359,2006-10-05 04:37:24,4.0,4.5,4.5,4.5,4.5,5.2,33061,18487


In [None]:
# TODO: find a neural-network solution and write up the theory behind it.