In [1]:
from setup import dask_init, dask_shutdown
import dask.distributed as dd
import dask.dataframe as ddf

import pandas as pd
import numpy as np

import ingestion as ing
import processing as prc

import datetime as dt

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

# Setup

In [2]:
# Pandas configuration: count +/-inf as missing values in NA-aware operations.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases, and pandas
# options are process-local — confirm whether dask_init() also applies it on the workers.
pd.set_option("mode.use_inf_as_na", True)

# Dask configuration: the project helper returns a (client, cluster) pair.
client, cluster = dask_init()
# Keep `cluster` as the final expression so the notebook renders it.
cluster

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

# [I] Loading the Data

In [3]:
# Users table from the project's processing pipeline
# (persist=False: presumably left lazy rather than held on the cluster — confirm in `processing`).
users_ddf = prc.users_pipeline(persist=False)
# Preview the first five rows.
users_ddf.head(n=5)

Unnamed: 0,n_ratings,n_reviews,uid,username,joined,location,country
0,7820,465,nmann08.184925,nmann08,2008-01-07 11:00:00,"United States, Washington",United States
1,2521,2504,stjamesgate.163714,StJamesGate,2007-10-08 10:00:00,"United States, New York",United States
2,1797,1143,mdagnew.19527,mdagnew,2005-05-18 10:00:00,Northern Ireland,Northern Ireland
3,31,31,helloloser12345.10867,helloloser12345,2004-11-25 11:00:00,Northern Ireland,Northern Ireland
4,604,604,cypressbob.3708,cypressbob,2003-11-20 11:00:00,Northern Ireland,Northern Ireland


In [10]:
# Ratings table from the project's processing pipeline.
# NOTE(review): users_persisted=True is passed although users_pipeline was called with
# persist=False earlier in this notebook — confirm the flag matches how users were built.
ratings_ddf = prc.ratings_pipeline(users_persisted=True, persist=False)

# Re-type the beer id column to float64 (it is later used as the join key against `beer_id`).
bid_as_float = ratings_ddf['bid'].astype('float64')
ratings_ddf['bid'] = bid_as_float

# Preview the first five rows.
ratings_ddf.head(n=5)

Unnamed: 0,date,bid,uid,appearance,aroma,palate,taste,overall,rating,has_review,review
0,2015-08-20 10:00:00,142544.0,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,True,"From a bottle, pours a piss yellow color with ..."
1,2015-11-14 11:00:00,120824.0,nmann08.184925,4.0,3.5,3.75,3.5,3.5,3.56,False,
2,2011-11-18 11:00:00,47581.0,nmann08.184925,,,,,,4.5,False,
3,2013-05-03 10:00:00,93113.0,nmann08.184925,,,,,,3.75,False,
4,2015-01-24 11:00:00,875.0,nmann08.184925,4.0,3.5,3.75,4.0,3.75,3.81,False,


In [11]:
# Read the BeerAdvocate beers table (beers.csv) through the project's ingestion module.
path = "Data/BeerAdvocate/beers.csv"
beer_df = ing.read_csv(path)
# Preview the first five rows.
beer_df.head(n=5)

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
0,166064,Nashe Moskovskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0,0,,,,4.7,,,0,
1,166065,Nashe Pivovskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0,0,,,,3.8,,,0,
2,166066,Nashe Shakhterskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0,0,,,,4.8,,,0,
3,166067,Nashe Zhigulevskoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0,0,,,,4.0,,,0,
4,166063,Zhivoe,39912,Abdysh-Ata (Абдыш Ата),Euro Pale Lager,0,0,,,,4.5,,,0,


In [12]:
# Attach each rating's author information by joining ratings to users on the user id.
# `how="inner"` is the default join type, spelled out here: ratings without a matching
# `uid` in the users table are dropped.
ratings_users_df = ratings_ddf.merge(users_ddf, how="inner", on="uid")
ratings_users_df.head(n=5)

Unnamed: 0,date,bid,uid,appearance,aroma,palate,taste,overall,rating,has_review,review,n_ratings,n_reviews,username,joined,location,country
0,2015-08-20 10:00:00,142544.0,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,True,"From a bottle, pours a piss yellow color with ...",7820,465,nmann08,2008-01-07 11:00:00,"United States, Washington",United States
1,2015-11-14 11:00:00,120824.0,nmann08.184925,4.0,3.5,3.75,3.5,3.5,3.56,False,,7820,465,nmann08,2008-01-07 11:00:00,"United States, Washington",United States
2,2011-11-18 11:00:00,47581.0,nmann08.184925,,,,,,4.5,False,,7820,465,nmann08,2008-01-07 11:00:00,"United States, Washington",United States
3,2013-05-03 10:00:00,93113.0,nmann08.184925,,,,,,3.75,False,,7820,465,nmann08,2008-01-07 11:00:00,"United States, Washington",United States
4,2015-01-24 11:00:00,875.0,nmann08.184925,4.0,3.5,3.75,4.0,3.75,3.81,False,,7820,465,nmann08,2008-01-07 11:00:00,"United States, Washington",United States


In [13]:
# Attach beer metadata to every (rating, user) row: the ratings side is keyed by `bid`,
# the beers table by `beer_id`. `how="inner"` is the default join type, spelled out here:
# ratings whose `bid` has no matching `beer_id` are dropped.
rating_user_beer_df = ratings_users_df.merge(
    beer_df,
    how="inner",
    left_on='bid',
    right_on='beer_id',
)
rating_user_beer_df.head(n=5)

Unnamed: 0,date,bid,uid,appearance,aroma,palate,taste,overall,rating,has_review,...,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
0,2015-08-20 10:00:00,142544.0,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,True,...,1,1,2.88,,,4.5,2.88,,0,
1,2015-11-14 11:00:00,120824.0,nmann08.184925,4.0,3.5,3.75,3.5,3.5,3.56,False,...,4,1,3.84,,,5.0,3.8375,-0.163207,2,4.145
2,2014-05-11 10:00:00,120824.0,wl0307.16869,,,,,,3.5,False,...,4,1,3.84,,,5.0,3.8375,-0.163207,2,4.145
3,2011-11-18 11:00:00,47581.0,nmann08.184925,,,,,,4.5,False,...,457,165,4.2,93.0,,8.0,4.184639,,0,
4,2013-11-27 11:00:00,47581.0,jaydoc.265507,,,,,,4.25,False,...,457,165,4.2,93.0,,8.0,4.184639,,0,


# [III] Descriptive Statistics & Distributions

In [14]:
# One row per beer: keep only the beer id and its two site-level scores, then collapse
# the per-rating rows down to a single row per `bid`.
#
# persist() materialises this three-column table on the cluster once. Without it, every
# later `.compute()` on a frame derived from `beer_ratings` re-executes the whole
# ratings/users/beers load-and-merge graph, because nothing upstream is persisted.
beer_ratings = (
    rating_user_beer_df[['bid', 'ba_score', 'bros_score']]
    .drop_duplicates(subset=['bid'])
    .persist()
)
beer_ratings.head(5)

Unnamed: 0,bid,ba_score,bros_score
0,142544.0,,
1,120824.0,,
3,47581.0,93.0,
387,93113.0,88.0,
511,875.0,91.0,100.0


In [15]:
# Row count of the per-beer table, i.e. the number of distinct beers that have ratings.
n_beers = beer_ratings.shape[0].compute()
print("Number of beers: ", n_beers)

Number of beers:  230141


In [17]:
# Beers whose BA score is present (not null).
has_ba_score = beer_ratings['ba_score'].notnull()
beers_ba = beer_ratings[has_ba_score]

# The row count is lazy on a dask frame; compute() turns it into a concrete number.
n_ba = beers_ba.shape[0].compute()
print("Number of beers with a BA score: {}".format(n_ba))

Number of beers with a BA score: 61121


In [18]:
# Beers whose Bros score is present (not null).
has_bros_score = beer_ratings['bros_score'].notnull()
beers_bro = beer_ratings[has_bros_score]

# The row count is lazy on a dask frame; compute() turns it into a concrete number.
n_bro = beers_bro.shape[0].compute()
print("Number of beers with a Bros score: {}".format(n_bro))

Number of beers with a Bros score: 5702


In [19]:
# Beers for which both the BA score and the Bros score are present.
ba_present = beer_ratings['ba_score'].notnull()
bros_present = beer_ratings['bros_score'].notnull()
beers_ba_bro = beer_ratings[ba_present & bros_present]

# The row count is lazy on a dask frame; compute() turns it into a concrete number.
n_ba_bro = beers_ba_bro.shape[0].compute()
print("Number of beers with a BA and Bros score: {}".format(n_ba_bro))

Number of beers with a BA and Bros score: 5047
