In [1]:
from IPython.display import HTML

# Inject a small script plus a submit button that toggle the visibility of the
# notebook's code-input areas (`div.input`). The script uses `$` (jQuery), which
# must already be available on the page, and calls `code_toggle` once on
# document-ready, so the inputs start out hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [38]:
########## INITIALIZATION ##########

### Imports
from scripts.plot_code import *
from scripts.beerrecommender_helper import *
from IPython.display import display, clear_output, HTML, Markdown, Latex
from ipywidgets import Layout, interact, interact_manual
import ipywidgets as widgets
import pandas as pd
import numpy as np
import warnings
from tabulate import tabulate
warnings.filterwarnings('ignore')


### Load Data
# Raw tables; each CSV carries a leftover 'Unnamed: 0' column that is discarded.
beers = pd.read_csv('data/beer_info.csv').drop(columns='Unnamed: 0')
reviews = pd.read_csv('data/beer_ratings.csv').drop(columns='Unnamed: 0')

# Distinct brewery names in sorted order (taken before the family merge below).
breweries = beers['brewery'].sort_values().unique()

# Lookup table whose `style` column is renamed so it lines up with `beers.beer_style`.
family_lookup = (
    pd.read_csv('data/beer_family_lookup.csv')
    .drop(columns='idx')
    .rename(columns={'style': 'beer_style'})
)

# Inner merge: only beers whose style appears in the lookup are kept.
beers = beers.merge(family_lookup, on='beer_style')


########## PLOT GENERATION ##########

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.plotly
import plotly.io as pio
# Enable plotly's offline rendering inside the notebook.
# NOTE(review): `connected=True` presumably makes plotly.js load from the CDN
# instead of being embedded in the notebook — confirm against the installed plotly version.
init_notebook_mode(connected=True)

### Show timeseries of the top-10 super-users
def plot_super_user():
    """Render a stacked area chart of monthly review counts for the ten most active users.

    Reads the module-level ``reviews`` frame (only its ``username`` and ``posted``
    columns are used), builds one trace per user and displays the figure inline.

    Returns:
        Whatever ``plotly.offline.iplot`` returns for the figure.
    """
    def find_user_aggregate(u):
        """Return the monthly review counts of user ``u``.

        The result has one row per calendar month in which the user posted, in
        chronological order, with two columns: ``dates`` (timestamp of the first
        day of that month) and ``posted`` (number of reviews dated in that month).
        """
        posted = pd.to_datetime(reviews.loc[reviews.username == u, 'posted']).dropna()
        # Snap every timestamp to the first day of its month, then count rows per month.
        # This avoids relying on auto-generated column names ('index', 'level_0', ...)
        # and on `Series.rename(columns=...)`, which is not a valid Series keyword.
        month_start = posted.dt.to_period('M').dt.to_timestamp()
        monthly = month_start.value_counts().sort_index()
        return pd.DataFrame({'dates': monthly.index, 'posted': monthly.values})

    # Users ranked by their number of non-null `posted` entries, most active first.
    top10_users = list(
        reviews.groupby('username')['posted']
        .count()
        .sort_values(ascending=False)
        .index[:10]
    )

    traces = []
    for user in top10_users:
        monthly = find_user_aggregate(user)
        traces.append(dict(
            x=monthly.dates,
            y=monthly.posted.values,
            hoverinfo='x+y',
            mode='lines',
            line=dict(width=0.5),
            stackgroup='one',  # a shared stackgroup makes the traces stack on top of each other
            name=user
        ))

    layout = dict(title = 'Monthly Aggregate of Super-User Activity',
                  yaxis = dict(
                      title = 'Per Month Count'),
                  xaxis = dict(
                      title = 'Date'
                  )
                 )
    fig = dict(data = traces, layout = layout)
    #pio.write_image(fig, 'img/super_user_lifetime.png', width=600, height= 600,scale = 1)
    return plotly.offline.iplot(fig)


### Show boxplots of the percentage of beers rated in each family by the top 10 reviewers
def boxplot_family():
    """Display per-family box plots for the ten most active reviewers.

    Uses the module-level ``reviews`` and ``beers`` frames. For each of the
    first ten beer families two boxes are drawn, each over the top-10 users:

    * on ``y1`` (lower half): the user's review count in the family divided
      by the number of beers in that family;
    * on ``y2`` (upper half): the user's mean ``score`` in the family.

    Returns:
        Whatever ``plotly.offline.iplot`` returns for the figure.
    """
    top10 = (
        reviews.groupby('username')['posted']
        .count()
        .sort_values(ascending=False)
        .index[:10]
        .values
    )

    # `DataFrame.join(other, on=...)` matches the caller's column against the
    # *index* of `other`. After the merge `beers` has a positional index, so key
    # the lookup by `beer_id` explicitly whenever that column is available.
    beer_lookup = beers
    if 'beer_id' in beers.columns:
        beer_lookup = beers.drop_duplicates('beer_id').set_index('beer_id')

    df = reviews[reviews.username.isin(top10)].join(beer_lookup[['family']], on='beer_id', how='left')
    df = df.groupby(by=['family','username']).agg({'posted':'count','score':'mean'})
    families = df.index.get_level_values('family').unique()

    data = []
    for family in families[:10]:
        # Number of beers belonging to this family; used to normalise the review counts.
        num_beers = int((beers.family == family).sum())
        per_user = df.loc[family]
        data.append(go.Box(
            y=per_user['posted'] / num_beers,
            name=family,
            xaxis='x1',
            yaxis='y1'
        ))
        data.append(go.Box(
            y=per_user['score'],
            name=family,
            xaxis='x1',
            yaxis='y2'
        ))

    layout = go.Layout(
        xaxis = dict(            
            anchor='x1',
            title='Beer Family',
            linewidth = 2,
            mirror = True
        ),
        yaxis1 = dict(
            domain=[0, 0.5],
            anchor='y1',
            # NOTE(review): the plotted value is reviews / beers-in-family, not a raw count.
            title = 'Number of Reviews'
        ),
        
        yaxis2 = dict(
            domain=[0.5, 1],
            anchor='y2',
            title = 'Ratings'
        ),
        showlegend = False
        
    )
    fig = go.Figure(data=data,layout=layout)
    
    return plotly.offline.iplot(fig)

In [42]:
########## PRESENTATION TEXT ##########

# Markdown rendered by the first presentation button: project title, greeting
# and the list of project steps.
overview_text = """
# Beer Advisor Overview

Hello, and welcome to Beer Advisor.

This project had four steps:
1. Scrape as much data as possible from [BeerAdvocate.com](https://www.beeradvocate.com).
2. Perform numerical analysis on user reviews.
3. Develop a recommender system to suggest beers to new users based on the data scraped from previous reviews.
4. Analyze textual data from reviews to advise brewers on new products.
"""

webscraping_text = """
### 1. Webscraping

__Beer Advocate__ is an online community where users can rate and review all beers, from craft to mainstream. The website was scraped using a _scrapy spider_, which pulled information from the general product page and from each individual review.

    Beer Advocate boasts a database of nearly 300,000 beers (probably more at this point).

    Only looked at beers with more than 100 user ratings. (conveniently listed on the beer list page)

    3 hours later and we have nearly 10,000 individual beers and 1.7 million individual reviews!!

Based on the origin of the data, the information was piped into one of two csv files and generated tables of information similar to this:

#### Beers DataFrame

|   beer_id | beer_name                        | brewery                            | beer_style                 |   abv |   num_reviews |   ranking |
|----------:|:---------------------------------|:-----------------------------------|:---------------------------|------:|--------------:|----------:|
|      9128 | Motor City Brewing Ghettoblaster | Motor City Brewing Works           | English Dark Mild Ale      |   4.2 |            64 |     44196 |
|       205 | Spellbound IPA                   | Spellbound Brewing                 | American IPA               |   6.5 |            35 |     13651 |
|      9358 | Red Nose Winter Ale              | Natty Greene's Pub & Brewing Co.   | Winter Warmer              |   6.8 |            51 |     38773 |
|      6646 | Hunter Vanilla                   | 18th Street Brewery - Gary Taproom | English Sweet / Milk Stout |   8.5 |            61 |      1391 |
|      3753 | Bière De Miel Biologique         | Brasserie Dupont sprl              | Belgian Saison             |   8   |           184 |     16355 |

#### Reviews DataFrame
  
|   review_id |   beer_id | posted              | ratings                     |   score | username        |
|------------:|----------:|:--------------------|:----------------------------|--------:|:----------------|
|      675575 |      4399 | 2012-04-26 00:00:00 | [4.5, 4.5, 3.5, 4.0, 4.0]   |    3.95 | Rutager         |
|     1270933 |      7509 | 2016-04-19 00:00:00 | [4.0, 4.25, 4.0, 4.25, 4.0] |    4.09 | stortore        |
|      686394 |      4481 | 2009-10-21 00:00:00 | [4.0, 4.0, 4.5, 4.5, 4.5]   |    4.35 | Josievan        |
|     1347362 |      7959 | 2009-11-13 00:00:00 | [4.0, 3.0, 3.0, 2.5, 2.0]   |    2.81 | civilizedpsycho |
|     1119392 |      6675 | 2010-02-17 00:00:00 | [4.0, 4.5, 4.0, 4.0, 4.0]   |    4.12 | drizzam         |


Due to the enormity of reviews scraped from this website, text content was omitted from this notebook to save on data limits. Some _post-processed text_ will be discussed later.

Let's look a little more closely at the data...

"""

EDA_text = """
### 2. Numerical Analysis

In our dataset we have reviews from 57,023 individual users.

|   EDA  |   count |    mean |     std |   min |   25% |   50% |   75% |   max |
|-------:|--------:|:-------:|:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
| summary|   57023 | 30.8128 | 133.207 |     1 |     1 |     2 |     9 |  4175 |


We can see that the vast majority of reviews are supplied by less than 25% of the population ==> The dataset is _heavily skewed_ to a set of __super users__

Let's try to find some of our __super users__. Here are the top 10:


|  user  |   StonedTrippin |   metter98 |   superspak |   brentk56 |   BEERchitect |   UCLABrewN84 |   zeff80 |   woodychandler |   jlindros |   NeroFiddled |
|:-------|----------------:|-----------:|------------:|-----------:|--------------:|--------------:|---------:|----------------:|-----------:|--------------:|
| posted |            4175 |       4056 |        3855 |       3753 |          3682 |          3581 |     3015 |            2959 |       2957 |          2834 |

"""



# Markdown displayed by `goplot_super_user` right after the super-user plot.
# NOTE(review): apart from the opening question, the content reads like an
# unfinished outline (banner line, empty item "2.") — confirm before presenting.
show_boxplot_intro_text = """




What's the impact of all these reviews? Do more reviews for a given product impact its favorability?




#########Rating V User Reviews#########




1. User Reviews Graphs
  1. User Reviews over time
  2. Rating v User Reviews
  3. Ratings v Styles of Beer
    1. Variance (Box-Plot), 

2. 
"""

# Markdown blurb introducing the recommender section.
# NOTE(review): not referenced by any code visible in this part of the notebook.
beerrecommender_text = """
### Beer Advisor Recommender
Using a collaborative recommender system, Beer Advisor can suggest a number of beers you may be interested in based on your preferences.

       --- The more ratings you have the more accurate Beer Advisor is. ---
"""

########## PRESENTATION UI AND FUNCTIONS ##########

def _make_button(tooltip, button_style=''):
    """Create an enabled button with an empty label and no icon.

    Args:
        tooltip: Hover text for the button.
        button_style: One of 'success', 'info', 'warning', 'danger' or ''.
    """
    return widgets.Button(
        description='',
        disabled=False,
        button_style=button_style,
        tooltip=tooltip,
        icon=''
    )


# Buttons that reveal the successive text blocks of the presentation.
show_overview_btn = _make_button('Show Overview Block')
show_webscraping_btn = _make_button('Show Webscraping Block')
show_EDA_btn = _make_button('Show EDA Block')

# Show/close button pairs for the two plots.
show_superuserplot_btn = _make_button('Show superuser plot ', button_style='info')
close_superuserplot_btn = _make_button('Close superuser plot ', button_style='warning')
show_boxplot_btn = _make_button('Show boxplot plot ', button_style='info')
close_boxplot_btn = _make_button('Close boxplot plot ', button_style='warning')



def show_overview_text(b):
    """Click handler: remove the overview button, then show the overview
    markdown followed by the webscraping button.

    `b` is the argument supplied by `on_click`; it is not used.
    """
    show_overview_btn.close()
    display(Markdown(overview_text), show_webscraping_btn)
    
def show_webscraping_text(b):
    """Click handler: remove the webscraping button, then show the webscraping
    markdown followed by the EDA button.

    `b` is the argument supplied by `on_click`; it is not used.
    """
    show_webscraping_btn.close()
    display(Markdown(webscraping_text), show_EDA_btn)

def show_EDA_text(b):
    """Click handler: remove the EDA button, then show the EDA markdown
    followed by the super-user plot button.

    `b` is the argument supplied by `on_click`; it is not used.
    """
    show_EDA_btn.close()
    display(Markdown(EDA_text), show_superuserplot_btn)
    
def goplot_super_user(b):
    """Click handler: draw the super-user plot, show the follow-up text and
    offer the box-plot button as the next step.

    `b` is the argument supplied by `on_click`; it is not used.
    """
    plot_super_user()
    display(Markdown(show_boxplot_intro_text))
    # Without this the box plot can never be reached from the UI.
    display(show_boxplot_btn)

def goplot_boxplot(b):
    """Click handler: draw the per-family box plots.

    `b` is the argument supplied by `on_click`; it is not used.
    """
    return boxplot_family()


# Entry point of the presentation: only the first button is shown up front;
# each handler reveals the next button in the chain.
display(show_overview_btn)
show_overview_btn.on_click(show_overview_text)
show_webscraping_btn.on_click(show_webscraping_text)
show_EDA_btn.on_click(show_EDA_text)
show_superuserplot_btn.on_click(goplot_super_user)
show_boxplot_btn.on_click(goplot_boxplot)





Button(style=ButtonStyle(), tooltip='Show Overview Block')

 
# Beer Advisor Overview

Hello, and welcome to Beer Advisor.

This project had three steps:
1. This project aimed to scrape as much data as possible from [BeerAdvocate.com](https://www.beeradvocate.com)
2. Peform Numerical Analysis on user reviews.
3. Develop a recommender system to suggest beers to new users based on the data scraped from previous reviews.
4. Analyze textual data from reviews to advisor brewers on new products.')) 

Button(style=ButtonStyle(), tooltip='Show Webscraping Block')


### 1. Webscraping

__Beer Advocate__ is online community where users can rate and review all beers, craft to mainstream. This website was scraped using a _scrapy spider_. Information on the general product page and each individual review was pulled by the spider.

    Beer Advocate boasts a database of nearly 300,000 beers (probably more at this point).

    Only looked at beers with more than 100 user ratings. (conveniently listed on the beer list page)

    3 hours later and we have nearly 10,000 individual beers and 1.7 million individual reviews!!

Based on the origin of the data, the information was piped into one of two csv files and generated tables of information similar to this:

#### Beers DataFrame

|   beer_id | beer_name                        | brewery                            | beer_style                 |   abv |   num_reviews |   ranking |
|----------:|:---------------------------------|:-----------------------------------|:---------------------------|------:|--------------:|----------:|
|      9128 | Motor City Brewing Ghettoblaster | Motor City Brewing Works           | English Dark Mild Ale      |   4.2 |            64 |     44196 |
|       205 | Spellbound IPA                   | Spellbound Brewing                 | American IPA               |   6.5 |            35 |     13651 |
|      9358 | Red Nose Winter Ale              | Natty Greene's Pub & Brewing Co.   | Winter Warmer              |   6.8 |            51 |     38773 |
|      6646 | Hunter Vanilla                   | 18th Street Brewery - Gary Taproom | English Sweet / Milk Stout |   8.5 |            61 |      1391 |
|      3753 | Bière De Miel Biologique         | Brasserie Dupont sprl              | Belgian Saison             |   8   |           184 |     16355 |

#### Reviews DataFrame
  
|   review_id |   beer_id | posted              | ratings                     |   score | username        |
|------------:|----------:|:--------------------|:----------------------------|--------:|:----------------|
|      675575 |      4399 | 2012-04-26 00:00:00 | [4.5, 4.5, 3.5, 4.0, 4.0]   |    3.95 | Rutager         |
|     1270933 |      7509 | 2016-04-19 00:00:00 | [4.0, 4.25, 4.0, 4.25, 4.0] |    4.09 | stortore        |
|      686394 |      4481 | 2009-10-21 00:00:00 | [4.0, 4.0, 4.5, 4.5, 4.5]   |    4.35 | Josievan        |
|     1347362 |      7959 | 2009-11-13 00:00:00 | [4.0, 3.0, 3.0, 2.5, 2.0]   |    2.81 | civilizedpsycho |
|     1119392 |      6675 | 2010-02-17 00:00:00 | [4.0, 4.5, 4.0, 4.0, 4.0]   |    4.12 | drizzam         |


Due to the enormity of reviews scraped from this website, text content was omitted from this notebook to save on data limits. Some _post-processed text_ will be discussed later.

Lets look a little more closely at the data...



Button(style=ButtonStyle(), tooltip='Show EDA Block')


### 2. Numerical Analysis

In our dataset we have reviews from 57,023 individual users.

|   EDA  |   count |    mean |     std |   min |   25% |   50% |   75% |   max |
|-------:|--------:|:-------:|:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
| summary|   57023 | 30.8128 | 133.207 |     1 |     1 |     2 |     9 |  4175 |


We can see that the vast majority of reviews are supplied by less than 25% of the population ==> The dataset is _heavily skewed_ to a set of __super users__

Lets try to find some of our __super users__. Here are the Top 10


|  user  |   StonedTrippin |   metter98 |   superspak |   brentk56 |   BEERchitect |   UCLABrewN84 |   zeff80 |   woodychandler |   jlindros |   NeroFiddled |
|:-------|----------------:|-----------:|------------:|-----------:|--------------:|--------------:|---------:|----------------:|-----------:|--------------:|
| posted |            4175 |       4056 |        3855 |       3753 |          3682 |          3581 |     3015 |            2959 |       2957 |          2834 |



Button(button_style='info', style=ButtonStyle(), tooltip='Show superuser plot ')






What's the impact of all these reviews? Do more reviews for a given product impact its favorability?




#########Rating V User Reviews#########




1. User Reviews Graphs
  1. User Reviews over time
  2. Rating v User Reviews
  3. Ratings v Styles of Beer
    1. Variance (Box-Plot), 

2. 


Here we have a cumulative plot showing the monthly aggregate of the top ten reviewers

Some interesting features we can infer:
1. 2010 and 2011 were big years for Beer Advocate. They should the start of 