# ABV study
### Preliminary analysis

In [2]:
# Import all libraries
import pandas as pd
import os
import plotly.express as px
import numpy as np

# Set the data folder
DATA_FOLDER = os.path.join(os.getcwd(), '../data/processed')

# A location (brewery country or user country) is kept only when it has
# strictly more than this many ratings.
MIN_RATINGS_PER_LOCATION = 100

# Set some visualization settings
px.defaults.width = 800
px.defaults.height = 600
px.defaults.template = "plotly_white"


def filter_sparse_locations(ratings, location_col, min_ratings=MIN_RATINGS_PER_LOCATION):
    """Drop the ratings whose `location_col` value is rarely rated.

    Parameters
    ----------
    ratings : pd.DataFrame
        Ratings table; must contain `location_col` and 'beer_id'.
    location_col : str
        Column holding the location to count ratings by.
    min_ratings : int
        A location is kept only when its number of non-null 'beer_id'
        entries is strictly greater than this value.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The filtered ratings, and the per-location counts as a frame with
        columns [`location_col`, 'nbr_ratings'].
    """
    counts = (
        ratings.groupby(location_col)['beer_id']
        .count()
        .rename('nbr_ratings')
        .reset_index()
    )
    kept_locations = counts.loc[counts['nbr_ratings'] > min_ratings, location_col]
    return ratings[ratings[location_col].isin(kept_locations)], counts


# Load all the data
df_beers = pd.read_parquet(os.path.join(DATA_FOLDER, 'beers.pq'))
df_breweries = pd.read_parquet(os.path.join(DATA_FOLDER, 'breweries.pq'))
df_users = pd.read_parquet(os.path.join(DATA_FOLDER, 'users.pq'))
df_ratings_no_text = pd.read_parquet(os.path.join(DATA_FOLDER, 'ratings_no_text.pq'))

# Add a year column to the data. This is done on the freshly loaded frame,
# before any filtering, so the assignment never targets a filtered slice
# (which can trigger pandas' SettingWithCopyWarning).
df_ratings_no_text['year'] = df_ratings_no_text['date'].dt.year

# Keep only the ratings of beers brewed in a country with more than
# MIN_RATINGS_PER_LOCATION ratings overall
df_ratings_no_text, nbr_ratings_brewery_location = filter_sparse_locations(
    df_ratings_no_text, 'location_brewery'
)

# Keep only the ratings made by users of a country with more than
# MIN_RATINGS_PER_LOCATION ratings (counted after the brewery filter above)
df_ratings_no_text, nbr_ratings_user_location = filter_sparse_locations(
    df_ratings_no_text, 'location_user'
)

#### ABV Vs Popularity
<b>Contribution to the story</b>: This is the first main analysis of the story. In particular, we want to understand whether there has been any significant shift over time in the ABV of the beers, whether a specific ABV range is rated better than others, and how this has evolved over time. <i>This tries to show how ABV impacts the popularity of a beer.</i>

In [3]:
# Process the data
# For every year of the analysis window, ratings are bucketed into 0.1-wide
# ABV bins [min_abv, max_abv) covering 0-20; each bin is summarised by its mean
# rating and number of ratings, and is kept only when it has enough ratings.
FIRST_YEAR, LAST_YEAR = 2002, 2017  # inclusive analysis window
MIN_RATINGS_PER_BIN = 250           # a bin needs strictly more ratings than this

abv_edges = np.round(np.linspace(0, 20, 201), 2)
abv_midpoints = (abv_edges[:-1] + abv_edges[1:]) / 2

in_window = df_ratings_no_text['year'].between(FIRST_YEAR, LAST_YEAR)
binned = (
    df_ratings_no_text.loc[in_window, ['year', 'abv', 'rating']]
    # right=False gives half-open bins [min_abv, max_abv); labels=False returns
    # the bin position, NaN when the ABV is missing or outside [0, 20)
    .assign(bin_index=lambda d: pd.cut(d['abv'], bins=abv_edges, right=False, labels=False))
    .dropna(subset=['bin_index'])
    .assign(
        year=lambda d: d['year'].astype(int),
        # Each rating is represented by the midpoint of its ABV bin
        abv=lambda d: abv_midpoints[d['bin_index'].astype(int).to_numpy()],
    )
)

# One row per (year, ABV bin), in a single pass over the data
beer_ratings = binned.groupby(['year', 'abv'], as_index=False).agg(
    rating=('rating', 'mean'),
    nbr_ratings=('rating', 'count'),
)
beer_ratings = beer_ratings[beer_ratings['nbr_ratings'] > MIN_RATINGS_PER_BIN].reset_index(drop=True)

# Do the plot
fig = px.scatter(
    beer_ratings,
    x='abv',
    y='rating',
    size='nbr_ratings',
    hover_name='abv',
    animation_frame='year',
    labels={'abv': 'ABV:', 'rating': 'Rating:', 'nbr_ratings': 'Number of ratings:'},
    range_x=[0, 20],
    range_y=[2.25, 4.75],
)
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(showlegend=False)
fig.update_xaxes(title_text='ABV')
fig.update_yaxes(title_text='Rating')
fig.show()

We see that people tend to prefer high-ABV beers: low-ABV beers tend to score much lower than higher-ABV ones, and this trend is consistent over the years. We also see that, over the years, more and more high-ABV beers are reviewed, which suggests that the average ABV drunk globally has increased.

In [12]:
# Average ABV of the rated beers per year, restricted to 2002-2017.
# The 'year' column of df_ratings_no_text is created in the setup cell, so it
# is not recomputed here.
avg_abv = (
    df_ratings_no_text.groupby('year')
    .agg({'abv': 'mean'})
    .loc[lambda d: (d.index >= 2002) & (d.index < 2018)]
    .reset_index()
)

# Spearman rank correlation between the year and the yearly average ABV,
# looked up by label rather than by position in the correlation matrix
corr = avg_abv.corr(method='spearman').loc['abv', 'year']

# Plot ABV against year explicitly. Passing the whole frame (wide form) would
# also draw the 'year' column as a second trace, merely hidden by the y-range.
fig = px.line(avg_abv, x='year', y='abv')
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Average ABV',
    xaxis=dict(tickvals=np.arange(2002, 2018, 1)),
    yaxis=dict(range=[5, 7]),
    showlegend=False
)
fig.add_annotation(
    x=2010,
    y=7,
    text=f'<b>Spearman correlation: {corr:.2f}</b>',
    showarrow=False,
    font=dict(size=14)
)
fig.show()

We see that, over time, there has been a clear increase in the average ABV of the rated beers at the global level. This monotonic trend is confirmed by the Spearman correlation between the year and the average ABV, which is high in this case. This supports our hypothesis that people tend to prefer high-ABV beers and that this preference is strengthening over time. <br>
Let's now see how this has changed over time in the different countries.

In [18]:
unique_states_breweries = df_ratings_no_text['location_brewery'].unique()

# Average ABV per (year, brewery location) for the years 2002-2017
ratings_in_window = df_ratings_no_text[df_ratings_no_text['year'].between(2002, 2017)]
df_states = (
    ratings_in_window.groupby(['year', 'location_brewery'], as_index=False)
    .agg(avg_abv=('abv', 'mean'))
    .rename(columns={'location_brewery': 'state'})
    .astype({'year': int})
)

# Keep only the locations observed in the maximum number of years, so that
# every animation frame shows the same set of locations
nbr_years_considered = df_states.groupby('state', as_index=False)['year'].count()
has_max_coverage = nbr_years_considered['year'].eq(nbr_years_considered['year'].max())
nbr_years_considered = nbr_years_considered[has_max_coverage]
df_states = df_states[df_states['state'].isin(nbr_years_considered['state'])]


def _abv_choropleth(df_one_year):
    """Build the average-ABV choropleth for the rows of a single year."""
    return px.choropleth(
        df_one_year,
        locations='state',
        locationmode='country names',
        color='avg_abv',
        color_continuous_scale='Viridis',
        range_color=[4, 8]
    )


animation_years = df_states['year'].unique()

# Create frames for the animation: one choropleth trace per year
frames = []
for year in animation_years:
    year_trace = _abv_choropleth(df_states[df_states['year'] == year]).data[0]
    frames.append({"data": [year_trace], "name": str(year)})

# Define the figure with animation, starting from the earliest year
fig_brewery = _abv_choropleth(df_states[df_states['year'] == df_states['year'].min()])
fig_brewery.frames = frames

# Slider: one step per year, each jumping straight to that year's frame
slider_steps = []
for year in animation_years:
    slider_steps.append({
        "args": [[str(year)], {"frame": {"duration": 300, "redraw": True}, "mode": "immediate"}],
        "label": str(year),
        "method": "animate"
    })
year_slider = {
    "steps": slider_steps,
    "transition": {"duration": 300},
    "x": 0.1,
    "xanchor": "left",
    "y": 0,
    "yanchor": "top"
}

# Play / Pause buttons placed to the left of the slider
play_button = {
    "args": [None, {"frame": {"duration": 300, "redraw": True}, "fromcurrent": True}],
    "label": "Play",
    "method": "animate"
}
pause_button = {
    "args": [[None], {"frame": {"duration": 0, "redraw": True}, "mode": "immediate"}],
    "label": "Pause",
    "method": "animate"
}
playback_menu = {
    "buttons": [play_button, pause_button],
    "direction": "left",
    "pad": {"r": 10, "t": 87},
    "showactive": False,
    "type": "buttons",
    "x": 0.1,
    "xanchor": "right",
    "y": 0,
    "yanchor": "top"
}

# Add slider and buttons to the layout
fig_brewery.update_layout(sliders=[year_slider], updatemenus=[playback_menu])

fig_brewery.show()


**TODO:** Would it make sense to add a map showing, for each country, the Spearman correlation between the year and the average ABV?