In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns 

# Loading the data

In [None]:
# Read the processed beers table, using its `id` column as the index,
# then preview the first rows.
beers = pd.read_csv(filepath_or_buffer="data/processed/beers.csv", index_col="id")
beers.head(n=5)

# Looking at the data

In [None]:
# Histogram of the `abv` column, titled with what the column measures.
ax = beers["abv"].hist()
_ = ax.set_title("alcohol volume percentage")

In [None]:
# One box per aroma perception column, all drawn on a single axes.
perception_columns = [
    'alcohol', 'astringency', 'bitter', 'body', 'fruits', 'hoppy',
    'malty', 'salty', 'sour', 'spices', 'sweet',
]
ax = beers.boxplot(column=perception_columns, figsize=(10, 6))
_ = ax.set_title("aroma perceptions")

In [None]:
# Column-wise totals of the brewery_type_* columns.
brewery_type_columns = [
    'brewery_type_bar',
    'brewery_type_beer_to_go',
    'brewery_type_eatery',
    'brewery_type_homebrew',
    'brewery_type_brewery',
    'brewery_type_store',
]
beers[brewery_type_columns].sum()

In [None]:
# Bar chart of how many rows each `country` value has, on a logarithmic y-axis.
country_counts = beers["country"].value_counts()
ax = country_counts.plot(kind='bar', figsize=(10, 5))
_ = ax.set_title("Country of origin")
ax.set_yscale('log')

In [None]:
# Histograms of the two overall score columns, one subplot per column.
score_columns = ['ave_rating', 'rating_reviews_score']
ax = beers.hist(column=score_columns)

In [None]:
# Histograms of the individual review score columns, one subplot per column.
review_score_columns = [
    'rating_reviews_look',
    'rating_reviews_smell',
    'rating_reviews_taste',
    'rating_reviews_feel',
    'rating_reviews_overall',
]
ax = beers.hist(column=review_score_columns, figsize=(6, 4))
# Re-space the subplots so titles and tick labels do not collide.
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows each `meta_style` value has, on a logarithmic y-axis.
meta_style_counts = beers["meta_style"].value_counts()
ax = meta_style_counts.plot(kind='bar', figsize=(10, 5))
_ = ax.set_title("meta_styles")
ax.set_yscale('log')

# Preliminary analysis

### Is there a flavour difference between the best- and worst-rated beers for each meta-style?

In [None]:
# Number of beers kept at each end of the rating scale, per meta_style.
nb_beers = 20

# Columns that make up the aromatic profile plotted later on.
aroma_columns =  ['astringency', 'body',
       'alcohol', 'bitter', 'sweet', 'sour', 'salty', 'fruits', 'hoppy',
       'spices', 'malty']

# get the nb_beers highest / lowest rated beers in each meta_style:
# sort the whole table by rating, then keep the first nb_beers rows of each group
best_beers_per_style = (
    beers.sort_values('ave_rating', ascending=False)
    .groupby('meta_style')
    .head(nb_beers)
)
worse_beers_per_style = (
    beers.sort_values('ave_rating', ascending=True)
    .groupby('meta_style')
    .head(nb_beers)
)

# get the average aromatic profile of each meta_style.
# The aroma columns are selected BEFORE aggregating: averaging every column
# first would also try to average text columns such as `name` / `country`,
# which raises a TypeError on pandas >= 2.0 (and wastes work on older versions).
best_beers_per_style_mean = best_beers_per_style.groupby('meta_style')[aroma_columns].mean()
worse_beers_per_style_mean = worse_beers_per_style.groupby('meta_style')[aroma_columns].mean()

In [None]:
# Plot one spider (radar) chart per meta_style, overlaying the mean aromatic
# profile of its best rated beers and of its worse rated beers.
nb_plot_columns = 4
nb_styles = len(best_beers_per_style_mean.index)
# Ceiling division gives exactly the rows needed (no spare row when the number
# of styles is a multiple of nb_plot_columns); keep at least one row so the
# grid is always valid.
nb_plot_rows = max(1, -(-nb_styles // nb_plot_columns))

# Create the axes as polar from the start instead of stacking a second polar
# axes on top of each cartesian one. squeeze=False keeps `ax` two-dimensional
# even when there is a single row, so ax[row, col] always works.
fig, ax = plt.subplots(nb_plot_rows, nb_plot_columns, figsize=(20, 20),
                       subplot_kw={'polar': True}, squeeze=False)

# ------- PART 1: geometry shared by every chart (loop-invariant)

# axis labels: the columns of the mean-profile table
categories = list(best_beers_per_style_mean)
N = len(aroma_columns)

# Angle of each axis in the plot (full circle divided by the number of
# variables); the first angle is repeated at the end to close the polygon.
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]

# (mean profile table, legend label, fill colour) for each overlaid group
profiles = (
    (best_beers_per_style_mean, "Best rated beers", 'b'),
    (worse_beers_per_style_mean, "Worse rated beers", 'r'),
)

for i, meta_style in enumerate(best_beers_per_style_mean.index):
    row, col = divmod(i, nb_plot_columns)
    axis = ax[row, col]

    # Put the first axis on top and go clockwise
    axis.set_theta_offset(np.pi / 2)
    axis.set_theta_direction(-1)

    # Draw one axe per variable + add labels
    axis.set_xticks(angles[:-1])
    axis.set_xticklabels(categories, color='grey', size=8)

    # Draw ylabels
    axis.set_rlabel_position(0)
    axis.set_yticks([30, 60, 90, 120])
    axis.set_yticklabels(["30", "60", "90", "120"], color="grey", size=7)
    axis.set_ylim(0, 150)

    # ------- PART 2: Add plots
    for profile_table, label, fill_color in profiles:
        values = profile_table.loc[meta_style].values.flatten().tolist()
        values += values[:1]  # close the polygon
        axis.plot(angles, values, linewidth=1, linestyle='solid', label=label)
        axis.fill(angles, values, fill_color, alpha=0.1)

    # Add title and make background white
    axis.set_title(meta_style, size=11, y=1.1)
    axis.set_facecolor('white')

# remove empty plots (done before tight_layout so they do not affect the layout)
for i in range(nb_styles, nb_plot_columns * nb_plot_rows):
    row, col = divmod(i, nb_plot_columns)
    fig.delaxes(ax[row, col])

# Add legend explicitly on the last chart that was drawn, rather than on
# whichever axes happens to be "current" for pyplot.
if nb_styles:
    last_row, last_col = divmod(nb_styles - 1, nb_plot_columns)
    ax[last_row, last_col].legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

plt.tight_layout()

# Make background color white
fig.patch.set_facecolor('white')

# Show the graph
plt.show()

### Which country makes which type of beer best?

In [None]:
# Bar chart of the mean rating of each meta_style, split by country.
# Only countries with more than 100 beers are shown.

# mean `ave_rating` for every (country, meta_style) pair
meta_style_per_country = (
    beers
    .groupby(['country', 'meta_style'])['ave_rating']
    .mean()
    .reset_index()
)

# per-country beer count (number of non-null `name` values); after
# reset_index the count lives in a column that is still called "name"
nb_beers_per_country = (
    beers
    .groupby('country')["name"]
    .count()
    .reset_index()
)

# attach the per-country count to every (country, meta_style) row
meta_style_per_country = meta_style_per_country.merge(nb_beers_per_country, on='country')

# single row filter:
#   - countries with more than 100 beers
#   - Germany ('DE') dropped
#   - the 'Fruit' and 'Pilsner' meta_styles removed
rows_to_keep = (
    (meta_style_per_country['name'] > 100)
    & (meta_style_per_country['country'] != 'DE')
    & (meta_style_per_country['meta_style'] != 'Fruit')
    & (meta_style_per_country['meta_style'] != 'Pilsner')
)
meta_style_per_country = meta_style_per_country[rows_to_keep]

# grouped bars: one group per meta_style, one bar colour per country
sns.catplot(
    data=meta_style_per_country,
    x="meta_style",
    y="ave_rating",
    hue="country",
    kind="bar",
    height=10,
    aspect=2,
    palette="muted",
)