In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read every raw CSV export into its own DataFrame
raw_csv_paths = [
    'data/raw/reviews.csv',
    'data/raw/breweries.csv',
    'data/raw/beers.csv',
    'data/raw/beer_data_set.csv',
]
reviews, breweries, beers, aromas = (pd.read_csv(csv_path) for csv_path in raw_csv_paths)

In [None]:
# Inner-join each review with its beer (reviews.beer_id == beers.id)
reviews_beers = reviews.merge(beers, how='inner', left_on='beer_id', right_on='id')

In [None]:
reviews_beers.head()

In [None]:
# Coarse "metastyle" families: each family lists the keywords that, when found
# inside a beer's style label, assign the beer to that family.
# The insertion order matters: the first family with a matching keyword wins.
metastyle_beer_dict = {
    'IPA': ['IPA', 'DDHIPA', 'NEIPA'],
    'Ale': ['Ale'],
    'Sour': ['Lambic', 'Sour', 'Kvass', 'Gueuze', 'Flanders', 'Gose', 'Sahti', 'Brett', 'Saison', 'Fruit'],
    'Lager': ['Lager', 'Pilsner', 'Bock', 'Chile', 'Happoshu', 'Pilsener', 'Helles', 'Oktoberfest', 'kölsch'],
    'Stout': ['Stout', 'Porter'],
    'Smoked Beer': ['Smoked', 'Smoke', 'Rauchbier'],
    'Winter Beer': ['Winter', 'Christmas', 'Pumpkin', 'spice'],
    'Alcohol-free': ['Low Alcohol Beer'],
    'Belgian Blonde': ['Dubbel', 'Quadrupel', 'Tripel', 'Belgian'],
    'Wheat Beer': ['Wheat', 'Weissbier', 'Witbier', 'Hefeweizen', 'Berliner'],
    'Ambree': ['Altbier', 'Rye'],
    'Boozy': ['Barleywine', 'Scotch', 'Scottish', 'Champagne', 'Braggot', 'Liquor'],
}

# Lower-cased copy of the keyword lists, used for case-insensitive matching
metastyle_beer_dict_lower = {
    metastyle: [keyword.lower() for keyword in keywords]
    for metastyle, keywords in metastyle_beer_dict.items()
}


def find_metastyle(specific_style):
    """Return the metastyle whose keyword occurs in ``specific_style``.

    ``specific_style`` must already be a lower-case string. Families are
    tried in the order of ``metastyle_beer_dict_lower`` and the first one
    with a keyword contained in the style wins; 'Other' is returned when
    nothing matches.

    Matching is a plain substring test, so a short keyword also matches
    inside longer words (e.g. 'ale' is found in 'pale lager').
    """
    matching_metastyles = (
        metastyle
        for metastyle, keywords in metastyle_beer_dict_lower.items()
        if any(keyword in specific_style for keyword in keywords)
    )
    return next(matching_metastyles, 'Other')

In [None]:
# Turn the style labels into lower-case strings so they can be compared with
# the lower-cased keywords that find_metastyle looks for
style_as_text = reviews_beers['style'].astype(str)
reviews_beers['style'] = style_as_text.str.lower()

# Attach the metastyle of the reviewed beer to every review
reviews_beers['meta_style'] = reviews_beers['style'].map(find_metastyle)
reviews_beers.sample(5)

In [None]:
# Print the number of beers in each metastyle
print(reviews_beers['meta_style'].value_counts())
# Print the styles in other metastyle and sort
print(reviews_beers[reviews_beers['meta_style'] == 'Other']['style'].value_counts().sort_values(ascending=False))
reviews_beers.head()

In [None]:
# group by beer id and keep the metastyle
from scipy.stats import mode

def get_majority(series):
    """Return the most frequent non-missing value of ``series``.

    Used as a groupby aggregation for the string columns ``country`` and
    ``state``. Missing values are ignored. When several values tie, the
    first one in pandas' sorted mode result is returned. ``'N/A'`` is
    returned when there is no non-missing value at all.

    The previous implementation relied on ``scipy.stats.mode(...)[0][0]``,
    which no longer works on recent SciPy releases (scalar result, string
    input rejected); its bare ``except`` turned that failure into 'N/A'
    for every group.
    """
    modes = pd.Series(series).mode(dropna=True)
    if modes.empty:
        return 'N/A'  # Default value when mode is not available
    return modes.iloc[0]
    
# Expose the review score under the column name used by the aggregations
reviews_beers['review_scores'] = reviews_beers['score']

# One row per beer: mean score, the first metastyle seen, and the majority
# country / state among that beer's rows
per_beer_aggregations = {
    'review_scores': 'mean',
    'meta_style': 'first',
    'country': get_majority,
    'state': get_majority,
}
reviews_beers_grouped = reviews_beers.groupby('beer_id').agg(per_beer_aggregations)
reviews_beers_grouped.head(5)

In [None]:
reviews_beers_grouped.head(20)

In [None]:
# Per-beer table with a plain positional index; the beer_id index is discarded
reviews_beers_grouped_copy = reviews_beers_grouped.reset_index(drop=True)
reviews_beers_grouped_copy

In [None]:
# Two working copies of the per-beer score: one is averaged, the other counted
reviews_beers_grouped_copy['mean_rating'] = reviews_beers_grouped_copy['review_scores']
reviews_beers_grouped_copy['count_rating'] = reviews_beers_grouped_copy['review_scores']
# Extract us beers
us_beers = reviews_beers_grouped_copy[reviews_beers_grouped_copy['country'] == 'US']
# Groupby state and metastyle: mean per-beer rating and number of beers
us_beers_grouped = (
    us_beers
    .groupby(['state', 'meta_style'])
    .agg({'mean_rating': 'mean', 'count_rating': 'count'})
    .reset_index()
)
# "relative count": share of a state's beers that belong to each metastyle.
# transform('sum') keeps the original row index, so the division aligns row by
# row; groupby(...).apply(lambda x: x / x.sum()) returns a group-keyed index on
# pandas >= 2 and can no longer be assigned back as a column.
state_totals = us_beers_grouped.groupby('state')['count_rating'].transform('sum')
us_beers_grouped['relative_count'] = us_beers_grouped['count_rating'] / state_totals
us_beers_grouped.head(5)

In [None]:
# print the range of relative_count
print(us_beers_grouped['relative_count'].min())
print(us_beers_grouped['relative_count'].max())

In [None]:
# Keep only countries with more than 10 beers, then only European countries.
# Besides the EU/EFTA codes the list includes Andorra, Isle of Man, Albania, Macedonia, Moldova,
# San Marino, Faroe Islands, Bosnia and Herzegovina, Montenegro, Belarus, Greece, Monaco, Ukraine,
# United Kingdom and Serbia.

european_country_codes = ['BE', 'BG', 'CZ', 'DK', 'DE', 'EE', 'IE', 'EL', 'ES', 'FR', 'HR', 'IT', 'CY', 'LV', 'LT', 'LU', 'HU', 'MT', 'NL',
                           'AT', 'PL', 'PT', 'RO', 'SI', 'SK', 'FI', 'SE', 'UK', 'IS', 'LI', 'NO', 'CH', 'ME', 'RS', 'MK', 'AL', 'BA', 'MD',
                            'UA', 'BY', 'AD', 'IM', 'SM', 'FO', 'GR', 'MC', 'GB']


reviews_beers_grouped_reduced = reviews_beers_grouped_copy.groupby('country').filter(lambda x: len(x) > 10)
reviews_beers_grouped_reduced = reviews_beers_grouped_reduced[reviews_beers_grouped_reduced['country'].isin(european_country_codes)]

# Mean per-beer rating and number of beers for every (country, metastyle) pair
country_beers = (
    reviews_beers_grouped_reduced
    .groupby(['country', 'meta_style'])
    .agg({'mean_rating': 'mean', 'count_rating': 'count'})
    .reset_index()
)

# Share of a country's beers that belong to each metastyle.
# transform('sum') keeps the row index aligned with country_beers, unlike
# groupby(...).apply(...), whose result is group-keyed on pandas >= 2.
country_totals = country_beers.groupby('country')['count_rating'].transform('sum')
country_beers['relative_count'] = country_beers['count_rating'] / country_totals
country_beers.head(5)

In [None]:
# Display names for the two-letter codes listed in european_country_codes.
# Two countries appear under two codes each: 'EL'/'GR' (Greece) and
# 'UK'/'GB' (United Kingdom).
eu_dict_code_to_name = {
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'CZ': 'Czech Republic',
    'DK': 'Denmark',
    'DE': 'Germany',
    'EE': 'Estonia',
    'IE': 'Ireland',
    'EL': 'Greece',
    'ES': 'Spain',
    'FR': 'France',
    'HR': 'Croatia',
    'IT': 'Italy',
    'CY': 'Cyprus',
    'LV': 'Latvia',
    'LT': 'Lithuania',
    'LU': 'Luxembourg',
    'HU': 'Hungary',
    'MT': 'Malta',
    'NL': 'Netherlands',
    'AT': 'Austria',
    'PL': 'Poland',
    'PT': 'Portugal',
    'RO': 'Romania',
    'SI': 'Slovenia',
    'SK': 'Slovakia',
    'FI': 'Finland',
    'SE': 'Sweden',
    'UK': 'United Kingdom',
    'IS': 'Iceland',
    'LI': 'Liechtenstein',
    'NO': 'Norway',
    'CH': 'Switzerland',
    'ME': 'Montenegro',
    'RS': 'Serbia',
    'MK': 'Macedonia',
    'AL': 'Albania',
    'BA': 'Bosnia and Herzegovina',
    'MD': 'Moldova',
    'UA': 'Ukraine',
    'BY': 'Belarus',
    'AD': 'Andorra',
    'IM': 'Isle of Man',
    'SM': 'San Marino',
    'FO': 'Faroe Islands',
    'GR': 'Greece',
    'MC': 'Monaco',
    'GB': 'United Kingdom'
}

# Display names for the US state codes (50 states plus 'DC').
# Several codes also exist in eu_dict_code_to_name with a different meaning
# (AL, DE, MD, ME, MT), so the two dictionaries must stay separate.
state_dict_code_to_name = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California', 
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
    'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri',
    'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
    'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
    'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
    'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
    'DC': 'District of Columbia'
}
# Sanity checks. Both compare sizes only, not the codes themselves:
# - the name dictionary has one entry per code in european_country_codes
# - the number of state names equals the number of distinct states present in the data
# NOTE(review): the second check reads the 'state' column, which a later cell
# renames to 'state_code'; re-running this cell after that rename would fail.
print("check if same length eu: ", len(eu_dict_code_to_name) == len(european_country_codes))
print("check if same length us: ", len(state_dict_code_to_name) == len(us_beers_grouped['state'].unique()))

In [None]:
# The location columns hold two-letter codes, so name them accordingly
country_beers.rename({'country': 'country_code'}, axis='columns', inplace=True)
us_beers_grouped.rename({'state': 'state_code'}, axis='columns', inplace=True)

In [None]:
us_beers_grouped.head(5)

In [None]:
# Distribution of the mean rating per (region, metastyle) group, Europe vs. US.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay and density scaling is its documented replacement.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(country_beers['mean_rating'], kde=True, stat='density', ax=ax[0])
sns.histplot(us_beers_grouped['mean_rating'], kde=True, stat='density', ax=ax[1])
ax[0].set_title('European Beers')
ax[1].set_title('US Beers')
plt.show()


In [None]:
# Distribution of the metastyle share (relative_count) per region, Europe vs. US.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay and density scaling is its documented replacement.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(country_beers['relative_count'], kde=True, stat='density', ax=ax[0])
sns.histplot(us_beers_grouped['relative_count'], kde=True, stat='density', ax=ax[1])
ax[0].set_title('European Beers')
ax[1].set_title('US Beers')
plt.show()

In [None]:
# Write both aggregated tables to CSV files in the working directory
exports = [
    (country_beers, 'country_beers.csv'),
    (us_beers_grouped, 'us_beers.csv'),
]
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)

In [None]:
print(country_beers['relative_count'].min())
print(country_beers['relative_count'].max())
# print the max row
country_beers[country_beers['relative_count'] == country_beers['relative_count'].max()]

In [None]:
# Copy the review score into the column that gets aggregated
reviews_beers['review_scores'] = reviews_beers['score']

# Mean score and number of reviews for every (country, metastyle) pair
reviews_by_country_style = reviews_beers.groupby(['country', 'meta_style'])
grouped_reviews = reviews_by_country_style.agg({'review_scores': ['mean', 'count']})
grouped_reviews

In [None]:
reviews_beers.columns

In [None]:
# Drop the columns that are not needed for the per-beer averages
reviews_beers = reviews_beers.drop(['username', 'date', 'text', 'look', 'smell', 'taste', 'feel', 'score', 'id', 'name', 'brewery_id', 'availability', 'notes', 'retired'], axis=1)
# Average the numeric columns per beer_id and turn beer_id back into a column.
# numeric_only=True is required on pandas >= 2, where averaging the remaining
# text columns (e.g. style, meta_style) raises instead of being skipped silently.
# NOTE: this cell overwrites reviews_beers, so it cannot be re-run without
# re-running the cells that build reviews_beers first.
reviews_beers = reviews_beers.groupby('beer_id').mean(numeric_only=True).reset_index()
reviews_beers.head()

In [None]:
# Groupby beers
reviews_beers.columns

In [None]:
# merge reviews_beers and breweries
# NOTE(review): this joins reviews_beers.beer_id against breweries.id. Elsewhere
# in this notebook breweries.id is matched with beers.brewery_id, so the keys
# here look like different identifiers (and brewery_id was dropped from
# reviews_beers in an earlier cell) — confirm the join is intended.
reviews_beers_breweries = pd.merge(reviews_beers, breweries, left_on='beer_id', how='inner', right_on='id')

In [None]:
# Attach brewery information to every beer; clashing column names get the
# '_beer' / '_brewery' suffixes
merged_beers = beers.merge(
    breweries,
    how='inner',
    left_on='brewery_id',
    right_on='id',
    suffixes=('_beer', '_brewery'),
)

# Match beers with the aroma data set by beer name
merged_aromas = merged_beers.merge(aromas, how='inner', left_on='name_beer', right_on='Name')

# A name match only counts when the brewery name agrees as well
same_brewery = merged_aromas['Brewery'] == merged_aromas['name_brewery']
redundant_columns = ['Brewery', 'Name', 'id_brewery', 'state_beer', 'country_beer']
merged_aromas = (
    merged_aromas[same_brewery]
    .drop(redundant_columns, axis=1)
    .rename(columns={'state_brewery': 'state', 'country_brewery': 'country', 'id_beer': 'beer_id'})
)

In [None]:
# merge similar beer styles together in a meta_style column

# NOTE: this import rebinds metastyle_beer_dict, and the function below
# replaces the find_metastyle defined earlier in the notebook.
from utils.beer_metastyles import metastyle_beer_dict

# Create a function to find the metastyle

def find_metastyle(specific_style):
    """Return the first metastyle with a keyword contained in ``specific_style``.

    Metastyles are tried in the order of ``metastyle_beer_dict``; 'Other' is
    returned when no keyword matches. The comparison is a case-sensitive
    substring test.

    NOTE(review): unlike the earlier definition, neither the keywords nor the
    style are lower-cased here — confirm that the imported dictionary and
    merged_aromas['style'] use the same casing.
    """
    for metastyle, beer_substyles in metastyle_beer_dict.items():
        for beer_substyle_keyword in beer_substyles:
            if beer_substyle_keyword in specific_style:
                return metastyle

    return 'Other'

# Iterate through the styles to find the metastyle.
# (The former placeholder assignment meta_style = style was removed: it was
# overwritten by this line before ever being read.)
merged_aromas['meta_style'] = merged_aromas['style'].apply(find_metastyle)

In [None]:
# Standardise every aroma column of a copy: subtract the column mean and
# divide by the column standard deviation
merged_aromas_norm = merged_aromas.copy()
for aroma_column in ('Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour',
                     'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty'):
    column_values = merged_aromas_norm[aroma_column]
    merged_aromas_norm[aroma_column] = (column_values - column_values.mean()) / column_values.std()

In [None]:
# Rescale every aroma column linearly so its minimum becomes 0 and its maximum 1
for aroma_column in ('Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour',
                     'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty'):
    column_values = merged_aromas_norm[aroma_column]
    merged_aromas_norm[aroma_column] = (column_values - column_values.min()) / (column_values.max() - column_values.min())

In [None]:
merged_aromas_norm.describe()

In [None]:
merged_aromas_norm

In [None]:
merged_aromas.head()

In [None]:
merged_aromas.describe()

In [None]:
merged_aromas.columns

In [None]:
# print the number of unique beers in reviews
print('Number of unique beers in reviews: ', len(reviews['beer_id'].unique()))

In [None]:
# Attach the normalised aroma profile of the reviewed beer to every review
merged_reviews = reviews.merge(merged_aromas_norm, how='inner', on='beer_id')

In [None]:
merged_reviews.head()

In [None]:
merged_reviews.columns

In [None]:
# Average the (un-normalised) beer attributes of merged_aromas per
# (state, metastyle) for US beers and per (country, metastyle) for all beers.
# numeric_only=True is required on pandas >= 2, where averaging text columns
# raises instead of being skipped silently.
us_meta_style_aromas = (
    merged_aromas[merged_aromas['country'] == 'US']
    .groupby(['state', 'meta_style'])
    .mean(numeric_only=True)
)
country_meta_style_aromas = merged_aromas.groupby(['country', 'meta_style']).mean(numeric_only=True)

# Averages of identifiers, keys and ABV are not wanted in the result
non_aroma_columns = ['brewery_id', 'beer_id', 'key', 'Style Key', 'abv']
us_meta_style_aromas = us_meta_style_aromas.drop(non_aroma_columns, axis=1)
country_meta_style_aromas = country_meta_style_aromas.drop(non_aroma_columns, axis=1)
country_meta_style_aromas

In [None]:
us_meta_style_aromas

In [None]:
merged_reviews['overall']

In [None]:
# Snap the overall rating to the nearest half point: double, round, halve
doubled_overall = merged_reviews['overall'] * 2
merged_reviews['overall_step'] = np.round(doubled_overall) / 2

In [None]:
merged_reviews.columns

In [None]:
# Build the table behind the violin plot
styles_to_analyze = ['Lager', 'Stout', 'IPA']
# Astringency, Body, Alcohol, Bitter, Sweet, Sour, Salty; Fruits, Hoppy, Spices, Malty

# Columns that play no role in the violin plot
columns_unrelated_to_plot = [
    'Alcohol', 'Body', 'Style', 'style', 'taste', 'feel', 'score', 'username',
    'text', 'smell', 'date', 'look', 'brewery_id', 'beer_id', 'key',
    'Style Key', 'abv', 'state', 'country', 'overall', 'retired',
    'Description', 'notes_brewery', 'city', 'name_brewery', 'availability',
    'Min IBU', 'Max IBU', 'name_beer', 'notes_beer', 'types',
]

# Only reviews with an overall rating of at least 3 are kept
rated_three_or_more = merged_reviews['overall'] >= 3
reduced_df = merged_reviews[rated_three_or_more].drop(columns_unrelated_to_plot, axis=1)
reduced_df.columns

In [None]:
aromas_to_analyze = ['Astringency', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty']

# One table per aroma: a 20-bin histogram of the aroma values for every
# (meta_style, overall_step) group, split into counts and bin edges
df_s = []

for aroma in aromas_to_analyze:
    grouped_aroma = reduced_df.groupby(['meta_style', 'overall_step'])[aroma]
    df = grouped_aroma.apply(lambda x: np.histogram(x, bins=20)).reset_index()
    # np.histogram returns (counts, bin_edges); store each part in its own column
    hist_data = pd.DataFrame({
        'Histogram': df[aroma].apply(lambda x: x[0]),
        'Bins': df[aroma].apply(lambda x: x[1]),
    })
    hist_data = hist_data.reset_index(drop=True)
    hist_data['Aroma'] = aroma
    df_s.append(hist_data)

df_s[8]

In [None]:
# Rebuild df_s with one table per (metastyle, aroma) pair, in the order
# "for style in keys, for aroma in aromas_to_analyze". Each table holds a
# 20-bin histogram of the aroma values per overall_step.
df_s = []
# All metastyles except 'Alcohol-free'; filtering instead of list.remove()
# avoids a ValueError when that key is absent from the dictionary.
keys = [style for style in metastyle_beer_dict.keys() if style != 'Alcohol-free']
for style in keys:
    # Filter once per style instead of once per (style, aroma) pair
    style_rows = reduced_df[reduced_df['meta_style'] == style]
    for aroma in aromas_to_analyze:
        df = style_rows.groupby(['overall_step'])[aroma].apply(lambda x: np.histogram(x, bins=20)).reset_index()
        # np.histogram returns (counts, bin_edges); keep each part in its own column
        hist_values = df[aroma].apply(lambda x: x[0])
        hist_bins = df[aroma].apply(lambda x: x[1])
        hist_data = pd.concat([hist_values, hist_bins], axis=1)
        hist_data.columns = ['Histogram', 'Bins']
        hist_data = hist_data.reset_index(drop=True)
        df_s.append(hist_data)

In [None]:
keys

In [None]:
# df_s was filled in "style, then aroma" order, so position i of the flattened
# (style, aroma) pairs identifies the matching histogram table
style_aroma_pairs = [(style, aroma) for style in keys for aroma in aromas_to_analyze]
for i, (style, aroma) in enumerate(style_aroma_pairs):
    df_s[i].to_csv('data/website_preparation/vio/hist_{}_{}.csv'.format(style, aroma), index=False)
# keep the counter pointing one past the last written table
i = len(style_aroma_pairs)

In [None]:
# Write every labelled histogram table to its own CSV file.
# The original condition (df.empty != False) selected only EMPTY frames, on
# which .iloc[0] cannot work; the intent is to skip them. Frames without the
# 'Aroma' / 'Meta Style' label columns (as built by the per-style loop above)
# are skipped as well, because the file name cannot be derived for them.
label_columns = ['Aroma', 'Meta Style']
for df in df_s:
    if df.empty or not set(label_columns).issubset(df.columns):
        continue
    aroma = df['Aroma'].iloc[0]
    meta_style = df['Meta Style'].iloc[0]
    df.drop(label_columns, axis=1).to_csv('data/website_preparation/vio/aroma_hist_{}_{}.csv'.format(meta_style, aroma), index=False)

In [None]:
# Export the filtered review table used for the violin plot (index not written)
reduced_df.to_csv('data/website_preparation/violin_df.csv', index=False)

In [None]:
# NOTE(review): hist_data is the loop variable left over from the histogram
# cells above, i.e. only the table of the last iteration that ran — confirm
# this is the table meant to be exported here.
hist_data.to_csv('data/website_preparation/violin_grouped_df.csv', index=False)

In [None]:
styles_to_analyze = ['Lager', 'Stout', 'IPA']
# Available aromas: Astringency, Body, Alcohol, Bitter, Sweet, Sour, Salty, Fruits, Hoppy, Spices, Malty
aroma_to_analyze = 'Sour'

# Aroma intensity against the half-point rating step, one hue per metastyle.
# reduced_df already contains only reviews with overall >= 3; it is NOT
# restricted to styles_to_analyze here.
fig, axs = plt.subplots(figsize=(15, 10))
shared_plot_args = dict(x='overall_step', y=aroma_to_analyze, hue='meta_style', data=reduced_df, ax=axs)
# Violins show the distribution, the point plot overlays the per-group estimate
sns.violinplot(**shared_plot_args)
sns.pointplot(dodge=True, join=True, palette='dark', markers='d', scale=1.5, **shared_plot_args)
plt.title('Aroma vs. Rating')
plt.xlabel('Rating')

plt.show()


In [None]:
from psmpy import PsmPy
from psmpy.functions import cohenD
from psmpy.plotting import *

In [None]:
merged_aromas.columns

In [None]:
# Mean normalised aroma profile of every metastyle.
# The aroma columns are selected BEFORE averaging: on pandas >= 2 calling
# .mean() on the whole grouped frame raises because of its text columns.
aroma_columns = ['Astringency', 'Body', 'Alcohol', 'Bitter', 'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty']
avg_aromas_per_style = merged_aromas_norm.groupby('meta_style')[aroma_columns].mean()
avg_aromas_per_style

In [None]:
# An aroma counts as "prevalent" for a style when its mean normalised value
# reaches the threshold: values below become 0, values at or above become 1
threshold = 0.3
prevalent_aromas_per_style = avg_aromas_per_style.mask(avg_aromas_per_style < threshold, 0)
prevalent_aromas_per_style = prevalent_aromas_per_style.mask(prevalent_aromas_per_style >= threshold, 1)

# Print the prevalent aromas of every style and collect their names
names_that_appear = []
for style in avg_aromas_per_style.index:
    is_prevalent = prevalent_aromas_per_style.loc[style] == 1
    prevalent_names = prevalent_aromas_per_style.columns[is_prevalent].tolist()
    names_that_appear.extend(prevalent_names)
    print(style, prevalent_names)

# Distinct aromas that are prevalent in at least one style
names_that_appear = list(set(names_that_appear))
names_that_appear

In [None]:
# For every style, list the aroma(s) whose mean value equals the row maximum
most_prevalent_aroma = []
for style in prevalent_aromas_per_style.index:
    style_profile = avg_aromas_per_style.loc[style]
    strongest_aromas = avg_aromas_per_style.columns[style_profile == style_profile.max()].tolist()
    most_prevalent_aroma.append(strongest_aromas)
    print(style, strongest_aromas)

In [None]:
# Display colour of every aroma ('Bitter' and 'Spices' share 'orange')
dict_colour_aromas = {
    'Astringency': 'black',
    'Body': 'blue',
    'Alcohol': 'green',
    'Bitter': 'orange',
    'Sweet': 'purple',
    'Sour': 'yellow',
    'Salty': 'white',
    'Fruits': 'red',
    'Hoppy': 'olive',
    'Spices': 'orange',
    'Malty': 'pink',
}

# Colour of each style = colour of its first most-prevalent aroma; the style's
# position in avg_aromas_per_style.index selects its entry in most_prevalent_aroma
style_order = avg_aromas_per_style.index.tolist()
colour_per_beer_type = [
    dict_colour_aromas[most_prevalent_aroma[style_order.index(style)][0]]
    for style in prevalent_aromas_per_style.index
]

In [None]:
# Colour legend: one small panel per aroma, filled with the aroma's colour and
# titled with its name. The 3x4 grid has one panel more than there are aromas,
# so a blank white panel is appended to fill the last slot.
fig, axs = plt.subplots(3,4,figsize=(3, 3))
axs = axs.ravel()
swatches = list(dict_colour_aromas.items()) + [(' ', 'white')]
for i, (aroma, colour) in enumerate(swatches):
    panel = axs[i]
    panel.set_facecolor(colour)
    panel.set_title(aroma)
    # no ticks and no grid lines: the panel is only a colour patch
    panel.set_xticks([])
    panel.set_yticks([])
    panel.grid(False)
plt.tight_layout()

In [None]:
# Empty style x style table; every cell is later filled with a list holding one
# distance per aroma column (see compute_similarity below)
style_similarity = pd.DataFrame(index=avg_aromas_per_style.index, columns=avg_aromas_per_style.index)

# import cohenD from psmpy.functions
# NOTE(review): cohenD was already imported in an earlier cell and is not used
# in this cell.
from psmpy.functions import cohenD

def distance(val1, val2, metric = 'euclidean'):
    """Return the distance between ``val1`` and ``val2``.

    Parameters
    ----------
    val1, val2 : scalar or array-like of equal shape
        The two values / vectors to compare.
    metric : {'euclidean', 'manhattan', 'cosine', 'jaccard'}
        - 'euclidean': square root of the summed squared differences
        - 'manhattan': sum of absolute differences
        - 'cosine': one minus the cosine similarity
        - 'jaccard': one minus sum(min) / sum(max) (weighted Jaccard)

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names. Previously the
        function fell through and silently returned ``None``.
    """
    if metric == 'euclidean':
        return np.sqrt(np.sum(np.square(val1 - val2)))
    elif metric == 'manhattan':
        return np.sum(np.abs(val1 - val2))
    elif metric == 'cosine':
        return 1 - np.dot(val1, val2) / (np.sqrt(np.dot(val1, val1)) * np.sqrt(np.dot(val2, val2)))
    elif metric == 'jaccard':
        return 1 - np.sum(np.minimum(val1, val2)) / np.sum(np.maximum(val1, val2))
    raise ValueError(
        "Unsupported metric {!r}; expected 'euclidean', 'manhattan', 'cosine' or 'jaccard'".format(metric)
    )
    

def compute_similarity(style1, style2):
    """Return the per-aroma distances between two styles.

    Looks up the mean aroma profiles of ``style1`` and ``style2`` in
    ``avg_aromas_per_style`` and returns a list with one euclidean distance
    per aroma column, in column order.
    """
    profile_a = avg_aromas_per_style.loc[style1]
    profile_b = avg_aromas_per_style.loc[style2]
    return [
        distance(profile_a[aroma], profile_b[aroma], metric='euclidean')
        for aroma in avg_aromas_per_style.columns
    ]

# Fill every (style1, style2) cell with the list of per-aroma distances.
# .at addresses exactly one cell, so the list is stored as a single object;
# with .loc a list-like value can be interpreted as several values to spread
# over the selection.
for style1 in avg_aromas_per_style.index:
    for style2 in avg_aromas_per_style.index:
        style_similarity.at[style1, style2] = compute_similarity(style1, style2)


In [None]:

# Collapse the per-aroma distance list of every cell into its mean
style_similarity_mean_per_cell = style_similarity.applymap(np.mean)
# invert the values (the lower the distance, the more similar the styles);
# a distance of 0 (a style compared with itself) becomes a missing value
style_similarity_mean_per_cell = style_similarity_mean_per_cell.applymap(lambda x: 1/x if x != 0 else None)

# Drop the Other column and Row
style_similarity_mean_per_cell = style_similarity_mean_per_cell.drop('Other', axis=0)
style_similarity_mean_per_cell = style_similarity_mean_per_cell.drop('Other', axis=1)

# The statistics below are computed ONCE per step and applied to the whole
# frame; the previous applymap lambdas recomputed them for every single cell.

# centre on the grand mean and scale by the spread of the column std-devs
# (this affine step is cancelled out by the min-max rescaling that follows)
grand_mean = style_similarity_mean_per_cell.mean().mean()
spread = style_similarity_mean_per_cell.std().std()
style_similarity_mean_per_cell = (style_similarity_mean_per_cell - grand_mean) / spread

# min-max rescale to the range [0, 1]
lowest = style_similarity_mean_per_cell.min().min()
highest = style_similarity_mean_per_cell.max().max()
style_similarity_mean_per_cell = (style_similarity_mean_per_cell - lowest) / (highest - lowest)

# centre on the mean of the column means, so values spread around 0
# (the total range stays 1 wide, it is not [-1, 1])
centre = style_similarity_mean_per_cell.mean().mean()
style_similarity_mean_per_cell = style_similarity_mean_per_cell - centre

# plot value distribution
plt.hist(style_similarity_mean_per_cell.values.flatten(), bins=20)
plt.show()

In [None]:
# Annotated heatmap of the normalised style-to-style similarity
fig, ax = plt.subplots(figsize=(10,10))
heatmap_options = dict(cmap='viridis', annot=True, fmt='.2f', linewidths=.5, cbar=False)
sns.heatmap(style_similarity_mean_per_cell, ax=ax, **heatmap_options)
plt.show()

In [None]:
import networkx as nx

# create a network graph; a MultiGraph allows several parallel edges between
# the same two styles (here: one per aroma)
G = nx.MultiGraph()

# add nodes, the size is the number of beers per style, the colour is the colour_per_beer_type
for style in avg_aromas_per_style.index:
    if style == 'Other':
        continue
    size = len(merged_aromas_norm[merged_aromas_norm['meta_style'] == style])
    colour = colour_per_beer_type[avg_aromas_per_style.index.tolist().index(style)]
    G.add_node(style, size=size, colour=colour)

# add edges: one edge per aroma, coloured with the aroma's colour; the width is
# the per-aroma distance divided by the normalised mean similarity of the pair
# NOTE(review): both (a, b) and (b, a) are visited, and style1 == style2 is not
# skipped, so every pair is added twice and self-loops may be created — confirm
# this is intended.

for style1 in style_similarity_mean_per_cell.index:
    for style2 in style_similarity_mean_per_cell.index:
        # i walks through the per-aroma list stored in style_similarity;
        # assumes dict_colour_aromas lists the aromas in the same order as the
        # columns of avg_aromas_per_style — TODO confirm
        i = 0
        for aroma, col in dict_colour_aromas.items():
            width = style_similarity.loc[style1, style2][i]
            # normalise the width (the cell is a single value, so .max() returns it unchanged)
            width = width / style_similarity_mean_per_cell.loc[style1, style2].max()

            # a missing (NaN) width also passes this test, because NaN != 0
            if width != 0:
                G.add_edge(style1, style2, width=width, colour=col)

            i += 1
    
# draw the graph with a force-directed (spring) layout
plt.figure(figsize=(10,10))
pos = nx.spring_layout(G, k=0.5, iterations=50)
node_size = [G.nodes[node]['size'] for node in G]
node_colour = [G.nodes[node]['colour'] for node in G]

# per-edge attributes, read in the iteration order of G.edges
edge_width = [G.edges[edge]['width'] for edge in G.edges]
edge_colour = [G.edges[edge]['colour'] for edge in G.edges]

nx.draw(G, pos, node_size=node_size, node_color=node_colour, edge_color=edge_colour, width=edge_width, with_labels=True)
plt.show()

In [None]:
import networkx as nx

# create a network graph with at most one edge per pair of styles
G = nx.Graph()

# add nodes, the size is the number of beers per style, the colour is the colour_per_beer_type
for style in avg_aromas_per_style.index:
    if style == 'Other':
        continue
    size = len(merged_aromas_norm[merged_aromas_norm['meta_style'] == style])
    colour = colour_per_beer_type[avg_aromas_per_style.index.tolist().index(style)]
    G.add_node(style, size=size, colour=colour)

# add edges, the width is the similarity between the styles
# NOTE(review): the similarity values were mean-centred, so widths can be
# negative, and a style paired with itself has a missing value that still
# passes the `!= 0` test — confirm how these should be drawn.

for style1 in style_similarity_mean_per_cell.index:
    for style2 in style_similarity_mean_per_cell.index:
        width = style_similarity_mean_per_cell.loc[style1, style2]

        if width != 0:
            # each pair is visited in both orders; in a Graph the second visit
            # updates the attributes of the same edge
            G.add_edge(style1, style2, width=width*10)

# draw the graph
plt.figure(figsize=(10,10))
# pos is computed here but not passed to draw_circular below
pos = nx.spring_layout(G, k=0.5, iterations=50)
node_size = [G.nodes[node]['size'] for node in G]
node_colour = [G.nodes[node]['colour'] for node in G]

edge_width = [G.edges[edge]['width'] for edge in G.edges]

# draw in circular_layout
nx.draw_circular(G, node_size=node_size, node_color=node_colour, width=edge_width, with_labels=True)
plt.show()



In [None]:
# One row per style: number of beers (bubble size) and display colour
style_order = avg_aromas_per_style.index
beers_per_style = [
    len(merged_aromas_norm[merged_aromas_norm['meta_style'] == style])
    for style in style_order
]
colour_of_style = [
    colour_per_beer_type[style_order.tolist().index(style)]
    for style in style_order
]
bubble_data = pd.DataFrame({'size': beers_per_style, 'colour': colour_of_style}, index=style_order)
# The catch-all 'Other' style is not shown
bubble_data = bubble_data.drop('Other', axis=0)
bubble_data

In [None]:
style_similarity_mean_per_cell

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Bubble id (row position in the similarity matrix) -> style name
id_to_style = dict(enumerate(style_similarity_mean_per_cell.index))

# Starting layout on a 4 x 5 grid: each cell holds the id of the bubble placed
# there (ids 0 to 10), or None for an empty slot
initial_index = np.array([
    [None, None, 3, None, 8],
    [None, None, 4, 10, 2],
    [7, 9, 6, 1, 5],
    [None, None, None, 0, None],
])

class Bubble:
    """A circle with a position, an area and a table of forces towards other bubbles.

    Attributes
    ----------
    id : index of this bubble; other bubbles use it to look up their force towards it
    x, y : coordinates of the centre
    area : area of the circle
    radius : radius derived from ``area`` (sqrt(area / pi))
    force : container indexed by another bubble's id, giving the force towards it
    """

    def __init__(self, id, x, y, area, forces):
        self.id = id
        self.x = x
        self.y = y
        self.area = area
        self.radius = np.sqrt(area / np.pi)
        self.force = forces

    def force_to(self, other):
        """Return this bubble's force entry for ``other``."""
        return self.force[other.id]

    def distance_to(self, other):
        """Return the gap between the two circle edges.

        This is the Euclidean distance between the centres minus both
        radii, so it is negative when the bubbles overlap.
        """
        dx = self.x - other.x
        dy = self.y - other.y
        centre_distance = np.sqrt(dx**2 + dy**2)
        return centre_distance - (self.radius + other.radius)

    def gravity_to(self, other):
        """Return the attraction towards ``other``: force / (3 * gap**3)."""
        gap = self.distance_to(other)
        return self.force_to(other) / (3*gap**3)

    def xy(self):
        """Return the centre as a numpy array ``[x, y]``."""
        return np.array([self.x, self.y])

class GravityBubbleChart:
    """Packed bubble chart whose bubbles are pulled together by pairwise forces.

    Bubbles start on the grid described by the module-level ``initial_index``
    array and are then moved step by step by :meth:`collapse`.
    """

    # Multiplier turning layout-grid coordinates into plot coordinates.
    _GRID_SCALE = 200

    def __init__(self, area, bubble_spacing=0, force = None, seed = 0):
        """
        Setup for bubble collapse.

        Parameters
        ----------
        area : array-like
            Area of the bubbles.
        bubble_spacing : float, default: 0
            Minimal spacing between bubbles after collapsing.
        force : 2-D array-like, optional
            ``force[i][j]`` is the force value between bubble ``i`` and
            bubble ``j``. When omitted, all pairwise forces are zero.
        seed : int, default: 0
            Currently unused: starting positions come from ``initial_index``,
            not from a random draw. Kept so existing calls keep working.

        Raises
        ------
        IndexError
            If a bubble id does not appear in ``initial_index``.

        Notes
        -----
        If "area" is sorted, the results might look weird.
        """
        area = np.asarray(area)

        self.bubble_spacing = bubble_spacing
        self.nb_bubbles = len(area)

        # Without an explicit force matrix no bubble attracts or repels another.
        if force is None:
            force = np.zeros((self.nb_bubbles, self.nb_bubbles))

        # Object array holding one Bubble per entry of ``area``.
        self.bubbles = np.array(
            [Bubble(i, 0, 0, area[i], force[i]) for i in range(self.nb_bubbles)]
        )

        self.maxstep = 2 * np.array([b.radius for b in self.bubbles]).max() + self.bubble_spacing
        self.step_dist = self.maxstep / 2

        # Coordinate grid with the same shape as the starting layout.
        n_rows, n_cols = initial_index.shape
        gx, gy = np.meshgrid(np.linspace(0, n_cols, n_cols), np.linspace(0, n_rows, n_rows))

        for b in self.bubbles:
            rows, cols = np.where(initial_index == b.id)
            if rows.size == 0:
                raise IndexError(
                    "bubble id {} has no cell in initial_index".format(b.id)
                )
            row, col = rows[0], cols[0]
            b.x = gx[row, col] * self._GRID_SCALE
            b.y = gy[row, col] * self._GRID_SCALE

        self.com = self.center_of_mass()

    def center_of_mass(self):
        """Return the area-weighted mean of all bubble centres as ``[x, y]``."""
        return np.average(
            np.array([b.xy() for b in self.bubbles]),
            axis=0,
            weights=np.array([b.area for b in self.bubbles]),
        )

    def center_distance(self, bubble, bubbles):
        """Return the centre-to-centre distance from ``bubble`` to each of ``bubbles``.

        Both arguments may hold Bubble-like objects (with ``x``/``y``) or
        numpy points (``[x, y]`` arrays). An empty ``bubbles`` gives an empty
        array.
        """
        if len(bubbles) == 0:
            return np.array([])

        if isinstance(bubble, np.ndarray):
            bx, by = bubble[0], bubble[1]
        else:
            bx, by = bubble.x, bubble.y

        if isinstance(bubbles[0], np.ndarray):
            xs = np.array([b[0] for b in bubbles])
            ys = np.array([b[1] for b in bubbles])
        else:
            xs = np.array([b.x for b in bubbles])
            ys = np.array([b.y for b in bubbles])

        return np.hypot(bx - xs, by - ys)

    def outline_distance(self, bubble, bubbles):
        """Return the outline-to-outline gaps, reduced by ``bubble_spacing``."""
        center_distance = self.center_distance(bubble, bubbles)
        return center_distance - bubble.radius - \
            np.array([b.radius for b in bubbles]) - self.bubble_spacing

    def check_collisions(self, bubble, bubbles):
        """Return how many of ``bubbles`` are closer to ``bubble`` than the spacing allows."""
        distance = self.outline_distance(bubble, bubbles)
        return len(distance[distance < 0])

    def collides_with(self, bubble, bubbles):
        """Return a one-element list with the index of the nearest of ``bubbles``."""
        distance = self.outline_distance(bubble, bubbles)
        idx_min = np.argmin(distance)
        return idx_min if isinstance(idx_min, np.ndarray) else [idx_min]

    def calculate_distance_error(self):
        """Sum ``gravity_to / outline_distance`` over every ordered pair of bubbles.

        Returns ``0`` when there are fewer than two bubbles; otherwise the
        sum is a numpy array of length 1, because ``outline_distance`` is
        evaluated against a one-element list.
        """
        error = 0
        for i in range(self.nb_bubbles):
            for j in range(self.nb_bubbles):
                if i != j:
                    error += self.bubbles[i].gravity_to(self.bubbles[j]) / self.outline_distance(self.bubbles[i], [self.bubbles[j]])
        return error

    def _step_direction(self, i):
        """Return the unit direction bubble ``i`` wants to move in, or None.

        The direction is the vector from the bubble to the origin plus, for
        every other bubble, the offset to that bubble scaled by ``gravity_to``.
        None is returned when that vector has zero or non-finite length, so
        the caller never normalises it into NaN coordinates.
        """
        current = self.bubbles[i]
        dir_vec = -np.asarray(current.xy(), dtype=float)

        for j in range(self.nb_bubbles):
            if j == i:
                continue
            other = self.bubbles[j]
            attraction_force = current.gravity_to(other)
            dir_vec += attraction_force * (other.xy() - current.xy())

        norm = np.sqrt(dir_vec.dot(dir_vec))
        if not np.isfinite(norm) or norm == 0:
            return None
        return dir_vec / norm

    def collapse(self, n_iterations=50):
        """
        Move bubbles towards the origin, bent by the pairwise forces.

        Parameters
        ----------
        n_iterations : int, default: 50
            Number of moves to perform.
        """

        for _i in range(n_iterations):

            moves = 0

            for i in range(self.nb_bubbles):

                # every bubble except the one being moved
                rest_bub = np.delete(self.bubbles, i, 0)

                dir_vec = self._step_direction(i)
                if dir_vec is None:
                    # nowhere sensible to go this round; leave the bubble in place
                    continue

                # calculate new bubble position
                new_point = self.bubbles[i].xy() + dir_vec * self.step_dist

                new_bubble = Bubble(self.bubbles[i].id, new_point[0], new_point[1], self.bubbles[i].area, self.bubbles[i].force)

                # check whether new bubble collides with other bubbles
                if not self.check_collisions(new_bubble, rest_bub):
                    self.bubbles[i] = new_bubble
                    self.com = self.center_of_mass()
                    moves += 1
                else:
                    # try to move around a bubble that you collide with
                    for colliding in self.collides_with(new_bubble, rest_bub):
                        # direction from the moving bubble to the blocking one
                        dir_vec = rest_bub[colliding].xy() - self.bubbles[i].xy()
                        dir_vec = dir_vec / np.sqrt(dir_vec.dot(dir_vec))
                        # calculate orthogonal vector
                        orth = np.array([dir_vec[1], -dir_vec[0]])
                        # the two sideways candidates
                        new_point1 = (self.bubbles[i].xy() + orth *
                                      self.step_dist)
                        new_point2 = (self.bubbles[i].xy() - orth *
                                      self.step_dist)

                        # keep the candidate that ends up closer to the centre of mass
                        dist1 = self.center_distance(
                            self.com, np.array([new_point1]))
                        dist2 = self.center_distance(
                            self.com, np.array([new_point2]))
                        new_point = new_point1 if dist1 < dist2 else new_point2

                        new_bubble = Bubble(self.bubbles[i].id, new_point[0], new_point[1], self.bubbles[i].area, self.bubbles[i].force)

                        if not self.check_collisions(new_bubble, rest_bub):
                            self.bubbles[i] = new_bubble
                            self.com = self.center_of_mass()

            # fewer than 10% of the bubbles moved directly: take smaller steps
            if moves / self.nb_bubbles < 0.1:
                self.step_dist = self.step_dist / 2

    @staticmethod
    def _positional(values):
        """Return an object on which ``[i]`` means "the i-th element".

        pandas objects index by label with ``[]``, so their ``.iloc`` accessor
        is used; anything else is returned unchanged.
        """
        return values.iloc if hasattr(values, "iloc") else values

    def plot(self, ax, labels, colors):
        """
        Draw the bubble plot.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
        labels : list
            Labels of the bubbles.
        colors : list
            Colors of the bubbles.
        """
        labels = self._positional(labels)
        colors = self._positional(colors)

        for i in range(self.nb_bubbles):
            bubble = self.bubbles[i]

            circ = plt.Circle(bubble.xy(), bubble.radius, color=colors[i])

            ax.add_patch(circ)
            ax.text(*bubble.xy(), labels[i],
                    horizontalalignment='center', verticalalignment='center')

In [None]:
# Build the packed bubble chart: bubble area is the number of beers per style and
# the colour comes from colour_per_beer_type. The similarity table is passed as
# the pairwise force, so that bubbles with high scores end up closer together
# and bubbles with low scores further apart.
bubble_chart = GravityBubbleChart(
    area=bubble_data['size'],
    bubble_spacing=0.1,
    force=style_similarity_mean_per_cell.to_numpy(),
)

In [None]:
# Rebuild the chart, passing the seed whose entry in error_list is the smallest,
# then run 100 collapse iterations.
# NOTE(review): seed_list and error_list are not defined in the cells shown here —
# confirm they still exist on a fresh "Restart & Run All".
# NOTE(review): GravityBubbleChart.__init__ does not read `seed` (its random
# placement code is commented out), so the chosen seed does not change the layout.
bubble_chart = GravityBubbleChart(area=bubble_data['size'], bubble_spacing=0.1, force = style_similarity_mean_per_cell.to_numpy(), seed = seed_list[error_list.index(min(error_list))] )
bubble_chart.collapse(n_iterations=100)

In [None]:
# Draw the collapsed bubble chart on an equal-aspect axis.
fig, ax = plt.subplots(subplot_kw=dict(aspect="equal"), figsize=(12, 12))

# Pass plain lists so that bubble i gets the i-th label and the i-th colour by
# position; bubble_data['colour'] is indexed by style name, and integer keys on
# such a Series are not reliably positional.
# NOTE(review): assumes bubble_data and style_similarity_mean_per_cell list the
# styles in the same order — confirm.
bubble_chart.plot(
    ax,
    style_similarity_mean_per_cell.index.tolist(),
    bubble_data['colour'].tolist(),
)
ax.axis("off")
ax.relim()
ax.autoscale_view()
# The previous title ('Browser market share') was unrelated to this data.
ax.set_title('Beer meta-styles: bubble area = number of beers, proximity = style similarity')

plt.show()

In [None]:
# Annotated heatmap of the style-by-style similarity table (two decimals per cell).
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    style_similarity_mean_per_cell,
    annot=True,
    fmt='.2f',
    cmap='viridis',
    linewidths=.5,
    cbar=False,
    ax=ax,
)
plt.show()

In [None]:
# Count the non-null 'overall' values for every (meta_style, overall_step) pair
# and return the result as a flat table.
grouped_reviews = (
    merged_reviews
    .groupby(['meta_style', 'overall_step'])['overall']
    .count()
    .to_frame()
    .reset_index()
)
grouped_reviews