# Project
## Load data and set up environment

In [1]:
# Import all the libraries
import pandas as pd
import plotly.express as px
from src.utils.plots import *

# Default template and size for the Plotly Express figures
px.defaults.template = 'plotly_white'
px.defaults.width, px.defaults.height = 800, 600

# Folder the processed tables are read from, and folder the plots are saved to
FOLDER = 'data/processed/'
SAVING_FOLDER = 'docs/plots/'

# Read each processed parquet table into its own DataFrame
df_beers = pd.read_parquet(f'{FOLDER}beers.pq')
df_breweries = pd.read_parquet(f'{FOLDER}breweries.pq')
df_users = pd.read_parquet(f'{FOLDER}users.pq')
df_ratings_no_text = pd.read_parquet(f'{FOLDER}ratings_no_text.pq')

## Data presentation
Let's take a look at the data we have in the dataset.

In [2]:
# Preview the first five rows of the beers table (head() defaults to 5 rows)
df_beers.head()

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,abv
0,410549,33 Export (Gabon),3198,Sobraga,Pale Lager,5.0
1,105273,Castel Beer (Gabon),3198,Sobraga,Pale Lager,5.2
2,19445,Régab,3198,Sobraga,Pale Lager,4.5
3,155699,Ards Bally Black Stout,13538,Ards Brewing Co.,Stout,4.6
4,239097,Ards Belfast 366,13538,Ards Brewing Co.,Golden Ale/Blond Ale,4.2


In [3]:
# Report the number of rows in the beers table
print(f"In the platform there are {len(df_beers)} different beers")

In the platform there are 399987 different beers


In [4]:
# Preview the first five rows of the breweries table (head() defaults to 5 rows)
df_breweries.head()

Unnamed: 0,brewery_id,brewery_name,country_brewery,state_brewery
0,3198,Sobraga,Gabon,
1,13538,Ards Brewing Co.,United Kingdom,
2,22304,Barrahooley Craft Brewery,United Kingdom,
3,22818,Boundary,United Kingdom,
4,24297,Brewbot Belfast,United Kingdom,


In [5]:
# Report the number of rows in the breweries table
print(f"In the platform there are {len(df_breweries)} different breweries")

In the platform there are 24189 different breweries


In [6]:
# Preview the first five rows of the users table (head() defaults to 5 rows)
df_users.head()

Unnamed: 0,user_id,user_name,joined,country_user,state_user
0,175852,Manslow,2012-05-20 10:00:00,Poland,
1,442761,MAGICuenca91,2017-01-10 11:00:00,Spain,
2,288889,Sibarh,2013-11-16 11:00:00,Poland,
3,250510,fombe89,2013-03-22 11:00:00,Spain,
4,122778,kevnic2008,2011-02-02 11:00:00,Germany,


In [7]:
# Report the number of rows in the users table
print(f"In the platform there are {len(df_users)} different users")

In the platform there are 50592 different users


In [8]:
# Preview the first five rows of the ratings table (head() defaults to 5 rows)
df_ratings_no_text.head()

Unnamed: 0,date,beer_id,user_id,brewery_id,abv,style,rating,palate,taste,appearance,aroma,overall,year,brewery_name,country_brewery,state_brewery,country_user,state_user
0,2016-04-26 12:00:00,410549,175852,3198,5.0,Pale Lager,2.0,2.0,4.0,2.0,4.0,8.0,2016,Sobraga,Gabon,,Poland,
1,2017-02-17 12:00:00,105273,442761,3198,5.2,Pale Lager,1.9,2.0,4.0,2.0,3.0,8.0,2017,Sobraga,Gabon,,Spain,
2,2016-06-24 12:00:00,105273,288889,3198,5.2,Pale Lager,1.6,2.0,3.0,3.0,3.0,5.0,2016,Sobraga,Gabon,,Poland,
3,2016-01-01 12:00:00,105273,250510,3198,5.2,Pale Lager,1.5,1.0,2.0,4.0,3.0,5.0,2016,Sobraga,Gabon,,Spain,
4,2015-10-23 12:00:00,105273,122778,3198,5.2,Pale Lager,1.9,2.0,4.0,2.0,4.0,7.0,2015,Sobraga,Gabon,,Germany,


In [9]:
# Report the number of rows in the ratings table and the span of its 'date' column
rating_dates = df_ratings_no_text['date']
print(f"In the platform there are {len(df_ratings_no_text)} different ratings")
print(f"The first rating was made on {rating_dates.min()}")
print(f"The last rating was made on {rating_dates.max()}")

In the platform there are 7123786 different ratings
The first rating was made on 2000-04-12 12:00:00
The last rating was made on 2017-07-31 12:00:00


In [10]:
# Attach the location (country / state) of each beer's brewery to the beers table.
#
# The brewery columns are renamed to 'country_beer' / 'state_beer' *before* the merge,
# and any copies of those columns left by a previous run of this cell are dropped first.
# This keeps the cell idempotent: re-running it gives the same frame instead of a frame
# with duplicated 'country_beer' / 'state_beer' column names.
brewery_locations = df_breweries[['brewery_id', 'country_brewery', 'state_brewery']].rename(
    columns={'country_brewery': 'country_beer', 'state_brewery': 'state_beer'}
)
df_beers = (
    df_beers
    .drop(columns=['country_beer', 'state_beer'], errors='ignore')  # no-op on the first run
    .merge(brewery_locations, on='brewery_id', how='left')  # left join keeps every beer row
)
df_beers.head(5)

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,abv,country_beer,state_beer
0,410549,33 Export (Gabon),3198,Sobraga,Pale Lager,5.0,Gabon,
1,105273,Castel Beer (Gabon),3198,Sobraga,Pale Lager,5.2,Gabon,
2,19445,Régab,3198,Sobraga,Pale Lager,4.5,Gabon,
3,155699,Ards Bally Black Stout,13538,Ards Brewing Co.,Stout,4.6,United Kingdom,
4,239097,Ards Belfast 366,13538,Ards Brewing Co.,Golden Ale/Blond Ale,4.2,United Kingdom,


Now that we know what's in our data, let's see how our data is distributed over space.

In [11]:
def _count_by_location(df, country_col, state_col, us_name='United States'):
    """Count the rows of ``df`` per location, separately outside and inside the US.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the ``country_col`` and ``state_col`` columns.
    country_col : str
        Column used to split US / non-US rows and to group the non-US rows.
    state_col : str
        Column used to group the US rows.
    us_name : str, default 'United States'
        Value of ``country_col`` that marks a US row.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(outside_us, inside_us)``; each frame has a ``location`` column (the
        group key) and a ``count`` column (the number of rows in the group).
    """
    def _size_by(frame, key):
        # One row per distinct value of `key`, with the group size in 'count'
        return frame.groupby(key).size().reset_index(name='count').rename(columns={key: 'location'})

    outside_us = _size_by(df[df[country_col] != us_name], country_col)
    inside_us = _size_by(df[df[country_col] == us_name], state_col)
    return outside_us, inside_us


# Aggregated counts per country (outside the US) and per state (inside the US)
beers_per_country_no_US, beers_per_country_US = _count_by_location(df_beers, 'country_beer', 'state_beer')
users_per_country_no_US, users_per_country_US = _count_by_location(df_users, 'country_user', 'state_user')
breweries_per_country_no_US, breweries_per_country_US = _count_by_location(df_breweries, 'country_brewery', 'state_brewery')
ratings_in_country_no_US, ratings_in_country_US = _count_by_location(df_ratings_no_text, 'country_brewery', 'state_brewery')
ratings_users_country_no_US, ratings_users_country_US = _count_by_location(df_ratings_no_text, 'country_user', 'state_user')

# Frames are listed in the same order as the entries of options["plots"] below.
# NOTE(review): presumably plot_map pairs them by position - confirm in src.utils.plots.
df_no_US = [beers_per_country_no_US, users_per_country_no_US, breweries_per_country_no_US, ratings_in_country_no_US, ratings_users_country_no_US]
df_US = [beers_per_country_US, users_per_country_US, breweries_per_country_US, ratings_in_country_US, ratings_users_country_US]

# Define some options for the plot: every layer shares the same column names and
# colorscale, only the label differs
plot_labels = [
    'Beers per country',
    'Users per country',
    'Breweries per country',
    'Number of ratings based on the brewery country',
    'Number of ratings based on the reviewer country',
]
options = {
    "title": "Beer Statistics by Country and US State",
    "plots": [
        {
            'label': label,
            'location_label': 'location',
            'z_label': 'count',
            'colorscale': 'Blues'
        }
        for label in plot_labels
    ]
}

# Plot the map
plot_map(df_no_US, df_US, options)

Finally, let's review the temporal evolution of our data.

In [12]:
# Count the ratings made in each calendar year (taken from the 'date' column)
year_grouping = (
    df_ratings_no_text
    .groupby(df_ratings_no_text['date'].dt.year)
    .size()
    .reset_index(name='count')
    .rename(columns={'date': 'Year', 'count': 'Number of ratings'})
)

# Bar chart with one bar per year
px.bar(
    year_grouping,
    x='Year',
    y='Number of ratings',
    title='Number of ratings per year',
).show()

In [13]:
# Count the users by the calendar year of their 'joined' timestamp
year_grouping = (
    df_users
    .groupby(df_users['joined'].dt.year)
    .size()
    .reset_index(name='count')
    .rename(columns={'joined': 'Year', 'count': 'Number of users'})
)

# Bar chart with one bar per year
px.bar(
    year_grouping,
    x='Year',
    y='Number of users',
    title='Number of users that has joined each year',
).show()

## Preliminary preference analysis
In this section we are going to analyze the preferences of the users of the platform in a general way.
### Correlation between the ratings attributes
Here we analyze the correlation between the rating attributes given by the users.

In [14]:
# Numeric columns whose pairwise correlation is studied
rating_columns = ['palate', 'taste', 'appearance', 'aroma', 'overall', 'abv']
rating_values = df_ratings_no_text[rating_columns]

# Pairwise correlation matrix of the selected columns.
# NOTE(review): `corr` is not used again in this cell - the plot helper receives the
# raw columns, so it presumably computes the correlation itself; confirm before removing.
corr = rating_values.corr()

plot_correlation_matrix(
    rating_values,
    filename=SAVING_FOLDER + 'correlation_matrix.html',
    title='Correlation matrix between the ratings provided by the users',
)

All factors influence the overall user experience, but taste (correlation: 0.86) and aroma (correlation: 0.77) stand out as the most significant. Palate (correlation: 0.66) and appearance (correlation: 0.50) show slightly lower correlations with the overall experience, though their contributions remain important.

### Rating in the different countries

### Rating evolution over time in the different countries

## ABV and style analysis
In this section we'll continue by analyzing the popularity of ABV and style in the world and we'll also study how these have evolved over time.

## Brewery popularity analysis
In this section we are going to see whether users strongly prefer beers from specific breweries in specific countries and, more generally, we'll focus on understanding the impact of breweries on the popularity of beers.

## NLP analysis
In this final section we are going to use the textual ratings to see whether specific words or emotions are associated with high or low ratings.