In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Activate seaborn's default theme for the figures drawn in the cells below.
sns.set_theme()

In [None]:
def describe_dataframe(dataframe: pd.DataFrame, site: str, name: str):
    """Print the shape of ``dataframe`` and plot its share of missing values per column.

    Args:
        dataframe: Table to summarise.
        site: Source label shown between square brackets in the message and the plot title.
        name: Dataset name inserted in the plot title.
    """
    n_rows, n_columns = dataframe.shape
    print(f'[{site}] Dataframe contains {n_rows} rows and {n_columns} columns')

    # Fraction of NaN cells in each column, smallest first.
    missing_share = dataframe.isna().mean(axis=0).sort_values()
    missing_share.plot(
        kind='barh',
        figsize=(6, 3),
        title=f'[{site}] Missing values in {name} dataset',
        xlabel='Proportion of missing values',
    )

# Data Exploration - Beer Advocate

In [None]:
# Read the four BeerAdvocate CSV tables, in the same order as the names on the left.
df_ba_beers, df_ba_breweries, df_ba_users, df_ba_ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        'src/data/beer_advocate/beers.csv',
        'src/data/beer_advocate/breweries.csv',
        'src/data/beer_advocate/users.csv',
        'src/data/beer_advocate/ratings.csv',
    )
)

## Beers

In [None]:
# Shape and missing-value profile of the BeerAdvocate beers table, then its first 5 rows.
describe_dataframe(dataframe=df_ba_beers, site='BeerAdvocate', name='beers')
df_ba_beers.head()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation — the columns containing NaNs are not currently used in our analysis, so the missing values are not a problem</span>

### Breweries

In [None]:
# One row per brewery with the number of beers it lists, smallest breweries first.
df_beers_by_brewery = (
    df_ba_beers.groupby('brewery_id').size()
    .rename('count')
    .reset_index()
    .sort_values('count', ascending=True)
)

# Empirical CCDF over breweries: share of breweries with strictly more than `count` beers.
# rank(method='max') is the number of breweries with at most `count` beers. The previous
# cumsum of the counts measured the share of *beers*, not the share of breweries.
df_beers_by_brewery['CCDF'] = (
    1 - df_beers_by_brewery['count'].rank(method='max') / len(df_beers_by_brewery)
)

# Tied breweries share the same point; keep one per distinct count for the curve.
df_beers_by_brewery.drop_duplicates('count').plot(
    x='count', y='CCDF',
    logx=True, logy=True,
    title='[BeerAdvocate] CCDF of number of beers per brewery',
    xlabel='Number of beers',
    ylabel='CCDF',
)

beers_per_brewery_quartiles = df_beers_by_brewery['count'].quantile([0.25, 0.5, 0.75]).values
print(f'[BeerAdvocate] Quartiles of the number of beers per brewery : {beers_per_brewery_quartiles}')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Beer Style

In [None]:
# Number of distinct values in the `beer_style` column.
n_ba_beer_styles = df_ba_beers['beer_style'].nunique()
print(f'[BeerAdvocate] Dataset contains data about {n_ba_beer_styles} styles of beer')

During pre-processing, we grouped several beer styles together into broader categories based on those proposed on the BeerAdvocate website, to create a taxonomy that was easier to analyze.

In [None]:
# Beers per global style, most common style first.
ba_beers_per_global_style = (
    df_ba_beers.groupby('beer_global_style').size().sort_values(ascending=False)
)
ba_beers_per_global_style.plot.bar(
    title='[BeerAdvocate] Number of beers per global style',
    xlabel='Beer style',
    ylabel='Number of beers',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Number of ratings

In [None]:
# Histogram the per-beer rating counts into 100 equal-width bins.
bin_counts = pd.cut(df_ba_beers['ratings_count'], 100).value_counts().sort_index()
# After bin i, this is the share of beers whose count exceeds that bin's right edge.
ratings_per_beer_CCDF = 1 - bin_counts.cumsum() / bin_counts.sum()

sns.lineplot(
    # The CCDF value of a bin holds at its right edge, not at its midpoint.
    x=[interval.right for interval in bin_counts.index],
    # Plain array: avoids relying on index alignment with the list passed as x.
    y=ratings_per_beer_CCDF.to_numpy(),
)
plt.title('[BeerAdvocate] CCDF of the number of ratings per beer (log-log with 100 bins)')
plt.xlabel('Number of ratings')
plt.ylabel('CCDF')
plt.xscale('log')
plt.yscale('log')

unrated_beers = df_ba_beers['ratings_count'] == 0
ratings_count_quartiles = df_ba_beers['ratings_count'].quantile([0.25, 0.5, 0.75]).values
print(f'[BeerAdvocate] {unrated_beers.mean() * 100:.2f}% ({unrated_beers.sum()}) of beers have no ratings')
print(f'[BeerAdvocate] Quartiles of the number of ratings per beer : {ratings_count_quartiles}')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Average rating

In [None]:
# Violin plot of the per-beer average rating, with quartile lines drawn inside.
ba_violin_ax = sns.violinplot(df_ba_beers['ratings_average'], inner='quart')
ba_violin_ax.set_title('[BeerAdvocate] Distribution of average ratings per beer')
ba_violin_ax.set_ylabel('Average rating (/5)')

In [None]:
# Rolling mean (window of 10 rows) over the non-missing averages; the first 9 windows
# are incomplete (NaN) and are dropped.
ratings_average_rolling = (
    df_ba_beers['ratings_average'].dropna().rolling(10).mean().iloc[9:]
)
hist_ax = sns.histplot(ratings_average_rolling, bins=100, kde=True, stat="density")

ratings_average_mean = ratings_average_rolling.mean()
ratings_average_std = ratings_average_rolling.std()

# Normal density with the same mean and standard deviation, evaluated on mean +/- 4 std.
half_width = 4 * ratings_average_std
x = np.linspace(ratings_average_mean - half_width, ratings_average_mean + half_width, num=200)
y = stats.norm(ratings_average_mean, ratings_average_std).pdf(x)

sns.lineplot(x=x, y=y, color='red', ax=hist_ax)

hist_ax.set_title('[BeerAdvocate] Distribution of average ratings per beer (rolling mean with window=10)')
hist_ax.legend(['Beer average rating distribution', 'Normal distribution'])
hist_ax.set_xlabel('Average rating (/5)')

In [None]:
# Significance level compared against the p-value below.
ALPHA = 0.01
normality_test_results = stats.normaltest(df_ba_beers['ratings_average'].dropna())
reject_normality = normality_test_results.pvalue < ALPHA
print(f'[BeerAdvocate] {normality_test_results}')
print(f'[BeerAdvocate] Normality test for average ratings per beer : reject null hypothesis = {reject_normality}')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation (a rolling mean is used because the underlying ratings are distributed in increments of 0.25)</span>

## Breweries

In [None]:
# Shape and missing-value profile of the BeerAdvocate breweries table.
# The dataset name was previously 'beers', which mislabelled the plot title.
describe_dataframe(df_ba_breweries, 'BeerAdvocate', 'breweries')
n_ba_brewery_countries = df_ba_breweries["brewery_country"].nunique()
print(f'[BeerAdvocate] Dataset contains data about breweries in {n_ba_brewery_countries} countries')
df_ba_breweries.head(5)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Country

In [None]:
# Breweries per country, most breweries first (computed once and reused below).
ba_breweries_per_country = df_ba_breweries['brewery_country'].value_counts()
ba_breweries_per_country_quartiles = ba_breweries_per_country.quantile([0.25, 0.5, 0.75]).values
print(f'[BeerAdvocate] Quartiles of the number of breweries per country : {ba_breweries_per_country_quartiles}')

# Only the 50 countries with the most breweries are drawn; the y axis is logarithmic.
ba_breweries_per_country.head(50).plot(
    kind='bar',
    log=True,
    figsize=(10, 4),
    title='[BeerAdvocate] Number of breweries per country (50 largest producers)',
    xlabel='Country',
    ylabel='Number of breweries (log scale)',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

## Users

In [None]:
# Shape and missing-value profile of the BeerAdvocate users table, then its first 5 rows.
describe_dataframe(dataframe=df_ba_users, site='BeerAdvocate', name='users')
df_ba_users.head()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Ratings

In [None]:
# Number of users for each distinct ratings count, ordered by ratings count.
ba_users_per_ratings_count = df_ba_users['user_ratings_count'].value_counts().sort_index()
# Share of users with strictly more ratings than each value (indexed by ratings count).
ratings_per_user_CCDF = 1 - ba_users_per_ratings_count.cumsum() / df_ba_users.shape[0]

# Series.plot draws the values against the index. The x=/y= arguments passed before
# were silently ignored for a Series, so they are removed.
ratings_per_user_CCDF.plot(
    logx=True, logy=True,
    title='[BeerAdvocate] CCDF of number of ratings per user (log-log)',
    xlabel='Number of ratings',
    ylabel='CCDF',
)

ba_ratings_per_user_quartiles = df_ba_users['user_ratings_count'].quantile([0.25, 0.5, 0.75]).values
print(f'[BeerAdvocate] Quartiles of the number of ratings per user : {ba_ratings_per_user_quartiles}')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Country

In [None]:
# The ten countries with the most user accounts.
ba_users_per_country = df_ba_users['user_country'].value_counts()
ba_users_per_country.head(10)

In [None]:
# Percentage of accounts whose country is 'United States' (missing countries are not counted).
ba_country_shares = df_ba_users["user_country"].value_counts(normalize=True)
ba_us_share_percent = ba_country_shares.loc['United States'] * 100
print(f'[BeerAdvocate] Users from United States represent {ba_us_share_percent:.2f}% of accounts')

In [None]:
# Bar chart of accounts per country (all countries), with a logarithmic y axis.
ba_user_country_counts = df_ba_users['user_country'].value_counts()
ba_user_country_counts.plot.bar(
    logy=True,
    figsize=(20, 4),
    title='[BeerAdvocate] Number of users per country',
    xlabel='Country',
    ylabel='Number of users (log scale)',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Creation date

In [None]:
# NOTE(review): the divisors below treat `user_created_date` as a timestamp in seconds — confirm.
ba_account_created_at = df_ba_users['user_created_date']
# The most recent account creation is used as the reference "present".
PRESENT_TIME = ba_account_created_at.max()
# seconds -> hours -> days -> years
user_years_since_creation = (PRESENT_TIME - ba_account_created_at) / 3600 / 24 / 365.25

In [None]:
# Mean and quartiles of the account ages computed in the previous cell.
ba_mean_account_age = user_years_since_creation.mean()
ba_account_age_quartiles = user_years_since_creation.quantile([0.25, 0.5, 0.75]).values
print(f'[BeerAdvocate] Mean of the number of years since user creation : {ba_mean_account_age:.3f}')
print(f'[BeerAdvocate] Quartiles of the number of years since user creation : {ba_account_age_quartiles}')

In [None]:
# Histogram (100 bins) of the account ages in years.
user_years_since_creation.plot.hist(
    bins=100,
    title='[BeerAdvocate] Distribution of time elapsed since user account creation (years)',
    xlabel='Years since user creation',
    ylabel='Number of users',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation (talk about the fact that a lot of accounts could be "dead" accounts that are not active anymore)</span>

## Ratings

In [None]:
# Shape and missing-value profile of the BeerAdvocate ratings table, then its first 5 rows.
describe_dataframe(dataframe=df_ba_ratings, site='BeerAdvocate', name='ratings')
df_ba_ratings.head()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation — the missing values in past_ratings_average are expected because it is a shifted running average</span>

### Date

In [None]:
# Convert the `date` column (parsed as seconds) to calendar quarters and count ratings per quarter.
ba_rating_quarters = pd.to_datetime(df_ba_ratings['date'], unit='s').dt.to_period('Q')
ba_ratings_per_quarter = ba_rating_quarters.value_counts().sort_index()
ba_ratings_per_quarter.plot.bar(
    figsize=(15, 4),
    title='[BeerAdvocate] Number of ratings per quarters',
    xlabel='Quarter',
    ylabel='Number of ratings',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Text

In [None]:
# Share of ratings that come with a text.
ba_has_text = df_ba_ratings['text'].notna()
print(f'[BeerAdvocate] {ba_has_text.mean() * 100:.2f}% ({ba_has_text.sum()}) of the reviews contain a text')

# Length in characters.
ba_text_char_counts = df_ba_ratings["text"].str.len()
ba_char_quartiles = ba_text_char_counts.quantile([0.25, 0.5, 0.75]).values
print(f'[BeerAdvocate] Average length of text in reviews : {ba_text_char_counts.mean():.2f} characters')
print(f'[BeerAdvocate] Quartiles of the length of text in reviews : {ba_char_quartiles} characters')

# Length in words, approximated as the number of spaces plus one.
ba_text_word_counts = df_ba_ratings["text"].str.count(' ') + 1
ba_word_quartiles = ba_text_word_counts.quantile([0.25, 0.5, 0.75]).values
print(f'[BeerAdvocate] Average length of text in reviews : {ba_text_word_counts.mean():.2f} words')
print(f'[BeerAdvocate] Quartiles of the length of text in reviews : {ba_word_quartiles} words')

In [None]:
# Character count of every review text.
ratings_text_lengths = df_ba_ratings['text'].str.len()

length_ax = sns.histplot(ratings_text_lengths, bins=100, kde=True, stat="density")

ratings_text_length_mean = ratings_text_lengths.mean()
ratings_text_length_std = ratings_text_lengths.std()

# Normal density with the same mean and standard deviation, evaluated on mean +/- 4 std.
half_width = 4 * ratings_text_length_std
x = np.linspace(ratings_text_length_mean - half_width, ratings_text_length_mean + half_width, num=200)
y = stats.norm(ratings_text_length_mean, ratings_text_length_std).pdf(x)

sns.lineplot(x=x, y=y, color='red', ax=length_ax)

length_ax.set_title('[BeerAdvocate] Distribution of length of text ratings')
length_ax.legend(['Review text length distribution', 'Normal distribution'])
length_ax.set_xlabel('Text length')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Ratings (subcategories)

In [None]:
# 2x3 grid sharing the y axis: one box plot per rating column; the sixth panel stays unused.
fig, axs = plt.subplots(nrows=2, ncols=3, sharey=True, figsize=(12, 6))
axs = axs.ravel()

rating_aspects = ['overall', 'aroma', 'appearance', 'palate', 'taste']
for panel, aspect in zip(axs, rating_aspects):
    df_ba_ratings[aspect].plot.box(
        ax=panel,
        title=f'[BeerAdvocate] Distribution of {aspect} ratings',
        ylabel=f'{aspect} rating (/5)',
    )

# Remove the empty sixth panel.
fig.delaxes(axs[-1])
plt.tight_layout()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Correlations

#### Ratings and subcategories

In [None]:
# Correlation matrix between the rating columns, computed on rows with no missing value.
rating_columns = ["rating", "overall", "aroma", "appearance", "palate", "taste"]
ratings_cross_corr_array = df_ba_ratings[rating_columns].dropna().to_numpy()
# rowvar=False: each column is a variable, each row an observation.
corrcoef_ratings = np.corrcoef(ratings_cross_corr_array, rowvar=False)
sns.heatmap(
    corrcoef_ratings,
    annot=True,
    xticklabels=rating_columns,
    yticklabels=rating_columns,
)
plt.show()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

In [None]:
# Release the correlation matrix and the array it was computed from.
# The array was previously left in memory although nothing reads it afterwards.
del corrcoef_ratings, ratings_cross_corr_array

#### Ratings and past ratings

In [None]:
# Columns left out of the correlation matrix.
excluded_columns = [
    "user_id", "beer_id", "brewery_id", "date", "review", "text",
    "overall", "aroma", "appearance", "palate", "taste", "beer_global_style",
]
# Keep the remaining columns and only the rows without any missing value.
df_ba_ratings_full_corr_coef = df_ba_ratings.drop(columns=excluded_columns).dropna()
corr_columns = df_ba_ratings_full_corr_coef.columns

# rowvar=False: each column is a variable, each row an observation.
corrcoef_ratings = np.corrcoef(df_ba_ratings_full_corr_coef.to_numpy(), rowvar=False)

# Upper triangle only (the lower triangle is zeroed).
triup_array = np.triu(corrcoef_ratings)
# Hide every cell whose absolute value is below 0.5, which includes the zeroed lower triangle.
weak_or_lower = np.abs(triup_array) < 0.5

sns.heatmap(
    triup_array,
    annot=True,
    fmt=".1f",
    xticklabels=corr_columns,
    yticklabels=corr_columns,
    mask=weak_or_lower,
)
plt.show()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

In [None]:
# Release the intermediates of the correlation analysis.
del df_ba_ratings_full_corr_coef, corrcoef_ratings, triup_array

# Data Exploration - Rate Beer

In [None]:
# Read the four RateBeer CSV tables, in the same order as the names on the left.
df_rb_beers, df_rb_breweries, df_rb_users, df_rb_ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        'src/data/rate_beer/beers.csv',
        'src/data/rate_beer/breweries.csv',
        'src/data/rate_beer/users.csv',
        'src/data/rate_beer/ratings.csv',
    )
)

## Beers

In [None]:
# Shape and missing-value profile of the RateBeer beers table, then its first 5 rows.
describe_dataframe(dataframe=df_rb_beers, site='RateBeer', name='beers')
df_rb_beers.head()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Breweries

In [None]:
# Number of beers listed by each brewery (a Series indexed by brewery_id).
df_beers_by_brewery = df_rb_beers.groupby('brewery_id').size()
# Histogram with a logarithmic count axis.
df_beers_by_brewery.plot.hist(
    bins=100,
    log=True,
    title='[RateBeer] Number of beers per brewery',
    xlabel='Number of beers',
    ylabel='Number of breweries',
)
rb_beers_per_brewery_quartiles = df_beers_by_brewery.quantile([0.25, 0.5, 0.75]).values
print(f'[RateBeer] Quartiles of the number of beers per brewery : {rb_beers_per_brewery_quartiles}')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Beer Style

In [None]:
# Number of distinct values in the `beer_style` column.
n_rb_beer_styles = df_rb_beers['beer_style'].nunique()
print(f'[RateBeer] Dataset contains data about {n_rb_beer_styles} styles of beer')

In [None]:
# Beers per global style, most common style first.
rb_beers_per_global_style = (
    df_rb_beers.groupby('beer_global_style').size().sort_values(ascending=False)
)
rb_beers_per_global_style.plot.bar(
    title='[RateBeer] Number of beers per global style',
    xlabel='Beer style',
    ylabel='Number of beers',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Ratings count

In [None]:
# Histogram (100 bins, logarithmic count axis) of the number of ratings each beer received.
rb_ratings_count = df_rb_beers['ratings_count']
rb_ratings_count.plot.hist(
    bins=100,
    log=True,
    title='[RateBeer] Distribution of the number of ratings per beer',
    xlabel='Number of ratings',
    ylabel='Number of beers (log scale)',
)

rb_unrated_beers = rb_ratings_count == 0
rb_ratings_count_quartiles = rb_ratings_count.quantile([0.25, 0.5, 0.75]).values
print(f'[RateBeer] {rb_unrated_beers.mean() * 100:.2f}% ({rb_unrated_beers.sum()}) of beers have no ratings')
print(f'[RateBeer] Quartiles of the number of ratings per beer : {rb_ratings_count_quartiles}')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Ratings average

In [None]:
# Violin plot of the per-beer average rating, with quartile lines drawn inside.
rb_violin_ax = sns.violinplot(df_rb_beers['ratings_average'], inner='quart')
rb_violin_ax.set_title('[RateBeer] Distribution of average ratings per beer')
rb_violin_ax.set_ylabel('Average rating (/5)')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

In [None]:
# Rolling mean (window of 10 rows) over the non-missing averages; the first 9 windows
# are incomplete (NaN) and are dropped.
ratings_average_rolling = (
    df_rb_beers['ratings_average'].dropna().rolling(10).mean().iloc[9:]
)
hist_ax = sns.histplot(ratings_average_rolling, bins=100, kde=True, stat="density")

ratings_average_mean = ratings_average_rolling.mean()
ratings_average_std = ratings_average_rolling.std()

# Normal density with the same mean and standard deviation, evaluated on mean +/- 4 std.
half_width = 4 * ratings_average_std
x = np.linspace(ratings_average_mean - half_width, ratings_average_mean + half_width, num=200)
y = stats.norm(ratings_average_mean, ratings_average_std).pdf(x)

sns.lineplot(x=x, y=y, color='red', ax=hist_ax)

hist_ax.set_title('[RateBeer] Distribution of average ratings per beer (rolling mean with window=10)')
hist_ax.legend(['Beer average rating distribution', 'Normal distribution'])
hist_ax.set_xlabel('Average rating (/5)')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

In [None]:
# Significance level compared against the p-value below.
ALPHA = 0.01
normality_test_results = stats.normaltest(df_rb_beers['ratings_average'].dropna())
reject_normality = normality_test_results.pvalue < ALPHA
print(f'[RateBeer] {normality_test_results}')
print(f'[RateBeer] Normality test for average ratings per beer : reject null hypothesis = {reject_normality}')

## Breweries

In [None]:
# Shape and missing-value profile of the RateBeer breweries table.
describe_dataframe(df_rb_breweries, 'RateBeer', 'breweries')
# Count the countries in the RateBeer table; the BeerAdvocate table was used here by mistake.
n_rb_brewery_countries = df_rb_breweries["brewery_country"].nunique()
print(f'[RateBeer] Dataset contains data about breweries in {n_rb_brewery_countries} countries')
df_rb_breweries.head(5)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Country

In [None]:
# Breweries per country, most breweries first (computed once and reused below).
rb_breweries_per_country = df_rb_breweries['brewery_country'].value_counts()
rb_breweries_per_country_quartiles = rb_breweries_per_country.quantile([0.25, 0.5, 0.75]).values
print(f'[RateBeer] Quartiles of the number of breweries per country : {rb_breweries_per_country_quartiles}')

# Only the 50 countries with the most breweries are drawn; the y axis is logarithmic.
rb_breweries_per_country.head(50).plot(
    kind='bar',
    log=True,
    figsize=(10, 4),
    title='[RateBeer] Number of breweries per country (50 largest producers)',
    xlabel='Country',
    ylabel='Number of breweries (log scale)',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

## Users

In [None]:
# Shape and missing-value profile of the RateBeer users table, then its first 5 rows.
describe_dataframe(dataframe=df_rb_users, site='RateBeer', name='users')
df_rb_users.head()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Ratings

In [None]:
# Number of users for each distinct ratings count, ordered by ratings count.
rb_users_per_ratings_count = df_rb_users['user_ratings_count'].value_counts().sort_index()
# Share of users with strictly more ratings than each value (indexed by ratings count).
ratings_per_user_CCDF = 1 - rb_users_per_ratings_count.cumsum() / df_rb_users.shape[0]

# Series.plot draws the values against the index. The x=/y= arguments passed before
# were silently ignored for a Series, so they are removed. The title previously named
# the wrong site.
ratings_per_user_CCDF.plot(
    logx=True, logy=True,
    title='[RateBeer] CCDF of number of ratings per user (log-log)',
    xlabel='Number of ratings',
    ylabel='CCDF',
)

rb_ratings_per_user_quartiles = df_rb_users['user_ratings_count'].quantile([0.25, 0.5, 0.75]).values
print(f'[RateBeer] Quartiles of the number of ratings per user : {rb_ratings_per_user_quartiles}')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Country

In [None]:
# The ten countries with the most user accounts.
rb_users_per_country = df_rb_users['user_country'].value_counts()
rb_users_per_country.head(10)

In [None]:
# Percentage of accounts whose country is 'United States' (missing countries are not counted).
rb_country_shares = df_rb_users["user_country"].value_counts(normalize=True)
rb_us_share_percent = rb_country_shares.loc['United States'] * 100
print(f'[RateBeer] Users from United States represent {rb_us_share_percent:.2f}% of accounts')

In [None]:
# Bar chart of accounts per country (all countries), with a logarithmic y axis.
rb_user_country_counts = df_rb_users['user_country'].value_counts()
rb_user_country_counts.plot.bar(
    figsize=(20, 4),
    logy=True,
    title='[RateBeer] Number of users per country',
    xlabel='Country',
    ylabel='Number of users (log scale)',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Creation date

In [None]:
# NOTE(review): the divisors below treat `user_created_date` as a timestamp in seconds — confirm.
rb_account_created_at = df_rb_users['user_created_date']
# The most recent account creation is used as the reference "present".
PRESENT_TIME = rb_account_created_at.max()
# seconds -> hours -> days -> years
user_years_since_creation = (PRESENT_TIME - rb_account_created_at) / 3600 / 24 / 365.25

In [None]:
# Mean and quartiles of the account ages computed in the previous cell.
rb_mean_account_age = user_years_since_creation.mean()
rb_account_age_quartiles = user_years_since_creation.quantile([0.25, 0.5, 0.75]).values
print(f'[RateBeer] Mean of the number of years since user creation : {rb_mean_account_age:.3f}')
print(f'[RateBeer] Quartiles of the number of years since user creation : {rb_account_age_quartiles}')

In [None]:
# Histogram (100 bins) of the account ages in years.
user_years_since_creation.plot.hist(
    bins=100,
    title='[RateBeer] Distribution of time elapsed since user account creation (years)',
    xlabel='Years since user creation',
    ylabel='Number of users',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

## Ratings

In [None]:
# Shape and missing-value profile of the RateBeer ratings table, then its first 5 rows.
describe_dataframe(dataframe=df_rb_ratings, site='RateBeer', name='ratings')
df_rb_ratings.head()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Date

In [None]:
# Convert the `date` column (parsed as seconds) to calendar quarters and count ratings per quarter.
rb_rating_quarters = pd.to_datetime(df_rb_ratings['date'], unit='s').dt.to_period('Q')
rb_ratings_per_quarter = rb_rating_quarters.value_counts().sort_index()
rb_ratings_per_quarter.plot.bar(
    figsize=(15, 4),
    title='[RateBeer] Number of ratings per quarters',
    xlabel='Quarter',
    ylabel='Number of ratings',
)

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Text

In [None]:
# All statistics below use the RateBeer ratings table. The BeerAdvocate table was used
# here by mistake, so the figures reported under the [RateBeer] label were BeerAdvocate's.
rb_review_text = df_rb_ratings['text']

# Share of ratings that come with a text.
rb_has_text = rb_review_text.notna()
print(f'[RateBeer] {rb_has_text.mean() * 100:.2f}% ({rb_has_text.sum()}) of the reviews contain a text')

# Length in characters.
rb_text_char_counts = rb_review_text.str.len()
rb_char_quartiles = rb_text_char_counts.quantile([0.25, 0.5, 0.75]).values
print(f'[RateBeer] Average length of text in reviews : {rb_text_char_counts.mean():.2f} characters')
print(f'[RateBeer] Quartiles of the length of text in reviews : {rb_char_quartiles} characters')

# Length in words, approximated as the number of spaces plus one.
rb_text_word_counts = rb_review_text.str.count(' ') + 1
rb_word_quartiles = rb_text_word_counts.quantile([0.25, 0.5, 0.75]).values
print(f'[RateBeer] Average length of text in reviews : {rb_text_word_counts.mean():.2f} words')
print(f'[RateBeer] Quartiles of the length of text in reviews : {rb_word_quartiles} words')

In [None]:
# Character count of every review text.
ratings_text_lengths = df_rb_ratings['text'].str.len()

length_ax = sns.histplot(ratings_text_lengths, bins=100, kde=True, stat="density")

ratings_text_length_mean = ratings_text_lengths.mean()
ratings_text_length_std = ratings_text_lengths.std()

# Normal density with the same mean and standard deviation, evaluated on mean +/- 4 std.
half_width = 4 * ratings_text_length_std
x = np.linspace(ratings_text_length_mean - half_width, ratings_text_length_mean + half_width, num=200)
y = stats.norm(ratings_text_length_mean, ratings_text_length_std).pdf(x)

sns.lineplot(x=x, y=y, color='red', ax=length_ax)

length_ax.set_title('[RateBeer] Distribution of length of text ratings')
length_ax.legend(['Review text length distribution', 'Normal distribution'])
length_ax.set_xlabel('Text length')

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Ratings (subcategories)

In [None]:
# 2x3 grid sharing the y axis: one box plot per rating column; the sixth panel stays unused.
fig, axs = plt.subplots(nrows=2, ncols=3, sharey=True, figsize=(12, 6))
axs = axs.ravel()

rating_aspects = ['overall', 'aroma', 'appearance', 'palate', 'taste']
for panel, aspect in zip(axs, rating_aspects):
    df_rb_ratings[aspect].plot.box(
        ax=panel,
        title=f'[RateBeer] Distribution of {aspect} ratings',
        ylabel=f'{aspect} rating (/5)',
    )

# Remove the empty sixth panel.
fig.delaxes(axs[-1])
plt.tight_layout()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

### Correlations

#### Ratings and subcategories

In [None]:
# Correlation matrix between the rating columns, computed on rows with no missing value.
rating_columns = ["rating", "overall", "aroma", "appearance", "palate", "taste"]
ratings_cross_corr_array = df_rb_ratings[rating_columns].dropna().to_numpy()
# rowvar=False: each column is a variable, each row an observation.
corrcoef_ratings = np.corrcoef(ratings_cross_corr_array, rowvar=False)
sns.heatmap(
    corrcoef_ratings,
    annot=True,
    xticklabels=rating_columns,
    yticklabels=rating_columns,
)
plt.show()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

In [None]:
# Release the correlation matrix and the array it was computed from.
# The array was previously left in memory although nothing reads it afterwards.
del corrcoef_ratings, ratings_cross_corr_array

#### Ratings and past ratings

In [None]:
# Columns left out of the correlation matrix.
excluded_columns = [
    "user_id", "beer_id", "brewery_id", "date", "review", "text",
    "overall", "aroma", "appearance", "palate", "taste", "beer_global_style",
]
# Keep the remaining columns and only the rows without any missing value.
df_rb_ratings_full_corr_coef = df_rb_ratings.drop(columns=excluded_columns).dropna()
corr_columns = df_rb_ratings_full_corr_coef.columns

# rowvar=False: each column is a variable, each row an observation.
corrcoef_ratings = np.corrcoef(df_rb_ratings_full_corr_coef.to_numpy(), rowvar=False)

# Upper triangle only (the lower triangle is zeroed).
triup_array = np.triu(corrcoef_ratings)
# Hide every cell whose absolute value is below 0.5, which includes the zeroed lower triangle.
weak_or_lower = np.abs(triup_array) < 0.5

sns.heatmap(
    triup_array,
    annot=True,
    fmt=".1f",
    xticklabels=corr_columns,
    yticklabels=corr_columns,
    mask=weak_or_lower,
)
plt.show()

<span style="background-color: red; color: white; font-weight: bold; padding: 0.75rem; display: block;">TODO : Interpretation</span>

In [None]:
# Release the intermediates of the correlation analysis.
del df_rb_ratings_full_corr_coef, corrcoef_ratings, triup_array