In [6]:
import pandas as pd
import plotly.graph_objects as go


In [7]:
# Directory that holds the processed parquet tables (trailing slash included)
FOLDER = '../data/processed/'

# Read the four processed tables, in order, into their own DataFrames
df_beers, df_breweries, df_users, df_ratings_no_text = (
    pd.read_parquet(FOLDER + filename)
    for filename in ('beers.pq', 'breweries.pq', 'users.pq', 'ratings_no_text.pq')
)



In [8]:
# Keep only the locations whose number of ratings is strictly above a threshold
MIN_RATINGS_THRESHOLD = 1000
number_of_ratings_per_country = df_ratings_no_text['location'].value_counts()
popular_countries = number_of_ratings_per_country.loc[
    number_of_ratings_per_country.gt(MIN_RATINGS_THRESHOLD)
].index

# Restrict the ratings to those locations, then keep only the beers and
# breweries that still appear in the remaining ratings
df_ratings_no_text = df_ratings_no_text.loc[
    lambda ratings: ratings['location'].isin(popular_countries)
]
df_beers = df_beers.loc[
    lambda beers: beers['beer_id'].isin(df_ratings_no_text['beer_id'].unique())
]
df_breweries = df_breweries.loc[
    lambda breweries: breweries['id'].isin(df_ratings_no_text['brewery_id'].unique())
]

# Preview the filtered ratings
df_ratings_no_text.head(5)

Unnamed: 0,user_id,rating,review,abv,brewery_name,beer_id,appearance,palate,aroma,overall,taste,style,beer_name,brewery_id,date,idx,location
1,stjamesgate.163714,3.67,True,4.5,Strangford Lough Brewing Company Ltd,19590,3.0,3.5,3.5,3.5,4.0,English Pale Ale,Barelegs Brew,10093,2009-02-20 12:00:00,1,United Kingdom
2,mdagnew.19527,3.73,True,4.5,Strangford Lough Brewing Company Ltd,19590,4.0,3.5,3.5,3.5,4.0,English Pale Ale,Barelegs Brew,10093,2006-03-13 12:00:00,2,United Kingdom
3,helloloser12345.10867,3.98,True,4.5,Strangford Lough Brewing Company Ltd,19590,4.0,4.0,3.5,4.5,4.0,English Pale Ale,Barelegs Brew,10093,2004-12-01 12:00:00,3,United Kingdom
4,cypressbob.3708,4.0,True,4.5,Strangford Lough Brewing Company Ltd,19590,4.0,4.0,4.0,4.0,4.0,English Pale Ale,Barelegs Brew,10093,2004-08-30 12:00:00,4,United Kingdom
5,hellpop65.48993,3.25,False,4.8,Strangford Lough Brewing Company Ltd,19827,,,,,,English Pale Ale,Legbiter,10093,2014-12-01 12:00:00,5,United Kingdom


In [9]:
# Show the column index of every table: a heading line, then the columns
print("Beer columns:", df_beers.columns, sep="\n")
print("Breweries columns:", df_breweries.columns, sep="\n")
print("Users columns:", df_users.columns, sep="\n")
print("Ratings columns:", df_ratings_no_text.columns, sep="\n")

Beer columns:
Index(['beer_id', 'beer_name', 'brewery_id', 'brewery_name', 'style', 'abv',
       'avg', 'std', 'median', 'appearance', 'aroma', 'palate', 'overall',
       'nbr_ratings', 'nbr_reviews', 'nbr_interactions'],
      dtype='object')
Breweries columns:
Index(['id', 'location', 'name', 'nbr_beers'], dtype='object')
Users columns:
Index(['user_id', 'user_name', 'location', 'joined', 'nbr_ratings',
       'nbr_reviews', 'nbr_interactions'],
      dtype='object')
Ratings columns:
Index(['user_id', 'rating', 'review', 'abv', 'brewery_name', 'beer_id',
       'appearance', 'palate', 'aroma', 'overall', 'taste', 'style',
       'beer_name', 'brewery_id', 'date', 'idx', 'location'],
      dtype='object')


In [13]:
# Interactive choropleth with a dropdown that switches between world-level
# (per country) and US-level (per state) beer statistics.
#
# NOTE(review): the two reads below rebind df_breweries / df_ratings_no_text to
# the full tables on disk, discarding the rating-count filter applied in the
# earlier cell — confirm that mapping the unfiltered data is intended.
df_breweries = pd.read_parquet(FOLDER + "breweries.pq")
df_ratings_no_text = pd.read_parquet(FOLDER + "ratings_no_text.pq")

# US rows are assumed to be stored as "<State>, USA" (the pattern this cell
# has always filtered on) — TODO confirm against the processed data.
US_SUFFIX = ", USA"
US_COUNTRY_NAME = "United States"

# Plotly's "USA-states" location mode matches two-letter postal codes only,
# so full state names have to be translated before plotting.
US_STATE_CODES = {
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
    "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME",
    "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
    "Mississippi": "MS", "Missouri": "MO", "Montana": "MT", "Nebraska": "NE",
    "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
    "New York": "NY", "North Carolina": "NC", "North Dakota": "ND", "Ohio": "OH",
    "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA", "Rhode Island": "RI",
    "South Carolina": "SC", "South Dakota": "SD", "Tennessee": "TN", "Texas": "TX",
    "Utah": "UT", "Vermont": "VT", "Virginia": "VA", "Washington": "WA",
    "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
}

# Geo settings applied by the dropdown together with the trace visibility
WORLD_GEO = {"geo.scope": "world", "geo.projection.type": "equirectangular"}
USA_GEO = {"geo.scope": "usa", "geo.projection.type": "albers usa"}


def _is_us(locations):
    """Boolean mask of the entries in ``locations`` that contain the US suffix."""
    return locations.str.contains(US_SUFFIX, na=False, regex=False)


def _country_of(locations):
    """Return ``locations`` with every US entry collapsed to one country label.

    Without this, "<State>, USA" rows never match a country name and the
    United States is missing from the world-level maps.
    """
    # Cast to object first so a categorical column accepts the new label
    return locations.astype(object).mask(_is_us(locations), US_COUNTRY_NAME)


def _us_subset(df):
    """Return an independent copy of the US rows of ``df`` with a ``state`` column."""
    # .copy() makes the column assignment safe (no SettingWithCopyWarning)
    us_rows = df[_is_us(df["location"])].copy()
    us_rows["state"] = us_rows["location"].str.split(", ").str[0]
    return us_rows


def _count_per_state(df_us, count_name):
    """Count rows per ``state`` and attach the postal code as ``state_code``."""
    per_state = df_us.groupby("state").size().reset_index(name=count_name)
    # Values that are not known full names (e.g. already a code) pass through unchanged
    per_state["state_code"] = (
        per_state["state"].map(US_STATE_CODES).fillna(per_state["state"])
    )
    return per_state


# World-level statistics, with all US states aggregated into one country
breweries_per_country = (
    df_breweries.groupby(_country_of(df_breweries["location"]))
    .size()
    .reset_index(name="nbr_breweries")
)
ratings_country = _country_of(df_ratings_no_text["location"])
ratings_per_country = (
    df_ratings_no_text.groupby(ratings_country).size().reset_index(name="nbr_ratings")
)
unique_beers_per_country = (
    df_ratings_no_text.groupby(ratings_country)["beer_id"]
    .nunique()
    .reset_index(name="nbr_unique_beers")
)

# US state-level statistics
df_breweries_us = _us_subset(df_breweries)
breweries_per_state = _count_per_state(df_breweries_us, "nbr_breweries")

df_ratings_us = _us_subset(df_ratings_no_text)
ratings_per_state = _count_per_state(df_ratings_us, "nbr_ratings")

# One entry per map; the order here is the order of the traces and of the
# dropdown buttons, and the first entry is the one shown initially.
map_specs = [
    dict(name="Breweries per Country", frame=breweries_per_country,
         loc_col="location", z_col="nbr_breweries", mode="country names",
         colorscale="Blues", colorbar_title="Breweries", geo=WORLD_GEO),
    dict(name="Ratings per Country", frame=ratings_per_country,
         loc_col="location", z_col="nbr_ratings", mode="country names",
         colorscale="Greens", colorbar_title="Ratings", geo=WORLD_GEO),
    dict(name="Unique Beers per Country", frame=unique_beers_per_country,
         loc_col="location", z_col="nbr_unique_beers", mode="country names",
         colorscale="Reds", colorbar_title="Unique Beers", geo=WORLD_GEO),
    dict(name="Breweries per US State", frame=breweries_per_state,
         loc_col="state_code", z_col="nbr_breweries", mode="USA-states",
         colorscale="Blues", colorbar_title="Breweries", geo=USA_GEO),
    dict(name="Ratings per US State", frame=ratings_per_state,
         loc_col="state_code", z_col="nbr_ratings", mode="USA-states",
         colorscale="Greens", colorbar_title="Ratings", geo=USA_GEO),
]

# Create the figure with one choropleth trace per map
fig = go.Figure()
for position, spec in enumerate(map_specs):
    stats = spec["frame"]
    fig.add_trace(
        go.Choropleth(
            locations=stats[spec["loc_col"]],
            locationmode=spec["mode"],
            z=stats[spec["z_col"]],
            # State maps are keyed by postal code; show the full name on hover
            text=stats["state"] if spec["mode"] == "USA-states" else None,
            colorscale=spec["colorscale"],
            colorbar_title=spec["colorbar_title"],
            visible=(position == 0),
            name=spec["name"],
        )
    )

# Dropdown: each button shows exactly one trace and switches the geo scope
# to match it (world for country maps, USA for state maps)
fig.update_layout(
    updatemenus=[
        dict(
            buttons=[
                dict(
                    args=[
                        {"visible": [i == position for i in range(len(map_specs))]},
                        spec["geo"],
                    ],
                    label=spec["name"],
                    method="update",
                )
                for position, spec in enumerate(map_specs)
            ],
            direction="down",
            showactive=True,
            x=0.1,
            y=1.15,
            xanchor="left",
            yanchor="top",
        )
    ]
)

# Layout settings (initial view matches the first, world-level trace)
fig.update_layout(
    title=dict(
        text="Beer Stats by Country and US State",
        x=0.5,
        xanchor="center",
    ),
    geo=dict(
        showframe=False,
        showcoastlines=True,
        projection_type="equirectangular",
        scope="world",
    ),
    height=600,
    width=800,
)

# Show the plot
fig.show()
