# Pre-analysis of the Users, Breweries, and Beers Datasets

In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import os
import matplotlib.pyplot as plt

In [2]:
# Root directory that contains the per-site folders and the summary CSV.
# Change it here if the files do not live in the notebook's working directory.
DATA_DIR = '.'

# Define folder paths and filenames
folders = ['BeerAdvocate', 'RateBeer']
filenames = ['beers.csv', 'breweries.csv', 'users.csv']
review_summary_file = 'user_reviews_summary1.csv'

# Fail fast with a single message listing every missing input, instead of
# stopping at the first file that cannot be opened.
required_paths = [
    os.path.join(DATA_DIR, folder, filename)
    for folder in folders
    for filename in filenames
] + [os.path.join(DATA_DIR, review_summary_file)]
missing_paths = [path for path in required_paths if not os.path.isfile(path)]
if missing_paths:
    raise FileNotFoundError(
        "Missing input file(s): " + ", ".join(missing_paths)
        + f" (DATA_DIR resolves to {os.path.abspath(DATA_DIR)!r})"
    )

# Dictionary to store DataFrames, keyed '<folder lower-cased>_<file stem>'
data = {}

# Loop through each folder and filename to load the data
for folder in folders:
    for filename in filenames:
        # splitext keeps the whole stem even if a file name ever contains extra dots
        key = f"{folder.lower()}_{os.path.splitext(filename)[0]}"
        data[key] = pd.read_csv(os.path.join(DATA_DIR, folder, filename))

user_review_summary = pd.read_csv(os.path.join(DATA_DIR, review_summary_file))
# Now you can access each DataFrame by key, e.g., data['beeradvocate_beers']

FileNotFoundError: [Errno 2] No such file or directory: 'BeerAdvocate/beers.csv'

In [None]:
# Identifier column used to detect duplicates in each kind of table
id_columns = {'users': 'user_id', 'beers': 'beer_id', 'breweries': 'id'}

# Number of rows whose identifier repeats an earlier row, per dataset
duplicate_counts = {
    f"{site}_{table}": data[f"{site}_{table}"].duplicated(subset=id_column).sum()
    for table, id_column in id_columns.items()
    for site in ('ratebeer', 'beeradvocate')
}

# Report one line per dataset
for dataset, count in duplicate_counts.items():
    print(f"{dataset} has {count} duplicate(s).")


In [None]:
# Keep only the first row seen for each user_id in the RateBeer users table
data['ratebeer_users'] = data['ratebeer_users'].loc[
    lambda users: ~users.duplicated(subset='user_id')
]

In [None]:
# Per-column count of missing values for every loaded dataset
dataset_names = [
    'ratebeer_users',
    'beeradvocate_users',
    'ratebeer_beers',
    'beeradvocate_beers',
    'ratebeer_breweries',
    'beeradvocate_breweries',
]
missing_values = {name: data[name].isna().sum() for name in dataset_names}

for dataset, missing in missing_values.items():
    print(f"\nMissing values in {dataset}:")
    print(missing)

In [None]:
# Column order shared by data['ratebeer_users'] and data['beeradvocate_users']
new_order = ['user_id', 'user_name', 'nbr_ratings', 'nbr_reviews', 'joined', 'location']

# Attach the per-user review counts to the RateBeer users. The guard makes the
# cell safe to re-run: merging a second time would create nbr_reviews_x /
# nbr_reviews_y and the column selection below would raise a KeyError.
# NOTE(review): how='inner' drops users that are absent from
# user_review_summary -- confirm this is intended.
if 'nbr_reviews' not in data['ratebeer_users'].columns:
    data['ratebeer_users'] = data['ratebeer_users'].merge(
        user_review_summary[['user_id', 'nbr_reviews']], on='user_id', how='inner')

# Reorder columns; .copy() yields independent frames so that later cells can
# add or overwrite columns without triggering SettingWithCopyWarning.
data['ratebeer_users'] = data['ratebeer_users'][new_order].copy()
data['beeradvocate_users'] = data['beeradvocate_users'][new_order].copy()

In [None]:
# Total of the nbr_reviews column in each users table
beeradvocate_sum, ratebeer_sum = (
    data[users_key]['nbr_reviews'].sum()
    for users_key in ('beeradvocate_users', 'ratebeer_users')
)

# Show both totals
print(f"Total number of reviews in beeradvocate users: {beeradvocate_sum}")
print(f"Total number of reviews in ratebeer users: {ratebeer_sum}")

In [None]:
import plotly.graph_objects as go

# Sum of nbr_reviews per user_id on each site
beeradvocate_user_reviews = data['beeradvocate_users'].groupby('user_id')['nbr_reviews'].sum()
ratebeer_user_reviews = data['ratebeer_users'].groupby('user_id')['nbr_reviews'].sum()

# Same numbers as two-column frames: user_id, total_reviews
beeradvocate_user_reviews_df = beeradvocate_user_reviews.reset_index(name='total_reviews')
ratebeer_user_reviews_df = ratebeer_user_reviews.reset_index(name='total_reviews')

# Overlaid histograms, one trace per site
fig = go.Figure()
for trace_name, per_user in (
    ('BeerAdvocate User Reviews', beeradvocate_user_reviews_df),
    ('Ratebeer User Reviews', ratebeer_user_reviews_df),
):
    fig.add_trace(
        go.Histogram(
            x=per_user['total_reviews'],
            nbinsx=20,
            name=trace_name,
            opacity=0.7,
        )
    )

fig.update_layout(
    title_text='Comparison Of Total Reviews Per User',
    title_x=0.5,
    title_xanchor='center',
    xaxis_title='Sum Of Reviews Per User',
    xaxis_range=[0, 25000],
    yaxis_title='Frequency (Log Scale)',
    yaxis_type='log',
    # on a log axis the range is given in powers of ten
    yaxis_range=[0, 5],
    yaxis_showgrid=True,
    barmode='overlay',
    template='plotly_white',
)

# Applied last, so this opacity is the one both traces end up with
fig.update_traces(opacity=0.5)

fig.show()

In [None]:
from plotly.subplots import make_subplots

def classify_location(location):
    """Bucket a location into 'United States' or 'Other Countries'.

    Returns None when `location` is null according to `pd.isnull`; otherwise
    the result depends on whether the text contains "United States".
    """
    if pd.isnull(location):
        return None
    return "United States" if "United States" in location else "Other Countries"

# Tag every user with a coarse location category (adds a column to each table)
for users_key in ('beeradvocate_users', 'ratebeer_users'):
    data[users_key]['location_category'] = data[users_key]['location'].apply(classify_location)

# Users per category on BeerAdvocate, keeping only rows with a known category
beeradvocate_location_counts = data['beeradvocate_users']['location_category'].value_counts().reset_index()
beeradvocate_location_counts.columns = ['Location', 'Count']
beeradvocate_location_counts = beeradvocate_location_counts[
    beeradvocate_location_counts['Location'].notna()
]

# Users per category on RateBeer, keeping only rows with a known category
ratebeer_location_counts = data['ratebeer_users']['location_category'].value_counts().reset_index()
ratebeer_location_counts.columns = ['Location', 'Count']
ratebeer_location_counts = ratebeer_location_counts[
    ratebeer_location_counts['Location'].notna()
]

# Common y-axis upper bound so the two panels are directly comparable
max_y = max(
    site_counts['Count'].max()
    for site_counts in (beeradvocate_location_counts, ratebeer_location_counts)
)

# Bar color per location category
color_map = {"United States": "#87CEFA", "Other Countries": "pink"}

fig = make_subplots(rows=1, cols=2, subplot_titles=("BeerAdvocate Users", "Ratebeer Users"))

# One bar trace and one pair of axis settings per subplot column
panels = (
    ("BeerAdvocate Users", beeradvocate_location_counts),
    ("Ratebeer Users", ratebeer_location_counts),
)
for col_index, (panel_name, panel_counts) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(
            x=panel_counts['Location'],
            y=panel_counts['Count'],
            name=panel_name,
            marker_color=[color_map[category] for category in panel_counts['Location']],
        ),
        row=1, col=col_index,
    )
    fig.update_yaxes(range=[0, max_y], title_text="Number of Users", row=1, col=col_index)
    fig.update_xaxes(title_text="Location Category", row=1, col=col_index)

fig.update_layout(
    title_text="Comparison of User Locations In Different Sites",
    title_x=0.5,
    title_xanchor="center",
    showlegend=False,
    template="plotly_white",
    height=500,
    width=1000,
)

fig.show()


In [None]:
# Function
def classify_location(location):
    """Map a location to "United States", "Other Countries", or None if null."""
    if not pd.isnull(location):
        if "United States" in location:
            return "United States"
        return "Other Countries"
    # null input: no category
    return None

# Add the location category column to both users tables
for users_key in ('beeradvocate_users', 'ratebeer_users'):
    data[users_key]['location_category'] = data[users_key]['location'].apply(classify_location)

# Count users per category on each site and keep only known categories
beeradvocate_location_counts = data['beeradvocate_users']['location_category'].value_counts().reset_index()
beeradvocate_location_counts.columns = ['Location', 'Count']
beeradvocate_location_counts = beeradvocate_location_counts[
    beeradvocate_location_counts['Location'].notna()
]

ratebeer_location_counts = data['ratebeer_users']['location_category'].value_counts().reset_index()
ratebeer_location_counts.columns = ['Location', 'Count']
ratebeer_location_counts = ratebeer_location_counts[
    ratebeer_location_counts['Location'].notna()
]

# Upper y limit: the BeerAdvocate maximum plus 20000, or the RateBeer maximum if larger
beeradvocate_top = beeradvocate_location_counts['Count'].max() + 20000
ratebeer_top = ratebeer_location_counts['Count'].max()
max_y = max(beeradvocate_top, ratebeer_top)

# Bar color per location category
color_map = {"United States": "#87CEFA", "Other Countries": "pink"}

fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

# Draw one panel per site; both share the y axis
panels = (
    ("BeerAdvocate Users", beeradvocate_location_counts),
    ("Ratebeer Users", ratebeer_location_counts),
)
for ax, (panel_title, panel_counts) in zip(axes, panels):
    ax.bar(
        panel_counts['Location'],
        panel_counts['Count'],
        color=[color_map[category] for category in panel_counts['Location']],
    )
    ax.set_title(panel_title)
    ax.set_xlabel("Location Category")

# The y label and limits are set on the left panel only
axes[0].set_ylabel("Number of Users")
axes[0].set_ylim(0, max_y)

fig.suptitle("Comparison of User Locations In Different Sites", fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.95])

plt.show()

In [None]:
# Keep only BeerAdvocate breweries whose nbr_beers is not 0. The explicit
# .copy() makes the result an independent frame, so later cells that add a
# 'location_category' column to it do not raise SettingWithCopyWarning.
has_beers = data['beeradvocate_breweries']['nbr_beers'] != 0
data['beeradvocate_breweries'] = data['beeradvocate_breweries'].loc[has_beers].copy()
data['beeradvocate_breweries']

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def classify_location(location):
    """Return the coarse location category for one location value.

    Null values (per `pd.isnull`) give None. Any other value is classified by
    whether it contains the substring "United States".
    """
    if pd.isnull(location):
        return None
    mentions_us = "United States" in location
    return ("Other Countries", "United States")[mentions_us]

# Add the location category column to both breweries tables
for breweries_key in ('beeradvocate_breweries', 'ratebeer_breweries'):
    data[breweries_key]['location_category'] = data[breweries_key]['location'].apply(classify_location)

# Sum of nbr_beers per location category on BeerAdvocate, known categories only
beeradvocate_beer_totals = data['beeradvocate_breweries'].groupby('location_category')['nbr_beers'].sum().reset_index()
beeradvocate_beer_totals.columns = ['Location', 'Total Beers']
beeradvocate_beer_totals = beeradvocate_beer_totals[
    beeradvocate_beer_totals['Location'].notna()
]

# Sum of nbr_beers per location category on RateBeer, known categories only
ratebeer_beer_totals = data['ratebeer_breweries'].groupby('location_category')['nbr_beers'].sum().reset_index()
ratebeer_beer_totals.columns = ['Location', 'Total Beers']
ratebeer_beer_totals = ratebeer_beer_totals[
    ratebeer_beer_totals['Location'].notna()
]

# Common y-axis upper bound for the two panels
max_y = max(
    site_totals['Total Beers'].max()
    for site_totals in (beeradvocate_beer_totals, ratebeer_beer_totals)
)

# Bar color per location category
color_map = {"United States": "#87CEFA", "Other Countries": "pink"}

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("BeerAdvocate Breweries (Total Beers)", "Ratebeer Breweries (Total Beers)"),
)

# One bar trace and one pair of axis settings per subplot column
panels = (
    ("BeerAdvocate Breweries", beeradvocate_beer_totals),
    ("Ratebeer Breweries", ratebeer_beer_totals),
)
for col_index, (panel_name, panel_totals) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(
            x=panel_totals['Location'],
            y=panel_totals['Total Beers'],
            name=panel_name,
            marker_color=[color_map[category] for category in panel_totals['Location']],
        ),
        row=1, col=col_index,
    )
    fig.update_yaxes(range=[0, max_y], title_text="Total Number of Beers", row=1, col=col_index)
    fig.update_xaxes(title_text="Location Category", row=1, col=col_index)

fig.update_layout(
    title_text="Comparison of Total Beers by Location in Different Sites",
    title_x=0.5,
    title_xanchor="center",
    showlegend=False,
    template="plotly_white",
    height=500,
    width=1000,
)

fig.show()

In [None]:
# Function
def classify_location(location):
    """Classify a location as United States, Other Countries, or None if null."""
    if pd.isnull(location):
        category = None
    elif "United States" not in location:
        category = "Other Countries"
    else:
        category = "United States"
    return category

# Add the location category column to both breweries tables
for breweries_key in ('beeradvocate_breweries', 'ratebeer_breweries'):
    data[breweries_key]['location_category'] = data[breweries_key]['location'].apply(classify_location)

# Sum of nbr_beers per location category on each site, known categories only
beeradvocate_beer_totals = data['beeradvocate_breweries'].groupby('location_category')['nbr_beers'].sum().reset_index()
beeradvocate_beer_totals.columns = ['Location', 'Total Beers']
beeradvocate_beer_totals = beeradvocate_beer_totals[
    beeradvocate_beer_totals['Location'].notna()
]

ratebeer_beer_totals = data['ratebeer_breweries'].groupby('location_category')['nbr_beers'].sum().reset_index()
ratebeer_beer_totals.columns = ['Location', 'Total Beers']
ratebeer_beer_totals = ratebeer_beer_totals[
    ratebeer_beer_totals['Location'].notna()
]

# Upper y limit: the BeerAdvocate maximum plus 50000, or the RateBeer maximum if larger
beeradvocate_top = beeradvocate_beer_totals['Total Beers'].max() + 50000
ratebeer_top = ratebeer_beer_totals['Total Beers'].max()
max_y = max(beeradvocate_top, ratebeer_top)

# Bar color per location category
color_map = {
    "United States": "#2B65EC",  # blue
    "Other Countries": "pink",
}

fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

# Draw one panel per site; both share the y axis
panels = (
    ("BeerAdvocate Breweries (Total Beers)", beeradvocate_beer_totals),
    ("Ratebeer Breweries (Total Beers)", ratebeer_beer_totals),
)
for ax, (panel_title, panel_totals) in zip(axes, panels):
    ax.bar(
        panel_totals['Location'],
        panel_totals['Total Beers'],
        color=[color_map[category] for category in panel_totals['Location']],
    )
    ax.set_title(panel_title)
    ax.set_xlabel("Location Category")

# The y label and limits are set on the left panel only
axes[0].set_ylabel("Total Number of Beers")
axes[0].set_ylim(0, max_y)

fig.suptitle("Comparison of Total Beers by Location in Different Sites", fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.95])

plt.show()

### Dataset Description: Users Data

These two datasets contain user information from two beer rating and review platforms, RateBeer and BeerAdvocate. Below is an overview of the columns in these user data tables:

- **nbr_ratings**: The total number of ratings each user has given across various beers, reflecting their level of engagement with rating on the platform.

- **nbr_reviews**: The total number of detailed reviews a user has contributed, offering qualitative feedback on their beer experiences.

- **user_id**: A unique identifier for each user. In the BeerAdvocate dataset, `user_id` typically includes both the username and a numerical suffix to differentiate similar usernames. In the RateBeer dataset, `user_id` is usually represented as a standalone numerical code, unique to each user.

- **user_name**: The display name or alias chosen by the user, used for interaction within the platform.

- **joined**: The date and time when the user joined the platform, stored as a Unix timestamp in seconds, which provides a measure of how long the user has been active.

- **location**: The user’s reported location, which often includes the country and, in some cases, the state or province. This can help in analyzing geographic trends in user activity or preferences.

In [None]:
# Column dtypes of the RateBeer users table, then of the BeerAdvocate one
for users_key in ('ratebeer_users', 'beeradvocate_users'):
    print(data[users_key].dtypes)

In [None]:
# Text columns to convert to the pandas 'string' dtype in each users table
string_columns = {
    'ratebeer_users': ['user_name', 'location'],
    'beeradvocate_users': ['user_id', 'user_name', 'location'],
}

for users_key, text_columns in string_columns.items():
    # Cast the text columns to 'string'
    users = data[users_key].astype({column: 'string' for column in text_columns})
    # Interpret 'joined' as a Unix timestamp in seconds and convert it to datetime
    users = users.assign(joined=pd.to_datetime(users['joined'], unit='s'))
    # Drop rows with a missing location or join date, since they cannot be filled in
    data[users_key] = users.dropna(subset=['location', 'joined'])

# A missing BeerAdvocate 'user_name' is replaced by the same row's 'user_id'
data['beeradvocate_users'] = data['beeradvocate_users'].assign(
    user_name=lambda users: users['user_name'].fillna(users['user_id'])
)

### Dataset Description: Breweries Data

These datasets contain information about breweries listed on two different beer rating platforms: BeerAdvocate and RateBeer.

- **id**: A unique identifier for each brewery. The ID is specific to each platform and used to distinguish individual breweries in the dataset.

- **location**: The geographic location of the brewery, typically specifying the country, and sometimes the region or state. This information provides insight into the distribution of breweries globally and may be used to explore regional brewing patterns or trends.

- **name**: The name of the brewery. This is the recognizable label associated with each brewery, often reflecting the brand or business name.

- **nbr_beers**: The number of unique beers produced by each brewery. This column offers insight into the scale or variety of production for each brewery, with higher values indicating a larger selection of beers.

In [None]:
# Per-beer review counts, read from a separate summary file
beer_reviews_summary = pd.read_csv('beer_reviews_summary.csv')

# Attach nbr_reviews to the RateBeer beers table. The guard keeps the cell safe
# to re-run: merging twice would produce nbr_reviews_x / nbr_reviews_y and the
# later data['ratebeer_beers']['nbr_reviews'] lookups would raise a KeyError.
# NOTE(review): how='inner' drops beers that are absent from the summary --
# confirm this is intended.
if 'nbr_reviews' not in data['ratebeer_beers'].columns:
    data['ratebeer_beers'] = data['ratebeer_beers'].merge(
        beer_reviews_summary[['beer_id', 'nbr_reviews']], on='beer_id', how='inner')

In [None]:
import plotly.express as px

# Sum of nbr_reviews per beer_id in the BeerAdvocate beers table
beer_reviews_sum = data['beeradvocate_beers'].groupby('beer_id')['nbr_reviews'].sum()
beer_reviews_sum_df = beer_reviews_sum.reset_index(name='total_reviews')

# Histogram of the per-beer totals
histogram_options = dict(
    x='total_reviews',
    nbins=20,
    title='Distribution of Total Reviews per Beer (Logarithmic Y-Axis)',
    labels={'total_reviews': 'Sum of Reviews (nbr_reviews)', 'count': 'Frequency'},
    opacity=0.7,
)
fig = px.histogram(beer_reviews_sum_df, **histogram_options)

# Layout: centered title, log-scaled y axis, small gap between bars
fig.update_layout(
    title_text='Distribution of Total Reviews per Beer',
    title_x=0.5,
    title_xanchor='center',
    xaxis_title='Sum of Reviews',
    yaxis_type='log',
    yaxis_title='Frequency (Log Scale)',
    template='plotly_white',
    bargap=0.1,
)

fig.show()

In [None]:
# Sum of nbr_reviews per beer_id in the BeerAdvocate beers table
beer_reviews_sum = data['beeradvocate_beers'].groupby('beer_id')['nbr_reviews'].sum()
beer_reviews_sum_df = beer_reviews_sum.reset_index(name='total_reviews')

total_reviews = beer_reviews_sum_df['total_reviews']
bins = 20

# Histogram drawn through the explicit Axes interface
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
counts, bin_edges, patches = hist_ax.hist(
    total_reviews,
    bins=bins,
    alpha=0.7,
    edgecolor='black',
    color='pink',
)

# Log-scaled y axis
hist_ax.set_yscale('log')

hist_ax.set_title('Distribution of Total Reviews per Beer', fontsize=14)
hist_ax.set_xlabel('Sum of Reviews', fontsize=12)
hist_ax.set_ylabel('Frequency (Log Scale)', fontsize=12)

hist_fig.tight_layout()
plt.show()

In [None]:
# Sum of nbr_reviews per beer_id in the RateBeer beers table
ratebeer_beers = data['ratebeer_beers']
beer_reviews_sum = ratebeer_beers.groupby('beer_id')['nbr_reviews'].sum()
beer_reviews_sum_df = beer_reviews_sum.reset_index(name='total_reviews')

# Histogram of the per-beer totals
fig = px.histogram(
    beer_reviews_sum_df,
    x='total_reviews',
    nbins=20,
    title='Distribution of Total Reviews per Beer',
    labels={'total_reviews': 'Sum of Reviews per Beer', 'count': 'Frequency'},
    opacity=0.7,
)

# Layout settings collected first, then applied in one call
layout_options = {
    'title': {'text': 'Distribution of Total Reviews per Beer', 'x': 0.5, 'xanchor': 'center'},
    'xaxis_title': 'Sum of Reviews',
    'yaxis': {'type': 'log', 'title': 'Frequency (Log Scale)'},
    'template': 'plotly_white',
    'bargap': 0.1,
}
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Sum of nbr_reviews per beer_id in the RateBeer beers table
beer_reviews_sum = data['ratebeer_beers'].groupby('beer_id')['nbr_reviews'].sum()
beer_reviews_sum_df = beer_reviews_sum.reset_index(name='total_reviews')

total_reviews = beer_reviews_sum_df['total_reviews']
bins = 20

# Histogram drawn through the explicit Axes interface
hist_fig, hist_ax = plt.subplots(figsize=(10, 4))
hist_style = dict(alpha=0.7, edgecolor='black', color='#FFC0CB')
counts, bin_edges, patches = hist_ax.hist(total_reviews, bins=bins, **hist_style)

# Log-scaled y axis
hist_ax.set_yscale('log')

hist_ax.set_title('Distribution of Total Reviews per Beer', fontsize=14)
hist_ax.set_xlabel('Sum of Reviews per Beer', fontsize=12)
hist_ax.set_ylabel('Frequency (Log Scale)', fontsize=12)

hist_fig.tight_layout()
plt.show()

### Dataset Description: Beers Data for BeerAdvocate and RateBeer

These datasets contain detailed information on individual beers from two platforms, BeerAdvocate and RateBeer.

#### Shared Columns
- **beer_id** and **beer_name**: Unique identifier and name of each beer.
- **brewery_id** and **brewery_name**: Identifier and name of the brewery that produces the beer.
- **style**: The type or category of the beer (e.g., IPA, Stout).
- **nbr_ratings** and **nbr_reviews**: Total user ratings and reviews the beer has received.
- **avg**: Average rating given by users.
- **abv**: Alcohol by volume percentage, representing the beer’s strength.
- **avg_computed** and **zscore**: Adjusted or standardized ratings, with `zscore` indicating how the rating compares to the dataset average.
- **nbr_matched_valid_ratings** and **avg_matched_valid_ratings**: Count and average of validated ratings.

#### Unique to RateBeer
- **overall_score** and **style_score**: Scores reflecting the beer's overall popularity and alignment with its style.

#### Unique to BeerAdvocate
- **ba_score** and **bros_score**: Additional rating scores, with `ba_score` representing the BeerAdvocate score and `bros_score` possibly reflecting an expert or curated rating.

In [None]:
# Sum of nbr_reviews per beer_id on each site
ratebeer_reviews_sum = data['ratebeer_beers'].groupby('beer_id')['nbr_reviews'].sum()
beeradvocate_reviews_sum = data['beeradvocate_beers'].groupby('beer_id')['nbr_reviews'].sum()

# Same numbers as two-column frames: beer_id, total_reviews
ratebeer_reviews_sum_df = ratebeer_reviews_sum.reset_index(name='total_reviews')
beeradvocate_reviews_sum_df = beeradvocate_reviews_sum.reset_index(name='total_reviews')

# Overlaid histograms, one trace per site
fig = go.Figure()
for trace_name, per_beer in (
    ('Ratebeer Reviews', ratebeer_reviews_sum_df),
    ('Beeradvocate Reviews', beeradvocate_reviews_sum_df),
):
    fig.add_trace(
        go.Histogram(
            x=per_beer['total_reviews'],
            nbinsx=20,
            name=trace_name,
            opacity=0.7,
        )
    )

fig.update_layout(
    title='Comparison of Total Reviews per Beer',
    xaxis_title='Sum of Reviews',
    yaxis_title='Frequency (Log Scale)',
    yaxis_type='log',
    barmode='overlay',
    template='plotly_white',
)

# Applied last, so this opacity is the one both traces end up with
fig.update_traces(opacity=0.5)

fig.show()

In [None]:
# Sum of nbr_reviews per beer_id on each site
ratebeer_reviews_sum = data['ratebeer_beers'].groupby('beer_id')['nbr_reviews'].sum()
beeradvocate_reviews_sum = data['beeradvocate_beers'].groupby('beer_id')['nbr_reviews'].sum()

# Same numbers as two-column frames: beer_id, total_reviews
ratebeer_reviews_sum_df = ratebeer_reviews_sum.reset_index(name='total_reviews')
beeradvocate_reviews_sum_df = beeradvocate_reviews_sum.reset_index(name='total_reviews')

ratebeer_reviews = ratebeer_reviews_sum_df['total_reviews']
beeradvocate_reviews = beeradvocate_reviews_sum_df['total_reviews']

# Both histograms on the same axes, RateBeer drawn first
compare_fig, compare_ax = plt.subplots(figsize=(10, 4))
for series, series_label, series_color in (
    (ratebeer_reviews, 'Ratebeer Reviews', 'blue'),
    (beeradvocate_reviews, 'Beeradvocate Reviews', 'pink'),
):
    compare_ax.hist(
        series,
        bins=20,
        alpha=0.5,
        label=series_label,
        color=series_color,
        edgecolor='black',
    )

# Log-scaled y axis
compare_ax.set_yscale('log')

compare_ax.set_title('Comparison of Total Reviews per Beer', fontsize=14)
compare_ax.set_xlabel('Sum of Reviews', fontsize=12)
compare_ax.set_ylabel('Frequency (Log Scale)', fontsize=12)
compare_ax.legend(loc='upper right', fontsize=10)

compare_fig.tight_layout()
plt.show()

In [None]:
import dash
from dash import html, dash_table

# Hand-written sample: five beers, each with five associated words
sample_data = {
    # "Major Tom's": the original literal had a mis-encoded character in place of the apostrophe
    "Beer Name": ["Jacobsen Original Dark Lager", "Left Hand Widdershins - Oak Aged", "Obolon Premium", "Fort Collins Major Tom's Pomegranate Wheat", "Great Lakes Burning River Pale Ale"],
    "Word 1": ["Annoy", "Barleywine", "Bitternes", "Avoid", "artificial"],
    "Word 2": ["Boring", "Chemical", "Blandness", "Citric", "Awful"],
    "Word 3": ["Chalk", "Fake", "Carbon", "Drain", "Burning"],
    "Word 4": ["Corny", "Cleanser", "Cheap", "Horrible", "Citric"],
    "Word 5": ["Dirt", "Cork", "Crap", "Insane", "Flaw"]
    # Further word columns, currently not displayed:
    #"Word 6": ["failure", "liver", "disapates", "nasty", "garbage"],
    #"Word 7": ["lifeless", "nasty", "drain", "rotten", "moss"],
    #"Word 8": ["graininess", "obnoxious", "foul", "sludge", "swamp"],
    #"Word 9": ["metal", "repulsive", "skunky", "steam", "woodsy"],
    #"Word 10": ["spoil", "poor", "shitty", "unfair", "yellow"]
}

df = pd.DataFrame(sample_data)

app = dash.Dash(__name__)

# Background color for each word column, in order: a red that fades to white
colors = ["rgba(255, 128, 128, 1)", "rgba(255, 160, 160, 1)", "rgba(255, 192, 192, 1)", "rgba(255, 224, 224, 1)", "rgba(255, 255, 255, 1)"]

# Pair word columns with colors by position instead of a hard-coded range(5),
# so the styling follows the data if columns or colors are added or removed.
word_columns = [col for col in df.columns if col.startswith("Word ")]

app.layout = html.Div([
    html.H2(" Beer Table", style={"text-align": "center"}),
    dash_table.DataTable(
        id='beer-table',
        columns=[{"name": col, "id": col} for col in df.columns],
        data=df.to_dict('records'),
        style_table={'margin': 'auto'},
        style_cell={'textAlign': 'center', 'padding': '10px'},
        style_header={'backgroundColor': '#D69A3B', 'color': 'black', 'fontWeight': 'bold'},
        style_data_conditional=[
            {
                "if": {"column_id": word_column},
                "backgroundColor": background
            } for word_column, background in zip(word_columns, colors)
        ],
        style_cell_conditional=[
            {"if": {"column_id": "Beer Name"}, "width": "150px", "textAlign": "center"}
        ],
    ),
])

if __name__ == '__main__':
    # Newer Dash releases expose `run` and no longer provide `run_server`;
    # fall back to `run_server` only on versions that lack `run`.
    launch = getattr(app, 'run', None) or app.run_server
    launch(debug=True)
