In [None]:
import pandas as pd
import utils
import importlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
importlib.reload(utils)

In [None]:

# Load the processed RateBeer ratings (path is relative to this notebook).
total_rb = pd.read_csv('../../data/Processed/RateBeer/ratings_processed.csv')

# Step 1: Count reviews per (beer, year, user location) group
total_rb_counts = total_rb.groupby(['beer_id', 'year', 'location_user']).size().reset_index(name='review_count')

# Step 2: Attach the group size to every review and keep only groups with at least 5 reviews
total_rb_with_counts = pd.merge(total_rb, total_rb_counts, on=['beer_id', 'year', 'location_user'])
filtered_total_rb = total_rb_with_counts[total_rb_with_counts['review_count'] >= 5]

# Step 3: Calculate the weighted average rating for each group.
# review_count is one of the grouping keys, so the weight is constant inside a
# group and the result matches the plain mean rating up to float rounding.
# NOTE(review): the lambda reads the grouping column 'review_count' inside
# groupby.apply; recent pandas releases deprecate passing grouping columns to
# the applied function - confirm against the installed pandas version.
weighted_avg_ratings_rb = filtered_total_rb.groupby(['beer_id', 'beer_name', 'year', 'review_count', 'location_user']).apply(
    lambda x: (x['rating'] * x['review_count']).sum() / x['review_count'].sum()
).reset_index(name='weighted_avg_rating')

# Step 4: Rank beers inside every (year, user location) pair.
# Rank 1 is the highest weighted average; ties go to the beer with more reviews.
# cumcount() keeps the original index, so the assignment aligns row by row.
weighted_avg_ratings_rb['rank'] = (
    weighted_avg_ratings_rb.sort_values(['year', 'location_user', 'weighted_avg_rating', 'review_count'],
                                        ascending=[True, True, False, False])
    .groupby(['year', 'location_user'])
    .cumcount() + 1
)

# Check results
weighted_avg_ratings_rb.head()


In [None]:
# Keep the rank-1 beer of every (year, user location) pair ...
top_ranked_beers_rb = weighted_avg_ratings_rb.loc[weighted_avg_ratings_rb['rank'] == 1]

# ... and lay its rating out as a year x location grid for the heatmap.
pivot_data_rb = pd.pivot_table(
    top_ranked_beers_rb,
    index='year',
    columns='location_user',
    values='weighted_avg_rating',
    aggfunc='mean',
)


In [None]:

chunk_size = 14

# Ceiling division: number of heatmaps needed to show every location column.
num_chunks = (len(pivot_data_rb.columns) + chunk_size - 1) // chunk_size

for i in range(num_chunks):
    # Select the columns that belong to this block
    start, end = i * chunk_size, (i + 1) * chunk_size
    chunk = pivot_data_rb.iloc[:, start:end]

    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(chunk, annot=True, cmap='coolwarm', linewidths=0.5)
    ax.set_title(f'Average Beer Ratings by Region and Year, RateBeer (Bloc {i+1})')
    ax.set_xlabel('Region')
    ax.set_ylabel('Year')
    plt.show()


In [None]:
# Number of reviews each beer received per year and per user region.
review_counts_us_rb = (
    total_rb
    .groupby(['beer_id', 'year', 'location_region_user'])
    .size()
    .reset_index(name='review_count')
)

# Attach that count to every review, then drop groups with fewer than 5 reviews.
reviews_with_counts_us_rb = total_rb.merge(
    review_counts_us_rb, on=['beer_id', 'year', 'location_region_user']
)
filtered_reviews_us_rb = reviews_with_counts_us_rb.loc[
    reviews_with_counts_us_rb['review_count'] >= 5
]

# One rating per (beer, year, region). review_count is itself a grouping key,
# so every row of a group carries the same weight.
weighted_avg_ratings_us_rb = (
    filtered_reviews_us_rb
    .groupby(['beer_id', 'beer_name', 'year', 'review_count', 'location_region_user'])
    .apply(lambda grp: (grp['rating'] * grp['review_count']).sum() / grp['review_count'].sum())
    .reset_index(name='weighted_avg_rating')
)

# Rank inside each (year, region): best rating first, more reviews wins ties.
weighted_avg_ratings_us_rb['rank'] = (
    weighted_avg_ratings_us_rb
    .sort_values(
        by=['year', 'location_region_user', 'weighted_avg_rating', 'review_count'],
        ascending=[True, True, False, False],
    )
    .groupby(['year', 'location_region_user'])
    .cumcount()
    .add(1)
)


In [None]:
# Rank-1 beer of every (year, user region) pair.
top_ranked_beers_us_rb = weighted_avg_ratings_us_rb.loc[weighted_avg_ratings_us_rb['rank'] == 1]

# Year x region grid of the top beer's rating, used by the heatmaps.
pivot_data_us_rb = pd.pivot_table(
    top_ranked_beers_us_rb,
    index='year',
    columns='location_region_user',
    values='weighted_avg_rating',
    aggfunc='mean',
)



In [None]:

# Plot the regional pivot in blocks of `chunk_size` columns so each heatmap
# stays readable.
chunk_size = 14
# Number of blocks = ceil(number of region columns / chunk_size).
# Fixed: the pivot built above is `pivot_data_us_rb` (was misspelled `pivot_data_us_br`).
num_chunks = len(pivot_data_us_rb.columns) // chunk_size + (1 if len(pivot_data_us_rb.columns) % chunk_size != 0 else 0)

for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    chunk = pivot_data_us_rb.iloc[:, start:end]  # columns of this block

    plt.figure(figsize=(12, 8))
    sns.heatmap(chunk, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title(f'Average Beer Ratings by Region US and Year, RateBeer (Bloc {i+1})')
    plt.xlabel('Region')
    plt.ylabel('Year')
    plt.show()

In [None]:
# The 10 beer names that appear in the most (beer, year, location) rows of the
# ranked table.
top_beers_rb = weighted_avg_ratings_rb['beer_name'].value_counts().nlargest(10).index
# Fixed: filter with `top_beers_rb` (the name `top_beers` was never defined).
time_series_data_rb = weighted_avg_ratings_rb[weighted_avg_ratings_rb['beer_name'].isin(top_beers_rb)]

# Average the per-location ratings into one value per (year, beer).
time_series_data_rb = time_series_data_rb.groupby(['year', 'beer_name']).agg({'weighted_avg_rating': 'mean'}).reset_index()

plt.figure(figsize=(14, 7))
sns.lineplot(data=time_series_data_rb, x='year', y='weighted_avg_rating', hue='beer_name', marker='o')
plt.title('Average Rating of Selected Beers Over Time, RateBeer')
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.legend(title='Beer Name')
plt.show()


In [None]:
# The 10 beer names with the most rows among 'United States' users.
# This reassigns `top_beers_rb` and `time_series_data_rb` from the previous cell.
top_beers_rb = weighted_avg_ratings_rb[weighted_avg_ratings_rb['location_user']=='United States']['beer_name'].value_counts().nlargest(10).index
# Fixed: filter with `top_beers_rb` (the name `top_beers` was never defined).
# NOTE(review): the selection is restricted to United States rows, but the
# series below averages over every user location - confirm this is intended.
time_series_data_rb = weighted_avg_ratings_rb[weighted_avg_ratings_rb['beer_name'].isin(top_beers_rb)]

# Average the per-location ratings into one value per (year, beer).
time_series_data_rb = time_series_data_rb.groupby(['year', 'beer_name']).agg({'weighted_avg_rating': 'mean'}).reset_index()

plt.figure(figsize=(14, 7))
sns.lineplot(data=time_series_data_rb, x='year', y='weighted_avg_rating', hue='beer_name', marker='o')
plt.title('Average Rating of Selected in US Beers Over Time, RateBeer')
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.legend(title='Beer Name')
plt.show()


In [None]:

# The 10 beer names with the most rows among users whose region is 'California'.
top_beers_cali_rb = weighted_avg_ratings_us_rb[weighted_avg_ratings_us_rb['location_region_user']=='California']['beer_name'].value_counts().nlargest(10).index
# Fixed: filter with `top_beers_cali_rb` (the name `top_beers` was never defined).
# NOTE(review): only the beer selection is restricted to California; the series
# below averages over every region - confirm this matches the plot title.
time_series_data_cali_rb = weighted_avg_ratings_us_rb[weighted_avg_ratings_us_rb['beer_name'].isin(top_beers_cali_rb)]

# Average the per-region ratings into one value per (year, beer).
time_series_data_cali_rb = time_series_data_cali_rb.groupby(['year', 'beer_name']).agg({'weighted_avg_rating': 'mean'}).reset_index()

plt.figure(figsize=(14, 7))
sns.lineplot(data=time_series_data_cali_rb, x='year', y='weighted_avg_rating', hue='beer_name', marker='o')
plt.title('Average Rating of Selected Beers in California Over Time, RateBeer')
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.legend(title='Beer Name')
plt.show()

In [None]:
# Year x location grid holding the name(s) of the rank-1 beer; several names
# landing in the same cell are joined with ', '.
pivot_data_name_rb = pd.pivot_table(
    top_ranked_beers_rb,
    index='year',
    columns='location_user',
    values='beer_name',
    aggfunc=lambda names: ', '.join(names.dropna()),
)

chunk_size = 4
# Ceiling division: number of 4-column blocks needed to cover the pivot.
num_chunks = -(-pivot_data_name_rb.shape[1] // chunk_size)

for i in range(num_chunks):
    start, end = i * chunk_size, (i + 1) * chunk_size
    chunk = pivot_data_name_rb.iloc[:, start:end]

    plt.figure(figsize=(10, 6))
    # The colour only encodes "cell is empty" (1) versus "has a name" (0);
    # the beer names themselves are drawn as the annotations.
    sns.heatmap(
        data=chunk.isna().astype(int),
        annot=chunk,
        fmt='',
        cmap='coolwarm',
        linewidths=0.5,
        cbar=False,
    )
    plt.title(f'Beer Names by Region and Year , RateBeer(Bloc {i+1})')
    plt.xlabel('Region')
    plt.ylabel('Year')
    plt.xticks(rotation=45, ha='right', fontsize=8)
    plt.yticks(fontsize=8)
    plt.show()


In [None]:
# Number of reviews each brewery received per year and per user location.
review_counts_br_rb = (
    total_rb
    .groupby(['brewery_id', 'year', 'location_user'])
    .size()
    .reset_index(name='review_count')
)

# Attach the count to every review and keep groups with at least 5 reviews.
reviews_with_counts_br_rb = total_rb.merge(
    review_counts_br_rb, on=['brewery_id', 'year', 'location_user']
)
filtered_reviews_br_rb = reviews_with_counts_br_rb.loc[
    reviews_with_counts_br_rb['review_count'] >= 5
]

# One rating per (brewery, year, location); the weight (review_count) is a
# grouping key and therefore constant inside each group.
weighted_avg_ratings_br_rb = (
    filtered_reviews_br_rb
    .groupby(['brewery_id', 'brewery_name', 'year', 'review_count', 'location_user'])
    .apply(lambda grp: (grp['rating'] * grp['review_count']).sum() / grp['review_count'].sum())
    .reset_index(name='weighted_avg_rating')
)

# Rank breweries inside each (year, location): best rating first, ties broken
# by the larger review count.
weighted_avg_ratings_br_rb['rank'] = (
    weighted_avg_ratings_br_rb
    .sort_values(
        by=['year', 'location_user', 'weighted_avg_rating', 'review_count'],
        ascending=[True, True, False, False],
    )
    .groupby(['year', 'location_user'])
    .cumcount()
    .add(1)
)


In [None]:
# Rank-1 brewery of every (year, user location) pair.
top_ranked_beers_br_rb = weighted_avg_ratings_br_rb[weighted_avg_ratings_br_rb['rank'] == 1]


# Year x location grid of brewery names; several names in one cell are joined
# with ', '.
pivot_data_br_rb = top_ranked_beers_br_rb.pivot_table(
    values='brewery_name',
    index='year',
    columns='location_user',
    aggfunc=lambda x: ', '.join(x.dropna())
)


chunk_size = 4
# Fixed: size and slice the brewery pivot built above. The original reused
# `pivot_data_name_rb`, so beer names were plotted under the brewery title.
num_chunks = (pivot_data_br_rb.shape[1] + chunk_size - 1) // chunk_size

for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    chunk = pivot_data_br_rb.iloc[:, start:end]


    plt.figure(figsize=(10, 6))
    # Colour encodes only "cell is empty"; the brewery names are the annotations.
    sns.heatmap(
        chunk.isna().astype(int),
        annot=chunk, fmt='', cmap='coolwarm', linewidths=0.5, cbar=False
    )
    plt.title(f'best Breweries  by User Region and Year , RateBeer(Bloc {i+1})')
    plt.xlabel('Region')
    plt.ylabel('Year')
    plt.xticks(rotation=45, ha='right', fontsize=8)
    plt.yticks(fontsize=8)
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Per-beer totals: sum of ratings, number of reviews, and their ratio
# (the plain, unweighted average rating).
beer_stats_rb = (
    total_rb
    .groupby('beer_id')['rating']
    .agg(total_ratings='sum', review_count='count')
    .reset_index()
)
beer_stats_rb['average_rating'] = beer_stats_rb['total_ratings'].div(beer_stats_rb['review_count'])

# Correlation between how often a beer is reviewed and how well it is rated
correlation = (
    beer_stats_rb[['review_count', 'average_rating']]
    .corr()
    .loc['review_count', 'average_rating']
)
print(f"Correlation between Number of reviews and average rating(RB): {correlation}")

# Scatter plot of the same relationship
plt.figure(figsize=(10, 6))
sns.scatterplot(x='review_count', y='average_rating', data=beer_stats_rb, alpha=0.7)
plt.title('Correlation between Number of Reviews and Average Rating, RateBeer')
plt.xlabel('Number of Reviews')
plt.ylabel('Average Rating')
plt.grid(True)
plt.show()

In [None]:

# Number of reviews each beer style received per year and per user location.
review_counts_fl_rb = (
    total_rb
    .groupby(['style', 'year', 'location_user'])
    .size()
    .reset_index(name='review_count')
)

# Attach the count to every review and keep groups with at least 5 reviews.
reviews_with_counts_fl_rb = total_rb.merge(
    review_counts_fl_rb, on=['style', 'year', 'location_user']
)
filtered_reviews_fl_rb = reviews_with_counts_fl_rb.loc[
    reviews_with_counts_fl_rb['review_count'] >= 5
]

# One rating per (style, year, location); the weight (review_count) is a
# grouping key and therefore constant inside each group.
weighted_avg_ratings_fl_rb = (
    filtered_reviews_fl_rb
    .groupby(['style', 'year', 'review_count', 'location_user'])
    .apply(lambda grp: (grp['rating'] * grp['review_count']).sum() / grp['review_count'].sum())
    .reset_index(name='weighted_avg_rating')
)

# Rank styles inside each (year, location): best rating first, ties broken by
# the larger review count.
weighted_avg_ratings_fl_rb['rank'] = (
    weighted_avg_ratings_fl_rb
    .sort_values(
        by=['year', 'location_user', 'weighted_avg_rating', 'review_count'],
        ascending=[True, True, False, False],
    )
    .groupby(['year', 'location_user'])
    .cumcount()
    .add(1)
)


In [None]:
# Rank-1 style of every (year, user location) pair.
# Fixed: the original subscript `weighted_avg_ratings_fl[_rb'rank']` was a syntax error.
top_ranked_beers_fl_rb = weighted_avg_ratings_fl_rb[weighted_avg_ratings_fl_rb['rank'] == 1]

# Year x location grid of style names; several names in one cell are joined
# with ', '.
pivot_data_fl_rb = top_ranked_beers_fl_rb.pivot_table(
    values='style',
    index='year',
    columns='location_user',
    aggfunc=lambda x: ', '.join(x.dropna())
)


chunk_size = 4
# Fixed: size the loop from `pivot_data_fl_rb` (`pivot_data_fl` was never defined).
num_chunks = (pivot_data_fl_rb.shape[1] + chunk_size - 1) // chunk_size

for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    chunk = pivot_data_fl_rb.iloc[:, start:end]

    plt.figure(figsize=(10, 6))
    # Colour encodes only "cell is empty"; the style names are the annotations.
    sns.heatmap(
        chunk.isna().astype(int),
        annot=chunk, fmt='', cmap='coolwarm', linewidths=0.5, cbar=False
    )
    plt.title(f'Best Beer Style by Region and Year ,RateBeer(Bloc {i+1})')
    plt.xlabel('Region')
    plt.ylabel('Year')
    plt.xticks(rotation=45, ha='right', fontsize=8)
    plt.yticks(fontsize=8)
    plt.show()


In [None]:
# Total number of reviews per beer style (all years and locations together)
review_counts_rb = total_rb.groupby(['style']).size().reset_index(name='review_count')

# Attach the per-style review count to every review
reviews_with_counts_rb = pd.merge(total_rb, review_counts_rb, on=['style'])

# Keep only the styles that have at least 5 reviews
filtered_reviews_rb = reviews_with_counts_rb[reviews_with_counts_rb['review_count'] >= 5]

# Weighted average rating per style; the weight (review_count) is constant
# inside a style, so this matches the plain mean up to float rounding
weighted_avg_styles_rb = filtered_reviews_rb.groupby(['style']).apply(
    lambda x: (x['rating'] * x['review_count']).sum() / x['review_count'].sum()
).reset_index(name='weighted_avg_rating')

# Sort by weighted average rating, best style first
weighted_avg_styles_rb= weighted_avg_styles_rb.sort_values('weighted_avg_rating', ascending=False)

# Encode styles as integers in their order of appearance.
# NOTE(review): the frame was just sorted by rating, so the codes follow the
# rating order and the correlation below mostly reflects that ordering.
weighted_avg_styles_rb['style_code'] = pd.factorize(weighted_avg_styles_rb['style'])[0]

correlation = weighted_avg_styles_rb[['style_code', 'weighted_avg_rating']].corr().iloc[0, 1]

print(f"Correlation between  style (coded) e Weighted average rating, RateBeer: {correlation}")

plt.figure(figsize=(10, 6))
sns.scatterplot(data=weighted_avg_styles_rb, x='style_code', y='weighted_avg_rating', alpha=0.7)
plt.title(f'Correlation between  style (coded) e Weighted average rating, RateBeer\nCorrelation: {correlation:.2f}')
plt.xlabel('Style ')
plt.ylabel('weighted average rating')
# Fixed: label the ticks from `weighted_avg_styles_rb` (`weighted_avg_styles` was never defined).
plt.xticks(weighted_avg_styles_rb['style_code'], weighted_avg_styles_rb['style'], rotation=90)
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Number of reviews and mean rating per (year, style)
style_review_count_rb= total_rb.groupby(['year', 'style']).size().reset_index(name='review_count')

style_avg_ratings_rb = total_rb.groupby(['year', 'style'])['rating'].mean().reset_index(name='average_rating')

# Most reviewed style of every year, joined with that style's mean rating
most_reviewed_style_idx_rb = style_review_count_rb.groupby('year')['review_count'].idxmax()
most_reviewed_style_rb = style_review_count_rb.loc[most_reviewed_style_idx_rb]

style_analysis_rb = pd.merge(most_reviewed_style_rb, style_avg_ratings_rb, on=['year', 'style'], how='left')

# The 10 styles with the most reviews summed over all years
top_10_styles_rb = style_review_count_rb.groupby('style')['review_count'].sum().nlargest(10).index

# Filter the dataframe to include only the top 10 most popular styles.
# Fixed: filter with `top_10_styles_rb` (`top_10_styles` was never defined).
top_styles_data_rb = style_review_count_rb[style_review_count_rb['style'].isin(top_10_styles_rb)]

# Plot one line per style: yearly review count over time
for style in top_10_styles_rb:
    style_data_rb = top_styles_data_rb[top_styles_data_rb['style'] == style]
    plt.plot(style_data_rb['year'], style_data_rb['review_count'], label=style, marker='o')

# Add title, labels, and legend (legend is placed outside the axes on the right)
plt.title('Trend of the Top 10 Most Popular Beer Styles Over Time, RateBeer', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Reviews', fontsize=14)
plt.legend(title="Beer Styles", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

# Add layout to prevent cutting off titles
plt.tight_layout()

# Display the plot
plt.show()


In [None]:
# Group reviews by style, year, and user location, then count the number of reviews for each combination.
# Fixed: count from `total_rb`, the frame loaded in this notebook (`reviews_df_rb` was never defined).
review_counts_fl_rb = total_rb.groupby(['style', 'year', 'location_user']).size().reset_index(name='review_count')

# Merge the reviews with their review counts based on style, year, and user location
reviews_with_counts_fl_rb = pd.merge(total_rb, review_counts_fl_rb, on=['style', 'year', 'location_user'])

# Keep only the (style, year, location) groups that have at least 5 reviews
filtered_reviews_fl_rb = reviews_with_counts_fl_rb[reviews_with_counts_fl_rb['review_count'] >= 5]

# Calculate the weighted average rating for each beer style and user location.
# Years are pooled here, so each review is weighted by the size of its own
# (style, year, location) group.
weighted_avg_styles_location_rb = filtered_reviews_fl_rb.groupby(['style', 'location_user']).apply(
    lambda x: (x['rating'] * x['review_count']).sum() / x['review_count'].sum()
).reset_index(name='weighted_avg_rating')

# Sort by weighted average rating
weighted_avg_styles_location_rb = weighted_avg_styles_location_rb.sort_values('weighted_avg_rating', ascending=False)

# Encode beer styles as numbers to calculate correlation.
# NOTE(review): the integer codes follow order of appearance in the sorted
# frame, so the coefficients depend on that arbitrary ordering.
weighted_avg_styles_location_rb['style_code'] = pd.factorize(weighted_avg_styles_location_rb['style'])[0]

# Encode user location as numbers for correlation
weighted_avg_styles_location_rb['location_user_code'] = pd.factorize(weighted_avg_styles_location_rb['location_user'])[0]

# Calculate the correlation between "style_code", "location_user_code", and "weighted_avg_rating"
correlation_style_location_rb = weighted_avg_styles_location_rb[['style_code', 'location_user_code', 'weighted_avg_rating']].corr()

# Display the correlation matrix.
# Fixed: print `correlation_style_location_rb` (the un-suffixed name was never defined).
print("Correlation between style, location, and weighted average rating, RateBeer:")
print(correlation_style_location_rb)


In [None]:
# Group data by year and calculate the total number of reviews per year.
# Fixed: count from `total_rb` (`total_df` was never defined in this notebook).
review_counts_y_rb= total_rb.groupby(['year']).size().reset_index(name='review_count')

# Merge the reviews data with the review count data based on the year
reviews_with_counts_y_rb = pd.merge(total_rb, review_counts_y_rb, on=['year'])

# Filter reviews where the review count per year is greater than or equal to 5
filtered_reviews_y_rb = reviews_with_counts_y_rb[reviews_with_counts_y_rb['review_count'] >= 5]

# Calculate the weighted average rating per year; the weight (review_count) is
# constant inside a year, so this matches the plain mean up to float rounding
weighted_avg_year_rb = filtered_reviews_y_rb.groupby('year').apply(
    lambda x: (x['rating'] * x['review_count']).sum() / x['review_count'].sum()
).reset_index(name='weighted_avg_rating')

# Calculate the correlation between the year and the weighted average rating
correlation_year_score_rb = weighted_avg_year_rb[['year', 'weighted_avg_rating']].corr()

# Display the correlation matrix.
# Fixed: print `correlation_year_score_rb` (the un-suffixed name was never defined).
print("Correlation between year and weighted average rating, RateBeer:")
print(correlation_year_score_rb)

# Display a scatter plot to better understand the relationship
plt.figure(figsize=(10, 6))
sns.scatterplot(data=weighted_avg_year_rb, x='year', y='weighted_avg_rating', alpha=0.7)
plt.title('Correlation between Year and Weighted Average Rating, RateBeer')
plt.xlabel('Year')
plt.ylabel('Weighted Average Rating')
plt.grid(True)
plt.show()


In [None]:
# Group data by year and location, and count the number of reviews for each combination
review_counts_y_rb = total_rb.groupby(['year', 'location_user']).size().reset_index(name='review_count')

# Merge the reviews data with the review count data based on the year and location_user
reviews_with_counts_y_rb = pd.merge(total_rb, review_counts_y_rb, on=['year', 'location_user'])

# Filter reviews where the review count per year and location is greater than or equal to 5
filtered_reviews_y_rb = reviews_with_counts_y_rb[reviews_with_counts_y_rb['review_count'] >= 5]

# Calculate the weighted average rating per year and location_user
weighted_avg_year_rb = filtered_reviews_y_rb.groupby(['year', 'location_user']).apply(
    lambda x: (x['rating'] * x['review_count']).sum() / x['review_count'].sum()
).reset_index(name='weighted_avg_rating')

# Encode location_user as a numerical code for correlation calculation
# (codes follow order of appearance, so they carry no ordinal meaning)
weighted_avg_year_rb['location_user_code'] = pd.factorize(weighted_avg_year_rb['location_user'])[0]

# Calculate the correlation between year, location_user_code, and weighted average rating
correlation_year_score_rb = weighted_avg_year_rb[['year', 'location_user_code', 'weighted_avg_rating']].corr()

# Display the correlation matrix
print("Correlation between year, Location of the user and weighted average rating, RateBeer:")
print(correlation_year_score_rb)

# Display a scatter plot to better understand the relationship.
# Fixed: plot `weighted_avg_year_rb` (`weighted_avg_year` was never defined).
plt.figure(figsize=(10, 6))
sns.scatterplot(data=weighted_avg_year_rb, x='year', y='weighted_avg_rating', alpha=0.7)
plt.title('Correlation between Year, Location of the user and Weighted Average Rating, RateBeer')
plt.xlabel('Year')
plt.ylabel('Weighted Average Rating')
plt.grid(True)
plt.show()


In [None]:
# Group data by year, location and style, and count the number of reviews for each combination
review_counts_y_rb = total_rb.groupby(['year', 'location_user','style']).size().reset_index(name='review_count')

# Merge the reviews data with the review count data based on year, location_user and style
reviews_with_counts_y_rb= pd.merge(total_rb, review_counts_y_rb, on=['year', 'location_user','style'])

# Filter reviews where the review count per (year, location, style) is greater than or equal to 5
filtered_reviews_y_rb= reviews_with_counts_y_rb[reviews_with_counts_y_rb['review_count'] >= 5]

# Calculate the weighted average rating per year, location_user and style.
# Fixed: 'style' is now a grouping key. The original grouped only by
# ['year', 'location_user'], so the result had no 'style' column and the
# factorize call below raised KeyError.
weighted_avg_year_rb = filtered_reviews_y_rb.groupby(['year', 'location_user', 'style']).apply(
    lambda x: (x['rating'] * x['review_count']).sum() / x['review_count'].sum()
).reset_index(name='weighted_avg_rating')

# Encode location_user and style as numerical codes for correlation calculation
# (codes follow order of appearance, so they carry no ordinal meaning)
weighted_avg_year_rb['location_user_code'] = pd.factorize(weighted_avg_year_rb['location_user'])[0]
weighted_avg_year_rb['style_code'] = pd.factorize(weighted_avg_year_rb['style'])[0]

# Calculate the correlation between year, location_user_code, style_code and weighted average rating
correlation_year_score_rb= weighted_avg_year_rb[['year', 'location_user_code', 'weighted_avg_rating','style_code']].corr()

# Display the correlation matrix
print("Correlation between year,location of the user, style and weighted average rating, RateBeer:")
print(correlation_year_score_rb)

# Display a scatter plot to better understand the relationship
plt.figure(figsize=(10, 6))
sns.scatterplot(data=weighted_avg_year_rb, x='year', y='weighted_avg_rating', alpha=0.7)
plt.title('Correlation between year,location of the user, style and weighted average rating, RateBeer')
plt.xlabel('Year')
plt.ylabel('Weighted Average Rating')
plt.grid(True)
plt.show()


In [None]:
# Count the number of reviews for each distinct ABV value
review_counts_abv_rb= total_rb.groupby(['abv']).size().reset_index(name='review_count')

# Merge the reviews data with the review count data based on abv
reviews_with_counts_abv_rb= pd.merge(total_rb, review_counts_abv_rb, on=['abv'])

# Keep only the ABV values that have at least 5 reviews
filtered_reviews_abv_rb = reviews_with_counts_abv_rb[reviews_with_counts_abv_rb['review_count'] >= 5]

# Calculate the weighted average rating per ABV value; the weight
# (review_count) is constant inside a group, so this matches the plain mean
# up to float rounding
weighted_avg_abv_rb= filtered_reviews_abv_rb.groupby(['abv']).apply(
    lambda x: (x['rating'] * x['review_count']).sum() / x['review_count'].sum()
).reset_index(name='weighted_avg_rating')


# Calculate the correlation between abv and weighted average rating.
# Fixed: use `weighted_avg_abv_rb`. The original indexed `weighted_avg_year_rb`,
# which has no 'abv' column, and stored the result under the year-correlation name.
correlation_abv_rb = weighted_avg_abv_rb[['abv', 'weighted_avg_rating']].corr()

# Display the correlation matrix
print("Correlation between abv and weighted average rating,RateBeer:")
print(correlation_abv_rb)

# Display a scatter plot to better understand the relationship.
# Fixed: plot ABV on the x axis with matching title and label (the original
# re-plotted the per-year frame under a title copied from the previous cell).
plt.figure(figsize=(10, 6))
sns.scatterplot(data=weighted_avg_abv_rb, x='abv', y='weighted_avg_rating', alpha=0.7)
plt.title('Correlation between ABV and Weighted Average Rating, RateBeer')
plt.xlabel('ABV')
plt.ylabel('Weighted Average Rating')
plt.grid(True)
plt.show()