# ADA Project : Milestone 2

## Preparation

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sn
from utils import *

### Conversion : txt to csv

In [None]:
# Heavy one-off step (needs a bit less than 32 GB of RAM), so it is switched
# off by default; flip the flag to regenerate the CSV files from the raw dumps.
do_conversion = False
if do_conversion:
    for raw_ratings_path in (
        "./data/BeerAdvocate/ratings.txt",
        "./data/RateBeer/ratings.txt",
    ):
        ratings_text_to_csv(raw_ratings_path)

### Load data

In [2]:
# BeerAdvocate (BA): unpack the four tables returned by load_data("BA")
# (beers, breweries, users, ratings).
# NOTE(review): load_data is not defined in this notebook — presumably it comes
# from the `from utils import *` star import; confirm.
beers_BA, breweries_BA, users_BA, ratings_BA = load_data("BA")

In [3]:
# RateBeer (RB): unpack the four tables returned by load_data("RB")
# (beers, breweries, users, ratings).
# NOTE(review): load_data is not defined in this notebook — presumably it comes
# from the `from utils import *` star import; confirm.
beers_RB, breweries_RB, users_RB, ratings_RB = load_data("RB")

In [None]:
# Matched Dataset (MD): unlike the BA/RB calls, load_data("MD") is unpacked into
# five tables (beers, breweries, users, approximately-matched users, ratings).
# NOTE(review): load_data is not defined in this notebook — presumably it comes
# from the `from utils import *` star import; confirm.
beers_MD, breweries_MD, users_MD, users_approx_MD, ratings_MD = load_data("MD")

### Populate and merge data from BA and RB

In [4]:
# Heavy step (needs more than 20 GB of RAM): disabled by default. Enable it once
# to (re)build the merged ratings CSV and its subsample.
do_populate_merge = False
if do_populate_merge:

    # Attach user, brewery and beer information to every rating, per website.
    ratings_populated_BA = populate_ratings(
        ratings_BA, users_BA, breweries_BA, beers_BA, "BA"
    )
    ratings_populated_RB = populate_ratings(
        ratings_RB, users_RB, breweries_RB, beers_RB, "RB"
    )

    # Combine both populated tables; the destination path is handed to the helper.
    merged_ratings_path = "./data/ratings_BA_RB.csv"
    ratings_mixed = merge_populated_ratings(
        ratings_populated_BA,
        ratings_populated_RB,
        merged_ratings_path,
    )

    # Keep a seeded 10% subsample as well (easier to handle on laptops).
    merged_ratings_sample_path = "./data/ratings_BA_RB_sample.csv"
    save_subsample(
        dataframe=ratings_mixed,
        save_path=merged_ratings_sample_path,
        frac=0.1,
        random_state=0,
    )

### Load populated merged ratings

In [2]:
# Only the subsample is loaded here (lighter on memory). To work on the full
# merged ratings instead, read the complete CSV the same way — the populate/merge
# step writes it to ./data/ratings_BA_RB.csv.
ratings_mixed_sample = pd.read_csv("./data/ratings_BA_RB_sample.csv")

## Analysis

### Seasonal analysis of the number of reviews per type of beer

In [None]:
# frac=1 keeps every row: this is a seeded shuffle of the loaded sample.
df_samp_of_samp = ratings_mixed_sample.sample(frac=1, random_state=42)
style_column = df_samp_of_samp['style']
counts = style_column.value_counts()

# Quick overview: row count, available columns, number of distinct styles,
# then the number of reviews per style.
for summary in (
    len(df_samp_of_samp),
    df_samp_of_samp.columns,
    len(style_column.unique()),
    counts,
):
    print(summary)

In [None]:
# Count reviews per (style, part of the year) for the five most reviewed styles.

# Keep only the beer style and the review date; 'date' is read as Unix time in
# seconds (unit='s') and reduced to the day of the year (1..366).
df_count_type_season = (
    df_samp_of_samp[['style', 'date']]
    .dropna()
    .assign(day_of_year=lambda df: pd.to_datetime(df['date'], unit='s').dt.dayofyear)
)

# Names of the five styles with the most reviews.
most_represented = df_count_type_season['style'].value_counts().index[:5].to_numpy()

num_bins = 10
# Equal-width bins over the year. The upper edge is 366 (not 365) so that
# 31 December of a leap year falls into the last bin instead of becoming NaN.
bins = np.linspace(0, 366, num_bins + 1)

# Fixed bins do not depend on the style, so a single pd.cut is enough. The result
# is kept categorical so the ranges sort chronologically rather than as strings.
df_count_type_season['range'] = pd.cut(df_count_type_season['day_of_year'], bins=bins)

groups = (
    df_count_type_season[df_count_type_season['style'].isin(most_represented)]
    .groupby(['style', 'range'], observed=True)
    .size()
)
groups.head(50)


In [None]:
import os

import matplotlib.patches as mpatches

# Seasonal profile (normalised review counts over the year) of the five most
# reviewed beer styles. Expects df_count_type_season with 'style' and
# 'day_of_year' columns, as built in the previous cell.
most_represented = df_count_type_season['style'].value_counts().index[:5].to_numpy()
df_five_bests = df_count_type_season.loc[
    df_count_type_season['style'].isin(most_represented), ['style', 'day_of_year']
]

num_bins = 12
# Right edges of 12 equal-width bins over a 365-day year (roughly one per month);
# they are also used as the x position of each point.
bins = 365 / num_bins * np.arange(1, num_bins + 1)

fig, ax = plt.subplots(figsize=(10, 5))
for style in most_represented:
    days = df_five_bests.loc[df_five_bests['style'] == style, 'day_of_year'].to_numpy()
    # First bin whose right edge is >= the day; anything past the last edge
    # (day 366) is clamped into the final bin.
    bin_idx = np.minimum(np.searchsorted(bins, days, side='left'), num_bins - 1)
    counts = np.bincount(bin_idx, minlength=num_bins)
    # Normalise by the busiest bin so styles with different volumes are comparable.
    ax.plot(bins, counts / counts.max(), label=style)

# Hand-picked regions to highlight: (left, bottom, width, height) in data coordinates.
areas_of_interest = [
    (260, 0.72, 25, 0.1),
    (125, 0.85, 100, .21),
]
for i, (left, bottom, width, height) in enumerate(areas_of_interest):
    is_last = i == len(areas_of_interest) - 1
    ax.add_patch(
        mpatches.Rectangle(
            (left, bottom), width, height,
            fill=True,
            color="grey",
            linewidth=2,
            alpha=0.5,
            # Only one patch carries a label so the legend shows a single entry.
            label='Areas of interest' if is_last else '_nolegend_',
        )
    )

ax.legend()
ax.set_xlabel('days of year')
ax.set_ylabel('# of reviews (normalized) ')
ax.set_title('Seasonal analysis per type of beer')
ax.set_ylim([.3, 1.02])

# Make sure the output folder exists before saving.
os.makedirs('./figures', exist_ok=True)
fig.savefig('./figures/season_analysis_per_type.png')
plt.show()

### Number of reviews per day using a rolling average

In [None]:


# One year (2015) of BeerAdvocate ratings, used for the reviews-per-day plots.

# NOTE: BA_NR_RW_DAY is an alias of ratings_BA (no copy is made), so the
# conversion below also rewrites ratings_BA['date'] in place. Later cells that
# filter ratings_BA by date may rely on this.
BA_NR_RW_DAY = ratings_BA
# 'date' is assumed to hold Unix time in seconds.
BA_NR_RW_DAY['date'] = pd.to_datetime(BA_NR_RW_DAY['date'], unit='s')

# Half-open window [start_date, end_date): exactly the calendar year 2015, with
# nothing from 2016-01-01 leaking in.
start_date = "2015-01-1"
end_date = "2016-01-1"
in_window = (BA_NR_RW_DAY['date'] >= start_date) & (BA_NR_RW_DAY['date'] < end_date)
BA_NR_RW_DAY_WW = BA_NR_RW_DAY[in_window]


In [None]:
def plot_rolling(df, window=7):
    """Plot a centred rolling mean of the number of reviews per calendar day.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per review, with a datetime64 ``'date'`` column.
    window : int, default 7
        Size of the rolling window in days. The default of 7 averages out the
        day-of-week pattern.

    Notes
    -----
    Reviews are counted on a regular daily grid, so a day without any review
    counts as 0 and the window always spans ``window`` calendar days. Because
    the window is centred and needs ``window`` observations, the first and last
    few days of the curve are NaN and are not drawn.
    """
    # Daily counts on a gap-free calendar (missing days become 0).
    daily_reviews = df.resample('D', on='date').size()
    rolling_average = daily_reviews.rolling(window=window, center=True).mean()

    fig, ax = plt.subplots(figsize=(14, 7))
    rolling_average.plot(ax=ax, title='Number of Reviews Per Day Over One Year')
    ax.set_xlabel('Date')
    ax.set_ylabel('Average Number of Reviews')
    ax.grid(True)
    plt.show()


plot_rolling(BA_NR_RW_DAY_WW)

In [None]:
# Reviews per day for beers brewed in Germany (BeerAdvocate, year 2015).
german_brewery_ids = breweries_BA.loc[breweries_BA['location'] == 'Germany', 'id']

# .assign returns a new frame, so the date conversion never writes into a slice
# of ratings_BA (no SettingWithCopyWarning). pd.to_datetime leaves a column that
# is already datetime untouched, so this works whether or not ratings_BA['date']
# has been converted by an earlier cell.
german_beers = (
    ratings_BA[ratings_BA["brewery_id"].isin(german_brewery_ids)]
    .assign(date=lambda df: pd.to_datetime(df['date'], unit='s'))
)

# Half-open window [start_date, end_date): the calendar year 2015.
start_date = "2015-01-1"
end_date = "2016-01-1"

german_beers_WW = german_beers[(german_beers['date'] >= start_date) & (german_beers['date'] < end_date)]
plot_rolling(german_beers_WW)

In [None]:
# Reviews per day written by users located in Germany (BeerAdvocate, year 2015).
german_user_ids = users_BA.loc[users_BA['location'] == 'Germany', 'user_id']

# Convert the date explicitly instead of counting on an earlier cell having
# already converted ratings_BA['date'] in place: pd.to_datetime accepts both the
# raw Unix seconds and an already-converted datetime column.
german_users = (
    ratings_BA[ratings_BA["user_id"].isin(german_user_ids)]
    .assign(date=lambda df: pd.to_datetime(df['date'], unit='s'))
)

# Half-open window [start_date, end_date): the calendar year 2015.
start_date = "2015-01-1"
end_date = "2016-01-1"

german_users_WW = german_users[(german_users['date'] >= start_date) & (german_users['date'] < end_date)]
plot_rolling(german_users_WW, 7)

### Alcohol by volume evolution in a mean year by country of user

In [None]:
# Build an (user_location, abv, time) table where 'time' is the review year.

# Locations with this many rows or fewer (counted on the loaded sample, after
# dropping incomplete rows) are discarded.
MIN_REVIEWS_PER_LOCATION = 50000

alcohol_reviews = (
    ratings_mixed_sample[['date', 'user_location', 'abv']]
    .dropna()
    # 'date' is read as Unix time in seconds; the year is extracted in one
    # vectorised step rather than row by row.
    .assign(time=lambda df: pd.to_datetime(df['date'], unit='s').dt.year)
    .drop(columns="date")
)

reviews_per_location = alcohol_reviews.groupby('user_location')['user_location'].transform('size')
alcohol_reviews = alcohol_reviews[reviews_per_location > MIN_REVIEWS_PER_LOCATION]
alcohol_reviews

In [None]:
# Variance of ABV per user location and per value of 'time', one line per location.
# NOTE(review): the statistic computed here is the variance; the axis label and
# title now say so. If the mean was intended, replace .var() with .mean() and
# adjust the labels accordingly.
abv_variance = (
    alcohol_reviews.groupby(["user_location", "time"])["abv"].var().reset_index()
)

fig, ax = plt.subplots(figsize=(10, 6))
for location, location_data in abv_variance.groupby("user_location"):
    ax.plot(location_data["time"], location_data["abv"], label=location)

ax.set_xlabel("Time")
ax.set_ylabel("ABV variance")
ax.set_ylim([0, 15])
ax.legend(loc="upper right")
ax.set_title("ABV variance vs Time for Different User Locations")
ax.grid(True)
plt.show()

In [None]:
# Same statistic (variance of ABV per location and per 'time'), drawn as one
# panel per location so each curve can be read on its own. Every panel is titled
# with its location and all panels share the y axis.
abv_variance = (
    alcohol_reviews.groupby(["user_location", "time"])["abv"].var().reset_index()
)
per_location = list(abv_variance.groupby("user_location"))

if per_location:
    fig, axes = plt.subplots(
        1, len(per_location),
        sharey=True,
        squeeze=False,  # keep a 2-D array of axes even with a single location
        figsize=(4 * len(per_location), 4),
    )
    for ax, (location, location_data) in zip(axes[0], per_location):
        location_data.plot(
            x="time", y="abv", kind="line", ylim=[0, 15],
            ax=ax, legend=False, title=location,
        )
        ax.set_ylabel("ABV variance")
    plt.show()

In [None]:
# Variance of ABV per value of 'time', all user locations pooled together.
alcohol_reviews_global = alcohol_reviews.drop(columns='user_location')
abv_variance_global = alcohol_reviews_global.groupby("time")["abv"].var().reset_index()

ax = abv_variance_global.plot(
    x="time", y='abv', kind='line', ylim=[0, 15],
    legend=False, title="ABV variance vs Time (all locations)",
)
ax.set_ylabel("ABV variance")
plt.show()

In [None]:
# Median ABV per value of 'time', all user locations pooled together.
# Only the numeric 'abv' column is aggregated: alcohol_reviews also carries the
# text column 'user_location', and taking the median of the whole frame raises
# a TypeError on recent pandas versions.
abv_median = alcohol_reviews.groupby("time")["abv"].median().reset_index()

ax = abv_median.plot(
    x="time", y='abv', kind='line', ylim=[0, 15],
    legend=False, title="Median ABV vs Time (all locations)",
)
ax.set_ylabel("Median ABV")
plt.show()

In [None]:
# Standard deviation of ABV per value of 'time', all user locations pooled together.
# Only the numeric 'abv' column is aggregated: alcohol_reviews also carries the
# text column 'user_location', and taking the std of the whole frame raises
# a TypeError on recent pandas versions.
abv_std = alcohol_reviews.groupby("time")["abv"].std().reset_index()

ax = abv_std.plot(
    x="time", y='abv', kind='line', ylim=[0, 15],
    legend=False, title="ABV standard deviation vs Time (all locations)",
)
ax.set_ylabel("ABV standard deviation")
plt.show()

### Test of Ben's idea 2.2

In [None]:
# The notebook's top import binds seaborn as `sn`; this cell uses the `sns` alias.
import seaborn as sns

# Does the number of beers a brewery produces relate to its average rating?
# Uses the BeerAdvocate tables ratings_BA and breweries_BA.

# Breweries listing more beers than this are left out of the plot.
MAX_BEERS_PER_BREWERY = 200

# 1. Average rating per brewery.
avg_ratings = (
    ratings_BA.groupby('brewery_id')['rating']
    .mean()
    .rename('average_rating')
    .reset_index()
)

# 2. Work on a renamed copy: breweries_BA itself is left untouched, so earlier
#    cells that read breweries_BA['id'] keep working when they are re-run.
breweries_renamed = breweries_BA.rename(columns={'id': 'brewery_id', 'nbr_beers': 'beer_count'})
filtered_breweries = breweries_renamed[breweries_renamed['beer_count'] <= MAX_BEERS_PER_BREWERY]

# 3. One row per brewery present in both tables (inner join).
analysis_df = pd.merge(avg_ratings, filtered_breweries[['brewery_id', 'beer_count']], on='brewery_id')

# Mean of the brewery averages for each beer count, with a 95% confidence interval.
fig, ax = plt.subplots()
sns.lineplot(data=analysis_df, x='beer_count', y='average_rating', errorbar=('ci', 95), ax=ax)

ax.set_xlabel('Number of Beers Produced (up to 200)')
ax.set_ylabel('Average Rating')
ax.set_title('Relationship between Number of Beers and Average Ratings')
plt.show()
