#### Analysis of the average ratings given by expert and casual users per year

In [None]:
# Per-year results: mean rating of each group and the t statistic comparing them
avg_ratings_expert = []
avg_ratings_normal = []
ttest_expert_normal = []

# Only years after 2002 are analysed: earlier years have too little data and the
# expert/casual distinction is not reliable there
interest_years = sorted(year for year in df_ratings_stat.year.unique() if year > 2002)

for year in interest_years:
    df_expert_year = df_ratings_stat_expert.query('year == @year')
    df_ratings_stat_year = df_ratings_stat.query('year == @year')
    expert_of_the_year = df_expert_year[df_expert_year.is_expert].user_id

    # Split this year's ratings once: rows written by a user flagged as expert
    # for that year versus all the other rows
    written_by_expert = df_ratings_stat_year['user_id'].isin(expert_of_the_year)
    ratings_expert_year = df_ratings_stat_year.loc[written_by_expert, 'rating']
    ratings_normal_year = df_ratings_stat_year.loc[~written_by_expert, 'rating']

    avg_expert = ratings_expert_year.mean()
    avg_normal = ratings_normal_year.mean()

    # Two-sample t-test between the two groups, using SciPy's default settings
    t_stat, p_val = stats.ttest_ind(ratings_expert_year, ratings_normal_year)
    ttest_expert_normal.append(t_stat)
    print(f'Ttest: Do the ratings of casual and expert are the same in {year} ? p-value = {p_val}, stat = {t_stat}')

    avg_ratings_expert.append(avg_expert)
    avg_ratings_normal.append(avg_normal)

In [None]:
# Grouped bar chart: one casual/expert pair of bars per analysed year
bar_width = 0.35
fig, ax = plt.subplots()

# Casual bars are centred on the integer positions, expert bars one bar width further right
bar_positions1 = np.arange(len(avg_ratings_expert))
bar_positions2 = bar_positions1 + bar_width

shared_bar_style = dict(width=bar_width, alpha=0.7)
ax.bar(bar_positions1, avg_ratings_normal, label='Casual', color='blue', **shared_bar_style)
ax.bar(bar_positions2, avg_ratings_expert, label='Expert', color='red', **shared_bar_style)

ax.set(
    xlabel='Years',
    ylabel='Average ratings',
    title='Average of ratings between expert and casual per year ',
)
# Put each year's tick halfway between the centres of its two bars
ax.set_xticks(bar_positions1 + bar_width / 2)
ax.set_xticklabels(interest_years, rotation=45, ha='right')

# Leave a margin of one rating point below the lowest and above the highest average
y_min = min(min(avg_ratings_normal), min(avg_ratings_expert)) - 1
y_max = max(max(avg_ratings_normal), max(avg_ratings_expert)) + 1
ax.set_ylim(y_min, y_max)
ax.legend()

plt.show()

The plot shows that users considered as experts tend to be more severe in their overall rating of a beer. We clearly observe that, for every year, the average rating given by experts is below the average rating given by casual users.

#### Try to find a beer which has been rated by experts and casual users in strictly different years, in order to identify whether experts tend to influence the ratings or not

Step 1: Find all beers which have been rated by both casual and expert users

In [None]:
# Ids of every user flagged as expert in df_ratings_stat_expert
expert_user_id = df_ratings_stat_expert.loc[df_ratings_stat_expert.is_expert, 'user_id']

# Split the ratings in two: rows written by one of those users, and all the others
written_by_expert = df_ratings_stat['user_id'].isin(expert_user_id)
df_expert_ratings = df_ratings_stat[written_by_expert]
df_casual_ratings = df_ratings_stat[~written_by_expert]

# Distinct beers rated at least once by each group
beer_rated_by_expert = df_expert_ratings['beer_id'].unique()
beer_rated_by_casual = df_casual_ratings['beer_id'].unique()

print(f'Total of different beer rated on the both websites : {len(df_ratings_stat.beer_id.unique())}')
print(f'expert have rated {len(beer_rated_by_expert)} different beers')
print(f'casual user have rated {len(beer_rated_by_casual)} different beers')

# Beers that appear in both groups
beer_ids_rated_by_both = list(set(beer_rated_by_casual) & set(beer_rated_by_expert))
print(f'intersection, number of beers which have been rated by casual and expert : {len(beer_ids_rated_by_both)}')

Step 2: Find all beers which have never been rated by an expert and a casual user in the same year

In [None]:
# Keep only the ratings of beers that have been rated by both groups
beer_rated_by_expert_filtered = df_expert_ratings[df_expert_ratings['beer_id'].isin(beer_ids_rated_by_both)]
beer_rated_by_casual_filtered = df_casual_ratings[df_casual_ratings['beer_id'].isin(beer_ids_rated_by_both)]

# Columns needed for the rest of the task
useful_columns = ['beer_id', 'rating', 'year']

# Select the useful columns and tag each row with its group.
# .assign builds a new frame, so nothing is written into a slice of the
# filtered frames (the previous `.loc[:, 'is_expert'] = ...` assignment
# triggered pandas' SettingWithCopyWarning).
beer_rated_by_expert_compact = beer_rated_by_expert_filtered[useful_columns].assign(is_expert=True)
beer_rated_by_casual_compact = beer_rated_by_casual_filtered[useful_columns].assign(is_expert=False)

# Stack the ratings of both groups and keep only the years after 2002
beer_rated_by_expert_casual = (
    pd.concat([beer_rated_by_expert_compact, beer_rated_by_casual_compact])
    .query('year > 2002')
)

In [None]:
# Search for all beers which have been rated by casual and expert in a same year.
# Count the ratings per (beer_id, year, is_expert): each (beer_id, year) pair gets
# one row per group that rated the beer that year.
grouped_df = beer_rated_by_expert_casual.groupby(['beer_id', 'year', 'is_expert']).size().reset_index(name='count')

# A (beer_id, year) pair that shows up on more than one row was rated by BOTH
# groups that year. The previous filter (`count > 1` combined with an always-true
# `is_expert` test) instead selected beers rated several times by a single group.
rated_by_both_same_year = grouped_df.duplicated(subset=['beer_id', 'year'], keep=False)
filtered_df = grouped_df[rated_by_both_same_year]
beer_ids_to_remove = filtered_df['beer_id'].unique().tolist()

# Number of beers which have been rated by both an expert and a casual in at least one same year
print(len(beer_ids_to_remove))

# Remove these beers from the combined expert/casual ratings built in the previous step
# (the frame the grouping above was computed from)
beer_rated_by_expert_casual_different_year = beer_rated_by_expert_casual[
    ~beer_rated_by_expert_casual['beer_id'].isin(beer_ids_to_remove)
]
beer_rated_by_expert_casual_different_year

In [None]:
# Find the beer that has been rated in the largest number of different years,
# by either a casual or an expert

# Number of distinct rating years per beer
beer_years_count = beer_rated_by_expert_casual_different_year.groupby('beer_id')['year'].nunique()

# Beer id with the highest count of distinct years
most_rated_beer_id = beer_years_count.idxmax()

# Show that beer's ratings to judge whether it is a relevant choice
is_most_rated_beer = beer_rated_by_expert_casual_different_year['beer_id'] == most_rated_beer_id
beer_rated_by_expert_casual_different_year[is_most_rated_beer]

In [None]:
# Read the beer's name from the first rating row of that beer.
# NOTE(review): this takes the FIRST column of df_ratings_stat, presumably the
# beer name — confirm against the frame's schema.
# `.iloc[0, 0]` is a purely positional lookup; the previous `.iloc[0][0]` indexed
# the row Series with an integer key, which pandas has deprecated.
most_rated_beer_name = df_ratings_stat[df_ratings_stat['beer_id'] == most_rated_beer_id].iloc[0, 0]

print(f'Beer id with the most ratings over the years by expert and casual where non of them rated it the same year: {most_rated_beer_id}')
print(f'Beer name : {most_rated_beer_name}')

In [None]:
# Ratings of the selected beer only
ratings_of_selected_beer = beer_rated_by_expert_casual_different_year[
    beer_rated_by_expert_casual_different_year['beer_id'] == most_rated_beer_id
]

# Mean rating per (year, group), with the group keys turned back into regular columns
df_most_rated_beer = (
    ratings_of_selected_beer
    .groupby(['year', 'is_expert'])['rating']
    .mean()
    .reset_index()
)

In [None]:
# Plot version 1: one line per group (expert / casual) through the yearly averages
sns.set(style="whitegrid")
ax_line = sns.lineplot(data=df_most_rated_beer, x='year', y='rating', hue='is_expert', marker='o')
ax_line.set_xlabel('Year')
ax_line.set_ylabel('Average rating')
ax_line.set_title(f'Average rating of the beer {most_rated_beer_name} by casual and expert at different year')
plt.show()

In [None]:
# Plot version 2: same yearly averages as isolated points, with colour and marker
# shape both encoding the group (expert / casual)
sns.set(style="whitegrid")
ax_scatter = sns.scatterplot(data=df_most_rated_beer, x='year', y='rating', hue='is_expert', style='is_expert', s=100)
ax_scatter.set_xlabel('Year')
ax_scatter.set_ylabel('Average rating')
ax_scatter.set_title(f'Average rating of the beer {most_rated_beer_name} by casual and expert at different year')
plt.show()

The graph suggests that the ratings given by experts in 2012 could have influenced the ratings given by casual users in 2013, and that the expert ratings of 2014 could have influenced the casual ratings of 2016 and 2017. However, it is difficult to establish that experts have an impact on the ratings of casual users: the plot also shows that in 2011 the casual users rated the beer much higher than the experts had rated it in 2010.