In [1]:
# python
import csv
import pickle
from pathlib import Path
from random import random,sample,shuffle
from dataclasses import dataclass

from itertools import combinations
import random

# numpy stack
import pandas as pd
import numpy as np

# custom
from utils.helper_functions import *

# Load Data

## Load beer reviews

In [31]:
# Read only the columns used in this notebook from the aggregated reviews file.
df = pd.read_csv(
    "aggregated_reviews_small.csv",
    usecols=['brewery_id', 'brewery_name',
             'beer_id', 'beer_name', 'beer_abv',
             'country', 'style', 'score',
             'rating_mean', 'rating_std'],
)

# handle missing scores
# Scores containing the text "Needs more ratings" are treated as missing.
# The match is done on the string form of each value so that cells which are
# already NaN (or a column that pandas parsed as numeric) do not raise
# TypeError, which a plain `"..." in x` test does for non-string values.
needs_more_ratings = df["score"].astype(str).str.contains("Needs more ratings", regex=False)
df["score"] = df["score"].mask(needs_more_ratings, np.nan).astype(float)

# Missing scores are replaced by the mean of the known scores, truncated to int.
score_mean: int = int(df["score"].mean())
df["score"] = df["score"].fillna(score_mean).astype(int)

# round abv
# round_off_rating comes from utils.helper_functions; its exact rounding rule
# is not visible here (the displayed values look like 0.5 steps -- TODO confirm).
df["beer_abv"] = df["beer_abv"].apply(round_off_rating)

# rename style column (returns a new frame instead of mutating in place)
df = df.rename(columns={'style': 'beer_style'})

df.head()

Unnamed: 0,brewery_name,brewery_id,beer_id,beer_name,beer_abv,country,beer_style,score,rating_mean,rating_std
0,Harpoon Brewery & Beer Hall,10097,313,Harpoon Octoberfest Beer,5.5,"Massachusetts, United States",Märzen,83,3.689777,0.389692
1,Harpoon Brewery & Beer Hall,10097,314,Harpoon Munich Type Dark Beer,5.5,"Massachusetts, United States",Munich Dunkel,85,3.793952,0.365884
2,Harpoon Brewery & Beer Hall,10097,318,UFO Hefeweizen,5.0,"Massachusetts, United States",American Pale Wheat Beer,78,3.411102,0.473329
3,Dogfish Head Craft Brewery,10099,1079,Shelter Pale Ale,5.0,"Delaware, United States",American Pale Ale,77,3.386411,0.452167
4,Dogfish Head Craft Brewery,10099,1161,Indian Brown Ale,7.0,"Delaware, United States",American Brown Ale,91,4.095819,0.387483


## Load similarity matrix

In [3]:
# Location of the pickled similarity matrix
sim_matrix_path: Path = Path('pkl_files', "sim_matrix.pkl")

# Read the file's bytes and deserialize them in one step
sim_matrix = pickle.loads(sim_matrix_path.read_bytes())

# Show the top-left 5x5 corner of the matrix as a sanity check
print(sim_matrix[:5, :5])

tensor([[1.0000, 0.2134, 0.0819, 0.1599, 0.1010],
        [0.2134, 1.0000, 0.2251, 0.3023, 0.2801],
        [0.0819, 0.2251, 1.0000, 0.7587, 0.5862],
        [0.1599, 0.3023, 0.7587, 1.0000, 0.7911],
        [0.1010, 0.2801, 0.5862, 0.7911, 1.0000]])


## Load initial Beer reviews file

In [5]:
# Columns to read from the raw per-review file
review_columns = ['review_profilename', 'brewery_id', 'brewery_name',
                  'review_overall', 'beer_style', 'beer_name',
                  'beer_abv', 'beer_beerid']
reviews_df = pd.read_csv("beer_reviews.csv", usecols=review_columns)

# Keep only the reviews whose beer id also appears in df
known_beer_ids = df.beer_id.unique()
reviews_df = reviews_df.loc[reviews_df.beer_beerid.isin(known_beer_ids)]
reviews_df.head()

Unnamed: 0,brewery_id,brewery_name,review_overall,review_profilename,beer_style,beer_name,beer_abv,beer_beerid
10,163,Amstel Brouwerij B. V.,3.0,fodeeoz,Light Lager,Amstel Light,3.5,436
18,163,Amstel Brouwerij B. V.,2.5,jdhilt,Light Lager,Amstel Light,3.5,436
30,163,Amstel Brouwerij B. V.,3.0,xXTequila,Light Lager,Amstel Light,3.5,436
40,163,Amstel Brouwerij B. V.,3.0,Brent,Light Lager,Amstel Light,3.5,436
257,1075,Caldera Brewing Company,4.0,Akfan,American IPA,Caldera IPA,6.1,10784


# Run a test similarity between two beers

In [6]:
# Per-feature weights handed to the similarity and recommendation helpers
weights = {
    'brewery_name': 0.5,
    'beer_abv': 1,
    'country': 0.2,
    'beer_style': 1,
    'score': 1,
    'rating_mean': 1,
    'reviews': 1,
}

# Compare two specific beers using the frame, the weights and the similarity matrix
sim_v2(
    'Founders KBS (Kentucky Breakfast Stout)',
    'Portsmouth Kate The Great',
    df,
    weights,
    sim_matrix,
)

(4.24,
 [('score', 1.0),
  ('rating_mean', 0.91),
  ('beer_abv', 0.9),
  ('reviews', 0.85),
  ('beer_style', 0.5),
  ('country', 0.08)])

# Make 50 suggestions for a single beer

In [7]:
# Ask for 50 recommendations for one beer
recommend(
    'Founders KBS (Kentucky Breakfast Stout)',
    50,
    df,
    weights,
    sim_matrix,
)

[("Hunahpu's Imperial Stout",
  (4.95,
   [('beer_style', 1.0),
    ('score', 1.0),
    ('reviews', 1.0),
    ('beer_abv', 0.95),
    ('rating_mean', 0.9),
    ('country', 0.1)])),
 ('Ten FIDY',
  (4.89,
   [('beer_style', 1.0),
    ('reviews', 1.0),
    ('score', 0.97),
    ('beer_abv', 0.95),
    ('rating_mean', 0.87),
    ('country', 0.1)])),
 ("Serpent's Stout",
  (4.88,
   [('beer_abv', 1.0),
    ('beer_style', 1.0),
    ('reviews', 1.0),
    ('score', 0.94),
    ('rating_mean', 0.84),
    ('country', 0.1)])),
 ('Mokah',
  (4.85,
   [('beer_abv', 1.0),
    ('beer_style', 1.0),
    ('reviews', 1.0),
    ('score', 0.94),
    ('rating_mean', 0.83),
    ('country', 0.08)])),
 ('Beer Geek Brunch Weasel',
  (4.82,
   [('beer_abv', 1.0),
    ('beer_style', 1.0),
    ('reviews', 1.0),
    ('score', 0.96),
    ('rating_mean', 0.86)])),
 ('Bourbon County Brand Coffee Stout',
  (4.82,
   [('beer_style', 1.0),
    ('score', 1.0),
    ('reviews', 1.0),
    ('rating_mean', 0.91),
    ('beer_abv

# Make simulation using actual users

We will make predictions for actual users. Because the data is sparse, we select users with a high number of reviews, which increases the probability of recommending a beer the user has already rated.

In [9]:
# Users selected for the simulation
users = [
    "BuckeyeNation",
    "Foxman",
    "mikesgroove",
    "northyorksammy",
    "OtherShoe2",
]

# Arguments that are identical for every user
shared_settings = dict(
    weights=weights,
    recnum_per_user=20,
    beer_df=df,
    neighbor_num=10,
    liked_sample_size=10,
    sim_matrix=sim_matrix,
    reviews_df=reviews_df,
)

# Run the recommender once per user; only `user` changes between calls
for user in users:
    recommend_beers(user=user, **shared_settings)

Calculating user threshold for user: BuckeyeNation
Threshold is: 3.6449134046231144
YES Smokestack Stout Random Recommendation
YES Chocolate Stout (Already Rated: P) 
YES Liberty Stout (LB No. 50) 
YES Dominion Oak Barrel Stout (Already Rated: P) 
YES Barrel Aged Hickory Stick Stout 
NO Whiskey Malt Random Recommendation
YES Highland Black Mocha Stout (Already Rated: P) 
YES Aphrodisiaque (Already Rated: P) 
YES Triple 7 Black Cherry Stout 
NO Out Of Bounds Stout (Already Rated: N) 
YES Hofbräu Original (Already Rated: P) Random Recommendation
YES Dark Horse Tres Blueberry Stout (Already Rated: P) 
NO Fuel Cafe (Coffee Flavored Stout) (Already Rated: N) 
YES Péché Mortel (Imperial Stout Au Cafe) (Already Rated: P) 
YES Paulaner Original Münchner (Already Rated: P) 
YES Downtown Brown (Already Rated: P) Random Recommendation
YES Spaten Münchner Hell (Premium Lager) (Already Rated: P) 
NO Löwenbräu Original (Already Rated: N) 
YES Franziskaner Hefe-Weisse (Already Rated: P) 
YES Weihenst

# Appendix

In [None]:
# Count reviews per profile name and show the five most active profiles
(
    reviews_df
    .groupby('review_profilename')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .head()
)

In [30]:
from sentence_transformers import SentenceTransformer, util

# Location of the pickled embeddings
embeddings_path: Path = Path('pkl_files') / "embeddings.pkl"

# Load pkl embeddings file
# NOTE(review): pickle.load can execute code contained in the file; only load files you trust.
with embeddings_path.open('rb') as f:
    embeddings = pickle.load(f)

# Index label in df of the row with brewery_id 4 and beer_id 40051
# (raises IndexError if df has no such row). Not used by the search below.
_index = df.loc[(df.brewery_id == 4) & (df.beer_id == 40051)].index[0]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Free-text review that is encoded and compared against every stored embedding
text = "Visual: dark brown, almost rootbeerish red tinge. no lingering head. Aroma: roasted malts, bitter chocolate, burnt sugar, vsl vanilla Taste: best part of the beer - nice burnt caramel, toasted marshmallow, roasted coffee, somehow the IAA still comes through. Overall: if tasted blind i would've thought belgian strong, definitely not typical stout mouthfeel or aroma, especially with the lightness of the body and strong IAA. Another great beer from Allagash."
candidate = model.encode(text)

# Linear scan for the stored embedding most similar to the candidate.
# The running best is kept in `best_sim` rather than `max`, so the builtin
# max() is not shadowed for the rest of the kernel session.
# The scan starts from 0, so only a strictly positive similarity can replace
# the default index_loc of 0.
best_sim = 0
index_loc = 0
for i in range(len(embeddings)):
    sim = util.cos_sim(embeddings[i], candidate)[0][0]
    if sim > best_sim:
        best_sim = sim
        index_loc = i

0
1
25
44
144
248
356
568
