> Chalkiopoulos Georgios, Electrical and Computer Engineer NTUA <br />
> Data Science postgraduate Student <br />
> gchalkiopoulos@aueb.gr

# Import Libraries

In [3]:
import csv
from pathlib import Path
from dataclasses import dataclass, fields
from datetime import datetime

from typing import List, Optional, Union, Set
from pprint import pprint

from collections import defaultdict
import numpy as np

import pandas as pd

import pickle
import logging

from itertools import combinations
from collections import defaultdict


# utils
from utils.Loggers import BaseLogger

# Read data

In [4]:
@dataclass
class BeerReview(object):
    """A single beer review, one instance per CSV row.

    Instances are built positionally, so the order of the fields below is
    part of the interface. Dataclass annotations are documentation only;
    nothing is validated or converted at runtime.
    """
    index: Optional[int]  # None when the CSV "index" cell is not numeric
    brewery_id: str
    brewery_name: str
    review_time: datetime  # a datetime instance (not the `datetime.timestamp` method)
    review_overall: float
    review_aroma: float
    review_appearance: float
    review_profilename: str
    beer_style: str
    review_palate: float
    review_taste: float
    beer_name: str
    beer_abv: float
    beer_beerid: int
    user_id: Optional[int] = None  # sequential id assigned by the reader, if any

In [5]:
class ReviewReader(BaseLogger):
    """Read beer reviews from a CSV file and keep only sufficiently active users/beers.

    While reading, the reader counts reviews per user name and per beer id and
    builds two lookup tables: ``user_mapping`` (sequential user id -> user name)
    and ``beer_mapping`` (beer id -> beer name).

    NOTE(review): ``self.logger`` is assumed to be provided by ``BaseLogger``.
    """
    # Rows produced by `make_array`, one list per CSV line. Only declared here;
    # the list is created per instance so separate readers (and notebook
    # re-runs) never share or accumulate rows.
    reviews: List[list]

    def __init__(self, file_path: str,
                 user_reviews_threshold: int = 20,
                 beer_reviews_threshold: int = 20,
                 *args, **kwargs):
        """
        Args:
            file_path: path of the CSV file holding the reviews.
            user_reviews_threshold: minimum number of reviews a user needs to be kept.
            beer_reviews_threshold: minimum number of reviews a beer needs to be kept.
            *args, **kwargs: forwarded to ``BaseLogger``.
        """
        super().__init__(*args, **kwargs)
        self.file_path = file_path
        self.user_reviews_threshold = user_reviews_threshold
        self.beer_reviews_threshold = beer_reviews_threshold

        self.reviews = []

        # review counts, keyed by user name / beer id
        self.users: defaultdict = defaultdict(int)
        self.beers: defaultdict = defaultdict(int)

        # user id -> user name, beer id -> beer name
        self.user_mapping: defaultdict = defaultdict(str)
        self.beer_mapping: defaultdict = defaultdict(str)

    @property
    def input_file(self) -> Path:
        """
        Creates a Path object for the input file.
        Logs an error if the file doesn't exist; nothing is raised here, the
        failure surfaces when the path is actually opened.

        Returns:
            Path object
        """
        input_file: Path = Path(self.file_path)
        if not input_file.is_file():
            self.logger.error(f"{input_file.name} file doesn't exist.")

        return input_file

    @property
    def valid_users(self) -> list:
        """Returns the user names that have at least user_reviews_threshold reviews"""
        return [user for user, total_ratings in self.users.items() if total_ratings >= self.user_reviews_threshold]

    @property
    def valid_beers(self) -> list:
        """Returns the beer ids that have at least beer_reviews_threshold reviews"""
        return [beer for beer, total_ratings in self.beers.items() if total_ratings >= self.beer_reviews_threshold]

    @staticmethod
    def make_array(_review: BeerReview):
        """Select the columns kept from a BeerReview object, in DataFrame column order:
        user_id, review_profilename, beer_beerid, beer_name, review_overall,
        review_aroma, review_appearance, review_taste, review_palate, brewery_id"""

        return [_review.user_id, _review.review_profilename,
                _review.beer_beerid, _review.beer_name, _review.review_overall,
                _review.review_aroma, _review.review_appearance, _review.review_taste, _review.review_palate,
                _review.brewery_id]

    def filtered_reviews(self, reviews_df: pd.DataFrame) -> pd.DataFrame:
        """Keeps only rows whose user and beer both meet their review-count
        thresholds and whose user name is not empty"""

        return reviews_df.loc[(reviews_df.user_name.isin(self.valid_users))
                              & (reviews_df.beer_id.isin(self.valid_beers))
                              & (reviews_df.user_name != "")]

    def read_reviews(self) -> pd.DataFrame:
        """
        Read the reviews from the input file.

        Returns:
            A DataFrame with one row per kept review (see `make_array` for the
            columns), already filtered through `filtered_reviews`.
        """
        # Start from a clean slate so calling this twice does not duplicate
        # rows or double the per-user / per-beer counts.
        self.reviews = []
        self.users.clear()
        self.beers.clear()

        user_ids: dict = {}  # stripped user name -> sequential id
        next_user_id: int = 0

        input_file = self.input_file

        # newline="" is what the csv module asks for when given a file object
        with open(input_file, encoding="utf8", newline="") as f:

            self.logger.info(f"Loading {input_file}")

            for i, row in enumerate(csv.DictReader(f)):

                # The stripped name is used everywhere (id lookup, mapping,
                # counts) so padded names cannot cause a KeyError.
                profile_name = row["review_profilename"].strip()

                if profile_name not in user_ids:
                    user_ids[profile_name] = next_user_id
                    self.user_mapping[next_user_id] = profile_name
                    next_user_id += 1

                # remember the first non-empty name seen for each beer id
                if not self.beer_mapping.get(row["beer_beerid"]):
                    self.beer_mapping[row["beer_beerid"]] = row["beer_name"]

                review: BeerReview = BeerReview(
                    index=int(row["index"]) if row["index"].isnumeric() else None,
                    brewery_id=row["brewery_id"],
                    brewery_name=row["brewery_name"].strip(),
                    review_time=datetime.fromtimestamp(int(row["review_time"])),
                    review_overall=float(row["review_overall"]),
                    review_aroma=float(row["review_aroma"]),
                    review_appearance=float(row["review_appearance"]),
                    review_profilename=profile_name,
                    beer_style=row["beer_style"].strip(),
                    # converted like the other scores so all rating columns are numeric
                    review_palate=float(row["review_palate"]),
                    review_taste=float(row["review_taste"]),
                    beer_name=row["beer_name"],
                    # NOTE(review): kept as raw CSV text (ids stay strings, abv is
                    # not parsed) — confirm before converting, abv may be blank
                    beer_abv=row["beer_abv"],
                    beer_beerid=row["beer_beerid"],
                    user_id=user_ids[profile_name])

                self.reviews.append(self.make_array(review))

                # keep review counts per user name and per beer id
                self.users[review.review_profilename] += 1
                self.beers[review.beer_beerid] += 1

                if i % 300000 == 0 and i != 0:
                    self.logger.info(f"{i} reviews loaded.")

        self.logger.info(f"All reviews loaded. Total reviews: {len(self.reviews)}")

        # Convert to Pandas Dataframe
        reviews_df = pd.DataFrame(self.reviews,
                                  columns=["user_id", "user_name",
                                           "beer_id", "beer_name", "review_overall",
                                           "review_aroma", "review_appearance", "review_taste", "review_palate",
                                           "brewery_id"])

        return self.filtered_reviews(reviews_df)


In [6]:
# Path is relative to the notebook's working directory.
file_path: str = "beer_reviews.csv"
reader = ReviewReader(file_path=file_path)  # default thresholds: 20 reviews per user and per beer
reviews: pd.DataFrame = reader.read_reviews()  # result is already filtered by those thresholds

[2023-02-26 19:35:27,074] INFO [ReviewReader] - Loading beer_reviews.csv
[2023-02-26 19:35:33,519] INFO [ReviewReader] - 300000 reviews loaded.
[2023-02-26 19:35:39,614] INFO [ReviewReader] - 600000 reviews loaded.
[2023-02-26 19:35:45,949] INFO [ReviewReader] - 900000 reviews loaded.
[2023-02-26 19:35:52,467] INFO [ReviewReader] - 1200000 reviews loaded.
[2023-02-26 19:35:58,624] INFO [ReviewReader] - 1500000 reviews loaded.
[2023-02-26 19:36:00,406] INFO [ReviewReader] - All reviews loaded. Total reviews: 1586614


In [8]:
# Peek at the first rows of the filtered reviews.
reviews.head()

Unnamed: 0,user_id,user_name,beer_id,beer_name,review_overall,review_aroma,review_appearance,review_taste,review_palate,brewery_id
10,7,fodeeoz,436,Amstel Light,3.0,2.0,3.0,2.5,2.5,163
18,15,jdhilt,436,Amstel Light,2.5,3.0,3.0,2.0,2.0,163
19,16,UCLABrewN84,58046,Rauch Ür Bock,4.5,4.5,3.0,4.5,4.0,1075
20,17,zaphodchak,58046,Rauch Ür Bock,4.0,4.0,4.0,4.0,3.0,1075
21,18,Tilley4,58046,Rauch Ür Bock,4.0,4.5,4.0,4.0,3.5,1075


In [70]:
import sys


def progressbar(it, prefix="", size=60, out=sys.stdout, print_every=100): # Python3.6+
    """
    Yield the items of `it` while drawing a text progress bar on `out`.

    Args:
        it: a sized iterable (must support len()).
        prefix: text printed in front of the bar.
        size: width of the bar in characters.
        out: stream the bar is written to.
        print_every: redraw the bar every this many items (and on the last one);
            values below 1 are treated as 1.

    Yields:
        The items of `it`, unchanged and in order.
    """
    count = len(it)
    step = max(1, print_every)

    def show(j):
        if j % step == 0 or j == count:
            # an empty iterable is drawn as a full bar instead of dividing by zero
            filled = int(size*j/count) if count else size
            print(f"{prefix}[{u'█'*filled}{('.'*(size-filled))}] {j}/{count}", end='\r', file=out, flush=True)

    show(0)
    for i, item in enumerate(it):
        yield item
        show(i+1)
    print("\n", flush=True, file=out)


def discretize_rating(rating: float):
    """
    Map a numeric rating to a one-letter polarity label:
    'N' (negative) below 2.5, 'P' (positive) above 3.5, 'A' (average) otherwise.
    """
    if rating < 2.5:
        return 'N'  # negative

    if rating > 3.5:
        return 'P'  # positive

    # 2.5 and 3.5 themselves, and everything in between, count as average
    return 'A'


def load_ratings(ratings_df : pd.DataFrame,
                 focus : list # used to pick beer or user-based
                 ):

    """
    Loads all the ratings submitted by each user or all ratings submitted for a beer

    Args:
        ratings_df: reviews dataframe; must contain the two `focus` columns and 'review_overall'
        focus: [entity column, item column], e.g. ["user_id", "beer_id"] for the
            user-based view or ["beer_id", "user_id"] for the beer-based view

    Returns:
        a dictionary that maps each user to a second dict that maps beers to discretized ratings

        OR

        a dictionary that maps each beer to a second dict that maps raters to discretized ratings

    """

    entity_col, item_col = focus[0], focus[1]

    ratings: dict = {} # store ratings per entity

    # Group once instead of re-filtering the whole frame for every id.
    # sort=False keeps the groups in order of first appearance.
    # NOTE(review): groupby drops rows whose entity id is NaN.
    grouped = ratings_df.groupby(entity_col, sort=False)[[item_col, 'review_overall']]

    for id_, my_ratings in progressbar(grouped, "Loading: ", 60): # for each user or beer

        # groupby yields numpy scalars for numeric keys; store plain Python values
        key = id_.item() if isinstance(id_, np.generic) else id_

        # discretize the ratings and attach them to the user or to the beer
        ratings[key] = dict(zip(my_ratings[item_col], my_ratings.review_overall.apply(discretize_rating)))

    return ratings


def get_neighbors(ratings:dict, # ratings submitted by each user or by each beer
                   min_rating_num:int=5 # at least this many ratings are required for a comparison
                      ):

    '''
    Compute rating-based (Jaccard) similarity between every pair of users or pair of beers

    Args:
        ratings: maps each entity id to a dict of {rated item id: polarity}
        min_rating_num: entities with fewer ratings than this are never compared

    Returns:
        a dict mapping each compared entity id to a list of (other id, similarity)
        tuples, sorted by similarity in descending order

    '''

    ids=list(ratings.keys())

    # build each entity's set of (item id, polarity) tuples once, not once per pair
    rating_sets={id_:set(ratings[id_].items()) for id_ in ids}

    # entities that respect the lower bound
    eligible={id_ for id_ in ids if len(rating_sets[id_])>=min_rating_num}

    sim=defaultdict(dict) # initialize the sim dictionary

    # number of unordered pairs; the pairs themselves are generated lazily
    N=len(ids)*(len(ids)-1)//2

    print(f"Total Pairs: {N}")
    for cnt,(id1,id2) in enumerate(combinations(ids,2),start=1): # for every entity pair

        if cnt%100000==0:
            print(cnt,'out of',N,' pairs evaluated')

        # check if both entities respect the lower bound
        if id1 not in eligible or id2 not in eligible: continue

        s1=rating_sets[id1]
        s2=rating_sets[id2]

        # get the union and intersection for these two entities
        union=s1.union(s2)
        inter=s1.intersection(s2)

        # two empty rating sets (only possible with min_rating_num<=0) carry no information
        if not union: continue

        # compute the similarity via the jaccard coeff
        jacc=len(inter)/len(union)

        # remember the sim values
        sim[id1][id2]=jacc
        sim[id2][id1]=jacc

    # attach each entity to its neighbors, sorted by sim in descending order
    return {id_:sorted(sim[id_].items(),key=lambda x:x[1], reverse=True) for id_ in sim}

In [7]:
# Map each user_id to {beer_id: discretized rating}.
user_ratings = load_ratings(reviews, ["user_id", "beer_id"])
# Beer-based variant (each beer_id -> {user_id: discretized rating}); currently disabled.
# beer_ratings = load_ratings(reviews, ["beer_id", "user_id"])

Loading: [███████████████████████████████████████████████████████████.] 7700/7705



In [8]:
# Load the cached user-user neighbors, or compute and cache them.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# produced by this notebook.
try:
    with open('neighbors_u.pkl', 'rb') as handle:
        neighbors_u = pickle.load(handle)
    BaseLogger().logger.info("neighbors_u.pkl file loaded.")

except (FileNotFoundError, EOFError):

    BaseLogger().logger.info("neighbors_u.pkl file not found. Computing neighbor scores.")
    neighbors_u=get_neighbors(user_ratings)

    # the with-block closes the file; no explicit close() is needed
    with open('neighbors_u.pkl', 'wb') as handle:
        pickle.dump(neighbors_u, handle, protocol=pickle.HIGHEST_PROTOCOL)

[2023-02-26 18:43:22,366] INFO [BaseLogger] - neighbors_b.pkl file loaded.


In [10]:
# Load the cached beer-beer neighbors, or compute and cache them.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# produced by this notebook.
try:
    with open('neighbors_b.pkl', 'rb') as handle:
        neighbors_b = pickle.load(handle)
    BaseLogger().logger.info("neighbors_b.pkl file loaded.")

except (FileNotFoundError, EOFError):

    BaseLogger().logger.info("neighbors_b.pkl file not found. Computing neighbor scores.")

    # Built here, only when the cache is missing, so this cell does not depend
    # on a variable that no executed cell defines.
    beer_ratings = load_ratings(reviews, ["beer_id", "user_id"])

    neighbors_b=get_neighbors(beer_ratings)

    # the with-block closes the file; no explicit close() is needed
    with open('neighbors_b.pkl', 'wb') as handle:
        pickle.dump(neighbors_b, handle, protocol=pickle.HIGHEST_PROTOCOL)

[2023-02-26 11:37:23,608] INFO [BaseLogger] - neighbors_b.pkl file not found. Computing neighbor scores.


Total Pairs: 42186705
100000 out of 42186705  pairs evaluated
200000 out of 42186705  pairs evaluated
300000 out of 42186705  pairs evaluated
400000 out of 42186705  pairs evaluated
500000 out of 42186705  pairs evaluated
600000 out of 42186705  pairs evaluated
700000 out of 42186705  pairs evaluated
800000 out of 42186705  pairs evaluated
900000 out of 42186705  pairs evaluated
1000000 out of 42186705  pairs evaluated
1100000 out of 42186705  pairs evaluated
1200000 out of 42186705  pairs evaluated
1300000 out of 42186705  pairs evaluated
1400000 out of 42186705  pairs evaluated
1500000 out of 42186705  pairs evaluated
1600000 out of 42186705  pairs evaluated
1700000 out of 42186705  pairs evaluated
1800000 out of 42186705  pairs evaluated
1900000 out of 42186705  pairs evaluated
2000000 out of 42186705  pairs evaluated
2100000 out of 42186705  pairs evaluated
2200000 out of 42186705  pairs evaluated
2300000 out of 42186705  pairs evaluated
2400000 out of 42186705  pairs evaluated
250

In [65]:
def recommend_ub(user:int,
                 beer_mapping:dict, # beer id -> beer name
                 neighbors_u:dict, # neighbors dict
                 user_ratings:dict, # ratings submitted per user
                 jaccard_threshold:float, # neighbors less similar than this are ignored
                 rec_num:int,# number of beers to recommend
                 verbose:bool=True
                ):

    """
    Delivers user-based recommendations. Given a specific user:
    - find the user's most similar users
    - Go over all the beers rated by all neighbors
    - Each beer gets +2 if a neighbor liked it, -2 if a neighbor didn't like it, -1 if  neighbor was neutral
    - +2,-1,and -2 are scaled based on user sim
    - Sort the beer by their scores in desc order
    - Go over the sorted beer list. If the user has already rated the beer, store its rating. Otherwise print.

    Printed scores are divided by the user's total number of neighbors.
    Nothing is returned; with verbose=False the function produces no output.
    A user without neighbors or without ratings simply gets no recommendations.
    """

    user_neighbors=neighbors_u.get(user,[])
    own_ratings=user_ratings.get(user,{})

    votes=defaultdict(int) # count the votes per beer

    for neighbor,sim_val in user_neighbors: # for each neighbor

        # assumes the neighbor list is sorted by similarity in descending order,
        # so every remaining neighbor is below the threshold as well
        if sim_val<jaccard_threshold:
            break

        if verbose:
            print('neighbor',neighbor)

        for mid,pol in user_ratings[neighbor].items(): # for each beer rated by this neighbor

            if pol=='P': # positive neighbor rating
                votes[mid]+=2*sim_val
            elif pol=='N': # negative
                votes[mid]-=2*sim_val
            else: # average
                votes[mid]-=1*sim_val

    # sort the beers in desc order
    srt=sorted(votes.items(),key=lambda x:x[1], reverse=True)

    if verbose: print('\nI suggest the following beers because they have received positive ratings\n'
                      'from users who tend to like what you like:\n')

    cnt=0 # count number of recommendations made

    already_rated={}

    for beer, score in srt: # for each beer

        # .get never raises KeyError, so the fallback title is passed as its default
        title=beer_mapping.get(beer,'placeholder')

        rat=own_ratings.get(beer,None) # check if the user has already rated the beer

        if rat: # beer already rated
            already_rated[title]=rat # store the rating
            continue

        cnt+=1 # one more recommendation
        # votes is non-empty here, so the user has at least one neighbor
        if verbose: print('\n',beer, title, score/len(user_neighbors)) # print

        if cnt==rec_num:break # stop once you 've made enough recommendations

    if verbose: print('\n',already_rated)

In [71]:
# User-based recommendations for user_id 5: Jaccard threshold 0.1, up to 10 beers.
recommend_ub(5, reader.beer_mapping, neighbors_u, user_ratings, 0.1, 10)

neighbor 1463
neighbor 283
neighbor 695
neighbor 534
neighbor 1635
neighbor 1427
neighbor 1533
neighbor 964
neighbor 1787
neighbor 2702
neighbor 1677
neighbor 209
neighbor 357
neighbor 709
neighbor 968
neighbor 623
neighbor 166
neighbor 1777
neighbor 2834
neighbor 1729
neighbor 2366
neighbor 164
neighbor 452
neighbor 155
neighbor 1483
neighbor 1433
neighbor 1835
neighbor 134
neighbor 1577
neighbor 1529
neighbor 4277
neighbor 1791
neighbor 973
neighbor 1644
neighbor 83
neighbor 1102
neighbor 3324
neighbor 1438
neighbor 325
neighbor 675
neighbor 5217
neighbor 3100
neighbor 1430
neighbor 1476
neighbor 2566
neighbor 3946
neighbor 4169
neighbor 665
neighbor 2279
neighbor 1658
neighbor 1866
neighbor 713
neighbor 942
neighbor 2571
neighbor 458
neighbor 875
neighbor 2378
neighbor 2387
neighbor 468
neighbor 200
neighbor 86
neighbor 1795
neighbor 2164
neighbor 672
neighbor 369
neighbor 1429
neighbor 1895
neighbor 703
neighbor 465
neighbor 1886
neighbor 1790
neighbor 1959
neighbor 4090
neighbor 2

In [60]:
# Step-by-step version of the user-based recommendation, kept at top level so
# the intermediate values (votes, srt, already_rated) can be inspected.
TARGET_USER=5 # user to receive the recommendations
JACCARD_THRESHOLD=0.1 # neighbors less similar than this are ignored
REC_NUM=10 # number of beers to recommend

votes=defaultdict(int) # count the votes per beer

for neighbor,sim_val in neighbors_u[TARGET_USER]: # for each neighbor

    # assumes the neighbor list is sorted by similarity in descending order
    if sim_val<JACCARD_THRESHOLD:
        break

    for mid,pol in user_ratings[neighbor].items(): # for each beer rated by this neighbor

        if pol=='P': # positive neighbor rating
            votes[mid]+=2*sim_val
        elif pol=='N': # negative
            votes[mid]-=2*sim_val
        else: # average
            votes[mid]-=1*sim_val

# sort the beers in desc order
srt=sorted(votes.items(),key=lambda x:x[1], reverse=True)

cnt=0 # count number of recommendations made

already_rated={}

for beer, score in srt: # for each beer

    # .get never raises KeyError, so the fallback title is passed as its default
    title=reader.beer_mapping.get(beer,'placeholder')

    rat=user_ratings[TARGET_USER].get(beer,None) # check if the user has already rated the beer

    if rat: # beer already rated
        already_rated[title]=rat # store the rating
        continue

    cnt+=1 # one more recommendation
    # score normalised by the user's total number of neighbors
    print('\n',beer, title, score/len(neighbors_u[TARGET_USER])) # print

    if cnt==REC_NUM:break # stop once you 've made enough recommendations

print('\n',already_rated)



 1904 Sierra Nevada Celebration Ale 0.013360071743706525

 2751 Racer 5 India Pale Ale 0.012433817723787242

 35738 Hop Stoopid 0.010567764956125669

 6549 Northern Hemisphere Harvest Wet Hop Ale 0.009919087753307534

 16403 Smuttynose IPA "Finest Kind" 0.009363684137463213

 646 Westmalle Trappist Tripel 0.00905988902662376

 33644 B.O.R.I.S. The Crusher Oatmeal-Imperial Stout 0.008701885097014772

 7348 Founders Porter 0.008602518142462203

 3916 AleSmith IPA 0.008565074732287195

 22505 Green Flash West Coast I.P.A. 0.008559233188487532

 {'Founders Breakfast Stout': 'P', 'Two Hearted Ale': 'P', 'Stone Imperial Russian Stout': 'P', "Bell's Hopslam Ale": 'P', 'Old Rasputin Russian Imperial Stout': 'P', 'Stone IPA (India Pale Ale)': 'P', 'St. Bernardus Abt 12': 'P', 'Trappistes Rochefort 10': 'P', 'Weihenstephaner Hefeweissbier': 'P', 'Founders KBS (Kentucky Breakfast Stout)': 'P', 'Hop Rod Rye': 'P', 'Pliny The Elder': 'P', 'Tröegs Nugget Nectar': 'P', 'La Fin Du Monde': 'P', '90 Mi

In [None]:
# Show the user id -> user name lookup as a table indexed by user id
# (the frame has no "beer_id" column, so it cannot be indexed by one).
pd.DataFrame(list(reader.user_mapping.items()), columns=["user_id", "user_name"]).set_index("user_id").sort_index()

In [None]:
# Load the cached beer indexer, or build and cache it.
# NOTE(review): `Esim` is not defined in the visible part of this notebook; it
# must be imported/defined before this cell runs.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# produced by this notebook.
try:
    with open('indexer_beer.pkl', 'rb') as handle:
        indexer_beer = pickle.load(handle)
    BaseLogger().logger.info("indexer_beer.pkl file loaded.")

except (FileNotFoundError, EOFError):

    indexer_beer=Esim(reviews_df=reviews,
                 beer_id_col='beer_id', user_id_col='user_id',
                 beer_mapping=reader.beer_mapping, user_mapping=reader.user_mapping)

    indexer_beer.index('beer', jaccard_threshold=0.1)

    # the with-block closes the file; no explicit close() is needed
    with open('indexer_beer.pkl', 'wb') as handle:
        pickle.dump(indexer_beer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [97]:
# Load the cached user indexer, or build and cache it.
# NOTE(review): `Esim` is not defined in the visible part of this notebook; it
# must be imported/defined before this cell runs.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# produced by this notebook.
try:
    with open('indexer_usr.pkl', 'rb') as handle:
        indexer_user = pickle.load(handle)
    BaseLogger().logger.info("indexer_usr.pkl file loaded.")

except (FileNotFoundError, EOFError):

    indexer_user=Esim(reviews_df=reviews,
                 beer_id_col='beer_id', user_id_col='user_id',
                 beer_mapping=reader.beer_mapping, user_mapping=reader.user_mapping)

    indexer_user.index('usr', jaccard_threshold=0.1)

    # the with-block closes the file; no explicit close() is needed
    with open('indexer_usr.pkl', 'wb') as handle:
        pickle.dump(indexer_user, handle, protocol=pickle.HIGHEST_PROTOCOL)

[2023-02-26 21:15:20,738] INFO [BaseLogger] - indexer_usr.pkl file loaded.


In [84]:
def recommend_ub(indexer:"Esim", # quoted so the def does not need Esim in scope yet
                 user:int,  # user to receive the recs
                 rec_num:int,# number of beers to recommend
                 verbose:int=0# set the level of verbose, 0 means no output, 1 suggestions, 2 everything
                ):

    '''
    Delivers user-based recommendations. Given a specific user:
    - find the user's most similar users via indexer.get_neighbors('usr', user)
    - Go over all the beers rated by all neighbors
    - Each beer gets +2 if a neighbor liked it, -2 if a neighbor didn't like it, -1 if  neighbor was neutral
    - +2,-1,and -2 are scaled based on user sim
    - Sort the beers by their scores in desc order
    - Go over the sorted beer list. If the user has already rated the beer, store its rating. Otherwise print.

    Nothing is returned; with verbose=0 the function produces no output.

    NOTE: this definition replaces any earlier `recommend_ub` with a different
    signature defined in the same kernel.
    '''

    neighbors=indexer.get_neighbors('usr',user)

    votes=defaultdict(int) # count the votes per beer

    for neighbor,sim_val in neighbors: # for each neighbor

        if verbose >= 2: print('neighbor', neighbor)

        # indexer.user_ratings[...] is iterated as (beer id, polarity) pairs
        for mid,pol in indexer.user_ratings[neighbor]: # for each beer rated by this neighbor

            if pol=='P': # positive neighbor rating
                votes[mid]+=2*sim_val
            elif pol=='N': # negative
                votes[mid]-=2*sim_val
            else: # average
                votes[mid]-=1*sim_val

    # sort the beers in desc order
    srt=sorted(votes.items(),key=lambda x:x[1], reverse=True)

    if verbose >= 1: print('\nI suggest the following beers because they have received positive ratings\n'
                       'from users who tend to like what you like:\n')

    cnt=0 # count number of recommendations made

    already_rated={}

    previous_ratings={x:y for x,y in indexer.user_ratings[user]}

    for beer, score in srt: # for each beer

        # .get never raises KeyError, so the fallback title is passed as its default
        title=indexer.beer_mapping.get(beer,'placeholder')

        rat=previous_ratings.get(beer,None)

        if rat: # beer already rated
            already_rated[title]=rat # store the rating
            continue

        cnt+=1 # one more recommendation
        if verbose >= 1: print('\n',beer, title, score) # print

        if cnt==rec_num:break # stop once you 've made enough recommendations

    if verbose >= 1: print('\n',already_rated)


In [98]:
# Use the user indexer loaded above; no variable named plain `indexer` is
# defined anywhere in the visible notebook.
recommend_ub(indexer_user, user=5, rec_num=10, verbose=1)


I suggest the following beers because they have received positive ratings
from users who tend to like what you like:


 1904 Sierra Nevada Celebration Ale 241.14936616998204

 2751 Racer 5 India Pale Ale 214.68552239494002

 35738 Hop Stoopid 172.68441615286685

 6549 Northern Hemisphere Harvest Wet Hop Ale 168.24977833634478

 1117 Bell's Kalamazoo Stout 164.42613692444684

 6260 Punkin Ale 158.49712795180852

 22505 Green Flash West Coast I.P.A. 156.02965780040668

 16403 Smuttynose IPA "Finest Kind" 155.51994438419283

 646 Westmalle Trappist Tripel 153.56674168378598

 8919 G'Knight Imperial Red Ale 152.68990322014932

 {'Old Rasputin Russian Imperial Stout': 'P', 'Two Hearted Ale': 'P', '90 Minute IPA': 'P', 'Founders Breakfast Stout': 'P', 'Stone Imperial Russian Stout': 'P', 'Stone Ruination IPA': 'A', 'Stone IPA (India Pale Ale)': 'P', 'St. Bernardus Abt 12': 'P', 'Trappistes Rochefort 10': 'P', "Bell's Hopslam Ale": 'P', 'Brooklyn Black Chocolate Stout': 'P', 'La Fin Du Monde