In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from src.utils import tqdm

In [2]:
# Load the pre-processed RateBeer tables (paths are relative to the notebook's working directory).
beers = pd.read_csv("./data/RateBeer_processed/beers.csv")
breweries = pd.read_csv("./data/RateBeer_processed/breweries.csv")
users = pd.read_csv("./data/RateBeer_processed/users.csv")

# Quick sanity check: (rows, columns) of each table.
beers.shape, breweries.shape, users.shape

((396690, 11), (24189, 4), (70112, 7))

In [3]:
# Load the reviews of popular beers and keep only those whose "language" column is "en".
reviews = (
    pd.read_csv("./data/RateBeer_processed/popular_beers_reviews.csv")
    .loc[lambda frame: frame["language"] == "en"]
)
reviews.shape

(894848, 17)

In [4]:
def build_reviews_corpus(reviews, sample_beers=10, expert_ids=None, expert_weight=2, random_state=23):
    """Build one text document per beer by concatenating that beer's review texts.

    Parameters
    ----------
    reviews : pd.DataFrame
        Must contain the columns "beer_name" and "text", and "user_id" when
        *expert_ids* is given.
    sample_beers : int or "all"
        Number of beers to sample. "all", or any number larger than the number
        of distinct beers, selects every beer.
    expert_ids : collection, optional
        User ids whose reviews are repeated so that they weigh more.
    expert_weight : int
        Total number of times each expert review appears in the document
        (2 means the review is included twice).
    random_state : int
        Seed for the beer sampling.

    Returns
    -------
    (corpus, beer_names) : tuple of two lists of equal length
        ``corpus[i]`` is the document of ``beer_names[i]``. The beers come in
        the (seeded) sampling order, not in alphabetical order.
    """
    # Some apostrophes arrive as the mis-decoded sequence "â\x80\x99"; this encoding issue
    # is beyond our control, so we map the sequence back to a real apostrophe.
    broken_apostrophe = re.compile(r"â\x80\x99")

    # "ipa" is often misspelled (e.g. ipaa) and the lemmatizer does not strip the plural "s"
    # of "ipas". Only whole words are normalized: a looser pattern such as \w*ipa\w* would
    # also rewrite unrelated words that merely contain "ipa" (anticipated, participate, ...).
    ipa_variants = re.compile(r"\bipa+s?\b", flags=re.IGNORECASE)

    reviews_by_beer = reviews.groupby("beer_name")

    unique_beers = len(reviews_by_beer)

    if sample_beers == "all" or sample_beers > unique_beers:
        sample_beers = unique_beers

    beer_names = list(reviews_by_beer.size().sample(sample_beers, random_state=random_state).index)

    corpus = []
    for beer_name in beer_names:
        beer_reviews = reviews_by_beer.get_group(beer_name)

        if expert_ids is not None:
            expert_reviews = beer_reviews[beer_reviews["user_id"].isin(expert_ids)]
            # The expert reviews are already present once, so append (expert_weight - 1) copies.
            beer_reviews = pd.concat([beer_reviews] + (expert_weight - 1) * [expert_reviews], ignore_index=True)

        # Missing texts (NaN/None) would make str.join raise, so drop them first.
        document = "\n".join(beer_reviews["text"].dropna().astype(str))

        document = broken_apostrophe.sub("’", document)
        document = ipa_variants.sub("ipa", document)

        corpus.append(document)

    return corpus, beer_names

# NOTE some apostrophes arrive mis-encoded as â\x80\x99 (eg canât, Iâm); this is probably an encoding
# issue (not on our end). build_reviews_corpus maps that sequence back to a real apostrophe (’).
# Small 10-beer sample to eyeball one document before running on the full data set.
corpus, corpus_beers = build_reviews_corpus(reviews, sample_beers=10)
print(corpus_beers[0])
print(corpus[0][:1000])

St Peters Cream Stout
Aroma: prunes and caramel.Taste: sweet molasses, subtle bitterness of dark chocolate and pine flavour. Not as creamy as expected, but a great beer! 
Pitch black with ruby reflections and small brown head. Sme?l is cocoa and roasted coffee. So is the taste plus smoked wood, by it feels like it lacks something in the mouth, which makes it a bit lightweight. 
Aroma of cocoa & milk chocolate. Black with brown long lasting head. Light sweet & medium dry. Medium body, creamy texture & average carbonation. 
Pours almost black with nice reddish hints. Low mocha head, decent retention and hints of lacing. Aroma is soft milk coffee and vanilla with the slightest dusty hint. It’s rather simple but clean and quite intense. Taste has a sweet foundation, milk coffee and the slightest hint of roasted malts, again vanilla and maybe a hint of berries. Overall medium sweet and light to medium bitter, with a touch of burnt caramel in the finish. Medium bodied with very low carbonati

In [5]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

# Tokens of at least two "word" characters (\w: letters, digits and underscore).
TWO_LETTERS_NUMBERS = r"(?u)\b\w\w+\b"
# Tokens made only of ASCII letters, at least three characters long (drops numbers and short words).
THREE_LETTERS = r"(?u)\b[a-zA-Z]{3,}\b"

class Lemmatizer:
    """Callable tokenizer: regex-split a text, POS-tag the tokens and lemmatize each with WordNet.

    An instance can be passed as the ``tokenizer`` of a scikit-learn vectorizer.
    """

    def __init__(self, token_pattern=TWO_LETTERS_NUMBERS):
        # findall on the compiled pattern returns the list of matching tokens.
        pattern = re.compile(token_pattern)
        self.tokenize = pattern.findall
        self.lemmatizer = WordNetLemmatizer()

    def _wordnet_pos(self, tag):
        """Translate a POS tag into a WordNet POS constant, based on the tag's first letter.

        Tags that start with anything other than J, N, V or R fall back to noun.
        """
        initial = tag[0].upper()
        if initial == "J":
            return wordnet.ADJ
        if initial == "V":
            return wordnet.VERB
        if initial == "R":
            return wordnet.ADV
        # "N" and every unknown initial are treated as nouns.
        return wordnet.NOUN

    def lemmatize(self, token, tag):
        """Return the lemma of *token* given its POS *tag*."""
        wordnet_tag = self._wordnet_pos(tag)
        return self.lemmatizer.lemmatize(token, wordnet_tag)

    def __call__(self, text):
        """Tokenize *text* and return the list of lemmas, one per token."""
        tagged_tokens = pos_tag(self.tokenize(text))
        lemmas = []
        for word, tag in tagged_tokens:
            lemmas.append(self.lemmatize(word, tag))
        return lemmas

class Stemmer:
    """Callable tokenizer: regex-split a text and stem each token with the English Snowball stemmer."""

    def __init__(self, token_pattern=TWO_LETTERS_NUMBERS):
        pattern = re.compile(token_pattern)
        self.tokenize = pattern.findall
        snowball = SnowballStemmer("english")  # or PorterStemmer
        self.stem = snowball.stem

    def __call__(self, text):
        """Tokenize *text* and return the list of stems, one per token."""
        return list(map(self.stem, self.tokenize(text)))

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, ENGLISH_STOP_WORDS

def get_stop_words(corpus, tokenizer=None, token_pattern=THREE_LETTERS, max_df=0.9):
    """Get english stop words plus words that are too frequent in *corpus* to be informative.

    Parameters
    ----------
    corpus : sequence of str
        One document per beer.
    tokenizer : class, optional
        Tokenizer class (e.g. ``Lemmatizer`` or ``Stemmer``), NOT an instance; it is
        instantiated here with *token_pattern*.
    token_pattern : str
        Regex describing a token.
    max_df : float
        Words whose document frequency (fraction of documents containing the word)
        is greater than or equal to this value are added to the stop words.

    Returns
    -------
    list of str
        The stop words. When a tokenizer is given, the list is the tokenizer's output for
        the stop words, so that they match the tokens produced for the corpus.
    """
    if tokenizer is not None:
        tokenizer = tokenizer(token_pattern=token_pattern)

    stop_words = set(ENGLISH_STOP_WORDS)

    doc_freq_vectorizer = CountVectorizer(
        binary=True, lowercase=True, strip_accents="ascii", tokenizer=tokenizer,
        token_pattern=token_pattern if tokenizer is None else None
    )

    # binary=True makes every entry 0/1, so the column sums are document counts.
    # Summing the sparse matrix directly avoids materializing a dense documents x words array.
    presence = doc_freq_vectorizer.fit_transform(corpus)
    doc_freqs = np.asarray(presence.sum(axis=0)).ravel() / len(corpus)

    words = doc_freq_vectorizer.get_feature_names_out()
    frequent_words = words[doc_freqs >= max_df]
    stop_words.update(frequent_words)

    extra_stop_words = ["pours"]
    extra_stop_words.extend([
        'argentina', 'thailand', 'turkey', 'united states', 'us', 'trinidad', 'tobago', 'sri lanka',
        'australia', 'aussie', 'australian', 'spain', 'austria', 'belgium', 'brazil', 'brazilian',
        'canada', 'china', 'czech', 'denmark', 'england', 'finland', 'france', 'germany', 'greece',
        'india', 'ireland', 'italy', 'jamaica', 'japan', 'kenya', 'mexico', 'netherlands', 'norway',
        'poland', 'russia', 'scotland', 'singapore', 'chinese', 'french', 'irish', 'italian',
        'jamaican', 'japanese', 'africa', 'african', 'mexican', 'alaska', 'alaskan'
    ])
    #extra_stop_words.extend(["aaaagghh", "aaagh", "aaah", "meh"])

    # Stop words are compared against single tokens, so a multi-word entry such as
    # 'united states' could never match; split such entries into their individual words.
    for entry in extra_stop_words:
        stop_words.update(entry.split())

    # Sort so the result does not depend on set iteration order, which can change between
    # interpreter runs. This matters below: the tokenizer sees the words as one text, and a
    # POS-tagging lemmatizer may produce different lemmas for a different word order.
    stop_words = sorted(stop_words)

    if tokenizer is not None:
        stop_words = tokenizer(" ".join(stop_words))

    return stop_words

def get_word_counts(corpus, beer_names, tokenizer=None, token_pattern=THREE_LETTERS, **vectorizer_kwargs):
    """Get the word counts in the corpus. When using a tokenizer, this takes some time.

    This is separate from computing the tf-idf scores because we may want to do something simple
    like subtracting frequencies when looking for negative words.

    Parameters
    ----------
    corpus : sequence of str
        One document per beer.
    beer_names : sequence
        Beer name of each document, in the same order as *corpus*; becomes the index.
    tokenizer : class, optional
        Tokenizer class (e.g. ``Lemmatizer``), NOT an instance; it is instantiated here
        with *token_pattern*.
    token_pattern : str
        Regex describing a token.
    **vectorizer_kwargs
        Forwarded to ``CountVectorizer`` (e.g. ``stop_words``, ``min_df``, ``vocabulary``).

    Returns
    -------
    pd.DataFrame
        Dense beers x words count matrix, indexed by "beer_name".
    """
    if tokenizer is not None:
        tokenizer = tokenizer(token_pattern=token_pattern)

    vectorizer = CountVectorizer(
        lowercase=True, strip_accents="ascii", tokenizer=tokenizer,
        token_pattern=token_pattern if tokenizer is None else None,
        **vectorizer_kwargs
    )

    counts = vectorizer.fit_transform(corpus)

    # Always take the column labels from the fitted vectorizer. A user-supplied vocabulary
    # may be a mapping or a set whose iteration order differs from the column order of the
    # count matrix, which would mislabel the columns. For a list or Index the labels are
    # the same as the supplied vocabulary.
    vocabulary = vectorizer.get_feature_names_out()

    counts = pd.DataFrame(counts.toarray(), columns=vocabulary, index=beer_names)
    counts.index.name = "beer_name"
    return counts

def get_tfidf_scores(counts):
    """Turn a beers x words count table into tf-idf scores with the same index and columns.

    A fresh ``TfidfTransformer`` is fitted on *counts* at every call.
    """
    transformer = TfidfTransformer()
    weighted = transformer.fit_transform(counts).toarray()

    scores = pd.DataFrame(weighted, index=counts.index, columns=counts.columns)
    scores.index.name = "beer_name"
    return scores

def get_top_attributes(scores, top_attributes=10, column_prefix="attr_"):
    """Return, for every beer, the words with the highest scores.

    Parameters
    ----------
    scores : pd.DataFrame
        One row per beer, one column per word.
    top_attributes : int
        Number of words to keep per beer.
    column_prefix : str
        Prefix of the output columns, which are named ``<prefix>1`` .. ``<prefix>N``.

    Returns
    -------
    pd.DataFrame
        Indexed like *scores* (index named "beer_name"), with *top_attributes* columns
        holding the words in decreasing score order. If *scores* has fewer words than
        *top_attributes*, the remaining columns are filled with None.
    """
    column_names = [f"{column_prefix}{i+1}" for i in range(top_attributes)]

    rows = []
    for _, row in scores.iterrows():
        # Rank ALL the words: every entry of the row is a word score (the beer name lives
        # in the index), so slicing off the first entry would silently ignore one word.
        # A stable sort makes the order of tied scores deterministic.
        ranked = row.sort_values(ascending=False, kind="stable").head(top_attributes)
        words = list(ranked.index)
        # Pad short rows so that every row has exactly top_attributes entries.
        words.extend([None] * (top_attributes - len(words)))
        rows.append(words)

    # rename() returns a new index, so the caller's index is not modified.
    attributes = pd.DataFrame(rows, index=scores.index.rename("beer_name"), columns=column_names)

    return attributes

def split_worst_and_best_reviews(reviews, percent=10):
    """Split the reviews of each beer at that beer's *percent*-th rating percentile.

    Returns ``(worst_reviews, best_reviews)``: reviews rated at or below the per-beer
    threshold, and reviews rated strictly above it. Both frames carry an extra
    "percentile" column holding the threshold of the review's beer.
    """
    thresholds = reviews.groupby("beer_name")["rating"].quantile(percent / 100)
    thresholds = thresholds.rename("percentile")

    with_threshold = reviews.merge(thresholds, on="beer_name")

    at_or_below = with_threshold["rating"] <= with_threshold["percentile"]
    # Computed explicitly rather than as ~at_or_below: a missing rating belongs to neither side.
    above = with_threshold["rating"] > with_threshold["percentile"]

    return with_threshold[at_or_below], with_threshold[above]

In [7]:
# TODO experts
# Users whose review count is strictly above the 90th percentile are treated as experts.
expert_threshold = users["review_count"].quantile(0.9)
experts = users.loc[users["review_count"] > expert_threshold, "user_id"].tolist()
expert_threshold

77.0

In [8]:
# One document per beer, built from all reviews; with the default expert_weight=2 every
# review written by an expert is included twice.
corpus, corpus_beers = build_reviews_corpus(reviews, sample_beers="all", expert_ids=experts)

# The class itself is passed, not an instance: the helpers instantiate it with their token pattern.
tokenizer = Lemmatizer
stop_words = get_stop_words(corpus, tokenizer=tokenizer)
# stop_words and min_df are forwarded to CountVectorizer through **vectorizer_kwargs.
word_counts = get_word_counts(corpus, corpus_beers, tokenizer=tokenizer, stop_words=stop_words, min_df=0.02)



In [9]:
# Weight the raw counts by tf-idf, then keep the highest scoring words of each beer.
tfidf_scores = get_tfidf_scores(word_counts)
top_attributes = get_top_attributes(tfidf_scores)

In [10]:
# Display the top attributes per beer (columns attr_1 .. attr_10).
top_attributes

Unnamed: 0_level_0,attr_1,attr_2,attr_3,attr_4,attr_5,attr_6,attr_7,attr_8,attr_9,attr_10
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
St Peters Cream Stout,chocolate,coffee,stout,roasty,licorice,cocoa,milk,opaque,beige,peter
Evil Twin Imperial Biscotti Break,chocolate,coffee,biscotti,pitch,cocoa,almond,espresso,mocha,stout,milk
Schneider Weisse Original,clove,hefe,hefeweizen,weizen,weisse,cleave,schneider,hefes,german,weissbier
Port Brewing Older Viscosity,bourbon,chocolate,oak,coffee,barrel,molasses,boozy,licorice,viscosity,pitch
Speakeasy Big Daddy IPA,grapefruit,piney,speakeasy,resin,daddy,coast,west,ipas,francisco,san
...,...,...,...,...,...,...,...,...,...,...
Shock Top Belgian White,coriander,moon,blue,wit,anheuser,busch,witbier,macro,witbiers,wheaty
Het Anker Gouden Carolus Classic,chocolate,raisin,plum,beige,carolus,coffee,prune,gouden,ruby,molasses
Allagash Dubbel Reserve,dubbel,allagash,chocolate,raisin,dubbels,plum,beige,reddish,earlier,ruby
St-Feuillien Cuvée de Noël,christmas,beige,ruby,raisin,anise,chocolate,reddish,cinnamon,plum,licorice


In [11]:
# Per beer: reviews rated at or below the beer's 10th rating percentile vs. the rest.
neg_reviews, pos_reviews = split_worst_and_best_reviews(reviews, 10)

neg_corpus, beer_names = build_reviews_corpus(neg_reviews, sample_beers="all")
pos_corpus, pos_beer_names = build_reviews_corpus(pos_reviews, sample_beers="all")

neg_counts = get_word_counts(neg_corpus, beer_names, tokenizer=tokenizer, stop_words=stop_words, min_df=0.02)

# Label the positive counts with the beer names of the positive corpus itself. A beer whose
# ratings are all equal has no review strictly above its percentile, so it is missing from
# pos_corpus and the two corpora would no longer line up row by row. Reindexing on the
# negative index realigns the rows and gives such beers an all-zero row.
# The vocabulary is fixed to the negative one so that both tables share the same columns.
pos_counts = get_word_counts(
    pos_corpus, pos_beer_names, tokenizer=tokenizer, stop_words=stop_words, vocabulary=neg_counts.columns
).reindex(neg_counts.index, fill_value=0)



In [12]:
# Words that score higher in the worst reviews than in the remaining reviews of the same beer.
# Each get_tfidf_scores call fits its own TfidfTransformer, so the idf weights of the two
# tables come from different corpora.
scores = get_tfidf_scores(neg_counts) - get_tfidf_scores(pos_counts)
top_criticisms = get_top_attributes(scores, column_prefix="crit_")

In [13]:
# Display the top criticisms per beer (columns crit_1 .. crit_10).
top_criticisms

Unnamed: 0_level_0,crit_1,crit_2,crit_3,crit_4,crit_5,crit_6,crit_7,crit_8,crit_9,crit_10
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
St Peters Cream Stout,stout,coffe,motor,english,mocca,oil,treacle,creamer,coffee,ache
Evil Twin Imperial Biscotti Break,almond,stout,boozy,york,extract,ash,impy,liqueur,bomber,sweeten
Schneider Weisse Original,minerality,wheatiness,weisse,muddy,footed,disappointing,cloudiness,thins,complimentary,disappointment
Port Brewing Older Viscosity,barrel,viscosity,bottlesource,boozy,whiskey,tannin,cage,december,booziness,chocolatey
Speakeasy Big Daddy IPA,fold,anemic,box,synthetic,soggy,shallow,bay,brewpub,clingy,goblet
...,...,...,...,...,...,...,...,...,...,...
Shock Top Belgian White,anheuser,garbage,sink,budweiser,waste,money,piss,free,lager,busch
Het Anker Gouden Carolus Classic,emperor,coffee,beutiful,allright,bro,alco,glow,storage,kill,transition
Allagash Dubbel Reserve,dubbels,double,abbey,tripel,blind,crab,steam,hazelnut,mahogony,attempt
St-Feuillien Cuvée de Noël,anise,juniper,carrot,bia,mint,rotten,boozey,raisins,booziness,ferment


In [14]:
# Attach the attributes and criticisms to the beer table and save the result.
# Both merges use pandas' default inner join on "beer_name", so only beers present in
# top_attributes and top_criticisms are kept.
# NOTE(review): the corpora are grouped by beer_name only; if names are not unique in
# beers.csv, every beer sharing a name gets the same attributes. Confirm this is acceptable.
beers_with_attributes = beers.merge(top_attributes.reset_index(), on="beer_name")
beers_with_attributes = beers_with_attributes.merge(top_criticisms.reset_index(), on="beer_name")
beers_with_attributes.to_csv("./data/RateBeer_processed/popular_beers_with_attributes.csv", index=False)