In [None]:
import pandas as pd
import numpy as np
import gzip

# Data processing and manipulation
import re
from multiprocessing import Pool
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK for natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the NLTK data packages requested by this notebook, in a fixed order.
for resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'vader_lexicon',
):
    nltk.download(resource)

# Load the three CSV tables from the local data/ directory.
beers, breweries, users = map(
    pd.read_csv,
    ('data/beers.csv', 'data/breweries.csv', 'data/users.csv'),
)

### Extracting and processing the reviews

We extract the tokens of 1 million reviews to get a good sense of the review vocabulary and of the token distribution.

The processing steps are the following:
- Lowercase the text
- Remove the punctuation
- Apply lemmatization
- Keep only nouns and adjectives. This step is very useful, as these types of words are the ones that describe a beer most effectively.

In [None]:
from process_reviews import get_tokens

# get_tokens (from process_reviews) is expected to return, per review, the tokens left after
# stopword filtering, punctuation removal, lemmatization and keeping only nouns and adjectives.
# NOTE(review): the limit below is ten million, while the accompanying text speaks of
# 1 million reviews -- confirm which one is intended.
all_tokens = get_tokens(
    'data/reviews.txt.gz',
    review_limit=10_000_000,
)

### Vectorizing with TF-IDF and getting the best features out

In a straightforward way, we build a TF-IDF matrix from the tokens of each review (after the text pre-processing mentioned above). We then retain the *1000* most frequent words.

In [None]:
# TF-IDF over the pre-processed reviews:
#   max_features=1000 -> keep at most the 1000 top terms
#   min_df=0.005      -> ignore terms appearing in fewer than 0.5% of the documents
vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=0.005,
)

# Each review's token list is re-joined into a single space-separated document.
X = vectorizer.fit_transform(list(map(" ".join, all_tokens)))

In [28]:
# Query the fitted vocabulary once and reuse it for both the set and the preview.
feature_names = vectorizer.get_feature_names_out()
top_feats = set(feature_names)

# Preview every 100th term of the vocabulary.
print('Some Feature Names : ', feature_names[::100])

Some Feature Names :  ['able' 'bubble' 'day' 'finish' 'ice' 'middle' 'place' 'semi' 'sweetish'
 'wet']


As we can see from the output above, when we sample every 100th word of the 1000-word vocabulary, some of them are not very useful for describing a beer. They may appear in reviews, but they are not what we are looking for when trying to describe a beer.

### From the top 1000 features of our TF-IDF matrix, we manually select the ones that have something to do with beer

The strategy is to look for the words that actually describe a beer. From the 1000 most common words, we pick the ones that are beer-related. This lexicon becomes our new vocabulary for describing beer and will further be used for filtering the reviews.

In [None]:
# Hand-picked beer-related terms, selected from the 1000 word vocabulary of the TF-IDF matrix.
# Each term is listed once; the thematic grouping is for readability only.
beer_lexicon = {
    # Beer styles, brewing and serving
    'beer', 'ale', 'lager', 'stout', 'porter', 'pilsner', 'ipa', 'brewer',
    'brewery', 'brewing', 'draft', 'cask', 'bottle', 'glass', 'pour', 'tap',
    # Head and carbonation
    'head', 'foam', 'frothy', 'bubbly', 'carbonated', 'flat',
    # Clarity and colour
    'clarity', 'hazy', 'opaque', 'cloudy', 'clear', 'golden', 'amber',
    'brown', 'dark', 'black', 'mahogany', 'ruby', 'white', 'pale', 'yellow',
    'beige', 'red',
    # Aroma, flavour and mouthfeel
    'floral', 'herbal', 'earthy', 'spicy', 'citrusy', 'fruity', 'sweet',
    'sour', 'bitter', 'bitterness', 'smooth', 'rich', 'dry', 'crisp',
    'creamy', 'sticky', 'tart', 'tangy', 'sweetness', 'malty', 'hoppy',
    'aromatic', 'aroma', 'aftertaste', 'subtle', 'intense',
    # Ingredients and flavour notes
    'barley', 'malt', 'grain', 'yeast', 'hops', 'hop', 'water', 'spices',
    'vanilla', 'chocolate', 'caramel', 'toffee', 'coffee', 'cocoa', 'citrus',
    'orange', 'lemon', 'grapefruit', 'pine', 'oak', 'nutty', 'raisin', 'plum',
    'cherry', 'maple', 'banana', 'berry', 'ginger',
    # Drinking context
    'session', 'sessionable', 'pint', 'ounce', 'snifter', 'chalice', 'pub',
    'bar', 'draught', 'taste', 'drink', 'drinker', 'sip', 'gulp', 'quaffable',
    # Strength, character and appraisal
    'balanced', 'strong', 'bold', 'weak', 'delicate', 'refreshing', 'complex',
    'light', 'heavy', 'moderate', 'robust', 'unique', 'classic',
    'distinctive', 'remarkable', 'awesome', 'excellent', 'superb', 'good',
    'decent', 'ok', 'average', 'bad', 'boring', 'cheap', 'quality',
    # Ageing, provenance and occasion
    'aged', 'vintage', 'cellar', 'barrel', 'reserve', 'craft', 'microbrewery',
    'homebrew', 'regional', 'seasonal', 'festive', 'holiday', 'celebration',
    'festival', 'special',
}

### We filter the beer-related tokens, getting rid of the rest
Each review is reduced to its beer-related tokens, i.e. the tokens that belong to `beer_lexicon`.

In [77]:
# Review by review, keep only the tokens that belong to beer_lexicon (order and repeats preserved).
all_tokens_beer = [
    [token for token in review_tokens if token in beer_lexicon]
    for review_tokens in all_tokens
]

### Recreating a Vectorizer (TF-IDF) focusing only on the beer lexicon

With the goal of simplifying our model and reducing the noise, we build a new TF-IDF matrix whose vocabulary is `beer_lexicon`.

In [None]:
# Second TF-IDF vectorizer, with its vocabulary fixed to the beer lexicon.
vectorizer_beer = TfidfVectorizer(vocabulary=beer_lexicon)

# Re-join each review's filtered tokens into one space-separated document,
# then fit and transform to obtain the beer-only TF-IDF matrix.
X_beer = vectorizer_beer.fit_transform(list(map(" ".join, all_tokens_beer)))

### Extracting the top features

These features will be added to each beer by analyzing its reviews, enriching its descriptions and ultimately enhancing the performance of our recommender system. By incorporating this new beer lexicon vocabulary, we can convey more detailed and nuanced information about each beer. This improved descriptive capability allows our recommender system to make more accurate and personalized suggestions, ensuring that users find beers that match their preferences more closely.

Example: the top 50 features that describe a beer.

In [None]:
# Column-wise sum of the TF-IDF matrix, flattened to one score per lexicon term.
features_score = np.asarray(np.sum(X_beer, axis=0)).ravel()

# Rank the terms from highest to lowest total score and keep the first 50.
descending_order = np.argsort(features_score)[::-1]
top_indices = descending_order[:50]

beer_feature_names = np.array(vectorizer_beer.get_feature_names_out())
top_50_features = beer_feature_names[top_indices]

top_50_features

array(['beer', 'malt', 'taste', 'hop', 'head', 'dark', 'good', 'sweet',
       'chocolate', 'light', 'aroma', 'bottle', 'ale', 'caramel', 'glass',
       'bitter', 'brown', 'coffee', 'black', 'white', 'bitterness',
       'stout', 'creamy', 'malty', 'smooth', 'dry', 'sweetness', 'clear',
       'pint', 'strong', 'fruity', 'rich', 'toffee', 'aftertaste',
       'decent', 'citrus', 'orange', 'earthy', 'vanilla', 'floral',
       'golden', 'pale', 'amber', 'heavy', 'hoppy', 'bad', 'drink', 'ipa',
       'complex', 'red'], dtype=object)