# Recommender System: Naive Bayes Classifier

### Types
- Collaborative filtering
- Content-based
- Hybrid
- Context-aware

#### Ideas
- NLP wine recommendations
- Content-based recommendations from input text
- Cluster wines based on ratings (give 3 recommendations per cluster)?
- Cluster wines based on price (give 3 recommendations per cluster)?
- Option to choose country
- Classify variety by description?

### Imports, datasets and EDA

In [2]:
import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import classify
from nltk.corpus import stopwords, wordnet
from nltk import NaiveBayesClassifier

In [3]:
# Columns removed right after loading; nothing later in this notebook reads them.
drop_cols = ['title', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle']
# Use the first CSV column as the DataFrame index.
wine_df = pd.read_csv('data/winemag-data-130k-v2.csv', index_col=0)
wine_df = wine_df.drop(columns=drop_cols)
wine_df

Unnamed: 0,country,description,designation,points,price,province,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Pinot Gris,Domaine Marcel Deiss


In [4]:
# Keep only the reviews that have both a price and a variety.
wine_df = wine_df.dropna(subset=['price', 'variety'])

wine_df.head(3)

Unnamed: 0,country,description,designation,points,price,province,variety,winery
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Riesling,St. Julian


In [5]:
# Plots
def plot_frequencies():
    """Placeholder for the variety-frequency bar plot; currently a no-op.

    All plotting statements below are commented out, so calling this
    function does nothing and returns None.
    """
    # counts = wine_df['variety'].value_counts()
    # top_df = counts.reset_index()
    # top_df.columns = ['variety', 'frequency']
    # top_df = top_df[top_df['frequency'] > 500]
    # plt.figure(figsize=(8, 5))
    # sns.barplot(data=top_df, x=top_df.index, y='frequency', width=0.9)
    # plt.xlabel('Variety Index')
    # plt.ylabel('Frequency')
    # # plt.xticks([])
    # # plt.show()
    # plt.savefig('../report/images/freq-varieties-after')
    # # plt.savefig('../report/images/freq-varieties-before')
    pass
# plot_frequencies()

In [6]:
# A variety must appear in more than this many reviews to be kept as a class.
frequency_threshold = 500

# Review count per variety, as a two-column frame: variety, frequency.
counts = wine_df['variety'].value_counts()
top_df = counts.rename_axis('variety').reset_index(name='frequency')
is_frequent = top_df['frequency'] > frequency_threshold
classes = top_df.loc[is_frequent, 'variety'].tolist()

# Restrict the dataset to reviews of the retained varieties.
wine_df = wine_df[wine_df['variety'].isin(classes)]
# print(f'length of database: {len(wine_df)}\n number of classes: {len(classes)}')
# print(classes)
# wine_df.to_csv('final-wine-dataset.csv')
classes

['Pinot Noir',
 'Chardonnay',
 'Cabernet Sauvignon',
 'Red Blend',
 'Bordeaux-style Red Blend',
 'Riesling',
 'Sauvignon Blanc',
 'Syrah',
 'Rosé',
 'Merlot',
 'Zinfandel',
 'Malbec',
 'Sangiovese',
 'Nebbiolo',
 'Portuguese Red',
 'White Blend',
 'Sparkling Blend',
 'Tempranillo',
 'Rhône-style Red Blend',
 'Pinot Gris',
 'Cabernet Franc',
 'Champagne Blend',
 'Grüner Veltliner',
 'Pinot Grigio',
 'Portuguese White',
 'Viognier',
 'Gewürztraminer',
 'Gamay',
 'Shiraz',
 'Petite Sirah',
 'Bordeaux-style White Blend',
 'Grenache',
 'Barbera',
 'Glera',
 'Sangiovese Grosso',
 'Tempranillo Blend',
 'Carmenère',
 'Chenin Blanc']

In [7]:
# wine_df['country'].unique()
# wine_df[wine_df['country'] == 'South Africa']
# wine_df.isna().sum() * 100 / len(wine_df)

# len(wine_df['taster_twitter_handle'].unique())
# wine_df['points'].corr(wine_df['price'])
# stellies_df = wine_df[wine_df['province'] == 'Stellenbosch']
# stellies_df['points'].corr(stellies_df['price'])
# len(wine_df['variety'].unique())

# varieties = wine_df['variety'].unique()
# wine_df['variety'].value_counts()

# wine_df.describe()

### Pre-processing
- Normalizing
- Tokenization

In [8]:
# Tokenize: split every description on whitespace.
# (nltk's word_tokenize is a possible alternative tokenizer, not used here.)
reviews = [text.split() for text in wine_df['description']]

# Clean
def clean_tokens(tokens):
    """Normalize raw whitespace-split tokens.

    Tokens starting with '@' are skipped. Every remaining token is
    lower-cased and has all ',' and '.' characters removed. Tokens that are
    empty after that (for example a stand-alone '.') are dropped, so they do
    not end up as an empty-string feature.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list of cleaned, non-empty tokens in their original order.
    """
    cleaned = []
    for token in tokens:
        if token.startswith('@'):
            continue
        normalized = token.lower().replace(',', '').replace('.', '')
        # A token made only of punctuation carries no information.
        if normalized:
            cleaned.append(normalized)
    return cleaned

# Clean every tokenized review.
reviews = list(map(clean_tokens, reviews))

Normalization

In [9]:
# Single WordNetLemmatizer instance, used by lemmatize() below.
lemmatizer = WordNetLemmatizer()

def get_wordnet_tag(treebank_tag):
    """Map a POS tag (as produced by pos_tag) to a WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to wordnet.ADJ,
    wordnet.VERB, wordnet.NOUN and wordnet.ADV respectively. Any other tag
    (including an empty one) yields ''.
    """
    wordnet_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Only the first character of the tag decides the category.
    return wordnet_by_prefix.get(treebank_tag[:1], '')

def lemmatize(word, tag):
    """Return the lemma of `word` for POS `tag`.

    If the tag has no WordNet equivalent, `word` is returned unchanged.
    """
    wordnet_pos = get_wordnet_tag(tag)
    if wordnet_pos == '':
        return word
    return lemmatizer.lemmatize(word, wordnet_pos)

def lemmatize_tokens(tokens):
    """POS-tag `tokens` and return the list of their lemmas, in order."""
    lemmas = []
    for word, pos in pos_tag(tokens):
        lemmas.append(lemmatize(word, pos))
    return lemmas

# Lemmatize every cleaned review.
lemmatized_reviews = list(map(lemmatize_tokens, reviews))

In [10]:
# lemmatized_reviews[0]

Prepare data for model

In [11]:
def prepare_tokens(tokens, index):
    """Build one (features, label) example for the classifier.

    Args:
        tokens: iterable of tokens belonging to one review.
        index: positional row number of that review in `wine_df`.

    Returns:
        A tuple of (dict mapping each distinct token to True, the value of
        the 'variety' column at row `index`).
    """
    # Read the single cell directly; `wine_df.iloc[index]` would first build
    # a Series for the entire row on every call.
    label = wine_df['variety'].iat[index]
    return ({token: True for token in tokens}, label)

# Pair every lemmatized review with the variety found at the same row position.
prepped_reviews = [
    prepare_tokens(tokens, position)
    for position, tokens in enumerate(lemmatized_reviews)
]

### Train-test split

In [12]:
# 70/30 train/test split, shuffled with a fixed seed so the split is reproducible.
train_data, test_data = train_test_split(
    prepped_reviews,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

### Training and evaluation

In [13]:
# Fit the NLTK Naive Bayes classifier on the (features, variety) training pairs.
classifier = NaiveBayesClassifier.train(train_data)

In [17]:
# Save the trained classifier to 'nb-classifier.joblib' (compression level 3).
joblib.dump(classifier, 'nb-classifier.joblib', compress=3)

['nb-classifier.joblib']

Evaluation: top-1 and top-3 accuracy

In [14]:
# Count the test reviews whose true variety is the most probable class
# (first_try) or is among the three most probable classes (top3_matches).
top3_matches = 0
first_try = 0

for features, true_variety in test_data:
    # Classify once per review and query the resulting distribution for
    # each class, instead of re-running the classifier for every class.
    distribution = classifier.prob_classify(features)
    # sorted() is stable with reverse=True, so ties keep the order of `classes`.
    ranked = sorted(classes, key=distribution.prob, reverse=True)
    top3 = ranked[:3]
    if top3[0] == true_variety:
        first_try += 1
    if true_variety in top3:
        top3_matches += 1

In [15]:
# Report both accuracies as percentages of the test set size.
n_test = len(test_data)
top1_pct = round(100 * first_try / n_test, 2)
top3_pct = round(100 * top3_matches / n_test, 2)
print(f'top1 accuracy: {top1_pct}%')
print(f'top3 accuracy: {top3_pct}%')

top1 accuracy: 46.9%
top3 accuracy: 72.54%


Most informative features

In [32]:
# Print the classifier's 10 most informative features.
classifier.show_most_informative_features(10)

Most Informative Features
                grenache = True           Rhône- : Chardo =   2845.2 : 1.0
                   peach = True            Glera : Cabern =   2413.8 : 1.0
                brunello = True           Sangio : Red Bl =   2237.6 : 1.0
                viognier = True           Viogni : Pinot  =   2234.0 : 1.0
                    pear = True           Pinot  : Red Bl =   2150.4 : 1.0
                    gris = True           Pinot  : Chardo =   1616.1 : 1.0
                  petite = True           Petite : Pinot  =   1439.1 : 1.0
                  chenin = True           Chenin : Chardo =   1373.1 : 1.0
                  shiraz = True           Shiraz : Cabern =   1312.6 : 1.0
              chardonnay = True           Chardo : Red Bl =   1305.4 : 1.0


Testing the model

In [24]:
def convert_input(text):
    """Turn free text into the feature dict format used to train the classifier.

    The text is split on whitespace, cleaned with clean_tokens and
    lemmatized with lemmatize_tokens; each resulting token maps to True.
    """
    raw_tokens = text.split()
    normalized = lemmatize_tokens(clean_tokens(raw_tokens))
    return dict.fromkeys(normalized, True)

def return_recommendations(description, top_n=3):
    """Recommend wines for a free-text description.

    Args:
        description: text describing the desired wine.
        top_n: number of varieties to recommend (default 3).

    Returns:
        A tuple (top, selections). `top` is a list of
        [variety, probability in percent rounded to 2 decimals], most
        probable first. `selections` holds, for each of those varieties in
        the same order, the `wine_df` row of that variety with the highest
        points (ties broken by the higher price).
    """
    # Featurize and classify the text once; the distribution is then queried
    # per class instead of repeating tokenization, tagging and classification
    # for every class.
    distribution = classifier.prob_classify(convert_input(description))

    # Rank on the unrounded probabilities so that classes whose rounded
    # percentages coincide are still ordered by the model, not by their
    # position in `classes`. Rounding is applied for display only.
    ranked = sorted(classes, key=distribution.prob, reverse=True)[:top_n]
    top = [[variety, round(100 * distribution.prob(variety), 2)] for variety in ranked]

    selections = []
    for variety in ranked:
        candidates = wine_df[wine_df['variety'] == variety]
        best_first = candidates.sort_values(by=['points', 'price'], ascending=False)
        selections.append(best_first.iloc[0])

    return top, selections

# Example inputs: exactly one `description` assignment should be active at a time.
# description = 'strong grassy aroma, farmyard character with a lingering citrus after taste'
# description = 'sweet aromatic flowery perfumed wine with high acidity'
# description = 'very citrusy, floral with notes of mango and orange on the nose'
description = 'powerful overwhelming strong unappealing'
# description = 'goes well with gamey meat'

# Print the top varieties with their probabilities (in percent), followed by a blank line.
top3, selections = return_recommendations(description)
print(top3, '\n')
# print(selections[2]['description'])
# print(selections[1]['description'])
# print(selections[2]['description'])
# selections

[['Petite Sirah', 15.19], ['Malbec', 12.45], ['Chardonnay', 11.99]] 

