# Wineteller : Reminder

- Build a wine recommender based on occasion rather than intrinsic wine characteristics 

- Deduce occasion from wine descriptors that can fit an atmosphere -> Body, Complexity, Finish 

- Use Computational Wine Wheel (mappings) to extract only words that are specific to Body, Complexity and Finish

- Train a Word2Vec model on the extracted descriptors, then cluster the resulting vectors with K-means

# Data Exploration

In [59]:
#python
import pandas as pd
import string

#compute cell-executing time
from tqdm.notebook import trange, tqdm

#text preprocessing
from operator import itemgetter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from collections import Counter, OrderedDict

#nlp modeling
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

#clusterization 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

## Load Data

In [60]:
# NOTE(review): absolute, user-specific path — as written the notebook only runs
# on a machine that has this exact directory layout.
file_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/winemag-data_first150k.csv'

# Use the CSV column labelled 'Unnamed: 0' as the row index instead of a data column.
data = pd.read_csv(file_location, index_col='Unnamed: 0')
print(data.shape)

(150930, 10)


In [61]:
data.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


## Clean Data

In [62]:
data.isna().sum()

country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [63]:
# Drop rows that are exact duplicates across all columns (keeps the first occurrence).
# Note: this rebinds `data`, so the raw frame is no longer available after this cell.
data = data.drop_duplicates()
data.shape

(97851, 10)

# Data Preprocessing

In [64]:
%%time
#tokenization : split every review into its individual sentences

# Coerce each description to str before it is handed to the sentence tokenizer.
reviews_list = [str(description) for description in data['description']]

# Build the flat list directly: one entry per sentence, in review order.
sentences_tokenized = [
    sentence
    for review_text in tqdm(reviews_list)
    for sentence in sent_tokenize(review_text)
]

  0%|          | 0/97851 [00:00<?, ?it/s]

CPU times: user 17 s, sys: 723 ms, total: 17.8 s
Wall time: 24.8 s


In [65]:
len(sentences_tokenized)

270505

In [66]:
sentences_tokenized[:1]

['This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.']

In [67]:
%%time
#normalization : remove stopwords and punctuation + tokenize sentences into words

# NLTK's English stopword list as a set; normalize_text uses it to discard tokens.
stop_words = set(stopwords.words('english')) 

# Translation table that deletes every character in string.punctuation.
punctuation_table = str.maketrans({key: None for key in string.punctuation})
# English Snowball stemmer shared by every normalize_text call.
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    """Turn raw text into a list of lower-cased, stemmed, punctuation-free tokens.

    Each token produced by ``word_tokenize`` is lower-cased, stemmed with the
    module-level stemmer ``sno`` and stripped of punctuation through
    ``punctuation_table``. Tokens that end up at most one character long, or
    that are in ``stop_words``, are discarded.

    Args:
        raw_text: The text to normalize (a single sentence or a whole review).

    Returns:
        list[str]: The surviving tokens in their original order. Non-string
        input (e.g. ``None`` or a NaN cell) yields an empty list.

    Raises:
        Whatever ``word_tokenize`` raises for a string input (for instance a
        ``LookupError`` when the tokenizer data is not installed). These errors
        are deliberately not swallowed: hiding them would silently turn the
        whole corpus into empty token lists.
    """
    # Explicit guard instead of a blanket ``except``: the only input we want to
    # tolerate quietly is a non-string value, and the result stays a list so
    # callers always receive the same type.
    if not isinstance(raw_text, str):
        return []

    normalized_sentence = []
    for token in word_tokenize(raw_text):
        # Order is kept as in the original pipeline: stem first, strip
        # punctuation afterwards, then filter. Presumably this is what produces
        # the hyphen-free keys (e.g. 'fullbodi') used by the descriptor mapping.
        stemmed_word = sno.stem(token.lower())
        no_punctuation = stemmed_word.translate(punctuation_table)
        if len(no_punctuation) > 1 and no_punctuation not in stop_words:
            normalized_sentence.append(no_punctuation)
    return normalized_sentence

# First ten sentences; not used by the rest of this cell.
sentence_sample = sentences_tokenized[:10]

# One normalized token list per sentence, in corpus order.
normalized_sentences = [
    normalize_text(sentence) for sentence in tqdm(sentences_tokenized)
]

  0%|          | 0/270505 [00:00<?, ?it/s]

CPU times: user 3min 48s, sys: 8.82 s, total: 3min 57s
Wall time: 5min 34s


In [68]:
normalized_sentences[:1]

[['tremend',
  '100',
  'variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three',
  'year',
  'oak']]

In [69]:
%%time 
#retrieve bi-grams and tri-grams from normalized sentences

# Pass 1 learns collocations over the plain tokens; pass 2 is trained on the
# corpus after pass 1 has been applied to it.
first_pass = Phrases(normalized_sentences)
phrases = Phrases(first_pass[normalized_sentences])

# Frozen version of the second-pass model, used for all later look-ups.
# NOTE(review): only the second-pass model is frozen and it is applied to the
# un-phrased token lists — confirm this is intended rather than chaining both passes.
ngrams = Phraser(phrases)

# Apply the phraser sentence by sentence.
phrased_sentences = [ngrams[tokens] for tokens in tqdm(normalized_sentences)]

# Flat list of every (possibly merged) token in the corpus.
full_list_words = [token for tokens in phrased_sentences for token in tokens]

  0%|          | 0/270505 [00:00<?, ?it/s]

CPU times: user 32.2 s, sys: 1.14 s, total: 33.3 s
Wall time: 39.9 s


In [70]:
phrased_sentences[:1]

[['tremend',
  '100_variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three_year',
  'oak']]

In [71]:
full_list_words[:5]

['tremend', '100_variet', 'wine', 'hail', 'oakvill']

In [72]:
#most common 5000 words in corpus

n_top = 5000

word_counts = Counter(full_list_words)
# most_common() yields (token, count) pairs ordered by descending count;
# the OrderedDict keeps that ordering when the frame is built.
sorted_counts = OrderedDict(word_counts.most_common(n_top))
counter_df = pd.DataFrame.from_dict(sorted_counts, orient='index')
top_5000_words = counter_df.head(n_top)

# Written to the current working directory.
counter_df.to_csv('top_5000_descriptors.csv')

In [73]:
top_5000_words.head()

Unnamed: 0,0
wine,60966
flavor,55905
fruit,40097
finish,29737
acid,25920


In [74]:
#load wine descriptor_mapping
# NOTE(review): absolute, user-specific path — as written this only resolves on
# a machine with this exact directory layout.
map_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/descriptor_mapping.csv'

# Index the mapping by its 'raw descriptor' column so a token can be looked up
# directly by label (the values shown below appear to be stemmed tokens).
descriptor_mapping = pd.read_csv(map_location).set_index('raw descriptor')
descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abras,abrasive,high_tannin,tannin
acacia,acacia,flowery,flower
acacia_flower,acacia,flowery,flower
aciddriven,acid_driven,high_acid,acid
aggress,aggressive,high_acid,acid


In [75]:
#occasion 'related' features : body, complexity, finish(B.C.F) -> more body = more intimacy, less body = more energy etc

# Top-level categories (level_1) to keep from the descriptor mapping.
descriptor_list = ['body', 'complexity', 'finish']

# Boolean mask over the mapping rows, then select only the matching rows.
is_bcf_row = descriptor_mapping['level_1'].isin(descriptor_list)
filtered_descriptor_mapping = descriptor_mapping[is_bcf_row]

In [76]:
filtered_descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
airi,airy,light_bodied,body
big_bold,bold,full_bodied,body
bullish,bullish,full_bodied,body
blocki,chunky,full_bodied,body
chunki,chunky,full_bodied,body


In [77]:
#number of raw descriptors per features 
filtered_descriptor_mapping["level_1"].value_counts()

body          57
complexity    16
finish         6
Name: level_1, dtype: int64

In [78]:
#list of all raw descriptors (body, finish, complexity)
list(filtered_descriptor_mapping.index)

['airi',
 'big_bold',
 'bullish',
 'blocki',
 'chunki',
 'solid_chunki',
 'clampi',
 'close',
 'complex',
 'complic',
 'dainti',
 'deep',
 'depth',
 'eas',
 'easi_sip',
 'eleg',
 'expans',
 'extract',
 'feminin',
 'finess',
 'ampl_weight',
 'bold',
 'full_bodi',
 'fullbodi',
 'soupi',
 'weighti',
 'hearti',
 'heavi',
 'heavier',
 'heavyweight',
 'lavish',
 'rather_lean',
 'length',
 'lengthi',
 'long_mouthwat',
 'longlast',
 'lightest',
 'light',
 'light_bodi',
 'lightbodi',
 'lighter',
 'lighter_style',
 'lightfoot',
 'lightweight',
 'thin',
 'linear',
 'straightforward',
 'lush',
 'luxuri',
 'mass',
 'medium_bodi',
 'medium_fullbodi',
 'medium_weight',
 'mediumbodi',
 'mediumweight',
 'medium_length',
 'mediumlength_finish',
 'modest',
 'onedimension',
 'opul',
 'plump',
 'quaffer',
 'refin',
 'rich',
 'richer',
 'robust',
 'uncompl',
 'stout',
 'sturdi',
 'succul',
 'superrich',
 'syrup',
 'syrupi',
 'thick',
 'unoak',
 'viscos',
 'viscous',
 'voluptu',
 'weight']

In [79]:
%%time
#apply mapping on each word of each sentence

def return_mapped_descriptor(word):
    """Map a raw (stemmed/phrased) token to its level_3 descriptor.

    Args:
        word: Token to look up in the index of ``filtered_descriptor_mapping``.

    Returns:
        The ``level_3`` value for ``word`` when the token is one of the mapped
        raw descriptors, otherwise the empty string ``""``.
    """
    level_3 = filtered_descriptor_mapping['level_3']
    # Membership is tested on the pandas Index itself (hash look-up) rather than
    # on a freshly built Python list, which was rebuilt and scanned linearly for
    # every single token of the corpus.
    if word in level_3.index:
        return level_3[word]
    return ""

# NOTE: this rebinds `normalized_sentences` — from here on it holds, per
# sentence, only the mapped descriptors (as strings) instead of the stemmed tokens.
normalized_sentences = [
    [
        str(mapped)
        for mapped in map(return_mapped_descriptor, tokens)
        if mapped != ""  # "" marks a token that has no descriptor mapping
    ]
    for tokens in tqdm(phrased_sentences)
]

  0%|          | 0/270505 [00:00<?, ?it/s]

CPU times: user 50.3 s, sys: 1.95 s, total: 52.3 s
Wall time: 1min 6s


In [80]:
normalized_sentences[:10]

[[], ['elegant'], [], [], [], ['rich'], [], [], ['complex'], ['succulent']]

# Model

## Word embeddings with Word2Vec

In [81]:
%%time
#fit Word2Vec model on the descriptor-only sentences
# Training is stochastic: fix the seed so a Restart & Run All gives comparable
# vectors. Per the gensim documentation a repeatable run also needs a single
# worker thread (and, across interpreter launches, a fixed PYTHONHASHSEED).
model = Word2Vec(
    normalized_sentences,
    vector_size=300,  # dimensionality of every descriptor vector
    min_count=1,      # keep every descriptor, however rare
    epochs=15,
    seed=42,
    workers=1,
)
print(model)

# Persisted to the current working directory.
model.save('model.bin')

Word2Vec<vocab=50, vector_size=300, alpha=0.025>
CPU times: user 8.68 s, sys: 232 ms, total: 8.91 s
Wall time: 10.7 s


In [82]:
#list of vocab learned by model
model.wv.index_to_key

['rich',
 'light_bodied',
 'complex',
 'elegant',
 'full_bodied',
 'depth',
 'medium_bodied',
 'lush',
 'closed',
 'heavy',
 'weight',
 'thick',
 'opulent',
 'extracted',
 'chunky',
 'plump',
 'modest',
 'refined',
 'syrupy',
 'length',
 'low_complexity',
 'succulent',
 'robust',
 'linear',
 'finessed',
 'hearty',
 'lengthy',
 'sturdy',
 'one_dimensional',
 'voluptuous',
 'bold',
 'feminine',
 'luxurious',
 'viscous',
 'complicated',
 'lavish',
 'medium_length_finish',
 'unoaked',
 'mass',
 'simple',
 'expansive',
 'quaffer',
 'super_rich',
 'easy',
 'stout',
 'dainty',
 'light',
 'clampy',
 'airy',
 'bullish']

In [120]:
#example: the 20 descriptors whose vectors are closest to 'light_bodied'
# NOTE(review): in the recorded output every neighbour scores above 0.996, and
# descriptors such as 'hearty' and 'sturdy' rank alongside 'airy'. The vectors may
# be barely separated — the sample of training sentences shown earlier holds zero
# or one descriptor each, which gives Word2Vec very little context. Worth confirming.
model.wv.most_similar(positive='light_bodied', topn=20)

[('feminine', 0.9984492063522339),
 ('linear', 0.9981278777122498),
 ('finessed', 0.9979815483093262),
 ('refined', 0.9979263544082642),
 ('modest', 0.9977827668190002),
 ('low_complexity', 0.9976708292961121),
 ('length', 0.9975299835205078),
 ('quaffer', 0.9973291158676147),
 ('medium_bodied', 0.997273325920105),
 ('airy', 0.9972459673881531),
 ('sturdy', 0.9971774220466614),
 ('easy', 0.9971200823783875),
 ('simple', 0.9970458745956421),
 ('light', 0.9969235062599182),
 ('unoaked', 0.9967983365058899),
 ('dainty', 0.9966878890991211),
 ('closed', 0.9966592192649841),
 ('hearty', 0.9966194033622742),
 ('elegant', 0.9965943694114685),
 ('medium_length_finish', 0.996440589427948)]

## Merge with dataset

In [84]:
%%time
wine_reviews = list(data['description'])

def return_descriptor_from_mapping(word):
    """Map a raw (stemmed/phrased) token to its level_3 descriptor.

    Args:
        word: Token to look up in the index of ``filtered_descriptor_mapping``.

    Returns:
        The ``level_3`` value for ``word`` when the token is one of the mapped
        raw descriptors, otherwise ``None``.
    """
    level_3 = filtered_descriptor_mapping['level_3']
    # Hash look-up on the pandas Index instead of rebuilding a Python list of
    # all index labels (and scanning it) for every token of every review.
    if word in level_3.index:
        return level_3[word]
    return None

# For every review: normalize the whole text, merge phrases, keep only the
# tokens that map to a descriptor, and store them as one space-separated string.
descriptorized_reviews = []
for review_text in tqdm(wine_reviews):
    phrased_tokens = ngrams[normalize_text(review_text)]
    mapped_tokens = map(return_descriptor_from_mapping, phrased_tokens)
    # None marks a token without a descriptor mapping.
    descriptorized_reviews.append(
        ' '.join(str(mapped) for mapped in mapped_tokens if mapped is not None)
    )

  0%|          | 0/97851 [00:00<?, ?it/s]

CPU times: user 4min 38s, sys: 11.3 s, total: 4min 50s
Wall time: 6min 43s


In [85]:
descriptorized_reviews[:5]

['elegant', 'rich', 'complex succulent', '', 'rich']

In [86]:
%%time

#fit a Tfidf vectorizer on the descriptor strings and keep one IDF weight per descriptor

vectorizer = TfidfVectorizer()
# fit() returns the fitted vectorizer itself, so X is the same object as `vectorizer`.
X = vectorizer.fit(descriptorized_reviews)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back for older installations.
if hasattr(X, 'get_feature_names_out'):
    feature_names = X.get_feature_names_out()
else:
    feature_names = X.get_feature_names()

# descriptor -> inverse-document-frequency weight
dict_of_tfidf_weightings = dict(zip(feature_names, X.idf_))

CPU times: user 596 ms, sys: 45.3 ms, total: 642 ms
Wall time: 731 ms




In [87]:
%%time
# Build, for every review, [descriptor terms, averaged weighted vector, number of
# descriptors that contributed]. Reviews without any usable descriptor get an
# empty list as their vector.
wine_review_vectors = []
for d in tqdm(descriptorized_reviews):
    # split() without an argument returns [] for an empty review, whereas
    # split(' ') returned [''] and stored a phantom empty descriptor.
    terms = d.split()
    weighted_review_terms = []
    for term in terms:
        tfidf_weighting = dict_of_tfidf_weightings.get(term)
        if tfidf_weighting is None:
            # Term unknown to the fitted vectorizer: it carries no weight.
            continue
        # Row vector of shape (1, vector_size); -1 avoids repeating the model's
        # dimensionality as a hard-coded constant.
        word_vector = model.wv.get_vector(term).reshape(1, -1)
        weighted_review_terms.append(tfidf_weighting * word_vector)
    descriptor_count = len(weighted_review_terms)
    # Explicit empty check instead of catching the ZeroDivisionError with a bare
    # except, which would also have hidden unrelated failures.
    if weighted_review_terms:
        review_vector = sum(weighted_review_terms) / descriptor_count
    else:
        review_vector = []
    vector_and_count = [terms, review_vector, descriptor_count]
    wine_review_vectors.append(vector_and_count)

  0%|          | 0/97851 [00:00<?, ?it/s]

CPU times: user 2.94 s, sys: 1.38 s, total: 4.32 s
Wall time: 5.52 s


In [88]:
#concatenate the per-review results onto the dataset
# Each entry of wine_review_vectors is [terms, review_vector, descriptor_count].
data['normalized_descriptors'] = [entry[0] for entry in wine_review_vectors]
data['review_vector'] = [entry[1] for entry in wine_review_vectors]
data['descriptor_count'] = [entry[2] for entry in wine_review_vectors]

# Move the original row labels into an 'index' column and renumber rows from 0.
# Guarded so that re-running this cell does not add a second index column
# ('level_0') or fail on a later run, as the in-place reset did.
if 'index' not in data.columns:
    data = data.reset_index()

In [89]:
data.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,variety,winery,normalized_descriptors,review_vector,descriptor_count
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,[elegant],"[[0.00021687507, 0.14922334, -0.040845368, 0.0...",1
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,[rich],"[[0.015195419, 0.12782869, -0.004479676, 0.037...",1
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,"[complex, succulent]","[[0.047932237, 0.21209106, -0.05247305, -0.010...",2
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,[],[],0
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,[rich],"[[0.015195419, 0.12782869, -0.004479676, 0.037...",1


In [90]:
data.loc[1,"description"]

'Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023.'

In [91]:
#rows with at least one BCF descriptor (descriptor_count > 0)
data[data["descriptor_count"]>0].shape

(55199, 14)

In [92]:
#save in csv

# Clustering

In [93]:
# All review vectors, including the empty lists stored for reviews without descriptors.
input_vectors = list(data['review_vector'])

In [94]:
# Inspect the Python type of every review vector. Despite its name, this list
# holds types, not vectors; the name is rebound to the real vectors further down.
input_vectors_listed = [type(a) for a in input_vectors]

In [95]:
input_vectors_listed[:10]

[numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 list,
 numpy.ndarray,
 numpy.ndarray,
 list,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray]

In [96]:
# Keep only the reviews that produced at least one body/complexity/finish descriptor.
bcf_data = data.loc[data["descriptor_count"]>0]
bcf_data.shape

(55199, 14)

In [106]:
# Rebind input_vectors to the vectors of the filtered rows only (descriptor_count > 0).
input_vectors = list(bcf_data["review_vector"])

In [118]:
input_vectors[:1]

[array([[ 2.16875065e-04,  1.49223343e-01, -4.08453681e-02,
          4.64704745e-02, -1.54722989e-01, -3.90235960e-01,
          1.69378638e-01,  7.93100834e-01, -9.84736905e-03,
         -3.55258793e-01,  3.93205196e-01, -1.57392323e-01,
         -1.66908249e-01,  3.69446486e-01, -2.40434289e-01,
         -1.91845089e-01,  3.11993152e-01, -1.74097806e-01,
          3.26657854e-02, -3.31091195e-01, -5.97387739e-03,
         -2.01914564e-01,  1.79754011e-02,  1.62239403e-01,
          7.25750998e-02, -5.51029928e-02, -3.86371493e-01,
          3.04610372e-01, -1.81376174e-01, -2.37972572e-01,
          7.17296898e-02, -1.67084798e-01,  2.78835565e-01,
         -1.38280541e-01, -6.96543753e-02,  9.44915116e-02,
          5.59572019e-02, -3.51908833e-01,  1.05551384e-01,
         -2.01598898e-01, -1.05937541e-01,  1.65763013e-02,
         -4.04731445e-02, -2.52373338e-01,  1.31698847e-01,
          1.38928220e-01, -1.81922644e-01,  9.09381956e-02,
         -1.91948526e-02,  3.03617477e-0

In [108]:
# Convert every review vector to plain Python lists via .tolist(). This rebinds
# input_vectors_listed, which earlier held the element types.
# NOTE(review): the vectors were built as (1, vector_size) rows, so each entry is
# presumably a nested [[...]] list — confirm that is the shape the clustering step expects.
input_vectors_listed = [a.tolist() for a in input_vectors]

In [117]:
input_vectors_listed[:1]

list