# Wineteller : Reminder

- Build a wine recommender based on occasion rather than intrinsic wine characteristics 

- Deduce occasion from wine descriptors that can fit an atmosphere -> Body, Complexity, Finish 

- Use Computational Wine Wheel (mappings) to extract only words that are specific to Body, Complexity and Finish

- Train a Word2Vec model on the extracted descriptors, then cluster the resulting review vectors with K-means

# Data Exploration

In [41]:
#python
import pandas as pd
import string

#compute cell-executing time
from tqdm.notebook import trange, tqdm

#text preprocessing
from operator import itemgetter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from collections import Counter, OrderedDict

#nlp modeling
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

#clusterization 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from nltk.cluster import KMeansClusterer
import nltk

## Load Data

In [2]:
# Location of the winemag reviews CSV.
# NOTE(review): this is an absolute, machine-specific path — the notebook will not
# run on another machine until it points at a local copy of the file.
file_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/winemag-data_first150k.csv'

# The CSV column named 'Unnamed: 0' is used as the DataFrame index.
data = pd.read_csv(file_location, index_col='Unnamed: 0')
print(data.shape)

(150930, 10)


In [3]:
data.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [4]:
# Work on the first 10,000 reviews only, to keep the later cells fast.
data = data.iloc[:10000]

## Clean Data

In [5]:
data.isna().sum()

country           2
description       0
designation    2929
points            0
price           539
province          2
region_1       1560
region_2       6126
variety           0
winery            0
dtype: int64

In [6]:
# Remove rows that are exact duplicates across all columns.
# The remaining rows keep their original index labels (no renumbering here).
data = data.drop_duplicates()
data.shape

(9392, 10)

# Data Preprocessing

In [7]:
%%time
#tokenization : reviews -> sentences

# Force every description to str before handing it to the sentence tokenizer.
reviews_list = [str(text) for text in data['description']]

# Flat list of sentences across all reviews (review boundaries are not kept).
sentences_tokenized = []
for review in tqdm(reviews_list):
    sentences_tokenized.extend(sent_tokenize(review))

  0%|          | 0/9392 [00:00<?, ?it/s]

CPU times: user 2.17 s, sys: 140 ms, total: 2.31 s
Wall time: 3.84 s


In [8]:
len(sentences_tokenized)

25680

In [9]:
sentences_tokenized[:1]

['This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.']

In [10]:
%%time
#normalization : remove stopwords and punctuation + tokenize sentences into words

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)
# Shared English Snowball stemmer used by normalize_text.
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    """Tokenize one piece of text and return its cleaned, stemmed tokens.

    Each token is lower-cased, stemmed with the notebook-level Snowball stemmer
    ``sno`` and then stripped of punctuation with ``punctuation_table``. Tokens
    that end up shorter than two characters, or that are in ``stop_words``, are
    dropped.

    Parameters
    ----------
    raw_text : str
        A sentence or a whole review.

    Returns
    -------
    list of str
        Normalized tokens in their original order. Non-string input (for
        example a missing description) yields an empty list.

    Notes
    -----
    Errors raised by the tokenizer itself (such as a missing NLTK resource) are
    no longer swallowed: previously a bare ``except`` turned them into a
    silent empty result for every sentence.
    """
    # Always return a list (the old failure path returned '') so callers get one type.
    if not isinstance(raw_text, str):
        return []

    normalized_sentence = []
    for w in word_tokenize(raw_text):
        # Keep this order: stem first, strip punctuation afterwards, then filter.
        # NOTE(review): the descriptor mapping keys (e.g. 'fullbodi') presumably
        # rely on this exact order — confirm before changing it.
        no_punctuation = sno.stem(w.lower()).translate(punctuation_table)
        if len(no_punctuation) > 1 and no_punctuation not in stop_words:
            normalized_sentence.append(no_punctuation)
    return normalized_sentence

# Small sample kept around for quick manual checks.
sentence_sample = sentences_tokenized[:10]

# One list of normalized tokens per sentence, in the same order as sentences_tokenized.
normalized_sentences = [normalize_text(sentence) for sentence in tqdm(sentences_tokenized)]

  0%|          | 0/25680 [00:00<?, ?it/s]

CPU times: user 27.7 s, sys: 1.15 s, total: 28.8 s
Wall time: 53.9 s


In [11]:
normalized_sentences[:1]

[['tremend',
  '100',
  'variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three',
  'year',
  'oak']]

In [12]:
%%time 
#retrieve bi-grams and tri-grams from normalized sentences

# First pass learns frequent word pairs; the second pass is trained on the
# pair-merged corpus so that longer combinations can be scored too.
bigram_phrases = Phrases(normalized_sentences)
phrases = Phrases(bigram_phrases[normalized_sentences])

# NOTE(review): `ngrams` wraps only the second-pass model and is applied directly
# to the un-merged sentences below — confirm this is the intended pipeline.
ngrams = Phraser(phrases)

# Sentences with detected phrases joined by '_' (e.g. 'three_year').
phrased_sentences = [ngrams[sent] for sent in tqdm(normalized_sentences)]

# Every token of the corpus in one flat list, used for frequency counting.
full_list_words = [token for sentence in phrased_sentences for token in sentence]

  0%|          | 0/25680 [00:00<?, ?it/s]

CPU times: user 4.7 s, sys: 292 ms, total: 5 s
Wall time: 9.88 s


In [13]:
phrased_sentences[:1]

[['tremend',
  '100_variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three_year',
  'oak']]

In [14]:
full_list_words[:5]

['tremend', '100_variet', 'wine', 'hail', 'oakvill']

In [15]:
#most common 5000 words in corpus

# Frequency of every token, then the 5000 most frequent in descending order.
word_counts = Counter(full_list_words)
sorted_counts = OrderedDict(word_counts.most_common(5000))

# One row per token, single column (named 0) holding its count.
counter_df = pd.DataFrame.from_dict(sorted_counts, orient='index')
counter_df.to_csv('top_5000_descriptors.csv')
top_5000_words = counter_df.iloc[:5000]

In [16]:
top_5000_words.head()

Unnamed: 0,0
wine,5753
flavor,4931
aroma,3408
fruit,3279
finish,3046


In [17]:
#load wine descriptor_mapping
# NOTE(review): absolute, machine-specific path — must be adapted to run elsewhere.
map_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/descriptor_mapping.csv'

# Index the table by 'raw descriptor' so a token can be looked up by label.
# The keys look like stemmed tokens (e.g. 'abras', 'aggress') — presumably the
# same normalization as normalize_text; verify against how the file was built.
descriptor_mapping = pd.read_csv(map_location).set_index('raw descriptor')
descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abras,abrasive,high_tannin,tannin
acacia,acacia,flowery,flower
acacia_flower,acacia,flowery,flower
aciddriven,acid_driven,high_acid,acid
aggress,aggressive,high_acid,acid


In [18]:
#occasion 'related' features : body, complexity, finish(B.C.F) -> more body = more intimacy, less body = more energy etc

# Top-level categories to keep from the descriptor mapping.
descriptor_list = ['body', 'complexity', 'finish']

# Boolean mask over the mapping rows whose level_1 category is one of the above.
is_bcf_row = descriptor_mapping['level_1'].isin(descriptor_list)
filtered_descriptor_mapping = descriptor_mapping.loc[is_bcf_row]

In [19]:
filtered_descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
airi,airy,light_bodied,body
big_bold,bold,full_bodied,body
bullish,bullish,full_bodied,body
blocki,chunky,full_bodied,body
chunki,chunky,full_bodied,body


In [20]:
#number of raw descriptors per features 
filtered_descriptor_mapping["level_1"].value_counts()

body          57
complexity    16
finish         6
Name: level_1, dtype: int64

In [21]:
#list of all raw descriptors (body, finish, complexity)
list(filtered_descriptor_mapping.index)

['airi',
 'big_bold',
 'bullish',
 'blocki',
 'chunki',
 'solid_chunki',
 'clampi',
 'close',
 'complex',
 'complic',
 'dainti',
 'deep',
 'depth',
 'eas',
 'easi_sip',
 'eleg',
 'expans',
 'extract',
 'feminin',
 'finess',
 'ampl_weight',
 'bold',
 'full_bodi',
 'fullbodi',
 'soupi',
 'weighti',
 'hearti',
 'heavi',
 'heavier',
 'heavyweight',
 'lavish',
 'rather_lean',
 'length',
 'lengthi',
 'long_mouthwat',
 'longlast',
 'lightest',
 'light',
 'light_bodi',
 'lightbodi',
 'lighter',
 'lighter_style',
 'lightfoot',
 'lightweight',
 'thin',
 'linear',
 'straightforward',
 'lush',
 'luxuri',
 'mass',
 'medium_bodi',
 'medium_fullbodi',
 'medium_weight',
 'mediumbodi',
 'mediumweight',
 'medium_length',
 'mediumlength_finish',
 'modest',
 'onedimension',
 'opul',
 'plump',
 'quaffer',
 'refin',
 'rich',
 'richer',
 'robust',
 'uncompl',
 'stout',
 'sturdi',
 'succul',
 'superrich',
 'syrup',
 'syrupi',
 'thick',
 'unoak',
 'viscos',
 'viscous',
 'voluptu',
 'weight']

In [22]:
%%time
#apply mapping on each word of each sentence

def return_mapped_descriptor(word):
    """Map a raw token to its level_3 descriptor, or "" when it has no mapping.

    Parameters
    ----------
    word : str
        A normalized (and possibly phrased) token.

    Returns
    -------
    str
        The ``level_3`` value of ``filtered_descriptor_mapping`` for ``word``,
        or the empty string if ``word`` is not one of its index labels.
    """
    # Test membership on the index itself: the previous list(index) copy was
    # rebuilt and scanned linearly for every single token.
    if word in filtered_descriptor_mapping.index:
        return filtered_descriptor_mapping['level_3'][word]
    return ""

# NOTE: this rebinds `normalized_sentences` (previously the stemmed sentences) to
# the descriptor-only sentences, so re-running the earlier phrase cell after this
# one would see the mapped sentences instead.
normalized_sentences = []
for sent in tqdm(phrased_sentences):
    mapped_words = (return_mapped_descriptor(token) for token in sent)
    # Keep only tokens that have a mapping ("" means no mapping).
    normalized_sentences.append([str(mapped) for mapped in mapped_words if mapped != ""])

  0%|          | 0/25680 [00:00<?, ?it/s]

CPU times: user 6.8 s, sys: 290 ms, total: 7.09 s
Wall time: 17.8 s


In [23]:
normalized_sentences[:10]

[[], ['elegant'], [], [], [], ['rich'], [], [], ['complex'], ['succulent']]

# Model

## Word embeddings with Word2Vec

In [24]:
%%time
#fit Word2Vec model into corpus
# A fixed seed plus a single worker thread makes training repeatable: with several
# workers the thread scheduling changes the resulting vectors from run to run.
# (Repeatability across interpreter launches additionally needs PYTHONHASHSEED set.)
# min_count=1 keeps every descriptor, since the vocabulary is only a few dozen words.
model = Word2Vec(normalized_sentences, vector_size=300, min_count=1, epochs=15, seed=1, workers=1)
print(model)

# Persist the trained model next to the notebook.
model.save('model.bin')

Word2Vec<vocab=50, vector_size=300, alpha=0.025>
CPU times: user 895 ms, sys: 28.6 ms, total: 923 ms
Wall time: 1 s


In [25]:
#list of vocab learned by model
model.wv.index_to_key

['rich',
 'light_bodied',
 'full_bodied',
 'complex',
 'medium_bodied',
 'elegant',
 'depth',
 'weight',
 'closed',
 'heavy',
 'lush',
 'chunky',
 'low_complexity',
 'thick',
 'plump',
 'robust',
 'length',
 'hearty',
 'extracted',
 'succulent',
 'opulent',
 'modest',
 'syrupy',
 'linear',
 'lengthy',
 'refined',
 'finessed',
 'viscous',
 'luxurious',
 'lavish',
 'expansive',
 'light',
 'bold',
 'voluptuous',
 'sturdy',
 'one_dimensional',
 'simple',
 'easy',
 'lean',
 'clampy',
 'airy',
 'dainty',
 'quaffer',
 'unoaked',
 'stout',
 'complicated',
 'bullish',
 'super_rich',
 'mass',
 'feminine']

In [26]:
#example 
model.wv.most_similar(positive='light_bodied', topn=10)

[('full_bodied', 0.6375356912612915),
 ('rich', 0.633638322353363),
 ('elegant', 0.5696607232093811),
 ('syrupy', 0.5197229385375977),
 ('medium_bodied', 0.4964926540851593),
 ('complex', 0.481254518032074),
 ('thick', 0.4690910577774048),
 ('heavy', 0.45315712690353394),
 ('weight', 0.45136749744415283),
 ('length', 0.4176982641220093)]

## Merge with dataset

In [27]:
%%time
wine_reviews = list(data['description'])

def return_descriptor_from_mapping(word):
    """Map a raw token to its level_3 descriptor, or None when it has no mapping.

    Parameters
    ----------
    word : str
        A normalized (and possibly phrased) token.

    Returns
    -------
    str or None
        The ``level_3`` value of ``filtered_descriptor_mapping`` for ``word``,
        or ``None`` if ``word`` is not one of its index labels.
    """
    # Check the index directly rather than copying it to a list on every call.
    if word in filtered_descriptor_mapping.index:
        return filtered_descriptor_mapping['level_3'][word]
    return None

# One space-separated string of mapped descriptors per review ('' when none matched).
descriptorized_reviews = []
for review in tqdm(wine_reviews):
    # Same pipeline as for the sentences, but applied to the whole review at once.
    phrased_review = ngrams[normalize_text(review)]
    mapped = (return_descriptor_from_mapping(token) for token in phrased_review)
    descriptorized_reviews.append(' '.join(str(d) for d in mapped if d is not None))

  0%|          | 0/9392 [00:00<?, ?it/s]

CPU times: user 28.4 s, sys: 890 ms, total: 29.3 s
Wall time: 34.5 s


In [28]:
descriptorized_reviews[:5]

['elegant', 'rich', 'complex succulent', '', 'rich']

In [29]:
%%time

#fit Tfidf on the descriptor strings and keep one IDF weight per descriptor

vectorizer = TfidfVectorizer()
# fit() returns the fitted vectorizer itself, so X is the same object as vectorizer.
X = vectorizer.fit(descriptorized_reviews)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name on older installations.
if hasattr(X, "get_feature_names_out"):
    feature_names = X.get_feature_names_out()
else:
    feature_names = X.get_feature_names()

# descriptor -> inverse document frequency
dict_of_tfidf_weightings = dict(zip(feature_names, X.idf_))

CPU times: user 93.9 ms, sys: 7.1 ms, total: 101 ms
Wall time: 197 ms




In [30]:
%%time
# For every review build [terms, review_vector, descriptor_count], where
# review_vector is the mean of the IDF-weighted word vectors of its descriptors
# (shape (1, vector_size)), or an empty list when the review has no descriptor.
wine_review_vectors = []
for d in tqdm(descriptorized_reviews):
    # split() without an argument returns [] for an empty string; split(' ')
    # returned [''], i.e. a phantom empty descriptor for reviews with none.
    terms = d.split()

    weighted_review_terms = [
        dict_of_tfidf_weightings[term]
        * model.wv.get_vector(term).reshape(1, model.wv.vector_size)
        for term in terms
        # Skip terms unknown to either model instead of raising KeyError: reviews
        # are phrased as a whole while Word2Vec was trained per sentence, so the
        # two vocabularies are not guaranteed to match exactly.
        if term in dict_of_tfidf_weightings and term in model.wv.key_to_index
    ]
    descriptor_count = len(weighted_review_terms)

    # Explicit empty check replaces the bare except around the division.
    if weighted_review_terms:
        review_vector = sum(weighted_review_terms) / descriptor_count
    else:
        review_vector = []

    wine_review_vectors.append([terms, review_vector, descriptor_count])

  0%|          | 0/9392 [00:00<?, ?it/s]

CPU times: user 392 ms, sys: 39.3 ms, total: 432 ms
Wall time: 758 ms


In [31]:
#attach the per-review results to the dataset as new columns
# Each entry of wine_review_vectors is [terms, review_vector, descriptor_count].
new_columns = ('normalized_descriptors', 'review_vector', 'descriptor_count')
for position, column in enumerate(new_columns):
    data[column] = [entry[position] for entry in wine_review_vectors]

# Move the original row labels into an 'index' column and renumber rows from 0.
data.reset_index(inplace=True)

In [32]:
data.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,variety,winery,normalized_descriptors,review_vector,descriptor_count
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,[elegant],"[[-0.009475023, 0.014538017, 7.603483e-05, 0.0...",1
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,[rich],"[[-0.00035677507, 0.011194306, 0.0016463208, 0...",1
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,"[complex, succulent]","[[-0.009610914, 0.007981829, -0.00077262614, 0...",2
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,[],[],0
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,[rich],"[[-0.00035677507, 0.011194306, 0.0016463208, 0...",1


In [74]:
data.loc[1,"description"]

'Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023.'

In [34]:
#rows with at least one BCF (body, complexity, finish) descriptor
data[data["descriptor_count"]>0].shape

(5323, 14)

In [35]:
#save in csv

# Clustering

In [60]:
# Keep only reviews that matched at least one descriptor; the others carry an
# empty list as review_vector.
bcf_data = data.loc[data["descriptor_count"]>0]
bcf_data.shape

(5323, 14)

In [61]:
input_vectors = list(bcf_data["review_vector"])
len(input_vectors)

5323

In [68]:
# Convert each review vector from a numpy array to (nested) plain Python lists.
input_vectors_listed = [a.tolist() for a in input_vectors]
len(input_vectors_listed)

list

In [71]:
# Flatten each (1, n) review vector into a flat list of floats.
# Built directly from `input_vectors` rather than from the previous value of
# `input_vectors_listed`, so re-running this cell gives the same result instead
# of stripping one more nesting level each time.
input_vectors_listed = [a[0].tolist() for a in input_vectors]
len(input_vectors_listed)

In [73]:
input_vectors_listed[:1]

[[-0.009475022554397583,
  0.014538017101585865,
  7.603482663398609e-05,
  0.014904347248375416,
  -0.012064171023666859,
  -0.015744628384709358,
  -0.005646529607474804,
  0.021332520991563797,
  -0.015585128217935562,
  0.005606562830507755,
  0.004733514040708542,
  -0.02029336988925934,
  -0.011129258200526237,
  0.025872567668557167,
  -0.012565359473228455,
  -0.01945394277572632,
  0.00465345336124301,
  0.004110360983759165,
  0.007612778805196285,
  -0.0061904932372272015,
  -0.0017957333475351334,
  0.005348891485482454,
  0.0006386013119481504,
  0.0032991275656968355,
  -0.009331905283033848,
  0.000559321662876755,
  -0.01670609600841999,
  0.019189035519957542,
  -0.006279172375798225,
  -0.02345680259168148,
  -0.006812826730310917,
  -0.011949241161346436,
  0.012510721571743488,
  0.010075286962091923,
  -0.014139238744974136,
  0.016926264390349388,
  0.014275437220931053,
  -0.018618786707520485,
  -0.014947215095162392,
  0.006459944415837526,
  0.0044036735780537

In [None]:
# Cluster the review vectors with K-means using cosine distance.
NUM_CLUSTERS = 3

# The clusterer works on 1-D numpy vectors; review vectors are stored as (1, n)
# arrays, so flatten each one. (The original cell left `X = ` unfinished, which
# is a syntax error.)
X = [vector.ravel() for vector in input_vectors]

# repeats=25 runs the algorithm 25 times with random initial means.
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)

# One cluster id (0 .. NUM_CLUSTERS-1) per review, in the same order as X.
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

# Show only the first few assignments rather than one id per review.
print(assigned_clusters[:15])