In [4]:
#python
import pandas as pd
import string

#compute cell-executing time
from tqdm.notebook import trange, tqdm

#text preprocessing
from operator import itemgetter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from collections import Counter, OrderedDict

#nlp modeling
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

#clusterization 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

In [54]:
# Location of the raw wine-review CSV.
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# exact directory exists; consider a configurable data directory.
file_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/winemag-data_first150k.csv'

# Load the reviews, using the CSV's 'Unnamed: 0' column as the row index.
data = pd.read_csv(file_location, index_col='Unnamed: 0')
# Print (rows, columns) as a quick sanity check on the load.
print(data.shape)

(150930, 10)


## Data Exploration

In [135]:
# Preview the first five rows of the loaded reviews.
data.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [122]:
# Pull out the grape variety and the free-text review as two separate Series.
labels, descriptions = data['variety'], data['description']

In [123]:
# Show the first three (variety, review) pairs, one per line.
for row_number in range(3):
    print('{}   :   {}'.format(labels.tolist()[row_number], descriptions.tolist()[row_number]))

Cabernet Sauvignon   :   This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.
Tinta de Toro   :   Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023.
Sauvignon Blanc   :   Mac Watson honors the memory of a wine once made by his mother in this tremendously delicious, balanced and complex botrytised white. Dark gold in color, it layers toasted hazelnut, pear compote and orange peel flavors, reveling in the succulence of its 122 

In [124]:
# Number of reviews per grape variety, most frequent first; show the top 50.
varietal_counts = labels.value_counts()
print(varietal_counts.head(50))

Chardonnay                       14482
Pinot Noir                       14291
Cabernet Sauvignon               12800
Red Blend                        10062
Bordeaux-style Red Blend          7347
Sauvignon Blanc                   6320
Syrah                             5825
Riesling                          5524
Merlot                            5070
Zinfandel                         3799
Sangiovese                        3345
Malbec                            3208
White Blend                       2824
Rosé                              2817
Tempranillo                       2556
Nebbiolo                          2241
Portuguese Red                    2216
Sparkling Blend                   2004
Shiraz                            1970
Corvina, Rondinella, Molinara     1682
Rhône-style Red Blend             1505
Pinot Gris                        1365
Barbera                           1365
Cabernet Franc                    1363
Sangiovese Grosso                 1346
Pinot Grigio             

In [125]:
# The ten least frequent varieties (the end of the sorted counts).
print(varietal_counts.tail(10))

Cococciola             1
Garnacha Tintorera     1
Malbec-Petit Verdot    1
Albarossa              1
Aidani                 1
Carignan-Syrah         1
Premsal                1
Muskat                 1
Syrah-Carignan         1
Carnelian              1
Name: variety, dtype: int64


## Data Preprocessing

In [23]:
%%time
# Tokenization: split every review into its individual sentences.

# str() guards against non-string cells (presumably missing descriptions).
reviews_list = [str(raw_review) for raw_review in data['description']]

# Tokenize review by review and flatten straight into one list of sentences.
sentences_tokenized = [
    sentence
    for review_text in tqdm(reviews_list)
    for sentence in sent_tokenize(review_text)
]

  0%|          | 0/150930 [00:00<?, ?it/s]

CPU times: user 27 s, sys: 1.26 s, total: 28.3 s
Wall time: 39.8 s


In [74]:
# Total number of sentences extracted from all reviews.
len(sentences_tokenized)

417173

In [75]:
# Peek at the first tokenized sentence.
sentences_tokenized[:1]

['This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.']

In [13]:
%%time
# Normalization resources: English stop words, a punctuation-removal table and a stemmer.

stop_words = set(stopwords.words('english'))

# Three-argument maketrans maps every punctuation character to None (i.e. deletes it).
punctuation_table = str.maketrans('', '', string.punctuation)
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    """Tokenize, lowercase, stem and filter one piece of text.

    Each token produced by ``word_tokenize`` is lowercased, stemmed with
    ``sno``, stripped of punctuation via ``punctuation_table`` and kept only
    if the result is longer than one character and is not in ``stop_words``.

    Parameters
    ----------
    raw_text : str
        Text to normalize (a sentence or a whole review).

    Returns
    -------
    list of str
        The surviving normalized tokens, in their original order. An empty
        list is returned when ``raw_text`` is not a string (e.g. a missing
        value), so the return type is always a list.
    """
    # Non-text input (NaN, None, ...) cannot be tokenized: return "no tokens"
    # as a list instead of the former '' so callers always get one type.
    # Genuine failures (e.g. missing tokenizer data) now propagate instead of
    # being silently swallowed by a bare except.
    if not isinstance(raw_text, str):
        return []

    normalized_sentence = []
    # No progress bar here: this function runs once per sentence/review and
    # the calling loops already wrap their own iteration in tqdm.
    for w in word_tokenize(raw_text):
        lower_case_word = str(w).lower()
        no_punctuation = sno.stem(lower_case_word).translate(punctuation_table)
        if len(no_punctuation) > 1 and no_punctuation not in stop_words:
            normalized_sentence.append(no_punctuation)
    return normalized_sentence

# First ten sentences; not used further in this cell.
sentence_sample = sentences_tokenized[:10]

# Normalize every sentence of the corpus, with a progress bar over the sentences.
normalized_sentences = [normalize_text(sentence) for sentence in tqdm(sentences_tokenized)]

In [14]:
# Peek at the first normalized sentence (list of stemmed tokens).
normalized_sentences[:1]

[['tremend',
  '100',
  'variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three',
  'year',
  'oak']]

In [21]:
%%time 
# Detect frequent multi-word expressions and merge them into single tokens.

# The first pass learns collocations on the plain tokens; the second pass is
# trained on the sentences as transformed by the first pass.
first_pass = Phrases(normalized_sentences)
phrases = Phrases(first_pass[normalized_sentences])

# Phraser wraps the second-pass model for applying it to sentences.
# NOTE(review): it is applied directly to the un-merged sentences (the
# first-pass transform is not applied beforehand) — confirm this is intended.
ngrams = Phraser(phrases)

phrased_sentences = [ngrams[tokens] for tokens in tqdm(normalized_sentences)]

# One flat list containing every token of the phrased corpus.
full_list_words = [token for tokens in phrased_sentences for token in tokens]

  0%|          | 0/417173 [00:00<?, ?it/s]

CPU times: user 49.8 s, sys: 1.43 s, total: 51.2 s
Wall time: 1min 3s


In [22]:
# Peek at the first sentence after phrase merging (merged tokens are joined by '_').
phrased_sentences[:1]

[['tremend',
  '100_variet',
  'wine',
  'hail',
  'oakvill',
  'age',
  'three_year',
  'oak']]

In [169]:
# First five tokens of the flattened corpus.
full_list_words[:5]

['tremend', '100_variet', 'wine', 'hail', 'oakvill']

In [26]:
# Rank the corpus vocabulary by frequency and keep the 5000 most common tokens.

word_counts = Counter()
word_counts.update(full_list_words)
sorted_counts = OrderedDict(word_counts.most_common(5000))

# One row per token (the index), with its count in column 0.
counter_df = pd.DataFrame.from_dict(sorted_counts, orient='index')
counter_df.to_csv('top_5000_descriptors.csv')
top_5000_words = counter_df.iloc[:5000]

In [29]:
# Most frequent tokens and their counts.
top_5000_words.head()

Unnamed: 0,0
wine,95170
flavor,86400
fruit,68895
finish,45375
acid,40144


In [32]:
# Load the table that maps raw descriptors to their normalized levels.
map_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/descriptor_mapping.csv'

mapping_table = pd.read_csv(map_location)
# Index by the raw descriptor so a token can be looked up directly.
descriptor_mapping = mapping_table.set_index('raw descriptor')
descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abras,abrasive,high_tannin,tannin
acacia,acacia,flowery,flower
acacia_flower,acacia,flowery,flower
aciddriven,acid_driven,high_acid,acid
aggress,aggressive,high_acid,acid


In [33]:
# Occasion-related features: body, complexity and finish (B.C.F.)
# -> more body = more intimacy, less body = more energy, etc.

descriptor_list = ['body', 'complexity', 'finish']
# Keep only the mapping rows whose top-level category is one of the three features.
is_bcf = descriptor_mapping['level_1'].isin(descriptor_list)
filtered_descriptor_mapping = descriptor_mapping.loc[is_bcf]

In [34]:
# First rows of the body/complexity/finish subset of the mapping.
filtered_descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
airi,airy,light_bodied,body
big_bold,bold,full_bodied,body
bullish,bullish,full_bodied,body
blocki,chunky,full_bodied,body
chunki,chunky,full_bodied,body


In [176]:
# Number of raw descriptors per feature (level_1 category).
filtered_descriptor_mapping["level_1"].value_counts()

body          57
complexity    16
finish         6
Name: level_1, dtype: int64

In [177]:
# All raw descriptors (the mapping's index) kept for body, finish and complexity.
list(filtered_descriptor_mapping.index)

['airi',
 'big_bold',
 'bullish',
 'blocki',
 'chunki',
 'solid_chunki',
 'clampi',
 'close',
 'complex',
 'complic',
 'dainti',
 'deep',
 'depth',
 'eas',
 'easi_sip',
 'eleg',
 'expans',
 'extract',
 'feminin',
 'finess',
 'ampl_weight',
 'bold',
 'full_bodi',
 'fullbodi',
 'soupi',
 'weighti',
 'hearti',
 'heavi',
 'heavier',
 'heavyweight',
 'lavish',
 'rather_lean',
 'length',
 'lengthi',
 'long_mouthwat',
 'longlast',
 'lightest',
 'light',
 'light_bodi',
 'lightbodi',
 'lighter',
 'lighter_style',
 'lightfoot',
 'lightweight',
 'thin',
 'linear',
 'straightforward',
 'lush',
 'luxuri',
 'mass',
 'medium_bodi',
 'medium_fullbodi',
 'medium_weight',
 'mediumbodi',
 'mediumweight',
 'medium_length',
 'mediumlength_finish',
 'modest',
 'onedimension',
 'opul',
 'plump',
 'quaffer',
 'refin',
 'rich',
 'richer',
 'robust',
 'uncompl',
 'stout',
 'sturdi',
 'succul',
 'superrich',
 'syrup',
 'syrupi',
 'thick',
 'unoak',
 'viscos',
 'viscous',
 'voluptu',
 'weight']

In [36]:
%%time
#apply mapping on each word of each sentence

def return_mapped_descriptor(word):
    """Map a raw token to its normalized ``level_3`` descriptor.

    Parameters
    ----------
    word : str
        A token from a phrased sentence.

    Returns
    -------
    The ``level_3`` value stored for ``word`` in
    ``filtered_descriptor_mapping``, or ``""`` when ``word`` is not one of
    the mapping's raw descriptors.
    """
    # Test membership on the Index itself: the former list(...) conversion
    # rebuilt and linearly scanned the whole index for every single word.
    if word in filtered_descriptor_mapping.index:
        return filtered_descriptor_mapping['level_3'][word]
    return ""

# Replace every sentence by the list of normalized descriptors it contains;
# tokens without a mapping (empty-string result) are dropped.
# NOTE(review): this rebinds `normalized_sentences`, which previously held the
# stemmed tokens — cells using that name depend on execution order.
normalized_sentences = [
    [str(mapped) for mapped in map(return_mapped_descriptor, sent) if mapped != ""]
    for sent in tqdm(phrased_sentences)
]

  0%|          | 0/417173 [00:00<?, ?it/s]

In [37]:
# First ten sentences reduced to their mapped descriptors (many are empty).
normalized_sentences[:10]

[[], ['elegant'], [], [], [], ['rich'], [], [], ['complex'], ['succulent']]

## Word embeddings 

In [39]:
%%time
# Train Word2Vec on the descriptor-only sentences, show a summary and save the model.
w2v_settings = {'vector_size': 300, 'min_count': 1, 'epochs': 15}
model = Word2Vec(normalized_sentences, **w2v_settings)
print(model)

model.save('model.bin')

Word2Vec<vocab=50, vector_size=300, alpha=0.025>
CPU times: user 12.1 s, sys: 310 ms, total: 12.4 s
Wall time: 15.4 s


In [40]:
# Vocabulary learned by the Word2Vec model.
model.wv.index_to_key

['rich',
 'light_bodied',
 'complex',
 'elegant',
 'full_bodied',
 'depth',
 'medium_bodied',
 'lush',
 'closed',
 'heavy',
 'thick',
 'weight',
 'opulent',
 'extracted',
 'chunky',
 'plump',
 'length',
 'modest',
 'refined',
 'syrupy',
 'low_complexity',
 'succulent',
 'robust',
 'linear',
 'finessed',
 'lengthy',
 'hearty',
 'sturdy',
 'one_dimensional',
 'voluptuous',
 'bold',
 'feminine',
 'luxurious',
 'lavish',
 'viscous',
 'complicated',
 'medium_length_finish',
 'unoaked',
 'simple',
 'mass',
 'super_rich',
 'quaffer',
 'expansive',
 'easy',
 'stout',
 'light',
 'clampy',
 'dainty',
 'airy',
 'bullish']

In [41]:
# Example: the ten descriptors whose vectors are most similar to 'light_bodied'.
model.wv.most_similar(positive='light_bodied', topn=10)

[('medium_bodied', 0.9927793741226196),
 ('linear', 0.9914177060127258),
 ('modest', 0.9914150834083557),
 ('feminine', 0.9908044934272766),
 ('low_complexity', 0.9882341027259827),
 ('length', 0.9881134629249573),
 ('refined', 0.9871636629104614),
 ('medium_length_finish', 0.9871259927749634),
 ('finessed', 0.9865640997886658),
 ('airy', 0.9841231107711792)]

In [42]:
%%time
# Full review texts, one entry per row of the dataset.
wine_reviews = data['description'].tolist()

def return_descriptor_from_mapping(word):
    """Look up the normalized ``level_3`` descriptor for a raw token.

    Parameters
    ----------
    word : str
        A token from a phrased review.

    Returns
    -------
    The ``level_3`` value stored for ``word`` in
    ``filtered_descriptor_mapping``, or ``None`` when ``word`` is not one of
    the mapping's raw descriptors.
    """
    # Index membership avoids rebuilding and scanning list(index) for every
    # token, which made each lookup linear in the size of the mapping.
    if word in filtered_descriptor_mapping.index:
        return filtered_descriptor_mapping['level_3'][word]
    # Explicit "no mapping" result (previously an implicit fall-through).
    return None

# Reduce every review to a space-separated string of its mapped descriptors.
descriptorized_reviews = []
for review in tqdm(wine_reviews):
    # Normalize the whole review, then merge phrases with the fitted model.
    review_tokens = ngrams[normalize_text(review)]
    mapped_tokens = (return_descriptor_from_mapping(token) for token in review_tokens)
    # Tokens without a mapping come back as None and are skipped.
    kept_descriptors = [str(mapped) for mapped in mapped_tokens if mapped is not None]
    descriptorized_reviews.append(' '.join(kept_descriptors))

  0%|          | 0/150930 [00:00<?, ?it/s]

CPU times: user 6min 55s, sys: 15.3 s, total: 7min 11s
Wall time: 9min


In [213]:
# Peek at the first descriptor-only reviews.
descriptorized_reviews[:5]

['elegant',
 'rich',
 'complex succulent',
 '',
 'rich',
 'depth',
 '',
 'lush lush depth',
 'chunky',
 'succulent lush length']

In [51]:
%%time

# Fit TF-IDF on the descriptor-only reviews and record each descriptor's IDF weight.

vectorizer = TfidfVectorizer()
# fit() returns the fitted vectorizer itself; X is kept as an alias of it.
X = vectorizer.fit(descriptorized_reviews)

# Build term -> IDF from vocabulary_ (term -> column index) instead of
# get_feature_names(), which was deprecated in scikit-learn 1.0 and removed
# in 1.2. Sorting by column index keeps the same key order as before.
dict_of_tfidf_weightings = {
    term: X.idf_[column]
    for term, column in sorted(X.vocabulary_.items(), key=lambda item: item[1])
}

CPU times: user 695 ms, sys: 23.4 ms, total: 719 ms
Wall time: 796 ms


In [52]:
%%time
# Build one vector per review: the mean of its descriptors' Word2Vec vectors,
# each scaled by that descriptor's IDF weight. Every entry of the result is
# [terms, review_vector, descriptor_count].
wine_review_vectors = []
for d in tqdm(descriptorized_reviews):
    # split() without an argument yields [] for an empty review, whereas
    # split(' ') produced [''] and stored a phantom empty descriptor.
    terms = d.split()
    weighted_review_terms = []
    for term in terms:
        tfidf_weighting = dict_of_tfidf_weightings.get(term)
        if tfidf_weighting is None:
            # Term unknown to the TF-IDF vocabulary: ignore it.
            continue
        # reshape(1, -1) follows the model's vector size instead of hard-coding 300.
        word_vector = model.wv.get_vector(term).reshape(1, -1)
        weighted_review_terms.append(tfidf_weighting * word_vector)
    descriptor_count = len(weighted_review_terms)
    if weighted_review_terms:
        review_vector = sum(weighted_review_terms) / descriptor_count
    else:
        # No usable descriptor: keep the empty-list placeholder (explicit
        # check instead of a bare except around the division).
        review_vector = []
    vector_and_count = [terms, review_vector, descriptor_count]
    wine_review_vectors.append(vector_and_count)

  0%|          | 0/150930 [00:00<?, ?it/s]

CPU times: user 4.12 s, sys: 282 ms, total: 4.41 s
Wall time: 4.98 s


In [55]:
# Attach the per-review results to the dataset as three new columns.
data['normalized_descriptors'] = [entry[0] for entry in wine_review_vectors]
data['review_vector'] = [entry[1] for entry in wine_review_vectors]
data['descriptor_count'] = [entry[2] for entry in wine_review_vectors]

# Move the current index into a regular column.
# NOTE(review): this mutates `data` in place, so re-running the cell resets the index again.
data.reset_index(inplace=True)

In [56]:
# Preview the dataset with the three added descriptor columns.
data.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,variety,winery,normalized_descriptors,review_vector,descriptor_count
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,[elegant],"[[-0.07686549, 0.118861645, -0.07442225, 0.172...",1
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,[rich],"[[-0.058690846, 0.14173521, 0.0077731507, 0.11...",1
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,"[complex, succulent]","[[-0.06216684, 0.30808628, -0.08620003, 0.0290...",2
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,[],[],0
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude,[rich],"[[-0.058690846, 0.14173521, 0.0077731507, 0.11...",1


In [48]:
# Full review text of the row labelled 1, for comparison with its descriptors.
data.loc[1,"description"]

'Ripe aromas of fig, blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla. This is full, layered, intense and cushioned on the palate, with rich flavors of chocolaty black fruits and baking spices. A toasty, everlasting finish is heady but ideally balanced. Drink through 2023.'

In [107]:
# Shape of the subset of rows that contain at least one BCF descriptor
# (descriptor_count > 0), i.e. the rows that DO have a review vector.
data[data["descriptor_count"]>0].shape

(86046, 14)

In [None]:
#save in csv

## Clustering

In [84]:
# Review vectors of every row, including the empty placeholders.
input_vectors = data['review_vector'].tolist()

In [86]:
# Record the Python type of each review vector to spot the empty placeholders.
input_vectors_listed = list(map(type, input_vectors))

In [89]:
# Types of the first ten review vectors.
input_vectors_listed[:10]

[numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray,
 list,
 numpy.ndarray,
 numpy.ndarray,
 list,
 numpy.ndarray,
 numpy.ndarray,
 numpy.ndarray]

In [103]:
# Keep only the reviews that contain at least one body/complexity/finish descriptor.
has_descriptor = data["descriptor_count"] > 0
bcf_data = data.loc[has_descriptor]
bcf_data.shape

(86046, 14)

In [104]:
# Each stored review vector has a single row; take that row as a plain list
# so the result is one flat feature list per review.
input_vectors = bcf_data["review_vector"].tolist()
input_vectors_listed = [vector.tolist()[0] for vector in input_vectors]

# Brute-force nearest-neighbour search with cosine distance, 10 neighbours per query.
knn = NearestNeighbors(n_neighbors=10, algorithm= 'brute', metric='cosine')
model_knn = knn.fit(input_vectors_listed)