In [1]:
from collections import Counter
import numpy as np
import nltk
import re
import sklearn.manifold
import multiprocessing
import pandas as pd
import gensim.models.word2vec as w2v

In [2]:
data = pd.read_csv('winemag-data_first150k.csv')

In [3]:
labels = data['variety']
descriptions = data['description']

In [4]:
print('{}   :   {}'.format(labels.tolist()[0], descriptions.tolist()[0]))
print('{}   :   {}'.format(labels.tolist()[56], descriptions.tolist()[56]))
print('{}   :   {}'.format(labels.tolist()[93], descriptions.tolist()[93]))

Cabernet Sauvignon   :   This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.
Sauvignon Blanc   :   Delicious while also young and textured, this wine comes from biodynamically grown grapes. It has a strong sense of minerality as well as intense citrus and green fruits. It's tight at the moment and needs to round out, so drink from 2018.
Chardonnay   :   A smoky scent and earthy, crisp-apple flavors make this medium-bodied wine a change of pace from the average butterball Chardonnay. It has welcome acidity and a nicely smooth texture.


In [5]:
varietal_counts = labels.value_counts()
print(varietal_counts[:50])

Chardonnay                       14482
Pinot Noir                       14291
Cabernet Sauvignon               12800
Red Blend                        10062
Bordeaux-style Red Blend          7347
Sauvignon Blanc                   6320
Syrah                             5825
Riesling                          5524
Merlot                            5070
Zinfandel                         3799
Sangiovese                        3345
Malbec                            3208
White Blend                       2824
Rosé                              2817
Tempranillo                       2556
Nebbiolo                          2241
Portuguese Red                    2216
Sparkling Blend                   2004
Shiraz                            1970
Corvina, Rondinella, Molinara     1682
Rhône-style Red Blend             1505
Pinot Gris                        1365
Barbera                           1365
Cabernet Franc                    1363
Sangiovese Grosso                 1346
Pinot Grigio             

In [6]:
corpus_raw = ""
for description in descriptions:
    corpus_raw += description

In [7]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [9]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

In [10]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [11]:
print(raw_sentences[0])
print(sentence_to_wordlist(raw_sentences[0]))

This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak.
['This', 'tremendous', 'varietal', 'wine', 'hails', 'from', 'Oakville', 'and', 'was', 'aged', 'over', 'three', 'years', 'in', 'oak']


In [12]:
token_count = sum([len(sentence) for sentence in sentences])
print('The wine corpus contains {0:,} tokens'.format(token_count))

The wine corpus contains 6,194,763 tokens


In [13]:
num_features = 300
min_word_count = 10
num_workers = multiprocessing.cpu_count()
context_size = 10
downsampling = 1e-3
seed=1993

In [14]:
wine2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [15]:
wine2vec.build_vocab(sentences)

In [16]:
print('Word2Vec vocabulary length:', len(wine2vec.wv.vocab))


Word2Vec vocabulary length: 11379


In [17]:
wine2vec.wv.vocab

{'This': <gensim.models.keyedvectors.Vocab at 0x1d030ad18c8>,
 'tremendous': <gensim.models.keyedvectors.Vocab at 0x1d0538f4f88>,
 'varietal': <gensim.models.keyedvectors.Vocab at 0x1d0538fa108>,
 'wine': <gensim.models.keyedvectors.Vocab at 0x1d0538fa4c8>,
 'hails': <gensim.models.keyedvectors.Vocab at 0x1d0538fa5c8>,
 'from': <gensim.models.keyedvectors.Vocab at 0x1d0538fa648>,
 'Oakville': <gensim.models.keyedvectors.Vocab at 0x1d0538fa688>,
 'and': <gensim.models.keyedvectors.Vocab at 0x1d0538fa6c8>,
 'was': <gensim.models.keyedvectors.Vocab at 0x1d0538fa588>,
 'aged': <gensim.models.keyedvectors.Vocab at 0x1d0538fa608>,
 'over': <gensim.models.keyedvectors.Vocab at 0x1d0538fa708>,
 'three': <gensim.models.keyedvectors.Vocab at 0x1d0538fa748>,
 'years': <gensim.models.keyedvectors.Vocab at 0x1d0538fa788>,
 'in': <gensim.models.keyedvectors.Vocab at 0x1d0538fa7c8>,
 'oak': <gensim.models.keyedvectors.Vocab at 0x1d0538fa808>,
 'Juicy': <gensim.models.keyedvectors.Vocab at 0x1d0538fa8

In [18]:
print(wine2vec.corpus_count)


266254


In [19]:
wine2vec.train(sentences, total_examples=wine2vec.corpus_count, epochs=wine2vec.iter)


  """Entry point for launching an IPython kernel.


(22314594, 30973815)

In [26]:
wine2vec.most_similar('tannin')

  """Entry point for launching an IPython kernel.


[('tannins', 0.5339179039001465),
 ('tannic', 0.45286205410957336),
 ('acid', 0.4517667293548584),
 ('laudable', 0.45031818747520447),
 ('Structurally', 0.4196407198905945),
 ('powering', 0.4163557291030884),
 ('buoy', 0.41513803601264954),
 ('pH', 0.41407719254493713),
 ('unobtrusive', 0.4081850051879883),
 ('cushioning', 0.40577155351638794)]

In [24]:
wine2vec.most_similar('bad')

  """Entry point for launching an IPython kernel.


[('awful', 0.5221390724182129),
 ('horrible', 0.5130788087844849),
 ('inappropriate', 0.5070090293884277),
 ('se', 0.500730037689209),
 ('overpriced', 0.5002012252807617),
 ('terrible', 0.49675774574279785),
 ('disaster', 0.4957996904850006),
 ('unattractive', 0.4917920231819153),
 ('Plus', 0.4725416302680969),
 ('purposes', 0.4697968661785126)]

In [None]:
wine2vec.most_similar('oak')

In [None]:
wine2vec.most_similar('acidic')

In [None]:
wine2vec.most_similar('full')

In [None]:
wine2vec.most_similar('tannins')

In [None]:
def nearest_similarity_cosmul(start1, end1, end2):
    similarities = wine2vec.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    start2 = similarities[0][0]
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals()))
    return start2

In [None]:
nearest_similarity_cosmul('oak', 'vanilla', 'cherry');

In [None]:
nearest_similarity_cosmul('full', 'berry', 'light');

In [None]:
nearest_similarity_cosmul('tannins', 'plum', 'fresh');

In [None]:
nearest_similarity_cosmul('full', 'bodied', 'acidic');

In [None]:
wine2vec.most_similar('sweetness')

In [None]:
wine2vec.most_similar('alcohol')

In [None]:
wine2vec.most_similar('aromas')

In [None]:
wine2vec.most_similar('gumdrop')