In [3]:
# import dependencies

from collections import Counter
import numpy as np
import nltk
from nltk.corpus import stopwords 
nltk.download("popular")
import re
import sklearn.manifold
import multiprocessing
import pandas as pd
import gensim.models.word2vec as w2v

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

# ***Getting the data and loading it***

In [4]:
# import the data and view it
wine_df = pd.read_csv("../data/final_wine_data_172k.csv")
wine_df.head()

Unnamed: 0.1,Unnamed: 0,country,description,price,points,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",15.0,87,Portuguese Red,Quinta dos Avidagos
1,2,US,"Tart and snappy, the flavors of lime flesh and...",14.0,87,Pinot Gris,Rainstorm
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",13.0,87,Riesling,St. Julian
3,4,US,"Much like the regular bottling from 2012, this...",65.0,87,Pinot Noir,Sweet Cheeks
4,5,Spain,Blackberry and raspberry aromas show a typical...,15.0,87,Tempranillo-Merlot,Tandem


In [0]:
# split out the variety and description
variety = wine_df['variety']
description = wine_df['description']

# ***Getting the Word2Vec Model ready***

In [0]:
# In order to train a word2vec model, all of the description data will need to be concatenated into one giant string.
corpus_raw = ""
for d in description:
    corpus_raw += d

In [0]:
# create a toeknizer to break the words up
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [0]:
# run the tokenizer with our raw data
raw_sentences = tokenizer.tokenize(corpus_raw)

In [0]:
# Create a function that will grab all of the sentences and make them into words
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

In [0]:
# run the above function to a variable sentences
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [11]:
# view the differences between the raw stentences and the words
print(raw_sentences[234])
print(sentence_to_wordlist(raw_sentences[234]))

The tactile acidity once sipped creates a fascinating soft buzz on the tongue, with flavors of dried mint, brisk raspberry and cherry tomato.
['The', 'tactile', 'acidity', 'once', 'sipped', 'creates', 'a', 'fascinating', 'soft', 'buzz', 'on', 'the', 'tongue', 'with', 'flavors', 'of', 'dried', 'mint', 'brisk', 'raspberry', 'and', 'cherry', 'tomato']


In [12]:
# see how many tokens are in this whole dataset
token_count = sum([len(sentence) for sentence in sentences])
print('The wine corpus contains {0:,} tokens'.format(token_count))

The wine corpus contains 7,203,114 tokens


# ***Training the Word2Vec model***

In [0]:
# creating our vairables for the model and applying the values
num_features = 150
min_word_count = 5
num_workers = multiprocessing.cpu_count()
context_size = 5
downsampling = 1e-3
seed=1993

In [0]:
# creating our word2vec model
wine2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [0]:
# building our word2vec model
wine2vec.build_vocab(sentences)

In [40]:
# seeing how many vocabulary words are in the model
print('Word2Vec vocabulary length:', len(wine2vec.wv.vocab))

Word2Vec vocabulary length: 15406


In [41]:
# getting the total corpus count of the model
print(wine2vec.corpus_count)

305018


In [42]:
# training the model
wine2vec.train(sentences, total_examples=wine2vec.corpus_count, epochs=wine2vec.iter)

  """Entry point for launching an IPython kernel.


(25990310, 36015570)

# ***Playing with the model***

### Word2Vec provides a "most similar" word feature we can use in the model

In [43]:
# run against the model to see what words are similar and what we can predict
wine2vec.most_similar('melon')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('papaya', 0.8312772512435913),
 ('cantaloupe', 0.814118504524231),
 ('banana', 0.7892691493034363),
 ('honeydew', 0.7831417322158813),
 ('nectarine', 0.783075749874115),
 ('peach', 0.7792102694511414),
 ('Papaya', 0.7460393905639648),
 ('mango', 0.7447105646133423),
 ('buttercup', 0.7415136098861694),
 ('apricot', 0.7185927033424377)]

In [44]:
wine2vec.most_similar('berry')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('berries', 0.6874915957450867),
 ('blackberry', 0.6861197352409363),
 ('Sunbaked', 0.6063026785850525),
 ('Berry', 0.6006947755813599),
 ('withered', 0.5966188311576843),
 ('cowhide', 0.5902107954025269),
 ('horsehide', 0.5849367380142212),
 ('intermixed', 0.5836091637611389),
 ('plum', 0.5824931859970093),
 ('sarsparilla', 0.5823000073432922)]

In [45]:
wine2vec.most_similar('oak')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('oaky', 0.6163449883460999),
 ('wood', 0.5897282958030701),
 ('cradled', 0.5766119360923767),
 ('bases', 0.5713842511177063),
 ('charry', 0.5671426057815552),
 ('roasty', 0.565669059753418),
 ('Bourbon', 0.5611021518707275),
 ('barrel', 0.5546651482582092),
 ('Deft', 0.5471131205558777),
 ('macaroon', 0.5415995121002197)]

In [46]:
wine2vec.most_similar('full')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('ultrasmooth', 0.6805888414382935),
 ('Full', 0.667316198348999),
 ('supersmooth', 0.645301103591919),
 ('bold', 0.6434340476989746),
 ('plushly', 0.6331969499588013),
 ('abundantly', 0.629429042339325),
 ('opulently', 0.6219155788421631),
 ('concentrated', 0.6079220175743103),
 ('Generously', 0.6009907126426697),
 ('sumptuously', 0.5982006192207336)]

In [47]:
wine2vec.most_similar('tannins')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('Tannins', 0.711875319480896),
 ('firm', 0.612248420715332),
 ('tannin', 0.6036678552627563),
 ('gripping', 0.5909509658813477),
 ('fined', 0.5851506590843201),
 ('tannic', 0.5739542841911316),
 ('Framed', 0.5725005269050598),
 ('structurally', 0.5720433592796326),
 ('compacted', 0.5716628432273865),
 ('lithely', 0.5683258771896362)]

In [48]:
wine2vec.most_similar('white')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('hyacinth', 0.5800676345825195),
 ('rennet', 0.5656931400299072),
 ('McIntosh', 0.5609203577041626),
 ('kisses', 0.5548864603042603),
 ('Rennet', 0.5507103800773621),
 ('salvia', 0.5447258353233337),
 ('Fetel', 0.5419474244117737),
 ('Stargazer', 0.5395685434341431),
 ('Caprettone', 0.5384214520454407),
 ('White', 0.5376459956169128)]

In [49]:
wine2vec.most_similar_cosmul("white")

  """Entry point for launching an IPython kernel.


[('hyacinth', 0.7900330424308777),
 ('rennet', 0.7828457951545715),
 ('McIntosh', 0.7804595232009888),
 ('kisses', 0.7774425745010376),
 ('Rennet', 0.7753545045852661),
 ('salvia', 0.772362232208252),
 ('Fetel', 0.7709730863571167),
 ('Stargazer', 0.769783616065979),
 ('Caprettone', 0.7692101001739502),
 ('White', 0.7688223123550415)]

In [50]:
# get everything related to tannins
w1 = ["tannins",'firm','tannin']
w2 = ['clean']
wine2vec.most_similar (positive=w1,negative=w2,topn=10)

  This is separate from the ipykernel package so we can avoid doing imports until
  if np.issubdtype(vec.dtype, np.int):


[('Tannins', 0.5966731309890747),
 ('Grippy', 0.5767356753349304),
 ('tannic', 0.5610880851745605),
 ('compressed', 0.5587855577468872),
 ('sizably', 0.5552229881286621),
 ('gripping', 0.554302453994751),
 ('walls', 0.5495252013206482),
 ('flexed', 0.547943651676178),
 ('handshake', 0.5441824793815613),
 ('scaffolding', 0.5425626039505005)]

In [57]:
# similarity between two unrelated
wine2vec.similarity(w1="berry",w2="crush")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.063815005

In [55]:
# similarity between two related
wine2vec.similarity(w1="melon",w2="cantaloupe")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.81411844

In [58]:
# Which one is the odd one out in this list?
wine2vec.doesnt_match(["wine","red","white"])

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'wine'

In [60]:
# Which one is the odd one out in this list?
wine2vec.doesnt_match(["berry","chocolate","fruit"])

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'chocolate'