In [52]:
#python
import pandas as pd
import string
import numpy as np

#compute cell-executing time
from tqdm.notebook import trange, tqdm

#text preprocessing
from operator import itemgetter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from collections import Counter, OrderedDict

#nlp modeling
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

#clusterization 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from nltk.cluster import KMeansClusterer
import nltk
import os
from scipy import spatial

In [2]:
#import file locations

path_to_glove_file_50 = "/Users/hyunoochang/Downloads/glove.6B.50d.txt"

path_to_glove_file_100 = "/Users/hyunoochang/Downloads/glove.6B/glove.6B.100d.txt"

In [3]:
embeddings_dict = {}
with open(path_to_glove_file_50) as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector
        
print("Found %s word vectors." % len(embeddings_dict))

Found 379 word vectors.


In [4]:
#create dictionnary word : vector 

embeddings_index = {}
with open(path_to_glove_file_100) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400001 word vectors.


In [None]:
# function to retrieve closest words
def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

In [None]:
# example : 100 words of family
print(find_closest_embeddings(embeddings_index["family"])[1:100])

In [34]:
embeddings_index["medium"]

array([-1.1756   ,  0.66743  , -0.82674  ,  0.47844  , -0.61724  ,
       -0.17721  ,  0.52252  , -0.34432  , -0.45989  ,  0.95159  ,
        0.060506 , -0.68491  , -0.36106  , -0.043206 ,  0.34444  ,
       -0.7292   ,  0.095189 , -0.26954  ,  0.51413  ,  0.14246  ,
       -0.44721  ,  0.11529  ,  0.41065  , -0.73908  ,  0.66015  ,
        1.0372   ,  0.15455  ,  0.59409  , -1.6329   ,  0.87067  ,
       -0.61104  ,  0.65734  , -0.84588  , -0.46524  , -0.017633 ,
       -0.17467  ,  0.65676  ,  0.29783  , -0.091765 , -0.57205  ,
       -0.13039  , -1.0417   , -0.6036   , -0.25659  ,  0.6483   ,
       -0.62761  ,  0.4318   ,  0.033122 , -0.17567  , -0.42246  ,
       -0.38075  ,  0.37231  , -0.018643 ,  1.2721   ,  0.17436  ,
       -1.5936   , -0.15106  ,  0.25212  ,  1.59     ,  0.15644  ,
        0.32544  ,  0.49731  , -0.45329  ,  0.53998  ,  0.91785  ,
       -0.040135 ,  0.33496  ,  0.031095 ,  0.42636  , -0.0092552,
        0.45997  , -0.29174  ,  0.43143  ,  0.033528 ,  0.2102

In [38]:
descriptors = ['rich',
 'light_bodied',
 'full_bodied',
 'complex',
 'medium_bodied',
 'elegant',
 'depth',
 'weight',
 'closed',
 'heavy',
 'lush',
 'chunky',
 'low_complexity',
 'thick',
 'plump',
 'robust',
 'length',
 'hearty',
 'extracted',
 'succulent',
 'opulent',
 'modest',
 'syrupy',
 'linear',
 'lengthy',
 'refined',
 'finessed',
 'viscous',
 'luxurious',
 'lavish',
 'expansive',
 'light',
 'bold',
 'voluptuous',
 'sturdy',
 'one_dimensional',
 'simple',
 'easy',
 'lean',
 'clampy',
 'airy',
 'dainty',
 'quaffer',
 'unoaked',
 'stout',
 'complicated',
 'bullish',
 'super_rich',
 'mass',
 'feminine']

In [58]:
embedding_dim = 100
hits = 0
misses = 0

converted = []
for word in descriptors :
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        hits += 1
        converted.append(word)
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 42 words (8 misses)


In [59]:
converted

['rich',
 'complex',
 'elegant',
 'depth',
 'weight',
 'closed',
 'heavy',
 'lush',
 'chunky',
 'thick',
 'plump',
 'robust',
 'length',
 'hearty',
 'extracted',
 'succulent',
 'opulent',
 'modest',
 'syrupy',
 'linear',
 'lengthy',
 'refined',
 'finessed',
 'viscous',
 'luxurious',
 'lavish',
 'expansive',
 'light',
 'bold',
 'voluptuous',
 'sturdy',
 'simple',
 'easy',
 'lean',
 'airy',
 'dainty',
 'unoaked',
 'stout',
 'complicated',
 'bullish',
 'mass',
 'feminine']

In [45]:
len(occasions)

13

In [44]:
occasions = ["family", "friends", "date", "restaurant", "alone", "home", "barbecue", "birthday", "colleague", "fiancé", "fiance", "chill", "netflix"]

In [46]:
embedding_dim = 100
hits = 0
misses = 0

for word in occasions :
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 13 words (0 misses)


In [47]:
for word in occasions : 
    print(embeddings_index[word])

[ 4.2179e-01 -9.6730e-02  1.0657e-01 -2.1117e-01 -5.4202e-01  8.8692e-01
 -1.4038e-01 -1.0424e-01 -1.6009e-01  1.5360e-01 -3.7699e-01  4.5063e-02
  4.4316e-01  3.9670e-01 -5.7958e-01 -3.5208e-01  5.2960e-01 -4.3271e-01
 -2.1603e-01  1.0731e+00 -3.3560e-01  6.7252e-02  5.7345e-01  3.5972e-01
  5.6134e-01 -3.1222e-01 -5.8412e-01 -2.2302e-01  8.1725e-02  5.4772e-01
  3.5482e-01  8.9450e-01  6.9674e-01  6.2971e-02  1.7604e-01  6.7448e-01
  5.9729e-01  4.9058e-01  5.2370e-01 -7.8386e-02 -5.2658e-01 -5.6530e-01
  4.5006e-01 -7.5849e-01 -5.2401e-02  5.1847e-02 -3.2363e-01  7.1197e-01
  5.8238e-01 -7.0496e-01 -1.9225e-01 -1.0275e+00  8.8209e-01  6.8192e-01
 -7.0748e-02 -1.7905e+00 -8.8179e-01 -8.5265e-01  1.3588e+00  1.0301e+00
  2.6106e-01  8.9355e-01  4.3638e-01 -5.8021e-01  1.3251e+00 -5.7793e-01
 -1.4836e-01 -9.2882e-02  3.8736e-01  2.2685e-01 -4.7656e-01  6.8733e-04
  1.0141e-01 -8.1426e-01  3.6058e-01  2.9421e-01 -2.8724e-01 -6.1977e-01
 -1.3356e+00  8.9373e-01 -3.4835e-01  8.1035e-01  6

In [53]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

In [56]:
embeddings_dict["family"]

array([ 1.1636   ,  0.90386  , -0.74277  ,  0.49272  ,  1.88     ,
        0.9285   , -1.0429   ,  0.01343  , -0.51903  , -0.68762  ,
        0.0083396,  0.50754  ,  0.51792  , -0.75577  ,  1.0241   ,
        0.1761   , -0.91889  ,  0.21455  ,  0.19232  ,  0.19562  ,
        0.0044282,  0.40701  , -0.3812   ,  0.51286  ,  0.14278  ,
       -1.4652   , -0.55194  , -0.8362   ,  0.41877  ,  0.40793  ,
        2.5335   , -0.20787  ,  0.088761 , -0.10973  ,  0.52074  ,
        0.026259 , -0.65515  , -0.1425   ,  0.44071  , -0.047796 ,
       -0.23276  ,  0.17212  , -0.16844  ,  0.60122  ,  0.47156  ,
        0.11336  , -0.96031  , -1.3412   , -0.021467 ,  0.049217 ],
      dtype=float32)

In [63]:
cosine_similarity([embeddings_index["family"]], [embeddings_index["rich"]])

array([[0.47507977]], dtype=float32)

In [64]:
cosine_similarity([embeddings_index["family"]], [embeddings_index["elegant"]])

array([[0.26872295]], dtype=float32)

In [123]:
embeddings_index["family"]

array([ 4.2179e-01, -9.6730e-02,  1.0657e-01, -2.1117e-01, -5.4202e-01,
        8.8692e-01, -1.4038e-01, -1.0424e-01, -1.6009e-01,  1.5360e-01,
       -3.7699e-01,  4.5063e-02,  4.4316e-01,  3.9670e-01, -5.7958e-01,
       -3.5208e-01,  5.2960e-01, -4.3271e-01, -2.1603e-01,  1.0731e+00,
       -3.3560e-01,  6.7252e-02,  5.7345e-01,  3.5972e-01,  5.6134e-01,
       -3.1222e-01, -5.8412e-01, -2.2302e-01,  8.1725e-02,  5.4772e-01,
        3.5482e-01,  8.9450e-01,  6.9674e-01,  6.2971e-02,  1.7604e-01,
        6.7448e-01,  5.9729e-01,  4.9058e-01,  5.2370e-01, -7.8386e-02,
       -5.2658e-01, -5.6530e-01,  4.5006e-01, -7.5849e-01, -5.2401e-02,
        5.1847e-02, -3.2363e-01,  7.1197e-01,  5.8238e-01, -7.0496e-01,
       -1.9225e-01, -1.0275e+00,  8.8209e-01,  6.8192e-01, -7.0748e-02,
       -1.7905e+00, -8.8179e-01, -8.5265e-01,  1.3588e+00,  1.0301e+00,
        2.6106e-01,  8.9355e-01,  4.3638e-01, -5.8021e-01,  1.3251e+00,
       -5.7793e-01, -1.4836e-01, -9.2882e-02,  3.8736e-01,  2.26

In [125]:
embeddings_index["elegant"]

array([-1.6793e-01,  4.1604e-01,  1.5100e-01,  7.7931e-01,  1.4573e-01,
        1.0088e+00,  1.3422e-01,  4.1758e-01, -4.8192e-01,  1.0052e+00,
       -2.1608e-02, -3.7152e-01, -8.5531e-04,  2.1093e-02, -9.7570e-02,
        4.2913e-01, -3.5820e-02,  8.9167e-02,  4.2993e-01, -2.3457e-01,
        1.6009e-01,  1.7447e-01,  8.4783e-02, -1.0514e+00,  7.6653e-01,
        6.4205e-02, -6.7817e-01, -6.2814e-01, -9.7590e-01, -7.7208e-01,
       -1.9586e-01, -1.9140e-01, -3.2568e-02, -8.3018e-01,  6.9522e-01,
        7.4564e-01,  1.2142e-01, -1.8777e-01,  2.1688e-01, -2.3975e-01,
        7.4525e-01, -5.5891e-01,  4.6432e-02,  1.3116e-01,  2.0289e-01,
        2.3692e-01,  3.2467e-01,  3.7572e-01,  5.1194e-01,  2.5578e-01,
        1.5884e-01, -3.3892e-01,  7.3877e-01, -1.2824e-02, -5.8199e-01,
       -1.6869e+00, -7.7130e-02,  5.3444e-01,  2.8601e-01, -4.3873e-01,
       -5.4135e-01,  5.5921e-01, -1.1435e-02,  7.0715e-02,  1.1941e-01,
       -9.1085e-01,  7.2305e-01, -2.7237e-01, -3.4374e-01, -9.26

In [86]:
for o in converted :
    d = cosine_similarity([embeddings_index['family']], [embeddings_index[o]])
    print(f"family with {o} : {d}")

family with rich : [[0.47507977]]
family with complex : [[0.46088707]]
family with elegant : [[0.26872295]]
family with depth : [[0.14464968]]
family with weight : [[0.32998246]]
family with closed : [[0.3241926]]
family with heavy : [[0.27543324]]
family with lush : [[0.14895333]]
family with chunky : [[0.02663029]]
family with thick : [[0.18726459]]
family with plump : [[0.11496408]]
family with robust : [[0.14012983]]
family with length : [[0.25455943]]
family with hearty : [[0.1743893]]
family with extracted : [[0.10080183]]
family with succulent : [[0.06887427]]
family with opulent : [[0.21488434]]
family with modest : [[0.3449059]]
family with syrupy : [[-0.13621171]]
family with linear : [[0.12678994]]
family with lengthy : [[0.27487922]]
family with refined : [[0.09338707]]
family with finessed : [[-0.21288514]]
family with viscous : [[-0.22532278]]
family with luxurious : [[0.27989385]]
family with lavish : [[0.3526376]]
family with expansive : [[0.10436751]]
family with light

In [87]:
for o in converted :
    d = cosine_similarity([embeddings_index['friends']], [embeddings_index[o]])
    print(f"family with {o} : {d}")

family with rich : [[0.46321967]]
family with complex : [[0.2624081]]
family with elegant : [[0.26146123]]
family with depth : [[0.18123151]]
family with weight : [[0.2175516]]
family with closed : [[0.2932834]]
family with heavy : [[0.24971998]]
family with lush : [[0.16792077]]
family with chunky : [[0.03331318]]
family with thick : [[0.15859269]]
family with plump : [[0.11011384]]
family with robust : [[0.07588001]]
family with length : [[0.14459103]]
family with hearty : [[0.2713528]]
family with extracted : [[0.04199443]]
family with succulent : [[-0.00939421]]
family with opulent : [[0.09363578]]
family with modest : [[0.30699334]]
family with syrupy : [[-0.09060363]]
family with linear : [[-0.02842217]]
family with lengthy : [[0.2486445]]
family with refined : [[-0.01517897]]
family with finessed : [[-0.18907556]]
family with viscous : [[-0.23540196]]
family with luxurious : [[0.19619668]]
family with lavish : [[0.3266826]]
family with expansive : [[0.11522232]]
family with ligh

In [89]:
for o in converted :
    d = cosine_similarity([embeddings_index['date']], [embeddings_index[o]])
    print(f"family with {o} : {d}")

family with rich : [[0.2888705]]
family with complex : [[0.409241]]
family with elegant : [[0.19330315]]
family with depth : [[0.28186053]]
family with weight : [[0.2503259]]
family with closed : [[0.435784]]
family with heavy : [[0.30462268]]
family with lush : [[0.08298731]]
family with chunky : [[-0.02864331]]
family with thick : [[0.16291402]]
family with plump : [[-0.05090385]]
family with robust : [[0.22651698]]
family with length : [[0.40706185]]
family with hearty : [[-0.06709845]]
family with extracted : [[0.19778103]]
family with succulent : [[-0.05967436]]
family with opulent : [[0.05862183]]
family with modest : [[0.36467493]]
family with syrupy : [[-0.06443225]]
family with linear : [[0.17951953]]
family with lengthy : [[0.38440642]]
family with refined : [[0.17849568]]
family with finessed : [[-0.17516252]]
family with viscous : [[-0.19801751]]
family with luxurious : [[0.06104483]]
family with lavish : [[0.19866906]]
family with expansive : [[0.08733183]]
family with lig

In [90]:
for o in converted :
    d = cosine_similarity([embeddings_index['night']], [embeddings_index[o]])
    print(f"family with {o} : {d}")

family with rich : [[0.3296055]]
family with complex : [[0.37355775]]
family with elegant : [[0.30030328]]
family with depth : [[0.28916943]]
family with weight : [[0.31971863]]
family with closed : [[0.51743114]]
family with heavy : [[0.49723715]]
family with lush : [[0.22366719]]
family with chunky : [[-0.05952709]]
family with thick : [[0.32874507]]
family with plump : [[0.01656975]]
family with robust : [[0.14897218]]
family with length : [[0.3317667]]
family with hearty : [[0.18794224]]
family with extracted : [[-0.01324762]]
family with succulent : [[-0.05707498]]
family with opulent : [[0.18016753]]
family with modest : [[0.36168975]]
family with syrupy : [[-0.00078473]]
family with linear : [[0.05486108]]
family with lengthy : [[0.35122958]]
family with refined : [[0.01208579]]
family with finessed : [[-0.23385672]]
family with viscous : [[-0.10506747]]
family with luxurious : [[0.22616175]]
family with lavish : [[0.31975397]]
family with expansive : [[0.20554706]]
family with 

In [93]:
for o in converted :
    d = cosine_similarity([embeddings_index['alone']], [embeddings_index[o]])
    print(f"family with {o} : {d}")

family with rich : [[0.35291287]]
family with complex : [[0.37208286]]
family with elegant : [[0.22124778]]
family with depth : [[0.35031724]]
family with weight : [[0.47428137]]
family with closed : [[0.41900647]]
family with heavy : [[0.34644216]]
family with lush : [[0.16534069]]
family with chunky : [[-0.10497703]]
family with thick : [[0.25433883]]
family with plump : [[-0.01067879]]
family with robust : [[0.32851332]]
family with length : [[0.4187239]]
family with hearty : [[0.03325732]]
family with extracted : [[0.20163621]]
family with succulent : [[-0.1218478]]
family with opulent : [[0.08282035]]
family with modest : [[0.43873924]]
family with syrupy : [[-0.11749495]]
family with linear : [[0.12739816]]
family with lengthy : [[0.25454962]]
family with refined : [[0.1660002]]
family with finessed : [[-0.22594991]]
family with viscous : [[-0.16045167]]
family with luxurious : [[0.19885121]]
family with lavish : [[0.21245867]]
family with expansive : [[0.17148316]]
family with l

In [105]:
fwine = embeddings_index["family"]+embeddings_index["wine"]

In [111]:
frwine = embeddings_index["friends"]+embeddings_index["wine"]

In [119]:
sundayw = embeddings_index["sunday"] + embeddings_index["wine"] - embeddings_index["day"]

In [121]:
datew = embeddings_index["date"] + embeddings_index["wine"]

In [108]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_index.keys(), key=lambda word: spatial.distance.euclidean(embeddings_index[word], embedding))

In [126]:
print(find_closest_embeddings(fwine)[1:11])

['family', 'restaurant', 'coffee', 'food', 'beer', 'wines', 'taste', 'friends', 'own', 'whose']


In [113]:
print(find_closest_embeddings(frwine)[1:11])

['friends', 'coffee', 'dinner', 'drink', 'wines', 'beer', 'love', 'lovers', 'restaurant', 'good']


In [120]:
print(find_closest_embeddings(sundayw)[1:11])

['wines', 'tasting', 'winemakers', 'bordeaux', 'napa', 'champagne', 'grape', 'beer', 'restaurant', 'cognac']


In [122]:
print(find_closest_embeddings(datew)[1:11])

['date', 'dates', 'wines', 'this', 'place', 'year', 'although', 'beginning', 'same', 'until']


In [127]:
print(find_closest_embeddings(embeddings_index["family"])[1:100])

['father', 'mother', 'friends', 'whose', 'parents', 'lived', 'grandfather', 'families', 'grandmother', 'couple', 'relatives', 'husband', 'uncle', 'brother', 'life', 'home', 'wife', 'lives', 'siblings', 'sister', 'sons', 'grandparents', 'cousins', 'cousin', 'own', 'son', 'daughter', 'friend', 'daughters', 'once', 'brought', 'whom', 'living', 'learned', 'child', 'shared', 'parent', 'having', 'aunt', 'neighbors', 'nephew', 'same', 'widow', '’s', 'survived', 'present', 'beloved', 'deceased', 'elder', 'others', 'instance', 'childhood', 'loved', 'dad', 'brothers', 'hometown', 'latter', 'example', 'described', 'children', 'found', 'foster', 'one', 'fact', 'personal', 'inherited', 'later', 'though', 'where', 'turned', 'estranged', 'rest', 'neighbor', 'returned', 'addition', 'man', 'ancestors', 'well', 'belonged', 'raised', 'particular', 'wives', 'finds', 'part', 'surviving', 'name', 'although', 'stepfather', 'presumably', 'sisters', 'referred', 'instead', 'generations', 'recently', 'humble', '

In [156]:
A = np.array([embeddings_index["food"],embeddings_index["thing"],embeddings_index["evening"],embeddings_index["school"]])

In [160]:
B=np.array([embeddings_index["dinner"]])

In [157]:
A.shape

(4, 100)

In [161]:
B.shape

(1, 100)

In [165]:
x = np.linalg.inv(A)

LinAlgError: Last 2 dimensions of the array must be square

In [None]:
embeddings_index["dinner"]

In [None]:
embeddings_index["data"]

In [None]:
embeddings_index["thing"]

In [None]:
embeddings_index["school"]

In [None]:
embeddings_index["evening"]

In [141]:
embeddings_index["family"]

array([ 4.2179e-01, -9.6730e-02,  1.0657e-01, -2.1117e-01, -5.4202e-01,
        8.8692e-01, -1.4038e-01, -1.0424e-01, -1.6009e-01,  1.5360e-01,
       -3.7699e-01,  4.5063e-02,  4.4316e-01,  3.9670e-01, -5.7958e-01,
       -3.5208e-01,  5.2960e-01, -4.3271e-01, -2.1603e-01,  1.0731e+00,
       -3.3560e-01,  6.7252e-02,  5.7345e-01,  3.5972e-01,  5.6134e-01,
       -3.1222e-01, -5.8412e-01, -2.2302e-01,  8.1725e-02,  5.4772e-01,
        3.5482e-01,  8.9450e-01,  6.9674e-01,  6.2971e-02,  1.7604e-01,
        6.7448e-01,  5.9729e-01,  4.9058e-01,  5.2370e-01, -7.8386e-02,
       -5.2658e-01, -5.6530e-01,  4.5006e-01, -7.5849e-01, -5.2401e-02,
        5.1847e-02, -3.2363e-01,  7.1197e-01,  5.8238e-01, -7.0496e-01,
       -1.9225e-01, -1.0275e+00,  8.8209e-01,  6.8192e-01, -7.0748e-02,
       -1.7905e+00, -8.8179e-01, -8.5265e-01,  1.3588e+00,  1.0301e+00,
        2.6106e-01,  8.9355e-01,  4.3638e-01, -5.8021e-01,  1.3251e+00,
       -5.7793e-01, -1.4836e-01, -9.2882e-02,  3.8736e-01,  2.26

In [128]:
print(find_closest_embeddings(embeddings_index["dinner"])[1:100])

['breakfast', 'dinners', 'lunch', 'luncheon', 'guests', 'banquet', 'brunch', 'meal', 'buffet', 'evening', 'gala', 'wedding', 'meals', 'dining', 'thanksgiving', 'dined', 'toast', 'cocktail', 'weekend', 'restaurant', 'occasion', 'birthday', 'lunchtime', 'dine', 'dessert', 'eve', 'supper', 'friends', 'reception', 'dressing', 'night', 'lunches', 'fancy', 'diners', 'celebration', 'day', 'invitation', 'invited', 'bash', 'breakfasts', 'nights', 'afterward', 'hosted', 'ate', 'attending', 'table', 'banquets', 'soiree', 'menu', 'room', 'trip', 'meetings', 'guest', 'diner', 'gathering', 'favorite', 'barbecue', 'gift', 'farewell', 'couple', 'conversation', 'billed', 'patrons', 'arranged', 'weekends', 'festivities', 'tasting', 'prepared', 'usual', 'sitting', 'ceremony', 'met', 'gifts', 'valentine', 'reunion', 'inviting', 'arrange', 'attend', 'catered', 'celebrate', 'visiting', 'sandwiches', 'joked', 'lavish', 'toasts', 'instead', 'tonight', 'attendees', 'afternoon', 'morning', 'welcoming', 'trips',

In [130]:
print(find_closest_embeddings(embeddings_index["occasion"]+embeddings_index["wine"])[1:100])

['wines', 'champagne', 'tasting', 'occasion', 'drink', 'taste', 'beer', 'dinner', 'dessert', 'tea', 'coffee', 'celebrate', 'celebration', 'gift', 'wonderful', 'drinks', 'famous', 'bottle', 'great', 'celebrated', 'day', 'delicious', 'every', 'well', 'fine', 'meal', 'fruit', 'sparkling', 'holiday', 'grape', 'given', 'good', 'whole', 'place', 'perhaps', 'made', 'example', 'important', 'restaurant', 'always', 'festive', 'here', 'enjoy', 'come', 'guests', 'comes', 'this', 'best', 'same', 'visitors', 'today', 'prepared', 'instance', 'birthday', 'spirits', 'especially', 'passion', 'once', 'indeed', 'occasions', 'fact', 'toast', 'variety', 'even', 'though', 'nice', 'tradition', 'gifts', 'tasted', 'sometimes', 'excellent', 'christmas', 'chocolate', 'festival', 'welcome', 'finest', 'rare', 'particular', 'favorite', 'celebrating', 'book', 'thanksgiving', 'each', 'few', 'quite', 'tastes', 'although', 'flavor', 'ever', 'brought', 'usually', 'wedding', 'unusual', 'yet', 'popular', 'kind', 'dinners',

In [None]:
print(find_closest_embeddings(embeddings_index["friends"])[1:200])

In [133]:
print(find_closest_embeddings(embeddings_index["friends"])[1:200])

['friend', 'parents', 'loved', 'couple', 'strangers', 'others', 'whom', 'telling', 'colleagues', 'neighbors', 'tell', 'everyone', 'relatives', 'acquaintances', 'remember', 'folks', 'buddies', 'dad', 'wish', 'talk', 'wanted', 'lovers', 'learned', 'wives', 'wanting', 'talked', 'daughters', 'uncle', 'kids', 'find', 'come', 'love', 'talking', 'knew', 'mom', 'turned', 'siblings', 'guests', 'cousins', 'husband', 'happy', 'mother', 'why', 'family', 'sister', 'once', 'remembered', 'always', 'classmates', 'asked', 'grandmother', 'having', 'shared', 'liked', 'pals', 'asking', 'know', 'afterward', 'father', 'wonder', 'beloved', 'knowing', 'grandparents', 'brought', 'well', 'thought', 'sometimes', 'met', 'invited', 'instead', 'recalled', 'own', 'eager', 'lover', 'few', 'brother', 'companions', 'wishes', 'seeing', 'reminded', 'working', 'else', 'sons', 'gone', 'ones', 'looking', 'mrs.', "'d", 'leave', 'approached', 'hoping', 'even', 'whose', 'fact', 'mind', 'wife', 'supposed', 'besides', 'handful',

In [135]:
print(find_closest_embeddings(embeddings_index["love"])[1:200])

['passion', 'dream', 'wonder', 'mind', 'true', 'life', 'dreams', 'always', 'loves', 'kind', 'friends', 'remember', 'loved', 'happy', 'forget', 'fun', 'wish', 'crazy', 'me', 'feel', 'lover', 'wonderful', 'spirit', 'sort', 'soul', 'everyone', '”', 'like', 'happiness', 'thing', 'mom', 'joy', 'good', 'romantic', 'desire', 'know', 'luck', 'really', 'cry', 'sense', 'mother', 'imagine', 'nothing', 'everything', '`', 'something', 'tell', 'why', 'lovers', 'never', 'romance', 'thought', "'d", 'my', 'longing', 'dad', 'things', 'feelings', '“', 'kiss', 'couple', 'obsession', 'fact', 'friend', 'maybe', 'come', 'yes', '...', 'knowing', 'you', 'explains', 'feeling', 'else', 'inspired', '?', 'likes', 'loving', 'way', 'little', 'talk', 'heaven', 'affection', 'indeed', 'goes', 'grace', 'hope', 'inspiration', 'idea', 'strange', 'she', 'touch', 'great', 'stuff', 'man', 'everybody', 'stranger', 'forever', 'kid', 'gone', 'glory', 'girl', 'makes', 'hell', 'comes', 'tale', 'moment', 'thoughts', '…', 'well', '

In [136]:
print(find_closest_embeddings(embeddings_index["rich"])[1:100])

['especially', 'richer', 'particularly', 'vast', 'well', 'diverse', 'wealthy', 'perhaps', 'indeed', 'besides', 'wealth', 'natural', 'unlike', 'instance', 'generous', 'kind', 'example', 'once', 'though', 'fact', 'highly', 'brohd', 'concentrated', 'much', 'brooks', 'contrast', 'abundant', 'addition', 'equally', 'fertile', 'little', 'bring', 'moreover', 'brought', 'like', 'comes', 'whose', 'ones', 'country', 'even', 'giving', 'yet', 'big', 'sort', 'humble', 'mineral', 'ways', 'possibly', 'mostly', 'bringing', 'gives', 'naturally', 'middle', 'importantly', 'part', 'notably', 'good', 'attracted', 'very', 'mix', 'far', 'most', 'thanks', 'whereas', 'rest', 'fields', 'riches', 'although', 'boasts', 'promising', 'attractive', 'way', 'come', 'supposedly', 'unique', '__________________________________', 'places', 'whole', 'rather', 'poor', 'looking', 'typical', 'among', 'fabulous', 'otherwise', 'means', 'important', 'turned', 'deals', 'particular', 'similarly', 'except', 'noting', 'impoverished',

In [137]:
#load wine descriptor_mapping
map_location = '/Users/hyunoochang/code/chyunoo/wineteller/raw_data/descriptor_mapping.csv'

descriptor_mapping = pd.read_csv(map_location).set_index('raw descriptor')
descriptor_mapping.head()

Unnamed: 0_level_0,level_3,level_2,level_1
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abras,abrasive,high_tannin,tannin
acacia,acacia,flowery,flower
acacia_flower,acacia,flowery,flower
aciddriven,acid_driven,high_acid,acid
aggress,aggressive,high_acid,acid


In [140]:
len(list(descriptor_mapping.level_2))

1015

In [None]:
descriptor_list = ['body', 'complexity', 'finish']
filtered_descriptor_mapping = descriptor_mapping[descriptor_mapping['level_1'].isin(descriptor_list)]

In [166]:
for o in converted : 
    print(f" {o} find_closest_embeddings(embeddings_index[o])[1:100])

['especially', 'richer', 'particularly', 'vast', 'well', 'diverse', 'wealthy', 'perhaps', 'indeed', 'besides', 'wealth', 'natural', 'unlike', 'instance', 'generous', 'kind', 'example', 'once', 'though', 'fact', 'highly', 'brohd', 'concentrated', 'much', 'brooks', 'contrast', 'abundant', 'addition', 'equally', 'fertile', 'little', 'bring', 'moreover', 'brought', 'like', 'comes', 'whose', 'ones', 'country', 'even', 'giving', 'yet', 'big', 'sort', 'humble', 'mineral', 'ways', 'possibly', 'mostly', 'bringing', 'gives', 'naturally', 'middle', 'importantly', 'part', 'notably', 'good', 'attracted', 'very', 'mix', 'far', 'most', 'thanks', 'whereas', 'rest', 'fields', 'riches', 'although', 'boasts', 'promising', 'attractive', 'way', 'come', 'supposedly', 'unique', '__________________________________', 'places', 'whole', 'rather', 'poor', 'looking', 'typical', 'among', 'fabulous', 'otherwise', 'means', 'important', 'turned', 'deals', 'particular', 'similarly', 'except', 'noting', 'impoverished',

KeyboardInterrupt: 

In [163]:
print(find_closest_embeddings(embeddings_index["elegant"])[1:100])

['stylish', 'graceful', 'sleek', 'sumptuous', 'opulent', 'luxurious', 'charming', 'lovely', 'gorgeous', 'exquisite', 'stately', 'unpretentious', 'tasteful', 'elegance', 'ornate', 'classy', 'fashionable', 'whimsical', 'elegantly', 'splendid', 'spacious', 'rustic', 'minimalist', 'magnificent', 'unadorned', 'glamorous', 'decor', 'majestic', 'chic', 'colorful', 'fashioned', 'fancy', 'beautiful', 'understated', 'unassuming', 'polished', 'dazzling', 'evocative', 'quirky', 'delightful', 'seductive', 'demure', 'airy', 'genteel', 'enchanting', 'flashy', 'handsome', 'simple', 'old-fashioned', 'beguiling', 'nondescript', 'lively', 'sensuous', 'resplendent', 'neat', 'expansive', 'deceptively', 'shabby', 'homey', 'eclectic', 'sturdy', 'unremarkable', 'urbane', 'idiosyncratic', 'tidy', 'parisian', 'austere', 'wonderfully', 'exuberant', 'captivating', 'unfussy', 'exquisitely', 'amiable', 'beautifully', 'artful', 'distinctive', 'pleasing', 'oddly', 'cheerful', 'courtly', 'typical', 'brilliant', 'quain