In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from geopy import Nominatim



import os
import urllib.request
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
import spacy

from sklearn.cluster import KMeans


In [2]:
ratings = pd.read_csv('../data/chocolate_ratings.csv')
ratings.shape, ratings.columns

((2530, 10),
 Index(['REF', 'Company (Manufacturer)', 'Company Location', 'Review Date',
        'Country of Bean Origin', 'Specific Bean Origin or Bar Name',
        'Cocoa Percent', 'Ingredients', 'Most Memorable Characteristics',
        'Rating'],
       dtype='object'))

In [3]:
ratings['Cocoa Percent Int'] = ratings['Cocoa Percent'].str[:2].astype(int)

abbrev2ingredient = {'B':'Beans', 'S':'Sugar', 'S*': 'Sweetener other than white cane or beet sugar', 'C':'Cocoa Butter', 
                    'V': 'Vanilla', 'L': 'Lecithin', 'Sa': 'Salt', '':''}

ratings['Ingredients List'] = [[abbrev2ingredient[x] for x in i[1].strip().split(',')] for i in ratings['Ingredients'].fillna('0 - ').str.split('-')]

ratings['Most Memorable Characteristics List'] = ratings['Most Memorable Characteristics'].fillna('none,').str.split(',')

In [4]:
ratings.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Cocoa Percent Int,Ingredients List,Most Memorable Characteristics List
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,76,"[Beans, Sugar, Cocoa Butter]","[rich cocoa, fatty, bready]"
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, vegetal, savory]"
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, blackberry, full body]"
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.0,68,"[Beans, Sugar, Cocoa Butter]","[chewy, off, rubbery]"
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.0,72,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, moss, nutty, chalky]"


In [5]:
ratings.explode('Ingredients List')['Ingredients List'].unique()

array(['Beans', 'Sugar', 'Cocoa Butter', 'Lecithin', 'Vanilla', 'Salt',
       '', 'Sweetener other than white cane or beet sugar'], dtype=object)

In [6]:
flavours = list(ratings.explode('Most Memorable Characteristics List')['Most Memorable Characteristics List'].unique())
len(flavours)

1232

In [7]:
import spacy
nlp = spacy.load('en_core_web_md')

flavour_embds = [np.mean([token.vector for i,token in enumerate(nlp(doc)) if (token.pos_ in ["ADJ", "NOUN"]) or i == len(nlp(doc))-1], axis=0) for doc in tqdm(flavours)]

flavour_embds[0]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 1232/1232 [00:33<00:00, 36.25it/s]


array([-1.54005006e-01, -1.20966502e-01,  1.83849990e-01, -1.02813005e-01,
       -4.07429993e-01,  3.45719993e-01,  1.65680006e-01,  1.55233502e-01,
       -1.33925512e-01,  1.16965997e+00, -5.84129989e-01, -8.59294981e-02,
       -2.24844992e-01,  1.75199956e-02,  1.12769991e-01, -3.26214999e-01,
       -7.49095008e-02,  1.09688497e+00,  2.94016480e-01, -4.75584984e-01,
       -2.67454982e-01, -1.00549981e-02,  8.45865011e-02,  3.93880010e-01,
       -5.78589998e-02, -4.90425020e-01,  2.91509986e-01,  4.49180007e-01,
        8.52850080e-02, -6.94689989e-01,  4.99188900e-06,  1.23570003e-01,
        1.85879007e-01, -4.71915007e-01,  1.16549999e-01, -2.26609007e-01,
       -3.55574995e-01, -2.99239993e-01, -1.54258996e-01, -1.09274998e-01,
        2.26435006e-01, -4.00165021e-01,  9.43975002e-02, -2.96508998e-01,
        7.54429996e-02,  4.18060005e-01,  1.15400031e-02, -8.18825066e-02,
        2.17345998e-01, -2.83675015e-01,  8.68194997e-02,  2.12384984e-01,
       -1.85939997e-01, -

In [8]:
flavour_embds = [f_emb if type(f_emb) == np.ndarray else nlp('none').vector for f_emb in flavour_embds]

In [10]:
kmeans = KMeans(n_clusters=1000, random_state=0).fit(np.array(flavour_embds))

  kmeans = KMeans(n_clusters=1000, random_state=0).fit(np.array(flavour_embds))


In [11]:
flavour_labels = kmeans.labels_

In [12]:
from sklearn.metrics import pairwise_distances_argmin_min
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, np.array(flavour_embds))
closest

array([  77,  965, 1079,  317, 1100,   27,  329,  207,  647,  181, 1100,
       1100,  987,  822,   76,   64,  319,  171,  153,   12,   43,  127,
        193,  334,  321, 1100,  249,  336,  993,  337,  338,  203,  187,
        339,   40,    9,  941,   43,  322,   61,  323,  341,  899,    2,
       1100,  355,  612,  252,   31,  881,  281,  112,  438,  241,  185,
         15, 1100,   83,  343,   41,  344, 1100,  949,    1, 1100,  158,
        200,  857,   25,  179,   32,  120,  239,   27, 1021,  161,  325,
        651, 1100,  326,   41,  347,  348,  225,   48,   98,  216,  195,
        100,  350,  351, 1100, 1100,   33,  207,  640,  764,  276,   46,
         75, 1100,  544,  384,  129,  353,  153,  626, 1048,  391,  459,
       1100, 1100,   70,  242,  167,  192, 1173,   72,  355,  721,  622,
       1008,  107,    5,  306, 1169, 1100,   10,  773,  102,  356,  172,
        366,  357, 1094,  406,  195,  249,   12,  932,  520,  720,  443,
       1050,  193,  303,  602,   94,  299, 1100,  4

In [13]:
centers = np.array(flavours)[closest]

centers

array([' molasses', ' sour ending', ' sweet spices', ' muted spice',
       'full cocoa flavor', 'sticky', ' dominant tobacco', 'long lasting',
       'mild licorice', ' mild earthy', 'full cocoa flavor',
       'full cocoa flavor', ' slight ashy', 'cooling sensation', 'herbal',
       ' spice', ' crisp nibs', 'light roast', 'creamy', ' earthy',
       ' caramel', ' red berry', 'cashew', ' tang',
       'honey with subtle tobacco', 'full cocoa flavor', ' flat',
       ' bread', 'intense spice', ' intense cocoa', ' sl. Burnt',
       'sour fruit', ' peanut butter', ' possible mold', ' fig', ' off',
       ' intense smoke', ' caramel', ' tart red berry', ' acidic',
       'powerful', ' grape', ' herb', ' bready', 'full cocoa flavor',
       ' rich mocha', ' strong floral', 'bland', ' grass',
       ' strong chemical', 'gummy', 'grit', ' sweet edge', 'off aroma',
       ' rum', 'chalky', 'full cocoa flavor', 'orange', ' mild mint',
       'oily', ' gentle roast', 'full cocoa flavor', 'dar

In [14]:
data = pd.DataFrame({'flavour':flavours, 'embedding':flavour_embds, 'group':flavour_labels, 'group_name':[centers[f_label] for f_label in flavour_labels]})

data.sample(10)

Unnamed: 0,flavour,embedding,group,group_name
956,oranges,"[-0.56331, 0.61604, 0.12365, -0.53078, -0.1821...",400,plums
1081,fat residue,"[-0.5002775, 0.159138, -0.188565, 0.2786653, -...",248,fat residue
1116,fudgey then spicy,"[-0.50711, 0.081616, -0.10346, 0.47979, 0.3700...",434,fruity
479,cigarette butt,"[-0.15593001, -0.2671405, -0.170205, 0.2887849...",965,cigarette butt
1205,mellow fruit,"[-0.09779, 0.469435, -0.080235004, -0.460865, ...",525,mellow fruit
794,ashey,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",769,ashey
82,spicy and candy-like,"[-0.18517, -0.021926334, -0.080586664, 0.08582...",641,spicy and candy-like
964,melon notes,"[-0.44882, 0.155455, 0.236245, -0.002700001, -...",560,melon notes
457,atypical,"[-0.002184, 0.24405, -0.098245, -0.063107, -0....",922,atypical
929,rum,"[0.12628, -0.21481, 0.4965, -0.39865, -0.27106...",54,rum


In [19]:
centers = {}
for group, flavour in data.groupby('group')['flavour']:
    centers[group] =list(flavour)[np.argmin(np.array(flavour.str.len().values))].strip()

centers, len(centers)

({0: 'molasses',
  1: 'sour ending',
  2: 'sweet spices',
  3: 'muted spice',
  4: 'full cocoa flavor',
  5: 'sticky',
  6: 'dominant tobacco',
  8: 'mild licorice',
  9: 'mild earthy',
  12: 'slight ashy',
  13: 'cooling sensation',
  14: 'herbs',
  15: 'spice',
  16: 'crisp nibs',
  17: 'light roast',
  18: 'creamy',
  19: 'smokey',
  21: 'red berry',
  22: 'cashew',
  23: 'tang',
  24: 'honey with subtle tobacco',
  28: 'intense spice',
  30: 'sl. Burnt',
  31: 'sour fig',
  32: 'peanut butter',
  33: 'possible mold',
  34: 'fig',
  35: 'off',
  36: 'intense smoke',
  37: 'tart',
  38: 'tart red berry',
  39: 'acidic',
  40: 'powerful',
  41: 'grape',
  42: 'herb',
  45: 'rich mocha',
  46: 'strong floral',
  47: 'bland',
  48: 'grass',
  49: 'strong chemical',
  50: 'gummy',
  51: 'grit',
  52: 'sweet edge',
  53: 'off aroma',
  54: 'rum',
  55: 'chalky',
  57: 'orange',
  58: 'mild mint',
  59: 'oily',
  60: 'gentle roast',
  62: 'dark fruit',
  63: 'fatty',
  65: 'nuts',
  66: 'b

In [20]:
data['group_name'] = data['group'].apply(lambda x: centers[x])

In [21]:
for g, d in data.groupby('group_name'):
    print(g, '\n', d['flavour'].sample(3, replace=True))

 
 636    
636    
636    
Name: flavour, dtype: object
Cadbury egg 
 1229     Cadbury egg
1229     Cadbury egg
1229     Cadbury egg
Name: flavour, dtype: object
Easter candy 
 891     easter candy
891     easter candy
891     easter candy
Name: flavour, dtype: object
accesible 
 726    accessible
920     accesible
920     accesible
Name: flavour, dtype: object
acidic 
 61      acidic
328     acidic
61      acidic
Name: flavour, dtype: object
alcohol 
 268     alcohol
268     alcohol
268     alcohol
Name: flavour, dtype: object
alkalyzed notes 
 26    alkalyzed notes
26    alkalyzed notes
26    alkalyzed notes
Name: flavour, dtype: object
alluring aroma 
 67    alluring aroma
67    alluring aroma
67    alluring aroma
Name: flavour, dtype: object
almond 
 19       macadamia
1026     pistachio
1026     pistachio
Name: flavour, dtype: object
almond butter 
 630     almond butter
630     almond butter
630     almond butter
Name: flavour, dtype: object
alocohol 
 356     alocohol
356     al

In [None]:
#new_names = ['nuts', 'other', 'burnt', 'caramel, marshmellow, nutella', 'nuts', 'cardboard', 'roasty, bitter', 'tobacco', 'berries', 'tea', 'fruit', 'cocoa', 'spicy, nutty', 'roast', 'herbal', 'mild', 'gummy', 'berries', 'sweet']


In [22]:
flavour2group = {}

for row in data.iterrows():
    flavour2group[row[1]['flavour']] = row[1]['group_name']


In [24]:
group2emb = {}

for cent in centers.values():
    group2emb[cent] = nlp(cent).vector

In [25]:
ratings['New Flavours'] = [[flavour2group[flav] for flav in flavors] for flavors in ratings['Most Memorable Characteristics List']]

In [26]:
ratings['Flavours Vectors'] = [np.mean(np.array([group2emb[flav.strip()] for flav in flavors]), axis=0) for flavors in ratings['New Flavours']]

In [27]:
ratings

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Cocoa Percent Int,Ingredients List,Most Memorable Characteristics List,New Flavours,Flavours Vectors
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,76,"[Beans, Sugar, Cocoa Butter]","[rich cocoa, fatty, bready]","[rich cocoa, fatty, malt]","[-0.33688498, -0.0722355, -0.016420001, 0.1627..."
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, vegetal, savory]","[cocoa, woody, savory]","[-0.0789, 0.092045665, -0.013883342, -0.163613..."
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, blackberry, full body]","[cocoa, apple, full body]","[-0.031931832, 0.10720399, -0.018783333, 0.027..."
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.00,68,"[Beans, Sugar, Cocoa Butter]","[chewy, off, rubbery]","[chewy, off, rubbery]","[0.28423834, -0.035696674, -0.14546233, 0.2694..."
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.00,72,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, moss, nutty, chalky]","[fatty, smokey, moss, chewy, chalky]","[0.14354801, 0.170264, -0.242106, 0.2186352, -..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75,80,"[Beans, Sweetener other than white cane or bee...","[waxy, cloying, vegetal]","[oily, sugary, woody]","[-0.08211667, 0.013276677, -0.28126234, 0.0032..."
2526,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75,75,"[Beans, Sugar, Cocoa Butter]","[strong nutty, marshmallow]","[strong nutty, choco]","[0.120533004, 0.24399251, 0.040894993, -0.0264..."
2527,2036,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75%,"3- B,S,C","fatty, earthy, cocoa",3.00,75,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, cocoa]","[fatty, smokey, cocoa]","[-0.099429995, 0.04608567, -0.08730668, 0.1893..."
2528,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25,70,"[Beans, Sugar, Cocoa Butter]","[fatty, mild nuts, mild fruit]","[fatty, mild nuts, mild fruit]","[-0.3228815, 0.08561, -0.039761666, 0.17633168..."


In [28]:
ratings.to_json('postprocessing_form.json')

In [29]:
ratings.T.to_json('postprocessing_form_t.json')

In [30]:
pd.DataFrame({'flavour':group2emb.keys(), 'vector':group2emb.values()}).set_index('flavour').T.to_json('group2emb.json')