In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from geopy import Nominatim


import os
import urllib.request
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
import spacy

from sklearn.cluster import KMeans


In [2]:
ratings = pd.read_csv('../data/chocolate_ratings.csv')
ratings.shape, ratings.columns

((2530, 10),
 Index(['REF', 'Company (Manufacturer)', 'Company Location', 'Review Date',
        'Country of Bean Origin', 'Specific Bean Origin or Bar Name',
        'Cocoa Percent', 'Ingredients', 'Most Memorable Characteristics',
        'Rating'],
       dtype='object'))

In [3]:
ratings['Cocoa Percent Int'] = ratings['Cocoa Percent'].str[:2].astype(int)

abbrev2ingredient = {'B':'Beans', 'S':'Sugar', 'S*': 'Sweetener other than white cane or beet sugar', 'C':'Cocoa Butter', 
                    'V': 'Vanilla', 'L': 'Lecithin', 'Sa': 'Salt', '':''}

ratings['Ingredients List'] = [[abbrev2ingredient[x] for x in i[1].strip().split(',')] for i in ratings['Ingredients'].fillna('0 - ').str.split('-')]

ratings['Most Memorable Characteristics List'] = ratings['Most Memorable Characteristics'].fillna('none,').str.split(',')

In [4]:
ratings.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Cocoa Percent Int,Ingredients List,Most Memorable Characteristics List
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,76,"[Beans, Sugar, Cocoa Butter]","[rich cocoa, fatty, bready]"
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, vegetal, savory]"
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, blackberry, full body]"
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.0,68,"[Beans, Sugar, Cocoa Butter]","[chewy, off, rubbery]"
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.0,72,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, moss, nutty, chalky]"


In [5]:
ratings.explode('Ingredients List')['Ingredients List'].unique()

array(['Beans', 'Sugar', 'Cocoa Butter', 'Lecithin', 'Vanilla', 'Salt',
       '', 'Sweetener other than white cane or beet sugar'], dtype=object)

In [6]:
flavours = list(ratings.explode('Most Memorable Characteristics List')['Most Memorable Characteristics List'].unique())
len(flavours)

1232

In [8]:
import spacy
nlp = spacy.load('en_core_web_md')

flavour_embds = [np.mean([token.vector for i,token in enumerate(nlp(doc)) if (token.pos_ in ["ADJ", "NOUN"]) or i == len(nlp(doc))-1], axis=0) for doc in tqdm(flavours)]

flavour_embds[0]

100%|██████████| 1232/1232 [00:24<00:00, 51.10it/s]


array([-1.54005006e-01, -1.20966502e-01,  1.83849990e-01, -1.02813005e-01,
       -4.07429993e-01,  3.45719993e-01,  1.65680006e-01,  1.55233502e-01,
       -1.33925512e-01,  1.16965997e+00, -5.84129989e-01, -8.59294981e-02,
       -2.24844992e-01,  1.75199956e-02,  1.12769991e-01, -3.26214999e-01,
       -7.49095008e-02,  1.09688497e+00,  2.94016480e-01, -4.75584984e-01,
       -2.67454982e-01, -1.00549981e-02,  8.45865011e-02,  3.93880010e-01,
       -5.78589998e-02, -4.90425020e-01,  2.91509986e-01,  4.49180007e-01,
        8.52850080e-02, -6.94689989e-01,  4.99188900e-06,  1.23570003e-01,
        1.85879007e-01, -4.71915007e-01,  1.16549999e-01, -2.26609007e-01,
       -3.55574995e-01, -2.99239993e-01, -1.54258996e-01, -1.09274998e-01,
        2.26435006e-01, -4.00165021e-01,  9.43975002e-02, -2.96508998e-01,
        7.54429996e-02,  4.18060005e-01,  1.15400031e-02, -8.18825066e-02,
        2.17345998e-01, -2.83675015e-01,  8.68194997e-02,  2.12384984e-01,
       -1.85939997e-01, -

In [9]:
flavour_embds = [f_emb if type(f_emb) == np.ndarray else nlp('none').vector for f_emb in flavour_embds]

In [10]:
kmeans = KMeans(n_clusters=200, random_state=0).fit(np.array(flavour_embds))

In [11]:
flavour_labels = kmeans.labels_

In [12]:
from sklearn.metrics import pairwise_distances_argmin_min
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, np.array(flavour_embds))
closest

array([  41,  477,  565,  414,  974,  101,   52,  525,  823,  199,    8,
          2,   43,  640,   21,   12,  572,  170, 1006,   76,  128,   42,
         19,  112,   25,   88,  227,  390,    3,   45,   57,   93,    4,
         31,   58,  242,   16,  435,  862,  237,  786,   15,    5,   39,
        447,  210,  436, 1228,  693,    6,   40,   32,  822,   49,   18,
         44,  251, 1034,  120,  694,   70,   59,  835,  276,   77,  129,
         84,  941,  938,    0,  158,  438,  295,  167,   83,  726,  195,
        100,   87,  658,  532,   27,  384, 1087,   69,   48,  252,   98,
         61,  774,  720,  102,   46,   85,  200,  118,  567,   89,  256,
         99,   10,  185,  229,  459,  161,  825,   68,  587,  193,  141,
        377,  179,  153,  172,  391,  336,  428,  325,   94,  513, 1050,
        107,  353,  207, 1195, 1008,  764,   74,  443,  563,  622,  512,
        783,  351,  105,  644,   86,  835,  280,  440,  181,  932,   55,
        602,  216,  612,  253,  713,  509,  762,   

In [13]:
centers = np.array(flavours)[closest]

centers

array(['oily', 'full flavor w/ dominant roast', 'sweet spice',
       ' hint of fruit', 'sour banana', ' ashey', ' mild choco',
       ' basic cocoa base', 'mild bitter and chemical off', 'dark berry',
       'chewy', ' bready', ' caramel', ' metal', 'fruity', ' earthy',
       ' odd rubber notes', ' burnt', ' slight woody', 'herbal',
       ' cinamon', ' nut', ' macadamia', 'grit', 'burnt rubber',
       ' metallic', ' black licorice', ' soil', 'cocoa', 'sweet',
       ' brownie', ' coffee', ' vegetal', ' grass', 'burnt wood', 'honey',
       'mildly bitter', ' muted', ' dirty/burnt edge', 'strawberries',
       ' cheesey', 'chalky', ' savory', 'sandy', ' slight cherry',
       'smooth', ' grain', ' intense blueberry', 'slow develop',
       ' blackberry', ' fig', 'mild tobacco', 'cooling sensation',
       'sliglty dry', 'milk brownie', ' raspberry', ' ham',
       'bright red fruit', ' mint', ' mild fatty', 'light color',
       ' choco', 'mild vanilla and mild fruit', 'pastey', ' m

In [14]:
data = pd.DataFrame({'flavour':flavours, 'embedding':flavour_embds, 'group':flavour_labels, 'group_name':[centers[f_label] for f_label in flavour_labels]})

data.sample(10)

Unnamed: 0,flavour,embedding,group,group_name
1010,off sour,"[-0.58668, 0.22827, 0.15329, 0.067326, 0.02935...",180,sour
995,punchy,"[-0.15457, 0.66251, -0.37018, -0.34796, 0.2264...",162,slight hammy
732,rustic,"[0.3056, 0.32791, -0.57033, -0.13677, 0.038429...",72,bitter then nutty rustic
646,too few nibs,"[0.1253095, -0.1314215, -0.20518002, -0.096123...",7,basic cocoa base
165,bright red,"[0.05549501, -0.02578, -0.1683025, -0.119295, ...",57,bright red fruit
1226,masculine,"[-0.14235, -0.31078, -0.66529, 0.28438, 0.6970...",119,harsh aroma
1111,cigar,"[0.28597, -0.38349, 0.11914, -0.109, -0.044704...",51,mild tobacco
1133,chocolate ice cream,"[0.12965533, -0.13324334, 0.21693964, -0.20207...",61,choco
471,intense orange,"[-0.34010547, 0.112114504, 0.015213, -0.006970...",47,intense blueberry
998,sour banana,"[-0.1922, 0.075826, 0.261805, 0.0500855, -0.19...",4,sour banana


In [32]:
centers = []
for group, flavour in data.groupby('group')['flavour']:
    centers.append(list(flavour)[np.argmin(np.array(flavour.str.len().values))].strip())

centers

['oily',
 'classic',
 'spice',
 'ripe',
 'banana',
 'wtf',
 'mild tart',
 'deep cocoa',
 'mild off',
 'red berry',
 'chewy',
 'malt',
 'tart',
 'metal',
 'spicy',
 'smokey',
 'odd',
 'burnt',
 'dark woody',
 'herbs',
 'anise',
 'nut',
 'almond',
 'grit',
 'rubber',
 'metallic',
 'black tea',
 'soil',
 'cocoa',
 'sweet',
 'fudgey',
 'coffee',
 'woody',
 'grass',
 'wood',
 'honey',
 'bitter',
 'muted',
 'deep',
 'cherries',
 'cheesy',
 'chalky',
 'meaty',
 'sandy',
 'deep cherry',
 'silky',
 'grain',
 'intense tart',
 'slow',
 'apple',
 'fig',
 'cigar',
 'cooling sensation',
 'dry',
 'pudding',
 'licorice',
 'ham',
 'green',
 'mint',
 'fat',
 'bright',
 'choco',
 'mild ham',
 'pastey',
 'molasses',
 'strong',
 'floral',
 'smoke',
 'thick',
 'rich cocoa',
 'nuts',
 'sharp',
 'rustic',
 'grainy',
 'orange',
 'accesible',
 'subtle',
 'dirt',
 'oats',
 'plums',
 'salt',
 'sticky',
 'fuel',
 'tart fruit',
 'dairy',
 'leather',
 'dull',
 'hay',
 'acidic',
 'palm',
 'undefined',
 'burlap',
 'pl

In [34]:
data['group_name'] = data['group'].apply(lambda x: centers[x])

In [35]:
for g, d in data.groupby('group_name'):
    print(g, '\n', d['flavour'].sample(3, replace=True))

 
 516    high intensity bitter
516    high intensity bitter
636                         
Name: flavour, dtype: object
accesible 
 726    accessible
726    accessible
920     accesible
Name: flavour, dtype: object
acidic 
 61     acidic
61     acidic
61     acidic
Name: flavour, dtype: object
almond 
 475       hazelnut
1026     pistachio
475       hazelnut
Name: flavour, dtype: object
alocohol 
 356     alocohol
356     alocohol
356     alocohol
Name: flavour, dtype: object
ambiguous 
 727     ambiguous
727     ambiguous
727     ambiguous
Name: flavour, dtype: object
anise 
 233      nutmeg
788    cardamon
128     cinamon
Name: flavour, dtype: object
apple 
 1113         apple
926          apple
591     blackberry
Name: flavour, dtype: object
baked 
 1181          baked
336           bread
717     baked bread
Name: flavour, dtype: object
balanced 
 1149     well balanced
1149     well balanced
851      well balanced
Name: flavour, dtype: object
banana 
 998     sour banana
681      gr

In [100]:
new_names = ['nuts', 'other', 'burnt', 'caramel, marshmellow, nutella', 'nuts', 'cardboard', 'roasty, bitter', 'tobacco', 'berries', 'tea', 'fruit', 'cocoa', 'spicy, nutty', 'roast', 'herbal', 'mild', 'gummy', 'berries', 'sweet']


In [36]:
flavour2group = {}

for row in data.iterrows():
    flavour2group[row[1]['flavour']] = row[1]['group_name']


In [38]:
group2emb = {}

for cent in centers:
    group2emb[cent] = nlp(cent).vector

In [40]:
ratings['New Flavours'] = [[flavour2group[flav] for flav in flavors] for flavors in ratings['Most Memorable Characteristics List']]

In [43]:
ratings['Flavours Vectors'] = [np.mean(np.array([group2emb[flav.strip()] for flav in flavors]), axis=0) for flavors in ratings['New Flavours']]

In [44]:
ratings

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Cocoa Percent Int,Ingredients List,Most Memorable Characteristics List,New Flavours,Flavours Vectors
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,76,"[Beans, Sugar, Cocoa Butter]","[rich cocoa, fatty, bready]","[rich cocoa, fat, malt]","[-0.435315, 0.15850784, -0.04282667, 0.1234329..."
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, vegetal, savory]","[cocoa, woody, meaty]","[-0.22953333, 0.136224, -0.11319, -0.09985, -0..."
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, blackberry, full body]","[cocoa, apple, ]","[-0.068569995, 0.12159232, -0.01899, -0.137396..."
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.00,68,"[Beans, Sugar, Cocoa Butter]","[chewy, off, rubbery]","[chewy, off, rubbery]","[0.28423834, -0.035696674, -0.14546233, 0.2694..."
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.00,72,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, moss, nutty, chalky]","[fat, smokey, moss, chewy, chalky]","[0.084489994, 0.30870998, -0.25795, 0.19503121..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75,80,"[Beans, Sweetener other than white cane or bee...","[waxy, cloying, vegetal]","[oily, sugary, woody]","[-0.08211667, 0.013276677, -0.28126234, 0.0032..."
2526,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75,75,"[Beans, Sugar, Cocoa Butter]","[strong nutty, marshmallow]","[strong, choco]","[-0.100362, 0.275335, 0.013254993, 0.009725004..."
2527,2036,Zotter,Austria,2018,Blend,"Dry Aged, 30 yr Anniversary bar",75%,"3- B,S,C","fatty, earthy, cocoa",3.00,75,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, cocoa]","[fat, smokey, cocoa]","[-0.19786, 0.27682897, -0.11371335, 0.14996, -..."
2528,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25,70,"[Beans, Sugar, Cocoa Butter]","[fatty, mild nuts, mild fruit]","[fat, mild ham, mild ham]","[-0.62492675, 0.14060333, 0.06380334, 0.562856..."


In [45]:
ratings.to_json('postprocessing_form.json')

In [46]:
ratings.T.to_json('postprocessing_form_t.json')