# 1. Load and clean data
Run the section below to load and clean the dataset. You do not need to understand the code. The code displays data for the first few chocolates.

In [1]:
# Run to load and clean the dataset
%reset -f
# from __future__ import print_function

import math
import numpy as np
import numpy.linalg as nla
import pandas as pd
import tensorflow as tf
import re
import six
from os.path import join
from matplotlib import pyplot as plt

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Set the output display to have one digit for decimal places and limit it to
# printing 15 rows.
pd.options.display.max_rows =  15
pd.options.display.float_format = '{:.2f}'.format

In [3]:
choc_data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/flavors_of_cacao.csv", sep=",", encoding='latin-1')

In [4]:
# rename the columns.
choc_data.columns = ['maker', 'specific_origin', 'reference_number', 'review_date', 'cocoa_percent', 'maker_location', 'rating', 'bean_type', 'broad_origin']

In [5]:
choc_data.dtypes

maker                object
specific_origin      object
reference_number      int64
review_date           int64
cocoa_percent        object
maker_location       object
rating              float64
bean_type            object
broad_origin         object
dtype: object

In [6]:
choc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   maker             1795 non-null   object 
 1   specific_origin   1795 non-null   object 
 2   reference_number  1795 non-null   int64  
 3   review_date       1795 non-null   int64  
 4   cocoa_percent     1795 non-null   object 
 5   maker_location    1795 non-null   object 
 6   rating            1795 non-null   float64
 7   bean_type         907 non-null    object 
 8   broad_origin      1721 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 126.3+ KB


In [7]:
# Replace empty/null values with "Blend"
choc_data['bean_type'] = choc_data['bean_type'].fillna('Blend')

In [8]:
# Cast bean_type to string to remove leading 'u'
choc_data['bean_type'] = choc_data['bean_type'].astype(str)

In [9]:
choc_data['cocoa_percent'] = choc_data['cocoa_percent'].str.strip('%')

In [10]:
choc_data['cocoa_percent'] = pd.to_numeric(choc_data['cocoa_percent'])

In [11]:
choc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   maker             1795 non-null   object 
 1   specific_origin   1795 non-null   object 
 2   reference_number  1795 non-null   int64  
 3   review_date       1795 non-null   int64  
 4   cocoa_percent     1795 non-null   float64
 5   maker_location    1795 non-null   object 
 6   rating            1795 non-null   float64
 7   bean_type         1795 non-null   object 
 8   broad_origin      1721 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 126.3+ KB


In [12]:
choc_data[choc_data['maker_location'].isin(['Domincan Republic'])]

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
883,Kah Kow,"Rizek Cacao, Cibao Valley, Domin. Rep.",1061,2013,70.0,Domincan Republic,3.5,Blend,Dominican Republic
884,Kah Kow,"Rizek Cacao, Domin. Rep.",1069,2013,82.0,Domincan Republic,3.0,Blend,Dominican Republic
885,Kah Kow,"Rizek Cacao, Domin. Rep.",1069,2013,55.0,Domincan Republic,3.25,Blend,Dominican Republic
886,Kah Kow,"Rizek Cacao, Domin. Rep.",1073,2013,62.0,Domincan Republic,3.25,Blend,Dominican Republic
1758,Xocolat,Hispaniola,1057,2013,66.0,Domincan Republic,3.0,Blend,Dominican Republic


In [13]:
choc_data[choc_data['maker_location'].isin(['Domincan Republic','Amsterdam', 'U.K.', 'Niacragua'])]

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
121,Artisan du Chocolat,"Trinidad, Heritage, Limited ed.",1193,2013,72.00,U.K.,3.25,Trinitario,Trinidad
122,Artisan du Chocolat,"Colombia, Casa Luker",947,2012,72.00,U.K.,3.75,Blend,Colombia
123,Artisan du Chocolat,Haiti,729,2011,72.00,U.K.,4.00,Blend,Haiti
124,Artisan du Chocolat,Panama,745,2011,72.00,U.K.,2.75,Blend,Panama
125,Artisan du Chocolat,Venezuela,486,2010,100.00,U.K.,1.75,Blend,Venezuela
...,...,...,...,...,...,...,...,...,...
1743,Willie's Cacao,Hacienda Las Trincheras,593,2010,72.00,U.K.,3.50,Blend,Venezuela
1744,Willie's Cacao,Java,593,2010,69.00,U.K.,3.75,Blend,Indonesia
1745,Willie's Cacao,San Martin,457,2009,70.00,U.K.,3.00,Blend,Peru
1746,Willie's Cacao,Rio Caribe,457,2009,72.00,U.K.,3.25,Trinitario,Venezuela


In [14]:
# Correct spelling mistakes, and replace city with country name
choc_data['maker_location'] = choc_data['maker_location']\
.str.replace('Amsterdam', 'Holland', regex=True)\
.str.replace('U.K.', 'England', regex=True)\
.str.replace('Niacragua', 'Nicaragua', regex=True)\
.str.replace('Domincan Republic', 'Dominican Republic', regex=True)

In [15]:
# Adding this so that Holland and Netherlands map to the same country.
choc_data['maker_location'] = choc_data['maker_location']\
.str.replace('Holland', 'Netherlands')

In [16]:
def cleanup_spelling_abbrev(text):
    replacements = [
        ['-', ', '], ['/ ', ', '], ['/', ', '], ['\(', ', '], [' and', ', '], [' &', ', '], ['\)', ''],
        ['Dom Rep|DR|Domin Rep|Dominican Rep,|Domincan Republic', 'Dominican Republic'],
        ['Mad,|Mad$', 'Madagascar, '],
        ['PNG', 'Papua New Guinea, '],
        ['Guat,|Guat$', 'Guatemala, '],
        ['Ven,|Ven$|Venez,|Venez$', 'Venezuela, '],
        ['Ecu,|Ecu$|Ecuad,|Ecuad$', 'Ecuador, '],
        ['Nic,|Nic$', 'Nicaragua, '],
        ['Cost Rica', 'Costa Rica'],
        ['Mex,|Mex$', 'Mexico, '],
        ['Jam,|Jam$', 'Jamaica, '],
        ['Haw,|Haw$', 'Hawaii, '],
        ['Gre,|Gre$', 'Grenada, '],
        ['Tri,|Tri$', 'Trinidad, '],
        ['C Am', 'Central America'],
        ['S America', 'South America'],
        [', $', ''], [',  ', ', '], [', ,', ', '], ['\xa0', ' '],[',\s+', ','],
        [' Bali', ',Bali']
    ]
    for i, j in replacements:
        text = re.sub(i, j, text)
    return text

In [17]:
choc_data[choc_data['specific_origin'].str.contains('\.')]

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
33,Akesson's (Pralus),"Madagascar, Ambolikapiky P.",502,2010,75.00,Switzerland,2.75,Criollo,Madagascar
34,Akesson's (Pralus),"Monte Alegre, D. Badero",508,2010,75.00,Switzerland,2.75,Forastero,Brazil
89,AMMA,"Monte Alegre, 3 diff. plantations",572,2010,85.00,Brazil,2.75,Forastero (Parazinho),Brazil
90,AMMA,"Monte Alegre, 3 diff. plantations",572,2010,50.00,Brazil,3.75,Forastero (Parazinho),Brazil
91,AMMA,"Monte Alegre, 3 diff. plantations",572,2010,75.00,Brazil,3.75,Forastero (Parazinho),Brazil
...,...,...,...,...,...,...,...,...,...
1702,Valrhona,"Sambirano, Ampamakia 2005, Millot P.",75,2006,64.00,France,3.50,Trinitario,Madagascar
1747,Wm,"Guasare, Zulia Prov., 2015, batch 124",1912,2016,74.00,U.S.A.,3.00,Criollo,Venezuela
1751,Woodblock,"Camino Verde P., Balao, Guayas",1042,2013,70.00,U.S.A.,3.25,Blend,Ecuador
1767,Zart Pralinen,"Millot P., Ambanja",1820,2016,70.00,Austria,3.50,"Criollo, Trinitario",Madagascar


In [18]:
choc_data['specific_origin'] = choc_data['specific_origin']\
.str.replace('\.', '', regex=True).apply(cleanup_spelling_abbrev)

In [19]:
choc_data['specific_origin'][1783]

'Loma Los Pinos,Yacao region,Dominican Republic'

In [20]:
choc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   maker             1795 non-null   object 
 1   specific_origin   1795 non-null   object 
 2   reference_number  1795 non-null   int64  
 3   review_date       1795 non-null   int64  
 4   cocoa_percent     1795 non-null   float64
 5   maker_location    1795 non-null   object 
 6   rating            1795 non-null   float64
 7   bean_type         1795 non-null   object 
 8   broad_origin      1721 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 126.3+ KB


In [21]:
# Cast specific_origin to string
choc_data['specific_origin'] = choc_data['specific_origin'].astype(str)

In [22]:
# Replace null-valued fields with the same value as for specific_origin
choc_data['broad_origin'] = choc_data['broad_origin'].fillna(choc_data['specific_origin'])

In [23]:
# Clean up spelling mistakes and deal with abbreviations
choc_data['broad_origin'] = choc_data['broad_origin'].str.replace('\.', '', regex=True).apply(cleanup_spelling_abbrev)

In [24]:
choc_data[choc_data['bean_type'].isin(['Trinitario, Criollo'])]

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
284,Cacao Barry,Grand 'Anse,1716,2016,65.0,France,3.5,"Trinitario, Criollo",Haiti
645,Felchlin,Elvesia P,105,2006,74.0,Switzerland,3.0,"Trinitario, Criollo",Dominican Republic
766,Guittard,Sur del Lago,87,2006,65.0,U.S.A.,2.5,"Trinitario, Criollo",Venezuela
837,Hotel Chocolat (Coppeneur),Sambirano,809,2012,66.0,England,3.5,"Trinitario, Criollo",Madagascar
854,Hummingbird,"Ocumare,Cumboto",1097,2013,70.0,Canada,3.25,"Trinitario, Criollo",Venezuela
926,La Chocolaterie Nanairo,"Lumas,2015 Harvest,Batch 6,brown sugar",1892,2016,70.0,Japan,2.25,"Trinitario, Criollo",Peru
927,La Chocolaterie Nanairo,"Lumas,2015 Harvest,Batch 7",1892,2016,70.0,Japan,2.5,"Trinitario, Criollo",Peru
929,La Chocolaterie Nanairo,"Belize,2014 Harvest,Batch 9",1892,2016,70.0,Japan,3.0,"Trinitario, Criollo",Belize
1430,Scharffen Berger,Amina,464,2010,65.0,U.S.A.,3.75,"Trinitario, Criollo",Madagascar


In [25]:
choc_data['bean_type'].unique()

array(['Blend', 'Criollo', 'Trinitario', 'Forastero (Arriba)',
       'Forastero', 'Forastero (Nacional)', 'Criollo, Trinitario',
       'Criollo (Porcelana)', 'Trinitario (85% Criollo)',
       'Forastero (Catongo)', 'Forastero (Parazinho)',
       'Trinitario, Criollo', 'CCN51', 'Criollo (Ocumare)', 'Nacional',
       'Criollo (Ocumare 61)', 'Criollo (Ocumare 77)',
       'Criollo (Ocumare 67)', 'Criollo (Wild)', 'Beniano', 'Amazon mix',
       'Trinitario, Forastero', 'Forastero (Arriba) ASS', 'Criollo, +',
       'Amazon', 'Amazon, ICS', 'EET', 'Blend-Forastero,Criollo',
       'Trinitario (Scavina)', 'Criollo, Forastero', 'Matina',
       'Forastero(Arriba, CCN)', 'Nacional (Arriba)',
       'Forastero (Arriba) ASSS', 'Forastero, Trinitario',
       'Forastero (Amelonado)', 'Trinitario, Nacional',
       'Trinitario (Amelonado)', 'Trinitario, TCGA', 'Criollo (Amarru)'],
      dtype=object)

In [26]:
choc_data['bean_type'].value_counts()

Blend                   929
Trinitario              419
Criollo                 153
Forastero                87
Forastero (Nacional)     52
                       ... 
Criollo (Ocumare)         1
Amazon                    1
Criollo, +                1
Criollo (Ocumare 77)      1
CCN51                     1
Name: bean_type, Length: 40, dtype: int64

In [27]:
# Change 'Trinitario, Criollo' to "Criollo, Trinitario"
# Check with choc_data['bean_type'].unique()
choc_data.loc[choc_data['bean_type'].isin(['Trinitario, Criollo']),'bean_type'] = "Criollo, Trinitario"
# Confirm with choc_data[choc_data['bean_type'].isin(['Trinitario, Criollo'])]

In [28]:
choc_data[choc_data['maker'].str.contains(r'Na\w*ve')]

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
1162,Naive,"Trinidad,Tobago",1046,2013,70.0,Lithuania,3.75,Blend,"Trinidad,Tobago"
1163,Naive,"Maranon Canyon,Fortunato No 4",1133,2013,78.0,Lithuania,3.75,Forastero (Nacional),Peru
1164,Naive,Grenada,867,2012,71.0,Lithuania,2.5,Trinitario,Grenada


In [29]:
# Fix chocolate maker names
choc_data.loc[choc_data['maker']=='Shattel','maker'] = 'Shattell'
choc_data['maker'] = choc_data['maker'].str.replace(u'Na\xef\xbf\xbdve','Naive')

The u in <span style='color:red'>u'Some String'</span> means that your string is a <span style='color:red'>Unicode string</span>.
<span style='color:blue'>In Python 3.x the strings use Unicode by default</span> and there's no need for the u prefix.

In [30]:
choc_data.head()

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,Blend,Sao Tome
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,Blend,Togo
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,Blend,Togo
3,A. Morin,Akata,1680,2015,70.0,France,3.5,Blend,Togo
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,Blend,Peru


# 2. Process Data
Because you're using a DNN, you do not need to manually process the data. The DNN transforms the data for us. However, if possible, you should remove features that could distort the similarity calculation. Here, the features `review_date` and `reference_number` are not correlated with similarity. That is, chocolates that are reviewed closer together in time are not more or less similar than chocolates reviewed further apart. Remove these two features by running the following code.

In [31]:
choc_data.drop(columns=['review_date','reference_number'],inplace=True)
choc_data.head()

Unnamed: 0,maker,specific_origin,cocoa_percent,maker_location,rating,bean_type,broad_origin
0,A. Morin,Agua Grande,63.0,France,3.75,Blend,Sao Tome
1,A. Morin,Kpime,70.0,France,2.75,Blend,Togo
2,A. Morin,Atsane,70.0,France,3.0,Blend,Togo
3,A. Morin,Akata,70.0,France,3.5,Blend,Togo
4,A. Morin,Quilla,70.0,France,3.5,Blend,Peru


# 3. Generate Embeddings from DNN

We're ready to generate embeddings by training the DNN on the feature data. This section draws on concepts discussed on the page [Supervised Similarity Measure](https://developers.google.com/machine-learning/clustering/similarity/supervised-similarity).

Run the section below to set up functions to train the DNN that generates embeddings. You do not need to understand the code.

### Functions to Build and Train a Similarity DNN Model

In [42]:
class SimilarityModel(object):
    
    def __init__(self,
                 dataframe,
                 input_feature_names,
                 output_feature_names,
                 dense_feature_names,
                 sparse_input_feature_embedding_dims,
                 hidden_dims=[32],
                 l2_regularization=0.0,
                 use_bias=True,
                 batch_size=100,
                 inspect=False):
        used_feature_names = tuple(
            set(input_feature_names).union(output_feature_names))
        sparse_feature_names = tuple(
            set(used_feature_names).difference(dense_feature_names))
        # Dictionary mapping each sparse feature column to its vocabulary.
        ### sparse_feature_vocabs = { 'maker': [u'A. Morin', u'AMMA', ...], ... }
        sparse_feature_vocabs = {
            sfn: sorted(list(set(choc_data[sfn].values)))
            for sfn in sparse_feature_names
        }
        
        # Sparse output features are mapped to ids via tf.feature_to_id, hence
        # we need key-id pairs for these vocabularies.
        sparse_output_feature_names = (
            tuple(set(sparse_feature_names).intersection(output_feature_names)))
        keys_and_values = {}
        for fn in sparse_output_feature_names:
            keys = tf.constant(
                sparse_feature_vocabs[fn],
                dtype=tf.string,
                name='{}_vocab_keys'.format(fn))
            values = tf.range(
                len(sparse_feature_vocabs[fn]),
                dtype=tf.int64,
                name='{}_vocab_values'.format(fn))
            keys_and_values[fn] = (keys, values)
            
        # Class instance data members.
        self._session = None
        self._loss = None
        self._metrics = {}
        self._embeddings = None
        self._vars_to_inspect = {}

        def split_dataframe(df, holdout_fraction=0.1):
            
            test = df.sample(frac=holdout_fraction, replace=False)
            train = df[~df.index.isin(test.index)]
            return train, test

        train_dataframe, test_dataframe = split_dataframe(dataframe)

        def make_batch(dataframe, batch_size):
            used_features = {ufn: dataframe[ufn] for ufn in used_feature_names}
            batch = (
                tf.data.Dataset.from_tensor_slices(used_features).shuffle(1000)
                .repeat().batch(batch_size).make_one_shot_iterator().get_next())
            if inspect:
                for k, v in six.iteritems(batch):
                    self._vars_to_inspect['input_%s' % k] = v
            return batch
        
        def generate_feature_columns(feature_names):
            used_sparse_feature_names = (
                tuple(set(sparse_feature_names).intersection(feature_names)))
            used_dense_feature_names = (
                tuple(set(dense_feature_names).intersection(feature_names)))
            f_columns = {}
            for sfn in used_sparse_feature_names:
                sf_column = tf.feature_column.categorical_column_with_vocabulary_list(
                    key=sfn,
                    vocabulary_list=sparse_feature_vocabs[sfn],
                    num_oov_buckets=0)
                f_columns[sfn] = tf.feature_column.embedding_column(
                    categorical_column=sf_column,
                    dimension=sparse_input_feature_embedding_dims[sfn],
                    combiner='mean',
                    initializer=tf.truncated_normal_initializer(stddev=.1))
            for dfn in used_dense_feature_names:
                f_columns[dfn] = tf.feature_column.numeric_column(dfn)
            return f_columns
        
        def create_tower(features, columns):
            input_columns = [columns[fn] for fn in input_feature_names]
            hidden_layer = tf.feature_column.input_layer(features, input_columns)
            dense_input_feature_names = (
                tuple(set(dense_feature_names).intersection(input_feature_names)))
            input_dim = (
                sum(sparse_input_feature_embedding_dims.values()) +
                len(dense_input_feature_names))
            for layer_idx, layer_output_dim in enumerate(hidden_dims):
                w = tf.get_variable(
                    'hidden%d_w_' % layer_idx,
                    shape=[input_dim, layer_output_dim],
                    initializer=tf.truncated_normal_initializer(
                        stddev=1.0 / np.sqrt(layer_output_dim)))
                if inspect:
                    self._vars_to_inspect['hidden%d_w_' % layer_idx] = w
                hidden_layer = tf.matmul(hidden_layer, w)  # / 10.)
                if inspect:
                    self._vars_to_inspect['hidden_layer_%d' % layer_idx] = hidden_layer
                input_dim = layer_output_dim
              # Output features.
            output_layer = {}
            for ofn in output_feature_names:
                if ofn in sparse_feature_names:
                    feature_dim = len(sparse_feature_vocabs[ofn])
                else:
                    feature_dim = 1
                w = tf.get_variable(
                    'output_w_%s' % ofn,
                    shape=[input_dim, feature_dim],
                    initializer=tf.truncated_normal_initializer(stddev=1.0 /
                                                                np.sqrt(feature_dim)))
                if inspect:
                    self._vars_to_inspect['output_w_%s' % ofn] = w
                if use_bias:
                    bias = tf.get_variable(
                        'output_bias_%s' % ofn,
                        shape=[1, feature_dim],
                        initializer=tf.truncated_normal_initializer(stddev=1.0 /
                                                                    np.sqrt(feature_dim)))
                    if inspect:
                        self._vars_to_inspect['output_bias_%s' % ofn] = bias
                else:
                    bias = tf.constant(0.0, shape=[1, feature_dim])
                output_layer[ofn] = {
                    'labels':
                        features[ofn],
                    'logits':
                        tf.add(tf.matmul(hidden_layer, w), bias)  # w / 10.), bias)
                }
                if inspect:
                    self._vars_to_inspect['output_labels_%s' %
                                        ofn] = output_layer[ofn]['labels']
                    self._vars_to_inspect['output_logits_%s' %
                                        ofn] = output_layer[ofn]['logits']
                return hidden_layer, output_layer
            
        def similarity_loss(top_embeddings, output_layer):
            
            losses = {}
            total_loss = tf.scalar_mul(l2_regularization,
                                        tf.nn.l2_loss(top_embeddings))
            for fn, output in six.iteritems(output_layer):
                if fn in sparse_feature_names:
                    losses[fn] = tf.reduce_mean(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(
                            logits=output['logits'],
                            labels=tf.feature_to_id(
                                output['labels'], keys_and_values=keys_and_values[fn])))
                else:
                    losses[fn] = tf.sqrt(
                        tf.reduce_mean(
                            tf.square(output['logits'] -
                                    tf.cast(output['labels'], tf.float32))))
                total_loss += losses[fn]
                return total_loss, losses

        # Body of the constructor.
        input_feature_columns = generate_feature_columns(input_feature_names)
        # Train
        with tf.variable_scope('model', reuse=False):
            train_hidden_layer, train_output_layer = create_tower(
                make_batch(train_dataframe, batch_size), input_feature_columns)
            self._train_loss, train_losses = similarity_loss(train_hidden_layer,
                                                            train_output_layer)
        # Test
        with tf.variable_scope('model', reuse=True):
            test_hidden_layer, test_output_layer = create_tower(
                make_batch(test_dataframe, batch_size), input_feature_columns)
            test_loss, test_losses = similarity_loss(test_hidden_layer,
                                                    test_output_layer)
        # Whole dataframe to get final embeddings
        with tf.variable_scope('model', reuse=True):
            self._hidden_layer, _ = create_tower(
                make_batch(dataframe, dataframe.shape[0]), input_feature_columns)
        # Metrics is a dictionary of dictionaries of dictionaries.
        # The 3 levels are used as plots, line colors, and line styles respectively.
        self._metrics = {
            'total': {
                'train': {'loss': self._train_loss},
                'test': {'loss': test_loss}
                },
            'feature': {
                'train': {'%s loss' % k: v for k, v in six.iteritems(train_losses)},
                'test': {'%s loss' % k: v for k, v in six.iteritems(test_losses)}
                }
            }
        
        def train(self,
                  num_iterations=30,
                  learning_rate=1.0,
                  plot_results=True,
                  optimizer=tf.train.GradientDescentOptimizer):
            
            with self._train_loss.graph.as_default():
                opt = optimizer(learning_rate)
                train_op = opt.minimize(self._train_loss)
                opt_init_op = tf.variables_initializer(opt.variables())
                if self._session is None:
                    self._session = tf.Session()
                    with self._session.as_default():
                        self._session.run(tf.global_variables_initializer())
                        self._session.run(tf.local_variables_initializer())
                        self._session.run(tf.tables_initializer())
                        tf.train.start_queue_runners()
                        
            with self._session.as_default():
                self._session.run(opt_init_op)
                if plot_results:  
                    iterations = []
                    metrics_vals = {k0: {k1: {k2: []
                                              for k2 in v1}
                                         for k1, v1 in six.iteritems(v0)}
                                    for k0, v0 in six.iteritems(self._metrics)}
                    
                for i in range(num_iterations + 1):
                    _, results = self._session.run((train_op, self._metrics))

                    if (i % 10 == 0) or i == num_iterations:
                        print('\riteration%6d,   ' % i + ',   '.join(
                            ['%s %s %s: %7.3f' % (k0, k1, k2, v2)
                            for k0, v0 in six.iteritems(results)
                            for k1, v1 in six.iteritems(v0)
                            for k2, v2 in six.iteritems(v1)])
                            , end=" "
                            )
                        if plot_results:
                            iterations.append(i)
                            for k0, v0 in six.iteritems(results):
                                for k1, v1 in six.iteritems(v0):
                                    for k2, v2 in six.iteritems(v1):
                                        metrics_vals[k0][k1][k2].append(results[k0][k1][k2])

                # Feedforward the entire dataframe to get all the embeddings.
                self._embeddings = self._session.run(self._hidden_layer)

                # Plot the losses and embeddings.
                if plot_results:
                    num_subplots = len(metrics_vals) + 1
                    colors = 10 * ('red', 'blue', 'black', 'green')
                    styles = 10 * ('-', '--', '-.', ':')
                    # Plot the metrics.
                    fig = plt.figure()
                    fig.set_size_inches(num_subplots*10, 8)
                    for i0, (k0, v0) in enumerate(six.iteritems(metrics_vals)):
                        
                        ax = fig.add_subplot(1, num_subplots, i0+1)
                        ax.set_title(k0)
                        for i1, (k1, v1) in enumerate(six.iteritems(v0)):
                            for i2, (k2, v2) in enumerate(six.iteritems(v1)):
                                ax.plot(iterations, v2, label='%s %s' % (k1, k2),
                                        color=colors[i1], linestyle=styles[i2])
                        ax.set_xlim([1, num_iterations])
                        ax.set_yscale('log')
                        ax.legend()
                    # Plot the embeddings (first 3 dimensions).
                    ax.legend(loc='upper right')
                    ax = fig.add_subplot(1, num_subplots, num_subplots)
                    ax.scatter(
                        self._embeddings[:, 0], self._embeddings[:, 1],
                        alpha=0.5, marker='o')
                    ax.set_title('embeddings')

    @property
    def embeddings(self):
        return self._embeddings

In [41]:
#@title Training a DNN Similarity Model

# Define some constants related to this dataset.
sparse_feature_names = ('maker', 'maker_location', 'broad_origin',
                        'specific_origin', 'bean_type')
dense_feature_names = ('reference_number', 'review_date', 'cocoa_percent',
                       'rating')

# Set of features used as input to the similarity model.
input_feature_names = ('maker', 'maker_location', 'broad_origin',
                       'cocoa_percent', 'bean_type','rating', )
# Set of features used as output to the similarity model.
output_feature_names = ['rating']  #@param

# As a rule of thumb, a reasonable choice for the embedding dimension of a
# sparse feature column is the log2 of the cardinality of its vocabulary.
# sparse_input_feature_embedding_dims = { 'maker': 9, 'maker_location': 6, ... }
default_embedding_dims = {
    sfn: int(round(math.log(choc_data[sfn].nunique()) / math.log(2)))
    for sfn in set(sparse_feature_names).intersection(input_feature_names)
}
# Dictionary mapping each sparse input feature to the dimension of its embedding
# space.
sparse_input_feature_embedding_dims = default_embedding_dims  # can be a param

# Weight of the L2 regularization applied to the top embedding layer.
l2_regularization = 10  #@param
# List of dimensions of the hidden layers of the deep neural network.
hidden_dims = [20, 10]  #@param

print('------ build model')
with tf.Graph().as_default():
    similarity_model = SimilarityModel(
        choc_data,
        input_feature_names=input_feature_names,
        output_feature_names=output_feature_names,
        dense_feature_names=dense_feature_names,
        sparse_input_feature_embedding_dims=sparse_input_feature_embedding_dims,
        hidden_dims=hidden_dims,
        l2_regularization=l2_regularization,
        batch_size=100,
        use_bias=True,
        inspect=True)

print('------ train model')
similarity_model.train(
    num_iterations=1000,
    learning_rate=0.1,
    optimizer=tf.train.AdagradOptimizer)
print('\n')

------ build model
------ train model


AttributeError: 'SimilarityModel' object has no attribute 'train'

# 4. Cluster Chocolate Dataset
We're ready to cluster the chocolates! Run the code to set up the k-means clustering functions. You do not need to understand the code.

In [None]:
def dfSimilarity(df, centroids):
    numPoints = len(df.index)
    numCentroids = len(controids.index)
    
    pointNorms = np.square(nla.norm(df, axis=1))
    pointNorms = np.reshape(pointNorms,[numPoints,1])
    
    centroidNorms = np.square(nla.norm(centroids,axis=1))
    centroidNorms = np.reshape(centroidNorms,(1,numCentroids))
    
    similarities = pointNorms + centroidNorms - 2.0*np.dot(df,np.transpose(centroids))
    
    # Divide by the number of features
    # Which is 10 because the one-hot encoding means the "Maker" and "Bean" are
    # weighted twice
    similarities = similarities/10.0
    similarities = similarities.clip(min=0.0)
    similarities = np.sqrt(similarities)
    return similarities

def initCentroids(df,k,feature_cols):
    # Pick 'k' examples are random to serve as initial centroids
    limit = len(df.index)
    centroids_key = np.random.randint(0,limit-1,k)
    centroids = df.loc[centroids_key,feature_cols].copy(deep=True)
    # the indexes get copied over so reset them
    centroids.reset_index(drop=True,inplace=True)
    return centroids
    
def pt2centroid(df,centroids,feature_cols):
    ### Calculate similarities between all points and centroids
    ### And assign points to the closest centroid + save that distance
    numCentroids = len(centroids.index)
    numExamples = len(df.index)
    # dfSimilarity = Calculate similarities for dataframe input
    dist = dfSimilarity(df.loc[:,feature_cols],centroids.loc[:,feature_cols])
    df.loc[:,'centroid'] = np.argmin(dist,axis=1) # closest centroid
    df.loc[:,'pt2centroid'] = np.min(dist,axis=1) # minimum distance
    return df

def recomputeCentroids(df,centroids,feature_cols):
    
    ### For every centroid, recompute it as an average of the points
    ### assigned to it
    numCentroids = len(centroids.index)
    for cen in range(numCentroids):
        dfSubset = df.loc[df['centroid'] == cen, feature_cols] # all points for centroid
        if not(dfSubset.empty): # if there are points assigned to the centroid
            clusterAvg = np.sum(dfSubset)/len(dfSubset.index)
            centroids.loc[cen] = clusterAvg
    return centroids

def kmeans(df,k,feature_cols,verbose):
    flagConvergence = False
    maxIter = 100
    iter = 0                      # ensure kmeans doesn't run for ever
    centroids = initCentroids(df,k,feature_cols)
    while not(flagConvergence):
        iter += 1
        #Save old mapping of points to centroids
        oldMapping = df['centroid'].copy(deep=True)
        # Perform k-means
        df = pt2centroid(df,centroids,feature_cols)
        centroids = recomputeCentroids(df,centroids,feature_cols)
        # Check convergence by comparing [oldMapping, newMapping]
        newMapping = df['centroid']
        flagConvergence = all(oldMapping == newMapping)
        if verbose == 1:
            print("Total distance:" + str(np.sum(df['pt2centroid'])))
        if (iter > maxIter):
            print('k-means did not converge! Reached maximum iteration limit of ' \
                  + str(maxIter) + '.')
            sys.exit()
            return
    
    print('k-means converged for ' + str(k) + ' clusters' + \
          ' after ' + str(iter) + ' iterations!')
    return [df,centroids]

In [None]:
k = 160 #@param

# Extract embeddings into a dataframe
choc_embed = similarity_model.embeddings
choc_embed = pd.DataFrame(choc_embed)

feature_cols = choc_embed.columns.values # save original columns
# initialize every point to an impossible value, the k+1 cluster
choc_embed['centroid'] = k
# init the point to centroid distance to an impossible value "2" (>1)
choc_embed['pt2centroid'] = 2
[choc_embed,centroids] = kmeans(choc_embed,k,feature_cols,1)
print("Data for the first few chocolates, with 'centroid' and 'pt2centroid' on the extreme right:")
choc_embed.head()