# Clustering with Manual Similarity Measure

In this Colab, you will group chocolates in the
[Chocolate Bar Ratings](https://www.kaggle.com/rtatman/chocolate-bar-ratings)
dataset using the k-means clustering algorithm with a manual similarity measure. The dataset has ratings
of chocolate bars along with their cocoa percentage, bean type, bean origin,
maker name, and maker country. You will:

*   Load and clean the data.
*   Process the data.
*   Calculate similarity between pairs of chocolates.
*   Cluster the chocolates using k-means.
*   Check the clustering result using quality metrics.


# 1. Load and clean data

Run the following cell to load and clean the chocolate dataset. The first few rows of the dataset are displayed. Inspect
the features and their values.

In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import altair as alt
import re
import pdb # python debugger
import sys
from os.path import join

np.set_printoptions(precision=2)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_rows = 15

choc_data = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/flavors_of_cacao.csv", sep=",", encoding='latin-1')

# We can rename the columns.
choc_data.columns = ['maker', 'specific_origin', 'reference_number', 'review_date', 'cocoa_percent', 'maker_location', 'rating', 'bean_type', 'broad_origin']

# choc_data.dtypes

In [2]:
choc_data.dtypes

maker                object
specific_origin      object
reference_number      int64
review_date           int64
cocoa_percent        object
maker_location       object
rating              float64
bean_type            object
broad_origin         object
dtype: object

In [3]:
# Replace empty/null values with "Blend"
choc_data['bean_type'] = choc_data['bean_type'].fillna('Blend')

# Cast bean_type to string to remove leading 'u'
choc_data['bean_type'] = choc_data['bean_type'].astype('str')
choc_data['cocoa_percent'] = choc_data['cocoa_percent'].str.strip('%')
choc_data['cocoa_percent'] = pd.to_numeric(choc_data['cocoa_percent'])

choc_data

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
0,A. Morin,Agua Grande,1876,2016,63.00,France,3.75,Blend,Sao Tome
1,A. Morin,Kpime,1676,2015,70.00,France,2.75,Blend,Togo
2,A. Morin,Atsane,1676,2015,70.00,France,3.00,Blend,Togo
3,A. Morin,Akata,1680,2015,70.00,France,3.50,Blend,Togo
4,A. Morin,Quilla,1704,2015,70.00,France,3.50,Blend,Peru
...,...,...,...,...,...,...,...,...,...
1790,Zotter,Peru,647,2011,70.00,Austria,3.75,Blend,Peru
1791,Zotter,Congo,749,2011,65.00,Austria,3.00,Forastero,Congo
1792,Zotter,Kerala State,749,2011,65.00,Austria,3.50,Forastero,India
1793,Zotter,Kerala State,781,2011,62.00,Austria,3.25,Blend,India


In [4]:
# # Correct spelling mistakes, and replace city with country name
# choc_data['maker_location'] = choc_data['maker_location']\
# .str.replace('amsterdam', 'Holland')\
# .str.replace('U.K', 'England')\
# .str.replace('Niacragua', ' Nicaragua')\
# .str.replace('Domincan Republic', 'Dominican Republic')

In [5]:
choc_data['maker_location'] = choc_data['maker_location'].str.replace('amsterdam', 'Holland')
choc_data['maker_location'] = choc_data['maker_location'].str.replace('U.K', 'England')
choc_data['maker_location'] = choc_data['maker_location'].str.replace('Niacragua', 'Nicaragua')
choc_data['maker_location'] = choc_data['maker_location'].str.replace('Domincan Republic', 'Dominican Republic')

  choc_data['maker_location'] = choc_data['maker_location'].str.replace('U.K', 'England')


In [6]:
choc_data.loc[choc_data['maker_location'] == 'amsterdam', 'maker_location'] = 'Holland'
choc_data.loc[choc_data['maker_location'] == 'U.K', 'maker_location'] = 'England'
choc_data.loc[choc_data['maker_location'] == 'Niacragua', 'maker_location'] = 'Nicaragua'
choc_data.loc[choc_data['maker_location'] == 'Domincan Republic', 'maker_location'] = 'Dominican Republic'

In [7]:
# Adding this so that Holland and Netherlands map to the same country.
choc_data['maker_location'] = choc_data['maker_location'].str.replace('Holland', 'Netherlands')

In [8]:
def cleanup_spelling_abbrev(text):
    replacements = [
        ['-', ', '], ['/ ', ', '], ['/', ', '], ['\(', ', '], [' and', ', '], [' &', ', '], ['\)', ''],
        ['Dom Rep|DR|Domin Rep|Dominican Rep,|Domincan Republic', 'Dominican Republic'],
        ['Mad,|Mad$', 'Madagascar, '],
        ['PNG', 'Papua New Guinea, '],
        ['Guat,|Guat$', 'Guatemala, '],
        ['Ven,|Ven$|Venez,|Venez$', 'Venezuela, '],
        ['Ecu,|Ecu$|Ecuad,|Ecuad$', 'Ecuador, '],
        ['Nic,|Nic$', 'Nicaragua, '],
        ['Cost Rica', 'Costa Rica'],
        ['Mex,|Mex$', 'Mexico, '],
        ['Jam,|Jam$', 'Jamaica, '],
        ['Haw,|Haw$', 'Hawaii, '],
        ['Gre,|Gre$', 'Grenada, '],
        ['Tri,|Tri$', 'Trinidad, '],
        ['C Am', 'Central America'],
        ['S America', 'South America'],
        [', $', ''], [',  ', ', '], [', ,', ', '], ['\xa0', ' '],[',\s+', ','],
        [' Bali', ',Bali']
    ]
    for i, j in replacements:
        text = re.sub(i, j, text)
    return text

In [9]:
choc_data['specific_origin'] = choc_data['specific_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)

  choc_data['specific_origin'] = choc_data['specific_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)


In [10]:
# Cast specific_origin to string
choc_data['specific_origin'] = choc_data['specific_origin'].astype(str)

In [11]:
# Replace null-valued fields with the same value as for specific_origin
choc_data['broad_origin'] = choc_data['broad_origin'].fillna(choc_data['specific_origin'])

In [12]:
# Clean up spelling mistakes and deal with abbreviations
choc_data['broad_origin'] = choc_data['broad_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)

  choc_data['broad_origin'] = choc_data['broad_origin'].str.replace('.', '').apply(cleanup_spelling_abbrev)


In [13]:
# Change 'Trinitario, Criollo' to "Criollo, Trinitario"
# Check with choc_data['bean_type'].unique()
choc_data.loc[choc_data['bean_type'].isin(['Trinitario, Criollo']),'bean_type'] = "Criollo, Trinitario"

In [14]:
# Confirm
choc_data[choc_data['bean_type'].isin(['Trinitario, Criollo'])]

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin


In [15]:
# Fix chocolate maker names
choc_data.loc[choc_data['maker']=='Shattel','maker'] = 'Shattell'
choc_data['maker'] = choc_data['maker'].str.replace(u'Na\xef\xbf\xbdve','Naive')

In [16]:
# Save the original column names
original_cols = choc_data.columns.values

In [17]:
choc_data.head()

Unnamed: 0,maker,specific_origin,reference_number,review_date,cocoa_percent,maker_location,rating,bean_type,broad_origin
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,Blend,Sao Tome
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,Blend,Togo
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,Blend,Togo
3,A. Morin,Akata,1680,2015,70.0,France,3.5,Blend,Togo
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,Blend,Peru
