# Cheese Clustering Project

# Imports

In [62]:
import pandas as pd
import umap as umap
from sklearn.cluster import HDBSCAN
import geopandas
import numpy as np
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import nbformat
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [63]:
# Load the TidyTuesday cheese table and discard the columns that are not used below.
# `df` stays as the readable raw table (later cells join it onto embeddings for
# plot colouring and hover text); `cheese_df` is an independent copy that the
# following cells turn into numeric features.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-06-04/cheeses.csv'
).drop(columns=['url', 'calcium_content', 'region', 'synonyms', 'alt_spellings'])
cheese_df = df.copy()

# Functions

In [64]:
# do we want indicators for each? indicator with a weight?
def convert_with_weights(df=None, column_name='milk'):
    """Expand a comma-separated label column into weighted indicator columns.

    Each distinct label becomes a column named ``f"{column_name}_{label}"``.
    A row that lists ``k`` labels gets ``1/k`` in each of its label columns, so
    every labelled row sums to 1 across the new columns. Rows with no labels
    (missing values) get 0 in every new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``. Required in practice; the ``None``
        default is kept only for signature compatibility. The frame is NOT
        modified.
    column_name : str
        Column holding comma-separated labels. Spaces and hyphens are removed
        from the labels before splitting.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns without ``column_name``, followed by
        the weighted indicator columns.
    """
    # Strip spaces/hyphens so "cow, goat" and "cow,goat" produce the same labels.
    labels = (
        df[column_name]
        .str.replace(" ", "", regex=False)
        .str.replace("-", "", regex=False)
    )
    indicators = labels.str.get_dummies(',').add_prefix(f'{column_name}_')

    # 1/k per row; rows with zero labels get weight 0 instead of inf.
    # Only the weights are touched here, so infinities elsewhere in `df` survive.
    label_counts = indicators.sum(axis=1)
    weights = (1 / label_counts.where(label_counts > 0)).fillna(0)
    weighted = indicators.mul(weights, axis=0)

    # drop() returns a new frame, leaving the caller's DataFrame untouched.
    return df.drop(columns=column_name).join(weighted)

def convert_without_weights(df=None, column_name='column'):
    """Expand a comma-separated label column into 0/1 indicator columns.

    Each distinct label becomes a column named ``f"{column_name}_{label}"``
    holding 1 where the row lists that label and 0 otherwise. Rows with a
    missing value get 0 in every new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``. Required in practice; the ``None``
        default is kept only for signature compatibility. The frame is NOT
        modified.
    column_name : str
        Column holding comma-separated labels. Spaces and hyphens are removed
        from the labels before splitting.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns without ``column_name``, followed by
        the indicator columns.
    """
    # Strip spaces/hyphens so "a, b" and "a,b" produce the same labels.
    labels = (
        df[column_name]
        .str.replace(" ", "", regex=False)
        .str.replace("-", "", regex=False)
    )
    indicators = labels.str.get_dummies(',').add_prefix(f'{column_name}_')

    # drop() returns a new frame, leaving the caller's DataFrame untouched.
    return df.drop(columns=column_name).join(indicators)

# Lookup from the raw `country` string to a coarse climate label
# ("Temperate", "Continental", "Dry", "Tropical" or "Polar").
# Keys are matched exactly, so multi-country entries appear as their full
# comma-separated string and are given a single label for the whole group.
# Strings that are not listed here resolve to None when looked up with `.get`.
# NOTE(review): the labels look hand-assigned — confirm the intended climate classification.
country_climate_map = {
    "United States": "Temperate",
    "France": "Temperate",
    "Italy": "Temperate",
    "Canada": "Continental",
    "Australia": "Dry",
    "United Kingdom": "Temperate",
    "England": "Temperate",
    "Ireland": "Temperate",
    "England, Great Britain, United Kingdom": "Temperate",
    "Germany": "Temperate",
    "Netherlands": "Temperate",
    "Spain": "Temperate",
    "Scotland": "Temperate",
    "Switzerland": "Temperate",
    "Austria": "Continental",
    "England, United Kingdom": "Temperate",
    "Canada, Italy": "Temperate",
    "Sweden": "Continental",
    "Belgium": "Temperate",
    "New Zealand": "Temperate",
    "Denmark": "Temperate",
    "Portugal": "Temperate",
    "Mexico": "Tropical",
    "Wales": "Temperate",
    "India": "Tropical",
    "Greece": "Temperate",
    "Brazil": "Tropical",
    "Canada, United States": "Continental",
    "Scotland, United Kingdom": "Temperate",
    "Croatia": "Temperate",
    "Iceland": "Polar",
    "Turkey": "Temperate",
    "Cyprus": "Temperate",
    "Middle East": "Dry",
    "Serbia": "Continental",
    "Argentina": "Temperate",
    "United Kingdom, Wales": "Temperate",
    "Hungary": "Continental",
    "Mexico and Caribbean": "Tropical",
    "Great Britain": "Temperate",
    "China, Nepal, Tibet": "Continental",
    "Georgia": "Continental",
    "Finland": "Continental",
    "Great Britain, Scotland, United Kingdom": "Temperate",
    "Great Britain, United Kingdom, Wales": "Temperate",
    "Cyprus, Egypt, Israel, Jordan, Lebanon, Middle East, Syria": "Dry",
    "Czech Republic": "Continental",
    "Canada, Denmark, France, Germany, Netherlands, United States": "Temperate",
    "Armenia": "Continental",
    "Lebanon, Middle East": "Dry",
    "France, Italy": "Temperate",
    "England, Scotland, Wales": "Temperate",
    "Mauritania": "Dry",
    "Mongolia": "Continental",
    "Italy, United States": "Temperate",
    "Austria, Germany": "Continental",
    "Hungary, Poland, Slovakia": "Continental",
    "Romania": "Continental",
    "Denmark, Finland, Germany, Iceland, Norway, Sweden": "Continental",
    "Netherlands, United States": "Temperate",
    "United Kingdom, United States": "Temperate",
    "China, Tibet": "Continental",
    "Israel": "Dry",
    "France, Switzerland": "Temperate",
    "Bulgaria": "Continental",
    "Canada, India, United States": "Temperate",
    "Lithuania": "Continental",
    "Belgium, Germany, Netherlands": "Temperate",
    "Australia, France": "Dry",
    "Canada, France": "Temperate",
    "Belgium, Canada, France, Switzerland, United States": "Temperate",
    "Mexico, United States": "Tropical",
    "Iraq": "Dry",
    "Holland": "Temperate",
    "Chile": "Temperate",
    "Bangladesh, India": "Tropical",
    "France, United States": "Temperate",
    "Poland": "Continental",
    "Afghanistan": "Dry",
    "England, Scotland, United Kingdom": "Temperate",
    "Egypt, Lebanon, Syria": "Dry",
    "Albania, Bulgaria, Croatia, Greece, Israel, Macedonia, Romania, Serbia": "Temperate"
}

def convert_string_to_number(equation):
    """Parse a cleaned fat-content string into a fraction.

    Supported forms (spaces and letters are expected to be removed already):

    * ``"a/b"``  -> ``a / b``                (e.g. ``"14.2/100"`` -> 0.142)
    * ``"a-b%"`` -> midpoint of the range, as a fraction
                                           (e.g. ``"40-46%"`` -> 0.43)
    * ``"a%"``   -> ``a / 100``              (e.g. ``"45%"`` -> 0.45)

    Ranges are treated as percentages whether or not the ``%`` sign is present,
    so all three forms are returned on the same 0-1 scale.

    Parameters
    ----------
    equation : str
        The cleaned fat-content text.

    Returns
    -------
    float
        The fat content as a fraction.

    Raises
    ------
    ValueError
        If the text matches none of the supported forms, or a numeric part
        cannot be converted to ``float``.
    """
    if '/' in equation:
        numerator, denominator = equation.split('/')[:2]
        return float(numerator) / float(denominator)

    if '-' in equation:
        low, high = equation.replace("%", "").split('-')[:2]
        # Divide by 100 so a range lands on the same scale as a single
        # percentage; returning the raw midpoint (43.0 vs 0.45) would make
        # range-valued cheeses look ~100x fattier than the rest.
        return (float(low) + float(high)) / 2 / 100

    if '%' in equation:
        return float(equation.replace("%", "")) / 100

    # Fail with a clear message rather than an UnboundLocalError.
    raise ValueError(f"Unrecognised fat content format: {equation!r}")

def convert_fat_content_to_percent(df, column_name = 'fat_content'):
    """Turn the free-text fat-content column into numbers, in place.

    Spaces and ASCII letters are removed from every string entry, and the
    remaining text is handed to ``convert_string_to_number``. Entries that are
    not ``str`` (for example missing values) are passed through unchanged.

    The column is overwritten on ``df`` itself, and the same ``df`` object is
    returned.
    """
    def _to_number(value):
        # Only strings are parsed; anything else is kept as-is.
        if type(value) == str:
            return convert_string_to_number(value)
        return value

    without_spaces = df[column_name].str.replace(" ", "")
    digits_and_symbols = without_spaces.str.replace(r'[a-zA-Z]', '', regex=True)
    df[column_name] = digits_and_symbols.map(_to_number)
    return df

# Converting Categorical Variables to Indicator Variables

In [65]:
# Milk source: a cheese made from several milks splits a weight of 1 between them.
cheese_df = convert_with_weights(cheese_df, 'milk')

# Replace the raw country string with a coarse climate label; countries missing
# from `country_climate_map` become None and end up with all-zero climate indicators.
cheese_df["climate"] = cheese_df["country"].map(country_climate_map.get)
cheese_df = cheese_df.drop(columns='country')
# TODO: indicator variables? Look at top producers? Create lists that join them by latitude and some that join them by longitude?
cheese_df = convert_without_weights(cheese_df, 'climate')

# Can these be seen as truth? All the cheeses in the same family should be clustered together.
# Stored in a variable because a bare expression in the middle of a cell is silently discarded.
n_cheeses_named_after_family = (cheese_df['cheese'].str.lower() == cheese_df['family'].str.lower()).sum()
cheese_df = convert_without_weights(cheese_df, 'family')

# Separate by comma and then create indicator variables.
cheese_df = convert_without_weights(cheese_df, 'type')

# Convert the fraction / range / percentage strings into numbers.
cheese_df = convert_fat_content_to_percent(cheese_df)

cheese_df = convert_without_weights(cheese_df, 'texture')
cheese_df = convert_without_weights(cheese_df, 'rind')
cheese_df = convert_without_weights(cheese_df, 'color')
cheese_df = convert_with_weights(cheese_df, 'flavor')
cheese_df = convert_with_weights(cheese_df, 'aroma')

# 1 when at least one producer is listed, 0 when the field is missing.
cheese_df['known_producer'] = cheese_df['producers'].notna().astype(int)
cheese_df = cheese_df.drop(columns='producers')

# Missing vegan/vegetarian flags are treated as False. Going through the nullable
# 'boolean' dtype makes the cast explicit instead of relying on fillna's
# deprecated silent downcasting of object columns (the FutureWarning this cell emitted).
diet_columns = ['vegan', 'vegetarian']
cheese_df[diet_columns] = cheese_df[diet_columns].astype('boolean').fillna(False).astype(bool)

# NOTE(review): missing fat content is imputed as 0 before standardising, which
# treats "unknown" as "fat free" — confirm this is intended.
cheese_df['fat_content'] = cheese_df['fat_content'].fillna(0)
cheese_df[['fat_content']] = StandardScaler().fit_transform(cheese_df[['fat_content']])



Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [66]:
# Sanity check: number of missing values per column, smallest first
# (every count should be 0 before dimensionality reduction).
cheese_df.isna().sum().sort_values()

cheese            0
fat_content       0
vegetarian        0
vegan             0
milk_buffalo      0
                 ..
aroma_toasty      0
aroma_whiskey     0
aroma_woody       0
aroma_yeasty      0
known_producer    0
Length: 199, dtype: int64

# Dimensionality Reduction

In [67]:
# Every column except the cheese name is used as a model feature.
feature_list = cheese_df.columns.tolist()
del feature_list[feature_list.index('cheese')]

In [126]:
# Project the feature matrix onto its first 50 principal components.
# random_state is fixed so the projection is repeatable if scikit-learn picks
# a randomized SVD solver.
pca = PCA(n_components=50, random_state=42)
pca_array = pca.fit_transform(cheese_df[feature_list])
# Keep the original row labels so the components can be joined back onto `df`.
pca_df = pd.DataFrame(pca_array, index=cheese_df.index)

In [155]:
# 2-D UMAP embedding of the PCA components, coloured by cheese family.
# UMAP is stochastic; without a fixed random_state the layout changes on every run.
umapReducer = umap.UMAP(n_components=2, random_state=42)
# Build the embedding on `df`'s index so the column-wise concat pairs each
# point with its own cheese.
umap_pca_df = pd.DataFrame(umapReducer.fit_transform(pca_df), index=df.index)
umap_pca_df = pd.concat([umap_pca_df, df], axis=1)
fig = px.scatter(umap_pca_df, x=0, y=1, color='family', hover_name='cheese', hover_data=['color', 'country', 'aroma', 'rind', 'texture', 'milk', 'vegan'])
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [156]:
# 2-D UMAP embedding of the raw feature matrix (no PCA step), coloured by family.
# UMAP is stochastic; without a fixed random_state the layout changes on every run.
umapReducer = umap.UMAP(n_components=2, random_state=42)
# Build the embedding on `df`'s index so the column-wise concat pairs each
# point with its own cheese.
umap_df = pd.DataFrame(umapReducer.fit_transform(cheese_df[feature_list]), index=df.index)
umap_df = pd.concat([umap_df, df], axis=1)
fig = px.scatter(umap_df, x=0, y=1, color='family', hover_name='cheese', hover_data=['color', 'country', 'aroma', 'rind', 'texture', 'milk', 'vegan'])
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



# Remove Outlier Cheese

In [121]:
# Cheeses excluded from the outlier-free analysis.
cheese_to_remove = [
    'Leyden',
    'Boeren-Leidse met sleutels',
    'Brocciu',
    'Maribo',
    'Amul Pizza Mozzarella Cheese',
    'Manouri',
    'Prastost',
]

def remove_cheese(df=None, cheese_to_remove=[]):
    """Return the rows of ``df`` whose ``'cheese'`` value is not in ``cheese_to_remove``.

    The surviving rows keep their original index labels (the index is not
    reset), and ``df`` itself is left unchanged.
    """
    is_excluded = df['cheese'].isin(cheese_to_remove)
    return df.loc[~is_excluded]

# Drop the outlier cheeses from both the feature matrix and the readable table.
temp_cheese_df = remove_cheese(cheese_df, cheese_to_remove)
temp_df = remove_cheese(df, cheese_to_remove)

# Seeded so the projection and layout are repeatable between runs.
pca = PCA(n_components=30, random_state=42)
pca_removed_array = pca.fit_transform(temp_cheese_df[feature_list])
pca_removed_df = pd.DataFrame(pca_removed_array, index=temp_cheese_df.index)

umapReducer = umap.UMAP(n_components=2, random_state=42)
# The filtered frames keep their ORIGINAL row labels, while a bare
# pd.DataFrame(array) is labelled 0..n-1. pd.concat(axis=1) aligns on labels,
# so the embedding must carry temp_df's index; otherwise coordinates are
# attached to the wrong cheeses and NaN rows appear for unmatched labels.
umap_pca_removed_df = pd.DataFrame(umapReducer.fit_transform(pca_removed_df), index=temp_df.index)
umap_pca_removed_df = pd.concat([umap_pca_removed_df, temp_df], axis=1)
fig = px.scatter(umap_pca_removed_df, x=0, y=1, color='family', hover_name='cheese', hover_data=['color', 'country', 'aroma', 'rind', 'texture', 'milk', 'vegan'])
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [120]:
# 2-D UMAP embedding of the outlier-free feature matrix, coloured by milk type.
# UMAP is stochastic; without a fixed random_state the layout changes on every run.
umapReducer = umap.UMAP(n_components=2, random_state=42)
# temp_df keeps the original row labels after filtering, while a bare
# pd.DataFrame(array) is labelled 0..n-1. pd.concat(axis=1) aligns on labels,
# so the embedding must carry temp_df's index; otherwise coordinates are
# attached to the wrong cheeses and NaN rows appear for unmatched labels.
umap_removed_df = pd.DataFrame(umapReducer.fit_transform(temp_cheese_df[feature_list]), index=temp_df.index)
umap_removed_df = pd.concat([umap_removed_df, temp_df], axis=1)
fig = px.scatter(umap_removed_df, x=0, y=1, color='milk', hover_name='cheese', hover_data=['color', 'country', 'aroma', 'rind', 'texture', 'type', 'vegetarian', 'family', 'fat_content'])
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



# Clustering PCA and UMAP results without removing outliers

In [153]:
# Cluster the original feature matrix (no dimensionality reduction before clustering).
hdb = HDBSCAN(min_cluster_size=2)
hdb.fit(cheese_df[feature_list])

# PCA -> UMAP is used only to get 2-D coordinates for plotting the clusters.
# Both steps are seeded so the picture is repeatable between runs.
pca = PCA(n_components=50, random_state=42)
pca_array = pca.fit_transform(cheese_df[feature_list])
pca_df = pd.DataFrame(pca_array, index=cheese_df.index)
umapReducer = umap.UMAP(n_components=2, random_state=42)
# Build the embedding on `df`'s index so the column-wise concat pairs each
# point with its own cheese.
umap_pca_df = pd.DataFrame(umapReducer.fit_transform(pca_df), index=df.index)
umap_pca_df = pd.concat([umap_pca_df, df], axis=1)
# labels_ follows the row order of the fitted data; -1 marks noise points.
umap_pca_df['clusters'] = hdb.labels_
fig = px.scatter(umap_pca_df, x=0, y=1, color='clusters', hover_name='cheese', hover_data=['color', 'country', 'aroma', 'rind', 'texture', 'milk', 'vegan', 'clusters'])
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [150]:
# Cluster the 50 PCA components.
# PCA and UMAP are seeded so the clusters' inputs and the plot are repeatable.
pca = PCA(n_components=50, random_state=42)
pca_array = pca.fit_transform(cheese_df[feature_list])
pca_df = pd.DataFrame(pca_array, index=cheese_df.index)

hdb = HDBSCAN(min_cluster_size=3)
hdb.fit(pca_df)

# UMAP is used only to get 2-D coordinates for plotting the clusters.
umapReducer = umap.UMAP(n_components=2, random_state=42)
# Build the embedding on `df`'s index so the column-wise concat pairs each
# point with its own cheese.
umap_pca_df = pd.DataFrame(umapReducer.fit_transform(pca_df), index=df.index)
umap_pca_df = pd.concat([umap_pca_df, df], axis=1)
# labels_ follows the row order of the fitted data; -1 marks noise points.
umap_pca_df['clusters'] = hdb.labels_
fig = px.scatter(umap_pca_df, x=0, y=1, color='clusters', hover_name='cheese', hover_data=['color', 'country', 'aroma', 'rind', 'texture', 'milk', 'vegan', 'clusters'])
fig.show()


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.

