# Unsupervised Learning - Checking the swiss food dataset

This notebook is used to check the preprocessed swiss food dataset and some of its categories, how would they cluster. I used only the first 120 rows (1-120) and changed the categories according to the dictionary below.

In [75]:
import pandas as  pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## Load the data

In [76]:
dataset = pd.read_csv('../data/swiss_food_composition_database.csv')

In [77]:
dataset = dataset.iloc[1:121, :] # working only with the first 120 food samples
dataset

Unnamed: 0,f_ID,f_name,f_category,f_energy_kcal,f_fat_g,f_fatty_acids_sat_g,f_fatty_acids_monounsat_g,f_fatty_acids_polyunsat_g,f_cholesterol_mg,f_carbohydrates_g,...,f_potassium_mg,f_sodium_mg,f_chloride_mg,f_calcium_mg,f_magnesium_mg,f_phosphorus_mg,f_iron_mg,f_iodide_µg,f_zinc_mg,f_selenium_µg
1,10536,Agave syrup,Sweets/Sugar and sweeteners,293,0.0,0,n.d.,n.d.,n.d.,73.1,...,n.d.,4.0,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.
2,273,Almond,"Nuts, seeds and oleaginous fruit",624,52.1,4.1,31.4,11.4,0,7.8,...,740,1.1,40,270,240,510,3.3,0.2,3.3,2.2
3,278,"Almond, dry roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",637,52.5,4.1,33.1,13,0,10.1,...,710,230.0,1190,270,280,470,3.7,2.4,3.3,2
4,269,"Almond, roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",649,55.2,4.2,34.8,13.5,0,7.2,...,670,330.0,1190,240,270,470,3.3,2.4,3.1,2
5,13398,"Amaranth, seed, cooked (without addition of fa...","Cereal products, pulses and potatoes/Other cer...",119,2.1,0.4,0.5,0.8,0,18.1,...,150,1.9,36,58,84,180,3,0.8,1.2,n.d.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,825,"Bread for toasting, wholemeal","Bread, flakes and breakfast cereals/Bread and ...",271,4.9,0.5,1.9,2.3,0,43.7,...,270,510.0,770,21,87,240,3.4,1.8,2.3,3.3
117,842,Bread roll (semi white),"Bread, flakes and breakfast cereals/Bread and ...",215,1.0,0.2,0.2,0.5,0,41.5,...,120,770.0,1170,17,28,110,1.3,1.1,1,2.7
118,843,Bread roll from St. Gallen (semi white),"Bread, flakes and breakfast cereals/Bread and ...",231,1.0,0.1,0.1,0.5,0,44.6,...,140,840.0,1280,16,37,130,1.7,1.2,1.2,2.7
119,14038,Bread roll with chocolate,Sweets/Other sweet pastries,366,14.3,8.5,4.1,0.9,20,49.7,...,310,530.0,800,100,34,160,1.3,1.1,1.1,n.d.


## Add the new category column

In [80]:
new_categories_dict = { 'dairy':'dairy', 'non-alcoholic beverages':'non_alcoholic_beverages', 'alcoholic beverages':'alcoholic_beverages',
                        'sweet':'sweets', 'fruit':'fruits', 'herbs':'herbs', 'vegetable':'vegetables',
                        'cereal':'cereals', 'bread':'bread', 'sauces':'sauce', 'meat':'meat',
                        'nut':'nuts'}
keys = new_categories_dict.keys()
n_cols = len(dataset.columns)
dataset['f_category_new'] = np.zeros((dataset.shape[0], 1))

for i, value in enumerate(dataset['f_category']):
    for word in keys:
        if word in value.lower():
            dataset.iloc[i, n_cols] = new_categories_dict[word]
            break
        else:
            dataset.iloc[i, n_cols] = 'other'


dataset

Unnamed: 0,f_ID,f_name,f_category,f_energy_kcal,f_fat_g,f_fatty_acids_sat_g,f_fatty_acids_monounsat_g,f_fatty_acids_polyunsat_g,f_cholesterol_mg,f_carbohydrates_g,...,f_sodium_mg,f_chloride_mg,f_calcium_mg,f_magnesium_mg,f_phosphorus_mg,f_iron_mg,f_iodide_µg,f_zinc_mg,f_selenium_µg,f_category_new
1,10536,Agave syrup,Sweets/Sugar and sweeteners,293,0.0,0,n.d.,n.d.,n.d.,73.1,...,4.0,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,sweets
2,273,Almond,"Nuts, seeds and oleaginous fruit",624,52.1,4.1,31.4,11.4,0,7.8,...,1.1,40,270,240,510,3.3,0.2,3.3,2.2,fruits
3,278,"Almond, dry roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",637,52.5,4.1,33.1,13,0,10.1,...,230.0,1190,270,280,470,3.7,2.4,3.3,2,nuts
4,269,"Almond, roasted, salted","Savoury snacks/Salted nuts, seeds and kernels",649,55.2,4.2,34.8,13.5,0,7.2,...,330.0,1190,240,270,470,3.3,2.4,3.1,2,nuts
5,13398,"Amaranth, seed, cooked (without addition of fa...","Cereal products, pulses and potatoes/Other cer...",119,2.1,0.4,0.5,0.8,0,18.1,...,1.9,36,58,84,180,3,0.8,1.2,n.d.,cereals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,825,"Bread for toasting, wholemeal","Bread, flakes and breakfast cereals/Bread and ...",271,4.9,0.5,1.9,2.3,0,43.7,...,510.0,770,21,87,240,3.4,1.8,2.3,3.3,cereals
117,842,Bread roll (semi white),"Bread, flakes and breakfast cereals/Bread and ...",215,1.0,0.2,0.2,0.5,0,41.5,...,770.0,1170,17,28,110,1.3,1.1,1,2.7,cereals
118,843,Bread roll from St. Gallen (semi white),"Bread, flakes and breakfast cereals/Bread and ...",231,1.0,0.1,0.1,0.5,0,44.6,...,840.0,1280,16,37,130,1.7,1.2,1.2,2.7,cereals
119,14038,Bread roll with chocolate,Sweets/Other sweet pastries,366,14.3,8.5,4.1,0.9,20,49.7,...,530.0,800,100,34,160,1.3,1.1,1.1,n.d.,sweets


In [82]:
new_df = dataset.iloc[:, 2:]
new_df

Unnamed: 0,f_category,f_energy_kcal,f_fat_g,f_fatty_acids_sat_g,f_fatty_acids_monounsat_g,f_fatty_acids_polyunsat_g,f_cholesterol_mg,f_carbohydrates_g,f_sugars_g,f_starch_g,...,f_sodium_mg,f_chloride_mg,f_calcium_mg,f_magnesium_mg,f_phosphorus_mg,f_iron_mg,f_iodide_µg,f_zinc_mg,f_selenium_µg,f_category_new
1,Sweets/Sugar and sweeteners,293,0.0,0,n.d.,n.d.,n.d.,73.1,n.d.,n.d.,...,4.0,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,sweets
2,"Nuts, seeds and oleaginous fruit",624,52.1,4.1,31.4,11.4,0,7.8,6.6,0.6,...,1.1,40,270,240,510,3.3,0.2,3.3,2.2,fruits
3,"Savoury snacks/Salted nuts, seeds and kernels",637,52.5,4.1,33.1,13,0,10.1,4.9,0.7,...,230.0,1190,270,280,470,3.7,2.4,3.3,2,nuts
4,"Savoury snacks/Salted nuts, seeds and kernels",649,55.2,4.2,34.8,13.5,0,7.2,4.6,0.6,...,330.0,1190,240,270,470,3.3,2.4,3.1,2,nuts
5,"Cereal products, pulses and potatoes/Other cer...",119,2.1,0.4,0.5,0.8,0,18.1,0.5,17.6,...,1.9,36,58,84,180,3,0.8,1.2,n.d.,cereals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,"Bread, flakes and breakfast cereals/Bread and ...",271,4.9,0.5,1.9,2.3,0,43.7,4.2,39.5,...,510.0,770,21,87,240,3.4,1.8,2.3,3.3,cereals
117,"Bread, flakes and breakfast cereals/Bread and ...",215,1.0,0.2,0.2,0.5,0,41.5,0.4,40.9,...,770.0,1170,17,28,110,1.3,1.1,1,2.7,cereals
118,"Bread, flakes and breakfast cereals/Bread and ...",231,1.0,0.1,0.1,0.5,0,44.6,0.3,43.9,...,840.0,1280,16,37,130,1.7,1.2,1.2,2.7,cereals
119,Sweets/Other sweet pastries,366,14.3,8.5,4.1,0.9,20,49.7,16.3,32.6,...,530.0,800,100,34,160,1.3,1.1,1.1,n.d.,sweets


In [84]:
new_df.to_csv('new_df.csv', index=False)

In [88]:
new_df.replace('n.d.', np.nan, inplace=True)

In [90]:
new_df.mean()

  new_df.mean()


f_energy_kcal        186.366667
f_fat_g                8.051667
f_carbohydrates_g     16.213333
f_protein_g           10.742500
f_salt_g               0.682500
f_sodium_mg          276.046669
dtype: float64

In [89]:
new_df.fillna()

Unnamed: 0,f_category,f_energy_kcal,f_fat_g,f_fatty_acids_sat_g,f_fatty_acids_monounsat_g,f_fatty_acids_polyunsat_g,f_cholesterol_mg,f_carbohydrates_g,f_sugars_g,f_starch_g,...,f_sodium_mg,f_chloride_mg,f_calcium_mg,f_magnesium_mg,f_phosphorus_mg,f_iron_mg,f_iodide_µg,f_zinc_mg,f_selenium_µg,f_category_new
1,Sweets/Sugar and sweeteners,293,0.0,0,,,,73.1,,,...,4.0,,,,,,,,,sweets
2,"Nuts, seeds and oleaginous fruit",624,52.1,4.1,31.4,11.4,0,7.8,6.6,0.6,...,1.1,40,270,240,510,3.3,0.2,3.3,2.2,fruits
3,"Savoury snacks/Salted nuts, seeds and kernels",637,52.5,4.1,33.1,13,0,10.1,4.9,0.7,...,230.0,1190,270,280,470,3.7,2.4,3.3,2,nuts
4,"Savoury snacks/Salted nuts, seeds and kernels",649,55.2,4.2,34.8,13.5,0,7.2,4.6,0.6,...,330.0,1190,240,270,470,3.3,2.4,3.1,2,nuts
5,"Cereal products, pulses and potatoes/Other cer...",119,2.1,0.4,0.5,0.8,0,18.1,0.5,17.6,...,1.9,36,58,84,180,3,0.8,1.2,,cereals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,"Bread, flakes and breakfast cereals/Bread and ...",271,4.9,0.5,1.9,2.3,0,43.7,4.2,39.5,...,510.0,770,21,87,240,3.4,1.8,2.3,3.3,cereals
117,"Bread, flakes and breakfast cereals/Bread and ...",215,1.0,0.2,0.2,0.5,0,41.5,0.4,40.9,...,770.0,1170,17,28,110,1.3,1.1,1,2.7,cereals
118,"Bread, flakes and breakfast cereals/Bread and ...",231,1.0,0.1,0.1,0.5,0,44.6,0.3,43.9,...,840.0,1280,16,37,130,1.7,1.2,1.2,2.7,cereals
119,Sweets/Other sweet pastries,366,14.3,8.5,4.1,0.9,20,49.7,16.3,32.6,...,530.0,800,100,34,160,1.3,1.1,1.1,,sweets


In [102]:
new_df.iloc[:, 1:-1].mean(numeric_only=False)

TypeError: can only concatenate str (not "int") to str