# Supervised Learning - Checking the Swiss Food dataset

This notebook experiments with the models of the supervised learning module on two datasets: the Swiss Food Composition dataset (classification) and the Chocolate Bar Ratings dataset (regression). It gives a flavour of the visualizations that we are going to use in the corresponding module.

In [1]:
import pandas as  pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, auc, roc_curve
import ipywidgets as widgets
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

## Classification

### Load the data

In [2]:
# Forward slashes work on every OS. The previous '..\data\...' literal only resolved
# on Windows and relied on '\d' and '\s' being tolerated as invalid escape sequences.
dataset = pd.read_csv('../data/swiss_food_composition_proc.csv', index_col=0)
dataset.head()

Unnamed: 0_level_0,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,sugars_g,...,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg,split
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Almond,fruits,2.383884,2.367211,0.007568,3.747869,1.491113,-0.536758,-0.392082,0.004006,...,1.588591,-0.159777,-0.14816,1.295299,4.040213,2.312243,0.749351,-0.106408,1.315695,train
3,"Almond, dry roasted, salted",nuts,2.458504,2.390231,0.007568,3.980279,1.745907,-0.536758,-0.285661,-0.117252,...,1.488645,-0.083438,0.108003,1.295299,4.816868,2.059374,0.906205,-0.088162,1.315695,train
4,"Almond, roasted, salted",nuts,2.527384,2.545613,0.020843,4.212688,1.82553,-0.536758,-0.419844,-0.13865,...,1.355384,-0.050088,0.108003,1.098111,4.622704,2.059374,0.749351,-0.088162,1.185016,test
5,"Amaranth, seed, cooked (without addition of fa...",cereals,-0.51482,-0.510234,-0.483593,-0.476516,-0.1969,-0.536758,0.084497,-0.431096,...,-0.377014,-0.159511,-0.149051,-0.098161,1.01126,0.226077,0.631711,-0.101432,-0.05644,test
6,"Amaranth, seed, raw",cereals,0.960362,-0.228244,-0.337572,-0.312462,0.121593,-0.536758,1.875138,-0.345502,...,0.722392,-0.15881,-0.132568,0.572278,4.234377,2.628328,2.984515,-0.087333,1.577054,train


In [3]:
# Number of rows per food category (shows how imbalanced the classes are).
dataset["category"].value_counts()

other                      228
meat                       187
cereals                    146
vegetables                 127
sweets                     118
dairy                      103
fruits                      99
non_alcoholic_beverages     28
alcoholic_beverages         22
sauce                       16
herbs                        7
nuts                         3
Name: category, dtype: int64

### Pre-processing

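The only pre-processing needed here is encoding the labels: for each of the categories meat, cereals and vegetables we build a binary one-vs-rest target, which is then one-hot encoded.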

In [4]:
labels = dataset['category']
# One-vs-rest targets: 1 where the row belongs to the category, 0 everywhere else.
labels_meat = labels.eq('meat').astype('int64')
labels_cereals = labels.eq('cereals').astype('int64')
labels_vegetables = labels.eq('vegetables').astype('int64')
labels_meat.shape

(1084,)

In [5]:
def prepare_dataset(labels:pd.Series, original_dataset:pd.DataFrame):
    """One-hot encode ``labels``, append them to ``original_dataset`` and split by ``split``.

    Args:
        labels: One class label per row (Series or array-like), in the same row
            order as ``original_dataset``.
        original_dataset: Frame to extend. It must contain a ``split`` column;
            rows marked ``'train'`` and ``'test'`` are returned separately.

    Returns:
        ``(train_data, test_data)``: the input frame with the one-hot columns
        appended (named 0, 1, ... in sorted label order), restricted to the
        rows of the respective split.
    """
    onehot_encoder = OneHotEncoder()
    # The encoder expects a 2-D column; np.asarray accepts Series and plain arrays alike.
    labels_reshaped = np.asarray(labels).reshape(-1, 1)
    onehot_encoded_labels = onehot_encoder.fit_transform(labels_reshaped).toarray()
    # Re-use the input index so the concat below aligns row by row.
    encoded_df = pd.DataFrame(onehot_encoded_labels, index=original_dataset.index)
    # Extend the frame that was passed in; the previous version silently used the
    # notebook-level `dataset` and ignored the `original_dataset` argument.
    new_dataset = pd.concat([original_dataset, encoded_df], axis=1)
    train_data = new_dataset[new_dataset['split'] == 'train']
    test_data = new_dataset[new_dataset['split'] == 'test']
    return train_data, test_data

In [6]:
# One train/test pair per one-vs-rest target.
meat_train_set, meat_test_set = prepare_dataset(
    labels=labels_meat, original_dataset=dataset)
cereals_train_set, cereals_test_set = prepare_dataset(
    labels=labels_cereals, original_dataset=dataset)
vegetables_train_set, vegetables_test_set = prepare_dataset(
    labels=labels_vegetables, original_dataset=dataset)

In [7]:
# Sanity check: (rows, columns) of the test split.
meat_test_set.shape

(217, 43)

In [8]:
# Sanity check: (rows, columns) of the train split.
meat_train_set.shape

(867, 43)

In [9]:
# Column index of the prepared frame; the feature and label columns are
# selected from it by position in the next cell.
cols = meat_test_set.columns
cols

Index([                     'name',                  'category',
                     'energy_kcal',                     'fat_g',
               'fatty_acids_sat_g',   'fatty_acids_monounsat_g',
         'fatty_acids_polyunsat_g',            'cholesterol_mg',
                 'carbohydrates_g',                  'sugars_g',
                        'starch_g',                  'fibres_g',
                       'protein_g',                    'salt_g',
                       'alcohol_g',                   'water_g',
            'vit_A_activity_re_µg',     'vit_A_activity_rae_µg',
                      'retinol_µg', 'beta_carotene_activity_µg',
                'beta_carotene_µg',                 'vit_B1_mg',
                       'vit_B2_mg',                 'vit_B6_mg',
                      'vit_B12_µg',                 'niacin_mg',
                       'folate_µg',       'panthotenic_acid_mg',
                        'vit_c_mg',                  'vit_d_µg',
               'vit_e_act

In [10]:
# Positional layout of `cols`: 'name' and 'category' come first, then the
# nutrient features, then 'split' followed by the two one-hot label columns.
feature_cols = cols[2:-3]   # drop the 2 leading and the 3 trailing columns
labels_cols = cols[-2:]     # the two one-hot label columns
print(feature_cols, labels_cols, sep="\n")

Index(['energy_kcal', 'fat_g', 'fatty_acids_sat_g', 'fatty_acids_monounsat_g',
       'fatty_acids_polyunsat_g', 'cholesterol_mg', 'carbohydrates_g',
       'sugars_g', 'starch_g', 'fibres_g', 'protein_g', 'salt_g', 'alcohol_g',
       'water_g', 'vit_A_activity_re_µg', 'vit_A_activity_rae_µg',
       'retinol_µg', 'beta_carotene_activity_µg', 'beta_carotene_µg',
       'vit_B1_mg', 'vit_B2_mg', 'vit_B6_mg', 'vit_B12_µg', 'niacin_mg',
       'folate_µg', 'panthotenic_acid_mg', 'vit_c_mg', 'vit_d_µg',
       'vit_e_activity_mg', 'potassium_mg', 'sodium_mg', 'chloride_mg',
       'calcium_mg', 'magnesium_mg', 'phosphorus_mg', 'iron_mg', 'iodide_µg',
       'zinc_mg'],
      dtype='object')
Index([0, 1], dtype='object')


### Models

Trying only Random Forest.

In [11]:
def run_random_forest_classifier(n_estimators, max_depth, 
                                   train_data_features:np.array, train_data_labels:np.array,
                                   test_data_features:np.array, test_data_labels:np.array, 
                                   sup_title:str, roc_title:str, prc_title:str):
    """Fit a random forest and report its train/test performance.

    The classifier is trained on the training features/labels with a fixed
    ``random_state`` of 0. Its label predictions (``predict``, not
    probabilities) for both splits are passed, together with the true labels
    converted to NumPy arrays, to ``run_auc_roc_prc``, which prints the
    metrics and draws the curves.

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
        train_data_features: Feature matrix used for fitting.
        train_data_labels: Labels matching ``train_data_features``.
        test_data_features: Feature matrix used for evaluation.
        test_data_labels: Labels matching ``test_data_features``.
        sup_title: Title of the whole figure.
        roc_title: Title of the ROC panel.
        prc_title: Title of the precision-recall panel.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=0,
    ).fit(train_data_features, train_data_labels)

    test_predictions = forest.predict(test_data_features)
    train_predictions = forest.predict(train_data_features)

    run_auc_roc_prc(
        y_train=np.array(train_data_labels),
        y_train_pred=train_predictions,
        y_test=np.array(test_data_labels),
        y_test_pred=test_predictions,
        sup_title=sup_title,
        roc_title=roc_title,
        prc_title=prc_title,
    )

def _plot_train_test_curves(ax, x_values, y_values, areas, xlabel, ylabel, title, legend_loc):
    """Draw the test and the train curve of one metric on ``ax`` and decorate the axes."""
    for split, color in (("test", "deeppink"), ("train", "navy")):
        ax.plot(
            x_values[split],
            y_values[split],
            label=f"{split.capitalize()}-AUC = {areas[split]:.2f}",
            color=color,
            linestyle="-",
            linewidth=2,
        )
    ax.axis("square")
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_ylim(-0.05, 1.05)
    ax.set_xlim(-0.05, 1.05)
    ax.set_title(title)
    ax.legend(loc=legend_loc, bbox_to_anchor=(1, 0.5))


def run_auc_roc_prc(y_train:np.array, y_train_pred:np.array, 
                y_test:np.array, y_test_pred:np.array, sup_title:str,
                roc_title:str, prc_title:str):
    """Print accuracy/F1 and plot train-vs-test ROC and precision-recall curves.

    All four arrays are expected to be 2-D with two one-hot columns. For the
    curves each array is flattened, so both columns are pooled. For accuracy
    and F1 every row is collapsed to one binary label: 1 when the row equals
    ``[0, 1]``, 0 otherwise.

    Note that the curves are computed from whatever is passed as ``*_pred``;
    when these are hard 0/1 predictions rather than scores, each curve has
    only a few distinct points.

    Args:
        y_train: True one-hot labels of the training split.
        y_train_pred: Predictions for the training split.
        y_test: True one-hot labels of the test split.
        y_test_pred: Predictions for the test split.
        sup_title: Title of the whole figure.
        roc_title: Title of the ROC panel (left).
        prc_title: Title of the precision-recall panel (right).
    """
    splits = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )

    # Curve coordinates and areas under the curves, keyed by "train" / "test".
    fpr, tpr, roc_auc, prc, rec, prc_auc = dict(), dict(), dict(), dict(), dict(), dict()

    #-----------------------------------------------------------------------------------#
    # Calculate the ROC and PRC curves
    #-----------------------------------------------------------------------------------#
    for split, y_true, y_pred in splits:
        flat_true, flat_pred = y_true.ravel(), y_pred.ravel()

        fpr[split], tpr[split], _ = roc_curve(flat_true, flat_pred)
        roc_auc[split] = auc(fpr[split], tpr[split])

        prc[split], rec[split], _ = precision_recall_curve(flat_true, flat_pred)
        prc_auc[split] = auc(rec[split], prc[split])

    #-----------------------------------------------------------------------------------#
    # Print accuracy and F1 on the collapsed binary labels
    #-----------------------------------------------------------------------------------#
    for split, y_true, y_pred in splits:
        true_binary = np.all(y_true == [0, 1], axis=1).astype(int)
        pred_binary = np.all(y_pred == [0, 1], axis=1).astype(int)
        split_name = split.capitalize()
        print(f"Accuracy ({split_name}): {accuracy_score(y_true=true_binary, y_pred=pred_binary)*100:.2f}%")
        print(f"F1-score ({split_name}): {f1_score(y_true=true_binary, y_pred=pred_binary):.2f}")

    #-----------------------------------------------------------------------------------#
    # Display curves here
    #-----------------------------------------------------------------------------------#
    _, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

    _plot_train_test_curves(
        axes[0], fpr, tpr, roc_auc,
        xlabel="False Positive Rate", ylabel="True Positive Rate",
        title=roc_title, legend_loc='upper right',
    )
    # The precision-recall panel plots recall on x and precision on y; it was
    # previously labelled with the ROC axis names.
    _plot_train_test_curves(
        axes[1], rec, prc, prc_auc,
        xlabel="Recall", ylabel="Precision",
        title=prc_title, legend_loc='upper left',
    )

    plt.suptitle(sup_title)
    plt.show()

In [12]:
#----------------- wrapper & helper functions are here ----------------------#
def get_plot_titles(category:str, max_depth:int, n_estimators:int):
    """Build the figure title and the two panel titles for a classification run.

    Args:
        category: Name of the category shown in the figure title.
        max_depth: Tree depth shown in the figure title.
        n_estimators: Number of trees shown in the figure title.

    Returns:
        Tuple ``(sup_title, title_roc, title_prc)``.
    """
    sup_title = 'category={}: max_depth={} and n_estimators={}'.format(
        category, max_depth, n_estimators)
    return sup_title, 'ROC - curve', 'PRC - curve'

def wrapper_function_meat(n_estimators:int, max_depth:int):
    """Train and evaluate the meat-vs-rest random forest.

    Reads the notebook-level ``meat_train_set`` / ``meat_test_set`` frames and
    the ``feature_cols`` / ``labels_cols`` column selections, then delegates to
    ``run_random_forest_classifier``.

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
    """
    # Pass the hyper-parameters by keyword: get_plot_titles is declared as
    # (category, max_depth, n_estimators), so the previous positional call
    # swapped the two values in the figure title.
    sup_title, title_roc, title_prc = get_plot_titles(
        'Meat', max_depth=max_depth, n_estimators=n_estimators)
    run_random_forest_classifier(n_estimators, max_depth,
                                 meat_train_set[feature_cols],
                                 meat_train_set[labels_cols],
                                 meat_test_set[feature_cols],
                                 meat_test_set[labels_cols],
                                 sup_title, title_roc, title_prc)

def wrapper_function_cereals(n_estimators:int, max_depth:int):
    """Train and evaluate the cereals-vs-rest random forest.

    Reads the notebook-level ``cereals_train_set`` / ``cereals_test_set`` frames
    and the ``feature_cols`` / ``labels_cols`` column selections, then delegates
    to ``run_random_forest_classifier``.

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
    """
    # Pass the hyper-parameters by keyword: get_plot_titles is declared as
    # (category, max_depth, n_estimators), so the previous positional call
    # swapped the two values in the figure title.
    sup_title, title_roc, title_prc = get_plot_titles(
        'Cereals', max_depth=max_depth, n_estimators=n_estimators)
    run_random_forest_classifier(n_estimators, max_depth,
                                 cereals_train_set[feature_cols],
                                 cereals_train_set[labels_cols],
                                 cereals_test_set[feature_cols],
                                 cereals_test_set[labels_cols],
                                 sup_title, title_roc, title_prc)

def wrapper_function_vegetables(n_estimators:int, max_depth:int):
    """Train and evaluate the vegetables-vs-rest random forest.

    Reads the notebook-level ``vegetables_train_set`` / ``vegetables_test_set``
    frames and the ``feature_cols`` / ``labels_cols`` column selections, then
    delegates to ``run_random_forest_classifier``.

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
    """
    # Pass the hyper-parameters by keyword: get_plot_titles is declared as
    # (category, max_depth, n_estimators), so the previous positional call
    # swapped the two values in the figure title.
    sup_title, title_roc, title_prc = get_plot_titles(
        'Vegetables', max_depth=max_depth, n_estimators=n_estimators)
    run_random_forest_classifier(n_estimators, max_depth,
                                 vegetables_train_set[feature_cols],
                                 vegetables_train_set[labels_cols],
                                 vegetables_test_set[feature_cols],
                                 vegetables_test_set[labels_cols],
                                 sup_title, title_roc, title_prc)

In [13]:
# Sliders for the two hyper-parameters. The same slider objects are also
# handed to the cereals and vegetables cells below.
n_estimators_slider = widgets.IntSlider(
    value=30, min=10, max=1000, step=5,
    description='n_estimators:',
)
max_depth_slider = widgets.IntSlider(
    value=10, min=1, max=20, step=1,
    description='max_depth:',
)
# interact_manual only runs the wrapper when its button is pressed.
widgets.interact_manual(
    wrapper_function_meat,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

In [14]:
# Cereals-vs-rest model, driven by the sliders created for the meat model.
widgets.interact_manual(
    wrapper_function_cereals,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

In [15]:
# Vegetables-vs-rest model, driven by the sliders created for the meat model.
widgets.interact_manual(
    wrapper_function_vegetables,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

## Regression

The source of the dataset can be found [here](https://www.kaggle.com/datasets/evangower/chocolate-bar-ratings).

In [16]:
# Raw chocolate bar ratings; the path is relative to the notebook's directory.
chocolate_dataset = pd.read_csv("../data/chocolate_bars.csv")
chocolate_dataset.head()

Unnamed: 0,id,manufacturer,company_location,year_reviewed,bean_origin,bar_name,cocoa_percent,num_ingredients,ingredients,review,rating
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,3.0,"B,S,C","rich cocoa, fatty, bready",3.25
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,3.0,"B,S,C","cocoa, vegetal, savory",3.5
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,3.0,"B,S,C","cocoa, blackberry, full body",3.75
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68.0,3.0,"B,S,C","chewy, off, rubbery",3.0
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72.0,3.0,"B,S,C","fatty, earthy, moss, nutty,chalky",3.0


We will see if we can predict the rating based on the cocoa percentage and number of ingredients.

In [17]:
# Keep the two predictors and the target (last column). `.copy()` makes this an
# independent frame, so the in-place `.iloc[...] = ...` imputation that follows
# does not write into a slice of the raw data (pandas' SettingWithCopyWarning).
chocolate_dataset = chocolate_dataset[['cocoa_percent','num_ingredients', 'rating']].copy()
chocolate_dataset.head()

Unnamed: 0,cocoa_percent,num_ingredients,rating
0,76.0,3.0,3.25
1,76.0,3.0,3.5
2,76.0,3.0,3.75
3,68.0,3.0,3.0
4,72.0,3.0,3.0


### Impute Missing Values

Mean imputation.

In [18]:
# Replace NaNs in every column except the last one (the target) by the column mean.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split below, so test rows contribute to the imputed values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
chocolate_dataset.iloc[:, :-1] = imputer.fit_transform(chocolate_dataset.iloc[:, :-1])

### Train-Test split

In [19]:
# Hold out 20% of the rows for testing; random_state=0 keeps the split reproducible.
train_set, test_set = train_test_split(chocolate_dataset, test_size=0.2, random_state=0)

### Standardization

In [20]:
# Use a dedicated name instead of re-binding `cols`: the classification section
# uses `cols` for the food-dataset columns, and overwriting it here made that
# section's column-selection cell fail when re-run after this one.
chocolate_feature_cols = chocolate_dataset.columns[:-1]
standard_scaler = StandardScaler()
# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
train_set_numerical_st = standard_scaler.fit_transform(train_set[chocolate_feature_cols])
test_set_numerical_st = standard_scaler.transform(test_set[chocolate_feature_cols])

### Model

In [21]:
def run_random_forest_regression(n_estimators:int, max_depth:int):
    """Fit a random forest regressor on the chocolate data and plot its predictions.

    Uses the notebook-level standardized feature matrices
    (``train_set_numerical_st`` / ``test_set_numerical_st``) and takes the
    target from the last column of ``train_set`` / ``test_set``. The results
    are handed to ``visualize_results``.

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
    """
    y_train_true = train_set.iloc[:, -1]
    y_test_true = test_set.iloc[:, -1]

    regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=0,
    )
    regressor.fit(train_set_numerical_st, y_train_true)

    train_predictions = regressor.predict(train_set_numerical_st)
    test_predictions = regressor.predict(test_set_numerical_st)

    visualize_results(
        y_train_true, train_predictions,
        y_test_true, test_predictions,
        f"Random Forest Regression - n_estimators={n_estimators} and max_depth={max_depth}",
    )

def visualize_results(y_train_true:np.array, y_train_pred:np.array, 
                      y_test_true:np.array, y_test_pred:np.array, title:str):
    """Print train/test MSE and plot predicted against true ratings for both splits.

    Each panel shows a scatter of predictions over true values together with a
    red ``y=x`` reference line drawn over that panel's own true values.

    Args:
        y_train_true: True target values of the training split.
        y_train_pred: Predictions for the training split.
        y_test_true: True target values of the test split.
        y_test_pred: Predictions for the test split.
        title: Title of the whole figure.
    """
    print(f"MSE (Train) = {mean_squared_error(y_train_true, y_train_pred)}")
    print(f"MSE (Test) = {mean_squared_error(y_test_true, y_test_pred)}")

    _, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,4))

    panels = (
        (axes[0], y_train_true, y_train_pred, "Predicted vs Actual - Train Set"),
        (axes[1], y_test_true, y_test_pred, "Predicted vs Actual - Test Set"),
    )
    for ax, y_true, y_pred, panel_title in panels:
        ax.scatter(x=y_true, y=y_pred, s=4)
        # Reference diagonal over this panel's own values; the test panel used
        # to draw it from the training values.
        ax.plot(y_true, y_true, color='red', label='y=x')
        ax.set_xlabel("True Rating")
        # The target is a rating (regression), not a class label.
        ax.set_ylabel("Predicted Rating")
        ax.set_title(panel_title)
        ax.legend()

    plt.suptitle(title)
    # Render explicitly, as the classification plots do.
    plt.show()

In [22]:
# Fresh slider instances for the regression model (same ranges as for classification).
n_estimators_slider = widgets.IntSlider(
    value=30, min=10, max=1000, step=5,
    description='n_estimators:',
)
max_depth_slider = widgets.IntSlider(
    value=10, min=1, max=20, step=1,
    description='max_depth:',
)
# interact_manual only runs the regression when its button is pressed.
widgets.interact_manual(
    run_random_forest_regression,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

Link for [one vs all](https://towardsdatascience.com/multiclass-classification-evaluation-with-roc-curves-and-roc-auc-294fd4617e3a).