# Supervised Learning - Checking the Swiss Food dataset

This notebook experiments with the models from the supervised learning module on two datasets: the Swiss Food Composition dataset (classification) and the Chocolate Bar Ratings dataset (regression). It essentially gives a flavour of the visualizations that we are going to use in the corresponding module.

In [2]:
import os
import pandas as  pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score
import ipywidgets as widgets
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

## Classification

### Load the data

In [3]:
# Load the processed Swiss food composition table; the first CSV column is used as the row index.
# NOTE(review): the feature values look standardized already (presumably scaled in an earlier
# pre-processing step) and the `split` column carries a pre-assigned train/test flag - confirm.
filepath = os.path.join("..","data", "swiss_food_composition_proc.csv")
dataset = pd.read_csv(filepath, index_col=0)
dataset.head()

Unnamed: 0_level_0,name,category,energy_kcal,fat_g,fatty_acids_sat_g,fatty_acids_monounsat_g,fatty_acids_polyunsat_g,cholesterol_mg,carbohydrates_g,sugars_g,...,potassium_mg,sodium_mg,chloride_mg,calcium_mg,magnesium_mg,phosphorus_mg,iron_mg,iodide_µg,zinc_mg,split
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Almond,fruits,2.383884,2.367211,0.007568,3.747869,1.491113,-0.536758,-0.392082,0.004006,...,1.588591,-0.159777,-0.14816,1.295299,4.040213,2.312243,0.749351,-0.106408,1.315695,train
3,"Almond, dry roasted, salted",nuts,2.458504,2.390231,0.007568,3.980279,1.745907,-0.536758,-0.285661,-0.117252,...,1.488645,-0.083438,0.108003,1.295299,4.816868,2.059374,0.906205,-0.088162,1.315695,train
4,"Almond, roasted, salted",nuts,2.527384,2.545613,0.020843,4.212688,1.82553,-0.536758,-0.419844,-0.13865,...,1.355384,-0.050088,0.108003,1.098111,4.622704,2.059374,0.749351,-0.088162,1.185016,test
5,"Amaranth, seed, cooked (without addition of fa...",cereals,-0.51482,-0.510234,-0.483593,-0.476516,-0.1969,-0.536758,0.084497,-0.431096,...,-0.377014,-0.159511,-0.149051,-0.098161,1.01126,0.226077,0.631711,-0.101432,-0.05644,test
6,"Amaranth, seed, raw",cereals,0.960362,-0.228244,-0.337572,-0.312462,0.121593,-0.536758,1.875138,-0.345502,...,0.722392,-0.15881,-0.132568,0.572278,4.234377,2.628328,2.984515,-0.087333,1.577054,train


In [4]:
# Count rows per food category to see how balanced the classes are before choosing targets.
dataset["category"].value_counts()

other                      228
meat                       187
cereals                    146
vegetables                 127
sweets                     118
dairy                      103
fruits                      99
non_alcoholic_beverages     28
alcoholic_beverages         22
sauce                       16
herbs                        7
nuts                         3
Name: category, dtype: int64

### Pre-processing

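The only pre-processing done here is label encoding: for a chosen category, the `category` column is converted into a binary label (1 for that category, 0 for all the others).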

In [5]:
def encode_label_category(label:str, label_column:pd.Series):
    '''
    Turn a categorical column into a binary (one-vs-rest) indicator column.

    Every row whose value equals `label` becomes 1; every other row becomes 0.
    The index and the name of `label_column` are carried over to the result.

    Args:
        - label: the category that will be encoded with 1
        - label_column: the column that will be transformed

    Return:
    A new column holding the 0/1 encoding (the input column is not modified).
    '''
    def _indicator(value):
        # Same comparison as a plain equality check against the target label.
        if value == label:
            return 1
        return 0

    return label_column.apply(_indicator)


def prepare_dataset(label:str, original_dataset:pd.DataFrame):
    '''
    Prepare the original dataset for binary (one-vs-rest) classification of a
    single food category.

    The `category` column is encoded into 0/1 labels for `label`, the rows are
    divided into train and test sets based on the `split` column (the split
    from Module 1), and features and labels are returned as separate objects.

    `original_dataset` is left untouched: the binary labels are kept in a
    separate Series instead of being written back as a `category_binary`
    column, so calling this function repeatedly (e.g. from a widget callback)
    never changes the caller's frame.

    Args:
        - label: the category that will be classified (a value of the
          "category" column in the dataset)
        - original_dataset: the dataset; must contain the `category` and
          `split` columns

    Returns:
        - train_features, test_features, train_labels, test_labels
          The label Series are named `category_binary`.
    '''
    # Encode on a separate Series; renaming returns a new object, so neither the
    # input frame nor its `category` column is altered.
    binary_labels = encode_label_category(
        label=label, label_column=original_dataset['category']
    ).rename('category_binary')

    is_train = original_dataset['split'] == 'train'
    is_test = original_dataset['split'] == 'test'

    # `category_binary` stays in the exclusion list so a frame that already carries
    # that column (e.g. from an older run) never leaks the target into the features.
    # Index.difference also returns the remaining column names in sorted order.
    not_needed_cols = ['name', 'category', 'split', 'category_binary']
    feature_cols = original_dataset.columns.difference(not_needed_cols)

    return original_dataset.loc[is_train, feature_cols], \
        original_dataset.loc[is_test, feature_cols], \
        binary_labels[is_train], binary_labels[is_test]

In [6]:
# Build one binary (one-vs-rest) train/test split per target category.
(
    meat_train_set_features,
    meat_test_set_features,
    meat_train_set_labels,
    meat_test_set_labels,
) = prepare_dataset("meat", dataset)

(
    cereals_train_set_features,
    cereals_test_set_features,
    cereals_train_set_labels,
    cereals_test_set_labels,
) = prepare_dataset("cereals", dataset)

(
    vegetables_train_set_features,
    vegetables_test_set_features,
    vegetables_train_set_labels,
    vegetables_test_set_labels,
) = prepare_dataset("vegetables", dataset)

In [7]:
# Sanity check: the feature frame should hold only the nutrient columns
# (name, category, split and the binary label are excluded by prepare_dataset).
meat_train_set_features.head()

Unnamed: 0_level_0,alcohol_g,beta_carotene_activity_µg,beta_carotene_µg,calcium_mg,carbohydrates_g,chloride_mg,cholesterol_mg,energy_kcal,fat_g,fatty_acids_monounsat_g,...,vit_A_activity_re_µg,vit_B12_µg,vit_B1_mg,vit_B2_mg,vit_B6_mg,vit_c_mg,vit_d_µg,vit_e_activity_mg,water_g,zinc_mg
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.122655,-0.164054,-0.160407,1.295299,-0.392082,-0.14816,-0.536758,2.383884,2.367211,3.747869,...,-0.182957,-0.219544,0.010923,2.814479,-0.167012,-0.399327,-0.346081,4.417923,-1.954732,1.315695
3,-0.122655,-0.163317,-0.159667,1.295299,-0.285661,0.108003,-0.536758,2.458504,2.390231,3.980279,...,-0.182957,-0.219544,-0.146968,3.746767,-0.125736,-0.42604,-0.346081,3.862716,-1.986216,1.315695
6,-0.122655,-0.164054,-0.160407,0.572278,1.875138,-0.132568,-0.536758,0.960362,-0.228244,-0.312462,...,-0.182957,-0.219544,-0.146968,0.125184,1.48402,-0.201648,-0.346081,-0.07752,-1.681869,1.577054
7,-0.122655,-0.164054,-0.160407,-0.025858,2.795907,-0.146601,-0.536758,1.465483,0.117049,0.521478,...,-0.182957,-0.20141,-0.237192,0.555472,-0.538494,-0.420698,-0.346081,0.885417,-2.070174,-0.317799
8,-0.122655,-0.154476,-0.150793,0.966653,-0.743732,1.179435,0.285864,0.363401,0.594705,1.341747,...,-0.164188,3.842408,-0.259748,0.125184,0.452125,-0.40467,0.64298,0.67201,-0.016704,0.0089


In [8]:
# Sanity check: labels are 0/1, with 1 marking rows whose category is "meat".
meat_train_set_labels.head()

ID
2    0
3    0
6    0
7    0
8    0
Name: category_binary, dtype: int64

### Models

Trying only Random Forest.

In [9]:
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay, classification_report, confusion_matrix

def get_plot_titles(category:str, max_depth:int, n_estimators:int):
    '''
    Build the figure-level title describing one classifier run.

    Args:
        - category: display name of the category being classified
        - max_depth: maximum tree depth used by the forest
        - n_estimators: number of trees used by the forest

    Return:
    The formatted title string.
    '''
    return f'category={category}: max_depth={max_depth} and n_estimators={n_estimators}'

def _run_category_classifier(label:str, display_name:str, n_estimators:int, max_depth:int):
    '''
    Shared implementation behind the per-category widget callbacks.

    Builds the binary train/test split for `label` from the global `dataset`,
    composes the figure title and then trains and evaluates a random forest.

    Args:
        - label: value of the "category" column to classify against the rest
        - display_name: human-readable category name shown in the figure title
        - n_estimators: number of trees in the forest
        - max_depth: maximum depth of each tree
    '''
    train_data_features, test_data_features, \
        train_data_labels, test_data_labels \
            = prepare_dataset(label=label, original_dataset=dataset)

    sup_title = get_plot_titles(display_name, n_estimators=n_estimators, max_depth=max_depth)

    run_random_forest_classifier_2(n_estimators, max_depth, train_data_features, train_data_labels,
                                   test_data_features, test_data_labels, sup_title)


# The public wrappers keep the exact (n_estimators, max_depth) signature because
# ipywidgets maps the slider keyword arguments onto these parameter names.

def wrapper_function_meat_2(n_estimators:int, max_depth:int):
    '''Widget callback: train/evaluate the "meat" vs. rest random forest.'''
    _run_category_classifier("meat", 'Meat', n_estimators, max_depth)


def wrapper_function_cereals_2(n_estimators:int, max_depth:int):
    '''Widget callback: train/evaluate the "cereals" vs. rest random forest.'''
    _run_category_classifier("cereals", 'Cereals', n_estimators, max_depth)


def wrapper_function_vegetables_2(n_estimators:int, max_depth:int):
    '''Widget callback: train/evaluate the "vegetables" vs. rest random forest.'''
    _run_category_classifier("vegetables", 'Vegetables', n_estimators, max_depth)

def run_random_forest_classifier_2(n_estimators:int, max_depth:int, train_data_features,
                                   train_data_labels, test_data_features, test_data_labels, sup_title:str):
    '''
    Fit a random forest classifier on the training split and report its
    performance on both splits.

    Args:
        - n_estimators: number of trees in the forest
        - max_depth: maximum depth of each tree
        - train_data_features / train_data_labels: training split
        - test_data_features / test_data_labels: test split
        - sup_title: figure-level title forwarded to the plotting routine
    '''
    # Fixed random_state so the same slider values always give the same forest.
    forest = RandomForestClassifier(random_state=0,
                                    max_depth=max_depth,
                                    n_estimators=n_estimators)

    # The labels are flattened to a 1-D array before fitting.
    flat_train_labels = train_data_labels.values.ravel()
    forest.fit(train_data_features, flat_train_labels)

    run_auc_roc_prc_2(forest,
                      X_train=train_data_features, y_train=train_data_labels,
                      X_test=test_data_features, y_test=test_data_labels,
                      sup_title=sup_title)

def run_auc_roc_prc_2(clf, X_train, X_test, y_train, y_test, sup_title:str):
    '''
    Print accuracy / F1 for the train and test splits and draw the ROC and
    precision-recall curves of a fitted binary classifier.

    Args:
        - clf: fitted classifier (must support `predict` and be usable with
          `RocCurveDisplay.from_estimator` / `PrecisionRecallDisplay.from_estimator`)
        - X_train, y_train: training features and binary labels
        - X_test, y_test: test features and binary labels
        - sup_title: title placed above both panels
    '''
    # Predict once per split and reuse the result for every metric; the forest can
    # have up to 1000 trees, so repeating predict() per metric is wasteful.
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)

    print(f"Accuracy (Test): {accuracy_score(y_true=y_test, y_pred=y_pred_test):.2f}")
    print(f"F1-score (Test): {f1_score(y_true=y_test, y_pred=y_pred_test):.2f}")
    print(f"Accuracy (Train): {accuracy_score(y_true=y_train, y_pred=y_pred_train):.2f}")
    print(f"F1-score (Train): {f1_score(y_true=y_train, y_pred=y_pred_train):.2f}")

    fig, (roc_ax, prc_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    roc_ax.set_title('ROC - curve')
    prc_ax.set_title('PRC - curve')

    # Test curves are drawn first and carry the chance-level reference line.
    RocCurveDisplay.from_estimator(clf, X_test, y_test, name="Test set", ax=roc_ax, alpha=0.8, plot_chance_level=True)
    RocCurveDisplay.from_estimator(clf, X_train, y_train, name="Train set", ax=roc_ax, alpha=0.8)
    PrecisionRecallDisplay.from_estimator(clf, X_test, y_test, name="Test set", ax=prc_ax, alpha=0.8, plot_chance_level=True)
    PrecisionRecallDisplay.from_estimator(clf, X_train, y_train, name="Train set", ax=prc_ax, alpha=0.8)

    fig.suptitle(sup_title)
    plt.show()

In [10]:
# Hyper-parameter sliders; the same two widget objects are passed to each of the
# interact_manual calls in the following cells.
n_estimators_slider = widgets.IntSlider(
    value=30, min=10, max=1000, step=5, description='n_estimators:'
)
max_depth_slider = widgets.IntSlider(
    value=10, min=1, max=20, step=1, description='max_depth:'
)


In [11]:
# "meat" vs. rest: interact_manual only runs the callback when its button is pressed.
widgets.interact_manual(
    wrapper_function_meat_2,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

In [12]:
# "cereals" vs. rest: interact_manual only runs the callback when its button is pressed.
widgets.interact_manual(
    wrapper_function_cereals_2,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

In [13]:
# "vegetables" vs. rest: interact_manual only runs the callback when its button is pressed.
widgets.interact_manual(
    wrapper_function_vegetables_2,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

## Regression

The source of the dataset can be found [here](https://www.kaggle.com/datasets/evangower/chocolate-bar-ratings).

In [14]:
# Load the pre-processed chocolate bar ratings; the raw "chocolate_bars.csv" is not read here.
filepath_2 = os.path.join("..", "data", "chocolate_bars_proc.csv")
chocolate_dataset_2 = pd.read_csv(filepath_2)
chocolate_dataset_2.head()

Unnamed: 0,id,cocoa_percent,num_ingredients,rating,split,year_binary,country_Peru,country_Venezuela
0,2454,0.771568,-0.04148,3.25,train,1,0,0
1,2458,0.771568,-0.04148,3.5,test,1,0,0
2,2454,0.771568,-0.04148,3.75,train,1,0,0
3,2542,-0.682486,-0.04148,3.0,train,1,0,0
4,2546,0.044541,-0.04148,3.0,train,1,0,1


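We will see if we can predict the rating based on the cocoa percentage and the number of ingredients, together with the binary year indicator (`year_binary`) and the two country indicator columns (`country_Peru`, `country_Venezuela`).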

In [15]:
#chocolate_dataset = chocolate_dataset[['cocoa_percent','num_ingredients', 'rating']]
#chocolate_dataset.head()

In [16]:
# Keep the model features, the train/test flag and the target; the remaining columns are dropped.
chocolate_dataset_2 = chocolate_dataset_2[[
    'cocoa_percent',
    'num_ingredients',
    'year_binary',
    'country_Venezuela',
    'country_Peru',
    'split',
    'rating',
]]
chocolate_dataset_2.head()

Unnamed: 0,cocoa_percent,num_ingredients,year_binary,country_Venezuela,country_Peru,split,rating
0,0.771568,-0.04148,1,0,0,train,3.25
1,0.771568,-0.04148,1,0,0,test,3.5
2,0.771568,-0.04148,1,0,0,train,3.75
3,-0.682486,-0.04148,1,0,0,train,3.0
4,0.044541,-0.04148,1,1,0,train,3.0


### Train-Test split

In [17]:
def get_train_test_sets(dataset:pd.DataFrame):
    '''
    Separate a frame into train/test features and targets using its `split` column.

    Every column other than `split` and `rating` is treated as a feature;
    `rating` is the regression target.

    Args:
        - dataset: frame containing a `split` column ('train' / 'test'), a
          `rating` column and the feature columns

    Returns:
        - train_features, train_ratings, test_features, test_ratings
    '''
    is_train = dataset['split'] == 'train'
    is_test = dataset['split'] == 'test'

    # Index.difference drops the bookkeeping/target columns from the feature list.
    feature_cols = dataset.columns.difference(['split', 'rating'])

    train_features = dataset.loc[is_train, feature_cols]
    train_ratings = dataset.loc[is_train, 'rating']
    test_features = dataset.loc[is_test, feature_cols]
    test_ratings = dataset.loc[is_test, 'rating']

    return train_features, train_ratings, test_features, test_ratings

### Model

In [18]:
def run_random_forest_regression(n_estimators:int, max_depth:int):
    '''
    Widget callback: fit a random forest regressor on the chocolate ratings and
    visualise its predictions on the train and test splits.

    Args:
        - n_estimators: number of trees in the forest
        - max_depth: maximum depth of each tree
    '''
    X_train, y_train, X_test, y_test = get_train_test_sets(chocolate_dataset_2)

    # Fixed random_state so the same slider values always give the same forest.
    forest = RandomForestRegressor(random_state=0,
                                   n_estimators=n_estimators,
                                   max_depth=max_depth)
    forest.fit(X_train, y_train)

    train_predictions = forest.predict(X_train)
    test_predictions = forest.predict(X_test)

    visualize_results(
        y_train, train_predictions,
        y_test, test_predictions,
        f"Random Forest Regression - n_estimators={n_estimators} and max_depth={max_depth}",
    )

def _plot_predicted_vs_actual(ax, y_true, y_pred, subset_name:str):
    '''Draw one predicted-vs-true scatter panel with the y = x reference line on `ax`.'''
    ax.scatter(x=y_true, y=y_pred, s=4)
    # Plotting the true values against themselves traces the diagonal on which a
    # perfect model's predictions would lie.
    ax.plot(y_true, y_true, color='red', label='Perfect Model')
    ax.set_xlabel("True Rating")
    ax.set_ylabel("Predicted Rating")
    ax.set_title(f"Predicted vs Actual - {subset_name} Set")
    ax.legend()


def visualize_results(y_train_true:np.ndarray, y_train_pred:np.ndarray,
                      y_test_true:np.ndarray, y_test_pred:np.ndarray, title:str):
    '''
    Print MSE / R2 for the train and test splits and plot predicted against
    true ratings for both, side by side.

    Args:
        - y_train_true, y_train_pred: true and predicted targets of the train split
        - y_test_true, y_test_pred: true and predicted targets of the test split
          (any 1-D array-like accepted by scikit-learn metrics and matplotlib works)
        - title: title placed above both panels
    '''
    print(f"MSE (Train) = {mean_squared_error(y_train_true, y_train_pred):.3f}")
    print(f"MSE (Test) = {mean_squared_error(y_test_true, y_test_pred):.3f}")
    print(f"R2 (Train) = {r2_score(y_train_true, y_train_pred):.3f}")
    print(f"R2 (Test) = {r2_score(y_test_true, y_test_pred):.3f}")

    fig, (train_ax, test_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12,4))

    _plot_predicted_vs_actual(train_ax, y_train_true, y_train_pred, "Train")
    _plot_predicted_vs_actual(test_ax, y_test_true, y_test_pred, "Test")

    fig.suptitle(title)
    # Explicit show, matching the classification plotting routine, so the figure is
    # rendered at this point regardless of how the function is invoked.
    plt.show()

In [19]:
# New slider objects for the regression demo; this rebinds the slider names that the
# classification cells defined earlier.
n_estimators_slider = widgets.IntSlider(
    value=30, min=10, max=1000, step=5, description='n_estimators:'
)
max_depth_slider = widgets.IntSlider(
    value=10, min=1, max=20, step=1, description='max_depth:'
)
widgets.interact_manual(
    run_random_forest_regression,
    n_estimators=n_estimators_slider,
    max_depth=max_depth_slider,
);

interactive(children=(IntSlider(value=30, description='n_estimators:', max=1000, min=10, step=5), IntSlider(va…

Further reading on evaluating multiclass classifiers with ROC curves using the [one vs all](https://towardsdatascience.com/multiclass-classification-evaluation-with-roc-curves-and-roc-auc-294fd4617e3a) approach.