In [None]:
# Clear the interactive namespace so the notebook starts from a clean state
# (-f forces the reset without asking for confirmation)
%reset -f

# Question 1 - Nutrition of Dairy Products

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Miscellaneous Dependencies
from typing import Union, List

## Import Data

In [None]:
# Read the raw data and discard the 'Description' column, which is not used in this analysis
df = pd.read_csv('./data/dairy_nutrition.csv').drop(columns='Description')
df

In [None]:
# Keep only the quantitative variables; 'Type' is set aside and used later to colour the plots
df_numeric = df.drop(columns=['Type'])
df_numeric

## Part (a)

Should PCA be carried out on covariance or correlation matrix?<br />
Explain.

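PCA should be carried out on the correlation matrix. All the variables (nutrients) measure mass per 100 g, but they are recorded in different units (g/mg/mcg) and therefore have very different scales and variances. If the covariance matrix were used, the variables with the largest numerical variances would dominate the principal components. Using the correlation matrix is equivalent to standardizing every variable (converting it to z-scores), so the difference in scales no longer matters.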

## Part (b)

Extract the principal components.<br />
Justify your decision and interpret the principal components.<br />
You should include the necessary tables, outputs and graphs.

In [None]:
# Standardize the data (convert every variable to z-scores)
scaler = StandardScaler()

# The scaled values keep the original row index; the unit suffixes are dropped
# from the column names because the variables are unitless after standardization
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    index=df_numeric.index,
    columns=df_numeric.columns,
).rename(columns={
    'Protein_g': 'Protein',
    'Fat_g': 'Fat',
    'Carb_g': 'Carbohydrate',
    'Sugar_g': 'Sugar',
    'VitA_mcg': 'Vitamin A',
    'Calcium_mg': 'Calcium',
})

df_scaled

In [None]:
# Fit PCA on the standardized data, keeping all 6 components
# NOTE(review): StandardScaler presumably divides by the population standard
# deviation (ddof=0) while PCA reports variances with n - 1 degrees of freedom,
# so the eigenvalues would be those of the correlation matrix scaled by
# n / (n - 1) -- confirm against the scikit-learn docs if exact values matter.
pca = PCA(n_components=6)
pca.fit(df_scaled)

# View the eigenvectors (one per row), rounded for display
np.round(pca.components_, 5)

In [None]:
# Summarize eigenvalues, eigenvectors and the amount of variance they explain respectively
def get_pca_results(pca, col_names, precision: int = 5):
    """Tabulate eigenvalues, explained variance and eigenvectors of a fitted PCA.

    Parameters
    ----------
    pca
        Object exposing `n_components_`, `explained_variance_`,
        `explained_variance_ratio_` and `components_`.
    col_names
        Names of the original variables, used to label the eigenvector columns.
    precision : int
        Number of decimal places the table is rounded to.

    Returns
    -------
    pd.DataFrame
        One row per principal component ('PC 1', 'PC 2', ...). The first three
        columns are the summary statistics; the remaining columns hold the
        eigenvector entries for each variable.
    """
    pc_labels = [f'PC {k + 1}' for k in range(pca.n_components_)]

    summary_labels = np.array(['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance'])
    column_labels = np.concatenate((summary_labels, col_names))

    variance_ratio = pca.explained_variance_ratio_

    # 1-D inputs become single columns; the 2-D eigenvector matrix is appended as-is
    table = np.column_stack((
        pca.explained_variance_,   # Eigenvalues
        variance_ratio,            # Explained Variance
        variance_ratio.cumsum(),   # Cumulative Explained Variance
        pca.components_,           # Eigenvectors
    ))

    results = pd.DataFrame(data=table, index=pc_labels, columns=column_labels)
    return results.round(precision)

# Build the summary table for the fitted PCA, labelling the eigenvector columns with the variable names
pca_results = get_pca_results(pca, df_scaled.columns.values)
pca_results

### Select Principal Components

#### Method 1 - Kaiser's Rule

PCA was carried out on the correlation matrix, so Kaiser's Rule can be used in this case. Kaiser’s Rule states that only PCs that have eigenvalues greater than 1 should be retained. By Kaiser's Rule, only the top 2 PCs should be kept.

In [None]:
def kaisers_rule(pca_results):
    """Return the principal components retained by Kaiser's Rule.

    Parameters
    ----------
    pca_results : pd.DataFrame
        Table with one row per principal component and an 'Eigenvalue' column.

    Returns
    -------
    pd.DataFrame
        The rows of `pca_results` whose eigenvalue is strictly greater than 1.
    """
    # Strict inequality: a PC with an eigenvalue of exactly 1 explains no more
    # variance than a single standardized variable, so it is not retained
    return pca_results[pca_results['Eigenvalue'] > 1]

# Show the PCs retained by Kaiser's Rule
kaisers_rule(pca_results)

#### Method 2 - Cumulative Explained Variance

For this analysis, 80% is the benchmark for sufficient explained variance. The top 2 PCs already explain more than 80% (82%) of the total variance. Hence, only the top 2 PCs should be retained.

In [None]:
def cumulative_explained_variance(pca_results, proportion=0.8):
    """Return the leading PCs needed to reach a target share of explained variance.

    Parameters
    ----------
    pca_results : pd.DataFrame
        Table with one row per principal component and a
        'Cumulative Explained Variance' column.
    proportion : float
        Target cumulative share of variance (0.8 by default).

    Returns
    -------
    pd.DataFrame
        Every row still below the target plus the next one, i.e. up to and
        including the first PC that reaches `proportion`. All rows are returned
        if the target is never reached.
    """
    still_below = pca_results['Cumulative Explained Variance'] < proportion
    # One extra row: the PC that first reaches the target must be kept as well
    rows_to_keep = int(still_below.sum()) + 1
    return pca_results.iloc[:rows_to_keep]

# Show the PCs needed to reach the default 80% cumulative explained variance
cumulative_explained_variance(pca_results)

#### Method 3 - Scree Plot

By the scree plot, there is an elbow at the 3rd PC. Therefore, only the top 2 PCs should be kept.

In [None]:
def circular_mark(ax, x, y, **kwargs):
    """Highlight the point (x, y) on `ax` with a hollow circular marker.

    Any keyword argument is forwarded to `ax.plot`. Options the caller does not
    supply fall back to a large red, unfilled circle.

    Returns
    -------
    The same `ax`, so that further calls can be chained.
    """
    fallback_style = {'marker': 'o', 'mec': 'r', 'mfc': 'none', 'markersize': 28}
    for option, value in fallback_style.items():
        # Caller-supplied options always win over the fallbacks
        kwargs.setdefault(option, value)

    ax.plot(x, y, **kwargs)
    return ax

In [None]:
def scree_plot(pca, mark: Union[int, None] = None):
    """Draw a scree plot and return the PCs that come before the marked one.

    Parameters
    ----------
    pca : pd.DataFrame
        Summary table with one row per principal component and an 'Eigenvalue'
        column (despite its name, this is the results table, not a fitted PCA
        object).
    mark : int or None
        1-based position of the PC to circle on the plot (e.g. the elbow).
        When None, nothing is circled.

    Returns
    -------
    pd.DataFrame
        The rows before the marked PC, or the whole table when `mark` is None.
    """
    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(x=pca.index, y=pca['Eigenvalue'])
        ax.set(
            title='Scree Plot'
        )
        if mark is not None:
            # .iloc makes the positional lookup explicit: the table is indexed by
            # labels ('PC 1', ...), and integer keys on a label-indexed Series
            # are deprecated in recent pandas
            circular_mark(ax, mark - 1, pca['Eigenvalue'].iloc[mark - 1])

    # Without a mark there is no cut-off point, so nothing is excluded
    if mark is None:
        return pca

    return pca.iloc[:mark - 1]

scree_plot(pca_results, mark=3)

#### Summary of Principal Component Selection

All 3 methods agree that the top 2 Principal Components should be kept.

In [None]:
def get_focused_pca_results(pca_results, num_of_pcs):
    """Return only the eigenvector columns of the first `num_of_pcs` PCs.

    Parameters
    ----------
    pca_results : pd.DataFrame
        Summary table whose first three columns are summary statistics and whose
        remaining columns hold the eigenvector entries.
    num_of_pcs : int
        Number of leading principal components (rows) to keep.
    """
    # The first three columns are the summary statistics placed ahead of the
    # eigenvector columns by get_pca_results
    n_summary_columns = 3
    leading_rows = slice(None, num_of_pcs)
    eigenvector_columns = slice(n_summary_columns, None)
    return pca_results.iloc[leading_rows, eigenvector_columns]

# Keep only the eigenvector entries (loadings) of the top 2 PCs for the plots and scores below
pca_results_focused = get_focused_pca_results(pca_results, 2)
pca_results_focused

### Score Plot

Visualize the transformed points using a 2D scatter plot.

In [None]:
def score_plot(df_scaled, pca, labels: Union[pd.Series, None] = None, **kwargs):
    """Scatter the observations on the plane spanned by the first two PCs.

    Parameters
    ----------
    df_scaled
        Standardized data, passed to `pca.transform`.
    pca
        Fitted object exposing `transform`.
    labels : pd.Series or None
        Group of each observation; when given, the points are coloured by it.
    **kwargs
        Forwarded to `sns.scatterplot`.

    Returns
    -------
    The matplotlib Axes holding the plot.
    """
    scores = pca.transform(df_scaled)

    _, ax = plt.subplots()

    # Only pass `hue` when labels are supplied
    hue_option = {} if labels is None else {'hue': labels}
    sns.scatterplot(x=scores[:, 0], y=scores[:, 1], ax=ax, **hue_option, **kwargs)

    ax.set_title('Score Plot')
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')

    # Called without arguments, set_ylim()/set_xlim() return the current limits.
    # NOTE(review): they presumably also switch autoscaling off, so that the
    # guide lines drawn below do not stretch the axes -- confirm before
    # replacing them with get_ylim()/get_xlim().
    y_low, y_high = ax.set_ylim()
    x_low, x_high = ax.set_xlim()

    # Dashed guide lines through the origin
    guide_style = dict(color='grey', linestyle='--', linewidth=1)
    ax.plot([0, 0], [y_low, y_high], **guide_style)
    ax.plot([x_low, x_high], [0, 0], **guide_style)

    return ax

score_plot(df_scaled, pca, labels=df['Type'])

From the score plot, it can be observed that the cheese, yogurt and ice cream clusters are quite distinct in terms of PC1 and PC2. The cream data points have more variability across the PC1 axis, and the milk data points are scattered across both axes (perhaps due to outliers).

### Loading Plots

Visualize the loading values using a 2D scatter plot (This can be done as there are only 2 PCs).

In [None]:
def loading_plot(pca_results, width=7, height=5, margin=0.3):
    """Draw the loadings of every variable on PC 1 and PC 2 as arrows from the origin.

    Parameters
    ----------
    pca_results : pd.DataFrame
        Table with rows labelled 'PC 1' and 'PC 2' and one column per variable,
        holding the loading values.
    width, height : float
        Figure size in inches.
    margin : float
        Padding added around the extreme loading values on both axes.
    """
    fig, ax = plt.subplots(figsize=(width, height))

    pc1_loadings = pca_results.loc['PC 1']
    pc2_loadings = pca_results.loc['PC 2']

    # The origin is always kept in view, whatever the signs of the loadings
    x_min, x_max = ax.set_xlim(
        min(pc1_loadings.min(), 0) - margin, max(pc1_loadings.max(), 0) + margin)
    y_min, y_max = ax.set_ylim(
        min(pc2_loadings.min(), 0) - margin, max(pc2_loadings.max(), 0) + margin)

    for col in pca_results.columns:
        pc1 = pc1_loadings[col]
        pc2 = pc2_loadings[col]

        # Place each label on the far side of its arrow head
        alignment = 'left' if pc1 > 0 else 'right'
        text_x_adj = 0.02 if pc1 > 0 else -0.02
        text_y_adj = 0.02 if pc2 > 0 else -0.01

        # Hand-tuned label offsets for specific variable names
        if col == 'Sugar':
            text_x_adj = 0.01
            text_y_adj = 0.06
        elif col in ('Protein', 'Vitamin A'):
            text_y_adj = 0

        ax.arrow(0, 0, pc1, pc2,
            head_width=0.05,
            head_length=0.05,
            length_includes_head=True,
            linewidth=1,
            color='red'
        )

        ax.text(pc1 + text_x_adj, pc2 + text_y_adj, s=col,
            color='black',
            ha=alignment,
            va='center',
            fontsize=10
        )

    # Dashed guide lines through the origin, drawn on this function's own axes
    # rather than through pyplot's global "current axes" state
    ax.plot([x_min, x_max], [0, 0], color='grey', linestyle='--', linewidth=1)
    ax.plot([0, 0], [y_min, y_max], color='grey', linestyle='--', linewidth=1)
    ax.set_xlabel("PC1", fontsize=14)
    ax.set_ylabel("PC2", fontsize=14)
    ax.set_title("Loading plot", fontsize=14)

loading_plot(pca_results_focused)

### Interpretation of the Principal Components

1.  PC 1:
    *   PC 1 seems to measure the contrast between the concentrations of sugar and non-sugar nutrients
    *   `Carbohydrate` and `Sugar` are sugar nutrients
    *   `Carbohydrate` and `Sugar` have negative loading values for PC 1, while
    *   `Calcium`, `Protein`, `Vitamin A` and `Fat` have positive loading values for PC 1
    *   A higher score for PC 1 means that the dairy product has a greater concentration of non-sugar nutrients as compared to sugar nutrients
    *   A lower score for PC 1 means that the dairy product has a greater concentration of sugar nutrients as compared to non-sugar nutrients
2.  PC 2:
    *   PC 2 seems to measure the contrast between the concentrations of fats and the other nutrients
    *   `Fat` is the only variable with a negative loading value for PC 2
    *   A higher score for PC 2 means that the dairy product has a lower concentration of fats as compared to other nutrients
    *   A lower score for PC 2 means that the dairy product has a greater concentration of fats as compared to other nutrients

## Part (c)

Which type(s) of dairy product has/have the following attributes?<br />
Explain your answer with the aid of a suitable graph with colour or marker to display "Type" information.

In [None]:
# Redraw the score plot, coloured by dairy type, for reference in the answers below
score_plot(df_scaled, pca, labels=df['Type'])

### Sub-Part (i)

Low carbohydrates and sugar but high in other nutrients.

Cheese. The cheese (blue) products generally have high PC 1 scores. Recalling the interpretations of the PCs, "a higher score for PC 1 means that the dairy product has a greater concentration of non-sugar nutrients as compared to sugar nutrients". Therefore, the dairy product(s) with low carbohydrates and sugar but high in other nutrients should have a high PC 1 score.

### Sub-Part (ii)

High carbohydrates and sugar but low in other nutrients.

Yogurt and ice cream. The yogurt (orange) and ice cream (purple) products generally have low PC 1 scores. Recalling the interpretations of the PCs, "a lower score for PC 1 means that the dairy product has a greater concentration of sugar nutrients as compared to non-sugar nutrients". Therefore, the dairy product(s) with high carbohydrates and sugar but low in other nutrients should have low PC 1 scores.

For milk (green) and cream (red) products, some data points have positive PC 1 scores while others have negative PC 1 scores. Due to this inconsistency, they cannot be definitively classified to have high carbohydrates and sugar but low in other nutrients.

## Part (d)

A dairy product has its nutritional value listed below. <br />
Which type of dairy product is it likely to be? <br />
Show your working and explain.

<br />
<table style="text-align:left;">
    <tr>
        <th>Protein</th>
        <td style="padding:0 2em;">4.8 g</td>
        <th>Sugar</th>
        <td style="padding:0 2em;">19.1 g</td>
    </tr>
    <tr>
        <th>Fat</th>
        <td style="padding:0 2em;">23 g</td>
        <th>Vitamin A</th>
        <td style="padding:0 2em;">17 mcg</td>
    </tr>
    <tr>
        <th>Carbohydrate</th>
        <td style="padding:0 2em;">26.2 g</td>
        <th>Calcium</th>
        <td style="padding:0 2em;">121 mg</td>
    </tr>
</table>

In [None]:
def score_plot_with_target(pca, pca_results, df_scaled, scaler, labels, target_vars: List[float]):
    """Draw the score plot with one extra, highlighted 'Target' observation.

    Parameters
    ----------
    pca
        Fitted object exposing `transform`, forwarded to `score_plot`.
    pca_results : pd.DataFrame
        Loadings table with rows 'PC 1' and 'PC 2' and one column per variable.
    df_scaled : pd.DataFrame
        Standardized data the target is appended to.
    scaler
        Fitted scaler used to standardize the target.
    labels : pd.Series
        Group of each existing observation.
    target_vars : list of float
        Raw values of the target, in the same variable order the scaler was
        fitted on.

    Returns
    -------
    pd.Series
        The target's scores on 'PC 1' and 'PC 2', computed from `pca_results`.
    """
    def append_to_series(s: pd.Series, value):
        # pd.concat replaces Series.append, which was removed in pandas 2.0
        return pd.concat([s, pd.Series([value])], ignore_index=True)

    def append_to_dataframe(df: pd.DataFrame, values, index):
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        return pd.concat([df, pd.DataFrame([values], columns=index)], ignore_index=True)

    # Standardize the target once and reuse the result below
    target = np.array(target_vars).reshape(1, -1)
    target_scaled = scaler.transform(target)

    # Project the standardized target onto the retained PCs
    pc_scores = (target_scaled @ pca_results.T).iloc[0]

    ax = score_plot(
        append_to_dataframe(df_scaled, target_scaled.flatten(), df_scaled.columns.values),
        pca,
        labels=append_to_series(labels, 'Target'),
        hue_order=['Target', 'Ice cream', 'Cheese', 'Yogurt', 'Cream', 'Milk']
    )

    # Circle the target and point an arrow at it from above
    circular_mark(
        ax=ax,
        x=pc_scores['PC 1'],
        y=pc_scores['PC 2'],
        markersize=15,
        mec='k'
    ).arrow(pc_scores['PC 1'], pc_scores['PC 2'] + 1.5, 0, -1, width=0.1, length_includes_head=True, ec='k', fc='none')

    return pc_scores

score_plot_with_target(pca=pca, pca_results=pca_results_focused, df_scaled=df_scaled, scaler=scaler, labels=df['Type'], target_vars=[4.8, 23, 26.2, 19.1, 17, 121])

<table style="text-align:left;">
    <tr>
        <th>Protein</th>
        <td style="padding:0 2em;">4.8 g</td>
        <th>Sugar</th>
        <td style="padding:0 2em;">19.1 g</td>
    </tr>
    <tr>
        <th>Fat</th>
        <td style="padding:0 2em;">23 g</td>
        <th>Vitamin A</th>
        <td style="padding:0 2em;">17 mcg</td>
    </tr>
    <tr>
        <th>Carbohydrate</th>
        <td style="padding:0 2em;">26.2 g</td>
        <th>Calcium</th>
        <td style="padding:0 2em;">121 mg</td>
    </tr>
</table>
<br />
<br />
The dairy product is likely to be ice cream. The data given above was standardized, projected onto PCs 1 and 2, and plotted on a labelled score plot together with the rest of the data points. As the target data point (blue) is situated in the `Ice cream` (orange) cluster, it is likely that the dairy product is an ice cream.