In [None]:
# Clear all user-defined names from the interactive namespace (-f: do not ask
# for confirmation) so the notebook always starts from a clean state.
%reset -f

In [None]:
import numpy as np
import sympy as sp
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from typing import Union

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dairy nutrition dataset (path is relative to the notebook's working
# directory) and display it.
df = pd.read_csv('./data/dairy_nutrition.csv')
df

In [None]:
# Remove the 'Description' column, which is not used in the analysis below.
# Re-binding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it no longer raises a KeyError once
# the column has already been removed.
df = df.drop(columns='Description', errors='ignore')
df

## Part (a)

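PCA should be carried out on the correlation matrix. All the variables (nutrients) are measured in terms of mass, but in different units (g, mg and mcg). Although they could be rescaled so that they are all in grams, their magnitudes and variances would then differ greatly, and the variables with the largest variances would dominate a covariance-based PCA. Standardising the variables (i.e. using the correlation matrix) gives every nutrient equal weight.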

## Part (b)

In [None]:
# Set the 'Type' column aside: the remaining columns are the inputs to PCA,
# while df['Type'] is used later as the label (hue) of the score plots.
df_numeric = df.drop(columns='Type')
df_numeric

In [None]:
# The fitted scaler is kept in its own variable because it is reused later to
# standardise new observations with the same fitted parameters.
scaler = StandardScaler()

# Standardise every nutrient column, keep the original row index, and give the
# columns human-readable names (these names label the loading/score plots).
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    index=df_numeric.index,
    columns=df_numeric.columns,
).rename(columns={
    'Protein_g': 'Protein',
    'Fat_g': 'Fat',
    'Carb_g': 'Carbohydrates',
    'Sugar_g': 'Sugar',
    'VitA_mcg': 'Vitamin A',
    'Calcium_mg': 'Calcium',
})

df_scaled

In [None]:
# Fit PCA on the standardised data, keeping six components (presumably one per
# nutrient column, so no variance is discarded at this stage).
pca = PCA(n_components=6).fit(df_scaled)

# Component vectors, one row per principal component, rounded for display.
pca.components_.round(5)

In [None]:
def get_pca_results(pca, col_names, precision: int = 5):
    """Summarise a fitted PCA as a table with one row per principal component.

    Parameters
    ----------
    pca
        Fitted PCA-like object exposing ``n_components_``,
        ``explained_variance_``, ``explained_variance_ratio_`` and
        ``components_``.
    col_names
        Names of the original variables, used as headers for the loading
        (eigenvector) columns.
    precision
        Number of decimals every value is rounded to.

    Returns
    -------
    pd.DataFrame
        Rows are labelled 'PC 1', 'PC 2', ...; the first three columns hold
        the eigenvalue, the explained-variance ratio and its running total,
        followed by one loading column per entry of ``col_names``.
    """
    summary_headers = ['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance']
    variance_ratio = pca.explained_variance_ratio_

    table = pd.DataFrame(
        # column_stack turns each 1-D vector into a column and appends the
        # 2-D component matrix (one row per PC) as-is.
        data=np.column_stack((
            pca.explained_variance_,    # Eigenvalues
            variance_ratio,             # Explained Variance
            variance_ratio.cumsum(),    # Cumulative Explained Variance
            pca.components_,            # Eigenvectors
        )),
        index=[f'PC {k}' for k in range(1, pca.n_components_ + 1)],
        columns=[*summary_headers, *col_names],
    )
    return table.round(precision)

# Build the summary table; the loading columns are labelled with the renamed
# nutrient names taken from df_scaled.
pca_results = get_pca_results(pca, df_scaled.columns.values)
pca_results

In [None]:
# def corr_pca(df_corr):
#     eig_vals, eig_vecs = np.linalg.eig(df_corr)

#     pca_results_ = pd.DataFrame(
#         data=np.hstack((eig_vals.reshape(-1, 1), eig_vecs.T)),
#         columns=[
#             'Eigenvalue',
#             'Protein_g', 'Fat_g', 'Carb_g', 'Sugar_g', 'VitA_g', 'Calcium_g']
#     ).sort_values(
#         by='Eigenvalue',
#         ascending=False
#     )

#     pca_results_['Explained Variance'] = pca_results_['Eigenvalue'] / df_corr.shape[0]

#     pca_results_['Cumulative Explained Variance'] = np.cumsum(pca_results_['Explained Variance'])

#     pca_results_.index = [f'PC {i + 1}' for i in range(pca_results_.shape[0])]

#     return pca_results_[['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance', 'Protein_g', 'Fat_g', 'Carb_g', 'Sugar_g', 'VitA_g', 'Calcium_g']].round(4)

# pca_res = corr_pca(df_corr)

In [None]:
def eig_expl(pca, proportion=0.8):
    """Keep the leading PCs needed to reach a cumulative-variance threshold.

    Parameters
    ----------
    pca
        DataFrame with a 'Cumulative Explained Variance' column, one row per
        principal component in decreasing order of importance.
    proportion
        Target share of total variance.

    Returns
    -------
    pd.DataFrame
        Every row whose cumulative explained variance is still below
        ``proportion``, plus the next row (the first one to reach it).
    """
    below_threshold = pca['Cumulative Explained Variance'] < proportion
    # PCs still short of the target, plus the one that first reaches it.
    n_keep = int(below_threshold.sum()) + 1
    return pca.iloc[:n_keep]

# Criterion 1: PCs needed to reach the default 80% of cumulative explained variance.
eig_expl(pca_results)

In [None]:
def eig_more_than_1(pca):
    """Select the principal components whose eigenvalue is at least 1.

    ``pca`` is a DataFrame with an 'Eigenvalue' column; the rows satisfying
    the condition are returned unchanged. Note that the comparison is
    inclusive (an eigenvalue of exactly 1 is kept).
    """
    keep = pca['Eigenvalue'].ge(1)
    return pca.loc[keep]

# Criterion 2: PCs whose eigenvalue is at least 1.
eig_more_than_1(pca_results)

In [None]:
def circular_mark(ax, x, y, **kwargs):
    """Highlight the point (x, y) on ``ax`` with a hollow circle.

    Any keyword accepted by ``ax.plot`` may be passed; 'marker', 'mec', 'mfc'
    and 'markersize' fall back to a large, unfilled red circle when the caller
    does not provide them. Returns ``ax`` so calls can be chained.
    """
    fallbacks = (('marker', 'o'), ('mec', 'r'), ('mfc', 'none'), ('markersize', 28))
    for option, value in fallbacks:
        # Caller-supplied options always win over the fallbacks.
        kwargs.setdefault(option, value)
    ax.plot(x, y, **kwargs)
    return ax

In [None]:
def scree_plot(pca, mark: Union[int, None] = None):
    """Draw a scree plot of the eigenvalues and optionally circle one PC.

    Parameters
    ----------
    pca
        DataFrame with an 'Eigenvalue' column, one row per principal
        component (the row index supplies the x-axis labels).
    mark
        1-based position of the PC to circle (e.g. the elbow). ``None``
        draws the plot without a marker.

    Returns
    -------
    pd.DataFrame
        The rows that come *before* the marked PC, or all rows when ``mark``
        is ``None``.

    Raises
    ------
    ValueError
        If ``mark`` is given but is not between 1 and the number of rows.
    """
    n_components = pca.shape[0]
    if mark is not None and not 1 <= mark <= n_components:
        raise ValueError(
            f'mark must be between 1 and {n_components}, got {mark!r}'
        )

    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(x=pca.index, y=pca['Eigenvalue'])
        ax.set(
            title='Scree Plot'
        )
        if mark is not None:
            # The index holds 'PC n' labels, so the eigenvalue must be looked
            # up by position; plain [] with an integer is deprecated there.
            circular_mark(ax, mark - 1, pca['Eigenvalue'].iloc[mark - 1])

    # Without a mark there is nothing to cut at (previously `None - 1` raised
    # a TypeError here), so the full table is returned.
    if mark is None:
        return pca
    return pca.iloc[:mark - 1]

# Criterion 3: circle PC 3 on the scree plot (presumably the elbow chosen by
# eye); the displayed table holds the PCs before the marked one.
scree_plot(pca_results, mark=3)

In [None]:
def get_focused_pca(pca, num_of_pcs):
    """Return only the loadings of the first ``num_of_pcs`` principal components.

    Parameters
    ----------
    pca
        PCA summary table laid out as three summary columns followed by the
        loading columns, one row per principal component.
    num_of_pcs
        Number of leading principal components (rows) to keep.

    Returns
    -------
    pd.DataFrame
        The first ``num_of_pcs`` rows, without the three leading summary
        columns.
    """
    # Slice the table that was passed in; the previous version read the global
    # `pca_results` and silently ignored the `pca` argument.
    return pca.iloc[:num_of_pcs, 3:]

# Keep the loadings of the first two PCs only; these feed the score and
# loading plots below.
pca_results_focused = get_focused_pca(pca_results, 2)
pca_results_focused

In [None]:
def score_plot(df_scaled, pca, labels: Union[pd.Series, None] = None, **kwargs):
    """Scatter the observations on the first two principal components.

    Parameters
    ----------
    df_scaled
        Standardised observations; its columns must match the loading
        columns of ``pca``.
    pca
        Loadings table with rows labelled 'PC 1' and 'PC 2'.
    labels
        Optional per-observation labels used to colour the points.
    **kwargs
        Extra keyword arguments forwarded to ``sns.scatterplot``.

    Returns
    -------
    The matplotlib Axes holding the plot.
    """
    # Project every observation onto the PC 1 and PC 2 loading vectors.
    scores_pc1, scores_pc2 = (df_scaled @ pca.loc[pc] for pc in ('PC 1', 'PC 2'))

    fig, ax = plt.subplots()

    # Only hand `hue` to seaborn when labels were actually supplied.
    hue_kwargs = {} if labels is None else {'hue': labels}
    sns.scatterplot(x=scores_pc1, y=scores_pc2, ax=ax, **hue_kwargs, **kwargs)

    ax.set_title('Score Plot')
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')

    # The argument-less set_ylim()/set_xlim() calls are used to read the
    # current limits. NOTE(review): they are kept instead of get_*lim() on
    # purpose, since the setters may also affect autoscaling -- confirm before
    # swapping them.
    ylim = ax.set_ylim()
    xlim = ax.set_xlim()

    # Dashed reference lines through the origin, spanning the current view.
    reference_style = dict(color='grey', linestyle='--', linewidth=1)
    ax.plot([0, 0], list(ylim), **reference_style)
    ax.plot(list(xlim), [0, 0], **reference_style)
    return ax

# Score plot of all products on PC 1 / PC 2, coloured by dairy type.
score_plot(df_scaled, pca_results_focused, df['Type'])

In [None]:
def loading_plot(pca, width=7, height=7, margin=0.5):
    """Draw the PC 1 / PC 2 loadings of every variable as labelled arrows.

    Parameters
    ----------
    pca
        Loadings table with rows labelled 'PC 1' and 'PC 2' and one column
        per original variable.
    width, height
        Figure size passed to ``plt.subplots``.
    margin
        Padding added around the extreme loadings on both axes.
    """
    fig, ax = plt.subplots(figsize=(width, height))

    pc1_loadings = pca.loc['PC 1']
    pc2_loadings = pca.loc['PC 2']

    # The limits always contain the origin and leave `margin` of padding.
    x_min, x_max = ax.set_xlim(
        min(pc1_loadings.min(), 0) - margin, max(pc1_loadings.max(), 0) + margin)
    y_min, y_max = ax.set_ylim(
        min(pc2_loadings.min(), 0) - margin, max(pc2_loadings.max(), 0) + margin)

    # Labels sit slightly beyond the arrow tips so they do not overlap them.
    label_y_scale = 1.02 ** 3

    for name, loadings in pca.items():
        pc1 = loadings['PC 1']
        pc2 = loadings['PC 2']

        # Arrows pointing right get a left-aligned label and vice versa, so
        # the text always extends away from the arrow.
        if pc1 > 0:
            alignment, label_x_scale = 'left', 1.01 ** 3
        else:
            alignment, label_x_scale = 'right', 1.03 ** 3

        ax.arrow(
            0, 0, pc1, pc2,
            head_width=0.05,
            head_length=0.05,
            length_includes_head=True,
            linewidth=1,
            color='red',
        )

        ax.text(
            pc1 * label_x_scale, pc2 * label_y_scale, s=name,
            color='black',
            ha=alignment,
            va='center',
            fontsize=10,
        )

    # Dashed axes through the origin (ax is the current axes of the new figure).
    ax.plot([x_min, x_max], [0, 0], color='grey', linestyle='--', linewidth=1)
    ax.plot([0, 0], [y_min, y_max], color='grey', linestyle='--', linewidth=1)
    ax.set_xlabel("PC1", fontsize=14)
    ax.set_ylabel("PC2", fontsize=14)
    ax.set_title("Loading plot", fontsize=14)

# Loading plot of the two retained PCs, used for the interpretation below.
loading_plot(pca_results_focused)

### Interpretation of the Principal Components

1.  PC 1:
    *   PC 1 seems to represent the contrast between sugar and non-sugar nutrients
    *   `Carbohydrates` and `Sugar` have negative loading values for PC 1, while
    *   `Calcium`, `Protein`, `Vitamin A` and `Fat` have positive loading values for PC 1
    *   Therefore, a higher score for PC 1 means that the food is high in non-sugar nutrients relative to its carbohydrate and sugar content
    *   A lower score for PC 1 means that the food is very sugary and, possibly, very sweet
2.  PC 2:
    *   PC 2 seems to represent the contrast between fat and the rest of the nutrients
    *   `Fat` is the only variable with a negative loading value
    *   Therefore, a higher score for PC 2 means that the food has little fat
    *   On the other hand, a lower score for PC 2 indicates that the food is quite fatty

In [None]:
# # This function plots the loading plot.
# # Pass original data dataframe and returns of PCA to this function. Optional width, height and margin
# # This function returns the axes of the loading plot

# def loading_plot_C(data, pca, width=5, height=5, margin=0.5):

#     fig, ax = plt.subplots(figsize = (width,height))

#     #Set limits for figure
#     x_min = min(pca.components_[0,:].min(),0)-margin
#     x_max = max(pca.components_[0,:].max(),0)+margin
#     y_min = min(pca.components_[1,:].min(),0)-margin
#     y_max = max(pca.components_[1,:].max(),0)+margin

#     ax.set_xlim(x_min, x_max)
#     ax.set_ylim(y_min, y_max)

#     #Scaling factor for text position
#     text_pos = 0.2

#     for i, v in enumerate(pca.components_.T):
#         ax.arrow(0, 0, v[0], v[1], head_width=0.1, head_length=0.1, linewidth=2, color='red')
#         ax.text(v[0], v[1]+text_pos, data.columns[i], color='black', ha='center', va='center', fontsize=12)

#     plt.plot([x_min, x_max], [0, 0], color='k', linestyle='--', linewidth=1)
#     plt.plot([0, 0], [y_min, y_max], color='k', linestyle='--', linewidth=1)
#     ax.set_xlabel("PC1", fontsize=14)
#     ax.set_ylabel("PC2", fontsize=14)
#     ax.set_title("Loading plot", fontsize = 14)

# loading_plot_C(df_numeric, pca)

## Part (c)

Which type(s) of dairy product has/have the following attributes? Explain your answer with the aid of a suitable graph with colour or marker to display ‘Type’ information.

In [None]:
# Score plot coloured by 'Type', repeated here to support the answers below.
score_plot(df_scaled, pca_results_focused, labels=df['Type'])

### Sub-Part (i)

Low carbohydrates and sugar but high in other nutrients.

<br />
Answer: Cheese
<br />
Explanation: Cheese products generally have high PC 1 scores, and a high PC 1 score corresponds to low carbohydrates and sugar relative to the other nutrients.

### Sub-Part (ii)

High carbohydrates and sugar but low in other nutrients.

<br />
Answer: Yogurt and Ice Cream
<br />
Explanation: Yogurt and Ice Cream products generally have low PC 1 scores, and a low PC 1 score corresponds to high carbohydrates and sugar relative to the other nutrients.

## Part (d)

A dairy product has its nutritional value listed below. Which type of dairy product is it
likely to be? Show your working and explain.

<br />
<table style="text-align:left;">
    <tr>
        <th>Protein</th>
        <td style="padding:0 2em;">4.8 g</td>
        <th>Sugar</th>
        <td style="padding:0 2em;">19.1 g</td>
    </tr>
    <tr>
        <th>Fat</th>
        <td style="padding:0 2em;">23 g</td>
        <th>Vitamin A</th>
        <td style="padding:0 2em;">17 mcg</td>
    </tr>
    <tr>
        <th>Carbohydrate</th>
        <td style="padding:0 2em;">26.2 g</td>
        <th>Calcium</th>
        <td style="padding:0 2em;">121 mg</td>
    </tr>
</table>

In [None]:
# Nutrient values of the unknown product from the table above, as a single-row
# 2-D array. Order: protein, fat, carbohydrate, sugar, vitamin A, calcium --
# presumably the same order as df_numeric's columns; confirm before reuse.
target = np.array([4.8, 23, 26.2, 19.1, 17, 121]).reshape(1, -1)

In [None]:
# Standardise the target with the already-fitted scaler, then project it onto
# the retained loadings; .iloc[0] turns the single result row into a Series
# indexed by PC label.
pc_scores = (scaler.transform(target) @ pca_results_focused.T).iloc[0]
pc_scores

In [None]:
def append_to_series(s: pd.Series, value):
    """Return a new Series consisting of ``s`` followed by ``value``.

    ``value`` is wrapped in a Series first, so a scalar adds one element and
    a list-like adds one element per item. The result is re-indexed from 0;
    ``s`` itself is not modified.
    """
    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([s, pd.Series(data=value)], ignore_index=True)

def append_to_dataframe(df: pd.DataFrame, values, index):
    """Return a new DataFrame consisting of ``df`` plus one extra row.

    Parameters
    ----------
    df
        Frame to extend; it is not modified.
    values
        Values of the new row.
    index
        Column label for each entry of ``values``; the row is aligned to the
        columns of ``df`` by these labels.

    Returns
    -------
    pd.DataFrame
        ``df`` with the new row appended and the row index renumbered from 0.
    """
    # DataFrame.append was removed in pandas 2.0. Build the row as a one-row
    # frame (Series -> column -> transpose) and concatenate instead.
    new_row = pd.Series(data=values, index=index).to_frame().T
    return pd.concat([df, new_row], ignore_index=True)

In [None]:
# Redraw the score plot with the standardised target appended as an extra
# observation labelled 'Target' (listed first in the legend via hue_order),
# then circle the target's position using its PC 1 / PC 2 scores.
circular_mark(score_plot(
    append_to_dataframe(df_scaled, scaler.transform(target).flatten(), df_scaled.columns.values),
    pca_results_focused,
    labels=append_to_series(df['Type'], 'Target'),
    hue_order=['Target', 'Ice cream', 'Cheese', 'Yogurt', 'Cream', 'Milk']),
        pc_scores['PC 1'], pc_scores['PC 2'],
        markersize=15
)

Answer: Ice Cream
<br />
Explanation: According to the labelled score plot above, the target is 