# Question 2

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Miscellaneous Dependencies
from typing import Union, List

## Import Data

In [None]:
# Load the glass dataset, discard its 'ID' column, and display the resulting frame.
df = pd.read_csv('./data/glass.csv').drop(columns='ID')
df

## Part (a)

Should PCA be carried out on covariance or correlation matrix?

Explain.

PCA should be carried out on the covariance matrix. `RI` measures the ratio of the velocity of light in a vacuum to its velocity in the objects. On the other hand, `Na`, `Mg`, `Al`, `Si`, `K` and `Ca` measure the proportions of their corresponding oxides in the objects by weight. As `RI` and the 6 other quantitative variables have different scales and measure different things, PCA should be carried out on the covariance matrix instead of the correlation matrix.

## Part (b)

Extract the principal components.

Justify your decision and interpret the principal components.

You should include the necessary tables, outputs and graphs.

In [None]:
# Keep every column except the last one.
# NOTE(review): assumes the last column is the 'Class' label used for colouring later — confirm.
df_numeric = df.iloc[:,:-1]
df_numeric

In [None]:
# Fit PCA on the raw (unstandardised) columns, requesting 7 components.
# NOTE(review): sklearn's PCA centres but does not rescale its input, so this should
# correspond to a covariance-matrix PCA — confirm against the sklearn documentation.
pca = PCA(n_components=7).fit(df_numeric)

# Component loadings (one row per component), rounded for display.
pca.components_.round(5)

In [None]:
def get_pca_results(pca, col_names, precision: int = 5):
    """Tabulate a fitted PCA as one row per principal component.

    Args:
        pca: Fitted object exposing ``n_components_``, ``explained_variance_``,
            ``explained_variance_ratio_`` and ``components_``.
        col_names: Labels for the loading columns, one per column of
            ``pca.components_``.
        precision: Number of decimals the table is rounded to.

    Returns:
        DataFrame indexed 'PC 1', 'PC 2', ... whose first three columns are the
        eigenvalue, explained variance ratio and cumulative explained variance
        ratio, followed by the loadings.
    """
    summary_labels = ['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance']
    ratios = pca.explained_variance_ratio_

    # Turn each 1-D summary vector into a column so it can sit beside the loadings.
    table = np.hstack((
        pca.explained_variance_[:, None],
        ratios[:, None],
        ratios.cumsum()[:, None],
        pca.components_,
    ))

    row_labels = [f'PC {k}' for k in range(1, pca.n_components_ + 1)]
    column_labels = np.concatenate((np.array(summary_labels), col_names))

    frame = pd.DataFrame(data=table, index=row_labels, columns=column_labels)
    return frame.round(precision)

# Summarise the fitted PCA, labelling the loading columns with df_numeric's column names.
pca_results = get_pca_results(pca, df_numeric.columns.values)
pca_results

### Select Principal Components

#### Method 1 - Cumulative Explained Variance

In [None]:
def eig_expl(pca, proportion=0.8):
    """Keep the leading components up to the first one reaching ``proportion``.

    Args:
        pca: Results table with a 'Cumulative Explained Variance' column,
            one row per principal component.
        proportion: Cumulative explained variance threshold.

    Returns:
        The rows whose cumulative explained variance is below ``proportion``
        plus the next row (all rows if the threshold is never reached).
    """
    below_threshold = pca['Cumulative Explained Variance'] < proportion
    # One extra row: the component that first reaches the threshold.
    rows_to_keep = int(below_threshold.sum()) + 1
    return pca.iloc[:rows_to_keep]

# Components needed to reach the default 0.8 cumulative explained variance.
eig_expl(pca_results)

#### Method 2 - Eigenvalues

PCA was carried out on the covariance matrix rather than the correlation matrix, so Method 2 (keeping only components with eigenvalue ≥ 1) is not applicable.

In [None]:
# def eig_more_1(pca):
#     return pca[pca['Eigenvalue'] >= 1]
# 
# eig_more_1(pca_results)

#### Method 3 - Scree Plot

In [None]:
def circular_mark(ax, x, y, **kwargs):
    """Plot a marker at (x, y) on ``ax`` and return ``ax``.

    Unless overridden through ``kwargs``, the marker is a large hollow circle
    with a red edge. All keyword arguments are forwarded to ``ax.plot``.
    """
    fallback_style = (
        ('marker', 'o'),
        ('mec', 'r'),
        ('mfc', 'none'),
        ('markersize', 28),
    )
    # Caller-supplied options win; only missing ones are filled in.
    for option, value in fallback_style:
        kwargs.setdefault(option, value)
    ax.plot(x, y, **kwargs)
    return ax

In [None]:
def scree_plot(pca, mark: Union[int, None] = None):
    """Draw a scree plot of the eigenvalues and return the components before the mark.

    Args:
        pca: Results table with an 'Eigenvalue' column and one row per
            principal component (rows are plotted in table order).
        mark: 1-based position of the component to circle on the plot, or
            None to draw the plot without a mark.

    Returns:
        The first ``mark - 1`` rows of ``pca``; the whole table when ``mark``
        is None.
    """
    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(x=pca.index, y=pca['Eigenvalue'])
        ax.set(
            title='Scree Plot'
        )
        if mark is not None:
            # `mark` is a position, whereas the table index holds labels, so
            # the eigenvalue has to be looked up positionally with .iloc.
            circular_mark(ax, mark - 1, pca['Eigenvalue'].iloc[mark - 1])

    # Without a mark there is nothing to cut at (and `None - 1` would raise).
    if mark is None:
        return pca
    return pca.iloc[:mark - 1]

# Circle the 4th component on the scree plot; the call returns the rows before it (the first 3).
scree_plot(pca_results, mark=4)

Both methods 1 and 3 agree that the top 3 Principal Components should be kept

In [None]:
def get_focused_pca(pca, num_of_pcs):
    """Return only the loadings of the first ``num_of_pcs`` principal components.

    Args:
        pca: Results table with one row per principal component whose first
            three columns are summary statistics and whose remaining columns
            are the loadings.
        num_of_pcs: Number of leading components (rows) to keep.

    Returns:
        DataFrame with the first ``num_of_pcs`` rows and the loading columns.
    """
    # Slice the table that was passed in, not a notebook-level global, so the
    # function works for any results table.
    return pca.iloc[:num_of_pcs, 3:]

# Loadings of the first 3 principal components only (summary columns dropped).
pca_results_focused = get_focused_pca(pca_results, 3)
pca_results_focused

In [None]:
### Score Plot

Visualize the transformed data, i.e. the observations projected onto the first three principal components.

In [None]:
def score_plot(df, pca, labels: Union[pd.Series, None] = None, centre=False):
    """Build a 3-D scatter plot of the scores on 'PC 1', 'PC 2' and 'PC 3'.

    Args:
        df: Observations, one column per variable.
        pca: Loadings table with rows labelled 'PC 1', 'PC 2' and 'PC 3'.
        labels: Optional values used to colour the points.
        centre: When True, the column means of ``df`` are subtracted before
            projecting.

    Returns:
        The plotly figure (not shown; call ``.show()`` on it).
    """
    observations = df - df.mean(axis=0) if centre else df

    # Project the observations onto each of the three loading vectors.
    scores_1, scores_2, scores_3 = (
        observations @ pca.loc[component]
        for component in ('PC 1', 'PC 2', 'PC 3')
    )

    figure = px.scatter_3d(
        x=scores_1,
        y=scores_2,
        z=scores_3,
        color=labels,
        title='Score Plot',
        labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'},
        width=800,
        height=500,
    )
    figure.update_layout(margin=dict(l=30, r=10, t=75, b=20))
    return figure

# Score plot of the mean-centred observations, coloured by the 'Class' column.
score_plot(df_numeric, pca_results_focused, labels=df['Class'], centre=True).show()

In [None]:
def score_plot_2(df, pca, labels: Union[pd.Series, None] = None):
    """Compute the first three principal-component scores via ``pca.transform``.

    The plotting code below is currently commented out, so the function only
    computes the scores and returns None.

    Args:
        df: Observations to project.
        pca: Fitted object whose ``transform`` returns a 2-D array with at
            least three columns (one per component).
        labels: Optional values intended for colouring the points; unused
            while plotting is disabled.
    """
    transformed = pca.transform(df)
    # Transpose before unpacking: iterating a 2-D array yields rows, so without
    # `.T` this would try to unpack one value per observation instead of one
    # score vector per component.
    PC1, PC2, PC3 = transformed[:, :3].T

    # fig = px.scatter_3d(x=PC1, y=PC2, z=PC3, color=labels, title='Score Plot', labels={'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'}, width=800, height=500)

    # fig.update_layout(margin=dict(l=30, r=10, t=75, b=20))
    # return fig

# Same projection, but using the fitted PCA object's own transform() instead of the manual matrix product.
score_plot_2(df_numeric, pca, labels=df['Class'])

In [None]:
# def score_plot(df):
#     pc1 = df.loc['PC 1'].iloc[3:]
#     pc2 = df.loc['PC 2'].iloc[3:]
#     pc3 = df.loc['PC 3'].iloc[3:]
#     ax = sns.scatterplot(x=pc1, y=pc2, size=pc3)
#     ax.set(
#         title='Score Plot',
#         ylim=(-1, 1)
#     )
#     ylim = ax.get_ylim()
#     xlim = ax.get_xlim()
#     ax.plot([0, 0], [ylim[0], ylim[1]], color='grey', linestyle='--', linewidth=1)
#     ax.plot([xlim[0], xlim[1]], [0, 0], color='grey', linestyle='--', linewidth=1)

# score_plot(pca_res)

In [None]:
# # This function plots the loading plot.
# # Pass original data dataframe and returns of PCA to this function. Optional width, height and margin
# # This function returns the axes of the loading plot

# def loading_plot_C(data, pca, width=5, height=5, margin=0.5):

#     fig, ax = plt.subplots(figsize = (width,height))

#     #Set limits for figure
#     x_min = min(pca.components_[0,:].min(),0)-margin
#     x_max = max(pca.components_[0,:].max(),0)+margin
#     y_min = min(pca.components_[1,:].min(),0)-margin
#     y_max = max(pca.components_[1,:].max(),0)+margin

#     ax.set_xlim(x_min, x_max)
#     ax.set_ylim(y_min, y_max)

#     #Scaling factor for text position
#     text_pos = 0.2

#     for i, v in enumerate(pca.components_.T):
#         ax.arrow(0, 0, v[0], v[1], head_width=0.1, head_length=0.1, linewidth=2, color='red')
#         ax.text(v[0], v[1]+text_pos, data.columns[i], color='black', ha='center', va='center', fontsize=12)

#     plt.plot([x_min, x_max], [0, 0], color='k', linestyle='--', linewidth=1)
#     plt.plot([0, 0], [y_min, y_max], color='k', linestyle='--', linewidth=1)
#     ax.set_xlabel("PC1", fontsize=14)
#     ax.set_ylabel("PC2", fontsize=14)
#     ax.set_title("Loading plot", fontsize = 14)

# loading_plot_C(df_numeric, pca)

In [None]:
def loading_plots(pca_res):
    """Draw one horizontal strip of loadings per principal component.

    Each row of ``pca_res`` gets its own subplot in which every variable's
    loading is a point on the x-axis, labelled with the variable name.

    Args:
        pca_res: Loadings table, one row per principal component and one
            column per variable.
    """
    n_components = pca_res.shape[0]
    variable_names = pca_res.columns.values
    # Shared x-range so that the strips are directly comparable.
    x_limits = (pca_res.values.min() - 0.2, pca_res.values.max() + 0.2)

    # squeeze=False keeps the axes in a 2-D array, so a table with a single
    # component is handled the same way as one with several.
    fig, axes = plt.subplots(nrows=n_components, figsize=(8, 8), squeeze=False)
    for i, ax in enumerate(axes[:, 0]):
        loadings = pca_res.iloc[i]
        ax.set(
            xlim=x_limits,
            ylim=(-1, 1),
            yticks=()
        )
        sns.scatterplot(x=loadings, y=[0] * len(loadings), hue=variable_names, ax=ax, legend=None)
        # Reference line at zero loading, spanning the full height of the strip.
        ax.axvline(0, linestyle='--', color='grey', linewidth=1)
        for j, name in enumerate(variable_names):
            # Stagger the labels vertically so neighbouring loadings stay readable.
            ax.text(x=loadings.iloc[j] - 0.02, y=0.1 + j / 18, s=name)
    plt.subplots_adjust(hspace=0.4)

# One loading strip per retained principal component.
loading_plots(pca_results_focused)

### Interpretation of the Principal Components

1.  PC 1:
    *   PC 1 seems to measure the contrast between the amount of silicon and calcium against the amount of potassium, sodium, aluminium and magnesium
    *   Therefore, a higher score for PC 1 means that the glass contains relatively more silicon and calcium than potassium, sodium, aluminium and magnesium
    *   A lower score for PC 1 means that the glass is relatively rich in potassium, sodium, aluminium and magnesium
2.  PC 2:
    *   PC 2 seems to measure the contrast between the amount of potassium, calcium and aluminium against the amount of sodium, silicon and magnesium

## Part (c)

The following shows the attributes of a glass object. Which class does it likely belong to?
Explain your answer with the aid of a suitable graph with colour or marker to display
‘Class’ information.

<br />
<table style="text-align:left;">
    <tr>
        <th>RI</th>
        <td style="padding:0 1em;">1.51641</td>
        <th>Si</th>
        <td style="padding:0 1em;">73.05</td>
    </tr>
    <tr>
        <th>Na</th>
        <td style="padding:0 1em;">13.04</td>
        <th>K</th>
        <td style="padding:0 1em;">0.53</td>
    </tr>
    <tr>
        <th>Mg</th>
        <td style="padding:0 1em;">3.5</td>
        <th>Ca</th>
        <td style="padding:0 1em;">8.6</td>
    </tr>
    <tr>
        <th>Al</th>
        <td style="padding:0 1em;">1.28</td>
        <th></th>
        <td style="padding:0 1em;"></td>
    </tr>
</table>

In [None]:
def score_plot_with_target(pca, df_numeric, labels, target_vars: List[float]):
    """Draw the score plot with one extra observation labelled 'Target'.

    The extra observation is appended as the last row of the data, and the
    label 'Target' is appended to ``labels`` so it is coloured separately.

    Args:
        pca: Loadings table passed through to ``score_plot``.
        df_numeric: Observations, one column per variable.
        labels: Label of each row in ``df_numeric``.
        target_vars: Values of the extra observation, in the same order as
            the columns of ``df_numeric``.

    Returns:
        Whatever ``score_plot`` returns for the extended data (centred).
    """
    # pd.concat is used because DataFrame.append / Series.append no longer
    # exist in pandas 2.x.
    target_row = pd.DataFrame([np.asarray(target_vars)], columns=df_numeric.columns.values)
    data_with_target = pd.concat([df_numeric, target_row], ignore_index=True)
    labels_with_target = pd.concat([labels, pd.Series(data=['Target'])], ignore_index=True)

    return score_plot(
        data_with_target,
        pca,
        labels=labels_with_target,
        centre=True)

# Values of the Part (c) glass object.
# NOTE(review): assumes df_numeric's columns are ordered RI, Na, Mg, Al, Si, K, Ca — confirm.
score_plot_with_target(pca_results_focused, df_numeric, df['Class'], [1.51641, 13.04, 3.5, 1.28, 73.05, 0.53, 8.6])

Answer: VehWin
<br />
Explanation: According to the labelled score plot above, the target can be located in the VehWin (purple) cluster

## Part (d)

 Explain how PC3 is advantageous over the first two principal components.

Answer: PC3 captures the contrast in critical values for PC1 and PC2 scores
<br />
Explanation:
*   `Mg`, `K` and `Ca` have negative loading values for PC3
*   `Mg`, `K` and `Ca` have large absolute loading values for PC1 and PC2
*   For PC1, `K` has the lowest loading value while `Ca` has the highest loading value
*   For PC2, `Mg` has the lowest loading value while `K` has the highest loading value