In [None]:
%reset -f

# Question 2 - Refraction of Glass

In [None]:
# Data Manipulation Dependencies
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Miscellaneous Dependencies
from typing import Union, List

## Import Data

In [None]:
# Load the glass data set and discard the `ID` column in a single chained step
df = pd.read_csv('./data/glass.csv').drop(columns='ID')
df

In [None]:
# Extract the quantitative variables from the raw data:
# keep every column except the last one
# (presumably the last column is the `Class` label - verify against the CSV)
df_numeric = df.iloc[:,:-1]
df_numeric

## Part (a)

Should PCA be carried out on covariance or correlation matrix?<br />
Explain.

PCA should be carried out on the covariance matrix. `RI` measures the ratio of the velocity of light in a vacuum to its velocity in the objects. On the other hand, `Na`, `Mg`, `Al`, `Si`, `K` and `Ca` measure the proportions of their corresponding oxides in the objects by weight. As `RI` and the 6 other quantitative variables have different scales and measure different things, it doesn't make sense to carry out PCA on the correlation matrix. Therefore, PCA should be carried out on the covariance matrix.

## Part (b)

Extract the principal components.<br />
Justify your decision and interpret the principal components.<br />
You should include the necessary tables, outputs and graphs.

In [None]:
# Fit a 7-component PCA to the unscaled numeric columns.
# The data are not standardised beforehand, so this corresponds to PCA on the covariance matrix.
pca = PCA(n_components=7)
pca.fit(df_numeric)

# View the eigenvectors (one row per principal component), rounded to 3 decimal places
np.round(pca.components_, 3)

In [None]:
def get_pca_results(pca, col_names, precision: int = 5):
    """Summarise a fitted PCA as one table with a row per principal component.

    Parameters
    ----------
    pca : fitted PCA-like object
        Must expose ``n_components_``, ``explained_variance_``,
        ``explained_variance_ratio_`` and ``components_``.
    col_names : sequence of str
        Names of the original variables, used to label the eigenvector columns.
    precision : int, optional
        Number of decimal places every value is rounded to.

    Returns
    -------
    pandas.DataFrame
        Indexed 'PC 1', 'PC 2', ...; the first three columns hold the
        eigenvalue, the explained variance ratio and its running total, and
        the remaining columns hold the eigenvector entries for each variable.
    """
    row_labels = [f'PC {k}' for k in range(1, pca.n_components_ + 1)]
    summary_labels = ['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance']
    column_labels = summary_labels + list(col_names)

    variance_ratio = pca.explained_variance_ratio_
    # column_stack turns each 1-D vector into a column and appends the 2-D eigenvector matrix as-is
    table = np.column_stack((
        pca.explained_variance_,
        variance_ratio,
        np.cumsum(variance_ratio),
        pca.components_,
    ))

    summary = pd.DataFrame(table, index=row_labels, columns=column_labels)
    return summary.round(precision)

# Build the summary table for the fitted PCA; the loading columns are labelled
# with the names of the numeric variables
pca_results = get_pca_results(pca, df_numeric.columns.values)
pca_results

### Select Principal Components

#### Method 1 - Kaiser's Rule

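PCA was carried out on the covariance matrix rather than the correlation matrix, so Kaiser's Rule (which retains PCs whose eigenvalue exceeds 1 and therefore assumes standardized variables) is not applicable in this case.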

#### Method 2 - Cumulative Explained Variance

Once again, 80% is the benchmark in this analysis. The top 3 PCs together explain about 90% of the total variance, which exceeds the 80% benchmark. Hence, only the top 3 PCs should be retained.

In [None]:
def eig_expl(pca, proportion=0.8):
    """Return the leading PCs needed for the cumulative explained variance to reach `proportion`.

    Parameters
    ----------
    pca : pandas.DataFrame
        PCA summary table with a 'Cumulative Explained Variance' column,
        one row per principal component.
    proportion : float, optional
        Threshold for the cumulative explained variance.

    Returns
    -------
    pandas.DataFrame
        Every row whose cumulative value is still below the threshold plus
        the next row; all rows when the threshold is never reached.
    """
    still_below = pca['Cumulative Explained Variance'] < proportion
    # Number of PCs under the threshold, plus the one that crosses it
    rows_to_keep = int(still_below.sum()) + 1
    return pca.iloc[:rows_to_keep]

# Show the leading PCs up to and including the first one whose cumulative
# explained variance reaches the default threshold (0.8)
eig_expl(pca_results)

#### Method 3 - Scree Plot

By the scree plot, there is an elbow at the 4th PC. Therefore, only the top 3 PCs should be kept.

In [None]:
def circular_mark(ax, x, y, **kwargs):
    """Highlight the point (x, y) on `ax` with a marker and return `ax`.

    Any keyword argument is forwarded to ``ax.plot``. Options the caller does
    not supply fall back to a large, unfilled circle with a red edge.
    """
    fallback_style = {'marker': 'o', 'mec': 'r', 'mfc': 'none', 'markersize': 28}
    for option, value in fallback_style.items():
        # Caller-supplied options always win over the fallback
        kwargs.setdefault(option, value)
    ax.plot(x, y, **kwargs)
    return ax

In [None]:
def scree_plot(pca, mark: Union[int, None] = None):
    """Draw a scree plot of the eigenvalues and return the PCs before the marked one.

    Parameters
    ----------
    pca : pandas.DataFrame
        PCA summary table with one row per principal component and an
        'Eigenvalue' column.
    mark : int or None, optional
        1-based position of the principal component to circle (the elbow).
        When ``None`` nothing is circled.

    Returns
    -------
    pandas.DataFrame
        The rows of ``pca`` that come before the marked component, or all of
        ``pca`` when ``mark`` is ``None``.
    """
    eigenvalues = pca['Eigenvalue']
    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(x=pca.index, y=eigenvalues)
        ax.set(
            title='Scree Plot'
        )
        if mark is not None:
            # Positional lookup: the table is indexed by labels such as 'PC 1',
            # so an integer inside plain [] is not a reliable way to pick the n-th row.
            circular_mark(ax, mark - 1, eigenvalues.iloc[mark - 1])

    # Without a mark there is no cut-off, so nothing is dropped
    if mark is None:
        return pca
    return pca.iloc[:mark - 1]

# Circle the 4th PC on the scree plot and display the rows that come before it
scree_plot(pca_results, mark=4)

#### Summary of Principal Component Selection

Both methods 2 and 3 agree that the top 3 Principal Components should be kept.

In [None]:
def get_focused_pca_results(pca_results, num_of_pcs):
    """Return only the loading columns of the first `num_of_pcs` principal components.

    The first three columns of `pca_results` are skipped, so only the columns
    from the fourth one onwards are returned.
    """
    n_leading_columns = 3  # columns skipped before the per-variable loadings start
    retained_rows = pca_results.iloc[:num_of_pcs]
    return retained_rows.iloc[:, n_leading_columns:]

# Keep only the loadings of the top 3 PCs selected above
pca_results_focused = get_focused_pca_results(pca_results, 3)
pca_results_focused

### Score Plot

Visualize the transformed points using a 3D scatter plot.

In [None]:
def score_plot(df, pca, labels: Union[pd.Series, None] = None):
    """Plot the observations of `df` in the space of the first three principal components.

    Parameters
    ----------
    df : array-like
        Observations to project; passed unchanged to ``pca.transform``.
    pca : fitted PCA-like object
        Must provide ``transform`` returning at least three score columns.
    labels : pandas.Series or None, optional
        Passed as ``color`` to the scatter plot, so each point is coloured by its label.

    Returns
    -------
    The figure created by ``px.scatter_3d``, with its margins adjusted.
    """
    scores = pca.transform(df)
    axis_titles = {'x': 'PC 1', 'y': 'PC 2', 'z': 'PC 3'}

    figure = px.scatter_3d(
        x=scores[:, 0],
        y=scores[:, 1],
        z=scores[:, 2],
        color=labels,
        title='Score Plot',
        labels=axis_titles,
        width=800,
        height=500,
    )
    figure.update_layout(margin={'l': 30, 'r': 10, 't': 75, 'b': 20})
    return figure

# Project the numeric data onto the first three PCs and colour each point by its `Class`
score_plot(df_numeric, pca, labels=df['Class'])

From the 3D score plot, it is quite obvious that the different classes of glass objects form relatively distinct clusters. This is useful in predicting the class of an unknown glass object, like in Part (c).

### Loading Plots

Visualize the loading values using several 1D scatter plots (1 for each PC).

In [None]:
def loading_plots(pca_results):
    """Draw one 1-D loading plot per principal component, stacked vertically.

    Parameters
    ----------
    pca_results : pandas.DataFrame
        Loading table with one row per principal component and one column per
        variable. Every panel shares the same x-range so the panels can be
        compared directly.
    """
    n_pcs, n_vars = pca_results.shape
    variable_names = pca_results.columns.values

    # squeeze=False keeps `axes` two-dimensional, so a single-row table works too
    fig, axes = plt.subplots(nrows=n_pcs, figsize=(8, 8), squeeze=False)
    fig.suptitle(t='Loading Plots', y=0.92, fontsize=14)

    # The x-range depends on the whole table, so compute it once outside the loop
    x_limits = (pca_results.values.min() - 0.2, pca_results.values.max() + 0.2)

    for i, ax in enumerate(axes[:, 0]):
        loadings = pca_results.iloc[i]
        ax.set(
            xlim=x_limits,
            ylim=(-1, 1),
            yticks=()
        )
        # All points sit on y = 0; the legend is suppressed because each point gets a text label below
        sns.scatterplot(x=loadings, y=[0] * n_vars, hue=variable_names, ax=ax, legend=False)
        # Dashed vertical reference line at a loading of zero
        ax.plot([0, 0], [-1, 1], '--', color='grey', linewidth=1)
        for j, name in enumerate(variable_names):
            # Stagger the labels vertically so that neighbouring names do not overlap
            ax.text(x=loadings.iloc[j] - 0.02, y=0.1 + j / 18, s=name)
    plt.subplots_adjust(hspace=0.4)

# One loading plot for each of the retained PCs
loading_plots(pca_results_focused)

In [None]:
# Inspect the loadings of RI on the retained PCs to check whether RI is redundant
pca_results_focused['RI']

### Interpretation of the Principal Components

1.  PC 1:
    *   PC 1 seems to measure the contrast between the concentrations of silicon oxide (SiO$_{2}$) and calcium oxide (CaO) against the concentrations of potassium oxide (K$_{2}$O), sodium oxide (Na$_{2}$O), aluminium oxide (Al$_{2}$O$_{3}$) and magnesium oxide (MgO)
    *   `Ca` and `Si` have positive loading values while
    *   `K`, `Na`, `Al` and `Mg` have negative loading values
    *   A higher PC 1 score means that the glass object has a higher concentration of silicon oxide and calcium oxide as compared to the rest of the oxides involved
    *   A lower PC 1 score means that the glass object has a lower concentration of silicon oxide and calcium oxide as compared to the rest of the oxides involved
2.  PC 2:
    *   PC 2 seems to measure the contrast between the concentrations of potassium oxide, calcium oxide and aluminium oxide against the concentrations of magnesium oxide, sodium oxide and silicon oxide
    *   `K`, `Ca` and `Al` have positive loading values while
    *   `Na`, `Si` and `Mg` have negative loading values
    *   A higher PC 2 score means that the glass object has a higher concentration of potassium oxide, calcium oxide and aluminium oxide as compared to the rest of the oxides involved
    *   A lower PC 2 score means that the glass object has a lower concentration of potassium oxide, calcium oxide and aluminium oxide as compared to the rest of the oxides involved
3.  PC 3:
    *   PC 3 seems to measure the contrast between the concentrations of aluminium oxide, sodium oxide and silicon oxide against the concentrations of magnesium oxide, calcium oxide and potassium oxide
    *   `Al`, `Na` and `Si` have positive loading values while
    *   `Mg`, `Ca` and `K` have negative loading values
    *   A higher PC 3 score means that the glass object has a higher concentration of aluminium oxide, sodium oxide and silicon oxide as compared to the rest of the oxides involved
    *   A lower PC 3 score means that the glass object has a lower concentration of aluminium oxide, sodium oxide and silicon oxide as compared to the rest of the oxides involved

`RI` has a very small absolute loading value for all 3 PCs. This means that `RI` adds little to no information. Thus, it can be considered a redundant variable in this case study.

## Part (c)

The following shows the attributes of a glass object.<br />
Which class does it likely belong to?<br />
Explain your answer with the aid of a suitable graph with colour or marker to display "Class" information.

<br />
<table style="text-align:left;">
    <tr>
        <th>RI</th>
        <td style="padding:0 1em;">1.51641</td>
        <th>Si</th>
        <td style="padding:0 1em;">73.05</td>
    </tr>
    <tr>
        <th>Na</th>
        <td style="padding:0 1em;">13.04</td>
        <th>K</th>
        <td style="padding:0 1em;">0.53</td>
    </tr>
    <tr>
        <th>Mg</th>
        <td style="padding:0 1em;">3.5</td>
        <th>Ca</th>
        <td style="padding:0 1em;">8.6</td>
    </tr>
    <tr>
        <th>Al</th>
        <td style="padding:0 1em;">1.28</td>
        <th></th>
        <td style="padding:0 1em;"></td>
    </tr>
</table>

In [None]:
def score_plot_with_target(pca, df_numeric, labels, target_vars: List[float]):
    """Draw the score plot with one extra observation labelled 'Target'.

    Parameters
    ----------
    pca : fitted PCA-like object
        Passed on to ``score_plot``.
    df_numeric : pandas.DataFrame
        Numeric observations to plot.
    labels : pandas.Series
        Label of each row of ``df_numeric``.
    target_vars : list of float
        Values of the extra observation, in the same order as the columns of
        ``df_numeric``.

    Returns
    -------
    Whatever ``score_plot`` returns for the extended data and labels.
    """
    # `DataFrame.append` / `Series.append` no longer exist in pandas >= 2.0, so
    # the extra row and its label are attached with `pd.concat` instead.
    target_row = pd.DataFrame(
        np.asarray(target_vars).reshape(1, -1),
        columns=df_numeric.columns,
    )
    data_with_target = pd.concat([df_numeric, target_row], ignore_index=True)
    labels_with_target = pd.concat([labels, pd.Series(['Target'])], ignore_index=True)

    return score_plot(data_with_target, pca, labels=labels_with_target)

# Plot the unknown glass object together with the labelled data.
# NOTE(review): the values are listed as RI, Na, Mg, Al, Si, K, Ca - this assumes
# df_numeric's columns are in that order; confirm against the CSV.
score_plot_with_target(pca, df_numeric, df['Class'], [1.51641, 13.04, 3.5, 1.28, 73.05, 0.53, 8.6])

<table style="text-align:left;">
    <tr>
        <th>RI</th>
        <td style="padding:0 1em;">1.51641</td>
        <th>Si</th>
        <td style="padding:0 1em;">73.05</td>
    </tr>
    <tr>
        <th>Na</th>
        <td style="padding:0 1em;">13.04</td>
        <th>K</th>
        <td style="padding:0 1em;">0.53</td>
    </tr>
    <tr>
        <th>Mg</th>
        <td style="padding:0 1em;">3.5</td>
        <th>Ca</th>
        <td style="padding:0 1em;">8.6</td>
    </tr>
    <tr>
        <th>Al</th>
        <td style="padding:0 1em;">1.28</td>
        <th></th>
        <td style="padding:0 1em;"></td>
    </tr>
</table>
<br />
<br />

The glass object likely belongs to the `VehWin` (Vehicle Window) class. The data given above were projected onto PCs 1, 2 and 3 and plotted on a labelled 3D score plot, together with the rest of the data points. As the target data point (light blue) is situated within the `VehWin` (purple) cluster, it is likely that the glass object belongs to the `VehWin` class.

## Part (d)

Explain how PC3 is advantageous over the first two principal components.

In [None]:
# The first two rows (PC 1 and PC 2) are plotted with their absolute loading values,
# so only the magnitude of each variable's loading is shown for them.
# The third row (PC 3) is left unaltered and keeps the signs of its loadings.
loading_plots(pd.concat((pca_results_focused.abs().iloc[:2], pca_results_focused.iloc[2:3])))

PC3 captures the contrast between the more important variables and less important variables of both PC1 and PC2. For PC1, `Ca` and `K` have the 2 highest absolute loading values. In other words, `Ca` and `K` are the most important variables for PC 1. For PC2, `K` and `Mg` are the most important variables. These 3 variables (`Mg`, `K` and `Ca`) have negative loading values for PC3. The rest of the variables (`Si`, `Al` and `Na`) have positive loading values for PC3. PC3 tells the difference between the positive and negative values of the individual variables during transformation by PC1 or PC2. For example, if PC1 was close to 0, and PC3 was large, that means that during the transformation of PC1, the individual transformed variables were quite even.