In [None]:
import numpy as np
import sympy as sp
import pandas as pd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import plotly.express as px

from typing import Union

from sklearn.decomposition import PCA

In [None]:
# Read the raw table and discard its 'ID' column in one chained expression,
# so no frame is mutated in place.
df = pd.read_csv('./data/glass.csv').drop(columns='ID')
df

## Part (a)

PCA should be carried out on the covariance matrix rather than the correlation matrix. All the variables (nutrients) are measures of mass; although they are recorded in different units, they can be rescaled to a common unit (grams), so their variances are directly comparable.

## Part (b)

In [None]:
# Keep every column except the last one as the input to PCA.
# NOTE(review): presumably the last column is the 'Class' label that is used
# to colour the score plots further down — confirm against the CSV.
df_numeric = df.iloc[:,:-1]
df_numeric

In [None]:
# Fit a 7-component PCA directly on the feature columns.
# NOTE(review): no standardisation is applied beforehand, which matches the
# covariance-matrix choice argued in Part (a) — this assumes sklearn's PCA only
# centres (and does not scale) its input; confirm against the sklearn docs.
pca = PCA(n_components=7).fit(df_numeric)

# Component vectors of the fitted model, rounded to 5 decimals for display.
pca.components_.round(5)

In [None]:
def get_pca_results(pca, col_names, precision: int = 5):
    """Summarise a fitted PCA as one table with a row per principal component.

    Parameters
    ----------
    pca
        Fitted PCA-like object exposing ``n_components_``,
        ``explained_variance_``, ``explained_variance_ratio_`` and
        ``components_``.
    col_names
        Names used to label the component (eigenvector) columns.
    precision : int
        Number of decimals every value in the table is rounded to.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'PC 1', 'PC 2', ...; the first three columns are
        'Eigenvalue', 'Explained Variance' and 'Cumulative Explained Variance',
        followed by one column per entry of ``col_names``.
    """
    row_labels = [f'PC {k}' for k in range(1, pca.n_components_ + 1)]

    summary_headers = np.array(['Eigenvalue', 'Explained Variance', 'Cumulative Explained Variance'])
    header = np.concatenate((summary_headers, col_names))

    ratio = pca.explained_variance_ratio_
    # Promote the three 1-D summaries to column vectors so they can sit
    # side by side with the 2-D eigenvector matrix.
    table = np.hstack((
        pca.explained_variance_[:, None],
        ratio[:, None],
        ratio.cumsum()[:, None],
        pca.components_,
    ))

    summary = pd.DataFrame(data=table, index=row_labels, columns=header)
    return summary.round(precision)

# Tabulate the fitted PCA, labelling the eigenvector columns with the feature names.
pca_results = get_pca_results(pca, df_numeric.columns.values)
pca_results

In [None]:
# def pca_results(data, pca):
    
#     # Dimension indexing
#     dimensions = [f'PC {i}' for i in range(1, len(pca.components_) + 1)]
    
#     # PCA components
#     components = pd.DataFrame(np.round(pca.components_, 4), columns = data.keys()) 
#     components.index = dimensions

#     #PCA eigenvalues
#     ev = pca.explained_variance_.reshape(len(pca.components_), 1)
#     eigenvalues = pd.DataFrame(np.round(ev, 4), columns = ['Eigenvalue']) 
#     eigenvalues.index = dimensions
    
#     # PCA explained variance
#     ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1) 
#     variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance']) 
#     variance_ratios.index = dimensions

#     cum_ratios = np.cumsum(ratios)
#     cum_variance_ratios = pd.DataFrame(np.round(cum_ratios, 4), columns = ['Cumulative Explained Variance']) 
#     cum_variance_ratios.index = dimensions

#     # Return a concatenated DataFrame
#     return pd.concat([eigenvalues, variance_ratios, cum_variance_ratios, components], axis = 1)

# pca_res = pca_results(df_numeric, pca)
# pca_res

In [None]:
def eig_expl(pca, proportion=0.8):
    """Select the leading PCs needed to explain a given share of the variance.

    Parameters
    ----------
    pca : pandas.DataFrame
        PCA summary table with a 'Cumulative Explained Variance' column.
    proportion : float
        Cumulative explained-variance threshold.

    Returns
    -------
    pandas.DataFrame
        The leading rows of ``pca``: every row whose cumulative value is still
        below ``proportion`` plus the next row (the first to reach it).
    """
    cumulative = pca['Cumulative Explained Variance']
    # Number of rows still short of the threshold; one more row is kept so the
    # selection actually reaches it.
    n_below = int((cumulative < proportion).sum())
    return pca.iloc[:n_below + 1]

# Keep the leading PCs up to and including the first one whose cumulative
# explained variance reaches the default 0.8 threshold.
eig_expl(pca_results)

### PCA was not carried out on the correlation matrix, so the eigenvalue-greater-than-one (Kaiser) criterion is not applicable.

In [None]:
# def eig_more_1(pca):
#     return pca[pca['Eigenvalue'] >= 1]
# 
# eig_more_1(pca_results)

In [None]:
def circular_mark(ax, x, y, **kwargs):
    """Highlight the point (x, y) on ``ax`` with a marker and return ``ax``.

    Unless overridden through ``kwargs``, the marker is a large hollow red
    circle (marker='o', mec='r', mfc='none', markersize=28). All keyword
    arguments are forwarded to ``ax.plot``.
    """
    fallback_style = (
        ('marker', 'o'),
        ('mec', 'r'),
        ('mfc', 'none'),
        ('markersize', 28),
    )
    # Only fill in the options the caller did not specify.
    for option, value in fallback_style:
        kwargs.setdefault(option, value)

    ax.plot(x, y, **kwargs)
    return ax

In [None]:
def scree_plot(pca, mark: Union[int, None] = None):
    """Draw a scree plot of the eigenvalues and optionally circle one PC.

    Parameters
    ----------
    pca : pandas.DataFrame
        PCA summary table with one row per principal component and an
        'Eigenvalue' column; the index supplies the x-axis labels.
    mark : int or None
        1-based number of the PC to circle (e.g. the elbow). ``None`` draws
        the plot without any mark.

    Returns
    -------
    pandas.DataFrame
        The rows that precede the marked PC (the first ``mark - 1`` rows), or
        the whole table when ``mark`` is ``None``.
    """
    eigenvalues = pca['Eigenvalue']

    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(x=pca.index, y=eigenvalues)
        ax.set(title='Scree Plot')
        if mark is not None:
            # The table is indexed by labels ('PC 1', ...), so the eigenvalue
            # must be looked up by position explicitly; a bare integer key on
            # a label-indexed Series is deprecated positional fallback.
            circular_mark(ax, mark - 1, eigenvalues.iloc[mark - 1])

    # Without a mark there is no cut-off: `mark - 1` used to raise TypeError
    # here for the default argument.
    if mark is None:
        return pca.iloc[:]
    return pca.iloc[:mark - 1]

# Circle the 4th PC on the scree plot; the returned table holds the PCs before it.
scree_plot(pca_results, mark=4)

In [None]:
def get_focused_pca(pca, num_of_pcs):
    """Return only the loadings of the first ``num_of_pcs`` principal components.

    Parameters
    ----------
    pca : pandas.DataFrame
        PCA summary table with one row per PC. The first three columns are
        skipped; in the table built by ``get_pca_results`` these are the
        eigenvalue and the two explained-variance columns.
    num_of_pcs : int
        Number of leading rows (PCs) to keep.

    Returns
    -------
    pandas.DataFrame
        The first ``num_of_pcs`` rows of ``pca`` from the fourth column onwards.
    """
    # Slice the table that was passed in; the previous version read the global
    # `pca_results` and silently ignored this argument.
    return pca.iloc[:num_of_pcs, 3:]

# Loadings of the first 3 PCs, with the three leading summary columns removed.
pca_results_focused = get_focused_pca(pca_results, 3)
pca_results_focused

In [None]:
def score_plot(df, pca, labels: Union[pd.Series, None] = None, centre=False, **kwargs):
    """Build an interactive 3-D score plot of the observations on PC 1, 2 and 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations (rows) by features (columns).
    pca : pandas.DataFrame
        Loadings table with rows labelled 'PC 1', 'PC 2' and 'PC 3' and one
        column per feature of ``df``.
    labels : pandas.Series or None
        Passed to plotly as ``color`` to colour the points.
    centre : bool
        If True, the column means of ``df`` are subtracted before projecting.
    **kwargs
        Accepted for backward compatibility and ignored: existing callers pass
        options such as ``palette`` that plotly does not understand.

    Returns
    -------
    The figure object returned by ``plotly.express.scatter_3d``.
    """
    pc_names = ['PC 1', 'PC 2', 'PC 3']

    # Centre once, then project on all three loading vectors in one product.
    data = df - df.mean(axis=0) if centre else df
    scores = data @ pca.loc[pc_names].T

    # The score columns are already named 'PC 1'..'PC 3', so plotly uses them
    # as axis titles. The previous `labels=[...]` list was not a valid value
    # for plotly's `labels` (a dict is expected), so the titles were never applied.
    return px.scatter_3d(
        scores,
        x='PC 1',
        y='PC 2',
        z='PC 3',
        color=labels,
        title='Score Plot',
    )

# A cell only renders the value of its last expression, so the uncentred figure
# was never displayed; show both figures explicitly instead.
# The former `palette='crest'` argument was dropped: score_plot ignores it.
score_plot(df_numeric, pca_results_focused, df['Class'], centre=False).show()
score_plot(df_numeric, pca_results_focused, df['Class'], centre=True).show()

In [None]:
# def score_plot(df):
#     pc1 = df.loc['PC 1'].iloc[3:]
#     pc2 = df.loc['PC 2'].iloc[3:]
#     pc3 = df.loc['PC 3'].iloc[3:]
#     ax = sns.scatterplot(x=pc1, y=pc2, size=pc3)
#     ax.set(
#         title='Score Plot',
#         ylim=(-1, 1)
#     )
#     ylim = ax.get_ylim()
#     xlim = ax.get_xlim()
#     ax.plot([0, 0], [ylim[0], ylim[1]], color='grey', linestyle='--', linewidth=1)
#     ax.plot([xlim[0], xlim[1]], [0, 0], color='grey', linestyle='--', linewidth=1)

# score_plot(pca_res)

In [None]:
def loading_plot_C(data, pca, width=5, height=5, margin=0.5):
    """Draw the loading plot of the first two principal components.

    Each variable is drawn as a red arrow from the origin to its
    (PC1, PC2) loading, labelled with the matching column name of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Original data; only ``data.columns`` is used, for the arrow labels.
    pca
        Fitted PCA-like object exposing ``components_`` with at least two rows.
    width, height : float
        Figure size passed to ``plt.subplots``.
    margin : float
        Padding added around the loadings on every side of the plot.

    Returns
    -------
    The axes the loading plot was drawn on.
    """
    fig, ax = plt.subplots(figsize=(width, height))

    # Axis limits: always include the origin, then pad by `margin`.
    x_min = min(pca.components_[0, :].min(), 0) - margin
    x_max = max(pca.components_[0, :].max(), 0) + margin
    y_min = min(pca.components_[1, :].min(), 0) - margin
    y_max = max(pca.components_[1, :].max(), 0) + margin

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    # Vertical offset of each label above its arrow tip.
    text_pos = 0.2

    # Rows of the transposed matrix hold one variable's loadings on every PC.
    for i, v in enumerate(pca.components_.T):
        ax.arrow(0, 0, v[0], v[1], head_width=0.1, head_length=0.1, linewidth=2, color='red')
        ax.text(v[0], v[1] + text_pos, data.columns[i], color='black', ha='center', va='center', fontsize=12)

    # Dashed reference lines through the origin, drawn on this axes explicitly
    # rather than through the pyplot "current axes" state.
    ax.plot([x_min, x_max], [0, 0], color='k', linestyle='--', linewidth=1)
    ax.plot([0, 0], [y_min, y_max], color='k', linestyle='--', linewidth=1)
    ax.set_xlabel("PC1", fontsize=14)
    ax.set_ylabel("PC2", fontsize=14)
    ax.set_title("Loading plot", fontsize=14)

    # The documented contract is to return the axes; previously nothing was returned.
    return ax

# Trailing semicolon: show only the figure, not an echoed return value.
loading_plot_C(df_numeric, pca);

In [None]:
def loading_plots(pca_res):
    """Draw one 1-D loading strip per principal component, stacked vertically.

    Parameters
    ----------
    pca_res : pandas.DataFrame
        Loadings table: one row per principal component, one column per
        variable. Each row becomes its own subplot in which every variable is
        placed at its loading on the x-axis and annotated with its column name.
    """
    n_pcs, n_vars = pca_res.shape
    names = pca_res.columns.values

    # Shared x-range for every strip, computed once instead of per subplot.
    x_lim = (pca_res.values.min() - 0.2, pca_res.values.max() + 0.2)

    # squeeze=False always yields a 2-D array of axes, so a table with a
    # single PC no longer fails when the axes are indexed.
    fig, axes = plt.subplots(nrows=n_pcs, figsize=(8, 8), squeeze=False)

    for i, ax in enumerate(axes[:, 0]):
        loadings = pca_res.iloc[i]
        ax.set(xlim=x_lim, ylim=(-1, 1), yticks=())
        sns.scatterplot(x=loadings, y=[0] * n_vars, hue=names, ax=ax, legend=False)
        # Zero-loading reference line across the full height of the strip; the
        # previous version reused the x-range as the line's y-extent.
        ax.axvline(0, linestyle='--', color='grey', linewidth=1)
        # Stagger the labels vertically so neighbouring names do not overlap.
        for j, name in enumerate(names):
            ax.text(x=loadings.iloc[j] - 0.02, y=0.1 + j / 18, s=name)

    fig.subplots_adjust(hspace=0.4)

# `pca_res` is only defined in a commented-out cell, so this call raised
# NameError on a fresh kernel; use the table built by get_pca_results instead
# (its first three columns are summaries, the rest are the loadings).
loading_plots(pca_results.iloc[:, 3:])

## Part (c)