In [None]:
import numpy as np
import sympy as sp
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/glass.csv')
df

In [None]:
# Remove the 'ID' column. Rebinding `df` (instead of `inplace=True`) together
# with `errors='ignore'` keeps this cell idempotent: re-running it after the
# column is already gone no longer raises a KeyError.
df = df.drop(columns='ID', errors='ignore')
df

## Part (a)

PCA should be carried out on the covariance matrix. All the variables (nutrients) are measured in terms of mass. Although they have different units, they can be scaled such that they are all in grams.

## Part (b)

In [None]:
# Keep every column except the last one as the input to PCA.
# NOTE(review): presumably the last column is a label/class column rather than
# a measurement — confirm against the CSV.
df_numeric = df.iloc[:,:-1]
df_numeric

In [None]:
# Fit PCA on the unstandardised features, keeping 7 components.
# scikit-learn's PCA centres the data but does not scale it, so this is PCA
# on the covariance matrix, as argued in Part (a).
pca = PCA(n_components=7).fit(df_numeric)

# pca.transform(df_numeric).round(3)

In [None]:
def pca_results(data, pca):
    """Tabulate a fitted PCA as one row per principal component.

    Parameters
    ----------
    data : mapping with ``.keys()`` (e.g. a DataFrame)
        Its keys label the loading columns, in the order of the features
        the PCA was fitted on.
    pca : fitted PCA-like object
        Must expose ``components_``, ``explained_variance_`` and
        ``explained_variance_ratio_``.

    Returns
    -------
    pandas.DataFrame
        Indexed 'PC 1', 'PC 2', ...; columns are 'Eigenvalue',
        'Explained Variance', 'Cumulative Explained Variance', followed by
        one loading column per key of ``data``. All values are rounded to
        4 decimal places.
    """
    n_pcs = len(pca.components_)
    labels = [f'PC {k + 1}' for k in range(n_pcs)]

    # Per-component summary statistics. The cumulative share is accumulated
    # from the unrounded ratios and only rounded afterwards.
    ratio = pca.explained_variance_ratio_
    summary = pd.DataFrame(
        {
            'Eigenvalue': np.round(pca.explained_variance_, 4),
            'Explained Variance': np.round(ratio, 4),
            'Cumulative Explained Variance': np.round(np.cumsum(ratio), 4),
        },
        index=labels,
    )

    # Loadings: one row per component, one column per original variable.
    loadings = pd.DataFrame(
        np.round(pca.components_, 4),
        index=labels,
        columns=data.keys(),
    )

    return pd.concat([summary, loadings], axis=1)

# Summary table for the fitted PCA: eigenvalues, explained-variance shares and loadings.
pca_res = pca_results(df_numeric, pca)
pca_res

In [None]:
def eig_expl(pca, proportion=0.8):
    """Return the leading components needed to reach a variance threshold.

    Parameters
    ----------
    pca : pandas.DataFrame
        One row per component, with a 'Cumulative Explained Variance' column.
    proportion : float, default 0.8
        Target cumulative share of explained variance.

    Returns
    -------
    pandas.DataFrame
        Every row whose cumulative share is still below ``proportion`` plus
        the next row (the first one to reach it). If no row reaches the
        threshold, all rows are returned.
    """
    cumulative = pca['Cumulative Explained Variance']
    # Count the components that fall short of the threshold; the one after
    # them is the first to reach it, hence the +1 in the slice.
    n_below = int((cumulative < proportion).sum())
    return pca.iloc[:n_below + 1]

# Components needed to reach the default 80% cumulative explained variance.
eig_expl(pca_res)

In [None]:
# def eig_more_1(pca):
#     return pca[pca['Eigenvalue'] >= 1]

# eig_more_1(pca_res)

In [None]:
def scree_plot(pca, highlight=None):
    """Draw a scree plot (eigenvalue against principal component).

    Parameters
    ----------
    pca : pandas.DataFrame
        One row per component with an 'Eigenvalue' column; the index
        supplies the x-axis labels.
    highlight : int, optional
        Position (as understood by ``.iloc``) of a component to circle in
        red. The same number is used as the x coordinate of the marker.
        ``None`` (the default) draws no marker.
    """
    with sns.axes_style(style='darkgrid'):
        ax = sns.pointplot(x=pca.index, y=pca['Eigenvalue'])
        ax.set(
            title='Scree Plot'
        )
        if highlight is not None:
            # `highlight` is used as a position on the x-axis, so the matching
            # eigenvalue must be fetched by position too. Plain `[...]` with an
            # integer on a label-indexed Series relies on a deprecated
            # positional fallback (or does a label lookup on an integer index).
            ax.plot(highlight, pca['Eigenvalue'].iloc[highlight],
                marker='o',
                mec='r',
                mfc='none',
                markersize=28
            )

# NOTE(review): `highlight` is passed straight to ax.plot as the x coordinate; on
# the categorical axis this is presumably a 0-based position, so 3 would circle
# the fourth component — confirm that is the intended point.
scree_plot(pca_res, highlight=3)

In [None]:
# Keep only the first three principal components for the cells below.
# This rebinds `pca_res`: re-running the earlier eig_expl / scree_plot cells
# after this one will see only three rows unless the pca_results cell is run again.
pca_res = pca_res.iloc[:3]

In [None]:
def score_plot(df):
    """Scatter the 'PC 1' row against the 'PC 2' row, sized by the 'PC 3' row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain rows labelled 'PC 1', 'PC 2' and 'PC 3'. The first
        three columns of each row are skipped; the remaining entries are
        plotted, one point per column.

    NOTE(review): when ``df`` is the summary table built in this notebook, the
    skipped columns are the eigenvalue / explained-variance columns and the
    plotted entries are per-variable loadings, not observation scores, even
    though the title reads 'Score Plot' — confirm which plot is intended.
    """
    n_summary_cols = 3  # leading columns that are not plotted
    x_vals, y_vals, size_vals = (
        df.loc[label].iloc[n_summary_cols:] for label in ('PC 1', 'PC 2', 'PC 3')
    )

    ax = sns.scatterplot(x=x_vals, y=y_vals, size=size_vals)
    ax.set(title='Score Plot', ylim=(-1, 1))

    # Dashed reference lines through the origin, spanning the current axis limits.
    y_low, y_high = ax.get_ylim()
    x_low, x_high = ax.get_xlim()
    guide_style = dict(color='grey', linestyle='--', linewidth=1)
    ax.plot([0, 0], [y_low, y_high], **guide_style)
    ax.plot([x_low, x_high], [0, 0], **guide_style)

# `pca_res` now holds only the 'PC 1'–'PC 3' rows, which are the rows score_plot reads.
score_plot(pca_res)

## Part (c)