# Principal Component Analysis (PCA)
This notebook performs PCA on a selected set of environmental variables and visualizes the results with a scree plot and biplot.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw dataset from its CSV file (path is relative to the working directory)
raw_data_path = "../data/Terminos_lagoon_TA_DIC_2023_RawData.csv"
CO2Data = pd.read_csv(filepath_or_buffer=raw_data_path)

In [None]:
# Columns that enter the PCA
selected_vars = [
    'depth_m',
    'do_mg_l',
    'sal_psu',
    'turbidity_fnu',
    'temp_c',
    'ta_micromol_kg',
    'dic_micromol_kg',
    'chlorophy_microg_l',
]

# Keep only rows where every selected variable is present, then standardize
# each column (zero mean, unit variance) so no variable dominates through
# its measurement scale.
X = CO2Data.dropna(subset=selected_vars)[selected_vars]
X_scaled = StandardScaler().fit(X).transform(X)

In [None]:
# Fit a PCA that keeps every component and project the scaled data onto it
pca = PCA(n_components=None)
principal_components = pca.fit_transform(X_scaled)

# eigenvalues: variance carried by each component;
# explained_variance: the same quantity expressed as a fraction of the total
# (note that it holds the *ratio*, despite its name).
eigenvalues, explained_variance = (
    pca.explained_variance_,
    pca.explained_variance_ratio_,
)

In [None]:
# Scree plot: eigenvalue of each principal component, in component order
scree_fig, scree_ax = plt.subplots(figsize=(8, 5))
scree_ax.plot(range(1, eigenvalues.size + 1), eigenvalues, marker='o')
# Horizontal reference line at eigenvalue = 1 (labelled as the Kaiser criterion)
scree_ax.axhline(y=1, color='red', linestyle='--', label='Kaiser Criterion')
scree_ax.set_title('Scree Plot - PCA')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Eigenvalue')
scree_ax.legend()
scree_ax.grid(True)
plt.show()

## Simple PCA biplot

In [None]:
# Biplot function
def biplot(scores, coeff, labels=None, scale=2.0):
    """Draw a simple biplot of the first two principal components.

    Observations are drawn as a scatter of their PC1/PC2 scores and each
    variable as a red arrow from the origin to its (scaled) loading.

    Parameters
    ----------
    scores : array-like, shape (n_samples, >=2)
        Component scores; only the first two columns are plotted.
    coeff : array-like, shape (n_variables, >=2)
        Loadings with one row per variable; only the first two columns
        are used.
    labels : sequence, optional
        One label per variable (row of ``coeff``). When omitted, the
        variables are labelled ``Var1``, ``Var2``, ...
    scale : float, default 2.0
        Multiplier applied to the loadings so the arrows are visible next to
        the scores. Labels are placed 10% beyond the arrow tips.

    Raises
    ------
    ValueError
        If ``scores`` or ``coeff`` is not 2-D with at least two columns, or
        if fewer labels than variables are supplied.
    """
    scores = np.asarray(scores)
    coeff = np.asarray(coeff)

    # Validate before any drawing so a bad call never leaves a half-built figure.
    if scores.ndim != 2 or scores.shape[1] < 2:
        raise ValueError("scores must be a 2-D array with at least two columns")
    if coeff.ndim != 2 or coeff.shape[1] < 2:
        raise ValueError("coeff must be a 2-D array with at least two columns")

    n_vars = coeff.shape[0]
    if labels is None:
        labels = [f"Var{i + 1}" for i in range(n_vars)]
    elif len(labels) < n_vars:
        raise ValueError(
            f"expected at least {n_vars} labels (one per variable), "
            f"got {len(labels)}"
        )

    label_scale = scale * 1.1  # place the text slightly past the arrow tip

    plt.figure(figsize=(8, 6))
    plt.scatter(scores[:, 0], scores[:, 1], alpha=0.5)
    for (load_x, load_y), label in zip(coeff[:, :2], labels):
        plt.arrow(0, 0, load_x * scale, load_y * scale,
                  color='r', alpha=0.5)
        plt.text(load_x * label_scale, load_y * label_scale, label, color='g')
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("Biplot")
    plt.grid()
    plt.show()

In [None]:
# Show the simple biplot for PC1/PC2. pca.components_ has one row per
# component, so it is transposed to give biplot one row per variable.
biplot(
    scores=principal_components,
    coeff=pca.components_.transpose(),
    labels=selected_vars,
)

In [None]:
def plot_pca_biplot(df, variables, group_col=None, label_col=None,
                    scale_arrows=2.5, figsize=(10, 10), title="PCA Biplot",
                    save_path=None):
    """Run a two-component PCA on ``df[variables]`` and draw a styled biplot.

    Rows with a missing value in any used column (``variables``, plus
    ``group_col`` / ``label_col`` when given) are dropped, the variables are
    standardized with ``StandardScaler``, and the PC1/PC2 scores are plotted
    together with one arrow per variable showing its loading.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    variables : sequence of str
        Column names entering the PCA (at least two).
    group_col : str, optional
        Column used to color the points; one legend entry per distinct value.
    label_col : str, optional
        Column whose values are written next to every point.
    scale_arrows : float, default 2.5
        Multiplier applied to the loadings when drawing the arrows.
    figsize : tuple, default (10, 10)
        Figure size passed to ``plt.subplots``.
    title : str, default "PCA Biplot"
        Figure title.
    save_path : str or path-like, optional
        When given, the figure is also written to this path. Raster targets
        (``.png``, ``.jpg``, ``.jpeg``, case-insensitive) are saved at 600 dpi.

    Raises
    ------
    ValueError
        If fewer than two variables are given or fewer than two complete rows
        remain after dropping missing values.
    """
    # Local imports keep the function usable when pasted on its own.
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib import colormaps
    from matplotlib.patheffects import withStroke

    variables = list(variables)  # also accept tuples / pandas Index
    if len(variables) < 2:
        raise ValueError("at least two variables are required for a 2-component PCA")

    required_cols = variables + [col for col in (group_col, label_col) if col]
    df_filtered = df.dropna(subset=required_cols)
    if len(df_filtered) < 2:
        raise ValueError(
            "at least two rows without missing values are required "
            f"(got {len(df_filtered)} after dropping NaNs)"
        )

    X_scaled = StandardScaler().fit_transform(df_filtered[variables])
    pca = PCA(n_components=2)
    scores = pca.fit_transform(X_scaled)
    loadings = pca.components_.T  # one row per variable, columns = PC1, PC2

    # to_numpy() yields plain ndarrays, so the boolean masks below work for
    # categorical / extension dtypes as well.
    groups = df_filtered[group_col].to_numpy() if group_col else None
    labels = df_filtered[label_col].to_numpy() if label_col else None

    fig, ax = plt.subplots(figsize=figsize)
    ax.axhline(0, color='lightgray', lw=1)
    ax.axvline(0, color='lightgray', lw=1)

    # Variable arrows, with each name placed 10% beyond the arrow tip.
    arrow_tips = loadings * scale_arrows
    label_positions = arrow_tips * 1.1
    for var, (tip_x, tip_y), (text_x, text_y) in zip(variables, arrow_tips, label_positions):
        ax.arrow(0, 0, tip_x, tip_y,
                 color='black', linewidth=2.4, alpha=0.9,
                 head_width=0.1, head_length=0.12, length_includes_head=True)
        ax.text(text_x, text_y, var,
                fontsize=13, ha='center', va='center', color='green',
                path_effects=[withStroke(linewidth=3, foreground='white')])

    if groups is not None:
        unique_groups = np.unique(groups)
        # Resample viridis to one discrete color per group; an integer index
        # selects the matching entry of the resampled lookup table.
        cmap = colormaps['viridis'].resampled(len(unique_groups))
        for i, group in enumerate(unique_groups):
            in_group = groups == group
            ax.scatter(scores[in_group, 0], scores[in_group, 1],
                       label=group, s=70, alpha=0.85,
                       edgecolor='white', linewidth=0.6,
                       color=cmap(i), zorder=2)
    else:
        ax.scatter(scores[:, 0], scores[:, 1], s=50, alpha=0.75)

    # Point labels are drawn whether or not the points are grouped.
    if labels is not None:
        for (x, y), text in zip(scores, labels):
            ax.text(x, y, text, fontsize=6.5, alpha=0.5)

    ax.set_xlabel('PC1', fontsize=14, weight='bold')
    ax.set_ylabel('PC2', fontsize=14, weight='bold')
    ax.set_title(title, fontsize=16, weight='bold')
    if groups is not None:
        ax.legend(title=group_col, fontsize=10, title_fontsize=11)
    ax.set_aspect('equal')

    # Limits cover the scores AND the arrow labels so that long arrows
    # (large scale_arrows or tightly clustered scores) are not clipped.
    margin = 0.5
    extent = np.vstack([scores, label_positions])
    ax.set_xlim(extent[:, 0].min() - margin, extent[:, 0].max() + margin)
    ax.set_ylim(extent[:, 1].min() - margin, extent[:, 1].max() + margin)

    plt.tight_layout()

    if save_path:
        # str() lets pathlib.Path objects through; lower() catches '.PNG' etc.
        is_raster = str(save_path).lower().endswith(('.png', '.jpg', '.jpeg'))
        fig.savefig(save_path, dpi=600 if is_raster else None, bbox_inches='tight')

    plt.show()


In [None]:
# Example usage with the loaded data: points colored by 'season' and
# annotated with the 'sample' column.
plot_pca_biplot(
    CO2Data,
    selected_vars,
    group_col='season',
    label_col='sample',
    title='PCA Biplot - Terminos Lagoon',
)