# Exploratory Analysis Functions for Environmental Data (MDS, Hierarchical Clustering, PERMANOVA)


In [None]:
# Load the raw data CSV into a DataFrame (Terminos lagoon TA/DIC 2023, per the
# file name). The path is relative, so it is resolved against the current
# working directory of the running kernel.
import pandas as pd
CO2Data = pd.read_csv("../data/Terminos_lagoon_TA_DIC_2023_RawData.csv")

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# Optional dependency: PERMANOVA needs scikit-bio. Record whether the import
# succeeded in SKBIO_AVAILABLE so that run_permanova can skip the test with a
# message instead of failing on a missing package.
try:
    from skbio.stats.distance import DistanceMatrix, permanova
    SKBIO_AVAILABLE = True
except ImportError:
    # Only a missing package is tolerated; any other import-time error propagates.
    SKBIO_AVAILABLE = False


In [None]:
def preprocess_data(df, variables):
    """
    Drop incomplete rows and standardize the selected variables.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    variables : list of str
        Columns to standardize. Rows with a missing value in any of these
        columns are dropped.

    Returns
    -------
    df_clean : pandas.DataFrame
        Independent copy of the rows of ``df`` that are complete for
        ``variables``. All columns and the original index are kept.
    X_scaled : numpy.ndarray
        ``StandardScaler`` output for ``df_clean[variables]``; row ``i``
        corresponds to row ``i`` (by position) of ``df_clean``.
    """
    # Copy explicitly: the frame returned by dropna may still be tracked by
    # pandas as a slice of ``df``, and callers add columns to it (e.g. cluster
    # labels), which would otherwise raise SettingWithCopyWarning.
    df_clean = df.dropna(subset=variables).copy()
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean[variables])
    return df_clean, X_scaled

In [None]:
def perform_hierarchical_clustering(X_scaled, method='ward', n_clusters=3):
    """
    Build an agglomerative clustering tree and cut it into flat clusters.

    The tree is computed with SciPy's ``linkage`` using ``method`` and is
    then cut with the ``'maxclust'`` criterion so that at most ``n_clusters``
    flat clusters are produced.

    Returns a tuple ``(linkage_matrix, cluster_labels)``: the SciPy linkage
    matrix and one integer cluster label per row of ``X_scaled``.
    """
    tree = linkage(X_scaled, method)
    flat_labels = fcluster(tree, n_clusters, 'maxclust')
    return tree, flat_labels

In [None]:
def plot_dendrogram(linkage_matrix, labels=None):
    """
    Draw and show the dendrogram for a hierarchical clustering result.

    ``linkage_matrix`` is passed straight to SciPy's ``dendrogram``;
    ``labels`` (optional) supplies the leaf labels. The figure is displayed
    with ``plt.show()`` and nothing is returned.
    """
    plt.figure(figsize=(10, 5))
    # With no explicit axes, dendrogram draws on the figure created above.
    dendrogram(linkage_matrix, labels=labels)

    axes = plt.gca()
    axes.set_title('Hierarchical Clustering Dendrogram')
    axes.set_xlabel('Sample Labels')
    axes.set_ylabel('Distance')

    plt.tight_layout()
    plt.show()

In [None]:
def compute_mds(X_scaled, metadata_df, n_components=2):
    """
    Embed samples with MDS on their pairwise Euclidean distances.

    Parameters
    ----------
    X_scaled : array-like, one row per sample
        Feature matrix from which the Euclidean distance matrix is computed.
    metadata_df : pandas.DataFrame
        Per-sample metadata, in the same row order as ``X_scaled``. Its index
        is ignored; rows are matched to samples by position.
    n_components : int, default 2
        Number of MDS dimensions to compute.

    Returns
    -------
    dist_matrix : numpy.ndarray
        Pairwise Euclidean distances between the rows of ``X_scaled``.
    mds_df : pandas.DataFrame
        Columns ``MDS1`` .. ``MDS<n_components>`` followed by the columns of
        ``metadata_df``, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``metadata_df`` and ``X_scaled`` have different numbers of rows.
    """
    n_samples = len(X_scaled)
    if len(metadata_df) != n_samples:
        # Without this check pd.concat would pad the shorter side with NaN
        # and silently attach metadata to the wrong or missing coordinates.
        raise ValueError(
            f"metadata_df has {len(metadata_df)} rows but X_scaled has "
            f"{n_samples}; both must describe the same samples in the same order."
        )

    dist_matrix = pairwise_distances(X_scaled, metric='euclidean')
    # Fixed random_state keeps the embedding reproducible between runs.
    mds = MDS(n_components=n_components, dissimilarity='precomputed', random_state=42)
    coords = mds.fit_transform(dist_matrix)
    columns = [f"MDS{i+1}" for i in range(n_components)]
    mds_df = pd.DataFrame(coords, columns=columns)
    # Metadata is matched by row position, so its original index is dropped
    # before the column-wise concatenation.
    mds_df = pd.concat([mds_df, metadata_df.reset_index(drop=True)], axis=1)
    return dist_matrix, mds_df


In [None]:
def plot_mds_with_clusters(mds_df, label_column='sample'):
    """
    Draw and show a 2-D MDS scatter plot with per-point text labels.

    Points are positioned by the ``MDS1``/``MDS2`` columns of ``mds_df``,
    colored by its ``cluster`` column and given marker styles by its
    ``season`` column. Each point is annotated with the value found in
    ``label_column``. The figure is displayed with ``plt.show()`` and nothing
    is returned.
    """
    plt.figure(figsize=(10, 8))
    # With no explicit axes, seaborn draws on (and returns) the current axes
    # of the figure created above.
    ax = sns.scatterplot(
        data=mds_df,
        x='MDS1',
        y='MDS2',
        hue='cluster',
        style='season',
        s=100,
        palette='Set1',
    )

    for _, point in mds_df.iterrows():
        ax.text(point['MDS1'], point['MDS2'], point[label_column], fontsize=7, alpha=0.6)

    ax.set_title('MDS with Cluster and Sample Labels')
    ax.set_xlabel('MDS1')
    ax.set_ylabel('MDS2')
    ax.legend(title='Cluster / Season')

    plt.tight_layout()
    plt.show()


In [None]:
def plot_mds_3d(mds_df, label_column='sample'):
    """
    Draw and show a 3-D MDS scatter plot with per-point text labels.

    Points are positioned by the ``MDS1``/``MDS2``/``MDS3`` columns of
    ``mds_df`` and colored by its ``cluster`` column (cast to int). Each
    point is annotated with the value found in ``label_column``. The figure
    is displayed with ``plt.show()`` and nothing is returned.
    """
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    cluster_ids = mds_df['cluster'].astype(int)
    ax.scatter(
        mds_df['MDS1'], mds_df['MDS2'], mds_df['MDS3'],
        c=cluster_ids, cmap='Set1', s=70,
    )

    for _, point in mds_df.iterrows():
        ax.text(point['MDS1'], point['MDS2'], point['MDS3'], point[label_column], size=6)

    ax.set_xlabel('MDS1')
    ax.set_ylabel('MDS2')
    ax.set_zlabel('MDS3')
    ax.set_title('3D MDS with Clusters and Sample Labels')

    plt.tight_layout()
    plt.show()

In [None]:
def run_permanova(dist_matrix, metadata_df, group_column, permutations=999):
    """
    Run PERMANOVA on a distance matrix if scikit-bio is available.

    Parameters
    ----------
    dist_matrix : array-like, square
        Pairwise distances between samples.
    metadata_df : pandas.DataFrame
        Per-sample metadata in the same row order as ``dist_matrix``. Its
        index is ignored; rows are matched to samples by position.
    group_column : str
        Column of ``metadata_df`` holding the group of each sample.
    permutations : int, default 999
        Number of permutations used to assess significance.

    Returns
    -------
    The result object returned by ``skbio.stats.distance.permanova``, or
    ``None`` (after printing an install hint) when scikit-bio is missing.
    """
    if not SKBIO_AVAILABLE:
        print("scikit-bio not available. Please install it to run PERMANOVA: conda install anaconda::scikit-bio")
        return None

    # scikit-bio looks up every distance-matrix ID in the grouping frame's
    # index. DistanceMatrix IDs are strings, while metadata_df usually carries
    # an integer index left over from row filtering, so the lookup would fail.
    # Label both sides with the same positional string IDs instead.
    sample_ids = [str(i) for i in range(len(metadata_df))]
    grouping = metadata_df.reset_index(drop=True)  # new frame; caller's is untouched
    grouping.index = sample_ids

    dm = DistanceMatrix(dist_matrix, ids=sample_ids)
    result = permanova(dm, grouping, column=group_column, permutations=permutations)
    return result

In [None]:
# Variables that feed the standardization, clustering and ordination steps.
selected_vars = ['dic_micromol_kg', 'ta_micromol_kg', 'sal_psu', 'temp_c', 'do_mg_l', 'chlorophy_microg_l']
df_clean, X_scaled = preprocess_data(CO2Data, selected_vars)
linkage_matrix, clusters = perform_hierarchical_clustering(X_scaled, n_clusters=3)
# assign() builds a new frame rather than writing into the one derived from
# dropna, which avoids pandas' SettingWithCopyWarning. The labels are in the
# same row order as df_clean because X_scaled was built from it.
df_clean = df_clean.assign(cluster=clusters)
plot_dendrogram(linkage_matrix, labels=df_clean['sample'].values)

In [None]:
# Embed the standardized samples with MDS (compute_mds defaults to two
# components) and carry season/estuary/sample/cluster along as plot metadata.
dist_matrix, mds_df = compute_mds(X_scaled, df_clean[['season', 'estuary', 'sample', 'cluster']])
plot_mds_with_clusters(mds_df)


In [None]:
# Test for differences between seasons using the distance matrix from the MDS
# step. run_permanova returns None when scikit-bio is not installed, in which
# case this prints "None" after the install hint.
result = run_permanova(dist_matrix, df_clean[['season']], 'season')
print(result)


## MDS 3D Plot

In [None]:
# Recompute the embedding with three components for the 3-D plot. This rebinds
# dist_matrix and mds_df, replacing the values produced by the 2-D MDS cell.
dist_matrix, mds_df = compute_mds(X_scaled, df_clean[['season', 'estuary', 'sample', 'cluster']], n_components=3)
plot_mds_3d(mds_df)