# Exploratory Analysis Functions for Environmental Data: Clustering, MDS, and PERMANOVA

In [None]:
import pandas as pd

# Load the raw CSV into a DataFrame; the path is relative to the working directory.
CO2Data = pd.read_csv(
    filepath_or_buffer="../data/Terminos_lagoon_TA_DIC_2023_RawData.csv",
)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# Optional: run PERMANOVA if scikit-bio is available
try:
    from skbio.stats.distance import DistanceMatrix, permanova
    SKBIO_AVAILABLE = True
except ImportError:
    SKBIO_AVAILABLE = False


## Define functions to be used later

In [None]:
def preprocess_data(df, variables):
    """
    Drop incomplete rows and standardize the selected variables.

    Parameters:
    - df: Input DataFrame. It is not modified.
    - variables: List of column names that are checked for missing values
      and then scaled.

    Returns:
    - df_clean: Independent copy of the rows of df that have no missing
      value in any of `variables`. All columns are kept and the index is
      not reset.
    - X_scaled: Result of StandardScaler.fit_transform on
      df_clean[variables]; rows are in the same order as df_clean.
    """
    # Copy explicitly so callers can add columns (e.g. cluster labels) to the
    # result without pandas treating it as a slice of the original frame.
    df_clean = df.dropna(subset=variables).copy()
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean[variables])
    return df_clean, X_scaled

In [None]:
def perform_hierarchical_clustering(X_scaled, method='ward', n_clusters=3):
    """
    Build a hierarchical clustering and cut it into flat clusters.

    Parameters:
    - X_scaled: Observation matrix forwarded to scipy's linkage.
    - method: Linkage method name forwarded to linkage (default 'ward').
    - n_clusters: Value passed to fcluster as the 'maxclust' threshold.

    Returns:
    - The linkage matrix and the flat cluster labels, in that order.
    """
    tree = linkage(X_scaled, method=method)
    # Cut the tree using the 'maxclust' criterion with n_clusters as threshold.
    assignments = fcluster(tree, n_clusters, criterion='maxclust')
    return tree, assignments

In [None]:
def plot_dendrogram(linkage_matrix, labels=None):
    """
    Draw the dendrogram of a hierarchical clustering and show the figure.

    Parameters:
    - linkage_matrix: Linkage matrix forwarded to scipy's dendrogram.
    - labels: Optional leaf labels forwarded to dendrogram (default None).
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    # No axes is passed, so dendrogram draws on the current axes (the one above).
    dendrogram(linkage_matrix, labels=labels)
    ax.set_title('Hierarchical Clustering Dendrogram')
    ax.set_xlabel('Sample Labels')
    ax.set_ylabel('Distance')
    fig.tight_layout()
    plt.show()

In [None]:
def compute_mds(X_scaled, metadata_df, n_components=2):
    """
    Compute MDS coordinates from the Euclidean distances between samples.

    Parameters:
    - X_scaled: Feature matrix with one row per sample.
    - metadata_df: DataFrame with one row per sample, in the same row order
      as X_scaled. Its columns are appended to the MDS coordinates.
    - n_components: Number of MDS dimensions to compute (default is 2).

    Returns:
    - dist_matrix: Pairwise Euclidean distance matrix of the rows of X_scaled.
    - mds_df: DataFrame with columns MDS1..MDS<n_components> followed by the
      columns of metadata_df, on a fresh 0-based index.

    Raises:
    - ValueError: If metadata_df and X_scaled do not have the same number of rows.
    """
    # The concat below aligns purely by position; a length mismatch would be
    # padded with NaN silently, so reject it up front.
    n_samples = np.shape(X_scaled)[0]
    n_meta = len(metadata_df)
    if n_meta != n_samples:
        raise ValueError(
            f"metadata_df has {n_meta} rows but X_scaled has {n_samples}; "
            "they must describe the same samples."
        )

    dist_matrix = pairwise_distances(X_scaled, metric='euclidean')
    # Fixed random_state keeps the embedding reproducible between runs.
    mds = MDS(n_components=n_components, dissimilarity='precomputed', random_state=42)
    coords = mds.fit_transform(dist_matrix)
    columns = [f"MDS{i+1}" for i in range(n_components)]
    mds_df = pd.DataFrame(coords, columns=columns)
    # Drop the metadata index so both frames share the same 0..n-1 index.
    mds_df = pd.concat([mds_df, metadata_df.reset_index(drop=True)], axis=1)
    return dist_matrix, mds_df


In [None]:
def plot_mds_with_clusters(mds_df, label_column='sample'):
    """
    Scatter-plot the first two MDS axes and annotate every point.

    Points are coloured by the 'cluster' column and their marker style is
    taken from the 'season' column of mds_df.

    Parameters:
    - mds_df: DataFrame with 'MDS1', 'MDS2', 'cluster', 'season' and the
      label column.
    - label_column: Column whose value is written next to each point
      (default 'sample').
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        data=mds_df,
        x='MDS1',
        y='MDS2',
        hue='cluster',
        style='season',
        s=100,
        palette='Set1',
    )
    # Write each row's label at its MDS position.
    for x_pos, y_pos, text in zip(mds_df['MDS1'], mds_df['MDS2'], mds_df[label_column]):
        plt.text(x_pos, y_pos, text, fontsize=7, alpha=0.6)
    plt.title('MDS with Cluster and Sample Labels')
    plt.xlabel('MDS1')
    plt.ylabel('MDS2')
    plt.legend(title='Cluster / Season')
    plt.tight_layout()
    plt.show()


In [None]:
def plot_mds_3d(mds_df, label_column='sample'):
    """
    Draw a 3-D scatter of the first three MDS axes with point annotations.

    Parameters:
    - mds_df: DataFrame with 'MDS1', 'MDS2', 'MDS3', 'cluster' and the
      label column.
    - label_column: Column whose value is written next to each point
      (default 'sample').
    """
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Cluster labels are cast to int so they can be mapped through the colormap.
    cluster_codes = mds_df['cluster'].astype(int)
    ax.scatter(
        mds_df['MDS1'], mds_df['MDS2'], mds_df['MDS3'],
        c=cluster_codes, cmap='Set1', s=70,
    )

    points = zip(mds_df['MDS1'], mds_df['MDS2'], mds_df['MDS3'], mds_df[label_column])
    for x_pos, y_pos, z_pos, text in points:
        ax.text(x_pos, y_pos, z_pos, text, size=6)

    ax.set_xlabel('MDS1')
    ax.set_ylabel('MDS2')
    ax.set_zlabel('MDS3')
    ax.set_title('3D MDS with Clusters and Sample Labels')
    plt.tight_layout()
    plt.show()

In [None]:
import numpy as np
from skbio.stats.distance import permanova
from skbio import DistanceMatrix

def run_permanova(dist_matrix, metadata_df, group_column, permutations=999):
    """
    Performs PERMANOVA analysis on a distance matrix and a metadata DataFrame.

    The input matrix is never modified: cleaning is done on a private copy.

    Parameters:
    - dist_matrix: Square distance matrix (array-like). NaN entries are replaced
      by the mean of the non-NaN entries; an asymmetric matrix is averaged with
      its transpose. A message is printed in either case.
    - metadata_df: DataFrame with one row per object of dist_matrix, in the same
      row order, which must include the group column.
    - group_column: Name of the column in metadata_df containing the group information.
    - permutations: Number of permutations for the test (default is 999).

    Returns:
    - PERMANOVA results as returned by skbio's permanova.

    Raises:
    - ValueError: If group_column is missing, dist_matrix is not square, or
      metadata_df does not have one row per object of dist_matrix.
    - ImportError: If scikit-bio is not installed.
    """
    # Cheap validation first, so bad input fails before any computation.
    if group_column not in metadata_df.columns:
        raise ValueError(f"The column {group_column} does not exist in the metadata DataFrame.")

    # np.array copies, so the in-place NaN replacement below cannot leak
    # into the caller's matrix.
    distances = np.array(dist_matrix, dtype=float)
    if distances.ndim != 2 or distances.shape[0] != distances.shape[1]:
        raise ValueError("The distance matrix must be a square 2-D array.")
    if len(metadata_df) != distances.shape[0]:
        raise ValueError(
            f"metadata_df has {len(metadata_df)} rows but the distance matrix "
            f"describes {distances.shape[0]} objects."
        )

    # Check if the distance matrix contains NaNs
    nan_mask = np.isnan(distances)
    if nan_mask.any():
        print("The distance matrix contains NaN values. Replacing with the mean value.")
        distances[nan_mask] = np.nanmean(distances)

    # Ensure the distance matrix is symmetric
    if not np.allclose(distances, distances.T):
        print("The distance matrix is not symmetric. Making it symmetric by averaging with the transpose.")
        distances = (distances + distances.T) / 2

    # scikit-bio is an optional dependency of this notebook; import it here so
    # a missing installation is reported with an actionable message.
    try:
        from skbio.stats.distance import DistanceMatrix, permanova
    except ImportError as exc:
        raise ImportError("scikit-bio is required to run PERMANOVA (pip install scikit-bio).") from exc

    # Pass the groups as a plain array so they are matched to the matrix rows by
    # position; the DataFrame's index labels are unrelated to the matrix IDs.
    grouping = metadata_df[group_column].to_numpy()

    return permanova(DistanceMatrix(distances), grouping, permutations=permutations)



In [None]:
# Clustering workflow: clean + scale the data, cluster it, then draw the tree.

# Columns used as clustering features
selected_vars = [
    'dic_micromol_kg',
    'ta_micromol_kg',
    'sal_psu',
    'temp_c',
    'do_mg_l',
    'chlorophy_microg_l',
    'turbidity_fnu',
]

# Rows with a missing value in any selected column are dropped before scaling
df_clean, X_scaled = preprocess_data(df=CO2Data, variables=selected_vars)

# Cut the hierarchy into (at most) three flat clusters
linkage_matrix, clusters = perform_hierarchical_clustering(X_scaled=X_scaled, n_clusters=3)

# Keep each sample's cluster label alongside its data
df_clean['cluster'] = clusters

# Dendrogram leaves are labelled with the 'sample' column
plot_dendrogram(linkage_matrix=linkage_matrix, labels=df_clean['sample'].values)

## Compute MDS coordinates

In [None]:
# Embed the scaled samples in two MDS dimensions, carrying along the metadata
# columns used for colouring, marker style and point labels, then plot them.
dist_matrix, mds_df = compute_mds(
    X_scaled,
    df_clean[['season', 'estuary', 'sample', 'cluster']],
    n_components=2,
)

plot_mds_with_clusters(mds_df, label_column='sample')


In [None]:
# Example usage of run_permanova.
# NaN replacement and symmetrization are handled inside run_permanova, so they
# are not repeated here (repeating them with a different fill value would make
# the result depend on which copy of the check ran first).
if SKBIO_AVAILABLE:
    try:
        # Test whether the clusters differ, using the sample distance matrix
        result = run_permanova(dist_matrix, df_clean, 'cluster')
    except ValueError as e:
        # Print the error message if an exception is raised
        print(f"Error: {e}")
    else:
        # Print the results if no errors occur
        print(result)
else:
    # scikit-bio is optional (see the guarded import at the top of the notebook)
    print("scikit-bio is not installed; skipping PERMANOVA.")


In [None]:
# Report whether the distance matrix equals its transpose (within tolerance)
print(
    "La matriz de distancias es simétrica."
    if np.allclose(dist_matrix, dist_matrix.T)
    else "La matriz de distancias no es simétrica."
)


## MDS 3D Plot

In [None]:
# Recompute the embedding with three MDS dimensions (this overwrites the
# dist_matrix and mds_df produced by the 2-D run) and draw the 3-D scatter.
dist_matrix, mds_df = compute_mds(
    X_scaled,
    df_clean[['season', 'estuary', 'sample', 'cluster']],
    n_components=3,
)
plot_mds_3d(mds_df, label_column='sample')

In [None]:
# Display the current distance matrix as the cell output.
dist_matrix

In [None]:
# Display the cleaned DataFrame (including the added 'cluster' column) as the cell output.
df_clean