# Exploratory Analysis Functions for Environmental Data Clustering

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from matplotlib import colormaps

In [None]:
# Load the raw CSV into a DataFrame (the path is relative to the working directory)
CO2Data = pd.read_csv(filepath_or_buffer="../data/Terminos_lagoon_TA_DIC_2023_RawData.csv")

## Define functions to be used later

In [None]:
def preprocess_data(df, variables):
    """
    Drop incomplete rows and standardize the selected variables.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    variables : list
        Column labels that must be non-missing and that are standardized.

    Returns
    -------
    df_clean : pandas.DataFrame
        Independent copy of ``df`` holding only the rows in which none of
        ``variables`` is missing. All columns are kept with their original
        (unscaled) values.
    X_scaled : array
        Result of ``StandardScaler().fit_transform(df_clean[variables])``;
        its rows are in the same order as the rows of ``df_clean``.
    """
    # dropna may return an object that pandas tracks as derived from ``df``;
    # adding a column to it later would emit SettingWithCopyWarning. An
    # explicit copy gives callers a frame they can safely extend.
    df_clean = df.dropna(subset=variables).copy()
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_clean[variables])
    return df_clean, X_scaled

In [None]:
def perform_hierarchical_clustering(X_scaled, method='ward', n_clusters=3):
    """
    Build a hierarchical clustering and cut it into flat clusters.

    The observations in ``X_scaled`` are linked with ``method``; the resulting
    tree is then cut with the 'maxclust' criterion using ``n_clusters`` as the
    threshold.

    Returns a tuple ``(linkage_matrix, cluster_labels)``.
    """
    tree = linkage(X_scaled, method=method)
    flat_labels = fcluster(tree, criterion='maxclust', t=n_clusters)
    return tree, flat_labels

In [None]:
# Plot dendrogram with viridis colormap
def plot_dendrogram(linkage_matrix, labels=None):
    """
    Draw the dendrogram for a linkage matrix, colouring links with viridis.

    ``labels`` is forwarded unchanged to ``dendrogram``. The figure is created
    at 10x5 inches and shown immediately; nothing is returned.
    """
    viridis = colormaps.get_cmap('viridis')
    n_shades = len(linkage_matrix) + 1

    # Build one hex colour string per evenly spaced viridis sample. Channels
    # are truncated (not rounded) to the 0-255 range before formatting.
    palette = []
    for idx in range(n_shades):
        red, green, blue = viridis(idx / n_shades)[:3]
        palette.append(
            f"#{int(red * 255):02x}{int(green * 255):02x}{int(blue * 255):02x}"
        )

    def link_color_func(k):
        # Wrap the integer id handed over by ``dendrogram`` onto the palette.
        return palette[k % n_shades]

    plt.figure(figsize=(10, 5))
    dendrogram(linkage_matrix, labels=labels, link_color_func=link_color_func)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Sample Labels')
    plt.ylabel('Distance')
    plt.tight_layout()
    plt.show()

In [None]:
# Main workflow

# Select variables for clustering
selected_vars = [
    'dic_micromol_kg',
    'ta_micromol_kg',
    'sal_psu',
    'temp_c',
    'do_mg_l',
    'chlorophy_microg_l',
    'turbidity_fnu',
]

# Preprocess the data: drop rows with missing values in the selected
# variables and standardize those variables
df_clean, X_scaled = preprocess_data(CO2Data, selected_vars)

# Perform hierarchical clustering and cut the tree into 3 flat clusters
linkage_matrix, clusters = perform_hierarchical_clustering(X_scaled, n_clusters=3)

# Add cluster labels to the DataFrame. ``assign`` builds a new DataFrame
# rather than writing into the frame returned by preprocess_data in place,
# which avoids pandas' SettingWithCopyWarning.
df_clean = df_clean.assign(cluster=clusters)

# Plot dendrogram using sample labels
plot_dendrogram(linkage_matrix, labels=df_clean['sample'].values)

_________________________________