# Spearman's Correlation Matrix

In [None]:
# Load the raw CSV into a DataFrame; the path is relative to the directory
# this code is run from. Later cells read this frame as `CO2Data`.
import pandas as pd
CO2Data = pd.read_csv("../data/Terminos_lagoon_TA_DIC_2023_RawData.csv")

In [None]:
def perform_adequacy_tests(data, selected_columns):
    """
    Perform Bartlett's test and KMO test for factor analysis.

    Rows with a missing value in any selected column are dropped, and only
    float64/int64 columns are passed to the tests. Both statistics are
    printed, followed by a verbal rating of the overall KMO value.

    Parameters:
        data (pd.DataFrame): The input dataset.
        selected_columns (list): List of column names to include in the analysis.

    Returns:
        dict: ``{"bartlett": {"chi_square": ..., "p_value": ...}, "kmo": ...}``.

    Raises:
        ValueError: If fewer than two complete rows or fewer than two numeric
            columns remain, because no correlation matrix can be formed.
    """
    # Select numeric data and drop missing values
    numeric_values = (
        data[selected_columns]
        .dropna()
        .select_dtypes(include=['float64', 'int64'])
        .values
    )

    n_rows, n_cols = numeric_values.shape
    if n_rows < 2 or n_cols < 2:
        raise ValueError(
            "Adequacy tests need at least 2 complete rows and 2 numeric columns; "
            f"got {n_rows} row(s) and {n_cols} column(s) after filtering."
        )

    # Imported after validation so bad input is reported before the optional
    # dependency is loaded.
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

    # Perform Bartlett's test
    chi_square, p_value = calculate_bartlett_sphericity(numeric_values)
    print(f"Bartlett's Test: p-value = {p_value:.4f}, chi-square = {chi_square:.4f}")

    # Perform KMO test; the first returned item (per-variable values) is unused
    _, kmo_model = calculate_kmo(numeric_values)
    # Two decimals: with one decimal a value such as 0.75 would print as 0.8
    # and contradict the rating printed below.
    print(f"KMO Value: {kmo_model:.2f}")

    # Rate the KMO value. The bands are contiguous lower bounds, so values
    # such as 0.80, 0.795 or 0.695 can no longer fall through to "Poor".
    if kmo_model >= 0.8:
        print("✅ Suitability: Excellent 🌟")
    elif kmo_model >= 0.7:
        print("✅ Suitability: Acceptable 👍")
    elif kmo_model >= 0.6:
        print("⚠️ Suitability: Mediocre 🤔")
    else:
        print("❌ Suitability: Poor 🚫")

    print("ℹ️ Interpretation: A KMO value above 0.6 is considered acceptable for factor analysis.")

    # Return results
    return {"bartlett": {"chi_square": chi_square, "p_value": p_value}, "kmo": kmo_model}


In [None]:
# Example usage: run both adequacy tests on the variables listed below
selected_vars = [
    'depth_m',
    'do_mg_l',
    'sal_psu',
    'turbidity_fnu',
    'temp_c',
    'ta_micromol_kg',
    'dic_micromol_kg',
    'chlorophy_microg_l',
]
results = perform_adequacy_tests(data=CO2Data, selected_columns=selected_vars)

## Adequacy Test

In [None]:
def perform_adequacy_tests(data, selected_columns):
    """
    Run Bartlett's test of sphericity and the KMO test on selected columns.

    Rows with a missing value in any selected column are dropped, and only
    float64/int64 columns are passed to the tests. Both results are printed.

    Parameters:
        data (pd.DataFrame): The input dataset.
        selected_columns (list): List of column names to include in the analysis.

    Returns:
        dict: ``{"bartlett": {"chi_square": ..., "p_value": ...}, "kmo": ...}``.
    """
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

    # Complete cases only, then the numeric part as a plain array
    complete_rows = data[selected_columns].dropna()
    matrix = complete_rows.select_dtypes(include=['float64', 'int64']).values

    # Bartlett's test of sphericity
    bartlett_chi2, bartlett_p = calculate_bartlett_sphericity(matrix)
    print(f"Bartlett's Test: p-value = {bartlett_p:.4f}, chi-square = {bartlett_chi2:.4f}")

    # Kaiser-Meyer-Olkin test; only the second returned item is reported
    _kmo_first_item, kmo_overall = calculate_kmo(matrix)
    print(f"KMO Value: {kmo_overall:.2f}")

    bartlett_summary = {"chi_square": bartlett_chi2, "p_value": bartlett_p}
    return {"bartlett": bartlett_summary, "kmo": kmo_overall}

# Example usage: run both adequacy tests on the variables listed below
selected_columns = [
    'depth_m',
    'do_mg_l',
    'sal_psu',
    'turbidity_fnu',
    'temp_c',
    'ta_micromol_kg',
    'dic_micromol_kg',
    'chlorophy_microg_l',
]
results = perform_adequacy_tests(data=CO2Data, selected_columns=selected_columns)

In [None]:
# Subset of CO2Data holding the eight variables used by the factor-analysis
# cells below (no rows are filtered here; missing values are handled later).

CO2Data_FA_vars = CO2Data[['depth_m', 'do_mg_l', 'sal_psu', 'turbidity_fnu', 'temp_c', 'ta_micromol_kg', 'dic_micromol_kg', 'chlorophy_microg_l']]




In [None]:
# Select numeric columns for factor analysis and drop incomplete ROWS
# (listwise deletion). Dropping columns instead would silently remove a
# selected variable as soon as it has one missing value, and would disagree
# with `perform_adequacy_tests` and `plot_fa_biplot`, which both drop rows.
numeric_columns = CO2Data_FA_vars.select_dtypes(include=['float64', 'int64']).dropna()
numeric_values = numeric_columns.values

from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity

# Perform Bartlett's test of sphericity (null hypothesis: the correlation
# matrix is an identity matrix, i.e. the variables are uncorrelated)
chi_square_value, p_value = calculate_bartlett_sphericity(numeric_values)

# Print results: p < 0.05 rejects the identity-matrix hypothesis
print(f"The data is {'not ' if p_value < 0.05 else ''}an identity matrix.")
print(f"p-value: {p_value:.4f}, chi-square value: {chi_square_value:.4f}")

In [None]:
# The KMO test measures the suitability of data for factor analysis. It is a measure of sampling adequacy.
from factor_analyzer.factor_analyzer import calculate_kmo

# Perform the Kaiser-Meyer-Olkin test; the first returned item is unused here
_, kmo_model = calculate_kmo(numeric_values)

# Print results with interpretation. The bands are contiguous lower bounds:
# the previous closed ranges (0.7-0.79, 0.6-0.69, > 0.8) left gaps, so values
# such as 0.80, 0.795 or 0.695 were reported as "Poor".
print(f"KMO Value: {kmo_model:.2f}")
if kmo_model >= 0.8:
    print("Suitability: Excellent")
elif kmo_model >= 0.7:
    print("Suitability: Acceptable")
elif kmo_model >= 0.6:
    print("Suitability: Mediocre")
else:
    print("Suitability: Poor")

print("Interpretation: A KMO value above 0.6 is considered acceptable for factor analysis.")


## Choosing the Number of Factors

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Apply PCA to compute the eigenvalues of the correlation matrix.
# PCA only centres its input, so fitting the raw values would yield
# eigenvalues of the *covariance* matrix, which depend on each variable's
# units and make the "eigenvalue > 1" rule meaningless. Standardizing with
# the sample standard deviation (ddof=1, the same n-1 denominator PCA uses)
# makes `explained_variance_` equal to the correlation-matrix eigenvalues.
pca = PCA()
pca.fit(
    (numeric_values - numeric_values.mean(axis=0))
    / numeric_values.std(axis=0, ddof=1)
)

# Extract eigenvalues representing the variance explained by each component
eigenvalues = pca.explained_variance_
print("Eigenvalues:", eigenvalues)

# Apply the Kaiser criterion: retain components with eigenvalues > 1
n_factors = np.sum(eigenvalues > 1)
print(f"Number of factors to retain (Kaiser criterion): {n_factors}")


In [None]:
import matplotlib.pyplot as plt

# Scree plot: each entry of `eigenvalues` plotted against its 1-based position
plt.figure(figsize=(8,5))
plt.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='-')
# Dashed reference line at eigenvalue = 1, labelled as the Kaiser criterion
plt.axhline(y=1, color='red', linestyle='--', label='Kaiser Criterion')
plt.title('Scree Plot')
plt.xlabel('Factor Number')
plt.ylabel('Eigenvalue')
# Only the reference line carries a label, so it is the sole legend entry
plt.legend()
plt.show()

In [None]:
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate  # For displaying tables

# Select numeric columns for factor analysis and drop incomplete ROWS
# (listwise deletion). Dropping columns instead would silently remove any
# variable with a missing value, so this table could describe a different
# variable set than `plot_fa_biplot`, which drops rows.
numeric_columns = CO2Data_FA_vars.select_dtypes(include=['float64', 'int64']).dropna()
X = numeric_columns.values

# Standardize the data (zero mean, unit variance per variable)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform factor analysis with a varimax rotation
fa = FactorAnalysis(n_components=2,  rotation='varimax')  # Adjust n_components as needed
X_factors = fa.fit_transform(X_scaled)

# Display the factor loadings in a table (rounded to 2 decimal places):
# one row per variable, one column per factor
factor_loadings = pd.DataFrame(
    fa.components_.T,
    index=numeric_columns.columns,
    columns=[f'Factor{i+1}' for i in range(fa.n_components)]
).round(2)  # Round to 2 decimal places

print(tabulate(factor_loadings, headers='keys', tablefmt='grid'))

In [None]:
def plot_fa_biplot(df, variables, group_col=None, label_col=None,
                         n_factors=2, title='Factor Analysis Biplot',
                         show_labels=True, save_path=None, figsize=(10, 10),
                         arrow_scale=4, rotation=None):
    """
    Draw a factor-analysis biplot: sample scores plus variable-loading arrows.

    Rows of ``df`` with a missing value in any of ``variables`` are dropped,
    the remaining values are standardized, and a scikit-learn
    ``FactorAnalysis`` model is fitted. The first two factors are plotted:
    scores as points (optionally coloured by group and annotated), loadings
    as arrows from the origin.

    Parameters:
        df (pd.DataFrame): Input data.
        variables (list): Column names used to fit the factor model.
        group_col (str, optional): Column used to colour the points and build
            the legend.
        label_col (str, optional): Column whose values annotate each point.
        n_factors (int): Number of factors to fit; must be at least 2 because
            Factor 1 is plotted against Factor 2. Extra factors are fitted
            but not drawn.
        title (str): Plot title.
        show_labels (bool): Whether to draw the ``label_col`` annotations.
        save_path (str or path-like, optional): If given, the figure is saved
            there; .png/.jpg/.jpeg files (any letter case) are written at
            600 dpi.
        figsize (tuple): Figure size passed to matplotlib.
        arrow_scale (float): Visual multiplier applied to the loadings so the
            arrows are readable next to the scores. The dashed circle has
            radius 1 in plot units, so it corresponds to |loading| = 1 only
            when ``arrow_scale`` is 1.
        rotation (str, optional): Passed to ``FactorAnalysis(rotation=...)``,
            e.g. ``'varimax'``; ``None`` keeps the unrotated solution.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Raises:
        ValueError: If ``n_factors`` is smaller than 2.
    """
    # Validate before the heavy imports so bad arguments fail fast.
    if n_factors is not None and n_factors < 2:
        raise ValueError(
            "plot_fa_biplot needs n_factors >= 2 because it plots Factor 1 against Factor 2"
        )

    import matplotlib
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.patheffects import withStroke
    from sklearn.decomposition import FactorAnalysis
    from sklearn.preprocessing import StandardScaler

    # Prepare data. A positional boolean mask keeps scores, groups and labels
    # aligned even when the index contains duplicate labels (a label-based
    # lookup would return each duplicated row several times).
    complete = df[variables].notna().all(axis=1).to_numpy()
    df_filtered = df.loc[complete]
    X = df_filtered[variables]
    X_scaled = StandardScaler().fit_transform(X)

    # Factor analysis. `components_` already holds the loadings (one row per
    # factor); they must not be rescaled by the noise variance, which would
    # shrink the arrows of exactly the variables the factors explain best.
    fa = FactorAnalysis(n_components=n_factors, rotation=rotation)
    factor_scores = fa.fit_transform(X_scaled)
    loadings = fa.components_.T

    # Group and label info
    groups = df_filtered[group_col].values if group_col else None
    labels = df_filtered[label_col].values if label_col else None
    unique_groups = np.unique(groups) if groups is not None else None
    cmap = matplotlib.colormaps.get_cmap('viridis').resampled(len(unique_groups)) if groups is not None else None
    draw_labels = show_labels and labels is not None

    # Initialize plot
    fig, ax = plt.subplots(figsize=figsize)
    ax.axhline(0, color='lightgray', lw=1)
    ax.axvline(0, color='lightgray', lw=1)
    ax.add_artist(plt.Circle((0, 0), 1, color='gray', fill=False, linestyle='dashed'))

    # Plot variable loadings on the first two factors only
    arrow_tips = loadings[:, :2] * arrow_scale
    label_anchors = arrow_tips * 1.2  # push names slightly past the arrow heads
    for var, (x, y), (tx, ty) in zip(variables, arrow_tips, label_anchors):
        ax.arrow(0, 0, x, y, color='black', linewidth=2.4,
                 alpha=0.9, head_width=0.12, head_length=0.15, zorder=3, length_includes_head=True)
        ax.text(tx, ty, var, fontsize=13, weight='bold',
                ha='center', va='center',
                path_effects=[withStroke(linewidth=3, foreground='white')], zorder=4)

    # Plot scores
    if groups is not None:
        for i, group in enumerate(unique_groups):
            idx = groups == group
            ax.scatter(factor_scores[idx, 0], factor_scores[idx, 1],
                       label=group, s=70, alpha=0.85,
                       edgecolor='white', linewidth=0.6, color=cmap(i), zorder=2)
            if draw_labels:
                for j in np.where(idx)[0]:
                    ax.text(factor_scores[j, 0], factor_scores[j, 1], labels[j],
                            fontsize=6.5, alpha=0.5)
    else:
        ax.scatter(factor_scores[:, 0], factor_scores[:, 1], alpha=0.7, s=50, zorder=2)
        # Point labels are also honoured when no grouping column is given
        if draw_labels:
            for j, text in enumerate(labels):
                ax.text(factor_scores[j, 0], factor_scores[j, 1], text,
                        fontsize=6.5, alpha=0.5)

    # Formatting
    ax.set_xlabel('Factor 1', fontsize=14, weight='bold', family='serif')
    ax.set_ylabel('Factor 2', fontsize=14, weight='bold', family='serif')
    ax.set_title(title, fontsize=16, weight='bold', family='serif')
    if groups is not None:
        ax.legend(title=group_col, fontsize=10, title_fontsize=11)
    ax.tick_params(labelsize=11)
    ax.set_aspect('equal')

    # Auto-limits: cover the scores and the variable names so that long
    # arrows are not clipped at the edge of the axes.
    margin = 0.5
    extent = np.vstack([factor_scores[:, :2], label_anchors])
    ax.set_xlim(extent[:, 0].min() - margin, extent[:, 0].max() + margin)
    ax.set_ylim(extent[:, 1].min() - margin, extent[:, 1].max() + margin)

    plt.tight_layout()

    # Save figure; str() accepts pathlib paths, lower() accepts ".PNG" etc.
    if save_path:
        is_raster = str(save_path).lower().endswith(('.png', '.jpg', '.jpeg'))
        plt.savefig(save_path, dpi=600 if is_raster else None, bbox_inches='tight')

    plt.show()



In [None]:
# Biplot of the float64/int64 columns of CO2Data_FA_vars, with points
# coloured by the 'season' column and annotated with the 'sample' column
plot_fa_biplot(
    CO2Data,
    list(CO2Data_FA_vars.select_dtypes(include=['float64', 'int64']).columns),
    group_col='season',
    label_col='sample',
    n_factors=2,
    title='Factor Analysis - Terminos Lagoon',
)