# Spearman's Correlation Matrix

In [None]:
# Load the raw-data CSV file into a pandas DataFrame

import pandas as pd

CO2Data = pd.read_csv(
    filepath_or_buffer="../data/Terminos_lagoon_TA_DIC__2023_RawData.csv",
)

In [None]:
# Print the column names of the loaded DataFrame
print(CO2Data.columns)

## Adequacy Test

In [None]:
# Keep only the variables that take part in the analysis

CO2Data_FA_vars = CO2Data[
    [
        'depth_m',
        'do_mg_l',
        'sal_psu',
        'turbidity_fnu',
        'temp_c',
        'ta_micromol_kg',
        'dic_micromol_kg',
        'chlorophy_microg_l',
    ]
]

# Restrict to float64/int64 columns, then discard every column that contains
# a missing value (axis=1 drops columns, not rows).
# NOTE(review): confirm that dropping whole variables, rather than incomplete
# samples, is the intended way to handle missing data.
numeric_columns = CO2Data_FA_vars.select_dtypes(include=['float64', 'int64'])
numeric_columns = numeric_columns.dropna(axis=1)

# Underlying array of the retained columns, used by the adequacy tests
numeric_values = numeric_columns.values


In [None]:
# Bartlett's test of sphericity on the selected numeric variables
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity

# Run the test once, then unpack the pair it returns
bartlett_result = calculate_bartlett_sphericity(numeric_values)
chi_square_value, p_value = bartlett_result

# Report the outcome; "not " is inserted into the message when p < 0.05
print(f"The data is {'not ' if p_value < 0.05 else ''}an identity matrix.")
print(f"p-value: {p_value:.4f}, chi-square value: {chi_square_value:.4f}")

In [None]:
# The Kaiser-Meyer-Olkin (KMO) test is a measure of sampling adequacy: it
# indicates how suitable the data are for factor analysis.
from factor_analyzer.factor_analyzer import calculate_kmo

# Perform the Kaiser-Meyer-Olkin test. The first returned element is
# discarded; only the second (the overall KMO value) is used below.
_, kmo_model = calculate_kmo(numeric_values)

# Print results with interpretation
print(f"KMO Value: {kmo_model:.2f}")

# Classify with contiguous lower bounds so every value falls in exactly one
# band. Closed ranges such as 0.7-0.79 and 0.6-0.69 leave gaps: values like
# 0.695, 0.795 or exactly 0.8 would otherwise fall through to "Poor".
if kmo_model >= 0.8:
    print("Suitability: Excellent")
elif kmo_model >= 0.7:
    print("Suitability: Acceptable")
elif kmo_model >= 0.6:
    print("Suitability: Mediocre")
else:
    # Also reached for NaN, because every comparison above is then False.
    print("Suitability: Poor")

print("Interpretation: A KMO value above 0.6 is considered acceptable for factor analysis.")


## Choosing the Number of Factors

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Standardize every variable to zero mean and unit *sample* variance (ddof=1).
# PCA decomposes the covariance matrix of whatever it is given, so fitting it
# on raw values would yield covariance eigenvalues that depend on each
# variable's units. After this standardization the covariance matrix equals
# the correlation matrix, which is what the Kaiser criterion assumes.
# Note: a zero-variance column would produce NaN here (division by zero).
column_means = np.mean(numeric_values, axis=0)
column_stds = np.std(numeric_values, axis=0, ddof=1)
standardized_values = (numeric_values - column_means) / column_stds

# Apply PCA to compute the eigenvalues of the correlation matrix
pca = PCA()
pca.fit(standardized_values)

# Extract eigenvalues representing the variance explained by each component
# (with standardized input they sum to the number of variables)
eigenvalues = pca.explained_variance_
print("Eigenvalues:", eigenvalues)

# Apply the Kaiser criterion: retain components with eigenvalues > 1
n_factors = np.sum(eigenvalues > 1)
print(f"Number of factors to retain (Kaiser criterion): {n_factors}")


In [None]:
import matplotlib.pyplot as plt

# Scree plot: each eigenvalue against its 1-based position
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(
    range(1, len(eigenvalues) + 1),
    eigenvalues,
    linestyle='-',
    marker='o',
)

# Dashed reference line at eigenvalue = 1
ax.axhline(y=1, linestyle='--', color='red', label='Kaiser Criterion')

ax.set_title('Scree Plot')
ax.set_xlabel('Factor Number')
ax.set_ylabel('Eigenvalue')
ax.legend()
plt.show()

In [None]:
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate  # Renders the loadings as a text table

# Rebuild the numeric input: float64/int64 columns only, and drop any column
# that contains a missing value (axis=1 drops columns, not rows)
numeric_columns = CO2Data_FA_vars.select_dtypes(include=['float64', 'int64'])
numeric_columns = numeric_columns.dropna(axis=1)
X = numeric_columns.values

# Scale the variables before fitting the factor model
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit a two-factor model; change n_components to extract more or fewer
# factors. The fixed random_state keeps the result reproducible.
fa = FactorAnalysis(random_state=42, n_components=2)
X_factors = fa.fit_transform(X_scaled)

# Loadings table: one row per variable, one column per factor
factor_loadings = pd.DataFrame(
    data=fa.components_.T,
    columns=[f'Factor{i+1}' for i in range(fa.n_components)],
    index=numeric_columns.columns,
)
# Round to 2 decimal places for display
factor_loadings = factor_loadings.round(2)

print(tabulate(factor_loadings, tablefmt='grid', headers='keys'))