In [3]:
# Load (or re-load) IPython's autoreload extension and switch it to mode 2, which
# re-imports all modules before each cell is executed, so edits to the imported
# project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from pathlib import Path

import joblib
from matplotlib import pyplot as plt
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from src.clustering import (
    plot_gmm_clusters,
    plot_gmm_log_likelihood_contours,
    preproc_features,
)
from src.config import ConfigManager
from src.database import get_dic_analysis_ids, get_dic_data, get_image
from src.roi import PolygonROISelector, filter_dataframe
from src.visualization import plot_dic_vectors

# Use the interactive `widget` (ipympl) matplotlib backend for the figures below.
%matplotlib widget

# Central configuration cell: every parameter the later cells read is defined
# here so the notebook runs top-to-bottom on a fresh kernel.
config = ConfigManager()

# Parameters for data selection and output
target_date = "2024-09-02"
camera_name = "PPCX_Tele"
base_output_dir = "output"

# Parameters for DIC data processing
# camera_names = ["PPCX_Tele", "PPCX_Wide"]
# Use only one camera for testing. The list is derived from `camera_name`; it
# must not be read from "clustering.variables_names", which holds feature names.
camera_names = [camera_name]

min_velocity = config.get("dic.min_velocity")
filter_outliers = config.get("dic.filter_outliers")
tails_percentile = config.get("dic.tails_percentile")

# Parameters for GMM clustering
# `variables_names` is the name the clustering cells read; `variables` is kept
# as an alias so code that used the old name keeps working.
variables_names = config.get("clustering.variables_names")
variables = variables_names
n_components = config.get("clustering.n_components")
weight_concentration_prior = config.get("clustering.weight_concentration_prior")
covariance_type = config.get("clustering.covariance_type")
max_iter = config.get("clustering.max_iter")
random_state = config.get("clustering.random_state")

# Optional per-feature multipliers applied after standardization, keyed by
# feature name. Features missing from the mapping keep a weight of 1.0, so the
# empty mapping means "no re-weighting".
# NOTE(review): no source for these weights is visible in this notebook; if they
# are meant to come from the config file, load them here instead.
feature_weights = {}

# Create the connection to the database
db_engine = create_engine(config.db_url)

In [None]:
# Show the value stored under the "dic.min_velocity" config key (the same key
# `min_velocity` is read from); as the cell's last expression it is displayed
# as the cell output.
config.get("dic.min_velocity")

In [None]:
# Every artefact written by this notebook is named after the camera and the
# target date, and is stored in a per-camera folder that is created on demand.
base_name = f"{camera_name}_{target_date}_GMM"
output_dir = Path(base_output_dir, camera_name)
output_dir.mkdir(parents=True, exist_ok=True)

# Query the DIC analysis metadata for that date and camera. Leaving the result
# as the last expression displays it under the cell.
dic_analyses = get_dic_analysis_ids(
    db_engine,
    reference_date=target_date,
    camera_name=camera_name,
)
dic_analyses

In [None]:
# Take the master image id of the first DIC analysis returned by the query.
# Fail with an explicit message when the query returned no rows, instead of the
# bare IndexError that `.iloc[0]` raises on an empty result.
if len(dic_analyses) == 0:
    raise ValueError(
        f"No DIC analysis found for camera {camera_name!r} "
        f"and reference date {target_date!r}"
    )
master_image_id = dic_analyses["master_image_id"].iloc[0]
master_image_id

In [None]:
# Get the master image for the DIC analysis via the API.
# `master_image_id` is the id taken from the first row of `dic_analyses`; the
# result is kept in `img` and passed to the plotting calls further down.

img = get_image(master_image_id, camera_name=camera_name)

In [None]:
# Fetch the displacement data of the first DIC analysis via the API, passing
# along the DIC filter settings (outlier filtering, tails percentile, minimum
# velocity) read from the config at the top of the notebook.
dic_id = dic_analyses["dic_id"].iloc[0]

# API endpoint, taken from the config.
api_host = config.get("api.host")
api_port = config.get("api.port")

df = get_dic_data(
    dic_id,
    filter_outliers=filter_outliers,
    tails_percentile=tails_percentile,
    min_velocity=min_velocity,
    app_host=api_host,
    app_port=api_port,
)

In [None]:
# Restrict the DIC points to a region of interest: the polygon is loaded from a
# previously saved JSON file and `df` is replaced by its filtered version.
# NOTE(review): the ROI file name hard-codes "PPCX_Tele" while `camera_name` is
# configurable — confirm the file matches the selected camera.
roi_file = "data/PPCX_Tele_glacier_ROI.json"
selector = PolygonROISelector.from_file(roi_file)
df = filter_dataframe(df, selector.polygon_path, x_col="x", y_col="y")

# Disabled visual check of the filter. As written it refers to `selector2` and
# `visualize_polygon_filter`, neither of which is defined or imported in this
# notebook.
# visualize_polygon_filter(
#     df,
#     selector2,
#     img=img,
#     figsize=(4, 5),
# )

In [None]:
# Plot the DIC vectors of the ROI-filtered points on top of the master image
# and save the figure to the output folder.
fig, ax = plt.subplots(figsize=(8, 5))

# Maps each `plot_dic_vectors` keyword to the DataFrame column that feeds it.
vector_columns = {"x": "x", "y": "y", "u": "u", "v": "v", "magnitudes": "V"}
plot_dic_vectors(
    **{arg: df[col].to_numpy() for arg, col in vector_columns.items()},
    background_image=img,
    cmap_name="batlow",
    # vmin=0.1,
    # vmax=10,
    fig=fig,
    ax=ax,
)

dic_figure_path = output_dir / f"{base_name}_dic.png"
fig.savefig(dic_figure_path, dpi=300, bbox_inches="tight")

Variational Bayesian Gaussian Mixture


In [None]:
# Fit a variational Bayesian Gaussian mixture on the standardized DIC features
# using the hyper-parameters from the config, then plot and persist the result.
# `variables_names` (feature columns) and `feature_weights` (per-feature
# multipliers) are read from the notebook namespace and must already be defined
# when this cell runs.

# Extract features and standardize them
df_features = preproc_features(df)
features = df_features[variables_names].to_numpy()
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Apply weights: each standardized column is multiplied by its weight; features
# without an entry in `feature_weights` keep a weight of 1.0.
for i, var_name in enumerate(variables_names):
    features_scaled[:, i] *= feature_weights.get(var_name, 1.0)

# Read the covariance type once: it is needed both by the model and by the
# file name of the saved model below.
covariance_type = config.get("clustering.covariance_type")

gmm = BayesianGaussianMixture(
    n_components=n_components,
    weight_concentration_prior=weight_concentration_prior,
    covariance_type=covariance_type,
    max_iter=config.get("clustering.max_iter"),
    random_state=config.get("clustering.random_state"),
)
gmm.fit(features_scaled)
labels = gmm.predict(features_scaled)


fig, ax, stats_df = plot_gmm_clusters(
    df_features,
    labels,
    var_names=["V", "angle_rad"],
    img=img,
    figsize=(8, 6),
)
fig.savefig(output_dir / f"{base_name}_clusters.png", dpi=300, bbox_inches="tight")

# Save the scaler and the GMM model; the model file name records the
# hyper-parameters the model was fit with.
joblib.dump(scaler, output_dir / f"{base_name}_scaler.joblib")
gmm_run_name = f"{base_name}_GMM_comp{n_components}_cov{covariance_type}_wcp{weight_concentration_prior}"
joblib.dump(gmm, output_dir / f"{gmm_run_name}.joblib")

# Save the features DataFrame with labels. The labels are attached explicitly
# (on a copy) so the CSV matches its name regardless of whether
# `plot_gmm_clusters` modifies `df_features`.
df_features.assign(cluster_label=labels).to_csv(
    output_dir / f"{base_name}_features_with_labels.csv", index=False
)

In [None]:
# Same pipeline as the config-driven fit, but with a manually chosen number of
# components and weight concentration prior. The remaining hyper-parameters are
# read from the config. `variables_names` and `feature_weights` are read from
# the notebook namespace and must already be defined when this cell runs.
manual_n_components = 5
manual_weight_concentration_prior = 0.001
covariance_type = config.get("clustering.covariance_type")

# Extract features and standardize them
df_features = preproc_features(df)
features = df_features[variables_names].to_numpy()
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Apply weights: each standardized column is multiplied by its weight; features
# without an entry in `feature_weights` keep a weight of 1.0.
for i, var_name in enumerate(variables_names):
    features_scaled[:, i] *= feature_weights.get(var_name, 1.0)


gmm = BayesianGaussianMixture(
    n_components=manual_n_components,
    weight_concentration_prior=manual_weight_concentration_prior,
    covariance_type=covariance_type,
    max_iter=config.get("clustering.max_iter"),
    random_state=config.get("clustering.random_state"),
)
gmm.fit(features_scaled)
labels = gmm.predict(features_scaled)


# NOTE(review): the figure, scaler and CSV below use the same file names as the
# config-driven cell, so running both cells keeps only the latest version.
fig, ax, stats_df = plot_gmm_clusters(
    df_features,
    labels,
    var_names=["V", "angle_rad"],
    img=img,
    figsize=(8, 6),
)
fig.savefig(output_dir / f"{base_name}_clusters.png", dpi=300, bbox_inches="tight")

# Save the scaler and the GMM model. The model file name is built from the
# values actually used for this fit, not from the config values.
joblib.dump(scaler, output_dir / f"{base_name}_scaler.joblib")
gmm_run_name = f"{base_name}_GMM_comp{manual_n_components}_cov{covariance_type}_wcp{manual_weight_concentration_prior}"
joblib.dump(gmm, output_dir / f"{gmm_run_name}.joblib")

# Save the features DataFrame with labels. The labels are attached explicitly
# (on a copy) so the CSV matches its name regardless of whether
# `plot_gmm_clusters` modifies `df_features`.
df_features.assign(cluster_label=labels).to_csv(
    output_dir / f"{base_name}_features_with_labels.csv", index=False
)


In [None]:
# Draw the log-likelihood contours of the most recently fitted mixture for the
# ("V", "angle_rad") feature pair, using the scaler fitted alongside it.
plot_gmm_log_likelihood_contours(
    df_features,
    gmm,
    scaler,
    variables_names,
    pair=("V", "angle_rad"),
)

In [None]:
# Disabled post-processing sketch, kept for reference only. It calls
# `remove_small_clusters`, `spatial_smooth_labels` and `merge_similar_clusters`,
# none of which is imported in this notebook, so they must be imported before
# this cell is re-enabled.
# # --- Post-process clustering ---
# min_cluster_size = 30
# smoothing_window_size = 10
# labels_clean = remove_small_clusters(labels, min_size=min_cluster_size)
# labels_smooth = spatial_smooth_labels(
#     df_features, labels_clean, window_size=smoothing_window_size
# )
# labels_smooth = merge_similar_clusters(df_features, labels_smooth, threshold=10)

# # Plot results after cleaning and smoothing
# plot_gmm_clusters(
#     df_features,
#     labels_smooth,
#     var_names=["V", "angle_rad"],
#     img=img,
#     figsize=(8, 6),
# )
