In [None]:
%pip install "pymdma[time_series] @ https://github.com/fraunhoferportugal/pymdma.git" --find-links "https://download.pytorch.org/whl/cpu/torch_stable.html"

# Load Data

Load data that simulates both real and synthetic samples for metric computation.


In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

from pymdma.time_series.input_layer import TimeSeriesInputLayer

In [None]:
# The data folder is resolved relative to the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())

validation_domain = "synthesis_val"
reference_type = "dataset"

# Build both locations from one shared base with pathlib joins instead of
# string concatenation.
synthesis_val_dir = Path(parent_dir) / "data" / "test" / "time_series" / "synthesis_val"
target_data_path = synthesis_val_dir / "dataset"
reference_data_path = synthesis_val_dir / "reference"
batch_size = 5

ts_input_layer = TimeSeriesInputLayer(
    # Pass the domain string itself as the first argument; writing a `==`
    # comparison here would hand the boolean True to the input layer.
    validation_domain,
    reference_type=reference_type,
    target_data=target_data_path,
    reference_data=reference_data_path,
    batch_size=batch_size,
)


# Get raw data for input validation
ref_data, target_data = ts_input_layer.get_full_samples()

 Explore data shapes and plot Lead I of a real ECG tracing.

In [None]:
# Access the array shapes of the reference and target sets
shape_ref = ref_data.shape
shape_target = target_data.shape

# Report both shapes through one shared message template
for set_name, unit, dims in (
    ("Reference/Real", "ECG tracings", shape_ref),
    ("Target/Synthetic", "tracings", shape_target),
):
    print(f"{set_name} data Shape: {dims} | {dims[0]} {unit}, each {dims[1]} samples long with {dims[2]} channels")

# Plot Lead I (channel 0) of the first real ECG signal
first_lead = ref_data[0, :, 0]
plt.plot(first_lead)

In [None]:
def plot_instances_score(signals: list[np.ndarray], metric: str, scores: list[float], n_cols: int = 5):
    """Plot the first channel of each signal in a grid, titled with its score.

    Args:
        signals: Sequence of signals; each one is indexed as ``signal[:, 0]``,
            so only its first channel (Lead I) is drawn.
        metric: Name of the metric, used in every subplot title and in the
            figure title.
        scores: One score per signal, paired with ``signals`` by position.
            Extra entries on either side are ignored.
        n_cols: Number of columns in the grid. Must be at least 1.

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be a positive integer, got {n_cols}")

    # Ceiling division so a partially filled last row is still drawn; at least
    # one row is always requested because plt.subplots rejects zero rows.
    n_rows = max(1, (len(signals) + n_cols - 1) // n_cols)
    # squeeze=False keeps `axs` two-dimensional even for a single row or column,
    # so `axs.flat` is always available.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3), squeeze=False)

    # Hide the frame of every cell up front so unused cells stay blank.
    for ax in axs.flat:
        ax.axis("off")

    for ax, signal, score in zip(axs.flat, signals, scores):
        ax.plot(signal[:, 0])  # plotting only Lead I of the ECG signal
        ax.set_title(f"{metric}: {score:.2f}")
        ax.set_aspect("auto")

    # Add a title to the entire figure, naming the metric that was passed in
    fig.suptitle(f"ECG Signals with {metric} Annotation (All Leads Considered, Lead I Shown)", fontsize=16)
    plt.show()

# Input Validation


In the time series modality, the `pymdma` package offers one type of input validation, **no-reference**, where the signal is validated independently, without requiring a reference signal.

This section demonstrates how to use the input validation functions with the signal-to-noise ratio (`SNR`) as an example.

In [None]:
from pymdma.time_series.measures.input_val.data.quality import SNR

# Instantiate the no-reference SNR measure and run it on the reference signals.
snr = SNR()
snr_result = snr.compute(ref_data)  # compute the metric
# The result value unpacks into two parts; only the second (instance level) is used below.
_dataset_level, instance_level = snr_result.value  # fetch the instance level results


# Show Lead I of every reference signal, annotated with its own SNR score.
plot_instances_score(ref_data, "SNR", instance_level, n_cols=5)

### Plotting the metric results

We provide a simple method in the `MetricResult` class to easily plot the results of the metrics. The method `plot()` will plot the results of the metrics in the format specified by the `plot_params` attribute in the `MetricResult` class. The `plot_params` attribute is a dictionary that contains the parameters to be used in the plot. If this attribute is not set, the method will default to a bar plot.

You can provide a title for the plot when calling this method, as well as an axis on which you wish to plot the results (helpful when plotting multiple metrics in the same plot). In addition, you can provide a set of `plot_params` to be used directly by matplotlib's plotting functions.

> **Note**: You also have access to the values of the metrics via the `value` attribute in the `MetricResult` class. You can use these values to plot the results using your own plotting functions.


In [None]:
# Use the result object's own plot() helper; the string is passed as the plot title.
snr_result.plot("Signal to Noise Ratio")  # plot the results from the result object
plt.show()

# Synthetic Validation

The automatic evaluation of synthetically generated signals is a common practice in the field of generative AI, and is crucial for the assessment of the quality of large synthetic datasets. This is usually done by comparing the synthetic signals to a set of reference signals by considering the similarity between the distributions of the two sets. In this section, we will demonstrate how to use the `pymdma` package to evaluate the quality of synthetic signals.

In [None]:
# Get features for synthetic data quality metrics computation
ref_features, target_features = ts_input_layer.get_embeddings("tsfel")

print("Reference features shape:", ref_features.shape)
print("Synthetic features shape:", target_features.shape)

#### Feature Space Visualization: UMAP Analysis of Real vs Synthetic Data

In [None]:
from umap import UMAP

# Fit the 2-D projection on the real features only, then map the synthetic
# features into that same fitted space so both sets share one coordinate system.
umap = UMAP(n_neighbors=3, n_components=2, random_state=10, n_jobs=1)
real_feats_2d = umap.fit_transform(ref_features)
fake_feats_2d = umap.transform(target_features)

# Draw both point clouds on a single explicit Axes
fig, ax = plt.subplots(figsize=(5, 5))
for feats_2d, legend_label in (
    (real_feats_2d, "Real Samples"),
    (fake_feats_2d, "Fake Samples"),
):
    ax.scatter(feats_2d[:, 0], feats_2d[:, 1], s=20, label=legend_label)
ax.set_title("UMAP Features Visualization | Real vs Synthetic")
ax.legend()
plt.show()

#### Compute Improved Precision and Improved Recall (Dataset-level and Instance-level)

In [None]:
from pymdma.time_series.measures.synthesis_val import ImprovedPrecision, ImprovedRecall

# Improved Precision: build the measure, run it, and split its value into the
# dataset-level score and the per-instance scores.
ip = ImprovedPrecision(k=2)
ip_result = ip.compute(ref_features, target_features)
precision_dataset, precision_instance = ip_result.value

# Improved Recall: same steps with the same `k`.
ir = ImprovedRecall(k=2)
ir_result = ir.compute(ref_features, target_features)
recall_dataset, recall_instance = ir_result.value

# Dataset-level summary, followed by the first 20 instance-level entries of each metric
print(f"Dataset-level Precision: {precision_dataset:.2f} | Dataset-level Recall: {recall_dataset:.2f}")
print(f"Instance-level Precision: {precision_instance[:20]} | Instance-level Recall: {recall_instance[:20]}")

#### Plot Precise and Imprecise samples according to Improved Precision

In [None]:
def plot_instances_grid(signals: list[np.ndarray], n_cols: int = 25):
    """Plot the first channel of each signal in a tight, unlabeled grid.

    Args:
        signals: Sequence of signals; each one is indexed as ``signal[:, 0]``,
            so only its first channel (Lead I) is drawn. May be empty or have
            a length that is not a multiple of ``n_cols``.
        n_cols: Number of columns in the grid. Must be at least 1.

    Returns:
        The matplotlib figure holding the grid. Cells without a signal are
        left blank.

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be a positive integer, got {n_cols}")

    # Ceiling division so a partially filled last row is still drawn; at least
    # one row is always requested because plt.subplots rejects zero rows.
    n_rows = max(1, (len(signals) + n_cols - 1) // n_cols)
    # squeeze=False keeps `axs` two-dimensional even for a single row or column,
    # so `axs.flat` is always available.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3, n_rows * 3), squeeze=False)
    fig.subplots_adjust(hspace=0, wspace=0)

    # Strip ticks and frames from every cell, including those left unused.
    for ax in axs.flat:
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        ax.axis("off")

    for ax, signal in zip(axs.flat, signals):
        ax.plot(signal[:, 0])  # plotting only Lead I
        ax.set_aspect("auto")
    return fig

In [None]:
# Work on an array so the score comparisons below are vectorised
precision_instance = np.array(precision_instance)

# Positions of the synthetic samples scoring at least 1 ("precise") and below 1 ("imprecise")
precise_idx = np.flatnonzero(precision_instance >= 1)
imprecise_idx = np.flatnonzero(precision_instance < 1)

# Collect the raw synthetic signals belonging to each group
precise_samples = [target_data[sample_pos] for sample_pos in precise_idx]
imprecise_samples = [target_data[sample_pos] for sample_pos in imprecise_idx]

# One grid figure per group
precise_fig = plot_instances_grid(precise_samples, n_cols=5)
precise_fig.suptitle("Lead I of Precise Signals (All Leads Considered)", fontsize=15)
plt.show()

imprecise_fig = plot_instances_grid(imprecise_samples, n_cols=5)
imprecise_fig.suptitle("Lead I of Imprecise Signals (All Leads Considered)", fontsize=15)
plt.show()

### Synthetic Validation using Distance Metrics

In distance metrics such as Frechet Distance, Wasserstein Distance, and Maximum Mean Discrepancy (MMD), besides the metric value alone the `pymdma` package also computes two additional statistics: the dispersion ratio and the distance ratio.

- **dispersion ratio**: computes the ratio of the distance between fake samples and the distance between real samples, providing insight into the variability of the generated data compared to the original data.
- **distance ratio**: computes the ratio of the distance between real and fake samples and the distance between real samples, indicating the dissimilarity between the two datasets in comparison to the internal variation within the real samples.

An example of the Wasserstein distance value, along with the corresponding ratios, is provided below.


In [None]:
from pymdma.time_series.measures.synthesis_val import WassersteinDistance

# Run the Wasserstein distance on the real and synthetic feature sets
WD = WassersteinDistance()
wd_result = WD.compute(ref_features, target_features)

# Keep only the first element of the value and stats pairs; the second is discarded
wd_dataset, _ = wd_result.value
stats_dataset, _ = wd_result.stats

dispersion_ratio = stats_dataset["dispersion_ratio"]
distance_ratio = stats_dataset["distance_ratio"]

# Print an aligned label/value table (labels padded to 25 characters)
print("Dataset-level information:")
for row_label, row_value in (
    ("Wasserstein Distance", wd_dataset),
    ("Distance Ratio", distance_ratio),
    ("Dispersion Ratio", dispersion_ratio),
):
    print(f"\t{row_label:<25}{row_value:.2f}")

These values indicate that the distance between real and fake samples was 2.54 times greater than the distance between real samples, and that the variability among fake samples was 3.28 times higher than the variability between real samples. These ratios provide a more intuitive interpretation than the distance metric value alone, offering a clearer comparison of the variation between real and synthetic data.