In [None]:
import numpy as np
from scipy.stats import wasserstein_distance

In [None]:
# Pin NumPy's global random state so the synthetic samples below are
# identical on every run.
np.random.seed(123)

# Simulated drift scenario: both samples are normal with unit standard
# deviation and 1,000 points each; only the mean differs (0 for the
# reference sample, 1.5 for the new one).
reference_data = np.random.normal(0.0, 1.0, 1_000)
new_data = np.random.normal(1.5, 1.0, 1_000)

In [None]:
# Draw each sample as a smoothed density curve, one panel per sample,
# and save the resulting figure to disk.

import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Fit a Gaussian kernel density estimate to each sample.
ref_kde = gaussian_kde(reference_data)
new_kde = gaussian_kde(new_data)

# Common evaluation grid: 500 evenly spaced points reaching one unit
# past the smallest and largest values seen in either sample.
grid_low = min(reference_data.min(), new_data.min()) - 1
grid_high = max(reference_data.max(), new_data.max()) + 1
x_vals = np.linspace(grid_low, grid_high, 500)

# Two stacked panels that share the x axis so the curves line up.
fig, axes = plt.subplots(2, 1, figsize=(8, 8), sharex=True)

# (axis, fitted KDE, line colour, panel title) for each panel.
panels = (
    (axes[0], ref_kde, "steelblue", "Reference Data"),
    (axes[1], new_kde, "darkorange", "New Data"),
)
for ax, kde, colour, title in panels:
    ax.plot(x_vals, kde(x_vals), color=colour, linewidth=2)
    ax.set_title(title)
    ax.set_ylabel("Density")
    ax.grid(True, linestyle="--", alpha=0.6)

# Only the bottom panel gets an x label.
axes[1].set_xlabel("Value")

fig.suptitle("Comparison of Reference vs New Data (distribution)", fontsize=14)

# Write the image file before showing the figure.
plt.savefig("data-drift-comparison.png", dpi=300, bbox_inches="tight")

plt.show()


In [None]:
# Measure how far apart the two samples are using SciPy's Wasserstein
# distance, then report the value rounded to four decimal places.
distance = wasserstein_distance(u_values=reference_data, v_values=new_data)
print(f"Wasserstein Distance: {distance:.4f}")