In [1]:
import sys
import os

# Get the absolute path of the project root
project_root = os.path.abspath("..")  # Moves up one level

# Add the project root to sys.path
sys.path.append(project_root)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import h5py

In [3]:
id2label = ["Technology", "Sale-Ads", "Politics", "Religion", "Science", "Recreation (Drift)"]

In [4]:
from driftlens.driftlens import DriftLens

# Load Embedding Vectors and Texts

In [5]:
def load_embedding(filepath, E_name=None, Y_original_name=None, Y_predicted_name=None):
    if filepath is not None:
        with h5py.File(filepath, "r") as hf:
            if E_name is None:
                E = hf["E"][()]
            else:
                E = hf[E_name][()]
            if Y_original_name is None:
                Y_original = hf["Y_original"][()]
            else:
                Y_original = hf[Y_original_name][()]
            if Y_predicted_name is None:
                Y_predicted = hf["Y_predicted"][()]
            else:
                Y_predicted = hf[Y_predicted_name][()]
    else:
        raise Exception("Error in loading the embedding file. Please set the embedding paths in the configuration file.")
    return E, Y_original, Y_predicted

In [6]:
base_path = "../experiments/use_case_2_20_news_recreation_drift/static/saved_embeddings/bert/"

E_train, Y_original_train, Y_predicted_train = load_embedding(os.path.join(base_path, "train_embedding_0-4.hdf5"))
E_test, Y_original_test, Y_predicted_test = load_embedding(os.path.join(base_path, "test_embedding_0-4.hdf5"))
E_new_unseen, Y_original_new_unseen, Y_predicted_new_unseen = load_embedding(os.path.join(base_path, "new_unseen_embedding_0-4.hdf5"))
E_drift, Y_original_drift, Y_predicted_drift = load_embedding(os.path.join(base_path, "drift_embedding_5.hdf5"))

In [7]:
df_train = pd.read_csv("../experiments/use_case_2_20_news_recreation_drift/static/data/bert/df_train.csv")
df_test = pd.read_csv("../experiments/use_case_2_20_news_recreation_drift/static/data/bert/df_test.csv")
df_new_unseen = pd.read_csv("../experiments/use_case_2_20_news_recreation_drift/static/data/bert/df_new_unseen.csv")
df_drift = pd.read_csv("../experiments/use_case_2_20_news_recreation_drift/static/data/bert/df_drifted.csv")

In [8]:
df_train["original_label"] = Y_original_train
df_train["predicted_label"] = Y_predicted_train
df_train["drifted_label"] = [0]*len(df_train)

df_test["original_label"] = Y_original_test
df_test["predicted_label"] = Y_original_test
df_test["drifted_label"] = [0]*len(df_test)

df_new_unseen["original_label"] = Y_original_new_unseen
df_new_unseen["predicted_label"] = Y_original_new_unseen
df_new_unseen["drifted_label"] = [0]*len(df_new_unseen)

df_drift["original_label"] = Y_original_drift
df_drift["predicted_label"] = Y_predicted_drift
df_drift["drifted_label"] = [1]*len(df_drift)

# Reduce Embedding Dimensionality

In [9]:
flag_reduce_with_baseline = False

if flag_reduce_with_baseline:
    print("Embdding dimensionality reduction with baseline PCA ...")
    dl = DriftLens()
    
    baseline = dl.estimate_baseline(E=E_train,
                                Y=Y_predicted_train,
                                label_list=range(len(id2label)-1),
                                batch_n_pc=150,
                                per_label_n_pc=50)

    E_train_per_label_reduced = []

    for E, Y_predicted in zip(E_train, Y_predicted_train):
        pca_model = baseline.get_PCA_model_by_label(Y_predicted)

        # Ensure E is a 2D array before PCA transformation
        if E.ndim == 1:  # If E is a 1D vector
            E = E.reshape(1, -1)  # Reshape to (1, num_features)

        # Apply PCA transformation
        E_pca = pca_model.transform(E)  # This should be (1, reduced_dim)

        # Reshape E_pca to ensure final shape is (1, reduced_dim) without unnecessary dimensions
        E_train_per_label_reduced.append(E_pca.squeeze())  

    # Convert list to numpy array
    E_train_per_label_reduced = np.array(E_train_per_label_reduced)

    # Check the final shape of E_drift_reduced
    print(E_train_per_label_reduced.shape)
    
    E_test_per_label_reduced = []

    for E, Y_predicted in zip(E_test, Y_predicted_test):
        pca_model = baseline.get_PCA_model_by_label(Y_predicted)

        # Ensure E is a 2D array before PCA transformation
        if E.ndim == 1:  # If E is a 1D vector
            E = E.reshape(1, -1)  # Reshape to (1, num_features)

        # Apply PCA transformation
        E_pca = pca_model.transform(E)  # This should be (1, reduced_dim)

        # Reshape E_pca to ensure final shape is (1, reduced_dim) without unnecessary dimensions
        E_test_per_label_reduced.append(E_pca.squeeze())  

    # Convert list to numpy array
    E_test_per_label_reduced = np.array(E_test_per_label_reduced)

    # Check the final shape of E_drift_reduced
    print(E_test_per_label_reduced.shape)
    
    E_new_unseen_per_label_reduced = []

    for E, Y_predicted in zip(E_new_unseen, Y_predicted_new_unseen):
        pca_model = baseline.get_PCA_model_by_label(Y_predicted)

        # Ensure E is a 2D array before PCA transformation
        if E.ndim == 1:  # If E is a 1D vector
            E = E.reshape(1, -1)  # Reshape to (1, num_features)

        # Apply PCA transformation
        E_pca = pca_model.transform(E)  # This should be (1, reduced_dim)

        # Reshape E_pca to ensure final shape is (1, reduced_dim) without unnecessary dimensions
        E_new_unseen_per_label_reduced.append(E_pca.squeeze())  

    # Convert list to numpy array
    E_new_unseen_per_label_reduced = np.array(E_new_unseen_per_label_reduced)

    # Check the final shape of E_drift_reduced
    print(E_new_unseen_per_label_reduced.shape)
    
    E_drift_per_label_reduced = []

    for E, Y_predicted in zip(E_drift, Y_predicted_drift):
        pca_model = baseline.get_PCA_model_by_label(Y_predicted)

        # Ensure E is a 2D array before PCA transformation
        if E.ndim == 1:  # If E is a 1D vector
            E = E.reshape(1, -1)  # Reshape to (1, num_features)

        # Apply PCA transformation
        E_pca = pca_model.transform(E)  # This should be (1, reduced_dim)

        # Reshape E_pca to ensure final shape is (1, reduced_dim) without unnecessary dimensions
        E_drift_per_label_reduced.append(E_pca.squeeze())  

    # Convert list to numpy array
    E_drift_per_label_reduced = np.array(E_drift_per_label_reduced)

    # Check the final shape of E_drift_reduced
    print(E_drift_per_label_reduced.shape)

    E_train_per_batch_reduced = baseline.get_batch_PCA_model().transform(E_train)
    E_test_per_batch_reduced = baseline.get_batch_PCA_model().transform(E_test)
    E_new_unseen_per_batch_reduced = baseline.get_batch_PCA_model().transform(E_new_unseen)
    E_drift_per_batch_reduced = baseline.get_batch_PCA_model().transform(E_drift)

else:
    print("Embdding dimensionality reduction with baseline PCA skipped")
    E_train_per_label_reduced = E_train
    E_train_per_batch_reduced = E_train

    E_test_per_label_reduced = E_test
    E_test_per_batch_reduced = E_test

    E_new_unseen_per_label_reduced = E_new_unseen
    E_new_unseen_per_batch_reduced = E_new_unseen

    E_drift_per_label_reduced = E_drift
    E_drift_per_batch_reduced = E_drift



Embdding dimensionality reduction with baseline PCA skipped


In [10]:
print(E_train_per_label_reduced.shape)
print(E_test_per_label_reduced.shape)
print(E_new_unseen_per_label_reduced.shape)
print(E_drift_per_label_reduced.shape)

(5080, 768)
(3387, 768)
(5560, 768)
(3655, 768)


In [11]:
print(E_train_per_batch_reduced.shape)
print(E_test_per_batch_reduced.shape)
print(E_new_unseen_per_batch_reduced.shape)
print(E_drift_per_batch_reduced.shape)

(5080, 768)
(3387, 768)
(5560, 768)
(3655, 768)


# Clustering Algorithms

In [12]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import numpy as np
from sklearn.preprocessing import normalize

def find_optimal_clusters_with_spectral(embeddings, max_clusters=10, reduce_dim=False, n_dim=2, affinity='nearest_neighbors', plot=False):
    if reduce_dim:
        pca = PCA(n_components=n_dim)
        embeddings = pca.fit_transform(embeddings)
    
    silhouette_scores = []
    
    for k in range(2, max_clusters + 1):
        clustering = SpectralClustering(n_clusters=k, affinity=affinity, random_state=42, assign_labels='kmeans')
        labels = clustering.fit_predict(embeddings)
        
        if len(set(labels)) > 1:
            silhouette_scores.append(silhouette_score(embeddings, labels))
        else:
            silhouette_scores.append(None)
    
    best_k = np.argmax([s for s in silhouette_scores if s is not None]) + 2
    spectral_best = SpectralClustering(n_clusters=best_k, affinity=affinity, random_state=42, assign_labels='kmeans')
    labels = spectral_best.fit_predict(embeddings)
    
    if plot:
        plt.figure(figsize=(8, 6))
        plt.plot(range(2, max_clusters + 1), silhouette_scores, marker='o', label='Silhouette Score')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('Silhouette Score')
        plt.title('Silhouette Score vs. Clusters (Spectral)')
        plt.grid(True)
        plt.show()
    
    return best_k, spectral_best, labels

def find_optimal_clusters_with_kmeans(embeddings, max_clusters=10, reduce_dim=False, n_dim=2, max_iter=1000, plot=False):
    if reduce_dim:
        pca = PCA(n_components=n_dim)
        embeddings = pca.fit_transform(embeddings)
    
    silhouette_scores = []
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, max_iter=max_iter, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        inertias.append(kmeans.inertia_)
        
        if k > 1:
            silhouette_scores.append(silhouette_score(embeddings, labels))
        else:
            silhouette_scores.append(None)
    
    best_k = np.argmax([s for s in silhouette_scores if s is not None]) + 2
    kmeans_best = KMeans(n_clusters=best_k, max_iter=max_iter, n_init=10)
    labels = kmeans_best.fit_predict(embeddings)
    
    if plot:
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        plt.plot(range(2, max_clusters + 1), silhouette_scores[1:], marker='o', label='Silhouette Score')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('Silhouette Score')
        plt.title('Silhouette Score vs. Clusters')
        plt.grid(True)

        plt.subplot(1, 2, 2)
        plt.plot(range(1, max_clusters + 1), inertias, marker='o', label='Inertia', color='orange')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('Inertia')
        plt.title('Inertia vs. Clusters')
        plt.grid(True)
        plt.show()
    
    return best_k, kmeans_best, labels

def find_optimal_clusters_with_gmm(embeddings, max_clusters=10, reduce_dim=False, n_dim=2, plot=False):
    if reduce_dim:
        pca = PCA(n_components=n_dim)
        embeddings = pca.fit_transform(embeddings)
    
    silhouette_scores = []
    bic_scores = []
    
    for k in range(2, max_clusters + 1):
        gmm = GaussianMixture(n_components=k, covariance_type='full', random_state=42)
        labels = gmm.fit_predict(embeddings)
        bic_scores.append(gmm.bic(embeddings))
        
        if len(set(labels)) > 1:
            silhouette_scores.append(silhouette_score(embeddings, labels))
        else:
            silhouette_scores.append(None)
    
    best_k = np.argmax([s for s in silhouette_scores if s is not None]) + 2
    gmm_best = GaussianMixture(n_components=best_k, covariance_type='full', random_state=42)
    labels = gmm_best.fit_predict(embeddings)
    
    if plot:
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        plt.plot(range(2, max_clusters + 1), silhouette_scores, marker='o', label='Silhouette Score')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('Silhouette Score')
        plt.title('Silhouette Score vs. Clusters')
        plt.grid(True)
    
        plt.subplot(1, 2, 2)
        plt.plot(range(2, max_clusters + 1), bic_scores, marker='o', label='BIC', color='orange')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('BIC Score')
        plt.title('BIC Score vs. Clusters')
        plt.grid(True)
        plt.show()
    
    return best_k, gmm_best, labels



def get_centroids_and_closest_samples_spectral(embeddings, labels, k=5, distance_metric="euclidean"):
    """
    Compute pseudo-centroids (mean points) and find k closest samples to each centroid.
    """
    unique_labels = np.unique(labels)
    centroids = np.array([embeddings[labels == cluster_id].mean(axis=0) for cluster_id in unique_labels])
    
    distances = cdist(embeddings, centroids, metric=distance_metric)
    
    closest_samples = {}
    for i, cluster_id in enumerate(unique_labels):
        cluster_samples = np.where(labels == cluster_id)[0]
        cluster_distances = distances[cluster_samples, i]
        closest_k_indices = cluster_samples[np.argsort(cluster_distances)[:k]]
        closest_samples[cluster_id] = closest_k_indices.tolist()
    
    return centroids, closest_samples

def get_centroids_and_closest_samples_kmeans(embeddings, kmeans, labels, k=5, distance_metric="euclidean"):
    """
    Find the centroids and the k closest samples to each centroid.
    
    Parameters:
    - embeddings (numpy array): High-dimensional input embeddings (n_samples, n_features).
    - kmeans (KMeans): Trained KMeans model.
    - labels (numpy array): Cluster labels for the embeddings.
    - k (int): Number of closest samples to return for each centroid.
    
    Returns:
    - centroids (numpy array): Coordinates of the centroids (n_clusters, n_features).
    - closest_samples (dict): Dictionary where keys are cluster IDs and values are lists of sample indices.
    """
    # Extract centroids from the KMeans
    centroids = kmeans.cluster_centers_  # Shape: (n_clusters, n_features)
    
    if distance_metric == "euclidean":
        # Calculate distances between each embedding and each centroid
        distances = cdist(embeddings, centroids, metric='euclidean')  # Shape: (n_samples, n_clusters)
    else:
        # Compute cosine distance
        distances = cdist(embeddings, centroids, metric='cosine')

    
    # Find the k closest samples for each centroid
    closest_samples = {}
    for cluster_id in range(len(centroids)):
        # Get indices of samples belonging to the current cluster
        cluster_samples = np.where(labels == cluster_id)[0]
        # Filter distances to only include samples in the current cluster
        cluster_distances = distances[cluster_samples, cluster_id]
        # Sort by distance and get the indices of the k smallest distances
        closest_k_indices = cluster_samples[np.argsort(cluster_distances)[:k]]
        closest_samples[cluster_id] = closest_k_indices.tolist()
    
    return centroids, closest_samples


def get_centroids_and_closest_samples_gmm(embeddings, gmm, labels, k=5, distance_metric="cosine"):
    """
    Find the centroids and the k closest samples to each centroid for GMM clusters.
    """
    centroids = gmm.means_
    
    distances = cdist(embeddings, centroids, metric=distance_metric)
    
    closest_samples = {}
    for cluster_id in range(len(centroids)):
        cluster_samples = np.where(labels == cluster_id)[0]
        cluster_distances = distances[cluster_samples, cluster_id]
        closest_k_indices = cluster_samples[np.argsort(cluster_distances)[:k]]
        closest_samples[cluster_id] = closest_k_indices.tolist()
    
    return centroids, closest_samples

In [13]:
def get_text_and_labels_from_ids(df, text_column, label_column, sample_ids):
    """
    Retrieve the text and labels for the given sample indices, using .iloc for integer-based indexing.
    
    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the text data.
    - text_column (str): Name of the column containing text.
    - label_column (str): Name of the column containing labels.
    - sample_ids (list): List of sample indices (integer positions).
    
    Returns:
    - List of tuples: Each tuple contains (text, label) corresponding to the indices.
    """
    return df.iloc[sample_ids][[text_column, label_column]].to_records(index=False).tolist()


In [14]:
from scipy.special import comb
from collections import Counter
from math import log2

def compute_purity(cluster_labels, true_labels):
    clusters = np.unique(cluster_labels)
    total_correct = 0
    
    for cluster in clusters:
        # Indices of samples in this cluster
        indices = np.where(cluster_labels == cluster)[0]
        # True labels for these samples
        true_labels_in_cluster = true_labels[indices]
        
        # Count occurrences of each true label and select the most common
        most_common_count = Counter(true_labels_in_cluster).most_common(1)[0][1]
        total_correct += most_common_count
    
    purity = total_correct / len(true_labels)
    return purity



# Drifted Window Prototypes

In [28]:
window_size = 2000
drift_percetage = 20

n_samples_drift = int((window_size*drift_percetage)/100)
samples_per_class = int((window_size - n_samples_drift) // (len(id2label)-1))

# Ensure there are enough samples in each group
if df_new_unseen["original_label"].value_counts().min() < samples_per_class:
    raise ValueError("Not enough samples in one or more groups to split equally.")

# Initialize an empty list to store indices
indices_new_unseen = []

# Iterate through each unique value in the "original_label" column
for label in df_new_unseen["original_label"].unique():
    # Get all indices for the current label
    label_indices = df_new_unseen[df_new_unseen["original_label"] == label].index
    # Randomly sample the specified number of indices
    sampled_indices = np.random.choice(label_indices, size=samples_per_class, replace=False)
    # Add the sampled indices to the list
    indices_new_unseen.extend(sampled_indices)

# Convert to a numpy array if needed
indices_new_unseen = np.array(indices_new_unseen)

# Randomly sample indices for df_drift and E_drift
indices_drift = np.random.choice(df_drift.index, size=n_samples_drift, replace=False)

# Sample both the DataFrame and the corresponding NumPy array
df_drifted_window_per_label = pd.concat([
    df_new_unseen.loc[indices_new_unseen].copy(),
    df_drift.loc[indices_drift].copy()
], axis=0).copy()

E_drifted_window_per_label = np.concatenate([
    E_new_unseen_per_label_reduced[indices_new_unseen],
    E_drift_per_label_reduced[indices_drift]
]).copy()


df_drifted_window_per_batch = pd.concat([
    df_new_unseen.loc[indices_new_unseen].copy(),
    df_drift.loc[indices_drift].copy()
], axis=0).copy()

E_drifted_window_per_batch = np.concatenate([
    E_new_unseen_per_batch_reduced[indices_new_unseen],
    E_drift_per_batch_reduced[indices_drift]
]).copy()


## Per-label Explanations

In [33]:
clustering_algorithms = ["spectral", "kmeans", "gmm"]
clustering_algorithm_id = 1

closest_samples_distance_metrics = ["euclidean", "cosine"]
closest_samples_distance_metric_id = 0

label_ids_to_explain = [4]

max_clusters = 10

k = 5  # Number of closest samples to find as prototypes

flag_pca = False
pca_dims = 75

flag_normalize = False

print(f"Using {clustering_algorithms[clustering_algorithm_id]} with {closest_samples_distance_metrics[closest_samples_distance_metric_id]} distance to identify prototypes")


Using kmeans with euclidean distance to identify prototypes


In [34]:
for label_id_to_explain in label_ids_to_explain:
    print(f"Explainig Label: {id2label[label_id_to_explain]}")
    
    indices_current_label = np.where(df_drifted_window_per_label.predicted_label.values == label_id_to_explain)[0]

    # Filter the DataFrame using the positional indices
    df_current_label = df_drifted_window_per_label.iloc[indices_current_label].copy()

    # Filter the NumPy array using the same positional indices
    E_current_label = E_drifted_window_per_label[indices_current_label]
    
    if flag_pca:
        pca = PCA(n_components=pca_dims)
        E_current_label = pca.fit_transform(E_current_label)
        
    if flag_normalize:
        E_current_label = normalize(E_current_label, norm='l2')
    
    print(df_current_label.original_label.value_counts())

    
    if clustering_algorithms[clustering_algorithm_id] == "spectral":
        print("\nUsing spectral algorithm")
        best_k, gmm_best, labels = find_optimal_clusters_with_spectral(E_current_label, max_clusters=max_clusters, reduce_dim=False)

        centroids, closest_samples = get_centroids_and_closest_samples_spectral(E_current_label, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
            
        
        
    elif clustering_algorithms[clustering_algorithm_id] == "kmeans":
        print("\nUsing kmeans algorithm")
        best_k, gmm_best, labels = find_optimal_clusters_with_kmeans(E_current_label, max_clusters=max_clusters, reduce_dim=False)

        centroids, closest_samples = get_centroids_and_closest_samples_kmeans(E_current_label, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
        
    elif clustering_algorithms[clustering_algorithm_id] == "gmm":
        print("\nUsing gmm algorithm")
        best_k, gmm_best, labels = find_optimal_clusters_with_gmm(E_current_label, max_clusters=max_clusters, reduce_dim=False)

        centroids, closest_samples = get_centroids_and_closest_samples_gmm(E_current_label, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
    else:
        print("\nunknown clustering algorithm")
        break
        
        
    # Extract text and labels for each cluster
    cluster_texts_and_labels = {}

    for cluster_id, sample_ids in closest_samples.items():
        # Get text and labels for the current cluster
        text_and_labels = get_text_and_labels_from_ids(df_current_label, 'text', 'original_label', sample_ids)
        cluster_texts_and_labels[cluster_id] = text_and_labels

    # Output results for each cluster
    for cluster_id, texts_and_labels in cluster_texts_and_labels.items():
        print(f"\nCluster {cluster_id}:\n")
        for text, label in texts_and_labels:
            print(f"  Text: {text}")
            print(f"  Label: {id2label[label]}")
            print("------------------------------------------------")
        print("\n")
        print("\n")
        
        

Explainig Label: Science
4    320
5    175
Name: original_label, dtype: int64

Using kmeans algorithm

Cluster 0:

  Text: Hello,

Can somebody tell me what kind of MOV's are most useful for a computer
surge protector?  Radio Shack have 2 types, both rated 130V, one for
a 10A current and the other for 20A.  On the other hand, commercial surge
protectors I have seen claim a clamping voltage of 330V-400V, which SEEMS to
me in imply a rated voltage of about 250V (250V * SQRT(2.)=353).

What is the right kind?

While at it, ow important is really the EMI/RFI module?  Is there really any
noticable level of such noise in the power line?

Please-  E-MAIL to me, as this group has too much volume for me to be able
to follow.
  Label: Science
------------------------------------------------
  Text: 

Of course,

	How many government projects after Using PERT, GANT, C.P.M.s
Process flow diagrams,  Level 5 software projects....  actually
come in on schedule and under Cost.  I know the GAO determin

## Per-label Purity 

In [18]:
from tqdm import tqdm
import numpy as np
import pandas as pd

def run_purity_experiment(window_sizes, drift_percentages, labels_to_explain, E_new_unseen, E_drift, df_new_unseen, df_drift,
                          n_iterations=100, max_clusters=10, flag_pca=False, pca_dims=150, k=4, flag_normalize=True, clustering_algorithm="kmeans"):
    results = []
    
    for window_size in window_sizes:
        for drift_percentage in drift_percentages:
            n_samples_drift = int((window_size * drift_percentage) / 100)
            purity_scores = []
            
            for iteration in tqdm(range(n_iterations), desc=f"WinSize {window_size}, Drift {drift_percentage}%"):
                samples_per_class = int((window_size - n_samples_drift) // df_new_unseen["original_label"].nunique())
                
                if df_new_unseen["original_label"].value_counts().min() < samples_per_class:
                    raise ValueError("Not enough samples in one or more groups to split equally.")
                
                indices_new_unseen = []
                for label in df_new_unseen["original_label"].unique():
                    label_indices = df_new_unseen[df_new_unseen["original_label"] == label].index
                    sampled_indices = np.random.choice(label_indices, size=samples_per_class, replace=False)
                    indices_new_unseen.extend(sampled_indices)
                
                indices_new_unseen = np.array(indices_new_unseen)
                indices_drift = np.random.choice(df_drift.index, size=n_samples_drift, replace=False)
                
                df_new_unseen_drift = pd.concat([
                    df_new_unseen.loc[indices_new_unseen].copy(),
                    df_drift.loc[indices_drift].copy()
                ], axis=0)
                
                E_new_unseen_drift = np.concatenate([
                    E_new_unseen[indices_new_unseen],
                    E_drift[indices_drift]
                ])
                
                for label_id in labels_to_explain:
                    indices_current_label = np.where(df_new_unseen_drift.predicted_label.values == label_id)[0]
                    df_current_label = df_new_unseen_drift.iloc[indices_current_label].copy()
                    E_current_label = E_new_unseen_drift[indices_current_label]
                    
                            
                    if flag_normalize:
                        E_current_label = normalize(E_current_label, norm='l2')
                    
                    if flag_pca:
                        pca = PCA(n_components=pca_dims)
                        E_current_label = pca.fit_transform(E_current_label)
                    
                    
                    
                    if clustering_algorithm == "spectral":
                        best_k, gmm_best, labels = find_optimal_clusters_with_spectral(
                            E_current_label, max_clusters=max_clusters, reduce_dim=False, n_dim=None
                        )

                    elif clustering_algorithm == "kmeans":
                        best_k, gmm_best, labels = find_optimal_clusters_with_kmeans(
                            E_current_label, max_clusters=max_clusters, reduce_dim=False, n_dim=None
                        )

                    elif clustering_algorithm == "gmm":
                        best_k, gmm_best, labels = find_optimal_clusters_with_gmm(
                            E_current_label, max_clusters=max_clusters, reduce_dim=False, n_dim=None
                        )
                    else:
                        print("\nunknown clustering algorithm")
                        break
                    
                    
                    #print(labels)
                    #print()
                    #print(df_current_label.drifted_label.values)
                    
                    purity_score = compute_purity(labels, df_current_label.drifted_label.values)
                    purity_scores.append(purity_score)
            
            mean_purity = np.mean(purity_scores)
            std_purity = np.std(purity_scores)
            results.append((window_size, drift_percentage, mean_purity, std_purity))
            
            print(f"Purity for Window Size {window_size}, Drift {drift_percentage}%: {mean_purity:.3f} ± {std_purity:.3f}")
    
    return results


In [19]:
# Example usage:
window_sizes = [1000, 2000]
drift_percentages = [10, 15, 20]
labels_to_explain = [4] # Purity for label Science

flag_pca = False
pca_dims = 150

flag_normalize = False

clustering_algorithm = "kmeans"

purity_results = run_purity_experiment(window_sizes, drift_percentages, labels_to_explain, 
                                       E_new_unseen_per_label_reduced, E_drift_per_label_reduced, df_new_unseen, df_drift,
                                       clustering_algorithm=clustering_algorithm, flag_pca=flag_pca, pca_dims=pca_dims, flag_normalize=flag_normalize)

WinSize 1000, Drift 10%: 100%|██████████| 100/100 [00:37<00:00,  2.66it/s]


Purity for Window Size 1000, Drift 10%: 0.8357 ± 0.0302


WinSize 1000, Drift 15%: 100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


Purity for Window Size 1000, Drift 15%: 0.8187 ± 0.0357


WinSize 1000, Drift 20%: 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Purity for Window Size 1000, Drift 20%: 0.8025 ± 0.0406


WinSize 2000, Drift 10%: 100%|██████████| 100/100 [01:03<00:00,  1.56it/s]


Purity for Window Size 2000, Drift 10%: 0.8310 ± 0.0202


WinSize 2000, Drift 15%: 100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


Purity for Window Size 2000, Drift 15%: 0.8149 ± 0.0244


WinSize 2000, Drift 20%: 100%|██████████| 100/100 [01:08<00:00,  1.47it/s]

Purity for Window Size 2000, Drift 20%: 0.8045 ± 0.0276





## Per-batch Explanations

In [20]:
clustering_algorithms = ["spectral", "kmeans", "gmm"]
clustering_algorithm_id = 1

closest_samples_distance_metrics = ["euclidean", "cosine"]
closest_samples_distance_metric_id = 0

max_clusters = 10

k = 4  # Number of closest samples to find as prototypes

flag_pca = False
pca_dims = 75

flag_normalize = False

print(f"Using {clustering_algorithms[clustering_algorithm_id]} with {closest_samples_distance_metrics[closest_samples_distance_metric_id]} distance to identify prototypes")


Using kmeans with euclidean distance to identify prototypes


In [35]:
if flag_normalize:
    E_drifted_window_per_batch = normalize(E_drifted_window_per_batch, norm='l2')


if flag_pca:
    pca = PCA(n_components=pca_dims)
    E_drifted_window_per_batch = pca.fit_transform(E_drifted_window_per_batch)
    

print(df_drifted_window_per_batch.original_label.value_counts())


if clustering_algorithms[clustering_algorithm_id] == "spectral":
    print("\nUsing spectral algorithm")
    best_k, gmm_best, labels = find_optimal_clusters_with_spectral(E_drifted_window_per_batch, max_clusters=max_clusters, reduce_dim=False)

    centroids, closest_samples = get_centroids_and_closest_samples_spectral(E_drifted_window_per_batch, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])

elif clustering_algorithms[clustering_algorithm_id] == "kmeans":
    print("\nUsing kmeans algorithm")
    best_k, gmm_best, labels = find_optimal_clusters_with_kmeans(E_drifted_window_per_batch, max_clusters=max_clusters, reduce_dim=False)

    centroids, closest_samples = get_centroids_and_closest_samples_kmeans(E_drifted_window_per_batch, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])

elif clustering_algorithms[clustering_algorithm_id] == "gmm":
    print("\nUsing gmm algorithm")
    best_k, gmm_best, labels = find_optimal_clusters_with_gmm(E_drifted_window_per_batch, max_clusters=max_clusters, reduce_dim=False)

    centroids, closest_samples = get_centroids_and_closest_samples_gmm(E_drifted_window_per_batch, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
else:
    print("\nunknown clustering algorithm")



# Extract text and labels for each cluster
cluster_texts_and_labels = {}

for cluster_id, sample_ids in closest_samples.items():
    # Get text and labels for the current cluster
    text_and_labels = get_text_and_labels_from_ids(df_drifted_window_per_batch, 'text', 'original_label', sample_ids)
    cluster_texts_and_labels[cluster_id] = text_and_labels

    # Output results for each cluster
    for cluster_id, texts_and_labels in cluster_texts_and_labels.items():
        print(f"\nCluster {cluster_id}:\n")
        for text, label in texts_and_labels:
            print(f"  Text: {text}")
            print(f"  Label: {id2label[label]}")
            print("------------------------------------------------")
        print("\n")
        print("\n")



5    400
4    320
3    320
2    320
1    320
0    320
Name: original_label, dtype: int64

Using kmeans algorithm

Cluster 0:

  Text: Hello,

Can somebody tell me what kind of MOV's are most useful for a computer
surge protector?  Radio Shack have 2 types, both rated 130V, one for
a 10A current and the other for 20A.  On the other hand, commercial surge
protectors I have seen claim a clamping voltage of 330V-400V, which SEEMS to
me in imply a rated voltage of about 250V (250V * SQRT(2.)=353).

What is the right kind?

While at it, ow important is really the EMI/RFI module?  Is there really any
noticable level of such noise in the power line?

Please-  E-MAIL to me, as this group has too much volume for me to be able
to follow.
  Label: Science
------------------------------------------------
  Text: .........
I, some years ago, almost became a victim of this.  Squirted a fair amount in
an old model 15 Teletype which was acting up, then turned it on.  The eruption
when the motor startin

# Historical Prototypes

## Per-label Historical Explanations

In [22]:
clustering_algorithms = ["spectral", "kmeans", "gmm"]
clustering_algorithm_id = 1

closest_samples_distance_metrics = ["euclidean", "cosine"]
closest_samples_distance_metric_id = 0

label_ids_to_explain = [4]

max_clusters = 10

k = 4  # Number of closest samples to find as prototypes

flag_pca = False
pca_dims = 75

flag_normalize = False

print(f"Using {clustering_algorithms[clustering_algorithm_id]} with {closest_samples_distance_metrics[closest_samples_distance_metric_id]} distance to identify prototypes")

df_historical = df_test
E_historical = E_test_per_label_reduced

Using kmeans with euclidean distance to identify prototypes


In [36]:
for label_id_to_explain in label_ids_to_explain:
    print(f"Explainig Label: {id2label[label_id_to_explain]}")
    
    indices_current_label = np.where(df_historical.predicted_label.values == label_id_to_explain)[0]

    # Filter the DataFrame using the positional indices
    df_current_label = df_historical.iloc[indices_current_label].copy()

    # Filter the NumPy array using the same positional indices
    E_current_label = E_historical[indices_current_label]
    
    if flag_normalize:
        E_current_label = normalize(E_current_label, norm='l2')
    
    
    if flag_pca:
        pca = PCA(n_components=pca_dims)
        E_current_label = pca.fit_transform(E_current_label)
    
    print(df_current_label.original_label.value_counts())

    
    if clustering_algorithms[clustering_algorithm_id] == "spectral":
        print("\nUsing spectral algorithm")
        best_k, gmm_best, labels = find_optimal_clusters_with_spectral(E_current_label, max_clusters=max_clusters, reduce_dim=False)

        centroids, closest_samples = get_centroids_and_closest_samples_spectral(E_current_label, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
            
        
        
    elif clustering_algorithms[clustering_algorithm_id] == "kmeans":
        print("\nUsing kmeans algorithm")
        best_k, gmm_best, labels = find_optimal_clusters_with_kmeans(E_current_label, max_clusters=max_clusters, reduce_dim=False)

        centroids, closest_samples = get_centroids_and_closest_samples_kmeans(E_current_label, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
        
    elif clustering_algorithms[clustering_algorithm_id] == "gmm":
        print("\nUsing gmm algorithm")
        best_k, gmm_best, labels = find_optimal_clusters_with_gmm(E_current_label, max_clusters=max_clusters, reduce_dim=False)

        centroids, closest_samples = get_centroids_and_closest_samples_gmm(E_current_label, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
    else:
        print("\nunknown clustering algorithm")
        break
        
        
    # Extract text and labels for each cluster
    cluster_texts_and_labels = {}

    for cluster_id, sample_ids in closest_samples.items():
        # Get text and labels for the current cluster
        text_and_labels = get_text_and_labels_from_ids(df_current_label, 'text', 'original_label', sample_ids)
        cluster_texts_and_labels[cluster_id] = text_and_labels

    # Output results for each cluster
    for cluster_id, texts_and_labels in cluster_texts_and_labels.items():
        print(f"\nCluster {cluster_id}:\n")
        for text, label in texts_and_labels:
            print(f"  Text: {text}")
            print(f"  Label: {id2label[label]}")
            print("------------------------------------------------")
        print("\n")
        print("\n")
        
        

Explainig Label: Science
4    907
Name: original_label, dtype: int64

Using kmeans algorithm

Cluster 0:

  Text: How hard would it be to somehow interface them to some of the popular  Motorola microcontrollers. I am a novice at microcontrollers but I am starting to get into them for some of my projects. I have several k SIMMs laying around from upgraded Macs and if I could use them as free memory in one or two of my projects that would be great. One project that comes to mind is a Caller ID device that would require quite a bit of RAM to store several hundered CID records etc...    Dan    Daniel Joseph Rubin rubin
  Label: Science
------------------------------------------------
  Text: Greetings. Ive been seeing the word storage mentioned around oscilliscopes but Im curious what does it mean  If my life depended on it Id say that its a scope that uses longpersistance phosphor to keep the successive taces on the screen for some unit of time  store them. Do I get to live  Also Ive been

## Per-batch historical explanations

In [24]:
clustering_algorithms = ["spectral", "kmeans", "gmm"]
clustering_algorithm_id = 1

closest_samples_distance_metrics = ["euclidean", "cosine"]
closest_samples_distance_metric_id = 0

max_clusters = 10

k = 4  # Number of closest samples to find as prototypes

flag_pca = False
pca_dims = 75

flag_normalize = False

print(f"Using {clustering_algorithms[clustering_algorithm_id]} with {closest_samples_distance_metrics[closest_samples_distance_metric_id]} distance to identify prototypes")


df_historical_per_batch = df_test
E_historical_per_batch = E_test_per_batch_reduced

Using kmeans with euclidean distance to identify prototypes


In [38]:
if flag_normalize:
    E_drifted_window_per_batch = normalize(E_drifted_window_per_batch, norm='l2')

if flag_pca:
    pca = PCA(n_components=pca_dims)
    E_historical_per_batch = pca.fit_transform(E_historical_per_batch)

print(df_historical_per_batch.original_label.value_counts())


if clustering_algorithms[clustering_algorithm_id] == "spectral":
    print("\nUsing spectral algorithm")
    best_k, gmm_best, labels = find_optimal_clusters_with_spectral(E_historical_per_batch, max_clusters=max_clusters, reduce_dim=False)

    centroids, closest_samples = get_centroids_and_closest_samples_spectral(E_historical_per_batch, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])

elif clustering_algorithms[clustering_algorithm_id] == "kmeans":
    print("\nUsing kmeans algorithm")
    best_k, gmm_best, labels = find_optimal_clusters_with_kmeans(E_historical_per_batch, max_clusters=max_clusters, reduce_dim=False)

    centroids, closest_samples = get_centroids_and_closest_samples_kmeans(E_historical_per_batch, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])

elif clustering_algorithms[clustering_algorithm_id] == "gmm":
    print("\nUsing gmm algorithm")
    best_k, gmm_best, labels = find_optimal_clusters_with_gmm(E_historical_per_batch, max_clusters=max_clusters, reduce_dim=False)

    centroids, closest_samples = get_centroids_and_closest_samples_gmm(E_historical_per_batch, gmm_best, labels, k=k, distance_metric=closest_samples_distance_metrics[closest_samples_distance_metric_id])
else:
    print("\nunknown clustering algorithm")



# Extract text and labels for each cluster
cluster_texts_and_labels = {}

for cluster_id, sample_ids in closest_samples.items():
    # Get text and labels for the current cluster
    text_and_labels = get_text_and_labels_from_ids(df_historical_per_batch, 'text', 'original_label', sample_ids)
    cluster_texts_and_labels[cluster_id] = text_and_labels

# Output results for each cluster
for cluster_id, texts_and_labels in cluster_texts_and_labels.items():
    print(f"\nCluster {cluster_id}:\n")
    for text, label in texts_and_labels:
        print(f"  Text: {text}")
        print(f"  Label: {id2label[label]}")
        print("------------------------------------------------")
    print("\n")
    print("\n")



0    1112
4     907
2     594
3     548
1     226
Name: original_label, dtype: int64

Using kmeans algorithm

Cluster 0:

  Text: I would like more info on this if anybody has it. Our Exabyte   tapedrive has never been working from the Quadra .  We have been trying it since September  replaced cabling  inits I dont know what all. All the industry experts we  phoned the tapedrive dealer our Apple dealer the software  dealer all say its our fault or they dont know. The last  thing they said was that we needed a special Quadra SCSI terminator  . Anybody know more Thanks   I have been using the PLI SONY . MO drive and now a Sharp color scanner using standard SCSI cables and STANDARD  terminator on my Q.  No problems. If you were using a IIfx that might be another story.  Make sure there is only one terminator in the cabling and it must be at the end. Some boxes have internal terminators some can be switched out and others are socketted. These count. If the box with internal terminations ca