In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Datalab

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from cleanlab.benchmarking.noise_generation import generate_noise_matrix_from_trace, generate_noisy_labels
from datasets import Dataset
from cleanlab.experimental.datalab.datalab import Datalab
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt


# Single seed shared by NumPy's global RNG and every seeded helper below.
SEED = 123
np.random.seed(SEED)

# Named target ranges as [lower, upper] pairs; the binning code in the next
# cell treats each pair as the half-open interval lower <= y < upper.
BINS = {"low": [0, 3.3], "mid": [3.3, 6.6], "high": [6.6, 10]}

# Integer class index for each bin name, following the declaration order of BINS.
BINS_MAP = {name: position for position, name in enumerate(BINS)}

In [None]:
# Synthetic features: 1000 points drawn uniformly from [0, 5) x [0, 5).
X = np.random.rand(1000, 2) * 5
# Continuous target: the sum of both features, so 0 <= y < 10.
y = np.sum(X, axis=1)

# Map y to bins based on the BINS dict (each bin is lower <= y < upper).
# This is done as a vectorized membership test rather than a nested
# comprehension: the comprehension silently dropped samples that matched no
# bin (and duplicated samples that matched several), which would misalign
# y_bin with X without raising any error.
bin_names = np.array(list(BINS))
bin_edges = np.array(list(BINS.values()), dtype=float)  # shape (n_bins, 2)
in_bin = (bin_edges[:, :1] <= y) & (y < bin_edges[:, 1:])  # shape (n_bins, n_samples)
assert (in_bin.sum(axis=0) == 1).all(), "every y must fall into exactly one bin of BINS"
y_bin = bin_names[in_bin.argmax(axis=0)]
y_bin_idx = np.array([BINS_MAP[k] for k in y_bin])

# Split into train and test (features, bin names and bin indices stay aligned)
X_train, X_test, y_train, y_test, y_train_idx, y_test_idx = train_test_split(
    X, y_bin, y_bin_idx, test_size=0.5, random_state=SEED
)

# Empirical class frequencies of the clean training labels.
py = np.bincount(y_train_idx) / float(len(y_train_idx))
m = len(BINS)

# Noise matrix with a requested trace of 0.9 * m, i.e. diagonal entries that
# average 0.9 across the m classes.
noise_matrix = generate_noise_matrix_from_trace(
    m,
    trace=0.9 * m,
    py=py,
    valid_noise_matrix=True,
    seed=SEED,
)

# Corrupt the clean training labels according to the noise matrix.
noisy_labels_idx = generate_noisy_labels(y_train_idx, noise_matrix)

# TODO: Add noise to test set when we support extra splits in DataLab

In [None]:
# Plot data with clean labels and noisy labels, use BINS_MAP for the legend.
# The loop variables are deliberately not named `X` / `y`: reusing those names
# would overwrite the notebook-level feature matrix and target defined in the
# data-generation cell and leave hidden state behind after this cell runs.
panels = [("Clean labels", y_train_idx), ("Noisy labels", noisy_labels_idx)]

# True class boundaries are the lines x_1 + x_2 = c at the interior bin edges
# (3.3 and 6.6 for the current BINS).
all_edges = sorted({edge for bounds in BINS.values() for edge in bounds})
boundaries = all_edges[1:-1]

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for panel_ax, (title, labels) in zip(ax, panels):
    # One scatter series per bin so that each bin gets its own legend entry.
    for bin_name, bin_idx in BINS_MAP.items():
        members = labels == bin_idx
        panel_ax.scatter(X_train[members, 0], X_train[members, 1], label=bin_name)
    panel_ax.legend()
    panel_ax.set_title(title)
    panel_ax.set_xlabel(r"$x_1$")
    panel_ax.set_ylabel(r"$x_2$")
    panel_ax.set_xlim(0, 5)
    panel_ax.set_ylim(0, 5)

    # Plot true boundaries as dashed lines.
    for c in boundaries:
        panel_ax.plot([0, c], [c, 0], color="k", linestyle="--", alpha=0.5)

    # Draw red circles around the points whose plotted label differs from the
    # clean label. The mask is always empty on the clean panel.
    flipped = labels != y_train_idx
    panel_ax.plot(
        X_train[flipped, 0],
        X_train[flipped, 1],
        "o",
        markerfacecolor="none",
        markeredgecolor="red",
        markersize=14,
        markeredgewidth=2.5,
        alpha=0.5,
    )

plt.tight_layout()

In [None]:
# Fraction of training labels that are unchanged after noise injection.
print((noisy_labels_idx == y_train_idx).mean())

# Translate class indices back to bin names; the key list is built once
# instead of once per label.
label_names = list(BINS_MAP.keys())
noisy_labels = np.array([label_names[i] for i in noisy_labels_idx])

# Load data into DataLab
data = Dataset.from_dict({"X": X_train, "y": noisy_labels})

# Instantiate a Datalab object

Provide the data object and the name of the label column in the data object.

Most issue types currently rely on getting (out-of-sample) predictions from a trained model.
We'll use a simple logistic regression model for this example.

In [None]:
# Out-of-sample predicted probabilities: each training point is scored by a
# logistic regression fitted on the other cross-validation folds (cv=5).
model = LogisticRegression()
pred_probs = cross_val_predict(model, X_train, noisy_labels, cv=5, method="predict_proba")

# Wrap the dataset (labels live in column "y") and run the issue checks on
# the predicted probabilities.
lab = Datalab(data, label_name="y")
lab.find_issues(pred_probs=pred_probs)


We can review some of the results:

In [None]:
# Dataset health summary
# NOTE(review): presumably `lab.results` is only populated once
# `lab.find_issues(...)` has run — confirm against the Datalab API.
print("Health summary: ", lab.results)


In [None]:
# Per-example issues
# NOTE(review): presumably `lab.issues` holds one entry per training example
# after `lab.find_issues(...)` has run — confirm against the Datalab API.
print("Issues: ", lab.issues)


# Save the results

In [None]:
# Relative path, so the file presumably lands in the kernel's current working
# directory — confirm how Datalab.save resolves it.
path = "saved_datalab.pkl"

# NOTE(review): the ".pkl" suffix suggests pickle serialization; if so, only
# load such files from trusted sources.
lab.save(path)