## Navigating Data Issues
This notebook accompanies an [article on Medium](https://medium.com/@markus.stoll/navigate-data-issues-d05ae8c45841).


In [None]:
# install spotlight
!pip install renumics-spotlight

In [None]:
# Fetch the dataset from the Hugging Face hub and convert it to a pandas
# DataFrame, which is what the following cells operate on.
import datasets

DATASET_NAME = "renumics/cifar100-enriched"
DATASET_SPLIT = "test"

ds = datasets.load_dataset(DATASET_NAME, split=DATASET_SPLIT)
df = ds.to_pandas()

In [None]:
# run cleanlab to find label errors, outliers and duplicates
import numpy as np
import pandas as pd
from cleanlab import Datalab


def _column_to_array(column):
    """Combine a column of per-row array-likes into one NumPy array.

    Each entry is converted with ``.tolist()`` first, so the result is built
    from plain Python lists.
    """
    return np.array([entry.tolist() for entry in column])


# Inputs for cleanlab: the stored embeddings and the predicted class probabilities.
features = _column_to_array(df["embedding"])
pred_probs = _column_to_array(df["probabilities"])

lab = Datalab(data=df, label_name="fine_label")
lab.find_issues(features=features, pred_probs=pred_probs)

In [None]:
# Append cleanlab's per-row issue columns to df.
# Issue columns left over from a previous run of this cell are dropped first, so
# re-running the cell does not produce duplicate column names (duplicates would make
# df["is_label_issue"] return a DataFrame and break the boolean filters that follow).
cleanlab_issues = lab.get_issues()
df = pd.concat(
    [df.drop(columns=cleanlab_issues.columns, errors="ignore"), cleanlab_issues],
    axis=1,
)

In [None]:
# Wrap the rows flagged via cleanlab's issue columns into Spotlight DataIssue objects
from renumics.spotlight.analysis import DataIssue


def _rows_by_score(frame, mask, score_column):
    """Return the index labels of the rows selected by ``mask``,
    ordered by ascending ``score_column``."""
    selected = frame[mask]
    return selected.sort_values(score_column).index.tolist()


# Row selections: filter first, then order by the corresponding score column.
label_issue_rows = _rows_by_score(df, df["is_label_issue"], "label_score")
outlier_issue_row = _rows_by_score(df, df["outlier_score"] < 0.6, "outlier_score")
near_duplicate_issue_row = _rows_by_score(
    df, df["is_near_duplicate_issue"], "near_duplicate_score"
)

label_issue = DataIssue(
    title="label-issue",
    severity="medium",
    description="Label issue found by cleanlab - Review and correct if necessary",
    rows=label_issue_rows,
)

outlier_issue = DataIssue(
    title="outlier-issue",
    severity="medium",
    description="Outlier score < 0.6 - Review and remove or collect more data",
    rows=outlier_issue_row,
)

near_duplicate_issue = DataIssue(
    title="near-duplicate-issue",
    severity="medium",
    description="Near duplicate issue found by cleanlab - Review and remove if necessary",
    rows=near_duplicate_issue_row,
)

In [None]:
# explore the data and the issues in Spotlight
from renumics import spotlight

LAYOUT_URL = "https://spotlight.renumics.com/resources/layout_data_issues.json"

# Column -> Spotlight dtype mapping, grouped by the dtype assigned to each column.
image_columns = ("image", "full_image")
embedding_columns = ("embedding", "embedding_reduced", "probabilities")
dtypes = {
    **dict.fromkeys(image_columns, spotlight.Image),
    **dict.fromkeys(embedding_columns, spotlight.Embedding),
}

# Shorter column names for display; df itself is left unchanged.
df_show = df.rename(
    columns={"fine_label_str": "label", "fine_label_prediction_str": "pred"}
)

spotlight.show(
    df_show,
    dtype=dtypes,
    layout=LAYOUT_URL,
    issues=[label_issue, outlier_issue, near_duplicate_issue],
)

Check out the [article on Medium](https://medium.com/@markus.stoll/navigate-data-issues-d05ae8c45841) for more details.