# KAIROS valuation of iNaturalist subset using clean insect data

Reference notebook this work is based on: https://github.com/lodino/kairos/blob/main/examples/image-data.ipynb

#### TODO
- The code is currently transferred as-is from the Q1 experiments on the adult income dataset. It still needs to be adapted to use the clean insect data (validation) and the iNaturalist dataset (training/messy).
- Output the top X highest-valued images from the iNaturalist dataset (only the insects, as labeled by humans) to use for fine-tuning ResNet-50.

Collecting "noisy" indexes: an image is noisy if its label is not one of the species in our main insect categories. We might add a semi-noisy label for images that are insects but are not the ones we are looking for. Kairos could value those images as "clean" without being totally wrong; we just want to know how accurate Kairos is.

In [None]:
import sys
sys.path.append('../')
from custom_valuations import *
from utils import *

import opendataval
from opendataval.experiment import ExperimentMediator
from opendataval.dataval.api import DataEvaluator, ModelLessMixin
from opendataval.dataval import DataOob, LavaEvaluator, DVRL
from opendataval.experiment import discover_corrupted_sample, noisy_detection
from opendataval.dataloader import Register, DataFetcher, mix_labels, add_gauss_noise
from opendataval.model import ClassifierMLP, LogisticRegression

import matplotlib.pyplot as plt
import numpy as np
import torch
import json
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

from custom_valuations import *
from utils import *

import custom_valuations
import utils
import fixed_valuations
import importlib
importlib.reload(custom_valuations)
importlib.reload(utils)
importlib.reload(fixed_valuations)
from custom_valuations import *
from utils import *
from fixed_valuations import *

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.mixture import GaussianMixture

PATH_TO_DATA="data_files" #change based on working directory

In [None]:
# Matplotlib marker style for each data evaluator, keyed by the evaluator's
# class name (run_experiment looks markers up via evaluator.__class__.__name__).
markers = {
    'DataOob': 'o',
    'KNNShapley': 's',
    'FixedKNNShapley': 's',
    'FixedLavaEvaluator': 'x',
    # run_experiment instantiates LavaEvaluator, so its class name needs an
    # entry here; without it the marker lookup raises KeyError.
    'LavaEvaluator': 'x',
    'DVRL': 'x',
    'Kairos': '^',
}

def write_dict(d, fname):
    """Serialize ``d`` as JSON and write it to ``logs/<fname>.json``.

    Args:
        d: JSON-serializable object (typically a dict of results).
        fname: File stem, without the ``.json`` extension. The path is
            relative to the current working directory.

    Raises:
        TypeError: If ``d`` contains values ``json.dumps`` cannot serialize.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Serialize first so a serialization error never truncates an existing log.
    txt = json.dumps(d)

    out_path = Path(f'logs/{fname}.json')
    # Create the target directory on first use; previously a missing logs/
    # directory made open() fail with FileNotFoundError.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Plain 'w' is enough: the file is only written, never read back.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(txt)
        
# Split sizes are still placeholders: the Ellipsis values must be replaced with
# integer counts before the fetcher below is built (TODO).
train_count, valid_count, test_count = ...,...,...
# Training settings; read as a global by run_experiment and handed to
# ExperimentMediator.
train_kwargs = {"epochs": 3, "batch_size": 100, "lr": 0.01}
# Placeholder as well (TODO); the trailing comment suggests 'accuracy' is the
# intended value -- confirm before running.
metric_name = ... #'accuracy'

# Fetcher for the 'inat-embeddings' dataset, split by the counts above.
# NOTE(review): this module-level `fetcher` is separate from the local
# `fetcher` that run_experiment creates for itself.
fetcher = (
    DataFetcher('inat-embeddings', '../data_files/', 
                False, random_state=42)
    .split_dataset_by_count(train_count,
                            valid_count,
                            test_count)  
)

# TODO (1/22/26): check why labels aren't separated by clean vs noisy. How are they aligned?
# NOTE(review): these paths point at cifar10-embeddings while the fetcher above
# uses 'inat-embeddings' -- confirm which dataset is intended here.
clean_embeddings = np.load("../data_files/cifar10-embeddings/clean_embeddings.npy")
labels = np.load("../data_files/cifar10-embeddings/labels.npy")
noisy_embeddings = np.load("../data_files/cifar10-embeddings/noisy_embeddings.npy")

In [None]:
def load_presplit_dataset(dataset_name, exp_num, data_frac, path_to_data, cache_dir=None, force_download=False):
    """Load the leading fraction of a pre-split train/validation experiment.

    Reads ``X_train_dirty.csv``, ``y_train_dirty.csv``, ``X_val.csv`` and
    ``y_val.csv`` from ``{path_to_data}/{dataset_name}/experiment{exp_num}``,
    keeps the first ``data_frac`` share of the rows of each split, and one-hot
    encodes the integer labels. No test split is loaded.

    Args:
        dataset_name: Sub-directory of ``path_to_data`` holding the dataset.
        exp_num: Experiment number; selects the ``experiment{exp_num}`` folder.
        data_frac: Fraction of rows to keep from the start of each split.
        path_to_data: Root directory of the data files.
        cache_dir: Accepted but unused.
        force_download: Accepted but unused.

    Returns:
        ``((x_train, x_valid), (y_train, y_valid))`` where the labels are
        one-hot arrays with ``max(label) + 1`` columns, the maximum being
        taken over the kept train and validation labels.
    """
    exp_dir = f"{path_to_data}/{dataset_name}/experiment{exp_num}"

    def read_values(stem):
        # Each CSV is returned as a plain numpy array.
        return pd.read_csv(f"{exp_dir}/{stem}.csv").values

    feats_train = read_values("X_train_dirty")
    targets_train = read_values("y_train_dirty")
    feats_valid = read_values("X_val")
    targets_valid = read_values("y_val")

    # Flatten the labels to integer class ids of shape (N,).
    targets_train = targets_train.ravel().astype(int)
    targets_valid = targets_valid.ravel().astype(int)

    # Keep only the leading rows of each split.
    keep_train = int(len(feats_train) * data_frac)
    keep_valid = int(len(feats_valid) * data_frac)
    feats_train, targets_train = feats_train[:keep_train], targets_train[:keep_train]
    feats_valid, targets_valid = feats_valid[:keep_valid], targets_valid[:keep_valid]

    # The class count is derived from the kept labels only.
    num_classes = int(max(targets_train.max(), targets_valid.max())) + 1

    # Row i of the identity matrix is the one-hot vector for class i.
    one_hot_rows = np.eye(num_classes)
    covariates = (feats_train, feats_valid)
    labels = (one_hot_rows[targets_train], one_hot_rows[targets_valid])
    return covariates, labels

### plot accuracy
def plot_valuations(eval_name, eval_med, fetcher, dataset_name, exp_num):
    """Plot the distribution of one evaluator's data values, clean vs noisy.

    Args:
        eval_name: Class name of the evaluator to plot; matched against
            ``evaluator.__class__.__name__``.
        eval_med: Object exposing ``data_evaluators``, an iterable of
            evaluators that each carry a ``data_values`` array.
        fetcher: Object exposing ``noisy_train_indices``, the indices of the
            training points known to be noisy.
        dataset_name: Dataset label used in the plot title.
        exp_num: Experiment number used in the plot title.

    Raises:
        RuntimeError: If ``eval_med`` holds no evaluator named ``eval_name``.
    """
    evalu = next(
        (ev for ev in eval_med.data_evaluators if ev.__class__.__name__ == eval_name),
        None,
    )
    if evalu is None:
        raise RuntimeError(f"{eval_name} evaluator not found in eval_med")

    valuations = evalu.data_values  # length = n_train

    # Identify noisy vs clean indices. Force an integer dtype: np.array([])
    # defaults to float64, which numpy rejects as an index, so an experiment
    # without any noisy rows used to crash on the lookup below.
    noisy = np.asarray(fetcher.noisy_train_indices, dtype=int)
    clean = np.setdiff1d(np.arange(len(valuations)), noisy)

    vals_noisy = valuations[noisy]
    vals_clean = valuations[clean]
    print(len(vals_clean), len(vals_noisy))

    plt.figure(figsize=(6, 5))

    # Step histograms normalised to densities so the two groups are comparable
    # even when their sizes differ.
    for group_label, group_vals, colour in (
        ("Clean", vals_clean, 'blue'),
        ("Noisy", vals_noisy, 'red'),
    ):
        if len(group_vals) == 0:
            # A density histogram of an empty group is undefined; skip it.
            continue
        plt.hist(group_vals, bins=50, density=True, histtype='step',
                 linewidth=2, label=group_label, color=colour)

    plt.xlabel(f"{eval_name} Valuation Score")
    plt.ylabel("Frequency (Density)")
    plt.title(f"{eval_name} {dataset_name} Valuations: Clean vs Noisy (Exp {exp_num})")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()


def thresh_acc(eval_name, eval_med, dataset_name, exp_num, fetcher):
    """Print how well thresholding one evaluator's values recovers noisy points.

    A two-component Gaussian mixture is fitted to the data values. Points whose
    value falls below the midpoint of the two component means are predicted
    noisy, and the prediction is scored against ``fetcher.noisy_train_indices``.
    Accuracy, precision, recall and F1 are printed; nothing is returned.

    Args:
        eval_name: Class name of the evaluator to score; matched against
            ``evaluator.__class__.__name__``.
        eval_med: Object exposing ``data_evaluators``, an iterable of
            evaluators that each carry a ``data_values`` array.
        dataset_name: Dataset label used in the printed header.
        exp_num: Experiment number used in the printed header.
        fetcher: Object exposing ``noisy_train_indices``, the indices of the
            training points known to be noisy.

    Raises:
        RuntimeError: If ``eval_med`` holds no evaluator named ``eval_name``.
    """
    evalu = next(
        (ev for ev in eval_med.data_evaluators if ev.__class__.__name__ == eval_name),
        None,
    )
    if evalu is None:
        raise RuntimeError(f"{eval_name} evaluator not found in eval_med")

    from sklearn.metrics import precision_score, recall_score, f1_score

    # Flatten so a column-shaped value array cannot silently broadcast against
    # the 1-D ground truth in the comparisons below.
    valuations = np.asarray(evalu.data_values).ravel()

    vals = valuations.reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=42).fit(vals)

    # The component with the lower mean is treated as the noisy one.
    means = gmm.means_.reshape(-1)
    noisy_component = np.argmin(means)

    # Threshold = midpoint of the two component means. This only coincides
    # with the mixture's decision boundary when both components have equal
    # weights and variances; it is used here as a simple cut-off.
    threshold = np.mean([
        means[noisy_component],
        means[1 - noisy_component]
    ])

    print(f"{eval_name} {dataset_name} experiment {exp_num}")
    print("GMM threshold:", round(threshold, 4))

    pred = (valuations < threshold).astype(int)  # 1 = predicted noisy

    y_true = np.zeros(len(valuations), dtype=int)
    y_true[fetcher.noisy_train_indices] = 1  # 1 = real noisy

    accuracy = round((pred == y_true).mean(), 4)
    precision = round(precision_score(y_true, pred), 4)
    recall = round(recall_score(y_true, pred), 4)
    f1 = round(f1_score(y_true, pred), 4)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1)
    print('----------------------------------')

In [None]:
def run_experiment(dataset_name, exp_num, data_frac, path_to_data):
    """Run the data evaluators on one pre-split experiment and plot the results.

    Steps:
      1. Register the experiment's CSV loader and build a DataFetcher for it.
      2. Attach the known noisy training indices to the fetcher.
      3. Estimate the Kairos kernel bandwidth from the data.
      4. Compute data values with every evaluator in the list below.
      5. Plot corrupted-sample discovery curves, then the clean-vs-noisy value
         histograms and threshold metrics for Kairos and LavaEvaluator.

    Reads the module-level names ``train_kwargs``, ``metric_name``,
    ``markers`` and ``noisy_indexes``.

    Args:
        dataset_name: Dataset folder name under ``path_to_data``.
        exp_num: Experiment number; selects the folder and the
            ``noisy_indexes["exp{exp_num}"]`` entry.
        data_frac: Fraction of each split to load.
        path_to_data: Root directory of the data files.
    """
    registered_name = f"{dataset_name}_experiment{exp_num}"
    Register(
        dataset_name=registered_name,
        one_hot=False,
        cacheable=False,
        presplit=True
    )(lambda: load_presplit_dataset(dataset_name, exp_num, data_frac, path_to_data))

    fetcher = DataFetcher(registered_name)
    n_train = len(fetcher.x_train)

    # Only a leading fraction of the training set is loaded, so drop noisy
    # indices that point past it.
    # NOTE(review): noisy_indexes is not defined in this section of the
    # notebook; presumably it comes from one of the star imports -- confirm.
    fetcher.noisy_train_indices = [i for i in noisy_indexes[f"exp{exp_num}"] if i < n_train]
    curr_noisy_idxs = len(fetcher.noisy_train_indices)

    # Estimate kernel bandwidth w/ median sample pairwise distance
    kairos = Kairos()
    kairos.input_data(fetcher.x_train, fetcher.y_train, fetcher.x_valid, fetcher.y_valid)
    sigma_feature = max(est_median_dist(kairos.X_valid.numpy()), est_median_dist(kairos.X_train.numpy()))

    # Number of classes: for one-hot labels (2-D with several columns) it is
    # the column count; np.max(...) + 1 would always give 2 there. Integer
    # class ids fall back to max + 1.
    y_train = np.asarray(fetcher.y_train)
    if y_train.ndim == 2 and y_train.shape[1] > 1:
        num_classes = int(y_train.shape[1])
    else:
        num_classes = int(np.max(y_train)) + 1

    pred_model = LogisticRegression(input_dim=len(fetcher.x_train[0]), num_classes=num_classes)
    exper_med = ExperimentMediator(fetcher=fetcher, pred_model=pred_model, train_kwargs=train_kwargs,
                                   metric_name=metric_name, raises_error=True)

    ### plot Covered vs Inspected data
    fig = plt.figure(figsize=(4, 4))
    list_of_data_evaluators = [
        FixedKNNShapley(),
        DataOob(num_models=10),
        LavaEvaluator(random_state=42),  # breaks if dataset too big
        Kairos(sigma_feature=sigma_feature, lambda_weight=.97),
    ]
    eval_med = exper_med.compute_data_values(list_of_data_evaluators)

    for evaluator in eval_med.data_evaluators:
        d = get_discover_corrupted_sample_results(evaluator, fetcher)
        eval_name = evaluator.__class__.__name__
        # Fall back to a default marker so an evaluator missing from the
        # markers table does not abort the plot with a KeyError.
        plt.plot(d['axis'], d['corrupt_found'], marker=markers.get(eval_name, 'o'), label=eval_name)
    plt.grid()
    plt.legend()
    for ax in fig.axes:
        ax.set_ylabel('')
        ax.set_xlabel('')
    fig.supylabel('% covered corrupted data')
    fig.supxlabel('% inspected data')
    # Title reports the experiment actually run (it was hard-coded to "Exp 2").
    fig.suptitle(f'Kairos: {dataset_name} ({n_train} train, {len(fetcher.x_valid)} valid, {curr_noisy_idxs} label noise) Exp {exp_num}')
    plt.tight_layout()

    for name in ("Kairos", "LavaEvaluator"):
        plot_valuations(name, eval_med, fetcher, dataset_name, exp_num)

    for name in ("Kairos", "LavaEvaluator"):
        thresh_acc(name, eval_med, dataset_name, exp_num, fetcher)


In [None]:
# Run experiment 2 on the "adult" dataset, loading the first 35% of each split.
run_experiment(dataset_name="adult", exp_num=2, data_frac=0.35, path_to_data=PATH_TO_DATA)