In [1]:
from minio_obj_storage import get_numpy_from_cloud
import pandas as pd
import logging
import json
from sklearn.metrics import roc_auc_score
import numpy as np

# Notebook-wide logger, registered under the name 'Default'.
logger = logging.getLogger('Default')

# Parse the JSON configuration file found in the current working directory.
with open("./config.json", 'r') as f:
    config = json.loads(f.read())

In [3]:
# Flat accumulators for the final results table: entry i of each list describes
# one AUROC measurement. Later cells append to these same lists, so re-running
# this cell resets them.
results_seeds = []
results_dataset = []
results_metric = []
results_auroc = []

# Number of per-epoch score files fetched for each metric.
num_epochs = 200

for seed in [1, 2, 3]:
    for dataset_name in ['cifar10_duplicate', 'cifar100_duplicate']:
        # .item() unwraps the stored object; it is iterated below as
        # {idx1: (idx2, _)} entries.
        indices_map = get_numpy_from_cloud(
            'learning-dynamics-models', dataset_name, 'duplicate_index_map.npy').item()

        # Collect one score array per epoch for each stored metric. The lists are
        # filled through the `arrays` alias and stacked afterwards, so axis 0 of
        # every stacked array is the epoch.
        losses = []
        loss_curvature = []
        lt = []
        for arrays, metric, suffix in [
            (losses, 'loss', ""),
            (loss_curvature, 'loss_curvature', "_h_0.001_n_10"),
            (lt, 'correct', "")]:
            for epoch in range(num_epochs):
                score = get_numpy_from_cloud(
                    'learning-dynamics-scores',
                    dataset_name,
                    f"{metric}_{dataset_name}_resnet18_seed_{seed}_epoch_{epoch}{suffix}.npy")
                arrays.append(score)

        losses = np.array(losses)
        loss_curvature = np.array(loss_curvature)
        lt = np.array(lt)

        # Both members of every pair in the map are labelled as duplicates.
        duplicate_idxs = []
        for idx1, (idx2, _) in indices_map.items():
            duplicate_idxs.extend((idx1, idx2))

        # Binary ground truth along axis 1 of the stacked scores
        # (presumably the sample axis -- TODO confirm).
        is_duplicate = np.zeros((losses.shape[1]))
        is_duplicate[duplicate_idxs] = 1

        # First epoch with a non-zero `correct` entry. np.argmax returns 0 for a
        # column that is zero everywhere, which would score a never-correct
        # sample exactly like one that is correct from epoch 0; such samples get
        # the epoch count instead.
        # NOTE(review): assumes the `correct` arrays are 0/1 indicators -- confirm.
        first_correct_epoch = np.where(
            lt.any(axis=0), np.argmax(lt, axis=0), lt.shape[0])

        results_df = pd.DataFrame(
            data={
                'loss': losses.mean(0),
                'loss_curvature': loss_curvature.mean(0),
                'clt': 1 - lt.mean(0),
                'lt': first_correct_epoch,
                'is_duplicate': is_duplicate
            }
        )

        # One AUROC per score column, using the score as-is as the ranking for
        # the positive (duplicate) class.
        for metric in ['loss', 'loss_curvature', 'clt', 'lt']:
            y_true = results_df['is_duplicate']
            y_scores = results_df[metric]
            auroc = roc_auc_score(y_true, y_scores)
            results_seeds.append(seed)
            results_dataset.append(dataset_name)
            results_metric.append(metric)
            results_auroc.append(auroc)


In [None]:
from cleanlab.filter import find_label_issues
# Confident-learning baselines evaluated as duplicate detectors. Two scores are
# appended to the shared result lists per (seed, dataset): 'cl', a rank-based
# score built from the cleanlab ranking, and 'in conf.', one minus the largest
# predicted probability.
for seed in [1, 2, 3]:
    for dataset_name in ['cifar10_duplicate', 'cifar100_duplicate']:
        container_name = 'learning-dynamics-scores'
        indices_map = get_numpy_from_cloud(
            'learning-dynamics-models', dataset_name, 'duplicate_index_map.npy'
        ).item()

        # Both members of every pair in the map count as duplicates.
        duplicate_idxs = []
        for idx1, (idx2, _) in indices_map.items():
            duplicate_idxs.extend((idx1, idx2))

        noisy_dir = f"{dataset_name}_noisy"
        conf_learning_labels = get_numpy_from_cloud(
            container_name,
            noisy_dir,
            f"duplicate_conf_learning_labels_noise_idx_{seed}_noise_0.0.pt",
        ).astype(np.int32)

        # Binary ground truth with one entry per stored label.
        num_samples = len(conf_learning_labels)
        is_duplicate = np.zeros((num_samples))
        is_duplicate[duplicate_idxs] = 1

        prob_file_name = f"duplicate_conf_learning_prob_noise_idx_{seed}_noise_0.0.pt"
        prob_4_eph = get_numpy_from_cloud(container_name, noisy_dir, prob_file_name)

        conf_learning = find_label_issues(
            conf_learning_labels,
            prob_4_eph,
            return_indices_ranked_by="self_confidence",
            filter_by="confident_learning",
        )

        # Largest predicted probability per row.
        conf = prob_4_eph.max(axis=1)

        # Walk the returned ranking backwards: its last entry gets 1/k, its first
        # entry gets 1.0 (k = number of returned indices). Indices that were not
        # returned keep a score of 0.
        num_flagged = len(conf_learning)
        conf_learning_soft = np.zeros(num_samples)
        for rank, sample_idx in enumerate(reversed(conf_learning), start=1):
            conf_learning_soft[sample_idx] = rank / num_flagged

        y_true = is_duplicate
        for metric_name, y_scores in (
            ('cl', conf_learning_soft.astype(np.float32)),
            ('in conf.', 1 - conf),
        ):
            auroc = roc_auc_score(y_true, y_scores)
            results_seeds.append(seed)
            results_dataset.append(dataset_name)
            results_metric.append(metric_name)
            results_auroc.append(auroc)

In [5]:
from minio_obj_storage import get_numpy_from_cloud
import torch

# SSFT baseline: per-sample mean of the stored predictions, evaluated as a
# duplicate detector and appended to the shared result lists under 'ssft'.
bucket_name = 'learning-dynamics-scores'

# Only the length of the stored index is used; it is replaced by 0..n-1.
# NOTE(review): the cifar100 index file sets the sample count for both
# datasets -- confirm the cifar10 variant has the same size.
index = torch.load("./index/data_index_cifar100.pt")
index = np.array(list(range(len(index))))
label_noise = 0.0

for dataset in ['cifar100', 'cifar10']:
    container_dir = f"{dataset}_duplicate"

    indices_map = get_numpy_from_cloud(
        'learning-dynamics-models', container_dir, 'duplicate_index_map.npy'
    ).item()

    # Both members of every pair in the map count as duplicates.
    duplicate_idxs = []
    for idx1, (idx2, _) in indices_map.items():
        duplicate_idxs.extend((idx1, idx2))

    is_duplicate = np.zeros((len(index)))
    is_duplicate[duplicate_idxs] = 1
    epochs_total = 200

    for seed in [1, 2, 3]:
        preds = np.zeros((epochs_total, len(index)))
        for part in [0, 1]:
            half = len(index) // 2
            lower_half, upper_half = index[:half], index[half:]
            # Part 0 fills the lower half of the columns, part 1 the upper half.
            # index2 is the complementary half; it is not read in this cell.
            if part == 0:
                index1, index2 = lower_half, upper_half
            else:
                index1, index2 = upper_half, lower_half
            pred = get_numpy_from_cloud(
                bucket_name,
                container_dir,
                f"ssft_pred_resnet18_part_{part}_noisy_idx_{seed}_noise_{label_noise}.npy",
            )
            # Only rows from epochs_total onward are kept (presumably the second
            # training stage -- TODO confirm against the code that writes the file).
            preds[:, index1] = pred[epochs_total:, index1]

        # Transpose so that each row holds one column of `preds`, then average it.
        preds = np.array(preds).T
        ft = np.mean(preds, axis=1)

        for metric in ['ssft']:
            auroc = roc_auc_score(is_duplicate, 1 - ft)
            results_seeds.append(seed)
            results_auroc.append(auroc)
            results_metric.append(metric)
            results_dataset.append(container_dir)

In [6]:
# Long-format results table: one row per (seed, metric, dataset) AUROC value,
# built from the lists filled by the evaluation cells.
result_columns = {
    'seed': results_seeds,
    'metric': results_metric,
    'dataset': results_dataset,
    'auroc': results_auroc,
}
all_results = pd.DataFrame(result_columns)

In [None]:
# Mean AUROC for every (dataset, metric) pair. Only `auroc` is aggregated:
# averaging the seed ids would add a meaningless column to the table.
all_results.groupby(['dataset', 'metric'])[['auroc']].mean()

In [None]:
# Standard deviation of AUROC for every (dataset, metric) pair. Only `auroc`
# is aggregated: the spread of the seed ids carries no information.
all_results.groupby(['dataset', 'metric'])[['auroc']].std()