In [None]:
import numpy as np 
from minio_obj_storage import get_numpy_from_cloud
import pandas as pd
import logging
import json
from utils.load_dataset import load_dataset
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, roc_curve
from cleanlab.filter import find_label_issues

# Read the notebook configuration; later cells look up config['data_dir'].
with open("./config.json") as config_file:
    config = json.load(config_file)

def calculate_metrics(y_true, y_scores):
    """Summarise a binary detector with AUROC, AUPR and FPR at 95% TPR.

    Args:
        y_true: Binary ground-truth labels, passed straight to the
            scikit-learn metric functions.
        y_scores: Per-sample scores, passed alongside ``y_true``.

    Returns:
        Tuple ``(auroc, aupr, fpr95)``. ``fpr95`` is the false-positive rate
        of the first ROC point whose true-positive rate is >= 0.95, or
        ``None`` when no ROC point reaches that rate.
    """
    area_under_roc = roc_auc_score(y_true, y_scores)

    # Area under the precision-recall curve, integrated along the recall axis.
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_scores)
    area_under_pr = auc(pr_recall, pr_precision)

    # First operating point on the ROC curve with TPR of at least 95%.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
    reaches_target = true_pos_rate >= 0.95
    if reaches_target.any():
        fpr_at_target = false_pos_rate[reaches_target][0]
    else:
        fpr_at_target = None

    return area_under_roc, area_under_pr, fpr_at_target

# Benchmark of mislabel-detection scores.
#
# For every (dataset, label_noise, seed) run this cell downloads the per-epoch
# loss and loss-curvature arrays, the confident-learning inputs and the
# final-epoch probabilities, turns each into a per-sample score, and evaluates
# the scores against the indices the dataset loader reports as noisy.
logger = logging.getLogger('Analyze the')
container_name = 'learning-dynamics-scores'
max_epoch = 200

# Parallel lists: entry i of every list describes the same evaluated run.
# Later cells turn them into a DataFrame (and some cells append to them).
result_seeds = []
result_noise = []
result_auroc = []
result_aupr = []
result_fpr95 = []
result_method = []
result_dataset = []

for dataset in ['cifar100', 'cifar10']:
    container_dir = dataset
    for label_noise in [0.01, 0.02, 0.05, 0.1]:
        for seed in range(2, 4):
            print(f'----- Seed {seed} -- Noise {label_noise} ---')
            loss = []
            loss_curvature = []
            for epoch in range(max_epoch):
                try:
                    loss_4_eph = get_numpy_from_cloud(container_name, container_dir, f"losses_{dataset}_resnet18_noisy_idx_{seed}_epoch_{epoch}_noise_{label_noise}_tid0.pt")
                    loss_curvature_4_eph = get_numpy_from_cloud(container_name, container_dir, f"loss_curvature_{dataset}_resnet18_noisy_idx_{seed}_epoch_{epoch}_noise_{label_noise}_h0.001_tid0.pt")
                except Exception:
                    # Best-effort download: an epoch is skipped when either
                    # file cannot be fetched. `Exception` (not a bare except)
                    # so that KeyboardInterrupt still stops the cell.
                    print(f"Not found {seed} {epoch}")
                    continue
                # Append only when both arrays arrived, keeping the two lists
                # aligned epoch-for-epoch.
                loss.append(loss_4_eph)
                loss_curvature.append(loss_curvature_4_eph)

            if not loss:
                # Without this guard the failure surfaces below as an opaque
                # IndexError on `loss.shape[1]`.
                raise RuntimeError(
                    f"No per-epoch arrays could be loaded for {dataset}, "
                    f"seed {seed}, noise {label_noise}")

            # Stack to (loaded epochs, samples); axis 1 is used as the sample
            # axis throughout the rest of the cell.
            loss_curvature = np.array(loss_curvature)
            loss = np.array(loss)

            dataset_obj = load_dataset(
                dataset=dataset,
                val_split=0,
                root_path=config['data_dir'],
                random_seed=seed,
                logger=logger,
                label_noise=label_noise)

            # Detection target: 1 at the indices listed in noisy_idxs, else 0.
            is_miss = np.zeros((loss.shape[1]))
            is_miss[dataset_obj.noisy_idxs] = 1

            # Stored in the 'noisy_labels' column below.
            # NOTE(review): whether `targets` holds the clean or the corrupted
            # labels is not visible from this notebook - confirm.
            gt_labels = np.array(dataset_obj.train_loader.dataset.dataset.targets)

            prob_file_name = f"conf_learning_prob_noise_idx_{seed}_noise_{label_noise}.pt"
            prob_4_eph = get_numpy_from_cloud(container_name, container_dir, prob_file_name)

            # The labels file must belong to the same seed as the probabilities
            # above. It used to be pinned to seed 1 while the loop runs other
            # seeds, which pairs labels and probabilities from different runs.
            # NOTE(review): confirm the per-seed label files exist in storage.
            conf_learning_labels = get_numpy_from_cloud(
                container_name,
                container_dir,
                f"conf_learning_labels_noise_idx_{seed}_noise_{label_noise}.pt"
            ).astype(np.int32)

            conf_learning = find_label_issues(
                conf_learning_labels,
                prob_4_eph,
                return_indices_ranked_by="self_confidence",
                filter_by="confident_learning",
            )

            # Probabilities of the last training epoch, used for the
            # "1 - max probability" baseline ('in_conf').
            final_epoch_prob = get_numpy_from_cloud(
                container_name,
                container_dir,
                f"prob_{dataset}_resnet18_noisy_idx_{seed}_epoch_{max_epoch - 1}_noise_{label_noise}_tid0.pt")
            conf = final_epoch_prob.max(axis=1)

            # Turn the ranked index list into a soft score: the first-ranked
            # index gets 1.0, the last-ranked gets 1/len, and samples that were
            # not flagged stay at 0.
            conf_learning_soft = np.zeros(loss.shape[1])
            n_flagged = len(conf_learning)
            if n_flagged:
                conf_learning_soft[conf_learning[::-1]] = np.arange(1, n_flagged + 1) / n_flagged

            df = pd.DataFrame(data={
                'loss': loss.mean(0),
                'loss_curvature': loss_curvature.mean(0),
                'noisy_labels': gt_labels,
                'conf_learning': conf_learning_soft.astype(np.float32),
                'in_conf': 1 - conf,
                'is_mis': is_miss})

            methods = ['loss', 'loss_curvature', 'conf_learning', 'in_conf']
            for metric in methods:
                auroc, aupr, fpr95 = calculate_metrics(df['is_mis'], df[metric])
                result_seeds.append(seed)
                result_noise.append(label_noise)
                result_auroc.append(auroc)
                result_aupr.append(aupr)
                result_fpr95.append(fpr95)
                result_method.append(metric)
                result_dataset.append(dataset)


In [None]:
# Collect the parallel result lists into one table, one row per evaluated
# (seed, label_noise, method, dataset) entry.
results_df = pd.DataFrame(dict(
    seed=result_seeds,
    label_noise=result_noise,
    method=result_method,
    auroc=result_auroc,
    aupr=result_aupr,
    fpr95=result_fpr95,
    dataset=result_dataset,
))

results_df

In [46]:
# Mean and standard deviation over the rows of each
# (dataset, label_noise, method) group, written out as CSV.
avg_res = results_df.groupby(by=['dataset', 'label_noise', 'method']).mean()
avg_res.to_csv('./avg_res2.csv')

std_res = results_df.groupby(by=['dataset', 'label_noise', 'method']).std()
std_res.to_csv("./std_res2.csv")

In [43]:
# Persist the un-aggregated per-run metrics table.
results_df.to_csv(path_or_buf="./res_mis2.csv")

In [None]:

# "lt" score: per-sample mean of the stored per-epoch `pred` arrays over all
# 200 epochs; `1 - lt` is evaluated as the mislabel-detection score.
#
# This cell appends to the result_* lists without resetting them, so its rows
# are added to whatever those lists already hold (re-running it duplicates rows).
for label_noise in [0.01, 0.02, 0.05, 0.1]:
    for dataset in ['cifar100', 'cifar10']:
        container_dir = dataset
        for seed in [1,2,3]:
            # get_numpy_from_cloud is the loader this notebook imports. The
            # name used here before (get_numpy_from_azure) is never imported
            # or defined, so the cell raised NameError on a fresh kernel.
            # NOTE(review): assumes the pred_* files live in the same storage
            # that get_numpy_from_cloud reads - confirm.
            preds = []
            for epoch in range(200):
                pred = get_numpy_from_cloud(container_name, container_dir, f"pred_{dataset}_resnet18_noisy_idx_{seed}_epoch_{epoch}_noise_{label_noise}_tid0.pt")
                preds.append(pred)

            # Stack epochs, then transpose so axis 0 indexes samples and
            # axis 1 indexes epochs (assumes each file is a 1-D per-sample
            # array - TODO confirm).
            preds = np.array(preds).T
            lt = preds.mean(1)

            dataset_obj = load_dataset(
                    dataset=dataset,
                    val_split=0,
                    root_path=config['data_dir'],
                    random_seed=seed,
                    logger=logger,
                    label_noise=label_noise)

            # Detection target: 1 at the indices listed in noisy_idxs, else 0.
            is_miss = np.zeros((dataset_obj.train_length))
            is_miss[dataset_obj.noisy_idxs] = 1

            for metric in ['lt']:
                auroc, aupr, fpr95 = calculate_metrics(is_miss, 1-lt)
                result_seeds.append(seed)
                result_noise.append(label_noise)
                result_auroc.append(auroc)
                result_aupr.append(aupr)
                result_fpr95.append(fpr95)
                result_method.append(metric)
                result_dataset.append(dataset)

In [None]:
# Rebuild the results table from the (now extended) result lists and show the
# per-group mean of every numeric column.
results_df = pd.DataFrame(dict(
    seed=result_seeds,
    label_noise=result_noise,
    method=result_method,
    auroc=result_auroc,
    aupr=result_aupr,
    fpr95=result_fpr95,
    dataset=result_dataset,
))

results_df.groupby(by=['dataset', 'method', 'label_noise']).mean()

In [None]:
# "lt" score, argmax variant: for each sample, the index of the first epoch at
# which its stored `pred` value is maximal. The result lists are reset first,
# so the tables built after this cell contain only rows produced from here on.
result_seeds = []
result_noise = []
result_auroc = []
result_aupr = []
result_fpr95 = []
result_method = []
result_dataset = []

for label_noise in [0.01, 0.02, 0.05, 0.1]:
    for dataset in ['cifar100', 'cifar10']:
        container_dir = dataset
        for seed in [1,2,3]:
            print(f"{seed} {dataset} {label_noise}")
            # get_numpy_from_cloud is the loader this notebook imports. The
            # name used here before (get_numpy_from_azure) is never imported
            # or defined, so the cell raised NameError on a fresh kernel.
            # NOTE(review): assumes the pred_* files live in the same storage
            # that get_numpy_from_cloud reads - confirm.
            preds = []
            for epoch in range(200):
                pred = get_numpy_from_cloud(container_name, container_dir, f"pred_{dataset}_resnet18_noisy_idx_{seed}_epoch_{epoch}_noise_{label_noise}_tid0.pt")
                preds.append(pred)

            # Stack epochs, then transpose so axis 0 indexes samples and
            # axis 1 indexes epochs (assumes each file is a 1-D per-sample
            # array - TODO confirm).
            preds = np.array(preds).T
            # np.argmax returns the first position of the row maximum; a row
            # that is constant over all epochs therefore yields 0.
            lt = np.argmax(preds, axis=1)

            dataset_obj = load_dataset(
                    dataset=dataset,
                    val_split=0,
                    root_path=config['data_dir'],
                    random_seed=seed,
                    logger=logger,
                    label_noise=label_noise)

            # Detection target: 1 at the indices listed in noisy_idxs, else 0.
            is_miss = np.zeros((dataset_obj.train_length))
            is_miss[dataset_obj.noisy_idxs] = 1

            for metric in ['lt']:
                # NOTE(review): with `lt` being an epoch index, `1 - lt` ranks
                # samples with a *small* argmax epoch highest. Confirm this is
                # the intended direction for a mislabel score.
                auroc, aupr, fpr95 = calculate_metrics(is_miss, 1-lt)
                result_seeds.append(seed)
                result_noise.append(label_noise)
                result_auroc.append(auroc)
                result_aupr.append(aupr)
                result_fpr95.append(fpr95)
                result_method.append(metric)
                result_dataset.append(dataset)

In [91]:
# Tabulate the current result lists and export the per-group means.
results_df = pd.DataFrame(dict(
    seed=result_seeds,
    label_noise=result_noise,
    method=result_method,
    auroc=result_auroc,
    aupr=result_aupr,
    fpr95=result_fpr95,
    dataset=result_dataset,
))

results_df.groupby(by=['dataset', 'method', 'label_noise']).mean().to_csv(path_or_buf="./mean_lt.csv")

In [92]:
# Export the per-group standard deviations next to the means.
results_df.groupby(by=['dataset', 'method', 'label_noise']).std().to_csv(path_or_buf="./std_lt.csv")

In [None]:
# "01_loss" score: per-sample mean of the stored per-epoch `pred` arrays over
# all 200 epochs; `1 - lt` is evaluated as the mislabel-detection score.
#
# This cell appends to the result_* lists without resetting them, so rows
# already in those lists stay in the tables built afterwards (and re-running
# the cell duplicates its rows).
for label_noise in [0.01, 0.02, 0.05, 0.1]:
    for dataset in ['cifar100', 'cifar10']:
        container_dir = dataset
        for seed in [1,2,3]:
            print(f"{seed} {dataset} {label_noise}")
            # get_numpy_from_cloud is the loader this notebook imports. The
            # name used here before (get_numpy_from_azure) is never imported
            # or defined, so the cell raised NameError on a fresh kernel.
            # NOTE(review): assumes the pred_* files live in the same storage
            # that get_numpy_from_cloud reads - confirm.
            preds = []
            for epoch in range(200):
                pred = get_numpy_from_cloud(container_name, container_dir, f"pred_{dataset}_resnet18_noisy_idx_{seed}_epoch_{epoch}_noise_{label_noise}_tid0.pt")
                preds.append(pred)

            # Stack epochs, then transpose so axis 0 indexes samples and
            # axis 1 indexes epochs (assumes each file is a 1-D per-sample
            # array - TODO confirm).
            preds = np.array(preds).T
            lt = preds.mean(1)

            dataset_obj = load_dataset(
                    dataset=dataset,
                    val_split=0,
                    root_path=config['data_dir'],
                    random_seed=seed,
                    logger=logger,
                    label_noise=label_noise)

            # Detection target: 1 at the indices listed in noisy_idxs, else 0.
            is_miss = np.zeros((dataset_obj.train_length))
            is_miss[dataset_obj.noisy_idxs] = 1

            for metric in ['01_loss']:
                auroc, aupr, fpr95 = calculate_metrics(is_miss, 1-lt)
                result_seeds.append(seed)
                result_noise.append(label_noise)
                result_auroc.append(auroc)
                result_aupr.append(aupr)
                result_fpr95.append(fpr95)
                result_method.append(metric)
                result_dataset.append(dataset)

In [97]:
# Tabulate the current result lists, then export per-group mean and standard
# deviation as two CSV files.
results_df = pd.DataFrame(dict(
    seed=result_seeds,
    label_noise=result_noise,
    method=result_method,
    auroc=result_auroc,
    aupr=result_aupr,
    fpr95=result_fpr95,
    dataset=result_dataset,
))

results_df.groupby(by=['dataset', 'method', 'label_noise']).mean().to_csv(path_or_buf="./mean_zo_loss.csv")
results_df.groupby(by=['dataset', 'method', 'label_noise']).std().to_csv(path_or_buf="./std_zo_loss.csv")

In [None]:
# Display the per-group means of the current results table.
results_df.groupby(by=['dataset', 'method', 'label_noise']).mean()

In [None]:
from minio_obj_storage import get_numpy_from_cloud
import torch

# "ssft" score: per-sample mean of predictions stored in two part files.
# The result lists are reset first, so the tables built after this cell
# contain only the rows produced here.
result_seeds = []
result_noise = []
result_auroc = []
result_aupr = []
result_fpr95 = []
result_method = []
result_dataset = []

bucket_name = 'learning-dynamics-scores'
# Only the *length* of the stored index is used: it is replaced by the
# positions 0..len-1, which are then split into two halves below.
# NOTE(review): the cifar100 index length is applied to cifar10 as well -
# confirm both training sets have the same size.
index = torch.load("./index/data_index_cifar100.pt")
index = np.arange(len(index))
half = len(index) // 2

for label_noise in [0.01, 0.02, 0.05, 0.1]:
    for dataset in ['cifar100', 'cifar10']:
        container_dir = dataset
        for seed in [1,2,3]:
            # Loaded once per run; the same object provides both the training
            # set size and the noisy indices (it used to be loaded twice with
            # identical arguments).
            dataset_obj = load_dataset(
                    dataset=dataset,
                    val_split=0,
                    root_path=config['data_dir'],
                    random_seed=seed,
                    logger=logger,
                    label_noise=label_noise)

            print(f"{seed} {dataset} {label_noise}")
            epochs_total = 200
            preds = np.zeros((epochs_total, dataset_obj.train_length))

            for part in [0, 1]:
                # Part 0 fills the columns of the first half of the index,
                # part 1 those of the second half.
                part_index = index[:half] if part == 0 else index[half:]
                pred = get_numpy_from_cloud(bucket_name, container_dir, f"ssft_pred_resnet18_part_{part}_noisy_idx_{seed}_noise_{label_noise}.npy")
                # Only rows from `epochs_total` onward are kept (presumably
                # the second training stage; the file must therefore hold
                # exactly 2 * epochs_total rows - TODO confirm).
                preds[:, part_index] = pred[epochs_total:, part_index]

            # Transpose so axis 0 indexes samples and axis 1 indexes epochs.
            preds = preds.T
            ft = np.mean(preds, axis=1)

            # Detection target: 1 at the indices listed in noisy_idxs, else 0.
            is_miss = np.zeros((dataset_obj.train_length))
            is_miss[dataset_obj.noisy_idxs] = 1

            for metric in ['ssft']:
                auroc, aupr, fpr95 = calculate_metrics(is_miss, 1-ft)
                print(auroc, seed, label_noise, dataset)
                result_seeds.append(seed)
                result_noise.append(label_noise)
                result_auroc.append(auroc)
                result_aupr.append(aupr)
                result_fpr95.append(fpr95)
                result_method.append(metric)
                result_dataset.append(dataset)

In [None]:
# Tabulate the current result lists and show the per-group means.
results_df = pd.DataFrame(dict(
    seed=result_seeds,
    label_noise=result_noise,
    method=result_method,
    auroc=result_auroc,
    aupr=result_aupr,
    fpr95=result_fpr95,
    dataset=result_dataset,
))

results_df.groupby(by=['dataset', 'method', 'label_noise']).mean()


In [None]:
# Display the per-group standard deviations of the current results table.
results_df.groupby(by=['dataset', 'method', 'label_noise']).std()