# Create data for next steps

This notebook creates a .pkl file with the model's predictions and other information.

In [1]:
import os
import gc
import pickle
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from utils.utils import load_config
from dataloaders import *  
from models import *       
from uncertainty.MCDropout import MCDropout

## Constants

In [2]:
DEVICE = 'cuda'  # device string handed to process_experiment; the model and inputs are moved to it
BATCH_SIZE = 64  # DataLoader batch size used during inference
POSITIVE_THRESHOLD = 0.5  # probabilities >= this value are predicted as the positive class

## Experiment Settings

In [3]:
# Experiment folders are listed from, and result pickles are written to,
# experiments/<EXPERIMENTS_FOLDER>/<MODEL_NAME>.
EXPERIMENTS_FOLDER = 'UNIFESP+iCOPE'
MODEL_NAME = 'NCNN_PRETRAINED'  # change model here
BASE_EXPERIMENT_PATH = os.path.join('experiments', EXPERIMENTS_FOLDER, MODEL_NAME)

### MCDP Settings

In [4]:
# MCDP SETTINGS
# When MCDP is True the model is wrapped in MCDropout and per-pass
# probabilities are stored under 'probs_uq' instead of logits/embeddings.
MCDP = False
# Passed as `reps` to MCDropout.predict. The name's spelling ("FOWARD") is
# kept because other cells reference it.
MCDP_FOWARD_PASSES = 30
# Passed as `p` to MCDropout; depends on the model family in MODEL_NAME.
MCDP_DROPOUT = 0.1 if "NCNN" in MODEL_NAME else 0.5

## Functions

In [5]:
def create_hook(embeddings_list):
    """
    Build a forward hook that records one flat vector per sample.

    Parameters:
        embeddings_list (list): List that receives the vectors; it is
            extended in place every time the hook fires.

    Returns:
        A callable with the (module, input, output) forward-hook signature.
    """
    def hook(module, input, output):
        # Detach and move to host memory so the stored arrays do not keep
        # the autograd graph or device memory alive.
        batch_np = output.detach().cpu().numpy()
        embeddings_list.extend(sample.flatten() for sample in batch_np)
    return hook

In [6]:
def process_experiment(exp, mode, device, positive_threshold, batch_size):
    """
    Run inference for a single experiment directory and collect its outputs.

    Besides its arguments, this function reads the module-level settings
    BASE_EXPERIMENT_PATH, MCDP, MCDP_DROPOUT and MCDP_FOWARD_PASSES.

    Parameters:
        exp (str): Name of the experiment folder. Its name selects the
            architecture ("NCNN", "VGGNB" or "ViTNB" must appear in it).
        mode (str): 'train' reads config['path_train']; anything else reads
            config['path_test'].
        device (str): Device to run inference on.
        positive_threshold (float): Threshold for positive predictions.
        batch_size (int): Batch size for DataLoader.

    Returns:
        fold (str): Second-to-last component of the configured data path.
        result (dict): Always contains 'img_names', 'probs', 'preds' and
            'labels'. With MCDP it also contains 'probs_uq'; otherwise it
            contains 'logits' and 'embeddings'.

    Raises:
        ValueError: If `exp` does not name a known architecture.
    """
    exp_path = os.path.join(BASE_EXPERIMENT_PATH, exp)
    model_path = os.path.join(exp_path, 'Model', 'best_model.pt')
    config_path = os.path.join(exp_path, 'Model', 'config.yaml')

    # Load configuration
    config = load_config(config_path)
    data_path = config['path_train'] if mode == 'train' else config['path_test']

    # Extract the fold from the data path (platform independent)
    fold = os.path.normpath(data_path).split(os.sep)[-2]
    print(f"Processing {mode} data from: {data_path}")

    # Choose model architecture, dataset and the layer whose outputs are
    # recorded as embeddings, based on the experiment name.
    if "NCNN" in exp:
        model_instance = NCNN()
        dataset = NCNNDataset(data_path)
        hook_layer = model_instance.fc_4
    elif "VGGNB" in exp:
        model_instance = VGGNB()
        dataset = VGGNBDataset(data_path)
        # Alternative layer to hook: model_instance.VGGFace.features.conv5_3
        hook_layer = model_instance.VGGFace.classifier[3]
    elif "ViTNB" in exp:
        model_instance = ViTNB()
        dataset = ViTNBDataset(data_path)
        hook_layer = None  # no embeddings are collected for ViTNB
    else:
        raise ValueError(f"Unknown experiment type in {exp}")

    # Embeddings are only returned in the non-MCDP result, so skip the hook
    # under MCDP; otherwise the list would grow on every stochastic pass
    # for nothing.
    embeddings = []
    hook_handle = None
    if hook_layer is not None and not MCDP:
        hook_handle = hook_layer.register_forward_hook(create_hook(embeddings))

    try:
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            pin_memory=True,
            num_workers=4
        )

        # Load weights onto the CPU first (the model is still on the CPU
        # here) so checkpoints saved on a GPU also load on hosts without
        # one; the model is moved to `device` right after.
        model_instance.load_state_dict(torch.load(model_path, map_location='cpu'))
        model_instance = model_instance.to(device)
        model_instance.eval()

        # Accumulate outputs using lists (more efficient than repeated concatenation)
        probs_list, preds_list, logits_list, labels_list = [], [], [], []

        # If MCDP is activated, accumulate per-pass probabilities as well
        if MCDP:
            probs_uq_list = []
            model_instance = MCDropout(model_instance, p=MCDP_DROPOUT)

        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Processing {exp}"):
                inputs = batch['image'].to(device)
                labels = batch['label'].to(device)

                if MCDP:
                    # One probability per forward pass; the prediction
                    # thresholds the mean over dim=1.
                    probs = model_instance.predict(inputs, reps=MCDP_FOWARD_PASSES)
                    preds = torch.ge(torch.mean(probs, dim=1), positive_threshold).type(torch.int)
                    probs_uq_list.append(probs)
                else:
                    logits = model_instance(inputs)
                    probs = torch.sigmoid(logits)
                    preds = (probs >= positive_threshold).int()
                    logits_list.append(logits)

                # NOTE(review): under MCDP this stores the same per-pass
                # tensor that goes into 'probs_uq', not its mean -- confirm
                # that downstream consumers expect this.
                probs_list.append(probs)
                preds_list.append(preds)
                labels_list.append(labels)

        # Concatenate tensors and convert to numpy arrays
        result = {
            'img_names': np.array(dataset.img_names),
            'probs': torch.cat(probs_list).cpu().numpy(),
            'preds': torch.cat(preds_list).cpu().numpy(),
        }
        if MCDP:
            result['labels'] = torch.cat(labels_list).cpu().numpy()
            result['probs_uq'] = torch.cat(probs_uq_list).cpu().numpy()
        else:
            result['logits'] = torch.cat(logits_list).cpu().numpy()
            result['labels'] = torch.cat(labels_list).cpu().numpy()
            result['embeddings'] = np.array(embeddings)
    finally:
        # Always detach the hook and release cached GPU memory, even when
        # inference fails part-way through.
        if hook_handle is not None:
            hook_handle.remove()
        gc.collect()
        torch.cuda.empty_cache()

    return fold, result

In [7]:
def main():
    """
    Process every experiment folder once per mode and pickle the results.

    For 'train' and then 'test', each entry under BASE_EXPERIMENT_PATH that
    is not filtered out is passed to process_experiment. The per-fold
    results are written as one pickle per mode into BASE_EXPERIMENT_PATH.
    A failing experiment is reported on stdout and skipped.
    """
    # Output file name for each mode; MCDP runs are tagged with their settings.
    if MCDP:
        outputs = {
            'train': f'train_results_MCDP_{MCDP_DROPOUT}_{MCDP_FOWARD_PASSES}.pkl',
            'test': f'results_MCDP_{MCDP_DROPOUT}_{MCDP_FOWARD_PASSES}.pkl',
        }
    else:
        outputs = {'train': 'train_results.pkl', 'test': 'results.pkl'}

    # Directory entries whose name contains any of these are not experiments.
    skip_markers = ('.pkl', 'masks', '.png')

    for mode, save_filename in outputs.items():
        results = {}
        for exp in os.listdir(BASE_EXPERIMENT_PATH):
            is_experiment = not any(marker in exp for marker in skip_markers)
            if is_experiment:
                try:
                    fold, res = process_experiment(exp, mode, DEVICE, POSITIVE_THRESHOLD, BATCH_SIZE)
                    results[fold] = res
                except Exception as e:
                    print(f"Error processing {exp}: {e}")

        output_path = os.path.join(BASE_EXPERIMENT_PATH, save_filename)
        with open(output_path, 'wb') as f:
            pickle.dump(results, f)
        print(f"Saved {mode} results to {output_path}")

In [8]:
# Run inference for every experiment and write the train/test result pickles.
main()

Processing train data from: Datasets\Folds\0\Train


Processing 20250222_0902_NCNN: 100%|██████████| 151/151 [00:14<00:00, 10.35it/s]


Processing train data from: Datasets\Folds\1\Train


Processing 20250222_0905_NCNN: 100%|██████████| 155/155 [00:14<00:00, 10.52it/s]


Processing train data from: Datasets\Folds\2\Train


Processing 20250222_0908_NCNN: 100%|██████████| 152/152 [00:14<00:00, 10.35it/s]


Processing train data from: Datasets\Folds\3\Train


Processing 20250222_0912_NCNN: 100%|██████████| 154/154 [00:14<00:00, 10.45it/s]


Processing train data from: Datasets\Folds\4\Train


Processing 20250222_0914_NCNN: 100%|██████████| 155/155 [00:14<00:00, 10.71it/s]


Processing train data from: Datasets\Folds\5\Train


Processing 20250222_0919_NCNN: 100%|██████████| 152/152 [00:13<00:00, 10.96it/s]


Processing train data from: Datasets\Folds\6\Train


Processing 20250222_0922_NCNN: 100%|██████████| 151/151 [00:13<00:00, 11.51it/s]


Processing train data from: Datasets\Folds\7\Train


Processing 20250222_0928_NCNN: 100%|██████████| 154/154 [00:13<00:00, 11.60it/s]


Processing train data from: Datasets\Folds\8\Train


Processing 20250222_0931_NCNN: 100%|██████████| 160/160 [00:13<00:00, 11.93it/s]


Processing train data from: Datasets\Folds\9\Train


Processing 20250222_0935_NCNN: 100%|██████████| 159/159 [00:13<00:00, 11.91it/s]


Saved train results to experiments\UNIFESP+iCOPE\NCNN_PRETRAINED\train_results.pkl
Processing test data from: Datasets\Folds\0\Test


Processing 20250222_0902_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.23s/it]


Processing test data from: Datasets\Folds\1\Test


Processing 20250222_0905_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.23s/it]


Processing test data from: Datasets\Folds\2\Test


Processing 20250222_0908_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]


Processing test data from: Datasets\Folds\3\Test


Processing 20250222_0912_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]


Processing test data from: Datasets\Folds\4\Test


Processing 20250222_0914_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.23s/it]


Processing test data from: Datasets\Folds\5\Test


Processing 20250222_0919_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]


Processing test data from: Datasets\Folds\6\Test


Processing 20250222_0922_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.28s/it]


Processing test data from: Datasets\Folds\7\Test


Processing 20250222_0928_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]


Processing test data from: Datasets\Folds\8\Test


Processing 20250222_0931_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]


Processing test data from: Datasets\Folds\9\Test


Processing 20250222_0935_NCNN: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]

Saved test results to experiments\UNIFESP+iCOPE\NCNN_PRETRAINED\results.pkl





# Post-hoc Calibration

## Experiment Settings

In [None]:
# Result pickles are read from, and calibrated pickles written to,
# experiments/<EXPERIMENTS_FOLDER>/<MODEL_NAME>.
EXPERIMENTS_FOLDER = 'UNIFESP+iCOPE'
MODEL_NAME = 'NCNN_PRETRAINED'  # change model here
BASE_EXPERIMENT_PATH = os.path.join('experiments', EXPERIMENTS_FOLDER, MODEL_NAME)

# `filename` holds the results to calibrate; `filename_calib` holds the
# results used to fit the calibrator.
# NOTE(review): the data-creation step names MCDP files
# results_MCDP_<dropout>_<passes>.pkl and uses dropout 0.1 for NCNN models,
# so the 0.5 below may not match an existing file for this MODEL_NAME -- confirm.
filename = 'results_MCDP_0.5_30.pkl'  # change here
filename_calib = 'train_results_MCDP_0.5_30.pkl'  # change here

# Calibrated probabilities >= this value are predicted as positive.
positive_threshold = 0.5

## Functions

In [None]:
def calibrate_post_hoc(results, calib_results, calibrator, threshold=None):
    """
    Calibrate the stored probabilities of every fold with `calibrator`.

    For each fold the calibrator is re-fitted on that fold's calibration
    probabilities and labels, then applied to the fold's 'probs'.

    Parameters:
        results (dict): Maps fold -> dict with a 'probs' entry. It is
            modified in place: 'probs' and 'preds' are overwritten.
        calib_results (dict): Maps fold -> dict with 'probs' and 'labels',
            used to fit the calibrator. Must contain every fold of `results`.
        calibrator: Object exposing fit(probs, labels) and predict(probs).
        threshold (float, optional): Decision threshold for 'preds'. When
            None, the module-level `positive_threshold` is used.

    Returns:
        dict: The same `results` object, updated.
    """
    if threshold is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        threshold = positive_threshold

    for fold in results:
        fold_calib = calib_results[fold]
        calibrator.fit(fold_calib['probs'], fold_calib['labels'])
        calibrated_probs = calibrator.predict(results[fold]['probs'])

        results[fold]['probs'] = calibrated_probs
        results[fold]['preds'] = (calibrated_probs >= threshold).astype('float32')

    return results

In [None]:
def calibrate_mcdp(results, calib_results, calibrator, threshold=None):
    """
    Calibrate the per-pass MC Dropout probabilities of every fold.

    For each fold the calibrator is re-fitted on that fold's calibration
    'probs' and 'labels', then applied to the fold's 'probs_uq'. The
    calibrated values are averaged over axis 1 to produce 'probs', which is
    thresholded to produce 'preds'.

    Parameters:
        results (dict): Maps fold -> dict with a 'probs_uq' array. It is
            modified in place: 'probs_uq', 'probs' and 'preds' are overwritten.
        calib_results (dict): Maps fold -> dict with 'probs' and 'labels',
            used to fit the calibrator. Must contain every fold of `results`.
        calibrator: Object exposing fit(probs, labels) and predict(probs).
        threshold (float, optional): Decision threshold for 'preds'. When
            None, the module-level `positive_threshold` is used.

    Returns:
        dict: The same `results` object, updated.
    """
    if threshold is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        threshold = positive_threshold

    for fold in results:
        fold_calib = calib_results[fold]
        calibrator.fit(fold_calib['probs'], fold_calib['labels'])

        probs_uq = results[fold]['probs_uq']
        if calibrator.__class__.__name__ == "IsotonicRegressor":
            # Calibrate one sample (row) at a time -- presumably because this
            # calibrator only accepts 1-D input; TODO confirm.
            calibrated_probs = np.empty_like(probs_uq)
            for i, row in enumerate(probs_uq):
                calibrated_probs[i] = calibrator.predict(row)
        else:
            calibrated_probs = calibrator.predict(probs_uq)

        # Average over axis 1 once and reuse it for both outputs.
        mean_probs = calibrated_probs.mean(axis=1)

        results[fold]['probs_uq'] = calibrated_probs
        results[fold]['probs'] = mean_probs
        results[fold]['preds'] = (mean_probs >= threshold).astype('float32')

    return results

## Platt Scaling

In [None]:
# Reload both pickles from disk so this section starts from uncalibrated
# data (calibrate_post_hoc overwrites entries of `results` in place).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by your own runs.
with open(os.path.join(BASE_EXPERIMENT_PATH, filename), 'rb') as f:
    results = pickle.load(f)

with open(os.path.join(BASE_EXPERIMENT_PATH, filename_calib), 'rb') as f:
    calib_results = pickle.load(f)

In [None]:
from calibration.calibrators import PlattScaling

In [None]:
platt = PlattScaling()

In [None]:
# Tag the output name with "MCDP_" when the input file is an MCDP result.
mcdp = "MCDP_" if "MCDP" in filename else ""
save_filename = f'results_{mcdp}PLATT.pkl'

In [None]:
# Fit Platt scaling per fold on `calib_results` and apply it to `results`.
# NOTE(review): calibrate_post_hoc only rewrites 'probs' and 'preds'; if
# `filename` is an MCDP result, calibrate_mcdp (which also rewrites
# 'probs_uq') may be the intended function -- confirm.
new_results = calibrate_post_hoc(results=results, calib_results=calib_results, calibrator=platt)

In [None]:
# Write the calibrated results into the experiment folder.
with open(os.path.join(BASE_EXPERIMENT_PATH, save_filename), 'wb') as f:
    pickle.dump(new_results, f)

## Temperature Scaling

In [None]:
# Reload both pickles from disk so this section starts from uncalibrated
# data (calibrate_post_hoc overwrites entries of `results` in place).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by your own runs.
with open(os.path.join(BASE_EXPERIMENT_PATH, filename), 'rb') as f:
    results = pickle.load(f)

with open(os.path.join(BASE_EXPERIMENT_PATH, filename_calib), 'rb') as f:
    calib_results = pickle.load(f)

In [None]:
from calibration.calibrators import TemperatureScaling

In [None]:
temp = TemperatureScaling()

In [None]:
# Tag the output name with "MCDP_" when the input file is an MCDP result.
mcdp = "MCDP_" if "MCDP" in filename else ""
save_filename = f'results_{mcdp}TEMPERATURE.pkl'

In [None]:
# Fit temperature scaling per fold on `calib_results` and apply it to `results`.
# NOTE(review): calibrate_post_hoc only rewrites 'probs' and 'preds'; if
# `filename` is an MCDP result, calibrate_mcdp (which also rewrites
# 'probs_uq') may be the intended function -- confirm.
new_results = calibrate_post_hoc(results=results, calib_results=calib_results, calibrator=temp)

In [None]:
# Write the calibrated results into the experiment folder.
with open(os.path.join(BASE_EXPERIMENT_PATH, save_filename), 'wb') as f:
    pickle.dump(new_results, f)

## Isotonic Regressor

In [None]:
# Reload both pickles from disk so this section starts from uncalibrated
# data (calibrate_post_hoc overwrites entries of `results` in place).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by your own runs.
with open(os.path.join(BASE_EXPERIMENT_PATH, filename), 'rb') as f:
    results = pickle.load(f)

with open(os.path.join(BASE_EXPERIMENT_PATH, filename_calib), 'rb') as f:
    calib_results = pickle.load(f)

In [None]:
from calibration.calibrators import IsotonicRegressor

In [None]:
iso = IsotonicRegressor()

In [None]:
# Tag the output name with "MCDP_" when the input file is an MCDP result.
mcdp = "MCDP_" if "MCDP" in filename else ""
save_filename = f'results_{mcdp}ISOTONIC.pkl'

In [None]:
# Fit the isotonic calibrator per fold on `calib_results` and apply it to `results`.
# NOTE(review): calibrate_post_hoc only rewrites 'probs' and 'preds'; if
# `filename` is an MCDP result, calibrate_mcdp (which also rewrites
# 'probs_uq') may be the intended function -- confirm.
new_results = calibrate_post_hoc(results=results, calib_results=calib_results, calibrator=iso)

In [None]:
# Write the calibrated results into the experiment folder.
with open(os.path.join(BASE_EXPERIMENT_PATH, save_filename), 'wb') as f:
    pickle.dump(new_results, f)

## Histogram Binning

In [None]:
# Reload both pickles from disk so this section starts from uncalibrated
# data (calibrate_post_hoc overwrites entries of `results` in place).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by your own runs.
with open(os.path.join(BASE_EXPERIMENT_PATH, filename), 'rb') as f:
    results = pickle.load(f)

with open(os.path.join(BASE_EXPERIMENT_PATH, filename_calib), 'rb') as f:
    calib_results = pickle.load(f)

In [None]:
from calibration.calibrators import HistogramBinning

In [None]:
hist_bin = HistogramBinning()

In [None]:
# Tag the output name with "MCDP_" when the input file is an MCDP result.
mcdp = "MCDP_" if "MCDP" in filename else ""
save_filename = f'results_{mcdp}HIST.pkl'

In [None]:
# Fit histogram binning per fold on `calib_results` and apply it to `results`.
# NOTE(review): calibrate_post_hoc only rewrites 'probs' and 'preds'; if
# `filename` is an MCDP result, calibrate_mcdp (which also rewrites
# 'probs_uq') may be the intended function -- confirm.
new_results = calibrate_post_hoc(results=results, calib_results=calib_results, calibrator=hist_bin)

In [None]:
# Write the calibrated results into the experiment folder.
with open(os.path.join(BASE_EXPERIMENT_PATH, save_filename), 'wb') as f:
    pickle.dump(new_results, f)