# Aggregation of Pre-trained Models

## Steps

1. Complete model training in `training_demo.ipynb`
2. Load pre-trained models from experiments
3. Configure aggregation methods and weights
4. Run aggregation experiments with different sample sizes
5. Evaluate aggregated predictions using survival metrics

In [7]:
import sys
import copy
from pathlib import Path

import numpy as np
import pandas as pd

sys.path.append('../src')
from aggregation.PredictionsAggregator import PredictionsAggregator

# Seed NumPy's legacy global RNG so that code drawing from `np.random` in this
# kernel session produces the same sequence after every Restart & Run All.
# NOTE(review): whether the aggregation code samples via `np.random` is not
# visible from this notebook — confirm.
np.random.seed(42)

## Configuration

Set up the aggregation experiment parameters.

In [8]:
# Experiment configuration
DATASET_TRAIN_SAMPLES = 20  # Train-sample count of the dataset; used in the data and results file names
MODEL_TRAIN_SAMPLES = 20  # Number of samples the model was trained on (for model selection)
SAMPLE_GRID = [1, 5, 10]  # Numbers of samples to aggregate over
TEST_SAMPLES = [25]  # Test-sample counts; the first entry is used in the test data file name
MODELS_SIZE = [2048]  # Hidden size of the model (for model selection)
MODEL_NAME = 'CoxTV'
# Ordered list, not a set: iterating a set of strings can yield a different
# order in every kernel session, which would shuffle the metric columns of the
# results schema between runs.
METRICS_LIST = ['ci', 'ibs']
DATA_EXT = '.csv'

DATA_FOLDER = Path("Data/Preprocessed")  # Path to preprocessed data
RES_FOLDER = Path("Data/Agg")  # Output folder for aggregation results
MODELS_FOLDER = Path("demo_models")  # Path to trained models for aggregation demo

# Time grid 0..729 passed to eval_model.
# NOTE(review): presumably days (about two years) — confirm the unit.
TIMES = np.arange(0, 730)
TRAIN_BATCHSIZE = 32

## Aggregation Methods

Define different prediction aggregation strategies with various weight distributions.

In [9]:
# Weight grid for every aggregation mode. Each weight is passed unchanged to
# PredictionsAggregator, and its string form becomes the variant key.
_WEIGHT_GRIDS = {
    "n_dist": (0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99),
    "t_dist": (0.1, 1, 10, 25, 50, 100, 1000),
    "geom": (0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99),
}

# mode -> {weight label -> configured PredictionsAggregator}
AGGREGATORS_DICT = {
    mode: {
        str(weight): PredictionsAggregator(mode=mode, weight=weight)
        for weight in weights
    }
    for mode, weights in _WEIGHT_GRIDS.items()
}

print("Aggregation methods configured:")
for method, variants in AGGREGATORS_DICT.items():
    print(f"- {method}: {list(variants.keys())}")

Aggregation methods configured:
- n_dist: ['0.01', '0.1', '0.3', '0.5', '0.7', '0.9', '0.99']
- t_dist: ['0.1', '1', '10', '25', '50', '100', '1000']
- geom: ['0.01', '0.1', '0.3', '0.5', '0.7', '0.9', '0.99']


## Functions from Agg.py

Import `eval_model` from `train_scripts.Agg` and define helper functions for building the results schema and saving results to CSV.

In [10]:
# Import functions for aggregation experiments  
from train_scripts.Agg import eval_model

def create_demo_schema(metrics_list):
    """Build an empty column -> list mapping for demo aggregation results.

    Args:
        metrics_list: Iterable of metric names. Each name contributes a
            ``<metric>_train`` and a ``<metric>_test`` column, in iteration
            order.

    Returns:
        dict: The fixed bookkeeping columns followed by the metric columns,
        every column mapped to its own new empty list.
    """
    fixed_columns = (
        'train_samples',
        'agg_samples',
        'method',
        'agg_method',
        'agg_weight',
        'model_id',
    )
    metric_columns = [
        f'{metric}_{split}'
        for metric in metrics_list
        for split in ('train', 'test')
    ]
    return {column: [] for column in (*fixed_columns, *metric_columns)}

def save_results_to_csv(results_df, filename):
    """Save aggregation results to a CSV file, creating missing parent folders.

    Args:
        results_df: pandas DataFrame with the results; written without the index.
        filename: Destination path (``str`` or ``Path``).
    """
    target = Path(filename)
    # DataFrame.to_csv raises OSError when the destination folder is missing,
    # so make sure it exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(target, index=False)
    print(f"Results saved to {filename}")
    

## Model Loading and Data Preparation

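Create the output folders, build the results schema, and locate the pre-trained model files for the aggregation experiments; the test data path is prepared in the next section.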

In [11]:
# Create directories for demo results
RES_FOLDER.mkdir(parents=True, exist_ok=True)
MODELS_FOLDER.mkdir(parents=True, exist_ok=True)

# Create demo results schema
SCHEMA = create_demo_schema(METRICS_LIST)
results_filename = RES_FOLDER / f"aggregation_results_{DATASET_TRAIN_SAMPLES}_{max(SAMPLE_GRID)}.csv"

# Look for trained models from training demo.
# Sorted so that the processing order is the same on every run and machine,
# instead of depending on the order in which the filesystem lists the files.
model_files = sorted(MODELS_FOLDER.glob("*.pkl"))
if not model_files:
    raise FileNotFoundError(f"No trained models found in {MODELS_FOLDER}. Please run training_demo.ipynb first.")

print(f"Found {len(model_files)} trained models:")
for model_file in model_files:
    print(f"  - {model_file.name}")

# Create model_name_grid from all available models (file name without ".pkl")
model_name_grid = [model_file.stem for model_file in model_files]

Found 1 trained models:
  - 0_CoxTV.pkl


## Run Aggregation Experiments

Execute aggregation experiments with different methods and sample sizes.

In [12]:
# Prepare test data path
test_data_path = DATA_FOLDER / f"{DATASET_TRAIN_SAMPLES}_{TEST_SAMPLES[0]}_test_preprocessed{DATA_EXT}"

# Run evaluation for each model and collect results
all_results = []
failed_models = []  # names of the models whose evaluation raised

for i, model_name in enumerate(model_name_grid, 1):
    print(f"Processing model {i}/{len(model_name_grid)}: {model_name}")
    try:
        model_results = eval_model(
            data_path=str(test_data_path),
            data_folder=DATA_FOLDER,
            models_folder=MODELS_FOLDER,
            model_name=model_name,
            agg_dict=AGGREGATORS_DICT,
            metrics_list=METRICS_LIST,
            sample_grid=SAMPLE_GRID,
            times=TIMES,
            model_train_samples=MODEL_TRAIN_SAMPLES,
            train_batchsize=TRAIN_BATCHSIZE,
            metric_postfix='test',
            data_ext=DATA_EXT
        )

        print(f"Model {model_name} completed: {len(model_results)} results")
        all_results.append(model_results)

    except Exception as e:
        # Best-effort: a failing model must not abort the remaining ones, but
        # it is recorded so the failure stays visible after the loop.
        print(f"Error processing model {model_name}: {e}")
        failed_models.append(model_name)

# Combine all results into one dataframe
if all_results:
    results_df = pd.concat(all_results, ignore_index=True)
    # Persist the results so the long evaluation does not have to be repeated.
    save_results_to_csv(results_df, results_filename)
else:
    # Always define results_df so later cells do not fail with a NameError;
    # the empty frame carries the columns of the demo results schema.
    results_df = pd.DataFrame(SCHEMA)
    print("No aggregation results were produced; results_df is empty and nothing was saved.")

if failed_models:
    print(f"Models that failed ({len(failed_models)}): {failed_models}")

Processing model 1/1: 0_CoxTV


Collecting data for Cox prediction: 100%|██████████| 3336/3336 [00:07<00:00, 420.13it/s]


Начало обработки 1 n_samples
Обработка 1 завершена за 20.681692838668823 секунд
Начало обработки 5 n_samples
Обработка 5 завершена за 152.42873120307922 секунд
Начало обработки 10 n_samples


KeyboardInterrupt: 

In [None]:
# Inspect the per-model results collected by the evaluation loop. The list
# stays empty when no model finished, e.g. the run was interrupted or every
# model raised an error.
all_results

[]