# Model Training

## Steps

1. Complete data preprocessing in `preprocessing_demo.ipynb`
2. Configure training parameters 
3. Run experiments

In [None]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid

sys.path.append('../src')

from train_scripts.Experiments import (
    TrainingConfig, DataConfig, ExperimentConfig,
    prepare_dataloader, run_single_experiment, 
    make_schema, collect_all_hparams, ModelScorer
)

## Configuration

Set up the parameters for the experiments.

In [None]:
# ---------------------------------------------------------------------------
# Dataset sizes
# ---------------------------------------------------------------------------
# TRAIN_GRID is swept by the experiment loop; TEST_GRID and VALIDATION_SIZE
# are handed to every run unchanged.
TRAIN_GRID = [20]
TEST_GRID = [25]
VALIDATION_SIZE = 10

# ---------------------------------------------------------------------------
# Hyper-parameter search space, one ParameterGrid per method
# ---------------------------------------------------------------------------
# Each CoxTV value is the second point of a 5-point log grid spanning
# 1e-2 .. 1e2, i.e. 0.1.
PARAM_GRIDS = {
    method_name: ParameterGrid(search_space)
    for method_name, search_space in (
        ("SP", {"hidden_dim": [2048]}),
        (
            "CoxTV",
            {
                "penalizer": [np.logspace(-2, 2, 5)[1]],
                "l1_ratio": [np.logspace(-2, 2, 5)[1]],
            },
        ),
    )
}

# ---------------------------------------------------------------------------
# Data location and batch sizes
# ---------------------------------------------------------------------------
DATA_FOLDER = "../Data"
TRAIN_BATCH_SIZE = 64
SCORE_BATCH_SIZE = 128

# ---------------------------------------------------------------------------
# Time grids
# ---------------------------------------------------------------------------
# TIMES holds the integers 0..729; TRAIN_TIMES keeps every 10th entry of it.
TIMES = np.arange(730)
TRAIN_TIMES = TIMES[::10]

# ---------------------------------------------------------------------------
# Optimisation and early stopping
# ---------------------------------------------------------------------------
EPOCHS = 20
LEARNING_RATE = 1e-3
EARLY_STOPPING = True
PATIENCE = 5
MIN_DELTA = 0.001
SCORE_METRIC = "ibs"

# ---------------------------------------------------------------------------
# Reported metrics
# ---------------------------------------------------------------------------
# NOTE(review): presumably "ci" = concordance index and "ibs" = integrated
# Brier score -- confirm against train_scripts.Experiments.
METRICS = ["ci", "ibs"]

# Method names, in the order they were declared in PARAM_GRIDS.
METHODS = list(PARAM_GRIDS)

# Short summary so the reader can sanity-check the setup before training.
print(
    "Ready:",
    f"  Methods: {METHODS}",
    f"  Data: {TRAIN_GRID} train, {TEST_GRID} test",
    f"  Metrics: {METRICS}",
    sep="\n",
)

In [None]:
# Create configurations
#
# Bundle the constants from the configuration cell into the config objects
# that the experiment helpers take as arguments.

# Data settings: source folder, batch sizes and the two time grids.
data_cfg = DataConfig(
    data_folder=DATA_FOLDER,
    train_batchsize=TRAIN_BATCH_SIZE,
    score_batchsize=SCORE_BATCH_SIZE,
    times=TIMES,
    train_times=TRAIN_TIMES,
    # NOTE(review): the empty shift lists and cens_prob=-1 look like
    # "feature disabled" values -- confirm against DataConfig.
    cens_prob=-1,
    to_cens_shift=[],
    to_term_shift=[],
)

# Optimisation settings, including the early-stopping rule.
train_cfg = TrainingConfig(
    lr=LEARNING_RATE,
    epochs=EPOCHS,
    score_metric=SCORE_METRIC,
    early_stopping=EARLY_STOPPING,
    patience=PATIENCE,
    min_delta=MIN_DELTA,
)

# The result schema is derived from the metric names plus every
# hyper-parameter collected from the per-method grids.
all_hparams = collect_all_hparams(PARAM_GRIDS)
schema = make_schema(METRICS, all_hparams)

# Output locations for this demo run (results file, models, logs).
exp_cfg = ExperimentConfig(
    schema=schema,
    metrics=METRICS,
    res_filename="demo_results.csv",
    models_folder="demo_models",
    log_dir="demo_logs",
)

In [None]:
# Prepare data
#
# Everything below is built once, for the largest training-set size, and is
# then shared by all experiment runs.
scorer = ModelScorer()
max_train_samples = max(TRAIN_GRID)

# "score"-type dataloader over the training split of the largest size.
dl_score_max = prepare_dataloader(
    train_samples=max_train_samples,
    data_cfg=data_cfg,
    data_type="train",
    dataset_type="score",
)

# Load the matching preprocessed training table, derive `duration` as
# max_lifetime - time, and keep only the three columns passed on to the runs.
df_train_max = pd.read_csv(
    f"{DATA_FOLDER}/{max_train_samples}_train_preprocessed.csv"
).assign(
    duration=lambda frame: frame["max_lifetime"] - frame["time"]
)[["duration", "failure", "time"]]

In [None]:
# Run experiments
#
# Sweep every (training-set size, method, hyper-parameter set) combination.
# Each combination consumes one run_id, whether or not it succeeds. The sweep
# is best-effort: a run that raises is reported and skipped so that the
# remaining combinations still execute.
all_results = []
run_id = 0

for train_samples in TRAIN_GRID:
    for method in METHODS:
        for hparams in PARAM_GRIDS[method]:
            print(f"Running {method} with {hparams}")

            # Only the experiment itself is guarded by the broad handler, so
            # that a problem while printing the summary is not misreported as
            # a failed experiment.
            try:
                result = run_single_experiment(
                    train_samples=train_samples,
                    test_grid=TEST_GRID,
                    method=method,
                    hparams=hparams,
                    run_id=run_id,
                    scorer=scorer,
                    dl_score_max=dl_score_max,
                    df_train_max=df_train_max,
                    exp_cfg=exp_cfg,
                    data_cfg=data_cfg,
                    train_cfg=train_cfg,
                    val_data_size=VALIDATION_SIZE,
                )
            except Exception as e:
                # Show the exception type as well: str(e) alone is ambiguous
                # (a KeyError prints only the key, some errors print nothing).
                print(f"Failed: {type(e).__name__}: {e}")
            else:
                # Store the result first so it reaches the results table even
                # if the summary below cannot be printed.
                all_results.append(result)

                try:
                    # result["error"][0] == 0 marks a run that reported success.
                    if result["error"][0] == 0:
                        print(f"Success! Training time: {result['train_time'][0]:.2f}s")
                        for metric in METRICS:
                            train_val = result[f'{metric}_train_same_size'][0]
                            test_val = result[f'{metric}_test'][0]
                            print(f"  {metric}: train={train_val:.4f}, test={test_val:.4f}")
                    else:
                        print(f"Failed: {result['error_text'][0]}")
                except (LookupError, TypeError, ValueError) as e:
                    # Missing key/index or a non-numeric metric value: the run
                    # itself finished, only its summary is unavailable.
                    print(f"Result stored, but summary could not be printed: {type(e).__name__}: {e}")

            run_id += 1

In [None]:
# Results
#
# Collapse the per-run result mappings into one table with a row per stored
# run. Each run stores its values in one-element containers, so the first
# element of every field is taken. The column set comes from the first run.
if not all_results:
    print("No experiments were run.")
    print("Check that preprocessed data files exist in DATA_FOLDER.")
else:
    results_df = pd.DataFrame({
        column: [run[column][0] for run in all_results]
        for column in all_results[0].keys()
    })

    print("Results:")
    print(results_df.to_string(index=False, float_format='%.4f'))

## Done

Training completed.
To modify the experiments, change `PARAM_GRIDS`, `TRAIN_GRID`, `EPOCHS`, etc. in the Configuration section, then re-run the cells that follow it.