# Model Training

## Steps

1. Complete data preprocessing in `preprocessing_demo.ipynb`
2. Configure training parameters 
3. Run experiments

In [12]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid

# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable
# from one "Restart & Run All" to the next.
np.random.seed(42)
# Add ../src to the module search path (resolved relative to the notebook's
# working directory) before the project imports that follow.
sys.path.append('../src')

from train_scripts.Experiments import (
    TrainingConfig, DataConfig, ExperimentConfig,
    prepare_dataloader, run_single_experiment, 
    make_schema, collect_all_hparams, create_res_file
)

## Configuration

Set up the parameters for the experiments.

In [13]:
import os

# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------
# Results file name; passed to ExperimentConfig as `res_filename`.
RES_FILE = 'demo_results.csv'

# ---------------------------------------------------------------------------
# Sample sizes
# ---------------------------------------------------------------------------
TRAIN_GRID = [20]      # training-set sizes the experiment loop iterates over
TEST_GRID = [25]       # passed to every run as `test_grid`
VALIDATION_SIZE = 10   # passed to every run as `val_data_size`

# ---------------------------------------------------------------------------
# Model parameters
# ---------------------------------------------------------------------------
# One ParameterGrid per method name; the dictionary keys also define METHODS.
# A further method can be enabled by adding an entry, for example:
#     "SP": ParameterGrid({"hidden_dim": [2048]})
PARAM_GRIDS = {
    "CoxTV": ParameterGrid(
        {
            # Both settings use the second point of the 5-point log-spaced
            # grid from 1e-2 to 1e2 (shown as 0.1 in the recorded run output).
            "penalizer": [np.logspace(-2, 2, 5)[1]],
            "l1_ratio": [np.logspace(-2, 2, 5)[1]],
        }
    ),
}

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
DATA_FOLDER = os.path.join("Data", "Preprocessed")
DATA_EXT = ".csv"  # ".parquet" or ".csv"
TRAIN_BATCH_SIZE = SCORE_BATCH_SIZE = 512

# ---------------------------------------------------------------------------
# Time
# ---------------------------------------------------------------------------
TIMES = np.arange(730)      # integer grid 0, 1, ..., 729
TRAIN_TIMES = TIMES[::10]   # every 10th point of TIMES

# ---------------------------------------------------------------------------
# Training (all forwarded to TrainingConfig)
# ---------------------------------------------------------------------------
EPOCHS = 20
LEARNING_RATE = 1e-3
EARLY_STOPPING = True
PATIENCE = 5
MIN_DELTA = 0.001
SCORE_METRIC = "ibs"

# ---------------------------------------------------------------------------
# Metrics and methods
# ---------------------------------------------------------------------------
METRICS = ["ci", "ibs"]
METHODS = list(PARAM_GRIDS)

# Short summary so the reader can confirm the active setup at a glance.
print(
    "Ready:",
    f"  Methods: {METHODS}",
    f"  Data: {TRAIN_GRID} train, {TEST_GRID} test",
    f"  Metrics: {METRICS}",
    sep="\n",
)

Ready:
  Methods: ['CoxTV']
  Data: [20] train, [25] test
  Metrics: ['ci', 'ibs']


In [14]:
# Create configurations
# Bundle the constants from the configuration cell into the three config
# objects (data_cfg, train_cfg, exp_cfg) that each experiment run receives.

# Data settings: source folder, batch sizes and the two time grids.
data_cfg = DataConfig(
    data_folder=DATA_FOLDER,
    train_batchsize=TRAIN_BATCH_SIZE,
    score_batchsize=SCORE_BATCH_SIZE,
    times=TIMES,
    train_times=TRAIN_TIMES,
    to_cens_shift=[],  # NOTE(review): empty list presumably means "no shift" - confirm in DataConfig
    to_term_shift=[],  # NOTE(review): same assumption as to_cens_shift - confirm
    cens_prob=-1,      # NOTE(review): -1 looks like a "disabled" sentinel - confirm in DataConfig
)

# Training-loop settings: epochs, learning rate and early-stopping controls.
train_cfg = TrainingConfig(
    epochs=EPOCHS,
    lr=LEARNING_RATE,
    early_stopping=EARLY_STOPPING,
    patience=PATIENCE,
    min_delta=MIN_DELTA,
    score_metric=SCORE_METRIC,
)

# Collect the hyper-parameters of every grid in PARAM_GRIDS and build the
# results schema from them together with METRICS.
all_hparams = collect_all_hparams(PARAM_GRIDS)
schema = make_schema(METRICS, all_hparams)

# Experiment bookkeeping: metrics, results schema and output locations.
exp_cfg = ExperimentConfig(
    metrics=METRICS,
    schema=schema,
    res_filename=RES_FILE,
    models_folder="demo_models",
    log_dir="demo_logs",
)

# Write a results file built from the schema alone. to_csv opens the file in
# write mode, so any existing file at res_filename is overwritten every time
# this cell runs.
pd.DataFrame(exp_cfg.schema).to_csv(exp_cfg.res_filename, index=False)


In [15]:
from scoring import ModelScorer

# Prepare data
# Scorer instance shared by every run_single_experiment call.
scorer = ModelScorer()

# Size of the reference ("max") training set. It is a literal here and is
# larger than anything in TRAIN_GRID; max(TRAIN_GRID) is the alternative.
max_train_samples = 50  # or max(TRAIN_GRID)

# Score-type dataloader over the reference training set.
dl_score_max = prepare_dataloader(
    train_samples=max_train_samples,
    data_cfg=data_cfg,
    data_type="train",
    dataset_type="score",
    data_ext=DATA_EXT,
)

# Build the path with os.path.join, the same way DATA_FOLDER itself is built,
# rather than hard-coding "/" as the separator.
df_train_max_path = os.path.join(
    DATA_FOLDER, f"{max_train_samples}_train_preprocessed{DATA_EXT}"
)

# Dispatch on the extension explicitly so that a mistyped DATA_EXT fails with
# a clear message instead of being handed to the parquet reader.
if DATA_EXT == ".csv":
    df_train_max = pd.read_csv(df_train_max_path)
elif DATA_EXT == ".parquet":
    df_train_max = pd.read_parquet(df_train_max_path)
else:
    raise ValueError(
        f"Unsupported DATA_EXT {DATA_EXT!r}; expected '.csv' or '.parquet'"
    )

# duration = max_lifetime - time; keep only the three columns used downstream.
# assign() returns a new frame, so the frame that was read is not mutated.
df_train_max = df_train_max.assign(
    duration=lambda frame: frame["max_lifetime"] - frame["time"]
)[["duration", "failure", "time"]]

In [16]:
# Run experiments
# Iterate over every (training size, method, hyper-parameter set) combination.
# Each run gets a consecutive run_id, and every result that is returned is
# appended to all_results, including results that report error != 0.
all_results = []
run_id = 0

for train_samples in TRAIN_GRID:
    for method in METHODS:
        for hparams in PARAM_GRIDS[method]:
            print(f"Running {method} with {hparams}")
            try:
                result = run_single_experiment(
                    train_samples=train_samples,
                    test_grid=TEST_GRID,
                    method=method,
                    hparams=hparams,
                    run_id=run_id,
                    scorer=scorer,
                    dl_score_max=dl_score_max,
                    df_train_max=df_train_max,
                    exp_cfg=exp_cfg,
                    data_cfg=data_cfg,
                    train_cfg=train_cfg,
                    val_data_size=VALIDATION_SIZE,
                )

                all_results.append(result)

                # Result values are indexed with [0]: each entry holds the
                # value for this run as its first element.
                if result["error"][0] == 0:
                    print(f"Success! Training time: {result['train_time'][0]:.2f}s")
                    for metric in METRICS:
                        train_val = result[f'{metric}_train_same_size'][0]
                        test_val = result[f'{metric}_test'][0]
                        print(f"  {metric}: train={train_val:.4f}, test={test_val:.4f}")
                else:
                    print(f"Failed: {result['error_text'][0]}")

            except Exception as e:
                # Best effort: one failing combination must not abort the
                # remaining runs. Include the exception type, because str(e)
                # alone can be uninformative (e.g. KeyError shows only the key).
                print(f"Failed: {type(e).__name__}: {e}")

            # Advance the id whether the run succeeded or not.
            run_id += 1

Running CoxTV with {'l1_ratio': 0.1, 'penalizer': 0.1}


Collecting data for Cox fit: 100%|██████████| 670/670 [00:17<00:00, 38.35it/s]
  nonnumeric_cols = [col for (col, dtype) in df.dtypes.iteritems() if dtype.name == "category" or dtype.kind not in "biuf"]
  self.params_ = pd.Series(params_, index=pd.Index(X.columns, name="covariate"), name="coef") / self._norm_std
Collecting data for Cox prediction: 100%|██████████| 670/670 [00:16<00:00, 40.30it/s]
Collecting data for Cox prediction: 100%|██████████| 1651/1651 [00:40<00:00, 40.82it/s]
Collecting data for Cox prediction: 100%|██████████| 209/209 [00:05<00:00, 40.88it/s]


Success! Training time: 259.87s
  ci: train=0.8208, test=0.8239
  ibs: train=0.2951, test=0.2969


In [17]:
# Results
# Build one row per recorded run. Each result maps a column name to a
# sequence whose first element is the value for that run; the column set is
# taken from the first result.
if all_results:
    results_df = pd.DataFrame({
        column: [run[column][0] for run in all_results]
        for column in all_results[0]
    })
else:
    # Always define results_df: otherwise displaying it raises NameError on a
    # fresh kernel, or silently shows a stale frame left over from an earlier
    # execution, when no run produced a result.
    results_df = pd.DataFrame()
    print("No experiment results were collected.")

In [18]:
# Show the results table (one row per recorded run) as the cell's output.
results_df

Unnamed: 0,train_samples,method,model_id,train_time,test_time,error,error_text,ci_train_same_size,ci_train_max_size,ci_test,ibs_train_same_size,ibs_train_max_size,ibs_test,l1_ratio,penalizer
0,20,CoxTV,0_CoxTV,259.866722,9.768716,0,,0.820833,0.824249,0.823926,0.295132,0.301512,0.296925,0.1,0.1


## Done

Training completed.
To modify the experiments, change `PARAM_GRIDS`, `TRAIN_GRID`, `EPOCHS`, etc. in the Configuration cell, then re-run the notebook from the top.