In [3]:
# Standard imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from optformer.decoding_regression import models
from optformer.decoding_regression import vocabs
import matplotlib.pyplot as plt

# Seed both PyTorch and NumPy so that repeated runs draw the same random numbers
torch.manual_seed(42)
np.random.seed(42)

# Shared sentinel for negative infinity
NEG_INF = -float("inf")


ModuleNotFoundError: No module named 'torch'

In [None]:
# Create an instance of the vocabulary.
# NOTE(review): `vocabs` is the name imported from optformer.decoding_regression in the
# import cell. If that name is a module (as the plural suggests), calling it directly
# cannot work and one of its vocabulary classes should be instantiated instead —
# TODO confirm the intended class and its constructor arguments.
vocab = vocabs(size=7, token_length=7)

# Convert each target to a token sequence.
# NOTE(review): `Y` is not assigned in any cell visible before this one — confirm where
# it is defined before relying on this cell under Restart & Run All.
Y_token_ids = np.array([vocab.to_int(y) for y in Y])
print("Example target:", Y[0])
print("Example token IDs:", Y_token_ids[0])

In [None]:
class ParticipantVisibleError(Exception):
    """Raised by `score` when the submission itself is malformed (e.g. non-numeric columns)."""


# Score function needed for evaluation of the model in the competition
# Define the concordance index function
def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """Return the mean minus the (population) standard deviation of the per-race-group C-index.

    The solution and submission frames are joined column-wise on their index, split by
    ``race_group``, and a concordance index is computed inside every group from
    ``efs_time`` (durations), the negated ``prediction`` column (scores) and ``efs``
    (event indicator).

    Unlike the original implementation, the id column is dropped on copies, so the
    caller's DataFrames are left untouched and the function can be called repeatedly
    on the same objects.

    Parameters:
    solution (pd.DataFrame): Ground truth with the id column plus 'efs', 'efs_time', 'race_group'.
    submission (pd.DataFrame): Predictions with the id column plus a numeric 'prediction' column.
    row_id_column_name (str): Name of the id column present in both frames.

    Returns:
    float: mean(C-index per race group) - std(C-index per race group).

    Raises:
    ParticipantVisibleError: If any submission column (other than the id) is not numeric.
    KeyError: If the id column is missing from either frame.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    # `drop` returns new frames; `del frame[col]` would mutate the caller's objects and
    # make a second call on the same frames fail with KeyError.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pd.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')

    # Join on the index, then switch to a positional index so the group labels
    # returned by `.groups` can be used directly with `.iloc`.
    merged_df = pd.concat([solution, submission], axis=1).reset_index(drop=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)

    metric_list = []
    for race in merged_df_race_dict.keys():
        # Rows belonging to the current race group, in their original order
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # NOTE(review): `concordance_index` is not imported in any visible cell —
        # presumably lifelines.utils.concordance_index; confirm it is in scope.
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))

In [None]:
# Label-encode categorical columns, reusing a previously fitted mapping when given
def encode_categorical_features(df, cats, encoding_maps=None):
    """
    Replace each categorical column by integer codes.

    Columns without an entry in `encoding_maps` are factorized (sorted uniques) and
    their value -> code mapping is stored in `encoding_maps`; columns that already have
    an entry are mapped with it. Values that end up missing after encoding become -1.
    A summary line is printed for every column.

    Parameters:
    df (pd.DataFrame): Input frame; it is copied, not modified.
    cats (list): Names of the columns to encode.
    encoding_maps (dict, optional): Existing {column: {value: code}} mappings, e.g. the
                                    ones returned for the training frame. Updated in
                                    place with any newly fitted column. A new dict is
                                    created when None.

    Returns:
    pd.DataFrame: Copy of `df` with the listed columns encoded as integers.
    dict: The (possibly extended) mapping dictionary.
    """
    encoded = df.copy()
    maps = {} if encoding_maps is None else encoding_maps

    print("Label Encoding:")
    for column in cats:
        if column in maps:
            # Reuse the stored mapping; values absent from it come back as NaN
            codes = encoded[column].map(maps[column])
        else:
            # Fit a new mapping; sorting keeps the code assignment stable
            codes, uniques = encoded[column].factorize(sort=True)
            maps[column] = dict(zip(uniques, range(len(uniques))))
        encoded[column] = codes

        # Missing / unseen values are encoded as -1, everything is cast to int
        encoded[column] = encoded[column].fillna(-1).astype(int)

        column_codes = list(maps[column].values())
        print(f'{column}: n_categories={len(column_codes)}, mapped values={column_codes}')

    return encoded, maps

In [None]:
# Adapted from https://www.kaggle.com/code/ambrosm/esp-eda-which-makes-sense/notebook
def transform_quantile(time, event):
    """Build a target that spreads out eventful efs_times and collapses event-free ones.

    Rows with event == 1 receive the quantile transform of their negated time, so
    earlier events get larger values. Rows with event == 0 all receive a single value
    0.3 below the smallest eventful value. Any other row stays NaN.

    From https://www.kaggle.com/code/ambrosm/esp-eda-which-makes-sense"""
    had_event = event == 1
    negated_event_times = -time[had_event].values.reshape(-1, 1)
    event_scores = quantile_transform(negated_event_times).ravel()

    transformed = np.full(len(time), np.nan)
    transformed[had_event] = event_scores
    transformed[event == 0] = event_scores.min() - 0.3
    return transformed
    
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return, for every row, the fitted survival function evaluated at that row's time.

    A `KaplanMeierFitter` is fitted on `df[time_col]` (durations) and `df[event_col]`
    (event indicator), then queried at each value of `df[time_col]`.
    """
    durations = df[time_col]
    fitter = KaplanMeierFitter()
    fitter.fit(durations, df[event_col])
    return fitter.survival_function_at_times(durations).values
    

def transform_survival_target(df, time_col='efs_time', event_col='efs'):
    """Build a rank-based risk target from the time and event columns.

    Steps:
      1. Start from the raw times.
      2. Shift censored rows (event == 0) by ``max(event times) - min(censored times)``.
      3. Replace values by their rank, then push censored rows a further ``2 * len(df)``
         up so they all sit above the eventful rows.
      4. Scale by the maximum, take the log, centre on the mean and flip the sign, so
         that earlier events receive the largest values.

    The `time_col` / `event_col` arguments are honoured (the original ignored them and
    always read ``efs_time`` / ``efs``). The shift in step 2 is skipped when either
    group is empty, which would otherwise turn the whole result into NaN.

    Parameters:
    df (pd.DataFrame): Frame holding the time and event columns.
    time_col (str): Name of the duration column.
    event_col (str): Name of the event indicator column (1 = event, 0 = censored).

    Returns:
    pd.Series: The transformed target, indexed like `df`.

    Side effects:
    The result is also written to ``df["y3"]``, as the original implementation did.
    """
    is_event = df[event_col] == 1
    is_censored = df[event_col] == 0

    target = pd.Series(df[time_col].to_numpy(), index=df.index, dtype=float)

    # Move censored times up by the gap between the latest event and the earliest censoring
    if is_event.any() and is_censored.any():
        latest_event = df.loc[is_event, time_col].max()
        earliest_censored = df.loc[is_censored, time_col].min()
        target.loc[is_censored] += latest_event - earliest_censored

    # Rank, then separate the two groups by a margin larger than any rank
    target = target.rank()
    target.loc[is_censored] += 2 * len(df)

    # Log of the normalised rank, centred, sign-flipped (high value = early event)
    target = np.log(target / target.max())
    target = (target - target.mean()) * -1.0

    df["y3"] = target
    return df["y3"]

def plot_transformed_target(df, column, title, xlim=None):
    """Overlay histograms of `df[column]` for eventful (efs == 1) and event-free (efs == 0) rows.

    Parameters:
    df (pd.DataFrame): Frame containing `column` and the 'efs' indicator.
    column (str): Name of the transformed-target column to plot.
    title (str): Figure title.
    xlim (tuple, optional): (left, right) x-axis limits; left untouched when None.
    """
    plt.figure(figsize=(12, 6))
    plt.hist(df.loc[df.efs == 1, column], bins=100, alpha=0.5, label="efs=1, Yes Event")
    plt.hist(df.loc[df.efs == 0, column], bins=100, alpha=0.5, label="efs=0, Maybe Event")
    if xlim is not None:
        plt.xlim(xlim)
    plt.xlabel(f"Transformed Target {column}")
    # The histograms show raw bin counts (density is not enabled), so label them as such
    plt.ylabel("Count")
    plt.title(title)
    plt.legend()
    plt.show()


# Three candidate targets derived from efs / efs_time
df_train["y1"] = transform_survival_probability(df_train, time_col='efs_time', event_col='efs')
df_train["y2"] = transform_quantile(time=df_train['efs_time'], event=df_train['efs'])
df_train["y3"] = transform_survival_target(df_train, time_col='efs_time', event_col='efs')

# One figure per target, split by event status
plot_transformed_target(df_train, "y1", "Transformed Target y1 using survival probability.")
plot_transformed_target(df_train, "y2", "Transformed Target y2 using quantile transformation.")
plot_transformed_target(df_train, "y3", "Transformed Target y3 using both efs and efs_time.", xlim=(-5, 5))

In [None]:

# Columns that are identifiers or targets and must never be used as model inputs
non_feature_columns = ["ID", "efs", "efs_time", "y1", "y2", "y3"]

categorical_features = df_train.select_dtypes(include="object").columns.to_list()
numerical_features = [
    col for col in df_train.select_dtypes(exclude="object").columns
    if col not in non_feature_columns
]

# Handle missing values: treat a missing category as its own "NAN" level
for col in categorical_features:
    df_train[col] = df_train[col].fillna("NAN")
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna("NAN")


# Encode training data and save mapping
train, encoding_maps = encode_categorical_features(df_train, cats=categorical_features)

# Encode test data using saved mapping
# NOTE(review): categories that only occur in the test data are encoded as -1 by
# encode_categorical_features; confirm the model's embedding lookup can handle that.
test, _ = encode_categorical_features(df_test, cats=categorical_features, encoding_maps=encoding_maps)


# Use training median to impute nan values (computed once, before any scaling)
train_medians = train[numerical_features].median()
train = train.fillna(train_medians)
test = test.fillna(train_medians)


# Normalize numerical features.
# The statistics come from the training data only and are applied to BOTH frames, so
# that a given raw value maps to the same standardized value in train and test.
# (Standardizing test with its own mean/std puts the two sets on different scales.)
for col in numerical_features:
    m = train[col].mean()
    s = train[col].std()
    train[col] = (train[col] - m) / (s + 1e-8)
    test[col] = (test[col] - m) / (s + 1e-8)


# Define embedding sizes: (number of distinct codes, floor(sqrt(number of codes + 1)))
emb_c = {n: len(col.unique()) for n, col in train[categorical_features].items()}
embedding_sizes = {col: (len(train[col].unique()), np.int64(np.sqrt(len(train[col].unique()) + 1))) for col in categorical_features}

train_features = [c for c in train.columns if c not in non_feature_columns]
test_features = [c for c in test.columns if c not in ["ID"]]

In [None]:
# Shuffled K-fold splitter used by the training loop.
# NOTE(review): NUM_FOLDS is assigned in a later cell of this notebook, and KFold is not
# imported in the visible import cell — both must be in scope before this cell runs.
kf = KFold(n_splits=NUM_FOLDS, shuffle=True)

# Feature tensors: integer codes for the categorical columns, float32 for the numerical ones
X_cat = torch.LongTensor(train[categorical_features].to_numpy())
X_cont = torch.tensor(train[numerical_features].to_numpy(), dtype=torch.float32)

# Regression target (the y3 transform), as float32
y = torch.tensor(df_train["y3"].to_numpy(), dtype=torch.float32)

# Tokenize every target value with the vocabulary.
# NOTE(review): `vocab.to_int` receives 0-d tensors here (iteration over a tensor) —
# confirm it accepts them, or pass plain floats instead.
Y_token_ids = torch.tensor(
    np.array([vocab.to_int(y_val) for y_val in y]),
    dtype=torch.long,
)

# Containers for test predictions and out-of-fold predictions
predictions = []
oof_predictions = np.zeros(len(X_cat))

# Debugging: Print shapes
print(f"X_cat shape: {X_cat.shape}, X_cont shape: {X_cont.shape}, Y shape: {Y_token_ids.shape}")

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Number of categorical / continuous input columns
n_cat = len(categorical_features)
n_cont = len(numerical_features)


# Training hyper-parameters
NUM_REPEATS = 1
NUM_FOLDS = 5
NUM_EPOCHS = 5
batch_size = 256

# Per-token-position weights for the weighted sparse categorical cross entropy,
# built directly with a leading axis of size 1 (shape (1, 7))
loss_weights = torch.tensor([[0.3, 0.3, 0.09, 0.01, 0.01, 0.3, 0.5]], device=device)

In [None]:
# Lists for metrics (one entry per epoch of every fold)
train_losses, val_losses = [], []

# Start from clean accumulators so that re-running this cell never double-counts
oof_predictions = np.zeros(len(X_cat))
test_pred_sum = None  # running element-wise sum of the test predictions of every model
n_models = 0          # number of (repeat, fold) models trained so far

# Test tensors are identical for every fold, so build them once
test_cat = torch.LongTensor(test[categorical_features].to_numpy()).to(device)
test_cont = torch.tensor(test[numerical_features].to_numpy(), dtype=torch.float32).to(device)

for repeat in range(NUM_REPEATS):
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_cat)):
        print(f"\nFold {fold+1}/{NUM_FOLDS}")

        # Train-validation split for current fold
        X_cat_train, X_cat_val = X_cat[train_idx], X_cat[val_idx]
        X_cont_train, X_cont_val = X_cont[train_idx], X_cont[val_idx]
        Y_train, Y_val = Y_token_ids[train_idx], Y_token_ids[val_idx]

        # Create DataLoaders; the decoder input is the target sequence without its last token
        train_ds = TensorDataset(X_cat_train, X_cont_train, Y_train[:, :-1], Y_train)
        val_ds = TensorDataset(X_cat_val, X_cont_val, Y_val[:, :-1], Y_val)
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=len(val_ds))

        # Model initialization (a fresh model per fold)
        model = AttentionDecoder(
            encoder=DummyEncoder(),
            vocab=vocab,
            embedding_sizes=embedding_sizes,
            units=128,
            num_layers=1,
            num_heads=1,
            dropout=0.1,
            encoder_dim=n_cont
        ).to(device)

        # `optim` is never imported on its own; go through the `torch` namespace,
        # consistent with the scheduler below.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=NUM_EPOCHS
        )

        # Training loop
        for epoch in range(NUM_EPOCHS):
            model.train()
            total_loss = 0

            for batch in train_loader:
                x_cat_batch, x_cont_batch, dec_input, y_batch = (t.to(device) for t in batch)

                optimizer.zero_grad()
                outputs = model((x_cont_batch, x_cat_batch, dec_input))
                loss_tensor = weighted_sparse_categorical_crossentropy(y_batch, outputs, weights=loss_weights)
                loss = loss_tensor.mean()

                loss.backward()
                optimizer.step()
                scheduler.step()  # OneCycleLR is stepped once per batch
                total_loss += loss.item()

            train_loss = total_loss / len(train_loader)

            # Validation loop
            model.eval()
            val_loss = 0

            with torch.no_grad():
                for batch in val_loader:
                    x_cat_batch, x_cont_batch, dec_input, y_batch = (t.to(device) for t in batch)
                    outputs = model((x_cont_batch, x_cat_batch, dec_input))
                    val_loss += weighted_sparse_categorical_crossentropy(y_batch, outputs, weights=loss_weights).mean().item()

            val_loss = val_loss / len(val_loader)
            print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

            # Store metrics
            train_losses.append(train_loss)
            val_losses.append(val_loss)

        # Inference for this fold: evaluation mode and no autograd graph, so that
        # `.numpy()` is always legal and dropout is disabled.
        # NOTE(review): `decode` is given only the continuous features here, whereas the
        # forward pass also receives the categorical ones — confirm this is intended.
        model.eval()
        with torch.no_grad():
            # Store OOF predictions
            oof_predictions[val_idx] += model.decode(X_cont_val.to(device)).cpu().numpy().squeeze()
            # Test predictions
            test_preds = model.decode(test_cont).cpu().numpy()

        # Element-wise running sum (a list `+=` would extend the list instead of adding)
        test_pred_sum = test_preds if test_pred_sum is None else test_pred_sum + test_preds
        n_models += 1

if n_models == 0:
    raise RuntimeError("No model was trained: check NUM_REPEATS and the fold splitter.")

# Final aggregation
# Every row is validated once per repeat, so average the whole OOF vector over repeats
oof_predictions /= NUM_REPEATS
# Every trained model predicted the full test set, so average over all of them
predictions = test_pred_sum / n_models

# Final cross-validation results
print("\nCross-Validation Results:")
print(f"Train Loss: {np.mean(train_losses):.4f} ± {np.std(train_losses):.4f}")
print(f"Val Loss: {np.mean(val_losses):.4f} ± {np.std(val_losses):.4f}")