# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization, Activation, LeakyReLU
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam, Nadam
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.model_selection import train_test_split
import optuna
from optuna import Trial
import os
import matplotlib.pyplot as plt

# Column layout of the combined dataset: model inputs and the `valid_*` targets.
feature_cols = ['mH2', 'mHD', 'mAD', 'mHDp', 'alpha', 'L2', 'L8', 'vs', 'm22sq']
label_cols = ['valid_BFB', 'valid_Uni', 'valid_STU', 'valid_Higgs']

# The dataset is stored as a tab-separated file.
df = pd.read_csv('dataset.tsv', sep='\t')

# Inputs are taken as an explicit copy; targets are selected by column name.
X_selected = df.loc[:, feature_cols].copy()
y = df.loc[:, label_cols]

# Preprocessing function
def preprocess_data(X, apply_yj=False, apply_scaler=False):
    """Fit the requested transforms on ``X`` and return the transformed data.

    Args:
        X: Feature matrix handed to scikit-learn's ``fit_transform``.
        apply_yj: When true, apply a Yeo-Johnson ``PowerTransformer`` first.
        apply_scaler: When true, apply a ``StandardScaler`` afterwards.

    Returns:
        ``X`` itself when both flags are false, otherwise the output of the
        last transform that was applied.

    Note:
        Every call fits fresh transformers on the data it receives; nothing
        fitted is kept or returned.
    """
    pipeline = []
    if apply_yj:
        pipeline.append(PowerTransformer(method='yeo-johnson'))
    if apply_scaler:
        pipeline.append(StandardScaler())

    # Yeo-Johnson (if any) always runs before scaling (if any).
    for transformer in pipeline:
        X = transformer.fit_transform(X)
    return X

# Custom metrics with correct casting to avoid float/int issues
def subset_accuracy(y_true, y_pred):
    """Return the fraction of rows whose rounded predictions match on every label.

    Both tensors are cast to float32 and rounded; a row counts as correct only
    if all entries along axis 1 are equal.
    """
    rounded_targets = tf.round(tf.cast(y_true, tf.float32))
    rounded_predictions = tf.round(tf.cast(y_pred, tf.float32))
    label_matches = tf.equal(rounded_targets, rounded_predictions)
    row_is_exact = tf.reduce_all(label_matches, axis=1)
    return tf.reduce_mean(tf.cast(row_is_exact, tf.float32))

def hamming_loss(y_true, y_pred):
    """Return the fraction of individual entries where the rounded prediction differs from the target.

    Only the predictions are rounded; the targets are compared as given
    (after a cast to float32).
    """
    targets = tf.cast(y_true, tf.float32)
    hard_predictions = tf.round(tf.cast(y_pred, tf.float32))
    mismatches = tf.not_equal(targets, hard_predictions)
    return tf.reduce_mean(tf.cast(mismatches, tf.float32))

def matthews_correlation(y_true, y_pred):
    """Return the Matthews correlation coefficient pooled over all entries.

    Predictions are rounded before counting. The confusion counts are summed
    over every element of the tensors, so all labels share a single
    coefficient. Keras' epsilon is added to the denominator to avoid a
    division by zero.
    """
    actual = tf.cast(y_true, tf.float32)
    predicted = tf.cast(tf.round(y_pred), tf.float32)
    actual_neg = 1 - actual
    predicted_neg = 1 - predicted

    def _count(indicator):
        # Total number of entries flagged by the 0/1 indicator tensor.
        return tf.reduce_sum(tf.cast(indicator, tf.float32))

    tp = _count(actual * predicted)
    tn = _count(actual_neg * predicted_neg)
    fp = _count(actual_neg * predicted)
    fn = _count(actual * predicted_neg)

    covariance = tp * tn - fp * fn
    scale = tf.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return covariance / (scale + tf.keras.backend.epsilon())

def macro_f1_score(y_true, y_pred):
    """Return the unweighted mean of the F1 scores computed along axis 0.

    Predictions are rounded first. True positives, false positives and false
    negatives are summed over axis 0, an F1 value is derived for each
    remaining position, and those values are averaged. Keras' epsilon guards
    every division.
    """
    eps = tf.keras.backend.epsilon()
    actual = tf.cast(y_true, tf.float32)
    predicted = tf.cast(tf.round(y_pred), tf.float32)

    def _column_sum(indicator):
        # Count flagged entries separately for each position along axis 0.
        return tf.reduce_sum(tf.cast(indicator, tf.float32), axis=0)

    tp = _column_sum(actual * predicted)
    fp = _column_sum((1 - actual) * predicted)
    fn = _column_sum(actual * (1 - predicted))

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)

    return tf.reduce_mean(f1)

# Custom objective function for Optuna
def custom_objective(val_subset_accuracy, val_loss, val_hamming_loss, val_mcc, val_f1_score):
    """Combine validation metrics into one score where higher is better.

    Weights: 0.4 subset accuracy, 0.4 loss term, 0.2 rescaled MCC. The Hamming
    and F1 terms carry weight 0.0 but are still evaluated and added.

    Args:
        val_subset_accuracy: Validation subset accuracy.
        val_loss: Validation loss; mapped through ``inv / (1 + inv)`` with
            ``inv = 1 / (val_loss + 1e-8)`` so a smaller loss scores higher.
        val_hamming_loss: Validation Hamming loss (used as ``1 - value``).
        val_mcc: Validation Matthews correlation, shifted from [-1, 1] to [0, 1].
        val_f1_score: Validation macro F1 score.

    Returns:
        The weighted sum of the five terms.
    """
    reciprocal_loss = 1 / (val_loss + 1e-8)
    loss_term = reciprocal_loss / (1 + reciprocal_loss)  # squashed so lower loss -> closer to 1
    hamming_term = 1 - val_hamming_loss  # inverted so higher is better
    mcc_term = (val_mcc + 1) / 2  # [-1, 1] -> [0, 1]

    # Accumulate in the fixed order accuracy, loss, Hamming, MCC, F1.
    score = 0.4 * val_subset_accuracy
    score = score + 0.4 * loss_term
    score = score + 0.0 * hamming_term
    score = score + 0.2 * mcc_term
    score = score + 0.0 * val_f1_score
    return score

# Define the model creation function
def create_model(trial_or_params, input_shape, num_labels):
    """Build an uncompiled dense multi-label network together with its optimizer.

    Args:
        trial_or_params: Either an Optuna ``Trial``, in which case the
            hyperparameters are sampled through its ``suggest_*`` methods, or
            a mapping of already chosen hyperparameters with the keys
            ``n_layers``, ``n_units_{i}``, ``activation``, ``dropout_rate``,
            ``apply_batch_norm``, ``optimizer``, ``regularization``,
            ``learning_rate`` and, when ``regularization == 'l2'``,
            ``reg_lambda``.
        input_shape: Shape tuple passed to the Keras ``Input`` layer.
        num_labels: Number of sigmoid units in the output layer.

    Returns:
        A ``(model, optimizer)`` tuple. The model is not compiled.

    Raises:
        ValueError: If the optimizer name is neither ``'adam'`` nor ``'nadam'``.
    """
    if isinstance(trial_or_params, Trial):
        trial = trial_or_params
        n_layers = trial.suggest_int('n_layers', 2, 4)
        units = [trial.suggest_int(f'n_units_{i}', 128, 1024) for i in range(n_layers)]
        activation = trial.suggest_categorical('activation', ['relu', 'leaky_relu'])
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
        apply_batch_norm = trial.suggest_categorical('apply_batch_norm', [True, False])
        optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'nadam'])
        regularization = trial.suggest_categorical('regularization', [None, 'l2'])
        # reg_lambda is only sampled (and therefore only recorded) for L2 trials.
        reg_lambda = trial.suggest_float('reg_lambda', 1e-5, 1e-2) if regularization == 'l2' else 0.0
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

        print(f"Trial {trial.number}: n_layers={n_layers}, units={units}, activation={activation}, "
              f"dropout_rate={dropout_rate}, apply_batch_norm={apply_batch_norm}, optimizer={optimizer_name}, "
              f"regularization={regularization}, reg_lambda={reg_lambda}, learning_rate={learning_rate}")
    else:
        params = trial_or_params
        n_layers = params['n_layers']
        units = [params[f'n_units_{i}'] for i in range(n_layers)]
        activation = params['activation']
        dropout_rate = params['dropout_rate']
        apply_batch_norm = params['apply_batch_norm']
        optimizer_name = params['optimizer']
        regularization = params['regularization']
        # 'reg_lambda' may be absent from the mapping when no L2 penalty was chosen.
        reg_lambda = params['reg_lambda'] if regularization == 'l2' else 0.0
        learning_rate = params['learning_rate']

    # Reject unsupported optimizers before any layer is built; previously this
    # case fell through the if/elif and failed with UnboundLocalError at return.
    optimizer_classes = {'adam': Adam, 'nadam': Nadam}
    if optimizer_name not in optimizer_classes:
        raise ValueError(
            f"Unsupported optimizer {optimizer_name!r}; expected one of {sorted(optimizer_classes)}"
        )

    print(f"Input shape: {input_shape}")
    print(f"Number of labels: {num_labels}")

    model = Sequential()
    model.add(Input(shape=input_shape))

    # Each hidden block: Dense -> activation -> optional BatchNorm -> Dropout.
    for unit in units:
        model.add(Dense(unit, kernel_regularizer=l2(reg_lambda) if regularization == 'l2' else None))
        if activation == 'leaky_relu':
            model.add(LeakyReLU())
        else:
            # Any value other than 'leaky_relu' is treated as ReLU.
            model.add(Activation('relu'))
        if apply_batch_norm:
            model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # One independent sigmoid per label.
    model.add(Dense(num_labels, activation='sigmoid'))

    optimizer = optimizer_classes[optimizer_name](learning_rate=learning_rate)

    print("Model summary:")
    model.summary()

    return model, optimizer


class OptunaPruningCallback(tf.keras.callbacks.Callback):
    """Keras callback that reports a per-epoch custom score to an Optuna trial.

    After every epoch the validation metrics named in ``monitor_metrics`` are
    read from ``logs``, combined with ``custom_objective``, stored in
    ``custom_scores`` and reported to the trial. If the trial asks to be
    pruned, training is stopped and ``optuna.exceptions.TrialPruned`` is raised.
    """

    def __init__(self, trial: Trial, monitor_metrics: dict):
        """Store the trial and the mapping from metric role to ``logs`` key.

        ``monitor_metrics`` must contain the keys ``'subset_accuracy'``,
        ``'val_loss'``, ``'hamming_loss'``, ``'mcc'`` and ``'f1_score'``.
        """
        super().__init__()
        self.trial = trial
        self.monitor_metrics = monitor_metrics
        # Custom score of every epoch for which all metrics were available.
        self.custom_scores = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the finished epoch, report it, and prune the trial if requested."""
        if logs is None:
            return

        # Same order as the positional parameters of custom_objective.
        roles = ('subset_accuracy', 'val_loss', 'hamming_loss', 'mcc', 'f1_score')
        metric_values = [logs.get(self.monitor_metrics[role]) for role in roles]

        # Skip the epoch entirely when any metric is missing from the logs.
        if any(value is None for value in metric_values):
            return

        custom_score = custom_objective(*metric_values)
        self.custom_scores.append(custom_score)
        print(f' Epoch {epoch + 1}: Custom Score = {custom_score}')

        self.trial.report(custom_score, step=epoch)

        if not self.trial.should_prune():
            return

        self.model.stop_training = True
        raise optuna.exceptions.TrialPruned(f"Trial was pruned at epoch {epoch} due to suboptimal performance.")


# Optuna study that maximizes the reported score, pruned with Hyperband
# (resources between 1 and 50, reduction factor 3).
pruner = optuna.pruners.HyperbandPruner(
    min_resource=1,
    max_resource=50,
    reduction_factor=3,
)
study = optuna.create_study(pruner=pruner, direction='maximize')

from sklearn.model_selection import train_test_split

# The raw (unprocessed) features are split here; the preprocessing options are
# chosen later, once per trial.

# Step 1: hold out 15% of the data as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_selected,
    y,
    random_state=42,
    test_size=0.15,
)

# Step 2: take 17.6% of the remaining 85% for validation (about 15% of the
# full dataset), leaving roughly 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    random_state=42,
    test_size=0.176,
)

# Report the resulting split sizes as a sanity check.
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Validation set size: {X_val.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Hyperparameter search with Optuna's ask-and-tell interface, using X_train
# for fitting and X_val for scoring.
from tensorflow.keras.losses import BinaryFocalCrossentropy


def _fit_and_apply_preprocessing(X_fit, X_others, apply_yj=False, apply_scaler=False):
    """Fit the selected transforms on ``X_fit`` only and apply them to every split.

    Args:
        X_fit: Data used to fit the transformers (the training split).
        X_others: Iterable of further splits that are transformed with the
            already fitted transformers, never re-fitted.
        apply_yj: When true, apply a Yeo-Johnson ``PowerTransformer`` first.
        apply_scaler: When true, apply a ``StandardScaler`` afterwards.

    Returns:
        ``(X_fit_transformed, [other_transformed, ...])``. When both flags are
        false the inputs are returned unchanged.
    """
    steps = []
    if apply_yj:
        steps.append(PowerTransformer(method='yeo-johnson'))
    if apply_scaler:
        steps.append(StandardScaler())

    fitted_on = X_fit
    others = list(X_others)
    for step in steps:
        fitted_on = step.fit_transform(fitted_on)
        others = [step.transform(split) for split in others]
    return fitted_on, others


# Keys of the Keras `logs` dict that the pruning callback reads each epoch.
monitor_metrics = {
    'subset_accuracy': 'val_subset_accuracy',
    'val_loss': 'val_loss',
    'hamming_loss': 'val_hamming_loss',
    'mcc': 'val_matthews_correlation',
    'f1_score': 'val_macro_f1_score'
}

for trial_number in range(10):  # Set the number of trials you want to run
    # Release the Keras state of the previous trial so memory does not
    # accumulate across trials.
    tf.keras.backend.clear_session()

    trial = study.ask()

    batch_size = trial.suggest_int('batch_size', 512, 1024)

    # Suggest whether to apply Yeo-Johnson transformation and/or scaling
    apply_yj = trial.suggest_categorical('apply_yj', [True, False])
    apply_scaler = trial.suggest_categorical('apply_scaler', [True, False])

    # Fit the preprocessing on the training split only and reuse the fitted
    # transformers for validation and test. Fitting each split separately
    # would transform every split with different statistics, so the model
    # would be evaluated on differently distributed inputs.
    X_train_processed, (X_val_processed, X_test_processed) = _fit_and_apply_preprocessing(
        X_train, (X_val, X_test), apply_yj, apply_scaler
    )

    print(f"\nTrial {trial_number}:")
    print(f"Batch size: {batch_size}")
    print(f" - Apply Yeo-Johnson: {apply_yj}")
    print(f" - Apply Scaler: {apply_scaler}")
    print(f"X_train_processed shape: {X_train_processed.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"X_val_processed shape: {X_val_processed.shape}")
    print(f"y_val shape: {y_val.shape}")

    model, optimizer = create_model(trial, input_shape=(X_train_processed.shape[1],), num_labels=y_train.shape[1])

    # Compile the model with BinaryFocalCrossentropy
    model.compile(
        optimizer=optimizer,
        loss=BinaryFocalCrossentropy(gamma=2.0, from_logits=False),
        metrics=[subset_accuracy, hamming_loss, matthews_correlation, macro_f1_score]
    )

    optuna_callback = OptunaPruningCallback(trial, monitor_metrics=monitor_metrics)
    try:
        history = model.fit(X_train_processed, y_train,
                            validation_data=(X_val_processed, y_val),
                            batch_size=batch_size,
                            epochs=50,
                            verbose=1,
                            callbacks=[optuna_callback])
    except optuna.exceptions.TrialPruned as e:
        study.tell(trial, state=optuna.trial.TrialState.PRUNED)
        print(str(e))
        continue
    except Exception:
        # Record the failure so the trial is not left unfinished in the
        # study, then surface the original error.
        study.tell(trial, state=optuna.trial.TrialState.FAIL)
        raise

    # NOTE(review): each metric's best value is taken independently, so the
    # final score can combine values from different epochs and need not match
    # any per-epoch score reported to the pruner. Confirm this is intended.
    val_subset_accuracy = np.max(history.history['val_subset_accuracy'])
    val_loss = np.min(history.history['val_loss'])
    val_hamming_loss = np.min(history.history['val_hamming_loss'])
    val_mcc = np.max(history.history['val_matthews_correlation'])
    val_f1_score = np.max(history.history['val_macro_f1_score'])

    custom_score = custom_objective(val_subset_accuracy, val_loss, val_hamming_loss, val_mcc, val_f1_score)
    study.tell(trial, custom_score)

# Report the best hyperparameters found by the study.
best_params = study.best_params
print("Best hyperparameters: ", best_params)

# Persist every trial of the study as CSV.
results_dir = 'optuna_trials_file'
os.makedirs(results_dir, exist_ok=True)
trials_table = study.trials_dataframe()
trials_table.to_csv(os.path.join('optuna_trials_file', 'optuna_study_results.csv'))


def _finish_and_save_figure(title, y_label, file_name):
    """Label the current figure, save it as PNG in the results directory and show it."""
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend(loc='best')
    plt.grid(True)
    plt.savefig(os.path.join('optuna_trials_file', file_name), dpi=300, bbox_inches='tight')
    plt.show()


# NOTE(review): `history` and `optuna_callback` are whatever the trial loop
# assigned last, so these curves describe the most recent fit and not
# necessarily the best trial.

# One figure per (training metric, validation metric) pair.
metrics_pairs = [
    ('subset_accuracy', 'val_subset_accuracy'),
    ('hamming_loss', 'val_hamming_loss'),
    ('matthews_correlation', 'val_matthews_correlation'),
    ('macro_f1_score', 'val_macro_f1_score'),
    ('loss', 'val_loss')
]

for train_metric, val_metric in metrics_pairs:
    plt.figure()
    plt.plot(history.history[train_metric], label=f'Training {train_metric}')
    plt.plot(history.history[val_metric], label=f'Validation {val_metric}')
    _finish_and_save_figure(
        f'{train_metric} vs. {val_metric}',
        train_metric,
        f'{train_metric}_vs_{val_metric}_history.png',
    )

# Per-epoch custom score recorded by the pruning callback.
plt.figure()
plt.plot(optuna_callback.custom_scores, label='Custom Score')
_finish_and_save_figure('Custom Score over Epochs', 'Custom Score', 'custom_score_history.png')