In [6]:
import sys
from PyCROSL.CRO_SL import *
from PyCROSL.AbsObjectiveFunc import *
from PyCROSL.SubstrateReal import *
from PyCROSL.SubstrateInt import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import warnings
import sys
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
# Import the ml_prediction class from the separate file
from ml_prediction import ml_prediction


In [7]:
# 1 - FEATURE SELECTION SET-UP #
# Aim: Identify optimal combinations of potential predictors 
# Method: An optimization algorithm is used to test various combinations of predictors used to recreate the HW record.
# The algorithm optimizes the F1-score of validation period.
num_eval = 100
max_lag = 30
max_duration = 5
first_train = 1981
last_train = 2010
first_test = last_train
last_test = 2022
sol_data = pd.DataFrame(columns=['CV', 'Test', 'Sol'])
indiv_file = 'optimisation_output.csv'
solution_file = 'optimisation_output_sol.csv'
sol_data.to_csv(indiv_file, sep=' ', header=sol_data.columns, index=None)


In [8]:

# 3 - LOAD DATA #
# Potential Predictor dataset
pred_dataframe = pd.read_csv('/work/bk1318/b382634/ERA5/Predictors/Archive/Predictors_dataset_May24.csv', index_col=0)
pred_dataframe.index = pd.to_datetime(pred_dataframe.index)

# Target HW occurrence data
target_dataset = pd.read_csv('/work/bk1318/b382634/ERA5/HW_Indicators/tmax_NDQ90_LakeComo_Valle_MJJA_period19402022_clim19812010.csv', index_col=0)
target_dataset.index = pd.to_datetime(target_dataset.index)

# Define train and test indices
train_indices = (target_dataset.index.year >= first_train) & (target_dataset.index.year <= last_train)
test_indices = (target_dataset.index.year > first_train) & (target_dataset.index.year <= last_test)

# Set target column name
target_dataset = target_dataset.rename(columns={target_dataset.columns[0]: 'Target'})

In [9]:
# Initialize the objective function with size = 3 * number of predictors
objfunc = ml_prediction(
    size=3*pred_dataframe.shape[1],
    pred_dataframe=pred_dataframe,
    target_dataset=target_dataset,
    train_indices=train_indices,
    test_indices=test_indices,
    indiv_file=indiv_file
)
# Define CRO-SL parameters
params = {
    "popSize": 100,
    "rho": 0.6,
    "Fb": 0.98,
    "Fd": 0.2,
    "Pd": 0.8,
    "k": 3,
    "K": 20,
    "group_subs": True,

    "stop_cond": "Neval",
    "time_limit": 4000.0,
    "Ngen": 10000,
    "Neval": num_eval,
    "fit_target": 1000, 

    "verbose": True,
    "v_timer": 1,
    "Njobs": 1,

    "dynamic": True,
    "dyn_method": "success",
    "dyn_metric": "avg",
    "dyn_steps": 10,
    "prob_amp": 0.01
}

# Define operators for the algorithm
operators = [
    SubstrateInt("BLXalpha", {"F": 0.8}),
    SubstrateInt("Multipoint"),
    SubstrateInt("HS", {"F": 0.7, "Cr": 0.8, "Par": 0.2}),
    SubstrateInt("Xor"),
]

In [10]:
# Initialize and run the optimization algorithm
cro_alg = CRO_SL(objfunc, operators, params)
solution, obj_value = cro_alg.optimize()

# Save final solution
solution.tofile(solution_file, sep=',')
print(f"Optimization completed. Best objective value: {obj_value}")
print(f"Solution saved to {solution_file}")

CV F1: 0.08055310981171557, Test F1: 0.6607508532423209
CV F1: 0.09908425872183493, Test F1: 0.6899930020993702
CV F1: 0.05037005303083623, Test F1: 0.6520924422236103
CV F1: 0.08299018503196451, Test F1: 0.6541353383458647
CV F1: 0.07680049554764133, Test F1: 0.6340533672172808
CV F1: 0.08425694920948656, Test F1: 0.6861924686192469
CV F1: 0.07188353216854759, Test F1: 0.6535764375876578
CV F1: 0.06429271824228575, Test F1: 0.6731571627260083
CV F1: 0.07990412831787616, Test F1: 0.6250764525993884
CV F1: 0.08467446531962661, Test F1: 0.6645885286783042
CV F1: 0.0649089004909161, Test F1: 0.6428571428571429
CV F1: 0.07765983313169339, Test F1: 0.6381371932032724
CV F1: 0.08717632613306012, Test F1: 0.6606099935107074
CV F1: 0.06709461652657815, Test F1: 0.6724832214765101
CV F1: 0.06513124768408682, Test F1: 0.654320987654321
CV F1: 0.07659003392623695, Test F1: 0.6552856204858831
CV F1: 0.09756328253844027, Test F1: 0.7028795811518325
CV F1: 0.05319950278603256, Test F1: 0.64790494058