# Training nested MLR models with Sequential Forward Feature Selection
We plan to find the best set of variables that explain the most of the data without any overfitting. This is important for our research problem as we are in a particularly low sample regime, which necessitates simplifying the models. The most straight-forward approach model simplification can take is through removing variables that have redundant information, as in the inclusion of such variables would hurt the generalizability of the machine learning models. Including such variables would quickly overparameterize the model and lead to an increased risk of model overfitting.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd
import glob, os, gc
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.model_selection import KFold

# Add the path to the directory containing the module
import sys
sys.path.append('../../')
from util.ml import baseline, metrics, nestedMLR

from properscoring import crps_ensemble  # For CRPS calculation
from sklearn.utils import resample  # For bootstrapping

## Read in files and settings

In [3]:
# Find the folder name organized by seed number
seed_doc = sorted(glob.glob('../../datas/seed_revised_*/'))[0]

# Load the data
# Load the time series data
df = pd.read_csv(seed_doc +'X_train_ts_all.csv')
df_valid = pd.read_csv(seed_doc +'X_validation_ts_all.csv')
df_test = pd.read_csv(seed_doc +'X_test_ts_all.csv')

# Find the name for each column
column_names = ([obj.split('_step_')[0] for obj in df.columns])
# Unique names in the column name list
unique_names = list(set(column_names))
# Remove strings with large_scale
unique_names_filt = [var for var in unique_names if "large_scale" not in var]

In [4]:
# Now we read in the y data for every fold
y_train = []
y_val = []
for i in range(7):
    y_train.append(baseline.load_pickle(f'../../datas/proc/sfs/y/ytrain_split_{i}.pkl'))
    y_val.append(baseline.load_pickle(f'../../datas/proc/sfs/y/yval_split_{i}.pkl'))

pcs_train = baseline.load_pickle(f'../../datas/proc/sfs/pcsall_train.pkl')
pcs_val = baseline.load_pickle(f'../../datas/proc/sfs/pcsall_valid.pkl')

## Sequential Forward Feature Selection

In [5]:
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [6]:
# List of variable names
var_names = list(pcs_train[0].keys())

# Empty list to store the chosen variables
selected_vars = []
# List to store all the variable choices
remaining_vars = [var for var in var_names if "large_scale" not in var]
# Initial RMSE to beat is infinity
best_val_rmse = float('inf')
# Target transformation category
target_cat = 'cdf'
# Seed
seed = 42
# Hyperparameter space for the random forest
param_grid = {
    "n_estimators": [100],
    "max_depth": [None],
    "min_samples_split": [2],
}

#### NestedMLR

In [20]:
while remaining_vars:
    best_var = None
    best_mean_val_rmse = best_val_rmse

    # Test each variable
    for var in tqdm(remaining_vars):
        nPC = (pcs_train[0][var].shape[1]) # Number of PCs for this variable
        # Create a linear regression model
        for pc_index in range(nPC):
            mean_score = []
            for iseed in range(7):
                candidate_features = []
                for sel_var in selected_vars:
                    candidate_features.append(pcs_train[iseed][sel_var])
                candidate_features.append(pcs_train[iseed][var][:,[pc_index]])
                X_train_subset = np.hstack(candidate_features)
                X_val_subset = np.hstack([pcs_val[iseed][sel_var] for sel_var in selected_vars] + 
                                         [pcs_val[iseed][var][:,[pc_index]]])
                
                # Train a model and evaluate
                model = linear_model.LinearRegression()
                model.fit(X_train_subset, y_train[iseed][target_cat])
                y_pred = model.predict(X_val_subset)
                # Calculate the validation RMSE
                val_rmse = mean_squared_error(y_val[iseed][target_cat], y_pred, squared=False)
                # Store the validation RMSE for this fold
                mean_score.append(val_rmse)
            
            # Get the mean validation RMSE across folds
            mean_val_rmse = np.mean(np.asarray(mean_score))
            print(mean_val_rmse, best_mean_val_rmse, best_val_rmse)
            # Track and update the best RMSE score
            if mean_val_rmse < best_mean_val_rmse:
                best_mean_val_rmse = mean_val_rmse
                best_var = var
                
    print('_______')
    print((best_mean_val_rmse, best_val_rmse))
    # Automatically select all PCs of the best variable
    if best_var and best_mean_val_rmse <= best_val_rmse:
        selected_vars.append(best_var)
        remaining_vars.remove(best_var)
        best_val_rmse = best_mean_val_rmse
        print('_______')
        print(selected_vars)
        #sys.stdout.flush()
    else:
        break




 10%|█         | 2/20 [00:00<00:01, 14.69it/s]

1.3699707467120632 inf inf
1.3440133613245453 1.3699707467120632 inf
1.347035061466956 1.3440133613245453 inf
1.4163983021275197 1.3440133613245453 inf
1.3819491231693808 1.3440133613245453 inf
1.3875197870849443 1.3440133613245453 inf
1.374648307775648 1.3440133613245453 inf
1.3498558253494406 1.3440133613245453 inf
1.393789981084739 1.3440133613245453 inf
1.3679230192745546 1.3440133613245453 inf
1.3563802380297199 1.3440133613245453 inf
1.368095252209754 1.3440133613245453 inf
1.3373129467827416 1.3440133613245453 inf
1.345022172116129 1.3373129467827416 inf
1.3617760401051098 1.3373129467827416 inf
1.3424636673853099 1.3373129467827416 inf
1.3431818859521816 1.3373129467827416 inf
1.3651930725849035 1.3373129467827416 inf
1.3713592542877353 1.3373129467827416 inf
1.3368287102423315 1.3373129467827416 inf
1.3428526696716996 1.3368287102423315 inf
1.3350572278627306 1.3368287102423315 inf
1.3579844496439606 1.3350572278627306 inf
1.3450691312188874 1.3350572278627306 inf
1.3718239603

 30%|███       | 6/20 [00:00<00:00, 15.64it/s]

1.3531532975023743 1.3321468108100498 inf
1.3499927351282952 1.3321468108100498 inf
1.352070734423822 1.3321468108100498 inf
1.3708185312625185 1.3321468108100498 inf
1.357943387966221 1.3321468108100498 inf
1.3559409389548596 1.3321468108100498 inf
1.354885824091479 1.3321468108100498 inf
1.343020474264694 1.3321468108100498 inf
1.3875550371649474 1.3321468108100498 inf
1.3534917174456988 1.3321468108100498 inf
1.3899540456964805 1.3321468108100498 inf
1.3578784518083016 1.3321468108100498 inf
1.3542731821944696 1.3321468108100498 inf
1.3581804477037491 1.3321468108100498 inf
1.3750767633082768 1.3321468108100498 inf
1.3791972693155454 1.3321468108100498 inf
1.3863882526101994 1.3321468108100498 inf
1.391277694620854 1.3321468108100498 inf
1.348220996062643 1.3321468108100498 inf
1.3441252975535576 1.3321468108100498 inf
1.3378475574608946 1.3321468108100498 inf
1.359441213866933 1.3321468108100498 inf
1.3646382122717549 1.3321468108100498 inf
1.3606850909945472 1.3321468108100498 inf

 40%|████      | 8/20 [00:00<00:00, 15.76it/s]

1.3370062898734798 1.3321468108100498 inf
1.3598820776894343 1.3321468108100498 inf
1.4145593117472965 1.3321468108100498 inf
1.345293280012841 1.3321468108100498 inf
1.4101025152018927 1.3321468108100498 inf
1.347970120310031 1.3321468108100498 inf
1.3593298699876926 1.3321468108100498 inf
1.3548492750419927 1.3321468108100498 inf
1.3474434336435708 1.3321468108100498 inf
1.3446151424914372 1.3321468108100498 inf
1.366613834600889 1.3321468108100498 inf
1.3388710666202839 1.3321468108100498 inf
1.3465513680949341 1.3321468108100498 inf
1.3703178194196035 1.3321468108100498 inf
1.3358464706106985 1.3321468108100498 inf
1.3740509308230269 1.3321468108100498 inf
1.3410104964188327 1.3321468108100498 inf
1.353055673012955 1.3321468108100498 inf
1.3741986085913303 1.3321468108100498 inf
1.3747623544269085 1.3321468108100498 inf
1.3474934114573531 1.3321468108100498 inf
1.361942809123367 1.3321468108100498 inf
1.3586837185605678 1.3321468108100498 inf
1.3333347310426793 1.3321468108100498 i

 60%|██████    | 12/20 [00:00<00:00, 15.41it/s]

1.3401994691642092 1.3321468108100498 inf
1.3699708095171101 1.3321468108100498 inf
1.344013393751073 1.3321468108100498 inf
1.347035146759347 1.3321468108100498 inf
1.4163984314834208 1.3321468108100498 inf
1.381949453502136 1.3321468108100498 inf
1.387520072977124 1.3321468108100498 inf
1.3746486761688135 1.3321468108100498 inf
1.349855587484288 1.3321468108100498 inf
1.3937903821851445 1.3321468108100498 inf
1.3679225846582341 1.3321468108100498 inf
1.3563809464394083 1.3321468108100498 inf
1.3875560488420198 1.3321468108100498 inf
1.3534915071604368 1.3321468108100498 inf
1.389954224431705 1.3321468108100498 inf
1.3578796753006606 1.3321468108100498 inf
1.3542743542560334 1.3321468108100498 inf
1.358184533636702 1.3321468108100498 inf
1.3750757938577622 1.3321468108100498 inf
1.3792071232703085 1.3321468108100498 inf
1.3863789867903509 1.3321468108100498 inf
1.3912843907297048 1.3321468108100498 inf
1.3482185356629082 1.3321468108100498 inf
1.331151924456236 1.3321468108100498 inf


 80%|████████  | 16/20 [00:01<00:00, 15.42it/s]

1.3483360399214916 1.3297259944826452 inf
1.3547630897156249 1.3297259944826452 inf
1.3480073859624653 1.3297259944826452 inf
1.377212059538005 1.3297259944826452 inf
1.3422919480345914 1.3297259944826452 inf
1.3694979428609584 1.3297259944826452 inf
1.3487569580644045 1.3297259944826452 inf
1.3360772231126123 1.3297259944826452 inf
1.3630271693488942 1.3297259944826452 inf
1.352050720270604 1.3297259944826452 inf
1.3518550078054834 1.3297259944826452 inf
1.3427913017190816 1.3297259944826452 inf
1.3611150835032304 1.3297259944826452 inf
1.343889933121263 1.3297259944826452 inf
1.3681835316902884 1.3297259944826452 inf
1.356007757702584 1.3297259944826452 inf
1.3565282697789425 1.3297259944826452 inf
1.3502530488971887 1.3297259944826452 inf
1.342723334095114 1.3297259944826452 inf
1.3435121364150926 1.3297259944826452 inf
1.3711043175663673 1.3297259944826452 inf
1.3727875265593084 1.3297259944826452 inf
1.3498024397339916 1.3297259944826452 inf
1.3699196159283835 1.3297259944826452 i

 85%|████████▌ | 17/20 [00:01<00:00, 15.38it/s]

1.345022172116129 1.3297259944826452 inf
1.3617760401051098 1.3297259944826452 inf
1.3424636673853099 1.3297259944826452 inf
1.3431818859521816 1.3297259944826452 inf
1.3651930725849035 1.3297259944826452 inf
1.3713592542877353 1.3297259944826452 inf
1.3368287102423315 1.3297259944826452 inf
1.3428526696716996 1.3297259944826452 inf
1.3350572278627306 1.3297259944826452 inf





KeyboardInterrupt: 

### RF

#### all

In [35]:
# Initialize variables
selected_pcs = []  # List to store selected PCs
remaining_vars = [var for var in var_names if "large_scale" not in var]  # All variables initially available for selection
best_val_rmse = float('inf')  # Start with a very high RMSE value

while remaining_vars:
    best_var = None
    best_pc_index = None
    best_iseed = None
    best_mean_val_rmse = best_val_rmse

    # Test each variable
    for var in tqdm(remaining_vars):
        nPC = pcs_train[0][var].shape[1]  # Number of PCs for this variable

        # Test each PC of the variable
        for pc_index in range(nPC):
            mean_score = []

            # Evaluate using all seeds
            for iseed in range(7):
                # Prepare data for the current candidate PC
                candidate_features = [pcs_train[iseed][sel_var][:, [pc_idx]] 
                                      for sel_var, pc_idx in selected_pcs]
                candidate_features.append(pcs_train[iseed][var][:, [pc_index]])
                X_train_subset = np.hstack(candidate_features)
                X_val_subset = np.hstack(
                    [pcs_val[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs] +
                    [pcs_val[iseed][var][:, [pc_index]]]
                )

                # Train a model and evaluate
                model = RandomForestRegressor(random_state=seed)
                #grid_search = GridSearchCV(
                #    estimator=model,
                #    param_grid=param_grid,
                #    scoring="neg_root_mean_squared_error",  # Use RMSE as scoring metric
                #    cv=3,  # Inner cross-validation
                #)
                #model.fit(X_train_subset, y_train[iseed][target_cat])
                
                # Use the best model to predict
                #model = grid_search.best_estimator_
                model.fit(X_train_subset, y_train[iseed][target_cat])
                y_pred = model.predict(X_val_subset)

                # Calculate the validation RMSE
                val_rmse = mean_squared_error(y_val[iseed][target_cat], y_pred, squared=False)
                mean_score.append(val_rmse)

                # Track the best seed for this PC
                if val_rmse == min(mean_score):  # Check if this seed gives the best RMSE
                    current_best_iseed = iseed

            # Compute the mean validation RMSE across seeds
            mean_val_rmse = np.mean(mean_score)

            # Update the best PC if this one performs better
            if mean_val_rmse < best_mean_val_rmse:
                best_mean_val_rmse = mean_val_rmse
                best_var = var
                best_pc_index = pc_index
                best_iseed = current_best_iseed

    # Check if we found a PC that improves validation RMSE
    if best_var and best_mean_val_rmse < best_val_rmse:
        # Add the best-performing PC to the selected set
        selected_pcs.append((best_var, best_pc_index))
        remaining_vars.remove(best_var)
        best_val_rmse = best_mean_val_rmse
        print(f"Selected PC: {best_var}_PC{best_pc_index + 1}, Mean Val RMSE: {best_mean_val_rmse}")
    else:
        print("No improvement. Stopping feature selection.")
        break


100%|██████████| 184/184 [18:46<00:00,  6.12s/it]


Selected PC: convective_available_potential_energy_min_PC3, Mean Val RMSE: 1.2882816457831918


100%|██████████| 183/183 [18:53<00:00,  6.19s/it]

No improvement. Stopping feature selection.





#### 30

In [10]:
# Initialize variables
selected_pcs = []  # List to store selected PCs
remaining_vars = list(pcs_train[0].keys())  # All variables initially available for selection
best_val_rmse = float('inf')  # Start with a very high RMSE value

while remaining_vars:
    best_var = None
    best_pc_index = None
    best_mean_val_rmse = best_val_rmse

    # Test each variable
    for var in tqdm(remaining_vars):
        nPC = pcs_train[0][var].shape[1]  # Number of PCs for this variable

        # Test each PC of the variable
        for pc_index in range(nPC):
            mean_score = []

            # Evaluate using all seeds
            for iseed in range(7):
                # Prepare data for the current candidate PC
                candidate_features = [pcs_train[iseed][sel_var][:, [pc_idx]] 
                                      for sel_var, pc_idx in selected_pcs]
                candidate_features.append(pcs_train[iseed][var][:, [pc_index]])
                X_train_subset = np.hstack(candidate_features)
                X_val_subset = np.hstack(
                    [pcs_val[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs] +
                    [pcs_val[iseed][var][:, [pc_index]]]
                )

                # Train a model and evaluate
                model = RandomForestRegressor(random_state=seed)
                grid_search = GridSearchCV(
                    estimator=model,
                    param_grid=param_grid,
                    scoring="neg_root_mean_squared_error",  # Use RMSE as scoring metric
                    cv=3,  # Inner cross-validation
                )
                grid_search.fit(X_train_subset, y_train[iseed][target_cat])
                
                # Use the best model to predict
                model = grid_search.best_estimator_
                #model.fit(X_train_subset, y_train[iseed][target_cat])
                y_pred = model.predict(X_val_subset)

                # Calculate the validation RMSE
                val_rmse = mean_squared_error(y_val[iseed][target_cat], y_pred, squared=False)
                mean_score.append(val_rmse)

            # Compute the mean validation RMSE across seeds
            mean_val_rmse = np.mean(mean_score)

            # Update the best PC if this one performs better
            if mean_val_rmse < best_mean_val_rmse:
                best_mean_val_rmse = mean_val_rmse
                best_var = var
                best_pc_index = pc_index

    # Check if we found a PC that improves validation RMSE
    if best_var and best_mean_val_rmse < best_val_rmse:
        # Add the best-performing PC to the selected set
        selected_pcs.append((best_var, best_pc_index))
        remaining_vars.remove(best_var)
        best_val_rmse = best_mean_val_rmse
        print(f"Selected PC: {best_var}_PC{best_pc_index + 1}, Mean Val RMSE: {best_mean_val_rmse}")
    else:
        print("No improvement. Stopping feature selection.")
        break


100%|██████████| 30/30 [12:06<00:00, 24.21s/it]


Selected PC: surface_latent_heat_flux_min_PC2, Mean Val RMSE: 1.4446984006301278


100%|██████████| 29/29 [11:57<00:00, 24.73s/it]


Selected PC: geopotential_500_min_PC1, Mean Val RMSE: 1.334079775199497


100%|██████████| 28/28 [11:51<00:00, 25.40s/it]


Selected PC: 10m_magnitude_of_wind_mean_PC2, Mean Val RMSE: 1.3099424057631899


100%|██████████| 27/27 [11:36<00:00, 25.80s/it]


Selected PC: geopotential_1000_max_PC8, Mean Val RMSE: 1.2878110422551654


100%|██████████| 26/26 [11:15<00:00, 26.00s/it]


Selected PC: 100m_magnitude_of_wind_mean_PC10, Mean Val RMSE: 1.2816968406378675


100%|██████████| 25/25 [10:59<00:00, 26.40s/it]

No improvement. Stopping feature selection.





### Linear

#### 20

#### all

In [7]:
# Initialize variables
selected_pcs = []  # List to store selected PCs
remaining_vars = [var for var in var_names if "large_scale" not in var] # All variables initially available for selection
best_val_rmse = float('inf')  # Start with a very high RMSE value

while remaining_vars:
    best_var = None
    best_pc_index = None
    #best_iseed = None
    best_mean_val_rmse = best_val_rmse

    # Test each variable
    for var in remaining_vars:
        nPC = pcs_train[0][var].shape[1]  # Number of PCs for this variable

        # Test each PC of the variable
        for pc_index in range(nPC):
            mean_score = []

            # Evaluate using all seeds
            for iseed in range(7):
                # Prepare data for the current candidate PC
                candidate_features = [pcs_train[iseed][sel_var][:, [pc_idx]] 
                                      for sel_var, pc_idx in selected_pcs]
                candidate_features.append(pcs_train[iseed][var][:, [pc_index]])
                X_train_subset = np.hstack(candidate_features)
                X_val_subset = np.hstack(
                    [pcs_val[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs] +
                    [pcs_val[iseed][var][:, [pc_index]]]
                )

                # Train a model and evaluate
                model = linear_model.LinearRegression()
                model.fit(X_train_subset, y_train[iseed][target_cat])
                y_pred = model.predict(X_val_subset)

                # Calculate the validation RMSE
                val_rmse = mean_squared_error(y_val[iseed][target_cat], y_pred, squared=False)
                mean_score.append(val_rmse)

                # Track the best seed for this PC
                #if val_rmse == min(mean_score):  # Check if this seed gives the best RMSE
                #    current_best_iseed = iseed

            # Compute the mean validation RMSE across seeds
            mean_val_rmse = np.mean(mean_score)

            # Update the best PC if this one performs better
            if mean_val_rmse < best_mean_val_rmse:
                best_mean_val_rmse = mean_val_rmse
                best_var = var
                best_pc_index = pc_index
                #best_iseed = current_best_iseed

    # Check if we found a PC that improves validation RMSE
    if best_var and best_mean_val_rmse < best_val_rmse:
        # Add the best-performing PC to the selected set
        selected_pcs.append((best_var, best_pc_index))
        remaining_vars.remove(best_var)
        best_val_rmse = best_mean_val_rmse
        print(f"Selected PC: {best_var}_PC{best_pc_index + 1}, Mean Val RMSE: {best_mean_val_rmse}")
    else:
        print("No improvement. Stopping feature selection.")
        break


Selected PC: relative_humidity_850_mean_PC11, Mean Val RMSE: 1.2550392845893388
Selected PC: relative_humidity_900_max_PC1, Mean Val RMSE: 1.241884272191761
Selected PC: relative_humidity_30_min_PC8, Mean Val RMSE: 1.2282240397342457
Selected PC: relative_humidity_850_std_PC8, Mean Val RMSE: 1.2182808263700438
Selected PC: relative_humidity_950_min_PC3, Mean Val RMSE: 1.2075029518397966
Selected PC: k_index_mean_PC8, Mean Val RMSE: 1.1994924244587732
Selected PC: relative_humidity_925_mean_PC8, Mean Val RMSE: 1.189351413217784
Selected PC: relative_humidity_250_max_PC5, Mean Val RMSE: 1.18257838346893
Selected PC: relative_humidity_100_mean_PC8, Mean Val RMSE: 1.1782313109031135
Selected PC: geopotential_1000_mean_PC6, Mean Val RMSE: 1.1719048086484978
Selected PC: relative_humidity_1000_mean_PC11, Mean Val RMSE: 1.1658931282293612
Selected PC: surface_sensible_heat_flux_max_PC4, Mean Val RMSE: 1.1617247904733603
Selected PC: relative_humidity_200_max_PC10, Mean Val RMSE: 1.15584627297

In [19]:
def train_based_on_selected(pcs_train,pcs_val,y_train,iseed,target_cat,selected_pcs):
    X_train_subset = np.hstack(
        [pcs_train[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs]
        )
    
    X_val_subset = np.hstack(
        [pcs_val[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs]
        )
    
    model = linear_model.LinearRegression()
    model.fit(X_train_subset, y_train[iseed][target_cat])
    return model, X_train_subset, X_val_subset

In [22]:
models = []
valsrmse = []
for i in range(7):
    model, X_train_subset, X_val_subset = train_based_on_selected(pcs_train,pcs_val,y_train,i,target_cat,selected_pcs)
    y_pred = model.predict(X_val_subset)
    models.append(model)
    valsrmse.append(mean_squared_error(y_val[i][target_cat], y_pred, squared=False))

In [24]:
baseline.save_models({'models':models,'val_rmse':valsrmse},f'../../datas/proc/sfs/results/best_linear_afterselect.pkl')

In [9]:
baseline.save_models(selected_pcs,f'../../datas/proc/sfs/results/best_linear_cdf_feature.pkl')

#### Probabilistic

In [6]:
# Initialize variables
selected_pcs = []  # List to store selected PCs
remaining_vars = list(pcs_train[0].keys())  # All variables initially available for selection
best_val_crps = float('inf')  # Start with a very high RMSE value

while remaining_vars:
    best_var = None
    best_pc_index = None
    best_mean_val_crps = best_val_crps

    # Test each variable
    for var in tqdm(remaining_vars):
        nPC = pcs_train[0][var].shape[1]  # Number of PCs for this variable

        # Test each PC of the variable
        for pc_index in range(nPC):
            mean_score = []

            # Evaluate using all seeds
            for iseed in range(7):
                # Prepare data for the current candidate PC
                candidate_features = [pcs_train[iseed][sel_var][:, [pc_idx]] 
                                      for sel_var, pc_idx in selected_pcs]
                candidate_features.append(pcs_train[iseed][var][:, [pc_index]])
                X_train_subset = np.hstack(candidate_features)
                X_val_subset = np.hstack(
                    [pcs_val[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs] +
                    [pcs_val[iseed][var][:, [pc_index]]]
                )

                # Train a model and evaluate
                model = linear_model.LinearRegression()
                model.fit(X_train_subset, y_train[iseed][target_cat])
                y_pred = model.predict(X_val_subset)

                # Generate ensemble predictions using bootstrapping
                y_pred_ensemble = []
                for _ in range(50):  # Number of ensemble members
                    bootstrap_X, bootstrap_y = resample(X_train_subset, y_train[iseed][target_cat])
                    bootstrap_model = linear_model.LinearRegression()
                    bootstrap_model.fit(bootstrap_X, bootstrap_y)
                    y_pred_ensemble.append(bootstrap_model.predict(X_val_subset))
                y_pred_ensemble = np.array(y_pred_ensemble)  # Shape: (n_samples, n_ensemble)

                # Calculate the CRPS score
                val_crps = np.mean([crps_ensemble(y_val[iseed][target_cat][i], y_pred_ensemble[:, i, :].T) 
                                for i in range(len(y_val[iseed][target_cat]))])
                mean_score.append(val_crps)

            # Compute the mean CRPS across seeds
            mean_val_crps = np.mean(mean_score)

            # Update the best PC if this one performs better
            if mean_val_crps < best_mean_val_crps:
                best_mean_val_crps = mean_val_crps
                best_var = var
                best_pc_index = pc_index

    # Check if we found a PC that improves validation RMSE
    if best_var and best_mean_val_crps < best_val_crps:
        # Add the best-performing PC to the selected set
        selected_pcs.append((best_var, best_pc_index))
        remaining_vars.remove(best_var)
        best_val_crps = best_mean_val_crps
        print(f"Selected PC: {best_var}_PC{best_pc_index + 1}, Mean Val RMSE: {best_mean_val_crps}")
    else:
        print("No improvement. Stopping feature selection.")
        break


  0%|          | 0/196 [00:00<?, ?it/s]

100%|██████████| 196/196 [11:42<00:00,  3.58s/it]


Selected PC: relative_humidity_150_std_PC10, Mean Val RMSE: 0.852078878503774


100%|██████████| 195/195 [11:14<00:00,  3.46s/it]


Selected PC: 10m_magnitude_of_wind_mean_PC10, Mean Val RMSE: 0.8099722754769674


100%|██████████| 194/194 [10:24<00:00,  3.22s/it]


Selected PC: convective_available_potential_energy_min_PC11, Mean Val RMSE: 0.7744771745181797


100%|██████████| 193/193 [09:50<00:00,  3.06s/it]


Selected PC: relative_humidity_70_mean_PC8, Mean Val RMSE: 0.7489202930143003


100%|██████████| 192/192 [09:53<00:00,  3.09s/it]


Selected PC: relative_humidity_1000_mean_PC11, Mean Val RMSE: 0.7385795948228253


100%|██████████| 191/191 [09:24<00:00,  2.96s/it]


Selected PC: 10m_orientation_of_wind_mean_PC11, Mean Val RMSE: 0.7378090072285783


100%|██████████| 190/190 [09:04<00:00,  2.87s/it]

No improvement. Stopping feature selection.





#### 30

In [7]:
# Initialize variables
selected_pcs = []  # List to store selected PCs
remaining_vars = list(pcs_train[0].keys())  # All variables initially available for selection
best_val_rmse = float('inf')  # Start with a very high RMSE value

while remaining_vars:
    best_var = None
    best_pc_index = None
    best_iseed = None
    best_mean_val_rmse = best_val_rmse

    # Test each variable
    for var in remaining_vars:
        nPC = pcs_train[0][var].shape[1]  # Number of PCs for this variable

        # Test each PC of the variable
        for pc_index in range(nPC):
            mean_score = []

            # Evaluate using all seeds
            for iseed in range(7):
                # Prepare data for the current candidate PC
                candidate_features = [pcs_train[iseed][sel_var][:, [pc_idx]] 
                                      for sel_var, pc_idx in selected_pcs]
                candidate_features.append(pcs_train[iseed][var][:, [pc_index]])
                X_train_subset = np.hstack(candidate_features)
                X_val_subset = np.hstack(
                    [pcs_val[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs] +
                    [pcs_val[iseed][var][:, [pc_index]]]
                )

                # Train a model and evaluate
                model = linear_model.LinearRegression()
                model.fit(X_train_subset, y_train[iseed][target_cat])
                y_pred = model.predict(X_val_subset)

                # Calculate the validation RMSE
                val_rmse = mean_squared_error(y_val[iseed][target_cat], y_pred, squared=False)
                mean_score.append(val_rmse)

                # Track the best seed for this PC
                if val_rmse == min(mean_score):  # Check if this seed gives the best RMSE
                    current_best_iseed = iseed

            # Compute the mean validation RMSE across seeds
            mean_val_rmse = np.mean(mean_score)

            # Update the best PC if this one performs better
            if mean_val_rmse < best_mean_val_rmse:
                best_mean_val_rmse = mean_val_rmse
                best_var = var
                best_pc_index = pc_index
                best_iseed = current_best_iseed

    # Check if we found a PC that improves validation RMSE
    if best_var and best_mean_val_rmse < best_val_rmse:
        # Add the best-performing PC to the selected set
        selected_pcs.append((best_var, best_pc_index))
        remaining_vars.remove(best_var)
        best_val_rmse = best_mean_val_rmse
        print(f"Selected PC: {best_var}_PC{best_pc_index + 1}, Mean Val RMSE: {best_mean_val_rmse}, Seed: {best_iseed}")
    else:
        print("No improvement. Stopping feature selection.")
        break


Selected PC: 100m_magnitude_of_wind_mean_PC11, Mean Val RMSE: 1.295685566862838, Seed: 6
Selected PC: mean_surface_latent_heat_flux_mean_PC3, Mean Val RMSE: 1.2918166506338797, Seed: 6
Selected PC: mean_sea_level_pressure_min_PC11, Mean Val RMSE: 1.285289225325478, Seed: 6
Selected PC: surface_latent_heat_flux_std_PC2, Mean Val RMSE: 1.2827884884276681, Seed: 6
Selected PC: 10m_magnitude_of_wind_std_PC6, Mean Val RMSE: 1.2789278097959607, Seed: 6
No improvement. Stopping feature selection.


#### 40

In [29]:
# Initialize variables
selected_pcs = []  # List to store selected PCs
remaining_vars = list(pcs_train[0].keys())  # All variables initially available for selection
best_val_rmse = float('inf')  # Start with a very high RMSE value

while remaining_vars:
    best_var = None
    best_pc_index = None
    best_iseed = None
    best_mean_val_rmse = best_val_rmse

    # Test each variable
    for var in remaining_vars:
        nPC = pcs_train[0][var].shape[1]  # Number of PCs for this variable

        # Test each PC of the variable
        for pc_index in range(nPC):
            mean_score = []

            # Evaluate using all seeds
            for iseed in range(7):
                # Prepare data for the current candidate PC
                candidate_features = [pcs_train[iseed][sel_var][:, [pc_idx]] 
                                      for sel_var, pc_idx in selected_pcs]
                candidate_features.append(pcs_train[iseed][var][:, [pc_index]])
                X_train_subset = np.hstack(candidate_features)
                X_val_subset = np.hstack(
                    [pcs_val[iseed][sel_var][:, [pc_idx]] for sel_var, pc_idx in selected_pcs] +
                    [pcs_val[iseed][var][:, [pc_index]]]
                )

                # Train a model and evaluate
                model = linear_model.LinearRegression()
                model.fit(X_train_subset, y_train[iseed][target_cat])
                y_pred = model.predict(X_val_subset)

                # Calculate the validation RMSE
                val_rmse = mean_squared_error(y_val[iseed][target_cat], y_pred, squared=False)
                mean_score.append(val_rmse)

                # Track the best seed for this PC
                if val_rmse == min(mean_score):  # Check if this seed gives the best RMSE
                    current_best_iseed = iseed

            # Compute the mean validation RMSE across seeds
            mean_val_rmse = np.mean(mean_score)

            # Update the best PC if this one performs better
            if mean_val_rmse < best_mean_val_rmse:
                best_mean_val_rmse = mean_val_rmse
                best_var = var
                best_pc_index = pc_index
                best_iseed = current_best_iseed

    # Check if we found a PC that improves validation RMSE
    if best_var and best_mean_val_rmse < best_val_rmse:
        # Add the best-performing PC to the selected set
        selected_pcs.append((best_var, best_pc_index))
        remaining_vars.remove(best_var)
        best_val_rmse = best_mean_val_rmse
        print(f"Selected PC: {best_var}_PC{best_pc_index + 1}, Mean Val RMSE: {best_mean_val_rmse}, Seed: {best_iseed}")
    else:
        print("No improvement. Stopping feature selection.")
        break


Selected PC: mean_top_net_long_wave_radiation_flux_std_PC9, Mean Val RMSE: 1.326135643881517, Seed: 0
Selected PC: 2m_dewpoint_temperature_max_PC11, Mean Val RMSE: 1.316756247171401, Seed: 0
Selected PC: 2m_temperature_std_PC6, Mean Val RMSE: 1.3102959084722106, Seed: 0
Selected PC: k_index_max_PC3, Mean Val RMSE: 1.307242133702916, Seed: 0
Selected PC: 10m_v_component_of_wind_max_PC9, Mean Val RMSE: 1.3067771641821166, Seed: 0
Selected PC: mean_sea_level_pressure_mean_PC4, Mean Val RMSE: 1.3063823692533671, Seed: 0
Selected PC: mean_sea_level_pressure_min_PC10, Mean Val RMSE: 1.3031657793849616, Seed: 0
No improvement. Stopping feature selection.


### Climatology

In [54]:
meantrain = []
for i in range(7):
    meantrain.append([y_train[i]['cdf'].mean(axis=0) for _ in range(y_val[i]['cdf'].shape[0])])

In [55]:
np.mean(np.asarray([mean_squared_error(y_val[i]['cdf'], meantrain[i], squared=False) for i in range(7)]))

1.335407045719028