In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import pickle
import pandas as pd
import glob, os, gc
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.model_selection import KFold

# Add the path to the directory containing the module
import sys
sys.path.append('../../')
from util.ml import baseline, metrics, nestedMLR, fno

from properscoring import crps_ensemble  # For CRPS calculation
from sklearn.utils import resample  # For bootstrapping

In [17]:
# Find the folder name organized by seed number (first match in sorted order)
seed_dirs = sorted(glob.glob('../../datas/seed_revised_*/'))
if not seed_dirs:
    # Fail with an explicit message instead of an IndexError on `[0]`
    raise FileNotFoundError("No folder matching '../../datas/seed_revised_*/' was found.")
seed_doc = seed_dirs[0]
# Variances explained (used as a string inside the PC file paths)
varsexp_filename = str(90)
# Load the time series data
df = pd.read_csv(seed_doc +'X_train_ts_all.csv')
df_valid = pd.read_csv(seed_doc +'X_validation_ts_all.csv')
df_test = pd.read_csv(seed_doc +'X_test_ts_all.csv')

# Find the name for each column (everything before the '_step_' suffix)
column_names = ([obj.split('_step_')[0] for obj in df.columns])
# Unique names in the column name list, in first-appearance order.
# `list(set(...))` would reorder them on every kernel restart because string
# hashing is randomised, which makes the notebook non-reproducible.
unique_names = list(dict.fromkeys(column_names))
# Remove strings with large_scale
unique_names_filt = [var for var in unique_names if "large_scale" not in var]

# Now we read in the y data for every fold
y_train = []
y_val = []
for i in range(7):
    y_train.append(baseline.load_pickle(f'../../datas/proc/sfs/y/ytrain_kmeans_split_{i}.pkl'))
    y_val.append(baseline.load_pickle(f'../../datas/proc/sfs/y/yval_kmeans_split_{i}.pkl'))

# Load the test data (a single set, shared by every fold)
y_test = baseline.load_pickle('../../datas/proc/sfs/y/ytest.pkl')

In [18]:
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [19]:
def r2_score_f(y_true,y_pred):
    """Coefficient of determination computed over all elements at once.

    Both inputs are flattened, so a multi-output target is scored as one long
    vector (a single pooled R2, not an average of per-column R2 values).

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values with the same number of elements.
        Lists and other array-likes are accepted, not only ndarrays.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``. If ``y_true`` is constant the denominator is
        zero and the result is ``nan`` or ``-inf`` (NumPy emits a warning).
    """
    # np.asarray lets lists / array-likes through; ndarray input is unchanged
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()

    ss_res = np.sum((y_true-y_pred)**2)
    ss_tot = np.sum((y_true-np.mean(y_true))**2)
    r2 = 1-ss_res/ss_tot
    return r2

In [20]:
def _rmse(y_true, y_pred):
    """RMSE per output column, averaged uniformly over the columns.

    Written out by hand because ``mean_squared_error(..., squared=False)`` was
    deprecated in scikit-learn 1.4 and removed in 1.6. The definition follows
    scikit-learn's ``root_mean_squared_error`` with uniform averaging.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Treat 1-D targets as a single output column so the shapes line up
    if y_true.ndim == 1:
        y_true = y_true[:, None]
    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]
    return float(np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))))


def _stack_pcs(pcs_fold, pc_choices):
    """Column-stack the chosen ``(variable, pc_index)`` columns of one fold."""
    return np.hstack([pcs_fold[sel_var][:, [pc_idx]] for sel_var, pc_idx in pc_choices])


def train_SFFS(remaining_vars, pcs_train, pcs_val, pcs_test, y_train, y_val, y_test, target_cat, best_val_rmse, varsexp_filename, F_cutoff, strategy):
    """Greedy forward selection of principal components for a linear regression.

    At every step each PC of every remaining variable is tried on top of the
    PCs already selected. A ``LinearRegression`` is fitted on each of the 7
    folds and the per-fold scores are aggregated with ``strategy``. The PC with
    the lowest aggregated validation RMSE is kept if it beats the current best,
    and its variable is then removed from the candidate pool (at most one PC
    per variable). Selection stops when nothing improves or no variables are
    left. A final model, the selection and the metric log are saved to disk.

    Parameters
    ----------
    remaining_vars : list of str
        Candidate variable names. The list is copied, so the caller's list is
        not modified.
    pcs_train, pcs_val, pcs_test : indexable by fold, then by variable name
        ``pcs_*[fold][var]`` must be a 2-D array whose columns are PCs.
    y_train, y_val : indexable by fold, then by ``target_cat``
    y_test : indexable by ``target_cat`` (one test target shared by all folds)
    target_cat : str
        Key of the target to predict.
    best_val_rmse : float
        Validation RMSE that the first selected PC has to beat.
    varsexp_filename, F_cutoff :
        Only used to build the output paths.
    strategy : {"mean", "max"}
        How per-fold scores are aggregated.

    Returns
    -------
    final_model, selected_pcs, rmse_log
        The fitted model, the list of ``(variable, pc_index)`` tuples and a
        list with one metrics dict per selected PC.

    Raises
    ------
    ValueError
        If ``strategy`` is unknown, or if no PC could be selected.

    Notes
    -----
    NOTE(review): the ``*_r2`` entries of the log hold the *negated* R2
    (``-r2_score_f``), so with ``strategy="max"`` they are minus the worst-fold
    R2. This is kept as-is because existing readers of the log may rely on the
    sign; flip it when reading if a conventional R2 is wanted.
    """
    aggregators = {"mean": np.mean, "max": np.max}
    if strategy not in aggregators:
        # Previously an unknown strategy surfaced later as a NameError or
        # silently reused the previous candidate's scores.
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'mean' or 'max'.")
    aggregate = aggregators[strategy]

    n_folds = 7  # number of k-means cross-validation splits used by the callers
    metric_names = ("train_rmse", "val_rmse", "test_rmse", "train_r2", "val_r2", "test_r2")

    # Work on a copy so the caller's list is not consumed by `.remove()` below
    remaining_vars = list(remaining_vars)

    rmse_log = []  # One dict of aggregated metrics per selected PC
    selected_pcs = []  # (variable, pc_index) tuples, in selection order

    while remaining_vars:
        best_var = None
        best_pc_index = None
        best_metrics = None
        best_mean_val_rmse = best_val_rmse

        # Test each PC of each remaining variable
        for var in remaining_vars:
            nPC = pcs_train[0][var].shape[1]  # Number of PCs for this variable

            for pc_index in range(nPC):
                trial_pcs = selected_pcs + [(var, pc_index)]
                per_fold = {name: [] for name in metric_names}

                for iseed in range(n_folds):
                    X_train_subset = _stack_pcs(pcs_train[iseed], trial_pcs)
                    X_val_subset = _stack_pcs(pcs_val[iseed], trial_pcs)
                    X_test_subset = _stack_pcs(pcs_test[iseed], trial_pcs)

                    model = linear_model.LinearRegression()
                    model.fit(X_train_subset, y_train[iseed][target_cat])

                    # Predict once per split (the test split used to be predicted twice)
                    train_pred = model.predict(X_train_subset)
                    val_pred = model.predict(X_val_subset)
                    test_pred = model.predict(X_test_subset)

                    per_fold["train_rmse"].append(_rmse(y_train[iseed][target_cat], train_pred))
                    per_fold["val_rmse"].append(_rmse(y_val[iseed][target_cat], val_pred))
                    per_fold["test_rmse"].append(_rmse(y_test[target_cat], test_pred))
                    # R2 is negated so that, like RMSE, larger means worse
                    per_fold["train_r2"].append(-r2_score_f(y_train[iseed][target_cat], train_pred))
                    per_fold["val_r2"].append(-r2_score_f(y_val[iseed][target_cat], val_pred))
                    per_fold["test_r2"].append(-r2_score_f(y_test[target_cat], test_pred))

                summary = {name: aggregate(scores) for name, scores in per_fold.items()}

                # Strict '<' keeps the first candidate on ties. Only the
                # validation score drives the choice; test scores are logged.
                if summary["val_rmse"] < best_mean_val_rmse:
                    best_mean_val_rmse = summary["val_rmse"]
                    best_metrics = summary
                    best_var = var
                    best_pc_index = pc_index

        if best_var is None:
            print("No improvement. Stopping feature selection.")
            break

        # Keep the winning PC and retire its variable from the pool
        selected_pcs.append((best_var, best_pc_index))
        remaining_vars.remove(best_var)
        best_val_rmse = best_mean_val_rmse

        rmse_log.append({"selected_pc": f"{best_var}_PC{best_pc_index + 1}", **best_metrics})
        print(f"Selected PC: {best_var}_PC{best_pc_index + 1}, Train RMSE: {best_metrics['train_rmse']}, Val RMSE: {best_mean_val_rmse}")

    if not selected_pcs:
        # np.hstack([]) would fail with an unhelpful message
        raise ValueError("No PC improved on the initial best_val_rmse; nothing to fit.")

    # Train the final model on the selected PCs.
    # NOTE(review): this uses the training split of fold 0 only, not the union
    # of all folds -- confirm that this is intended.
    final_X_train = _stack_pcs(pcs_train[0], selected_pcs)
    final_model = linear_model.LinearRegression()
    final_model.fit(final_X_train, y_train[0][target_cat])

    # Make sure the output folder exists, then save model, selection and log
    out_dir = f'../../datas/proc/sfs/PCcomp_var{varsexp_filename}/results/smooth{F_cutoff}/'
    os.makedirs(out_dir, exist_ok=True)
    baseline.save_models(final_model,f'{out_dir}best_linear_{target_cat}_model_kmeans_{strategy}.pkl')
    baseline.save_models(selected_pcs,f'{out_dir}best_linear_{target_cat}_feature_kmeans_{strategy}.pkl')
    baseline.save_models(rmse_log,f'{out_dir}best_linear_{target_cat}_RMSElog_kmeans_{strategy}.pkl')
    return final_model, selected_pcs, rmse_log

In [21]:
#os.makedirs(f'../../datas/proc/sfs/PCcomp_var{varsexp_filename}/results/', exist_ok=True)
# Run the forward selection on the 'cdf' target for every smoothing cutoff,
# aggregating fold scores with the worst-case ('max') strategy.
strategy = 'max'
for F_cutoff in [3,5,7,9,11,13]:
    # Per-fold PC collections for the train / validation / test splits
    pcs_train, pcs_val, pcs_test = (
        baseline.load_pickle(f'../../datas/proc/sfs/PCcomp_var{varsexp_filename}/pcs/pcsall_smooth{F_cutoff}_kmeans_{split}.pkl')
        for split in ('train', 'valid', 'test')
    )
    # Variable names are taken from the first fold
    var_names = list(pcs_train[0].keys())

    # Bookkeeping values kept for the rest of the notebook
    selected_vars = []
    seed = 42
    target_cat = 'cdf'
    # Initial RMSE to beat is infinity
    best_val_rmse = float('inf')

    # Candidates: drop "large_scale" variables and the listed pressure levels
    tofilt = ['_10_', '_20_', '_30_', '_50_', '_70_']
    remaining_vars = [
        var for var in var_names
        if "large_scale" not in var
        and not any(substring in var for substring in tofilt)
    ]

    final_model, selected_pcs, rmse_log = train_SFFS(
        remaining_vars, pcs_train, pcs_val, pcs_test,
        y_train, y_val, y_test,
        target_cat, best_val_rmse, varsexp_filename, F_cutoff, strategy,
    )

Selected PC: relative_humidity_100_std_PC1, Train RMSE: 1.387645717721664, Val RMSE: 2.0925795696352028
Selected PC: mean_surface_net_long_wave_radiation_flux_max_PC2, Train RMSE: 1.338737161996124, Val RMSE: 1.9369368805035403
Selected PC: 100m_orientation_of_wind_std_PC1, Train RMSE: 1.3175939837132753, Val RMSE: 1.8728026239584097
Selected PC: 10m_magnitude_of_wind_std_PC2, Train RMSE: 1.3052308841237668, Val RMSE: 1.836901198383454
Selected PC: relative_humidity_600_max_PC3, Train RMSE: 1.2864257029091004, Val RMSE: 1.8232862656650446
Selected PC: mean_top_net_long_wave_radiation_flux_max_PC3, Train RMSE: 1.2607785736010286, Val RMSE: 1.8028402574273927
Selected PC: mean_vertically_integrated_moisture_divergence_mean_PC2, Train RMSE: 1.2533682069246994, Val RMSE: 1.7859936648327366
Selected PC: relative_humidity_300_mean_PC1, Train RMSE: 1.2434209460697148, Val RMSE: 1.772319959516849
Selected PC: relative_humidity_300_max_PC1, Train RMSE: 1.2270287063417966, Val RMSE: 1.7675096468

In [22]:
# Run the forward selection on the 'max' target for every smoothing cutoff,
# aggregating fold scores with the worst-case ('max') strategy.
strategy = 'max'
for F_cutoff in [3,5,7,9,11,13]:
    # Per-fold PC collections for the train / validation / test splits
    pcs_train, pcs_val, pcs_test = (
        baseline.load_pickle(f'../../datas/proc/sfs/PCcomp_var{varsexp_filename}/pcs/pcsall_smooth{F_cutoff}_kmeans_{split}.pkl')
        for split in ('train', 'valid', 'test')
    )
    # Variable names are taken from the first fold
    var_names = list(pcs_train[0].keys())

    # Bookkeeping values kept for the rest of the notebook
    selected_vars = []
    seed = 42
    target_cat = 'max'
    # Initial RMSE to beat is infinity
    best_val_rmse = float('inf')

    # Candidates: drop "large_scale" variables and the listed pressure levels
    tofilt = ['_10_', '_20_', '_30_', '_50_', '_70_']
    remaining_vars = [
        var for var in var_names
        if "large_scale" not in var
        and not any(substring in var for substring in tofilt)
    ]

    final_model, selected_pcs, rmse_log = train_SFFS(
        remaining_vars, pcs_train, pcs_val, pcs_test,
        y_train, y_val, y_test,
        target_cat, best_val_rmse, varsexp_filename, F_cutoff, strategy,
    )

Selected PC: relative_humidity_975_max_PC1, Train RMSE: 5.77659400703905, Val RMSE: 7.894336560945991
Selected PC: relative_humidity_900_mean_PC1, Train RMSE: 5.497703025781009, Val RMSE: 7.243807517977461
Selected PC: 10m_magnitude_of_wind_std_PC1, Train RMSE: 5.373552275707778, Val RMSE: 6.913839108394469
Selected PC: relative_humidity_950_mean_PC2, Train RMSE: 5.274231178222915, Val RMSE: 6.682504569236732
Selected PC: mean_surface_net_short_wave_radiation_flux_min_PC3, Train RMSE: 5.179230395423718, Val RMSE: 6.5401682902001115
Selected PC: surface_latent_heat_flux_max_PC2, Train RMSE: 5.11197544748255, Val RMSE: 6.471618293446962
Selected PC: relative_humidity_850_std_PC2, Train RMSE: 5.06703567399468, Val RMSE: 6.431772356637579
Selected PC: 100m_magnitude_of_wind_min_PC2, Train RMSE: 4.981632270754973, Val RMSE: 6.382454307755898
Selected PC: relative_humidity_500_mean_PC3, Train RMSE: 4.897579703986427, Val RMSE: 6.344189676344723
Selected PC: relative_humidity_100_min_PC1, Tra

## Climatology

In [24]:
# Climatology baseline: predict the train+validation mean of the target for
# every sample, then score that constant prediction on each split.
TYPEEXP = 'max'
meantrain_val = []
meantrain_test = []
meantrain_train = []
for i in range(7):
    # Combine training and validation data
    y_trainval = np.concatenate((y_train[i][TYPEEXP], y_val[i][TYPEEXP]), axis=0)
    # Compute the fold mean once (it used to be recomputed for every sample)
    # and repeat it along a new leading sample axis.
    clim_mean = np.expand_dims(y_trainval.mean(axis=0), axis=0)
    meantrain_val.append(np.repeat(clim_mean, y_val[i][TYPEEXP].shape[0], axis=0))
    meantrain_train.append(np.repeat(clim_mean, y_train[i][TYPEEXP].shape[0], axis=0))
    meantrain_test.append(np.repeat(clim_mean, y_test[TYPEEXP].shape[0], axis=0))


def climatology_rmse(y_true, y_clim):
    """RMSE per output column, averaged over the columns.

    Replaces ``mean_squared_error(..., squared=False)``: the ``squared``
    keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return np.sqrt(mean_squared_error(y_true, y_clim, multioutput='raw_values')).mean()


print(f"Validation:{np.mean(np.asarray([climatology_rmse(y_val[i][TYPEEXP], meantrain_val[i]) for i in range(7)]))}")
print(f"Training:{np.mean(np.asarray([climatology_rmse(y_train[i][TYPEEXP], meantrain_train[i]) for i in range(7)]))}")
print(f"Test:{np.mean(np.asarray([climatology_rmse(y_test[TYPEEXP], meantrain_test[i]) for i in range(7)]))}")

Validation:5.711170097123249
Training:5.7585842877975395
Test:6.230451098692526


In [26]:
from natsort import natsorted
from scipy.stats import genextreme
# One GEV parameter file per cluster; the second CSV column holds
# (shape, loc, scale) in that order.
# The glob + natural sort is done once instead of once per file.
gev_param_files = natsorted(glob.glob('../../datas/GEV_parameters/*'))
clusters = [pd.read_csv(gev_param_files[i]).iloc[:,1].values for i in range(15)]
# Revert transformation
def revert_cdf(cluster, transformedWINDS):
    """Map transformed values back through the per-cluster GEV quantile function.

    Each column ``i`` is converted to a CDF value with ``1 - exp(-x)`` and then
    passed to ``genextreme.ppf`` using the parameters stored in ``cluster[i]``.

    Parameters
    ----------
    cluster : sequence
        ``cluster[i]`` holds ``(shape, loc, scale)`` for column ``i``.
        NOTE(review): scipy's ``genextreme`` shape ``c`` has the opposite sign
        to the common GEV xi convention -- confirm the stored shape follows
        scipy's convention.
    transformedWINDS : array-like, 2-D
        Transformed values, one column per cluster.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``transformedWINDS``. Inputs below 0 give
        a CDF value outside [0, 1], for which ``ppf`` returns ``nan``.

    Raises
    ------
    ValueError
        If there are fewer parameter sets than columns.
    """
    transformedWINDS = np.asarray(transformedWINDS, dtype=float)
    # Take the column count from the data instead of hard-coding 15, so that
    # extra columns are not silently left at zero.
    n_columns = transformedWINDS.shape[1]
    if len(cluster) < n_columns:
        raise ValueError(
            f"Got {len(cluster)} GEV parameter sets for {n_columns} columns."
        )

    outWINDS = np.zeros(transformedWINDS.shape)
    for i in range(n_columns):
        clusterz = cluster[i]
        cdf_values = (1 - np.exp(-transformedWINDS[:,i]))
        outWINDS[:,i] = genextreme.ppf(cdf_values, clusterz[0], loc=clusterz[1], scale=clusterz[2])
    return outWINDS

In [30]:
# Climatology baseline in the transformed ('cdf') space: the train+validation
# mean of the target, repeated for every sample of each split.
TYPEEXP = 'cdf'
meantrain_val = []
meantrain_test = []
meantrain_train = []
for i in range(7):
    # Combine training and validation data
    y_trainval = np.concatenate((y_train[i][TYPEEXP], y_val[i][TYPEEXP]), axis=0)
    # Compute the fold mean once (it used to be recomputed for every sample)
    # and repeat it along a new leading sample axis.
    clim_mean = np.expand_dims(y_trainval.mean(axis=0), axis=0)
    meantrain_val.append(np.repeat(clim_mean, y_val[i][TYPEEXP].shape[0], axis=0))
    meantrain_train.append(np.repeat(clim_mean, y_train[i][TYPEEXP].shape[0], axis=0))
    meantrain_test.append(np.repeat(clim_mean, y_test[TYPEEXP].shape[0], axis=0))

In [35]:
def reverted_climatology_rmse(y_true_max, clim_cdf):
    """RMSE against the 'max' target of a 'cdf'-space climatology after ``revert_cdf``.

    The RMSE is computed per output column and averaged over the columns. It
    replaces ``mean_squared_error(..., squared=False)``: the ``squared``
    keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    reverted = revert_cdf(clusters, np.asarray(clim_cdf))
    return np.sqrt(mean_squared_error(y_true_max, reverted, multioutput='raw_values')).mean()


print(f"Validation:{np.mean(np.asarray([reverted_climatology_rmse(y_val[i]['max'], meantrain_val[i]) for i in range(7)]))}")
print(f"Training:{np.mean(np.asarray([reverted_climatology_rmse(y_train[i]['max'], meantrain_train[i]) for i in range(7)]))}")
print(f"Test:{np.mean(np.asarray([reverted_climatology_rmse(y_test['max'], meantrain_test[i]) for i in range(7)]))}")

Validation:5.85241777540701
Training:5.89431057505778
Test:6.149047844396475
