# Windstorm project
This is the second of several notebooks on machine-learning-based prediction of severe surface winds associated with extratropical windstorms over different European geographical regions. Specifically, this study emphasizes how the temporal evolution characteristics of different storm-internal and environmental predictors ("history") may contain useful information for quick evolution of severe wind potential over land. This notebook preprocesses pre-landfall storm-internal and environmental predictor data for predictive pattern discovery with nestedMLR.

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import pandas as pd
import glob, os, gc
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.model_selection import KFold

# Add the path to the directory containing the module
import sys
sys.path.append('../../')
from util.ml import baseline, metrics, preproc

## Read and analyze time series data
We start with 40 variables describing storm-internal and environmental (external) characteristics. We combine the predefined train-valid-test split and redo the split so that the validation data sample all extreme storm cases at least once. The data consist of time series of storm-internal and external characteristics taken from a moving grid box centered on the windstorm center. To reduce the dimensionality of the data, spatial moments (e.g., mean/max/std) of these fields are calculated and treated as separate variables.

In [5]:
# Find the folder name organized by seed number (first match in sorted order).
seed_dirs = sorted(glob.glob('../../datas/seed_revised_*/'))
if not seed_dirs:
    # Fail with an explicit message instead of an opaque IndexError on [0].
    raise FileNotFoundError("No folder matching '../../datas/seed_revised_*/' was found.")
seed_doc = seed_dirs[0]

# Load the predefined train / validation / test time series tables.
df = pd.read_csv(seed_doc +'X_train_ts_all.csv')
df_valid = pd.read_csv(seed_doc +'X_validation_ts_all.csv')
df_test = pd.read_csv(seed_doc +'X_test_ts_all.csv')

# Variable name of each column = everything before the '_step_' suffix.
column_names = [obj.split('_step_')[0] for obj in df.columns]
# De-duplicate while keeping the column order of the training table.
# (A plain set() yields an order that changes between kernel sessions because
# string hashing is randomized, which makes every downstream loop order
# non-reproducible.)
unique_names = list(dict.fromkeys(column_names))
unique_names.remove('storm_index')


def _step_columns(frame, variable):
    """Return the columns of `frame` that are time steps of exactly `variable`.

    Matching on the literal '<variable>_step_' prefix keeps a variable from also
    picking up the columns of another variable whose name merely starts with
    '<variable>_', and avoids interpreting the name as a regular expression.
    """
    prefix = f"{variable}_step_"
    return [col for col in frame.columns if col.startswith(prefix)]


# One dictionary per split: "name" holds the variable list, and every variable
# name maps to the 2-D array built from that variable's step columns.
vardict = {"name": unique_names}
vardict_valid = {"name": unique_names}
vardict_test = {"name": unique_names}
for name in unique_names:
    vardict[name] = df[_step_columns(df, name)].values
    vardict_valid[name] = df_valid[_step_columns(df_valid, name)].values
    vardict_test[name] = df_test[_step_columns(df_test, name)].values

Now we combine the training and validation data and re-split them without delay. The dataset contains 63 storms; the cross-validation set up below partitions 56 storm indices into 7 folds, so each validation set holds 8 storms.

In [6]:
# Pool the predictors: train + validation first, then append the test rows
# to obtain the full experiment set.
trainvalidexp_dict = {
    name: np.concatenate([vardict[name], vardict_valid[name]], axis=0)
    for name in unique_names
}
totalexp_dict = {
    name: np.concatenate([trainvalidexp_dict[name], vardict_test[name]], axis=0)
    for name in unique_names
}

# Targets for every predefined split (both the "cdf" and the "max" files).
ytrain_cdf = pd.read_csv(seed_doc +'y_train_cdf.csv').values
yvalid_cdf = pd.read_csv(seed_doc +'y_validation_cdf.csv').values
ytest_cdf = pd.read_csv(seed_doc +'y_test_cdf.csv').values
ytrain_max = pd.read_csv(seed_doc +'y_train_max.csv').values
yvalid_max = pd.read_csv(seed_doc +'y_validation_max.csv').values
ytest_max = pd.read_csv(seed_doc +'y_test_max.csv').values

# Pooled train + validation targets, stacked in the same order as the predictors.
y_cdf = np.concatenate((ytrain_cdf, yvalid_cdf), axis=0)
y_max = np.concatenate((ytrain_max, yvalid_max), axis=0)

In [7]:
def do_smoothing_and_pca_and_make_PCs_nosmooth(vardict, vardict_valid, vardict_test, unique_names, varsexp, train_sets, val_sets):
    """Fit one PCA per variable, write the CV splits, and save standardized PC scores.

    Despite the name, no smoothing is applied: the inputs are only copied into
    arrays. For every variable in `unique_names` the function

    1. concatenates the train and validation arrays and fits a PCA on the pooled
       data with `preproc.train_PCA` (so the PCA has seen the rows that later
       serve as validation folds);
    2. writes the train/validation rows of every fold, selected with
       `train_sets[i]` / `val_sets[i]`, to disk;
    3. projects the train, validation and test rows, keeps the leading
       components whose cumulative explained-variance ratio is closest to
       `varsexp`, and standardizes them with the scalar mean and standard
       deviation of the fold's retained training scores.

    All outputs go to '../../datas/proc/sfs/PCcomp_var<tag>/', where <tag> is
    the decimal part of `varsexp` right-padded with zeros to two characters
    (0.75 -> '75', 0.8 -> '80').

    Parameters
    ----------
    vardict, vardict_valid, vardict_test : dict
        Map each variable name to a 2-D array (rows = samples).
    unique_names : list of str
        Variable names to process.
    varsexp : float
        Target cumulative explained-variance ratio; its string form must
        contain a decimal point.
    train_sets, val_sets : sequence of index arrays
        One entry per fold, indexing rows of the pooled train+validation array.

    Returns
    -------
    None
        Results are written through `baseline.save_models`.

    Raises
    ------
    ValueError
        If `train_sets` and `val_sets` differ in length.
    """
    # Follow the folds that were actually supplied instead of assuming 7.
    n_splits = len(train_sets)
    if len(val_sets) != n_splits:
        raise ValueError(
            f"train_sets and val_sets must have the same length, got {n_splits} and {len(val_sets)}"
        )

    # Directory tag built from the decimal digits. Padding the string (rather
    # than converting to int first) keeps leading zeros, so 0.05 -> '05' can no
    # longer collide with 0.5 -> '50'.
    varsexp_filename = str(varsexp).split('.')[1].ljust(2, '0')
    outdir = f'../../datas/proc/sfs/PCcomp_var{varsexp_filename}'

    # "No smoothing" variant: the rows are simply copied into fresh arrays.
    train_arrays = {name: np.asarray(list(vardict[name])) for name in unique_names}
    valid_arrays = {name: np.asarray(list(vardict_valid[name])) for name in unique_names}
    test_arrays = {name: np.asarray(list(vardict_test[name])) for name in unique_names}

    # Pool train + validation; the fold index sets address rows of this pool.
    trainvalidexp_dict = {
        name: np.concatenate([train_arrays[name], valid_arrays[name]], axis=0)
        for name in unique_names
    }

    # Fit one PCA per variable on the pooled data and persist the fitted objects.
    pca_dict = {}
    mean_dict = {}
    std_dict = {}
    for iname in unique_names:
        pca_dict[iname], mean_dict[iname], std_dict[iname] = preproc.train_PCA(trainvalidexp_dict[iname])
    baseline.save_models(pca_dict, f'{outdir}/pca/pcaall.pkl')
    baseline.save_models(mean_dict, f'{outdir}/scaler/meanall.pkl')
    baseline.save_models(std_dict, f'{outdir}/scaler/stdall.pkl')

    # Write the raw time series of every fold.
    for i in range(n_splits):
        fold_train = {"name": unique_names}
        fold_valid = {"name": unique_names}
        for name in unique_names:
            fold_train[name] = trainvalidexp_dict[name][train_sets[i]]
            fold_valid[name] = trainvalidexp_dict[name][val_sets[i]]
        baseline.save_models(fold_train, f'{outdir}/ts/tsall_train_split_{i}.pkl')
        baseline.save_models(fold_valid, f'{outdir}/ts/tsall_val_split_{i}.pkl')

    # Read the folds back from disk, as the original workflow did.
    vardict_trains = [baseline.load_pickle(f'{outdir}/ts/tsall_train_split_{i}.pkl') for i in range(n_splits)]
    vardict_valids = [baseline.load_pickle(f'{outdir}/ts/tsall_val_split_{i}.pkl') for i in range(n_splits)]

    # Number of leading components to keep per variable: the count whose
    # cumulative explained-variance ratio is closest to `varsexp`. This does not
    # depend on the fold, so it is computed once.
    n_keep = {
        iname: np.abs(pca_dict[iname].explained_variance_ratio_.cumsum() - varsexp).argmin() + 1
        for iname in unique_names
    }

    PCloadings_train = []
    PCloadings_valid = []
    PCloadings_test = []

    # Produce the PCA-transformed, standardized data for every fold.
    for iseed in tqdm(range(n_splits)):
        PCdicts_train = {}
        PCdicts_valid = {}
        PCdicts_test = {}
        for iname in unique_names:
            keep = n_keep[iname]
            trainmean = mean_dict[iname]
            # Training rows go through the fitted PCA directly; validation and
            # test rows go through the project helper with the stored mean.
            temptrain = pca_dict[iname].transform(vardict_trains[iseed][iname])[:, :keep]
            tempvalid = preproc.myPCA_projection_sen(pca_dict, iname, vardict_valids[iseed][iname], trainmean)[:, :keep]
            temptest = preproc.myPCA_projection_sen(pca_dict, iname, test_arrays[iname], trainmean)[:, :keep]
            # Standardize every split with the scalar statistics of the fold's
            # training scores (taken over all retained components at once).
            score_mean = np.mean(temptrain)
            score_std = np.std(temptrain)
            PCdicts_train[iname] = (temptrain - score_mean) / score_std
            PCdicts_valid[iname] = (tempvalid - score_mean) / score_std
            PCdicts_test[iname] = (temptest - score_mean) / score_std
        PCloadings_train.append(PCdicts_train)
        PCloadings_valid.append(PCdicts_valid)
        PCloadings_test.append(PCdicts_test)

    baseline.save_models(PCloadings_train, f'{outdir}/pcs/pcsall_train.pkl')
    baseline.save_models(PCloadings_valid, f'{outdir}/pcs/pcsall_valid.pkl')
    baseline.save_models(PCloadings_test, f'{outdir}/pcs/pcsall_test.pkl')

In [8]:
# Target fraction of explained variance
varsexp = 0.75

# One index per storm, shuffled once with a fixed seed so the folds are repeatable
storm_indices = np.arange(0,56)
np.random.seed(42)
np.random.shuffle(storm_indices)

# 7 contiguous folds over the already-shuffled indices (hence shuffle=False)
kf = KFold(n_splits=7, shuffle=False, random_state=None)
fold_positions = list(kf.split(storm_indices))

# Map the positional folds back to the shuffled storm indices
train_sets = [storm_indices[train_pos] for train_pos, _ in fold_positions]
val_sets = [storm_indices[val_pos] for _, val_pos in fold_positions]

In [9]:
# Run the preprocessing once per explained-variance target; each run writes its
# own output folder, and memory is reclaimed between runs.
explained_variance_targets = (0.75, 0.80, 0.85, 0.90, 0.95)
for varsexp in explained_variance_targets:
    do_smoothing_and_pca_and_make_PCs_nosmooth(
        vardict, vardict_valid, vardict_test, unique_names, varsexp, train_sets, val_sets
    )
    gc.collect()

100%|██████████| 7/7 [00:00<00:00, 15.81it/s]
100%|██████████| 7/7 [00:00<00:00, 14.81it/s]
100%|██████████| 7/7 [00:00<00:00, 14.41it/s]
100%|██████████| 7/7 [00:00<00:00, 16.30it/s]
100%|██████████| 7/7 [00:00<00:00, 15.85it/s]
