In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import pandas as pd
import glob, os, gc
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.model_selection import KFold

# Add the path to the directory containing the module
import sys
sys.path.append('../../')
from util.ml import baseline, metrics, preproc

from scipy.fftpack import fft, ifft, fftshift, ifftshift

In [2]:
# Find the folder name organized by seed number (first match in sorted order).
seed_dirs = sorted(glob.glob('../../datas/seed_revised_*/'))
if not seed_dirs:
    # Fail with a readable message instead of an IndexError on `[0]`.
    raise FileNotFoundError("No directory matching '../../datas/seed_revised_*/' was found")
seed_doc = seed_dirs[0]

# Load the time series data for the train / validation / test splits.
df = pd.read_csv(seed_doc + 'X_train_ts_all.csv')
df_valid = pd.read_csv(seed_doc + 'X_validation_ts_all.csv')
df_test = pd.read_csv(seed_doc + 'X_test_ts_all.csv')

# Variable name of each column = everything before the '_step_' suffix.
column_names = [obj.split('_step_')[0] for obj in df.columns]

# Unique variable names, excluding the storm identifier column.
# Sorted so the order is identical on every kernel start (iteration order of a
# set of strings is not stable between Python processes).
unique_names = sorted(set(column_names) - {'storm_index'})

# One dictionary per split: {"name": [variables], <variable>: 2-D array of its columns}.
vardict = {"name": unique_names}
vardict_valid = {"name": unique_names}
vardict_test = {"name": unique_names}
for name in unique_names:
    for target, frame in ((vardict, df), (vardict_valid, df_valid), (vardict_test, df_test)):
        # Select by exact variable name rather than by a prefix regex, so that a
        # variable such as "a" never picks up the columns of "a_b".
        cols = [col for col in frame.columns if col.split('_step_')[0] == name]
        target[name] = frame[cols].values

## Fourier Smoothing

In [3]:
def fourier_smoothing(timeseries, cutoff):
    """Low-pass filter a real-valued series by truncating its Fourier spectrum.

    The real FFT is taken along the last axis, every frequency bin with index
    >= ``cutoff`` is set to zero, and the result is transformed back.

    Parameters
    ----------
    timeseries : array_like
        Real-valued samples; the last axis is treated as time.
    cutoff : int
        Index of the first frequency bin to discard. ``cutoff=1`` keeps only
        the mean; a value >= the number of bins leaves the series unchanged.

    Returns
    -------
    numpy.ndarray
        Smoothed series with the same length along the last axis as the input.
    """
    series = np.asarray(timeseries)
    n_samples = series.shape[-1]

    spectrum = np.fft.rfft(series)
    # Ellipsis keeps the truncation on the frequency axis for batched input.
    # (The original note records 19 bins for the data used in this notebook.)
    spectrum[..., cutoff:] = 0

    # Pass n explicitly: without it irfft assumes an even length 2*(m-1) and
    # returns one sample fewer than the input when the input length is odd.
    return np.fft.irfft(spectrum, n=n_samples)

In [17]:
def do_smoothing_and_pca_and_make_PCs(vardict, vardict_valid, vardict_test, unique_names, F_cutoff, varsexp, train_sets, val_sets):
    """Smooth every variable, fit one PCA per variable, and export per-fold PCs.

    Steps, in order:
      1. Fourier-smooth each row of every variable in the train, validation
         and test dictionaries with ``fourier_smoothing(row, F_cutoff)``.
      2. Stack smoothed train and validation rows and call
         ``preproc.train_PCA`` once per variable on the stacked array.
      3. Save the PCA / mean / std dictionaries with ``baseline.save_models``.
      4. For each fold, index the stacked arrays with ``train_sets[i]`` /
         ``val_sets[i]``, save the resulting dictionaries, and read them back.
      5. For each fold and variable, project train / validation / test data,
         keep the leading components, standardise with the scalar mean and
         std of the fold's training PCs, and save the three lists of
         per-fold dictionaries.

    Parameters
    ----------
    vardict, vardict_valid, vardict_test : dict
        Map each entry of ``unique_names`` to an iterable of 1-D series.
    unique_names : list of str
        Variable names to process.
    F_cutoff : int
        Frequency cutoff forwarded to ``fourier_smoothing``; also used in the
        output file names.
    varsexp : float
        Target cumulative explained-variance ratio. The digits after the
        decimal point of ``str(varsexp)`` name the output directory
        (0.75 -> ``PCcomp_var75``; note 0.9 -> ``PCcomp_var9``), so the value
        must have a fractional part.
    train_sets, val_sets : sequence of index arrays
        One entry per fold; row indices into the stacked train+validation data.

    Returns
    -------
    None
        All results are written under ``../../datas/proc/sfs/``.

    Raises
    ------
    ValueError
        If ``train_sets`` and ``val_sets`` have different lengths.

    Notes
    -----
    NOTE(review): the PCA is fitted on the pooled train+validation rows before
    the folds are drawn, so each fold's validation rows were part of the PCA
    fit — confirm this is intended.
    """
    # Derive the fold count from the inputs instead of assuming 7 folds.
    n_folds = len(train_sets)
    if len(val_sets) != n_folds:
        raise ValueError(
            f"train_sets has {n_folds} folds but val_sets has {len(val_sets)}"
        )

    varsexp_filename = int(str(varsexp).split('.')[1])
    out_root = f'../../datas/proc/sfs/PCcomp_var{varsexp_filename}'
    ts_dir = f'{out_root}/ts/smooth{F_cutoff}'

    def _smooth_rows(rows):
        # Apply the Fourier low-pass filter to every series of one variable.
        return np.asarray([fourier_smoothing(row, F_cutoff) for row in rows])

    # ---- 1. Fourier smoothing -------------------------------------------
    vardict_smooth = {"name": unique_names}
    vardict_smooth_valid = {"name": unique_names}
    vardict_smooth_test = {"name": unique_names}
    for name in unique_names:
        vardict_smooth[name] = _smooth_rows(vardict[name])
        vardict_smooth_valid[name] = _smooth_rows(vardict_valid[name])
        vardict_smooth_test[name] = _smooth_rows(vardict_test[name])

    # ---- 2. Pool train + validation rows and fit the PCA ------------------
    trainvalidexp_dict = {
        name: np.concatenate([vardict_smooth[name], vardict_smooth_valid[name]], axis=0)
        for name in unique_names
    }

    pca_dict = {}
    mean_dict = {}
    std_dict = {}
    for iname in unique_names:
        # Presumably (fitted PCA, training mean, training std) — verify against preproc.train_PCA.
        pca, mean, std = preproc.train_PCA(trainvalidexp_dict[iname])
        pca_dict[iname] = pca
        mean_dict[iname] = mean
        std_dict[iname] = std

    # ---- 3. Persist the fitted objects ------------------------------------
    for subdir in ('ts/', 'pca/', 'pcs/', 'scaler/', f'ts/smooth{F_cutoff}'):
        os.makedirs(f'{out_root}/{subdir}', exist_ok=True)

    baseline.save_models(pca_dict, f'{out_root}/pca/pcaall_smooth_F{F_cutoff}.pkl')
    baseline.save_models(mean_dict, f'{out_root}/scaler/meanall_smooth_F{F_cutoff}.pkl')
    baseline.save_models(std_dict, f'{out_root}/scaler/stdall_smooth_F{F_cutoff}.pkl')

    # ---- 4. Per-fold time series ------------------------------------------
    for i in range(n_folds):
        # Fresh dictionaries per fold; local names no longer shadow the
        # `vardict_valid` argument.
        fold_train = {"name": unique_names}
        fold_valid = {"name": unique_names}
        for name in unique_names:
            fold_train[name] = trainvalidexp_dict[name][train_sets[i]]
            fold_valid[name] = trainvalidexp_dict[name][val_sets[i]]
        baseline.save_models(fold_train, f'{ts_dir}/tsall_train_split_{i}.pkl')
        baseline.save_models(fold_valid, f'{ts_dir}/tsall_val_split_{i}.pkl')

    # Read the fold data back from the files just written (kept from the original flow).
    vardict_trains = []
    vardict_valids = []
    for i in range(n_folds):
        vardict_trains.append(baseline.load_pickle(f'{ts_dir}/tsall_train_split_{i}.pkl'))
        vardict_valids.append(baseline.load_pickle(f'{ts_dir}/tsall_val_split_{i}.pkl'))

    # ---- 5. PCA projection and standardisation ----------------------------
    # Number of leading components to keep per variable: the count whose
    # cumulative explained-variance ratio is closest to `varsexp`. This does
    # not depend on the fold, so it is computed once.
    n_keep = {
        iname: np.abs(pca_dict[iname].explained_variance_ratio_.cumsum() - varsexp).argmin() + 1
        for iname in unique_names
    }

    PCloadings_train = []
    PCloadings_valid = []
    PCloadings_test = []

    for iseed in tqdm(range(n_folds)):
        PCdicts_train = {}
        PCdicts_valid = {}
        PCdicts_test = {}
        for iname in unique_names:
            keep = n_keep[iname]
            trainmean = mean_dict[iname]

            traindata = vardict_trains[iseed][iname]
            validdata = vardict_valids[iseed][iname]
            testdata = vardict_smooth_test[iname]

            # Training rows use the fitted PCA directly; validation and test
            # rows go through the project helper with the stored mean.
            temptrain = pca_dict[iname].transform(traindata)[:, :keep]
            tempvalid = preproc.myPCA_projection_sen(pca_dict, iname, validdata, trainmean)[:, :keep]
            temptest = preproc.myPCA_projection_sen(pca_dict, iname, testdata, trainmean)[:, :keep]

            # Standardise all three splits with the scalar mean / std taken
            # over every retained training PC value of this fold.
            train_mu = np.mean(temptrain)
            train_sigma = np.std(temptrain)
            PCdicts_train[iname] = (temptrain - train_mu) / train_sigma
            PCdicts_valid[iname] = (tempvalid - train_mu) / train_sigma
            PCdicts_test[iname] = (temptest - train_mu) / train_sigma
        PCloadings_train.append(PCdicts_train)
        PCloadings_valid.append(PCdicts_valid)
        PCloadings_test.append(PCdicts_test)

    baseline.save_models(PCloadings_train, f'{out_root}/pcs/pcsall_smooth{F_cutoff}_train.pkl')
    baseline.save_models(PCloadings_valid, f'{out_root}/pcs/pcsall_smooth{F_cutoff}_valid.pkl')
    baseline.save_models(PCloadings_test, f'{out_root}/pcs/pcsall_smooth{F_cutoff}_test.pkl')

In [13]:
# Load the target tables (CDF and max) for the train / validation / test
# splits, in the same order as the file list below.
(ytrain_cdf, ytrain_max,
 yvalid_cdf, yvalid_max,
 ytest_cdf, ytest_max) = (
    pd.read_csv(seed_doc + fname).values
    for fname in (
        'y_train_cdf.csv', 'y_train_max.csv',
        'y_validation_cdf.csv', 'y_validation_max.csv',
        'y_test_cdf.csv', 'y_test_max.csv',
    )
)

# Stack train on top of validation (row-wise) to get the pooled targets.
y_cdf = np.concatenate((ytrain_cdf, yvalid_cdf))
y_max = np.concatenate((ytrain_max, yvalid_max))

In [25]:
# Target fraction of variance for the retained principal components
varsexp = 0.75

# Storm indices 0..55, shuffled reproducibly with a fixed seed
storm_indices = np.arange(56)
np.random.seed(42)
np.random.shuffle(storm_indices)

# 7 contiguous folds over the pre-shuffled indices (no second shuffle)
kf = KFold(n_splits=7, shuffle=False)

# Map each fold's positional split back to the shuffled storm indices
train_sets, val_sets = [], []
for i, (train_idx, val_idx) in enumerate(kf.split(storm_indices)):
    train_set, val_set = storm_indices[train_idx], storm_indices[val_idx]
    train_sets += [train_set]
    val_sets += [val_set]

In [26]:
# Run the smoothing -> PCA -> PC export pipeline for every odd cutoff 1..13,
# collecting garbage after each run.
for F_cutoff in range(1, 14, 2):
    do_smoothing_and_pca_and_make_PCs(
        vardict=vardict,
        vardict_valid=vardict_valid,
        vardict_test=vardict_test,
        unique_names=unique_names,
        F_cutoff=F_cutoff,
        varsexp=varsexp,
        train_sets=train_sets,
        val_sets=val_sets,
    )
    gc.collect()

100%|██████████| 7/7 [00:00<00:00, 17.13it/s]
100%|██████████| 7/7 [00:00<00:00, 19.07it/s]
100%|██████████| 7/7 [00:00<00:00, 14.63it/s]
100%|██████████| 7/7 [00:00<00:00, 16.13it/s]
100%|██████████| 7/7 [00:00<00:00, 16.04it/s]
100%|██████████| 7/7 [00:00<00:00, 13.39it/s]
100%|██████████| 7/7 [00:00<00:00, 16.44it/s]
