In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, auc, roc_curve

from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import Dense, LSTM, Bidirectional
from keras.callbacks import ModelCheckpoint, EarlyStopping

from Libs.config import inter_extra_data_folder
from Libs.load_data import DataLoader, get_dataset_split
from Libs.keras_f1score import f1_m

2023-03-25 11:52:39.346747: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-25 11:52:39.516855: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-25 11:52:39.516882: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-25 11:52:40.456728: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
F_std = False

# initialize data loader
data_loader = DataLoader(run=30, N=1000, s=0.5, t=[0.01, 0.1, 0.5, 3], d=0.2, m=1, 
                         override=False, folder=inter_extra_data_folder)
# get the grid
grid_X, grid_y = data_loader.get_grid()
# get params dictionary
params = data_loader.get_params()

grid_X.shape, grid_y.shape

100%|██████████| 30/30 [00:00<00:00, 59.51it/s]


((30, 1, 4, 1, 1, 1000), (30, 1, 4, 1, 1, 1000))

A biggest assumption when training ANNs is the following: 

"We assume that training sets and test sets contains independent and identically distributed samples from the same unknown distribution $p_{data}(x,y)$"

This is a very important assumption that in general affect the performance ANNs, in particular classifier ones. We could, indeed, explore what can happen if we violete the following assumption. This a relevant application case, for exaple in cases when the generation parameters are not known.

In [3]:
def get_data_set(data1, mode, data2=None, F_std=False, overlap_size=15):
    assert mode in ['all', 'interpolation', 'extrapolation']    

    # params commons
    dataset_split_params = {
        'window_size': 20, # how large is the window
        'overlap_size': overlap_size, # how many time interval of overlap there is between the windows
        'label_treshold': 1, # how many labels have to be at 1 in the window_size to consider the current window as a flare
        'split_on_run': True, # if True the windows of a run cannot be on different sets
        'shuffle_run': False, # if True shuffles the order of the runs before computing the windows
        'shuffle_window': False, # if True shuffles the order of the windows in the resulting dataframes
        'test_size': 0.3, # size of the test set expressed in percentage
        'val_size': 0.2, # size of the validation set expressed in percentage, considered only if get_validation is True
        'get_validation': True, # if True the output would be train,val,test set, otherwise it would be train,test
        'random_state': 42 # sets the seed for reproducibility
    }

    if mode in ['interpolation', 'extrapolation']:
        assert not data2 is None
        grid_X_train, grid_y_train = data1
        grid_X_test, grid_y_test   = data2
        # get the train and validation set, selecting the index for grid given the interpolation assuption
        # notice that theta is the third parameter
        df_train, df_val, _ = get_dataset_split(grid_X_train, grid_y_train, **dataset_split_params)
        # get the test set, selecting the index for grid given the interpolation assuption
        # notice that theta is the third parameter
        _, _, df_test = get_dataset_split(grid_X_test, grid_y_test, **dataset_split_params)
    elif mode in ['all']:
        grid_X, grid_y = data1
        # get all the dataset from a single list
        df_train, df_val, df_test = get_dataset_split(grid_X, grid_y, **dataset_split_params)
    
    # number of classes
    print('Training set:')
    train_counts = df_train['future_flare'].value_counts()
    print(train_counts, '\n')
    print('validation set:')
    val_counts = df_val['future_flare'].value_counts()
    print(val_counts, '\n')
    print('Test set:')
    test_counts = df_test['future_flare'].value_counts()
    print(test_counts, '\n')
    print('Total:')
    total_counts = train_counts.add(val_counts).add(test_counts)
    print(total_counts, '\n')
    print()
    
    # compute the initial bias to pass then to the model
    initial_bias = Constant([np.log(train_counts[0]/train_counts[1])])

    # check the shape
    X_train, y_train = df_train.iloc[:,:-1].to_numpy(), df_train.future_flare.to_numpy()
    X_val, y_val = df_val.iloc[:,:-1].to_numpy(), df_val.future_flare.to_numpy()
    X_test, y_test = df_test.iloc[:,:-1].to_numpy(), df_test.future_flare.to_numpy()
    X = np.vstack((X_train, X_val, X_test))
    y = np.hstack((y_train, y_val, y_test))
    print('X ## Train:', X_train.shape, 'Val:', X_val.shape, 'Test:', X_test.shape)
    print('y ## Train:', y_train.shape, 'Val:', y_val.shape, 'Test:', y_test.shape)

    # finally, if requested, standardize the dataset
    if F_std:
        # Standardize Data
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_std = scaler.transform(X_train)
        X_val_std = scaler.transform(X_val)
        X_test_std = scaler.transform(X_test)
        # get automatically the number of classes
        num_classes = len(np.unique(y))
    else:
        X_train_std = X_train
        X_val_std = X_val
        X_test_std = X_test


    # finally return the dataset
    return X_train_std, y_train, X_val_std, y_val, X_test_std, y_test, initial_bias

In [4]:
def make_model(X_train, initial_bias):
    # define model
    model = Sequential()
    model.add(Bidirectional(LSTM(20, activation='relu'), input_shape=(X_train.shape[1], 1)))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid',bias_initializer=initial_bias))
    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=[f1_m, 'accuracy'])
    # print the summury model
    print(model.summary())
    return model

In [5]:
def eval(model, X_val, y_val, X_test, y_test):
    # Validation set
    y_pred = np.round(model.predict(X_val), 0)
    print("### Evaluation on validation set ###")
    print("Accuracy: %.2f" % (accuracy_score(y_pred, y_val)))
    print("F1 score: %.2f" % (f1_score(y_pred, y_val, average='macro')))
    fpr, tpr, _ = roc_curve(y_val, y_pred, pos_label=1)
    print('AUC:', auc(fpr, tpr))
    #Create confusion matrix and normalizes it over predicted (columns)
    cm = confusion_matrix(y_val, y_pred)
    print(cm)

    print()
    
    # Test set
    y_pred = np.round(model.predict(X_test), 0)
    print("### Evaluation on test set ###")
    print("Accuracy: %.2f" % (accuracy_score(y_pred, y_test)))
    print("F1 score: %.2f" % (f1_score(y_pred, y_test, average='macro')))
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    print('AUC:', auc(fpr, tpr))
    #Create confusion matrix and normalizes it over predicted (columns)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

# LSTM model with multiple all theta parameters

Let's construct now the model

In [6]:
X_train, y_train, X_val, y_val, X_test, y_test, initial_bias = get_data_set((grid_X, grid_y), 'all')

Training set:
0    7153
1    3655
Name: future_flare, dtype: int64 

validation set:
0    3447
1    1957
Name: future_flare, dtype: int64 

Test set:
0    4587
1    2361
Name: future_flare, dtype: int64 

Total:
0    15187
1     7973
Name: future_flare, dtype: int64 


X ## Train: (10808, 20) Val: (5404, 20) Test: (6948, 20)
y ## Train: (10808,) Val: (5404,) Test: (6948,)


In [7]:
model = make_model(X_train, initial_bias)

2023-03-25 11:53:05.596201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-25 11:53:05.596350: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-25 11:53:05.596419: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (5166cf34c918): /proc/driver/nvidia/version does not exist
2023-03-25 11:53:05.596883: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 40)               3520      
 l)                                                              
                                                                 
 dense (Dense)               (None, 30)                1230      
                                                                 
 dense_1 (Dense)             (None, 10)                310       
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
epochs = 20
batch_size = 32
# define callbacks
callbacks = [
    ModelCheckpoint(
        os.path.join("models", "LSTM_allTheta_checkpoint.h5"), save_weights_only=True, monitor="val_loss"
    ),
    EarlyStopping(monitor="val_loss", patience=3, verbose=1),
]
# fit model
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/20
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3f3c708eb0>

Validation set results

In [8]:
eval(model, X_val, y_val, X_test, y_test)

### Evaluation on validation set ###
Accuracy: 0.36
F1 score: 0.27
AUC: 0.5
[[   0 3447]
 [   0 1957]]

### Evaluation on test set ###
Accuracy: 0.34
F1 score: 0.25
AUC: 0.5
[[   0 4587]
 [   0 2361]]


# Interpolation

Same model as before, but now we are fitting only in the dataset coming from the generation with using just the extreme parameters: 

$\theta=0.01$ and $\theta=3$

and a fraction of the other dataset, coming from $\theta=0.1$ and $\theta=0.5$ as test set

In [9]:
p = 'theta'
# train configurations
theta_train_list     = [0.01, 3]
theta_train_list_idx = [params[p].index(t) for t in theta_train_list]
data_train = (grid_X[:,:,theta_train_list_idx,:,:,:], grid_y[:,:,theta_train_list_idx,:,:,:])
# test configuration
theta_test_list      = [0.1, 0.5]
theta_test_list_idx  = [params[p].index(t) for t in theta_test_list]
data_test = (grid_X[:,:,theta_test_list_idx,:,:,:], grid_y[:,:,theta_test_list_idx,:,:,:])

X_train, y_train, X_val, y_val, X_test, y_test, initial_bias = get_data_set(data_train, 'interpolation', data2=data_test)

Training set:
0    3758
1    1646
Name: future_flare, dtype: int64 

validation set:
0    1910
1     792
Name: future_flare, dtype: int64 

Test set:
0    2050
1    1424
Name: future_flare, dtype: int64 

Total:
0    7718
1    3862
Name: future_flare, dtype: int64 


X ## Train: (5404, 20) Val: (2702, 20) Test: (3474, 20)
y ## Train: (5404,) Val: (2702,) Test: (3474,)


In [10]:
model = make_model(X_train, initial_bias)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_1 (Bidirectio  (None, 40)               3520      
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 30)                1230      
                                                                 
 dense_4 (Dense)             (None, 10)                310       
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
epochs = 20
batch_size = 32
# define callbacks
callbacks = [
    ModelCheckpoint(
        os.path.join("models", "LSTM_intrpTheta_checkpoint.h5"), save_weights_only=True, monitor="val_loss"
    ),
    EarlyStopping(monitor="val_loss", patience=3, verbose=1),
]
# fit model
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/20
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 19: early stopping


<keras.callbacks.History at 0x7f261c34c340>

In [12]:
eval(model, X_val, y_val, X_test, y_test)

### Evaluation on validation set ###
Accuracy: 0.75
F1 score: 0.71
AUC: 0.7218460785869163
[[1501  409]
 [ 271  521]]

### Evaluation on test set ###
Accuracy: 0.75
F1 score: 0.73
AUC: 0.7287637023842148
[[1711  339]
 [ 537  887]]


# Extrapolation

Same model as before, but now we are fitting only in the dataset coming from the generation without using the extreme parameters: 

$\theta=0.1$ and $\theta=0.5$

In [13]:
p = 'theta'
# train configurations
theta_train_list     = [0.1, 0.5]
theta_train_list_idx = [params[p].index(t) for t in theta_train_list]
data_train = (grid_X[:,:,theta_train_list_idx,:,:,:], grid_y[:,:,theta_train_list_idx,:,:,:])
# test configuration
theta_test_list      = [0.01, 3]
theta_test_list_idx  = [params[p].index(t) for t in theta_test_list]
data_test = (grid_X[:,:,theta_test_list_idx,:,:,:], grid_y[:,:,theta_test_list_idx,:,:,:])

X_train, y_train, X_val, y_val, X_test, y_test, initial_bias = get_data_set(data_train, 'extrapolation', data2=data_test)

Training set:
0    3395
1    2009
Name: future_flare, dtype: int64 

validation set:
0    1537
1    1165
Name: future_flare, dtype: int64 

Test set:
0    2537
1     937
Name: future_flare, dtype: int64 

Total:
0    7469
1    4111
Name: future_flare, dtype: int64 


X ## Train: (5404, 20) Val: (2702, 20) Test: (3474, 20)
y ## Train: (5404,) Val: (2702,) Test: (3474,)


In [14]:
model = make_model(X_train, initial_bias)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (None, 40)               3520      
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 30)                1230      
                                                                 
 dense_7 (Dense)             (None, 10)                310       
                                                                 
 dense_8 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
epochs = 20
batch_size = 32
# define callbacks
callbacks = [
    ModelCheckpoint(
        os.path.join("models", "LSTM_extrpTheta_checkpoint.h5"), save_weights_only=True, monitor="val_loss"
    ),
    EarlyStopping(monitor="val_loss", patience=3, verbose=1),
]
# fit model
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 17: early stopping


<keras.callbacks.History at 0x7f26243e6fd0>

In [16]:
eval(model, X_val, y_val, X_test, y_test)

### Evaluation on validation set ###
Accuracy: 0.73
F1 score: 0.71
AUC: 0.7091516554460642
[[1358  179]
 [ 542  623]]

### Evaluation on test set ###
Accuracy: 0.75
F1 score: 0.67
AUC: 0.6630870585978532
[[2184  353]
 [ 501  436]]


## Further investigation in extrapolation with greater overlap

In [17]:
X_train, y_train, X_val, y_val, X_test, y_test, initial_bias = get_data_set(data_train, 'extrapolation', data2=data_test, overlap_size=19)

Training set:
0    16928
1     9980
Name: future_flare, dtype: int64 

validation set:
0    7656
1    5798
Name: future_flare, dtype: int64 

Test set:
0    12614
1     4684
Name: future_flare, dtype: int64 

Total:
0    37198
1    20462
Name: future_flare, dtype: int64 


X ## Train: (26908, 20) Val: (13454, 20) Test: (17298, 20)
y ## Train: (26908,) Val: (13454,) Test: (17298,)


In [18]:
model = make_model(X_train, initial_bias)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_3 (Bidirectio  (None, 40)               3520      
 nal)                                                            
                                                                 
 dense_9 (Dense)             (None, 30)                1230      
                                                                 
 dense_10 (Dense)            (None, 10)                310       
                                                                 
 dense_11 (Dense)            (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
epochs = 20
batch_size = 32
# define callbacks
callbacks = [
    ModelCheckpoint(
        os.path.join("models", "LSTM_extrpTheta19_checkpoint.h5"), save_weights_only=True, monitor="val_loss"
    ),
    EarlyStopping(monitor="val_loss", patience=3, verbose=1),
]
# fit model
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 11: early stopping


<keras.callbacks.History at 0x7f25fa9e9940>

In [20]:
eval(model, X_val, y_val, X_test, y_test)

### Evaluation on validation set ###
Accuracy: 0.74
F1 score: 0.71
AUC: 0.710636198372011
[[6875  781]
 [2764 3034]]

### Evaluation on test set ###
Accuracy: 0.76
F1 score: 0.66
AUC: 0.6478221438584296
[[11135  1479]
 [ 2750  1934]]


# Bibliography
\[1\] _On the distribution of fluxes of gamma-ray blazars: hints for a stochastic process?_, Tavecchio et al., [https://arxiv.org/pdf/2004.09149.pdf](https://arxiv.org/pdf/2004.09149.pdf)
<!-- cite with: [\[1\]](https://arxiv.org/pdf/2004.09149.pdf)  -->
\[2\] _Time Series Classification from Scratch with Deep Neural Networks: A Strong Baseline_, Wang et al., [https://arxiv.org/abs/1611.06455](https://arxiv.org/abs/1611.06455)
<!-- cite with: [\[2\]](https://arxiv.org/abs/1611.06455)  -->
\[3\] _Solar Flare Prediction Based on the Fusion of Multiple Deep-learning Models_, Tang et al., [https://iopscience.iop.org/article/10.3847/1538-4365/ac249e/meta](https://iopscience.iop.org/article/10.3847/1538-4365/ac249e/meta)
<!-- cite with: [\[3\]](https://iopscience.iop.org/article/10.3847/1538-4365/ac249e/meta)  -->
\[4\] _Predicting Solar Energetic Particles Using SDO/HMI Vector Magnetic Data Products and a Bidirectional LSTM Network_, Abduallah et al., [https://iopscience.iop.org/article/10.3847/1538-4365/ac5f56/meta](https://iopscience.iop.org/article/10.3847/1538-4365/ac5f56/meta)
<!-- cite with: [\[4\]](https://iopscience.iop.org/article/10.3847/1538-4365/ac5f56/meta) -->