In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional
from keras.callbacks import ModelCheckpoint, EarlyStopping

from Libs.config import inter_extra_data_folder
from Libs.load_data import DataLoader, get_dataset_split
from Libs.keras_f1score import f1_m

2023-03-21 20:44:23.775242: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-21 20:44:24.058435: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-21 20:44:24.058496: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-21 20:44:25.284612: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
F_std = False

# initialize data loader
data_loader = DataLoader(run=30, N=1000, s=0.5, t=[0.01, 0.1, 0.5, 3], d=0.2, m=1, 
                         override=False, folder=inter_extra_data_folder)
# get the grid
grid_X, grid_y = data_loader.get_grid()
# get params dictionary
params = data_loader.get_params()

grid_X.shape, grid_y.shape

((30, 1, 4, 1, 1, 1000), (30, 1, 4, 1, 1, 1000))

A biggest assumption when training ANNs is the following: 

"We assume that training sets and test sets contains independent and identically distributed samples from the same unknown distribution $p_{data}(x,y)$"

This is a very important assumption that in general affect the performance ANNs, in particular classifier ones. We could, indeed, explore what can happen if we violete the following assumption. This a relevant application case, for exaple in cases when the generation parameters are not known.

# LSTM model with multiple all theta parameters

Let's start seeing what is going to happen with training and testing the NN with all the configurations of theta parameters

In [3]:
dataset_split_params = {
    'window_size': 20, # how large is the window
    'overlap_size': 15, # how many time interval of overlap there is between the windows
    'label_treshold': 1, # how many labels have to be at 1 in the window_size to consider the current window as a flare
    'split_on_run': True, # if True the windows of a run cannot be on different sets
    'shuffle_run': False, # if True shuffles the order of the runs before computing the windows
    'shuffle_window': False, # if True shuffles the order of the windows in the resulting dataframes
    'test_size': 0.3, # size of the test set expressed in percentage
    'val_size': 0.2, # size of the validation set expressed in percentage, considered only if get_validation is True
    'get_validation': True, # if True the output would be train,val,test set, otherwise it would be train,test
    'random_state': 42 # sets the seed for reproducibility
}
df_train,df_val,df_test = get_dataset_split(grid_X, grid_y, **dataset_split_params)
df_train.shape, df_val.shape, df_test.shape

((10976, 21), (5488, 21), (7056, 21))

This cell shows the amounts of class 0 and 1 for each data set

In [4]:
# number of classes
print('Training set:')
train_counts = df_train['future_flare'].value_counts()
print(train_counts, '\n')
print('validation set:')
val_counts = df_val['future_flare'].value_counts()
print(val_counts, '\n')
print('Test set:')
test_counts = df_test['future_flare'].value_counts()
print(test_counts, '\n')
print('Total:')
total_counts = train_counts.add(val_counts).add(test_counts)
print(total_counts, '\n')

## Training set:
0    7197
1    3779
Name: future_flare, dtype: int64 

## Validation set:
0    3564
1    1924
Name: future_flare, dtype: int64 

## Test set:
0    4547
1    2509
Name: future_flare, dtype: int64 



In [5]:
X_train, y_train = df_train.iloc[:,:-1].to_numpy(), df_train.future_flare.to_numpy()
X_val, y_val = df_val.iloc[:,:-1].to_numpy(), df_val.future_flare.to_numpy()
X_test, y_test = df_test.iloc[:,:-1].to_numpy(), df_test.future_flare.to_numpy()
X = np.vstack((X_train, X_val, X_test))
y = np.hstack((y_train, y_val, y_test))
print('X ## Train:', X_train.shape, 'Val:', X_val.shape, 'Test:', X_test.shape)
print('y ## Train:', y_train.shape, 'Val:', y_val.shape, 'Test:', y_test.shape)

X ## Train: (10976, 20) Val: (5488, 20) Test: (7056, 20)
y ## Train: (10976,) Val: (5488,) Test: (7056,)


In [6]:
if F_std:
    # Standardize Data
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_val_std = scaler.transform(X_val)
    X_test_std = scaler.transform(X_test)
    # get automatically the number of classes
    num_classes = len(np.unique(y))
else:
    X_train_std = X_train
    X_val_std = X_val
    X_test_std = X_test

Let's construct now the model

In [7]:
initial_bias = np.log([train_counts[1]/train_counts[0]])
# define model
model = Sequential()
model.add(Bidirectional(LSTM(20, activation='relu'), input_shape=(X_train_std.shape[1], 1)))
model.add(Dense(30, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid',bias_initializer=initial_bias))
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=[f1_m, 'accuracy'])

print(model.summary())

2023-03-21 20:44:27.409507: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-21 20:44:27.409613: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-21 20:44:27.409672: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (44910f15382a): /proc/driver/nvidia/version does not exist
2023-03-21 20:44:27.410626: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 40)               3520      
 l)                                                              
                                                                 
 dense (Dense)               (None, 30)                1230      
                                                                 
 dense_1 (Dense)             (None, 10)                310       
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
epochs = 20
batch_size = 32
# define callbacks
callbacks = [
    ModelCheckpoint(
        os.path.join("models", "LSTM_allTheta_checkpoint.h5"), save_weights_only=True, monitor="val_loss"
    ),
    EarlyStopping(monitor="val_loss", patience=3, verbose=1),
]
# fit model
model.fit(
    X_train_std,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=(X_val_std, y_val),
    verbose=1,
)

Epoch 1/20
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f792c7e94f0>

Validation set results

In [9]:
y_pred = np.round(model.predict(X_val_std), 0)

print("### Evaluation on validation set ###")
print("Accuracy: %.2f" % (accuracy_score(y_pred, y_val)))
print("F1 score: %.2f" % (f1_score(y_pred, y_val, average='macro')))
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_val, y_pred)
print(result)

### Evaluation on validation set ###
Accuracy: 0.93
F1 score: 0.92
[[3480   84]
 [ 311 1613]]


Test results

In [10]:
y_pred = np.round(model.predict(X_test_std), 0)

print("### Evaluation on test set ###")
print("Accuracy: %.2f" % (accuracy_score(y_pred, y_test)))
print("F1 score: %.2f" % (f1_score(y_pred, y_test, average='macro')))
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_test, y_pred)
print(result)

### Evaluation on test set ###
Accuracy: 0.92
F1 score: 0.92
[[4432  115]
 [ 418 2091]]


# Interpolation

Same model as before, but now we are fitting only in the dataset coming from the generation with using just the extreme parameters: 

$\theta=0.01$ and $\theta=3$

and a fraction of the other dataset, coming from $\theta=0.1$ and $\theta=0.5$ as test set

In [11]:
p = 'theta'
theta_train_list     = [0.01, 3]
theta_train_list_idx = [params[p].index(t) for t in theta_train_list]
theta_test_list      = [0.1, 0.5]
theta_test_list_idx  = [params[p].index(t) for t in theta_test_list]
# params commons
dataset_split_params = {
    'window_size': 20, # how large is the window
    'overlap_size': 15, # how many time interval of overlap there is between the windows
    'label_treshold': 1, # how many labels have to be at 1 in the window_size to consider the current window as a flare
    'split_on_run': True, # if True the windows of a run cannot be on different sets
    'shuffle_run': False, # if True shuffles the order of the runs before computing the windows
    'shuffle_window': False, # if True shuffles the order of the windows in the resulting dataframes
    'get_validation': True, # if True the output would be train,val,test set, otherwise it would be train,test
    'random_state': 42 # sets the seed for reproducibility
}
# params for training and validation set
train_split = {
    'test_size': 0, # size of the test set expressed in percentage
    'val_size': 0.2 # size of the validation set expressed in percentage, considered only if get_validation is True
    }
dataset_split_params_train = {**dataset_split_params, **train_split}
# params for test set
test_split =  {
    'test_size': 0.3, # size of the test set expressed in percentage
    'val_size': 0 # size of the validation set expressed in percentage, considered only if get_validation is True
}                            
dataset_split_params_test  = {**dataset_split_params, **test_split}

In [12]:
# get the train and validation set, selecting the index for grid given the interpolation assuption
# notice that theta is the third parameter
df_train, df_val, _ = get_dataset_split(grid_X[:,:,theta_train_list_idx,:,:,:], 
                                        grid_y[:,:,theta_train_list_idx,:,:,:], 
                                        **dataset_split_params_train)
# get the test set, selecting the index for grid given the interpolation assuption
# notice that theta is the third parameter
_, _, df_test = get_dataset_split(grid_X[:,:,theta_test_list_idx,:,:,:], 
                                  grid_y[:,:,theta_test_list_idx,:,:,:], 
                                  **dataset_split_params_test)
df_train.shape, df_val.shape, df_test.shape

((9408, 21), (2352, 21), (3528, 21))

In [13]:
# number of classes
print('Training set:')
train_counts = df_train['future_flare'].value_counts()
print(train_counts, '\n')
print('validation set:')
val_counts = df_val['future_flare'].value_counts()
print(val_counts, '\n')
print('Test set:')
test_counts = df_test['future_flare'].value_counts()
print(test_counts, '\n')
print('Total:')
total_counts = train_counts.add(val_counts).add(test_counts)
print(total_counts, '\n')

## Training set:
0    6703
1    2705
Name: future_flare, dtype: int64 

## Validation set:
0    1680
1     672
Name: future_flare, dtype: int64 

## Test set:
0    2042
1    1486
Name: future_flare, dtype: int64 



In [14]:
X_train, y_train = df_train.iloc[:,:-1].to_numpy(), df_train.future_flare.to_numpy()
X_val, y_val = df_val.iloc[:,:-1].to_numpy(), df_val.future_flare.to_numpy()
X_test, y_test = df_test.iloc[:,:-1].to_numpy(), df_test.future_flare.to_numpy()
X = np.vstack((X_train, X_val, X_test))
y = np.hstack((y_train, y_val, y_test))
print('X ## Train:', X_train.shape, '\n     Val:', X_val.shape, '\n     Test:', X_test.shape)
print('y ## Train:', y_train.shape, '\n     Val:', y_val.shape, '\n     Test:', y_test.shape)

X ## Train: (9408, 20) 
     Val: (2352, 20) 
     Test: (3528, 20)
y ## Train: (9408,) 
     Val: (2352,) 
     Test: (3528,)


In [15]:
if F_std:
    # Standardize Data
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_val_std = scaler.transform(X_val)
    X_test_std = scaler.transform(X_test)
    # get automatically the number of classes
    num_classes = len(np.unique(y))
else:
    X_train_std = X_train
    X_val_std = X_val
    X_test_std = X_test

In [16]:
initial_bias = np.log([train_counts[1]/train_counts[0]])
# define model
model = Sequential()
model.add(Bidirectional(LSTM(20, activation='relu'), input_shape=(X_train_std.shape[1], 1)))
model.add(Dense(30, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid',bias_initializer=initial_bias))
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=[f1_m, 'accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_1 (Bidirectio  (None, 40)               3520      
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 30)                1230      
                                                                 
 dense_4 (Dense)             (None, 10)                310       
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
epochs = 20
batch_size = 32
# define callbacks
callbacks = [
    ModelCheckpoint(
        os.path.join("models", "LSTM_intrpTheta_checkpoint.h5"), save_weights_only=True, monitor="val_loss"
    ),
    EarlyStopping(monitor="val_loss", patience=3, verbose=1),
]
# fit model
model.fit(
    X_train_std,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=(X_val_std, y_val),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f79354ff550>

Validation results

In [18]:
y_pred = np.round(model.predict(X_val_std), 0)

print("### Evaluation on validation set ###")
print("Accuracy: %.2f" % (accuracy_score(y_pred, y_val)))
print("F1 score: %.2f" % (f1_score(y_pred, y_val, average='macro')))
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_val, y_pred)
print(result)

### Evaluation on validation set ###
Accuracy: 0.91
F1 score: 0.89
[[1601   79]
 [ 131  541]]


Test results

In [19]:
y_pred = np.round(model.predict(X_test_std), 0)

print("### Evaluation on test set ###")
print("Accuracy: %.2f" % (accuracy_score(y_pred, y_test)))
print("F1 score: %.2f" % (f1_score(y_pred, y_test, average='macro')))
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_test, y_pred)
print(result)

### Evaluation on test set ###
Accuracy: 0.92
F1 score: 0.92
[[1995   47]
 [ 220 1266]]


Results are still similar to the standard case

# Extrapolation

Same model as before, but now we are fitting only in the dataset coming from the generation without using the extreme parameters: 

$\theta=0.1$ and $\theta=0.5$

In [20]:
p = 'theta'
theta_train_list     = [0.1, 0.5]
theta_train_list_idx = [params[p].index(t) for t in theta_train_list]
theta_test_list      = [0.01, 3]
theta_test_list_idx  = [params[p].index(t) for t in theta_test_list]

# params commons
dataset_split_params = {
    'window_size': 20, # how large is the window
    'overlap_size': 15, # how many time interval of overlap there is between the windows
    'label_treshold': 1, # how many labels have to be at 1 in the window_size to consider the current window as a flare
    'split_on_run': True, # if True the windows of a run cannot be on different sets
    'shuffle_run': False, # if True shuffles the order of the runs before computing the windows
    'shuffle_window': False, # if True shuffles the order of the windows in the resulting dataframes
    'get_validation': True, # if True the output would be train,val,test set, otherwise it would be train,test
    'random_state': 42 # sets the seed for reproducibility
}
# params for training and validation set
train_split = {
    'test_size': 0, # size of the test set expressed in percentage
    'val_size': 0.2 # size of the validation set expressed in percentage, considered only if get_validation is True
    }
dataset_split_params_train = {**dataset_split_params, **train_split}
# params for test set
test_split =  {
    'test_size': 0.3, # size of the test set expressed in percentage
    'val_size': 0 # size of the validation set expressed in percentage, considered only if get_validation is True
}                            
dataset_split_params_test  = {**dataset_split_params, **test_split}

In [21]:
# get the train and validation set, selecting the index for grid given the interpolation assuption
# notice that theta is the third parameter
df_train, df_val, _ = get_dataset_split(grid_X[:,:,theta_train_list_idx,:,:,:], 
                                        grid_y[:,:,theta_train_list_idx,:,:,:], 
                                        **dataset_split_params_train)
# get the test set, selecting the index for grid given the interpolation assuption
# notice that theta is the third parameter
_, _, df_test = get_dataset_split(grid_X[:,:,theta_test_list_idx,:,:,:], 
                                  grid_y[:,:,theta_test_list_idx,:,:,:], 
                                  **dataset_split_params_test)
df_train.shape, df_val.shape, df_test.shape

((9408, 21), (2352, 21), (3528, 21))

In [22]:
# number of classes
print('Training set:')
train_counts = df_train['future_flare'].value_counts()
print(train_counts, '\n')
print('validation set:')
val_counts = df_val['future_flare'].value_counts()
print(val_counts, '\n')
print('Test set:')
test_counts = df_test['future_flare'].value_counts()
print(test_counts, '\n')
print('Total:')
total_counts = train_counts.add(val_counts).add(test_counts)
print(total_counts, '\n')

## Training set:
0    5557
1    3851
Name: future_flare, dtype: int64 

## Validation set:
0    1368
1     984
Name: future_flare, dtype: int64 

## Test set:
0    2505
1    1023
Name: future_flare, dtype: int64 



In [23]:
X_train, y_train = df_train.iloc[:,:-1].to_numpy(), df_train.future_flare.to_numpy()
X_val, y_val = df_val.iloc[:,:-1].to_numpy(), df_val.future_flare.to_numpy()
X_test, y_test = df_test.iloc[:,:-1].to_numpy(), df_test.future_flare.to_numpy()
X = np.vstack((X_train, X_val, X_test))
y = np.hstack((y_train, y_val, y_test))
print('X ## Train:', X_train.shape, 'Val:', X_val.shape, 'Test:', X_test.shape)
print('y ## Train:', y_train.shape, 'Val:', y_val.shape, 'Test:', y_test.shape)

X ## Train: (9408, 20) Val: (2352, 20) Test: (3528, 20)
y ## Train: (9408,) Val: (2352,) Test: (3528,)


In [24]:
if F_std:
    # Standardize Data
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_val_std = scaler.transform(X_val)
    X_test_std = scaler.transform(X_test)
    # get automatically the number of classes
    num_classes = len(np.unique(y))
else:
    X_train_std = X_train
    X_val_std = X_val
    X_test_std = X_test

Same model as before

In [25]:
initial_bias = np.log([train_counts[1]/train_counts[0]])
# define model
model = Sequential()
model.add(Bidirectional(LSTM(20, activation='relu'), input_shape=(X_train_std.shape[1], 1)))
model.add(Dense(30, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid',bias_initializer=initial_bias))
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=[f1_m, 'accuracy'])

print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (None, 40)               3520      
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 30)                1230      
                                                                 
 dense_7 (Dense)             (None, 10)                310       
                                                                 
 dense_8 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
epochs = 20
batch_size = 32
# define callbacks
callbacks = [
    ModelCheckpoint(
        os.path.join("models", "LSTM_extrpTheta_checkpoint.h5"), save_weights_only=True, monitor="val_loss"
    ),
    EarlyStopping(monitor="val_loss", patience=3, verbose=1),
]
# fit model
model.fit(
    X_train_std,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=(X_val_std, y_val),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 16: early stopping


<keras.callbacks.History at 0x7f78f3778850>

Validation results

In [27]:
y_pred = np.round(model.predict(X_val_std), 0)

print("### Evaluation on validation set ###")
print("F1 score: %.2f" % (f1_score(y_val, y_pred, average='macro')))
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_val, y_pred)
print(result)

### Evaluation on validation set ###
F1 score: 0.89
[[1283   85]
 [ 165  819]]


Test results

In [28]:
y_pred = np.round(model.predict(X_test_std), 0)

print("### Evaluation on test set ###")
print("F1 score: %.2f" % (f1_score(y_test, y_pred, average='macro')))
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_test, y_pred)
print(result)

### Evaluation on test set ###
F1 score: 0.82
[[2156  349]
 [ 187  836]]
