In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Bidirectional
from itertools import product
from tqdm import tqdm


from Libs.load_data import DataLoader
from Libs.threshold import get_labels_physic

2023-03-04 18:23:57.680476: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 18:23:58.053698: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-04 18:23:58.053778: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-04 18:24:00.155305: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [3]:
# initialize data loader
data_loader = DataLoader()
# get the grid
grid_X = data_loader.get_grid()
# get params dictionary
params = data_loader.get_params()
# get physic labels
grid_y = get_labels_physic(grid_X, params, alpha=2)

params

{'run': 30,
 'sigma': [0.3, 0.4, 0.5, 0.6],
 'theta': [0.01, 0.1, 0.5, 3],
 'mu': [0.8, 0.9, 1, 1.1],
 'delta': [0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7],
 'N': 1000}

In [4]:
run = 0
test_size = 0.33
random_state = 42

# select best params
idx = data_loader.get_standard_indexes()

In [5]:
# TODO: parametrizzare split on run e shuffle
def get_dataset_split(grid_x, grid_y, idx, test_size = 0.33, window_size = 10, overlap_size = 0):
    def build_df(X_configuration, y_configuration, window_size=window_size, overlap_size=overlap_size, label_treshold = 1):
        stride = window_size - overlap_size
        num_windows = (X_configuration.shape[-1]-window_size)//stride + 1

        windows = np.zeros((X_configuration.shape[0]*(num_windows-1),window_size))
        windows_label = np.zeros((y_configuration.shape[0]*(num_windows-1),window_size), dtype='bool')


        for i in range(X_configuration.shape[0]):
            tmp_windows = np.array([X_configuration[i,j:j+window_size] for j in range(0,stride*num_windows,stride)])
            tmp_windows_labels = np.array([y_configuration[i,j:j+window_size] for j in range(0,stride*num_windows,stride)])
            windows[i*(num_windows-1):(i+1)*(num_windows-1)] = tmp_windows[:-1,:]
            windows_label[i*(num_windows-1):(i+1)*(num_windows-1)] = tmp_windows_labels[1:,:]

        windows_label = np.sum(windows_label, axis=-1)
        windows_label[windows_label<label_treshold] = 0
        windows_label[windows_label>=label_treshold] = 1

        df = pd.DataFrame(windows, columns=[f't_{i}' for i in range(windows.shape[-1])]).sample(frac=1)
        y_df = pd.DataFrame({'future_flare':windows_label})
        df = pd.concat([df, y_df], axis=1)

        return df

    run_test_index = int((1-test_size) * params['run'])

    # build the dataframe
    X_configuration = []
    y_configuration = []
    # for s, t, d, m in tqdm(product(params['sigma'], params['theta'], params['delta'], params['mu'])):
    for s, t, d, m in tqdm(product(params['sigma'], params['theta'], [0.2], [1])):
        ti = params['theta'].index(t)
        mi = params['mu'].index(m)
        si = params['sigma'].index(s)
        di = params['delta'].index(d)
        X_configuration.append(grid_X[:run_test_index, ti, mi, si, di, :])
        y_configuration.append(grid_y[:run_test_index, ti, mi, si, di, :])

    X_configuration = np.hstack(X_configuration)
    y_configuration = np.hstack(y_configuration)
    # df training
    df_train = build_df(X_configuration, y_configuration)

    # build the dataframe
    X_configuration = []
    y_configuration = []
    # for s, t, d, m in tqdm(product(params['sigma'], params['theta'], params['delta'], params['mu'])):
    for s, t, d, m in tqdm(product(params['sigma'], params['theta'], [0.2], [1])):
        ti = params['theta'].index(t)
        mi = params['mu'].index(m)
        si = params['sigma'].index(s)
        di = params['delta'].index(d)
        X_configuration.append(grid_X[run_test_index:, ti, mi, si, di, :])
        y_configuration.append(grid_y[run_test_index:, ti, mi, si, di, :])
    X_configuration = np.hstack(X_configuration)
    y_configuration = np.hstack(y_configuration)
    # df test
    df_test = build_df(X_configuration, y_configuration)

    return df_train, df_test

df_train, df_test = get_dataset_split(grid_X, grid_y, idx, window_size=20, overlap_size=18)

16it [00:00, 18020.64it/s]
16it [00:00, 29799.67it/s]


In [6]:
# number of classes
print('Training set:')
print(df_train['future_flare'].value_counts(), '\n')
print('Test set:')
print(df_test['future_flare'].value_counts(), '\n')

Training set:
0    119858
1     39942
Name: future_flare, dtype: int64 

Test set:
0    60062
1    19838
Name: future_flare, dtype: int64 



In [7]:
# extract X and y from training dataframe
X_train = df_train.drop(['future_flare'], axis=1).to_numpy()
y_train = df_train['future_flare'].to_numpy()

# extract X and y from test dataframe
X_test = df_test.drop(['future_flare'], axis=1).to_numpy()
y_test = df_test['future_flare'].to_numpy()

print('Train shape:',len(X_train), len(y_train))
print('Test shape:  ',len(X_test), '', len(y_test))

Train shape: 159800 159800
Test shape:   79900  79900


In [8]:
print('Train:', X_train.shape, y_train.shape)
print('Test: ', X_test.shape, y_test.shape)

Train: (159800, 20) (159800,)
Test:  (79900, 20) (79900,)


In [9]:
# define model
model = Sequential()
model.add(Bidirectional(LSTM(20, activation='relu'), input_shape=(X_train.shape[1], 1)))
model.add(Dense(30, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_m, 'accuracy'])

# Calculate the weights for each class so that we can balance the data
print(model.summary())

2023-03-04 18:24:04.972308: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-04 18:24:04.972616: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-04 18:24:04.972693: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (83dc2cdd3c94): /proc/driver/nvidia/version does not exist
2023-03-04 18:24:04.973521: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 40)               3520      
 l)                                                              
                                                                 
 dense (Dense)               (None, 30)                1230      
                                                                 
 dense_1 (Dense)             (None, 10)                310       
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# fit model
model.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb6585d37c0>

In [11]:
scores = model.evaluate(X_test, y_test, verbose=0)
y_pred = np.round(model.predict(X_test), 0)

print("### Evaluation on test set ###")
print("Accuracy: %.2f" % (accuracy_score(y_pred, y_test)))
print("F1 score: %.2f" % (f1_score(y_pred, y_test)))
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_test, y_pred)
print(result)


### Evaluation on test set ###
Accuracy: 0.91
F1 score: 0.80
[[57191  2871]
 [ 4683 15155]]


```
154/154 [==============================] - 2s 9ms/step
### Evaluation on test set ###
Accuracy: 1.00
F1 score: 0.94
[[4728    5]
 [  13  154]]
```