In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import tensorflow as tf

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Bidirectional


from Libs.load_data import DataLoader
from Libs import flares_plot as fplt
from Libs.threshold import get_labels_physic

In [28]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops: true positives / actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting, so
    probabilities in y_pred are thresholded by rounding. K.epsilon() in the
    denominator avoids division by zero when there are no positive targets.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops: true positives / predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting, so
    probabilities in y_pred are thresholded by rounding. K.epsilon() in the
    denominator avoids division by zero when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m on the given tensors.

    K.epsilon() keeps the ratio finite when precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio


In [29]:
# Instantiate the project data loader (Libs.load_data.DataLoader).
data_loader = DataLoader()
# Fetch the simulation grid; later cells index it as
# grid_X[run, i0, i1, i2, i3, time].
grid_X = data_loader.get_grid()
# Dictionary describing the grid ('run', 'sigma', 'theta', 'mu', 'delta', 'N').
params = data_loader.get_params()
# Per-sample labels from the physics-based threshold helper.
# NOTE(review): the meaning of alpha=2 is defined in Libs.threshold — confirm.
grid_y = get_labels_physic(grid_X, params, alpha=2)

# Last expression of the cell: display the parameter dictionary.
params

{'run': 30,
 'sigma': [0.3, 0.4, 0.5, 0.6],
 'theta': [0.01, 0.1, 0.5, 3],
 'mu': [0.8, 0.9, 1, 1.1],
 'delta': [0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7],
 'N': 1000}

In [30]:
# Experiment configuration.
# NOTE(review): `run` is not referenced by any later cell visible here — confirm it is still needed.
run = 0
# Fraction of runs held out for the test set
test_size = 0.33
random_state = 42

# Seed the NumPy and TensorFlow global RNGs so that the DataFrame shuffle
# (pandas falls back to NumPy's global RNG) and the Keras weight
# initialisation / batch shuffling are repeatable across "Restart & Run All".
# Some GPU kernels can still be non-deterministic.
np.random.seed(random_state)
tf.random.set_seed(random_state)

# Grid indexes of the configuration to study
# NOTE(review): presumably the "best"/standard parameter combination — confirm in DataLoader.
idx = data_loader.get_standard_indexes()

In [31]:
# TODO: make the run-based split strategy itself configurable
def get_dataset_split(grid_x, grid_y, idx, test_size = 0.33, window_size = 10, overlap_size = 0, random_state = None):
    """Build shuffled train/test DataFrames of sliding windows for one grid configuration.

    The grids are indexed as ``grid[run, idx[0], idx[1], idx[2], idx[3], time]``.
    The first ``int((1 - test_size) * n_runs)`` runs form the training set and
    the remaining runs form the test set, so no run contributes to both.

    Each row holds one window of ``window_size`` consecutive samples
    (columns ``t_0`` .. ``t_{window_size-1}``) plus a binary ``future_flare``
    column that is 1 when the *next* window (shifted by
    ``window_size - overlap_size`` samples) contains at least one positive label.

    NOTE: when ``overlap_size > 0`` the "next" window shares ``overlap_size``
    samples with the feature window, so the target is partly determined by
    samples the model already sees.

    Parameters
    ----------
    grid_x : array indexable as described above (time series values).
    grid_y : array with the same layout as ``grid_x`` (per-sample labels).
    idx : sequence of four indexes selecting the configuration.
    test_size : fraction of runs assigned to the test set.
    window_size : number of samples per window.
    overlap_size : number of samples shared by two consecutive windows.
    random_state : optional seed forwarded to ``DataFrame.sample`` for a
        reproducible shuffle; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    (df_train, df_test) : tuple of pandas DataFrames with shuffled rows.

    Raises
    ------
    ValueError
        If ``window_size < 1``, ``overlap_size >= window_size`` or the series
        is shorter than one window.
    """
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')
    if overlap_size >= window_size:
        raise ValueError(
            f'overlap_size ({overlap_size}) must be smaller than window_size ({window_size})'
        )

    stride = window_size - overlap_size

    def build_df(X_configuration, y_configuration, label_threshold=1):
        # Author's notes: F1 ~ 87% on the test set with window_size=100, overlap_size=90;
        # a second experiment reached F1 ~ 80% on the test set (settings not recorded).
        X_configuration = np.asarray(X_configuration, dtype=float)
        y_configuration = np.asarray(y_configuration).astype(bool)

        n_runs = X_configuration.shape[0]
        series_len = X_configuration.shape[-1]
        if series_len < window_size:
            raise ValueError(
                f'series length ({series_len}) is shorter than window_size ({window_size})'
            )
        num_windows = (series_len - window_size) // stride + 1

        # (num_windows, window_size) matrix of time indexes, one row per window
        starts = np.arange(num_windows) * stride
        time_idx = starts[:, None] + np.arange(window_size)[None, :]
        X_windows = X_configuration[:, time_idx]  # (n_runs, num_windows, window_size)
        y_windows = y_configuration[:, time_idx]

        # Features: every window except the last one (it has no successor).
        n_rows = n_runs * (num_windows - 1)
        features = X_windows[:, :-1, :].reshape(n_rows, window_size)
        # Target: number of positive samples in the following window, binarised.
        flare_counts = y_windows[:, 1:, :].sum(axis=-1).reshape(n_rows)
        labels = (flare_counts >= label_threshold).astype(int)

        df = pd.DataFrame(features, columns=[f't_{i}' for i in range(window_size)])
        # Attach the target BEFORE shuffling so features and labels stay paired
        # without relying on index alignment.
        df['future_flare'] = labels
        return df.sample(frac=1, random_state=random_state)

    # Split on whole runs, using the grid that was actually passed in.
    run_test_index = int((1 - test_size) * grid_x.shape[0])

    # training runs
    X_configuration = grid_x[:run_test_index, idx[0], idx[1], idx[2], idx[3], :]
    y_configuration = grid_y[:run_test_index, idx[0], idx[1], idx[2], idx[3], :]
    df_train = build_df(X_configuration, y_configuration)

    # test runs
    X_configuration = grid_x[run_test_index:, idx[0], idx[1], idx[2], idx[3], :]
    y_configuration = grid_y[run_test_index:, idx[0], idx[1], idx[2], idx[3], :]
    df_test = build_df(X_configuration, y_configuration)

    return df_train, df_test

# Build the train/test window DataFrames for the selected configuration:
# 20-sample windows with an 18-sample overlap between consecutive windows.
df_train, df_test = get_dataset_split(grid_X, grid_y, idx, window_size=20, overlap_size=18)

In [32]:
# Class balance of the binary 'future_flare' target in each split
for header, frame in (('Training set:', df_train), ('Test set:', df_test)):
    print(header)
    print(frame['future_flare'].value_counts(), '\n')

Training set:
0    8873
1     927
Name: future_flare, dtype: int64 

Test set:
0    4733
1     167
Name: future_flare, dtype: int64 



In [33]:
# Training arrays: features are every column except the target, target is 'future_flare'
X_train, y_train = (
    df_train.drop(columns=['future_flare']).to_numpy(),
    df_train['future_flare'].to_numpy(),
)

# Test arrays, built the same way
X_test, y_test = (
    df_test.drop(columns=['future_flare']).to_numpy(),
    df_test['future_flare'].to_numpy(),
)

# Row counts only (not full shapes) for a quick consistency check
print('Train shape:',len(X_train), len(y_train))
print('Test shape:  ',len(X_test), '', len(y_test))

Train shape: 9800 9800
Test shape:   4900  4900


In [34]:
# model params
# NOTE(review): n_steps is not referenced by any later cell visible here — confirm it is still needed.
n_steps = 3
# Number of features per time step (the models below use a last input dimension of 1)
n_features = 1

In [35]:
# Legacy random train/test split, kept disabled for reference; the arrays
# used below come from the run-based split in get_dataset_split instead.
# X_reshaped = X.reshape((X.shape[0], X.shape[1], n_features))
# X_reshaped = X
# X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=test_size, random_state=random_state)
# Sanity-check the array shapes that will be fed to the model.
print('Train:', X_train.shape, y_train.shape)
print('Test: ', X_test.shape, y_test.shape)

Train: (9800, 20) (9800,)
Test:  (4900, 20) (4900,)


In [36]:
# define model: one bidirectional LSTM over the window followed by a small dense head
model = Sequential()
# Alternative tried: stacked variant with return_sequences=True
# model.add(Bidirectional(LSTM(30, return_sequences=True), input_shape=(X_train.shape[1], 1)))
# input_shape is (timesteps, 1): each window is consumed as a sequence with one feature per step
model.add(Bidirectional(LSTM(20, activation='relu'), input_shape=(X_train.shape[1], 1)))
# model.add(LSTM(100))
model.add(Dense(30, activation='relu'))
model.add(Dense(10, activation='relu'))
# Single sigmoid unit -> probability of the positive ('future_flare') class
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_m, 'accuracy'])
# model.compile(loss='binary_crossentropy', optimizer='RMSProp', metrics=[f1_m, 'accuracy'])

# Calculate the weights for each class so that we can balance the data
# (disabled; class weights are currently NOT applied during training)
# weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that appended a stray "None" line to the output).
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_1 (Bidirectio  (None, 40)               3520      
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 30)                1230      
                                                                 
 dense_7 (Dense)             (None, 10)                310       
                                                                 
 dense_8 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,071
Trainable params: 5,071
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
# fit model: 20 epochs on the training windows; batch size and shuffling are left
# at the Keras defaults, and no validation data or class weights are passed.
model.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa34f3d0940>

In [38]:
# Loss followed by the compiled metrics (f1_m, accuracy) on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
# Threshold the sigmoid outputs at 0.5 (same decision as rounding: exactly 0.5 maps to 0)
# and flatten the (n, 1) column to (n,) so it matches y_test.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

print("### Evaluation on test set ###")
# scikit-learn metrics take (y_true, y_pred) in that order
print("Accuracy: %.2f" % (accuracy_score(y_test, y_pred)))
print("F1 score: %.2f" % (f1_score(y_test, y_pred)))
# Raw (un-normalized) confusion matrix: rows = true class, columns = predicted class
result = confusion_matrix(y_test, y_pred)
print(result)


### Evaluation on test set ###
Accuracy: 1.00
F1 score: 0.95
[[4729    4]
 [  12  155]]


```
154/154 [==============================] - 2s 9ms/step
### Evaluation on test set ###
Accuracy: 1.00
F1 score: 0.94
[[4728    5]
 [  13  154]]
```

In [26]:
# NOTE(review): ad-hoc inspection of the raw sigmoid outputs. This cell's execution
# count (In [26]) is lower than the cells above it, so its saved output is likely
# stale — re-run after training or remove the cell.
model.predict(X_test)



array([1.], dtype=float32)

### CNN-LSTM

In [13]:
# Import every layer from the public keras.layers namespace. The
# keras.layers.convolutional submodule is an internal path that is not
# available in newer Keras releases, while keras.layers exposes the same classes.
from keras.layers import TimeDistributed, Flatten
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

In [18]:
# Rebuild the 2-D training arrays from the DataFrame
# (features = all columns except the target, target = 'future_flare')
X_train, y_train = (
    df_train.drop(columns=['future_flare']).to_numpy(),
    df_train['future_flare'].to_numpy(),
)

# Same for the test arrays
X_test, y_test = (
    df_test.drop(columns=['future_flare']).to_numpy(),
    df_test['future_flare'].to_numpy(),
)

print('Train shape:',X_train.shape, y_train.shape)
print('Test shape:  ',X_test.shape, '', y_test.shape)

Train shape: (9800, 20) (9800,)
Test shape:   (4900, 20)  (4900,)


In [19]:
n_features = 1
n_seq = 1
# NOTE(review): n_steps is not referenced by any later cell visible here — confirm it is still needed.
n_steps = 4
# Reshape to (samples, n_seq, timesteps, n_features) as expected by the
# TimeDistributed CNN front-end. The timesteps axis is inferred with -1 so the
# cell is idempotent: re-running it on the already 4-D arrays is a no-op
# instead of raising a reshape error.
X_train = X_train.reshape((X_train.shape[0], n_seq, -1, n_features))
X_test = X_test.reshape((X_test.shape[0], n_seq, -1, n_features))
timesteps = X_train.shape[2]

print('Train:', X_train.shape, y_train.shape)
print('Test: ', X_test.shape, y_test.shape)

Train: (9800, 1, 20, 1) (9800,)
Test:  (4900, 1, 20, 1) (4900,)


In [20]:
# CNN-LSTM: a per-subsequence Conv1D/MaxPooling feature extractor (wrapped in
# TimeDistributed), flattened and fed to an LSTM, then the same dense head as
# the first model.
model2 = Sequential([
    TimeDistributed(
        Conv1D(filters=16, kernel_size=1, activation='relu'),
        input_shape=(None, X_train.shape[2], n_features),
    ),
    TimeDistributed(MaxPooling1D(pool_size=4)),
    TimeDistributed(Flatten()),
    LSTM(15, activation='relu'),
    Dense(30, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[f1_m, 'accuracy'],
)

In [21]:
# fit model: 20 epochs on the reshaped training windows; batch size and shuffling are
# left at the Keras defaults, and no validation data or class weights are passed.
model2.fit(X_train, y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa318a427f0>

In [22]:
# Loss followed by the compiled metrics (f1_m, accuracy) on the test set
scores = model2.evaluate(X_test, y_test, verbose=0)
# Threshold the sigmoid outputs at 0.5 (same decision as rounding: exactly 0.5 maps to 0)
# and flatten the (n, 1) column to (n,) so it matches y_test.
y_pred = (model2.predict(X_test) > 0.5).astype(int).ravel()

print("### Evaluation on test set ###")
# scikit-learn metrics take (y_true, y_pred) in that order
print("Accuracy: %.2f" % (accuracy_score(y_test, y_pred)))
print("F1 score: %.2f" % (f1_score(y_test, y_pred)))
# Raw (un-normalized) confusion matrix: rows = true class, columns = predicted class
result = confusion_matrix(y_test, y_pred)
print(result)


### Evaluation on test set ###
Accuracy: 1.00
F1 score: 0.93
[[4730    3]
 [  20  147]]
