****Import required packages****

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

****Import Tensorflow****

In [None]:
import tensorflow as tf

# Workaround for a cuDNN issue: turn on memory growth for every visible GPU.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
tf.random.set_seed(42)

****Import Keras****

In [None]:
from tensorflow import keras

In [None]:
data_filename = '../data/ozone.csv'

In [None]:
data = pd.read_csv(data_filename)

In [None]:
data.shape

In [None]:
data_columns = list(data.columns)

In [None]:
print(data_columns)

In [None]:
data.head()

In [None]:
data.tail()

****Find number of records in each class****

In [None]:
# Count the records of each class. value_counts() orders its result by
# frequency, so sort by class label to guarantee that position 0 is the lowest
# label (class 0) and position 1 the next one (class 1) when the counts are
# indexed positionally in the percentage cells below.
target_percentages = data['target'].value_counts().sort_index()

In [None]:
print(target_percentages)

In [None]:
target_percentages = target_percentages.values

In [None]:
target_0_percent = round(target_percentages[0] / np.sum(target_percentages) * 100, 3)

In [None]:
print(target_0_percent)

In [None]:
target_1_percent = round(target_percentages[1] / np.sum(target_percentages) * 100, 3)

In [None]:
print(target_1_percent)

In [None]:
data.describe()

In [None]:
data.dtypes

In [None]:
data.isnull().any()

In [None]:
for missing_flag in data.isnull().any():
    print(missing_flag)

In [None]:
data == '?'

****Mark missing values ('?') with -1****

In [None]:
data = data.replace('?', -1)

In [None]:
data = data.iloc[:,1:].astype('float64')

In [None]:
data.dtypes

In [None]:
data.describe()

****Extract time series and targets from original data****

In [None]:
# Feature matrix: every column of `data` except the first and the last one.
# NOTE(review): the earlier `data.iloc[:, 1:]` cell already removed the leading
# column, so starting at 1 here skips one more column -- confirm this is
# intended and not an accidental drop of the first feature.
# The upper bound excludes the last column; presumably that is 'target' -- verify.
timeseries = data.iloc[:,1:(data.shape[1] - 1)].values

In [None]:
print(type(timeseries))

In [None]:
timeseries.shape

In [None]:
print(timeseries[0:5,-3:-1])

In [None]:
targets_init = data['target'].values

In [None]:
print(type(targets_init))

In [None]:
targets_init.shape

In [None]:
print(targets_init[0:5])

****Mean value imputation****

In [None]:
def impute_data(timeseries):
    """Replace missing entries (encoded as -1) with their column mean.

    The mean of each column is computed from the observed (non ``-1``) entries
    only, so the placeholder itself does not bias the imputed value.

    Args:
        timeseries: 2-D NumPy array of shape (n, d), expected to be floating
            point, in which ``-1`` marks a missing value. Modified in place.

    Returns:
        The same array object, with every ``-1`` replaced by the mean of the
        observed values of its column. A column without any observed value is
        left unchanged because there is nothing to average.
    """
    missing = timeseries == -1
    observed_count = (~missing).sum(axis=0)
    observed_sum = np.where(missing, 0.0, timeseries).sum(axis=0)
    # Guard the division for all-missing columns and keep the sentinel there.
    column_mean = np.where(
        observed_count > 0,
        observed_sum / np.maximum(observed_count, 1),
        -1.0,
    )
    rows, cols = np.nonzero(missing)
    timeseries[rows, cols] = column_mean[cols]
    return timeseries

In [None]:
timeseries = impute_data(timeseries)

In [None]:
print(timeseries[0:5,-3:-1])

****Normalization****

In [None]:
# Per-column mean over the whole series (used to center the data below).
# NOTE(review): this is computed before the train/test split further down, so
# rows that end up in the test set contribute to the normalization statistics.
timeseries_mean = timeseries.mean(axis=0)

In [None]:
print(timeseries_mean)

In [None]:
timeseries_std = timeseries.std(axis=0)

In [None]:
print(timeseries_std)

In [None]:
timeseries -= timeseries_mean

In [None]:
timeseries /= timeseries_std

In [None]:
print(timeseries.mean(axis=0))

In [None]:
print(timeseries.std(axis=0))

****Transform initial data into a batch of sequences****

In [None]:
def create_sequences(timeseries, targets_init, p):
    """Cut a 2-D series into overlapping windows of ``p`` consecutive rows.

    Window ``w`` holds rows ``w .. w + p - 1`` of ``timeseries`` and is paired
    with ``targets_init[w + p - 1]``, i.e. the label of the window's last row.
    ``n - p`` windows are produced for a series of ``n`` rows.

    Args:
        timeseries: 2-D array of shape (n, d).
        targets_init: per-row labels, indexable by row position.
        p: window length in rows.

    Returns:
        Tuple ``(sequences, targets)`` of float32 arrays with shapes
        ``(n - p, p, d)`` and ``(n - p,)``.
    """
    n_rows, n_features = timeseries.shape
    n_windows = n_rows - p
    sequences = np.zeros((n_windows, p, n_features), dtype='float32')
    targets = np.zeros(n_windows, dtype='float32')
    for w in range(n_windows):
        end = w + p
        sequences[w] = timeseries[w:end]
        targets[w] = targets_init[end - 1]
    return sequences, targets

In [None]:
p = 5

In [None]:
sequences, targets = create_sequences(timeseries, targets_init, p)

In [None]:
sequences.shape

In [None]:
targets.shape

****Split data****

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(sequences, targets, test_size=0.1, shuffle=False)

In [None]:
print(X_train.shape)

In [None]:
print(y_train.shape)

In [None]:
print(X_test.shape)

In [None]:
print(y_test.shape)

****Target classes percentages in train data****

In [None]:
unique_train, counts_train = np.unique(y_train, return_counts=True)

In [None]:
print(dict(zip(unique_train, counts_train)))

In [None]:
unique_train

In [None]:
counts_train

In [None]:
target_0_percent_train = round(counts_train[0] / np.sum(counts_train) * 100, 3)

In [None]:
print(target_0_percent_train)

In [None]:
target_1_percent_train = round(counts_train[1] / np.sum(counts_train) * 100, 3)

In [None]:
print(target_1_percent_train)

****Target classes percentages in test data****

In [None]:
unique_test, counts_test = np.unique(y_test, return_counts=True)

In [None]:
print(dict(zip(unique_test, counts_test)))

In [None]:
unique_test

In [None]:
counts_test

In [None]:
target_0_percent_test = round(counts_test[0] / np.sum(counts_test) * 100, 3)

In [None]:
print(target_0_percent_test)

In [None]:
target_1_percent_test = round(counts_test[1] / np.sum(counts_test) * 100, 3)

In [None]:
print(target_1_percent_test)

****Function for plotting training history****

In [None]:
def plot_training_history(history):
    """Plot the training loss (and validation loss, if recorded) per epoch.

    Args:
        history: object whose ``history`` attribute is a dict mapping metric
            names to per-epoch lists. It must contain ``'loss'`` and may
            contain ``'val_loss'`` (e.g. the value returned by ``model.fit``).
    """
    loss = history.history['loss']
    # 'val_loss' is only recorded when fit() was given validation data.
    val_loss = history.history.get('val_loss')
    epochs = range(1, len(loss) + 1)
    plt.figure()
    plt.plot(epochs, loss, label='Training loss')
    if val_loss is not None:
        plt.plot(epochs, val_loss, label='Validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

****Baseline models****

****Constant baseline (always predict class 0)****

In [None]:
def baseline_predict(X):
    """Return the constant baseline: class 0 for every sample in ``X``.

    Args:
        X: array with a ``shape`` attribute; only ``X.shape[0]`` is used.

    Returns:
        NumPy array with ``X.shape[0]`` entries, all equal to 0.
    """
    return np.array([0] * X.shape[0])

In [None]:
baseline_preds = baseline_predict(X_test)

In [None]:
acc = keras.metrics.Accuracy()

In [None]:
acc.update_state(y_true = y_test, y_pred = baseline_preds)

In [None]:
round(acc.result().numpy() * 100, 3)

****Feedforward DNN****

In [None]:
# Feedforward network: flatten each (p, features) window, one hidden ReLU
# layer of 16 units, then a single sigmoid unit for the binary target.
dnn = keras.Sequential([
    keras.layers.Flatten(input_shape=(p, sequences.shape[2])),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
dnn.compile(optimizer='rmsprop', loss=keras.losses.BinaryCrossentropy(), metrics=['acc'])

In [None]:
history_dnn = dnn.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.1)

In [None]:
dnn.summary()

In [None]:
plot_training_history(history_dnn)

In [None]:
dnn_results = dnn.evaluate(X_test, y_test)

In [None]:
print('Feedforward DNN test accuracy (%): ', round(dnn_results[1] * 100, 3))

****Classic RNN****

In [None]:
# Simple recurrent network: 16 SimpleRNN units followed by a sigmoid output.
rnn = keras.Sequential([
    keras.layers.SimpleRNN(16),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
rnn.compile(optimizer='rmsprop', loss=keras.losses.BinaryCrossentropy(), metrics=['acc'])

In [None]:
history_rnn = rnn.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.1)

In [None]:
rnn.summary()

In [None]:
plot_training_history(history_rnn)

In [None]:
rnn_results = rnn.evaluate(X_test, y_test)

In [None]:
print('Simple RNN test accuracy (%): ', round(rnn_results[1] * 100, 3))

****LSTM****

In [None]:
# LSTM network: 16 LSTM units followed by a sigmoid output.
lstm = keras.Sequential([
    keras.layers.LSTM(16),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
lstm.compile(optimizer='rmsprop', loss=keras.losses.BinaryCrossentropy(), metrics=['acc'])

In [None]:
history_lstm = lstm.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.1)

In [None]:
lstm.summary()

In [None]:
plot_training_history(history_lstm)

In [None]:
lstm_results = lstm.evaluate(X_test, y_test)

In [None]:
print('LSTM test accuracy (%): ', round(lstm_results[1] * 100, 3))

****GRU****

In [None]:
# GRU network: 16 GRU units followed by a sigmoid output.
gru = keras.Sequential([
    keras.layers.GRU(16),
    keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
gru.compile(optimizer='rmsprop', loss=keras.losses.BinaryCrossentropy(), metrics=['acc'])

In [None]:
history_gru = gru.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.1)

In [None]:
gru.summary()

In [None]:
plot_training_history(history_gru)

In [None]:
gru_results = gru.evaluate(X_test, y_test)

In [None]:
print('GRU test accuracy (%): ', round(gru_results[1] * 100, 3))