In [1]:
import numpy as np
import pandas as pd
import io
import re
import tensorflow as tf
import keras
from keras import layers
from keras.preprocessing.sequence import TimeseriesGenerator
import multiprocessing

# Define Parameters

In [36]:
# data
# True: read the CSV files from local_data_path; False: read them from the
# current working directory.
train_local = False
# NOTE(review): absolute, machine-specific path; only used when train_local is True.
local_data_path = "C:\\Users\\wolff\\OneDrive\\Uni\\Fünftes Semester\\Hackathon\\"

# label
# A day is labelled as an extreme event when the station's precipitation value
# is >= this threshold (unit is whatever the CSV uses — TODO confirm).
extreme_event_threshold = 60
# Number of days the label looks ahead: the label stored for day t is the
# extreme-event flag of day t + days.
days = 3

# time series generator
# Fraction of the rows that goes to the training generator; the rest is used for testing.
train_test_split = 0.8
# Passed as `length` to TimeseriesGenerator, i.e. the number of past time steps
# in each input window (despite the name, the look-ahead is set by `days`).
prediction_horizon = 10
sampling_rate = 1
stride = 1
# NOTE(review): larger than the 14944 rows shown for one_hot further down, so each
# generator presumably yields a single batch — confirm this is intended.
batch_size = 15000

# model related
# Number of LSTM units.
units = 128
# (time steps per window, number of input features); 5 matches the five
# single-station series that are stacked into the dataset.
input_shape = (prediction_horizon, 5)

# training
epochs = 100
# Passed to model.fit as class_weight: class 1 (extreme event) counts 6x as much as class 0.
class_weights = {0: 1.0, 1: 6}

# Loading and preprocessing

In [23]:
# Read the five raw measurement tables. The files are looked up under
# local_data_path when train_local is set, otherwise in the current working
# directory (empty prefix).
# (`wind_directoin_NL` keeps its original spelling because later cells use that name.)
(
    precipitation_NL,
    daily_mean_sea_level_pressure_NL,
    mean_temperature_NL,
    wind_directoin_NL,
    wind_speed_NL,
) = [
    pd.read_csv((local_data_path if train_local else "") + csv_name)
    for csv_name in (
        "PrecipitationNL.csv",
        "DailyMeanSeaLevelPressureNL.csv",
        "MeanTemperatureNL.csv",
        "WindDirectionNL.csv",
        "WindSpeedNL.csv",
    )
]

In [25]:
def prepare_CSV(data_frame, start_date, description):
  """Return a cleaned copy of a raw measurement table.

  The input frame is left untouched, so the function can safely be called
  again (also on its own output) when a notebook cell is re-run.

  Args:
    data_frame: Raw table; the first column is replaced by a date column and
      every other column is expected to hold one station's measurements.
    start_date: Date of the first row; rows are assumed to be consecutive days.
    description: Prefix for the station columns, e.g. 'Prec' gives 'Prec_010961'.
  Returns:
    A new DataFrame whose first column is 'Date', whose remaining columns are
    named '<description>_<first digit group of the original name>' (columns
    without any digits keep their name) and in which -9999 is replaced by NaN.
  """
  # work on a copy so the caller's frame is not modified as a side effect
  prepared = data_frame.copy()

  def _station_column(col):
    # the station id is the first run of digits in the column name
    digit_groups = re.findall(r"\d+", str(col))
    return description + '_' + digit_groups[0] if digit_groups else col

  # the first column becomes 'Date' by position instead of relying on its
  # header containing the digit group '0'
  prepared.columns = ['Date'] + [_station_column(col) for col in prepared.columns[1:]]
  # cast all -9999 values to NaN
  prepared = prepared.replace(-9999, np.nan)
  # one row per day, starting at start_date
  prepared['Date'] = pd.date_range(start=start_date, periods=len(prepared), freq='D')

  return prepared

# Clean every raw table with the same start date and its own column prefix.
(
    precipitation_NL,
    daily_mean_sea_level_pressure_NL,
    mean_temperature_NL,
    wind_directoin_NL,
    wind_speed_NL,
) = [
    prepare_CSV(raw_table, '1/1/1980', column_prefix)
    for raw_table, column_prefix in (
        (precipitation_NL, 'Prec'),
        (daily_mean_sea_level_pressure_NL, 'DMSLP'),
        (mean_temperature_NL, 'MT'),
        (wind_directoin_NL, 'WD'),
        (wind_speed_NL, 'WS'),
    )
]

In [27]:
# get data from a single station
precipitation_NL_10961 = precipitation_NL['Prec_010961']
daily_mean_sea_level_pressure_NL_10961 = daily_mean_sea_level_pressure_NL['DMSLP_010961']
mean_temperature_NL_10961 = mean_temperature_NL['MT_010961']
wind_directoin_NL_10961 = wind_directoin_NL['WD_010961']
wind_speed_NL_10961 = wind_speed_NL['WS_010961']
single_station_data = [precipitation_NL_10961, daily_mean_sea_level_pressure_NL_10961, mean_temperature_NL_10961, wind_directoin_NL_10961, wind_speed_NL_10961]

# replace nan's with medium value
single_station_data = [tmp_df.fillna(tmp_df.mean(skipna=True)) for tmp_df in single_station_data]

# normalize
single_station_data = [(tmp_df - tmp_df.min()) / (tmp_df.max() - tmp_df.min()) for tmp_df in single_station_data]

In [29]:
# create the labels
# A day is an extreme event when its precipitation reaches the threshold
# (missing values compare as False). Shifting by -days makes the label stored
# for day t describe day t + days; the last `days` rows have no future value
# and are filled with False. An explicit shift replaces the overlapping slice
# self-assignment, which failed for days == 0 and for series shorter than `days`.
label = (
    (precipitation_NL_10961 >= extreme_event_threshold)
    .shift(-days, fill_value=False)
    .astype(np.int32)
)

# one-hot encoding: label 0 -> [1, 0], label 1 -> [0, 1]
# Indexing an identity matrix avoids pre-filling an integer array with NaN,
# which has no integer representation.
one_hot = np.eye(2, dtype=np.int64)[label.to_numpy()]

In [30]:
# Display the shape of the one-hot label matrix: (number of rows in label, 2).
one_hot.shape

(14944, 2)

# Create data generators

In [31]:
# Stack the normalised series side by side: one row per day, one column per series.
dataset = pd.DataFrame(single_station_data).T.to_numpy()

# Row that separates the training part from the test part.
index = int(train_test_split * len(label))

# Both generators share the window settings and differ only in the row range:
# the training generator stops at `index`, the test generator starts there.
train_generator = TimeseriesGenerator(
    dataset,
    one_hot,
    length=prediction_horizon,
    sampling_rate=sampling_rate,
    stride=stride,
    end_index=index,
    batch_size=batch_size,
)
test_generator = TimeseriesGenerator(
    dataset,
    one_hot,
    length=prediction_horizon,
    sampling_rate=sampling_rate,
    stride=stride,
    start_index=index,
    batch_size=batch_size,
)

# Model

In [32]:
# LSTM that returns its output for every time step, flattened and fed into a
# two-way softmax (one output per one-hot class).
model = keras.Sequential([
    layers.LSTM(units, return_sequences=True, input_shape=input_shape),
    layers.Flatten(),
    layers.Dense(2, activation='softmax'),
])

In [37]:
# NOTE(review): the model ends in a 2-unit softmax trained on one-hot targets;
# categorical_crossentropy is the conventional loss for that setup — confirm
# that binary_crossentropy is intended.
model.compile(
    optimizer="Adam",
    loss='binary_crossentropy',
    metrics=["accuracy", 'binary_crossentropy'],
)

# Train on the training generator only; class_weights up-weights class 1.
# The returned History object is the cell's displayed result.
model.fit(
    train_generator,
    class_weight=class_weights,
    epochs=epochs,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f48e448c048>

In [34]:
def evaluate_test_predictions(targets, predictions):
    """
    Evaluates the test predictions. Prints a confusion matrix and the weighted accuracy.
    Args:
        targets: The target labels from the generator.
        predictions: The predicted output.
    Returns:
        A tupel with the TP, FP, TN, FN and the weighted accuracy.
    """
    # transform to labels
    target_labels = np.argmax(targets, axis=1)
    prediction_labels = np.argmax(predictions, axis=1)
    # calculate confusion matrix
    tp, fp, tn, fn = _calc_confusion_matrix(target_labels, prediction_labels)
    print("\nBankrupt: {}, Non-Bankrupt: {}".format((fp+tn), (tp+fn)))
   
    print("""Confusion matrix of test results:
                              Actual class
                       extreme | non-extreme
Predicted | extreme |    {}    |    {}
class     | non-extreme |    {}    |    {}""".format(tp, fp, fn, tn))
    # calculate weighted accuracy
    weighted_acc = tp / (2 * (tp + fn)) + tn / (2 * (tn + fp))
    print(f"Weighted accuracy: {weighted_acc}")

    return tp, fp, tn, fn, weighted_acc


def _calc_confusion_matrix(target_labels, prediction_labels):
    """
    Calculates the confusion matrix.
    Args:
        target_lables: The target labels.
        prediction_labels: The predicted labels.
    Returns:
        The number of TP, FP, TN, FN.
    """
    true_predictions = prediction_labels[target_labels == prediction_labels]
    tp = len(true_predictions[true_predictions == 1])
    tn = len(true_predictions[true_predictions == 0])
    false_predictions = prediction_labels[target_labels != prediction_labels]
    fn = len(false_predictions[false_predictions == 0])
    fp = len(false_predictions[false_predictions == 1])
    return tp, fp, tn, fn

In [38]:
# Predict every window of the test generator.
predictions = model.predict(test_generator)
# Select the target rows that belong to those windows: the first target sits
# `prediction_horizon` rows (the generator's window length) after the test
# start row, and the generator then advances `stride` rows per sample. Using
# the configured values instead of a literal 10 keeps targets and predictions
# aligned when either setting is changed.
targets = test_generator.targets[index + prediction_horizon::stride]
evaluate_test_predictions(targets, predictions)


Bankrupt: 2548, Non-Bankrupt: 431
Confusion matrix of test results:
                              Actual class
                       extreme | non-extreme
Predicted | extreme |    276    |    1572
class     | non-extreme |    155    |    976
Weighted accuracy: 0.5117083778005224


(276, 1572, 976, 155, 0.5117083778005224)