In [None]:
# Create stratified nested cross-validation (CV) datasets.
# Create online alarm flood datasets from the test data.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the preprocessed alarm flood data. Later cells rely on its "flood_id",
# "startTimestamp" and "endTimestamp" columns.
alarm_floods = pd.read_csv('../../data/preprocessed/Crane_alarm_floods.csv')

In [None]:
# Load the class assignments; only the "class" column is used afterwards.
labels = pd.read_csv('../../data/classification/final_classes_v2.csv')

In [None]:
# Keep only the "class" column, rebinding `labels` from a DataFrame to a Series.
# NOTE: this cell is not re-runnable on its own -- a second run would look up
# "class" in the Series itself. Re-run the loading cell first.
labels = labels["class"]

In [None]:
# Nested CV geometry: the number of outer (test) folds and the number of
# inner train/validation splits that are built for each outer fold.
outer_folds, inner_folds = 5, 2

In [None]:
# Merge classes 18 and 19 into the single label -1.
# NOTE(review): why these two classes are merged is not visible here --
# presumably they are outlier/rest classes; confirm.
merged_class_mask = labels.isin([18, 19])
labels[merged_class_mask] = -1

In [None]:
# Containers for the generated splits. X_* hold alarm flood frames, Y_* the
# matching label series; they are filled by the following cells.
X_train, Y_train = [], []
X_validation, Y_validation = [], []
X_test, Y_test = [], []


In [None]:
# Pre-populate the containers with empty placeholders:
#   * one test frame / label series per outer fold,
#   * one train and one validation frame / label series per
#     (outer fold, inner fold) pair, stored flat in outer-major order.
# Every frame gets the columns of `alarm_floods`; every series is int64.
X_test.extend(
    pd.DataFrame(columns=alarm_floods.columns) for _outer in range(outer_folds)
)
Y_test.extend(pd.Series(dtype='int64') for _outer in range(outer_folds))

X_train.extend(
    pd.DataFrame(columns=alarm_floods.columns)
    for _outer in range(outer_folds)
    for _inner in range(inner_folds)
)
Y_train.extend(
    pd.Series(dtype='int64')
    for _outer in range(outer_folds)
    for _inner in range(inner_folds)
)
X_validation.extend(
    pd.DataFrame(columns=alarm_floods.columns)
    for _outer in range(outer_folds)
    for _inner in range(inner_folds)
)
Y_validation.extend(
    pd.Series(dtype='int64')
    for _outer in range(outer_folds)
    for _inner in range(inner_folds)
)
        

In [None]:
# Show the distinct class labels; the outer folds are stratified over these.
labels.unique()

In [None]:
# Create the stratified outer (test) folds.
#
# For every class, up to `max_samples_per_class` index labels are drawn at
# random and cut into `outer_folds` equally sized chunks; chunk i is added to
# test fold i. Index labels left over after the even split are collected in
# `remaining_sample_indices` (one array per class) for later distribution.
max_samples_per_class = 35
# Fixed seed so that re-running the notebook reproduces the same folds.
fold_rng = np.random.default_rng(42)

remaining_sample_indices = []
for label in labels.unique():
    # permutation() returns a shuffled copy, so the array backing the pandas
    # index is never mutated in place (np.random.shuffle would do that).
    label_indices = fold_rng.permutation(labels[labels == label].index.values)
    label_indices = label_indices[:max_samples_per_class]
    outer_fold_size = len(label_indices) // outer_folds
    for i in range(outer_folds):
        outer_fold_indices = label_indices[i * outer_fold_size: (i + 1) * outer_fold_size]
        # Assumes the "flood_id" values coincide with the index labels of
        # `labels` -- TODO confirm.
        fold_floods = alarm_floods[alarm_floods["flood_id"].isin(outer_fold_indices)]
        # DataFrame.append / Series.append were removed in pandas 2.0;
        # pd.concat is the drop-in replacement.
        X_test[i] = pd.concat([X_test[i], fold_floods])
        # The selected values are index labels, so look them up with .loc.
        Y_test[i] = pd.concat([Y_test[i], labels.loc[outer_fold_indices]])
    # Everything beyond the last full chunk did not fit the even split.
    remaining_sample_indices.append(label_indices[outer_folds * outer_fold_size:])
        

In [None]:
# Flatten the per-class leftover arrays into one array; its entries are
# added to the outer folds evenly in the following cell.
# NOTE: `remaining_sample_indices` is rebound from a list of arrays to a single
# array here, so this cell cannot be re-run without re-creating the list.
remaining_sample_indices = np.concatenate(remaining_sample_indices)

In [None]:
# Distribute the leftover samples over the outer folds round-robin:
# leftover 0 goes to fold 0, leftover 1 to fold 1, ... wrapping around after
# the last fold.
for position, sample_index in enumerate(remaining_sample_indices):
    fold_idx = position % outer_folds
    # DataFrame.append / Series.append were removed in pandas 2.0;
    # pd.concat is the drop-in replacement.
    X_test[fold_idx] = pd.concat(
        [X_test[fold_idx], alarm_floods[alarm_floods["flood_id"] == sample_index]]
    )
    # `sample_index` is an index label of `labels`, so look it up with .loc;
    # the list keeps the result a one-element Series.
    Y_test[fold_idx] = pd.concat([Y_test[fold_idx], labels.loc[[sample_index]]])


In [None]:
# Create the inner train/validation sets.
#
# For each outer fold i, the remaining outer folds are split `inner_folds`
# times: a consecutive chunk of them forms the training set and all other
# remaining folds form the validation set. The sets are stored flat in
# X_train / Y_train / X_validation / Y_validation at position
# i * inner_folds + j (tracked by `set_idx`).
#
# NOTE(review): the chunk is used for *training* and the rest for validation.
# With outer_folds=5 and inner_folds=2 both parts contain two folds, but for
# inner_folds > 2 the training set becomes the smaller part, and folds beyond
# inner_folds * chunk_size never appear in any training set -- confirm this
# is intended.
folds = list(range(outer_folds))

set_idx = 0
for i in folds:
    inner_indices = [x for x in folds if x != i]
    chunk_size = len(inner_indices) // inner_folds
    for j in range(inner_folds):
        train_indices = inner_indices[j * chunk_size:(j + 1) * chunk_size]
        valid_indices = [x for x in inner_indices if x not in train_indices]
        # DataFrame.append / Series.append were removed in pandas 2.0; a
        # single pd.concat per set also avoids repeated re-copying.
        X_train[set_idx] = pd.concat([X_train[set_idx]] + [X_test[t] for t in train_indices])
        Y_train[set_idx] = pd.concat([Y_train[set_idx]] + [Y_test[t] for t in train_indices])
        X_validation[set_idx] = pd.concat([X_validation[set_idx]] + [X_test[v] for v in valid_indices])
        Y_validation[set_idx] = pd.concat([Y_validation[set_idx]] + [Y_test[v] for v in valid_indices])
        set_idx += 1



In [None]:
# Inspect the labels of the first outer test fold, ordered by index.
Y_test[0].sort_index()

In [None]:
# Inspect the labels of the first training set, ordered by index.
Y_train[0].sort_index()


In [None]:
# Save the folds.
#
# Every split is sorted by index and written to CSV: one test file pair per
# outer fold and one train/validation file pair per (outer, inner) fold.
folds_dir = '../../data/classification/folds/'
for i in range(outer_folds):
    X_test[i] = X_test[i].sort_index()
    Y_test[i] = Y_test[i].sort_index()
    outer_prefix = folds_dir + 'outer_fold_' + str(i)
    X_test[i].to_csv(outer_prefix + '_test.csv')
    Y_test[i].to_csv(outer_prefix + '_test_labels.csv')
    for j in range(inner_folds):
        # The inner sets are stored flat in outer-major order. The index must
        # be derived from (i, j); a counter that is never advanced would
        # write set 0 into every inner-fold file.
        set_idx = i * inner_folds + j
        X_train[set_idx] = X_train[set_idx].sort_index()
        Y_train[set_idx] = Y_train[set_idx].sort_index()
        X_validation[set_idx] = X_validation[set_idx].sort_index()
        Y_validation[set_idx] = Y_validation[set_idx].sort_index()
        inner_prefix = outer_prefix + '_inner_fold_' + str(j)
        X_train[set_idx].to_csv(inner_prefix + '_train.csv')
        Y_train[set_idx].to_csv(inner_prefix + '_train_labels.csv')
        X_validation[set_idx].to_csv(inner_prefix + '_validation.csv')
        Y_validation[set_idx].to_csv(inner_prefix + '_validation_labels.csv')

In [None]:
# Create online alarm flood datasets.
#
# For every outer test fold and every minute mark from 11 through 20, write a
# snapshot that contains only the alarms which started before that mark, with
# their end timestamps truncated to the mark.
# Assumes timestamps are milliseconds relative to the flood start -- TODO confirm.
# NOTE(review): the file name says "inner_fold" although the outer test folds
# are written, and there is no separator before "min_"; left unchanged because
# downstream readers may depend on these names.
for f in range(outer_folds):
    fold_floods = X_test[f]
    for minute in range(11, 21):
        cutoff = minute * 1000*60
        started_before_cutoff = fold_floods["startTimestamp"] < cutoff
        # Boolean indexing plus copy() yields an independent frame, so the
        # stored test fold is never modified.
        online_flood = fold_floods[started_before_cutoff].copy()
        online_flood["endTimestamp"] = online_flood["endTimestamp"].apply(
            lambda end: min(end, cutoff)
        )
        online_flood.to_csv('../../data/classification/online_floods/online_flood_inner_fold_' + str(f) + 'min_' + str(minute) +'.csv')