In [None]:
# all the mutation and plotting related functions are in the mutation.py file
from mutation import * 
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Model, Sequential
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Dropout

In [None]:
# Directory that holds the UCR anomaly text files (one file per dataset).
# NOTE(review): `os` is not imported explicitly anywhere in this notebook;
# presumably it arrives through `from mutation import *` - confirm, or import it.
folder_path = '/home/fazle/notebook-ws/UCR_TimeSeriesAnomalyDatasets2021/AnomalyDatasets_2021/UCR_TimeSeriesAnomalyDatasets2021/FilesAreInHere/UCR_Anomaly_FullData/'

# os.listdir gives no ordering guarantee; sorting keeps the "dataset N" numbering
# printed by the evaluation loops stable from one run (and machine) to the next.
files = sorted(os.listdir(folder_path))

# Filter only .txt files
txt_files = [file for file in files if file.endswith('.txt')]

In [None]:
# Record-level datasets (is_record=True). Each element is later indexed as a
# tuple: [0] a frame with 'feature' and 'is_anomaly' columns, [1] the index
# where the training portion ends.
dataset_records = load_data(
    txt_files,
    folder_path,
    is_record=True,
    fraction_of_anomaly=0.02,
)

In [None]:
# #dataset for sequence
# dataset_seq = load_data(txt_files, folder_path, is_record = False, fraction_of_anomaly = 0.02)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Custom F1 metric passed to model.compile(metrics=[...])
def f1_score_metric(y_true, y_pred):
    """Compute F1 from rounded predictions, reducing counts over axis 0.

    Args:
        y_true: ground-truth labels (cast to float32 here).
        y_pred: model outputs; they are rounded with tf.round before counting,
            so they are presumably probabilities in [0, 1] - confirm.

    Returns:
        A tensor holding 2 * P * R / (P + R + eps), where the true-positive,
        false-positive and false-negative counts are summed along axis 0.
    """
    truth = tf.cast(y_true, tf.float32)
    hard_pred = tf.cast(tf.round(y_pred), tf.float32)

    def _count(indicator):
        # Sum a 0/1 indicator tensor along the first axis.
        return K.sum(K.cast(indicator, 'float'), axis=0)

    tp = _count(truth * hard_pred)
    fp = _count((1 - truth) * hard_pred)
    fn = _count(truth * (1 - hard_pred))

    # K.epsilon() keeps every denominator non-zero.
    eps = K.epsilon()
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)

    return 2 * precision * recall / (precision + recall + eps)

# Train a point-wise MLP classifier on one dataset and score it on the held-out tail
def train_and_evaluate_mlp(df_tuple):
    """Fit an MLP on the training prefix of one dataset and score the remainder.

    Args:
        df_tuple: indexable where [0] is a frame with 'feature' and
            'is_anomaly' columns and [1] is the index at which the training
            portion ends (rows before it train, rows from it onwards test).

    Returns:
        (precision, recall, f1, accuracy) computed by scikit-learn on the test
        rows, with predictions thresholded at 0.5.
    """
    # Extract features and labels; the single feature becomes a (n, 1) matrix
    X = df_tuple[0]['feature'].values.reshape(-1, 1)
    y = df_tuple[0]['is_anomaly'].values
    last_training_data = df_tuple[1]

    # Chronological split: no shuffling, the prefix trains and the suffix tests
    X_train, X_test = X[:last_training_data], X[last_training_data:]
    y_train, y_test = y[:last_training_data], y[last_training_data:]

    # Funnel-shaped MLP (128 -> 64 -> 32 -> 16 -> 1) with dropout on the wider layers
    model = Sequential()
    model.add(Input(shape=(1,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_score_metric])

    # Train the model; 20% of the training rows are held out for validation
    model.fit(X_train, y_train, epochs=50, verbose=0, batch_size=32, validation_split=0.2)

    y_pred = model.predict(X_test).flatten()
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

    # Use the same zero_division policy for all three scores: without it,
    # recall/F1 emit UndefinedMetricWarning on datasets whose test rows or
    # predictions contain no positives. The resulting values (0.0) are unchanged.
    precision = precision_score(y_test, y_pred, zero_division=0.0)
    recall = recall_score(y_test, y_pred, zero_division=0.0)
    f1 = f1_score(y_test, y_pred, zero_division=0.0)
    accuracy = accuracy_score(y_test, y_pred)

    return precision, recall, f1, accuracy

# Per-dataset metric histories, one entry appended per element of dataset_records
precision_scores, recall_scores, f1_scores, accuracy_scores = [], [], [], []

for i, df in enumerate(dataset_records):
    print(f'Training and evaluating on dataset {i+1}')
    precision, recall, f1, accuracy = train_and_evaluate_mlp(df)
    precision_scores += [precision]
    recall_scores += [recall]
    f1_scores += [f1]
    accuracy_scores += [accuracy]
    print(f'Dataset {i+1} Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}')

# Unweighted (macro) average of every metric across all datasets
average_precision, average_recall = np.mean(precision_scores), np.mean(recall_scores)
average_f1_score, average_accuracy = np.mean(f1_scores), np.mean(accuracy_scores)

print(f'Average Precision: {average_precision:.4f}')
print(f'Average Recall: {average_recall:.4f}')
print(f'Average F1 Score: {average_f1_score:.4f}')
print(f'Average Accuracy: {average_accuracy:.4f}')


# Anomalous sequence detection (semi-supervised)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

def create_lstm_autoencoder(input_shape):
    """Build and compile an LSTM encoder/decoder that reconstructs its input.

    Args:
        input_shape: indexable; element 0 is used as the number of repeats of
            the encoded vector (RepeatVector) and element 1 as the width of
            the per-step Dense output.

    Returns:
        A compiled Sequential model (adam optimizer, MSE loss).
    """
    n_steps = input_shape[0]
    n_features = input_shape[1]

    stack = (
        # Encoder: sequence -> single 32-unit vector
        LSTM(64, activation='relu', return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='relu', return_sequences=False),
        # Bridge: repeat the encoded vector once per output step
        RepeatVector(n_steps),
        # Decoder: mirror of the encoder, emitting a value per step
        LSTM(32, activation='relu', return_sequences=True),
        Dropout(0.2),
        LSTM(64, activation='relu', return_sequences=True),
        Dropout(0.2),
        TimeDistributed(Dense(n_features)),
    )

    autoencoder = Sequential()
    for layer in stack:
        autoencoder.add(layer)

    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder

def fit_lstm_autoencoder(model, train_data, epochs=50, batch_size=32):
    """Train `model` to reproduce `train_data` and return the same model.

    Args:
        model: object exposing a Keras-style `fit`.
        train_data: used as both input and target (autoencoder objective).
        epochs: upper bound on training epochs.
        batch_size: mini-batch size passed to `fit`.

    Returns:
        The `model` argument, after fitting.
    """
    # Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
    stop_on_plateau = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )
    fit_options = dict(
        epochs=epochs,
        verbose=0,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[stop_on_plateau],
        shuffle=False,
    )
    model.fit(train_data, train_data, **fit_options)
    return model

def create_sequences(data, timesteps):
    """Slice `data` into every contiguous window of length `timesteps`.

    Window k holds data[k : k + timesteps]; consecutive windows overlap and
    advance by one element.

    Args:
        data: sliceable sequence with a length.
        timesteps: window length.

    Returns:
        np.ndarray stacking the len(data) - timesteps + 1 windows; an empty
        array when `data` is shorter than `timesteps`.
    """
    n_windows = len(data) - timesteps + 1
    return np.array([data[start:start + timesteps] for start in range(n_windows)])

def is_prediction_correct(prediction, begin, end):
    """Check whether `prediction` lands close enough to the anomaly [begin, end].

    The tolerance on each side is the anomaly length (end - begin + 1) or 100,
    whichever is larger; both bounds are exclusive. The inputs are also
    printed for inspection.

    Returns:
        True when begin - tolerance < prediction < end + tolerance.
    """
    print(f'prediction: {prediction}, begin: {begin}, end: {end}')
    anomaly_length = end - begin + 1
    tolerance = max(anomaly_length, 100)
    lower_bound = begin - tolerance
    upper_bound = end + tolerance
    return lower_bound < prediction < upper_bound

def detect_anomaly(model, test_data, segment_length):
    """Return the center of the run of test windows the model reconstructs worst.

    The mean absolute reconstruction error is computed per row of `test_data`
    (averaged over axis 1), then averaged over every run of `segment_length`
    consecutive rows; the run with the largest average wins.

    Args:
        model: object whose `predict(test_data)` returns an array shaped like
            `test_data`.
        test_data: array of windows; the first axis indexes windows.
        segment_length: number of consecutive windows averaged together. When
            it exceeds the number of windows it is clamped to that number, so
            a short test set still yields one segment instead of failing.

    Returns:
        start index of the worst segment + segment_length // 2, expressed in
        rows of `test_data`; mapping it to sample positions is up to the caller.

    Raises:
        ValueError: if `test_data` produces no reconstruction errors at all.
    """
    predictions = model.predict(test_data)
    reconstruction_errors = np.mean(np.abs(predictions - test_data), axis=1)

    n_windows = len(reconstruction_errors)
    if n_windows == 0:
        raise ValueError('detect_anomaly needs at least one test window')

    # Previously an oversized segment_length left the candidate list empty and
    # np.argmax failed with "attempt to get argmax of an empty sequence".
    segment_length = min(segment_length, n_windows)

    # Calculate average error for each segment
    segment_errors = [
        np.mean(reconstruction_errors[start:start + segment_length])
        for start in range(n_windows - segment_length + 1)
    ]

    # Find the segment with the highest average error
    most_confident_anomalous_segment = np.argmax(segment_errors)
    center_of_anomalous_segment = most_confident_anomalous_segment + segment_length // 2

    return center_of_anomalous_segment

def train_and_evaluate_lstm_autoencoder(df_tuple, timesteps, segment_length):
    """Train an LSTM autoencoder on one dataset and check its anomaly guess.

    Args:
        df_tuple: indexable where [0] is a frame with a 'feature' column,
            [1] the index at which the training portion ends, and [2]/[3] the
            begin/end positions of the anomaly used for scoring.
        timesteps: window length fed to the autoencoder.
        segment_length: number of consecutive windows averaged by
            detect_anomaly.

    Returns:
        The result of is_prediction_correct for the predicted position.
    """
    series = df_tuple[0]['feature'].values
    last_training_data = df_tuple[1]
    anomaly_begin, anomaly_end = df_tuple[2], df_tuple[3]

    def _as_lstm_input(values):
        # Overlapping windows, shaped (n_windows, timesteps, 1) for the LSTM.
        windows = create_sequences(values, timesteps)
        return windows.reshape((windows.shape[0], timesteps, 1))

    # Chronological split: the prefix trains the model, the suffix is searched.
    train_windows = _as_lstm_input(series[:last_training_data])
    test_windows = _as_lstm_input(series[last_training_data:])

    # Create and train the LSTM-Autoencoder
    autoencoder = fit_lstm_autoencoder(create_lstm_autoencoder((timesteps, 1)), train_windows)

    # detect_anomaly answers relative to the test slice; shift back to full-series indexing.
    predicted_center = detect_anomaly(autoencoder, test_windows, segment_length) + last_training_data

    # Evaluate predictions
    correct = is_prediction_correct(predicted_center, anomaly_begin, anomaly_end)
    print(f'last training point: {last_training_data}')
    return correct

# Example number of timesteps and segment length
timesteps = 30
segment_length = 30

# Sequence-level datasets without mutation; each element is consumed by
# train_and_evaluate_lstm_autoencoder as a 4-item tuple.
dataset_seq_wo_mutation = load_data(txt_files, folder_path, is_record=False, fraction_of_anomaly=0.02, mutation=False)
results = []

for i, df in enumerate(dataset_seq_wo_mutation):
    print(f'Training and evaluating on dataset {i + 1}')
    correct_or_not = train_and_evaluate_lstm_autoencoder(df, timesteps, segment_length)
    results.append(correct_or_not)
    print(f'Dataset {i + 1} correctly identified: {correct_or_not}')

# Fraction of datasets whose anomaly was located correctly. With no datasets
# loaded the ratio is undefined, so report NaN instead of raising ZeroDivisionError.
accuracy = sum(1 for item in results if item) / len(results) if results else float('nan')
print(f'Final Accuracy: {accuracy}')
