In [None]:
# all the mutation and plotting related functions are in the mutation.py file
from mutation import * 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Model, Sequential
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import backend as K

In [None]:
# Directory that holds the dataset text files.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
folder_path = '/home/fazle/notebook-ws/UCR_TimeSeriesAnomalyDatasets2021/AnomalyDatasets_2021/UCR_TimeSeriesAnomalyDatasets2021/FilesAreInHere/UCR_Anomaly_FullData/'

# os.listdir returns entries in arbitrary order; sort so that the dataset
# numbering printed by the evaluation loops is reproducible between runs.
files = sorted(os.listdir(folder_path))

# Filter only .txt files
txt_files = [file for file in files if file.endswith('.txt')]

In [None]:
# Custom loss and metrics for my models

# def custom_loss(y_true, y_pred):
#     anomaly_weight = tf.constant(100.0, dtype=tf.float32)  # Set a higher weight for anomalies
#     normal_weight = tf.constant(1.0, dtype=tf.float32)    # Weight for normal points
#     base_loss = tf.keras.losses.BinaryCrossentropy()(y_true, y_pred)

#     weight_vector = tf.where(
#         tf.math.logical_and(y_true == 1, y_pred == 0),
#         anomaly_weight,  # Apply anomaly weight if the true label is 1 and the prediction is 0
#         normal_weight  # Apply normal weight otherwise
#     )

#     # Weighted loss calculation
#     weighted_loss = base_loss * weight_vector

#     # Return the mean of the weighted loss
#     return tf.reduce_mean(weighted_loss)


def custom_specificity(y_true, y_pred, tolerance=100):
    """Tolerance-aware specificity, TN / (TN + FP), for point-wise predictions.

    A positive prediction at index ``i`` is a false positive only when
    ``y_true`` contains no anomaly (value 1) anywhere in the index window
    ``[i - tolerance, i + tolerance]`` (clipped to the array bounds).
    A true negative is an index where both the prediction and the label are 0.
    Prediction values other than 0 and 1 are ignored.

    Args:
        y_true: 1-D sequence of ground-truth labels; must be at least as long
            as ``y_pred``.
        y_pred: 1-D sequence of predicted labels.
        tolerance: Half-width of the window around each positive prediction.
            Values below 100 are raised to 100.

    Returns:
        float: TN / (TN + FP), or 0.0 when there are neither true negatives
        nor false positives (including empty input).

    Raises:
        ValueError: If ``y_true`` is shorter than ``y_pred``.
    """
    # Coerce to arrays so plain Python lists are compared element-wise
    # (``some_list == 1`` would otherwise evaluate to a single False).
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    n_pred = y_pred.shape[0]
    n_true = y_true.shape[0]
    if n_true < n_pred:
        raise ValueError(
            f'y_true has {n_true} labels but y_pred has {n_pred} predictions'
        )

    tolerance = int(max(tolerance, 100))

    # anomaly_csum[k] = number of anomalies in y_true[:k]; the difference of two
    # entries gives the anomaly count of any window without re-slicing y_true.
    anomaly_csum = np.concatenate(([0], np.cumsum(y_true == 1)))
    idx = np.arange(n_pred)
    window_start = np.clip(idx - tolerance, 0, n_true)
    window_stop = np.clip(idx + tolerance + 1, 0, n_true)
    anomaly_nearby = (anomaly_csum[window_stop] - anomaly_csum[window_start]) > 0

    fp = int(np.count_nonzero((y_pred == 1) & ~anomaly_nearby))
    tn = int(np.count_nonzero((y_pred == 0) & (y_true[:n_pred] == 0)))

    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return specificity
    
# Define the custom accuracy function
def is_prediction_correct(prediction, begin, end):
    """Tell whether a predicted index lies close enough to the anomaly [begin, end].

    The accepted range extends the anomaly on both sides by the larger of the
    anomaly length and 100 points; both bounds are exclusive. The inputs are
    echoed to stdout on every call.

    Args:
        prediction: Predicted anomaly index.
        begin: First index of the labelled anomaly.
        end: Last index of the labelled anomaly.

    Returns:
        True when ``prediction`` falls strictly inside the widened range.
    """
    print(f'prediction: {prediction}, begin: {begin}, end: {end}')
    anomaly_length = end - begin + 1
    margin = max(anomaly_length, 100)
    lower_bound = begin - margin
    upper_bound = end + margin
    return lower_bound < prediction and prediction < upper_bound
    

**Multi Layer Perceptron-1**

In [None]:
# Datasets consumed by the MLP-1 evaluation loop, loaded with is_record=True.
dataset_records = load_data(
    txt_files,
    folder_path,
    is_record=True,
    fraction_of_anomaly=0.05,
)

In [None]:
def train_and_evaluate_mlp1(df_tuple, threshold=0.10):
    """Train a point-wise MLP anomaly classifier on one dataset and evaluate it.

    Each sample is a single scaled feature value; the network outputs an
    anomaly probability for that point.

    Args:
        df_tuple: Tuple ``(df, last_training_data, begin_anomaly, end_anomaly)``.
            ``df`` must provide the columns ``'feature'`` and ``'is_anomaly'``;
            rows before index ``last_training_data`` are used for training and
            the remaining rows for testing. ``begin_anomaly`` / ``end_anomaly``
            are the bounds of the labelled test anomaly.
        threshold: Probability above which a test point is flagged as an anomaly
            for the specificity computation.

    Returns:
        tuple: ``(correct_or_not, specificity)`` where ``correct_or_not`` is the
        result of ``is_prediction_correct`` for the highest-scoring test point
        and ``specificity`` comes from ``custom_specificity``.
    """
    # A fresh model is built on every call; release the global Keras state left
    # by earlier models so memory does not grow across the dataset loop.
    K.clear_session()

    # Extract features and labels
    features = df_tuple[0]['feature'].values.reshape(-1, 1)
    y = df_tuple[0]['is_anomaly'].values
    last_training_data = df_tuple[1]
    begin_anomaly = df_tuple[2]
    end_anomaly = df_tuple[3]

    # Fit the scaler on the training portion only: fitting on the full series
    # would leak the test range (including the anomaly) into preprocessing.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(features[:last_training_data])
    X_test = scaler.transform(features[last_training_data:])
    y_train, y_test = y[:last_training_data], y[last_training_data:]

    model = Sequential()
    model.add(Input(shape=(1,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Anomalous points are weighted 20x in the loss
    class_weight = {0: 1.0, 1: 20.0}
    model.compile(optimizer='adam', loss='binary_crossentropy')
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model with early stopping
    model.fit(X_train, y_train, epochs=70, verbose=0, batch_size=32, validation_split=0.2, callbacks=[early_stopping], class_weight=class_weight)

    # verbose=0 keeps prediction as quiet as training
    y_pred = model.predict(X_test, verbose=0).flatten()

    # Test positions are relative to the split point; shift back to series indices
    correct_or_not = is_prediction_correct((np.argmax(y_pred) + last_training_data), begin_anomaly, end_anomaly)
    y_pred = (y_pred > threshold).astype(int)
    specificity = custom_specificity(y_test, y_pred, tolerance=end_anomaly - begin_anomaly)
    return correct_or_not, specificity

# Per-dataset outcomes, filled in by the loop below
accuracy_scores, specificity_scores = [], []

# Each element of dataset_records is the tuple expected by train_and_evaluate_mlp1
for i, df in enumerate(dataset_records):
    print(f'Training and evaluating on dataset {i+1}')
    accuracy, specificity = train_and_evaluate_mlp1(df, threshold = 0.15)
    specificity_scores.append(specificity)
    accuracy_scores.append(accuracy)
    print(f'Dataset {i+1} specificity: {specificity:.4f} Correctly_predicted: {accuracy}')

# Aggregate over all datasets: mean specificity and the share of datasets
# whose anomaly was located correctly
average_specificity = np.mean(specificity_scores)
average_accuracy = accuracy_scores.count(True) / len(accuracy_scores)
print(f'Average specificity: {average_specificity:.4f}')
print(f'Average Accuracy: {average_accuracy:.4f}')

**Multi Layer Perceptron-2**

In [None]:
# Datasets consumed by the MLP-2 evaluation loop, loaded with is_record=False.
dataset_seq = load_data(
    txt_files,
    folder_path,
    is_record=False,
    fraction_of_anomaly=0.05,
)

In [None]:
def create_sequences(data, labels, window_size):
    """Pair every window of ``window_size`` values with the label that follows it.

    Window ``i`` is ``data[i:i + window_size]`` and its target is
    ``labels[i + window_size]``, so ``len(data) - window_size`` pairs are produced.

    Args:
        data: Indexable 1-D sequence of feature values.
        labels: Indexable 1-D sequence of labels aligned with ``data``.
        window_size: Number of consecutive values per window.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays of windows and their targets.
    """
    window_starts = range(len(data) - window_size)
    windows = [data[start:start + window_size] for start in window_starts]
    targets = [labels[start + window_size] for start in window_starts]
    return np.array(windows), np.array(targets)

def train_and_evaluate_mlp2(df_tuple, window_size=20, threshold=0.10):
    """Train a windowed MLP anomaly classifier on one dataset and evaluate it.

    Each sample is the ``window_size`` scaled values preceding a point; the
    network outputs the anomaly probability of that point.

    Args:
        df_tuple: Tuple ``(df, last_training_data, begin_anomaly, end_anomaly)``.
            ``df`` must provide the columns ``'feature'`` and ``'is_anomaly'``;
            points before index ``last_training_data`` are training targets and
            the remaining points are test targets. ``begin_anomaly`` /
            ``end_anomaly`` are the bounds of the labelled test anomaly.
        window_size: Number of preceding values fed to the network per point.
        threshold: Probability above which a test point is flagged as an anomaly
            for the specificity computation.

    Returns:
        tuple: ``(correct_or_not, specificity)`` where ``correct_or_not`` is the
        result of ``is_prediction_correct`` for the highest-scoring test point
        and ``specificity`` comes from ``custom_specificity``.
    """
    # A fresh model is built on every call; release the global Keras state left
    # by earlier models so memory does not grow across the dataset loop.
    K.clear_session()

    # Extract features and labels
    features = df_tuple[0]['feature'].values.reshape(-1, 1)
    y = df_tuple[0]['is_anomaly'].values
    last_training_data = df_tuple[1]
    begin_anomaly = df_tuple[2]
    end_anomaly = df_tuple[3]

    # Fit the scaler on the training portion only: fitting on the full series
    # would leak the test range (including the anomaly) into preprocessing.
    scaler = MinMaxScaler()
    scaler.fit(features[:last_training_data])
    X = scaler.transform(features).flatten()

    # Create sequences.
    # NOTE: this needs the three-argument create_sequences(data, labels, window_size).
    # A later cell redefines create_sequences with two arguments, so re-running
    # this function after that cell has executed will fail.
    X_sequences, y_sequences = create_sequences(X, y, window_size)

    # Sequence k targets point k + window_size, so the split point moves back by window_size
    split = last_training_data - window_size
    X_train, X_test = X_sequences[:split], X_sequences[split:]
    y_train, y_test = y_sequences[:split], y_sequences[split:]

    model = Sequential()
    model.add(Input(shape=(window_size,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Anomalous points are weighted 20x in the loss
    class_weight = {0: 1.0, 1: 20.0}
    model.compile(optimizer='adam', loss='binary_crossentropy')
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model with early stopping
    model.fit(X_train, y_train, epochs=70, verbose=0, batch_size=32, validation_split=0.2, callbacks=[early_stopping], class_weight=class_weight)

    # verbose=0 keeps prediction as quiet as training
    y_pred = model.predict(X_test, verbose=0).flatten()

    # Test position k targets series index last_training_data + k
    correct_or_not = is_prediction_correct((np.argmax(y_pred) + last_training_data), begin_anomaly, end_anomaly)
    y_pred = (y_pred > threshold).astype(int)
    specificity = custom_specificity(y_test, y_pred, tolerance=end_anomaly - begin_anomaly)
    return correct_or_not, specificity

# Usage on a single dataset:
# df_tuple = (df, last_training_data, begin_anomaly, end_anomaly)
# correct_or_not, specificity = train_and_evaluate_mlp2(df_tuple)

# Per-dataset outcomes, filled in by the loop below
accuracy_scores, specificity_scores = [], []

# Each element of dataset_seq is the tuple expected by train_and_evaluate_mlp2
for i, df in enumerate(dataset_seq):
    print(f'Training and evaluating on dataset {i+1}')
    accuracy, specificity = train_and_evaluate_mlp2(df, threshold = 0.15)
    specificity_scores.append(specificity)
    accuracy_scores.append(accuracy)
    print(f'Dataset {i+1} specificity: {specificity:.4f} Correctly_predicted: {accuracy}')

# Aggregate over all datasets: mean specificity and the share of datasets
# whose anomaly was located correctly
average_specificity = np.mean(specificity_scores)
average_accuracy = accuracy_scores.count(True) / len(accuracy_scores)
print(f'Average specificity: {average_specificity:.4f}')
print(f'Average Accuracy: {average_accuracy:.4f}')


# Anomalous sequence detection (semi-supervised)

In [None]:
def create_lstm_autoencoder(input_shape):
    """Build and compile the LSTM autoencoder used for reconstruction-based detection.

    Args:
        input_shape: Pair ``(timesteps, n_features)``; the first entry sets the
            RepeatVector length and the second the width of the output layer.

    Returns:
        A Sequential model compiled with the Adam optimizer and MSE loss.
    """
    n_timesteps = input_shape[0]
    n_features = input_shape[1]

    model = Sequential()

    # Encoder: the second LSTM returns only its final output
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(32, activation='relu', return_sequences=False))

    # Repeat the encoded vector once per timestep for the decoder
    model.add(RepeatVector(n_timesteps))

    # Decoder: mirrors the encoder and emits one vector per timestep
    model.add(LSTM(32, activation='relu', return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(64, activation='relu', return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(n_features)))

    model.compile(optimizer='adam', loss='mse')
    return model

def fit_lstm_autoencoder(model, train_data, epochs=80, batch_size=32):
    """Fit the autoencoder to reconstruct its own input and return the fitted model.

    Args:
        model: Compiled model exposing ``fit``.
        train_data: Training sequences; used as both input and target.
        epochs: Upper bound on training epochs.
        batch_size: Mini-batch size.

    Returns:
        The same ``model`` object after fitting.
    """
    # Stop once val_loss has not improved for 5 epochs and restore the best weights
    stop_on_plateau = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )
    fit_options = dict(
        epochs=epochs,
        verbose=0,
        batch_size=batch_size,
        validation_split=0.2,
        callbacks=[stop_on_plateau],
        shuffle=False,
    )
    model.fit(train_data, train_data, **fit_options)
    return model

def create_sequences(data, timesteps):
    """Slice ``data`` into every contiguous window of ``timesteps`` values.

    NOTE: this two-argument definition replaces the earlier three-argument
    ``create_sequences(data, labels, window_size)`` once this cell has run.

    Args:
        data: Indexable 1-D sequence of values.
        timesteps: Window length.

    Returns:
        NumPy array holding the ``len(data) - timesteps + 1`` overlapping
        windows (an empty array when ``data`` is shorter than ``timesteps``).
    """
    n_windows = len(data) - timesteps + 1
    return np.array([data[start:start + timesteps] for start in range(n_windows)])

def is_prediction_correct_seq(prediction, begin, end):
    """Tell whether a predicted index lies close enough to the anomaly [begin, end].

    The accepted range extends the anomaly on both sides by the larger of the
    anomaly length and 100 points; both bounds are exclusive. The inputs are
    echoed to stdout on every call.

    Args:
        prediction: Predicted anomaly index.
        begin: First index of the labelled anomaly.
        end: Last index of the labelled anomaly.

    Returns:
        True when ``prediction`` falls strictly inside the widened range.
    """
    print(f'prediction: {prediction}, begin: {begin}, end: {end}')
    anomaly_length = end - begin + 1
    margin = max(anomaly_length, 100)
    lower_bound = begin - margin
    upper_bound = end + margin
    return lower_bound < prediction and prediction < upper_bound

def detect_anomaly(model, test_data, segment_length):
    """Locate the run of test sequences that the model reconstructs worst.

    The mean absolute reconstruction error is computed per sequence, averaged
    over every run of ``segment_length`` consecutive sequences, and the run
    with the highest average is selected.

    Args:
        model: Object exposing ``predict(test_data)`` that returns an array
            with the same shape as ``test_data``.
        test_data: Array whose first axis indexes the test sequences.
        segment_length: Number of consecutive sequences averaged per segment.
            When fewer sequences are available, all of them form one segment.

    Returns:
        int: Index (relative to ``test_data``) of the centre of the segment
        with the highest average error.

    Raises:
        ValueError: If ``test_data`` holds no sequences or ``segment_length``
            is smaller than 1.
    """
    predictions = model.predict(test_data)
    abs_errors = np.abs(np.asarray(predictions) - np.asarray(test_data))

    n_sequences = abs_errors.shape[0]
    if n_sequences == 0:
        raise ValueError('detect_anomaly needs at least one test sequence')
    if segment_length < 1:
        raise ValueError('segment_length must be at least 1')

    # One scalar per sequence: mean absolute error over all remaining axes
    sequence_errors = abs_errors.reshape(n_sequences, -1).mean(axis=1)

    # Clamp so a short test set yields one segment instead of an empty argmax
    window = min(int(segment_length), n_sequences)

    # Moving average over `window` consecutive sequences (uniform kernel)
    segment_errors = np.convolve(sequence_errors, np.full(window, 1.0 / window), mode='valid')

    # Find the segment with the highest average error
    most_confident_anomalous_segment = int(np.argmax(segment_errors))
    center_of_anomalous_segment = most_confident_anomalous_segment + window // 2

    return center_of_anomalous_segment

def train_and_evaluate_lstm_autoencoder(df_tuple, timesteps, segment_length):
    """Train an LSTM autoencoder on one dataset and check where it localizes the anomaly.

    The model is fitted on windows from the training portion only (no labels
    are used); the test segment with the highest reconstruction error is taken
    as the predicted anomaly location.

    Args:
        df_tuple: Tuple ``(df, last_training_data, begin_anomaly, end_anomaly)``.
            ``df`` must provide the column ``'feature'``; values before index
            ``last_training_data`` are used for training and the rest for
            testing. ``begin_anomaly`` / ``end_anomaly`` are the bounds of the
            labelled test anomaly.
        timesteps: Window length of each sequence fed to the autoencoder.
        segment_length: Number of consecutive sequences averaged by
            ``detect_anomaly``.

    Returns:
        The result of ``is_prediction_correct_seq`` for the predicted location.
    """
    # A fresh model is built on every call; release the global Keras state left
    # by earlier models so memory does not grow across the dataset loop.
    K.clear_session()

    X = df_tuple[0]['feature'].values
    last_training_data = df_tuple[1]
    begin_anomaly = df_tuple[2]
    end_anomaly = df_tuple[3]

    # Min-max scale with statistics from the training portion only, matching the
    # MLP pipelines, so the relu LSTM layers are not fed raw unbounded values.
    scaler = MinMaxScaler()
    scaler.fit(X[:last_training_data].reshape(-1, 1))
    X = scaler.transform(X.reshape(-1, 1)).flatten()

    train_data, test_data = X[:last_training_data], X[last_training_data:]

    # Create sequences
    train_data = create_sequences(train_data, timesteps)
    test_data = create_sequences(test_data, timesteps)

    # Reshape data for LSTM: (n_sequences, timesteps, 1 feature)
    train_data = train_data.reshape((train_data.shape[0], timesteps, 1))
    test_data = test_data.reshape((test_data.shape[0], timesteps, 1))

    # Create and train the LSTM-Autoencoder
    model = create_lstm_autoencoder((timesteps, 1))
    model = fit_lstm_autoencoder(model, train_data)

    # Detect anomalies in the test set; shift the test-relative position back to a series index
    center_of_anomalous_segment = detect_anomaly(model, test_data, segment_length) + last_training_data

    # Evaluate predictions
    correct = is_prediction_correct_seq(center_of_anomalous_segment, begin_anomaly, end_anomaly)
    return correct

# Window length of each autoencoder input, and number of consecutive windows
# averaged when scoring a segment
timesteps = 32
segment_length = 32

dataset_seq_wo_mutation = load_data(
    txt_files,
    folder_path,
    is_record=False,
    fraction_of_anomaly=0.02,
    mutation_done=False,
)

# One boolean per dataset: was the anomaly located correctly?
results = []

# Each element is the tuple expected by train_and_evaluate_lstm_autoencoder
for i, df in enumerate(dataset_seq_wo_mutation):
    print(f'Training and evaluating on dataset {i + 1}')
    correct_or_not = train_and_evaluate_lstm_autoencoder(df, timesteps, segment_length)
    results.append(correct_or_not)
    print(f'Dataset {i + 1} correctly identified: {correct_or_not}')

# Share of datasets whose anomaly was located correctly
accuracy = results.count(True) / len(results)
print(f'Final Accuracy: {accuracy}')
