In [1]:
import os
import sys
import time
import shutil
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.metrics import f1_score
from tensorflow import keras

import sklearn.preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [2]:
def get_dataset(directory):
    """Load every file in ``directory`` as CSV and stack them into one DataFrame.

    Each file is read with its ``timestamp`` column as the index, rows that
    contain missing values are dropped, and the cleaned frames are
    concatenated with a fresh integer index (the timestamp index is therefore
    not part of the result).

    Args:
        directory: Path of the folder that holds the CSV files.

    Returns:
        pandas.DataFrame: Rows of all files, in sorted file-name order. An
        empty DataFrame is returned when the folder contains no files.
    """
    frames = []
    # os.listdir yields entries in arbitrary order; sorting keeps the row
    # order (and anything derived from it) stable between runs and machines.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            # Skip sub-directories (e.g. notebook checkpoint folders).
            continue
        frame = pd.read_csv(file_path, index_col='timestamp')
        frames.append(frame.dropna())

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [3]:
def convert_time_window(data, timesteps):
    """Slice a 2-D series into overlapping windows of ``timesteps`` rows.

    Window ``i`` contains rows ``i`` to ``i + timesteps - 1`` of ``data``
    (stride 1), so consecutive windows overlap by ``timesteps - 1`` rows.

    Args:
        data: 2-D array-like of shape ``(rows, variables)``.
        timesteps: Number of rows per window; must be at least 1.

    Returns:
        numpy.ndarray: float64 array of shape
        ``(rows - timesteps + 1, timesteps, variables)``.

    Raises:
        ValueError: If ``data`` is not 2-D, ``timesteps`` is smaller than 1,
            or ``data`` has fewer than ``timesteps - 1`` rows.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(f"data must be 2-D, got {data.ndim} dimension(s)")
    if timesteps < 1:
        raise ValueError(f"timesteps must be >= 1, got {timesteps}")

    samples = data.shape[0] - timesteps + 1  # Number of windows in the 3D array
    variables = data.shape[1]  # Number of variables (features)
    if samples < 0:
        raise ValueError(
            f"data has {data.shape[0]} rows, too few for windows of {timesteps} timesteps"
        )

    data_3d = np.zeros((samples, timesteps, variables))

    # Fill one time offset per iteration: the loop runs `timesteps` times
    # instead of once per window, and each step is a single block copy.
    for offset in range(timesteps):
        data_3d[:, offset, :] = data[offset:offset + samples]
    return data_3d

In [4]:
def inverse_convert_time_window(data_3d):
    """Collapse overlapping windows back into a 2-D series by averaging.

    Window ``i`` is added onto rows ``i`` to ``i + timesteps - 1`` of the
    output, and every output row is divided by the number of windows that
    covered it.

    Args:
        data_3d: Array of shape ``(samples, timesteps, variables)``.

    Returns:
        numpy.ndarray: Array of shape ``(samples + timesteps - 1, variables)``.
    """
    n_windows, window_len, n_features = data_3d.shape
    n_rows = n_windows + window_len - 1

    totals = np.zeros((n_rows, n_features))
    hits = np.zeros((n_rows, n_features))

    for start, window in enumerate(data_3d):
        stop = start + window_len
        totals[start:stop] += window
        hits[start:stop] += 1

    # Element-wise mean over all windows that touched each row.
    totals /= hits
    return totals

In [5]:
def calculate_reconstruction_loss(data, reconstructions):
    """Return the mean absolute error between inputs and reconstructions.

    The absolute element-wise difference is averaged along axis 1.

    Args:
        data: Original values.
        reconstructions: Reconstructed values, same shape as ``data``.

    Returns:
        The absolute differences reduced by the mean over axis 1.
    """
    absolute_error = np.absolute(data - reconstructions)
    return np.mean(absolute_error, axis=1)

In [6]:
def find_optimal_threshold(reconstruction_errors, y_true, thresholds=None):
    """Grid-search the error threshold that maximises the binary F1 score.

    A sample is predicted positive when its reconstruction error is strictly
    greater than the candidate threshold.

    Args:
        reconstruction_errors: Per-sample error values.
        y_true: Ground-truth labels; converted with ``astype(bool)``.
        thresholds: Optional iterable of candidate thresholds. Defaults to
            1000 evenly spaced values in ``[0.0, 1.0]``. Pass a custom grid
            when the errors can exceed 1.0, otherwise the best cut-off may
            lie outside the searched range.

    Returns:
        tuple: ``(best_threshold, best_f1)``. When several candidates tie,
        the first one encountered wins; when no candidate achieves an F1
        above zero, ``(0.0, 0.0)`` is returned.
    """
    if thresholds is None:
        thresholds = np.linspace(0.0, 1.0, num=1000)

    # The label conversion does not depend on the threshold; do it once.
    y_true_bool = y_true.astype(bool)

    best_threshold = 0.0
    best_f1 = 0.0

    for threshold in thresholds:
        y_pred = (reconstruction_errors > threshold).astype(bool)

        current_f1 = f1_score(y_true_bool, y_pred)

        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold

    return best_threshold, best_f1

In [7]:
# Load and stack the training CSV files of split 2.
merged_df = get_dataset('dataset_processed/splitted/2/train')

# Separate the label column from the feature matrix.
merged_data_class = merged_df['class']
merged_data_np = merged_df.drop(columns=['class']).values

In [8]:
# Hyper-parameters derived from / used with the loaded training data.
latent_dim = 32                        # size of the latent vector
time_steps = 10                        # rows per sliding window
features = merged_data_np.shape[-1]    # one feature per remaining column

In [9]:
# Fit the standardiser on the training features and apply it to them.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(merged_data_np)

merged_data_np_scaled = scaler.transform(merged_data_np)

# Persist the fitted scaler; the context manager guarantees the file handle
# is flushed and closed even if pickling fails.
with open('StandardScaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [10]:
# Cut the scaled series into overlapping windows of `time_steps` rows.
merged_data_np_3d = convert_time_window(
    data=merged_data_np_scaled,
    timesteps=time_steps,
)

In [11]:
# Sanity check: the windowed array is laid out as (windows, time_steps, features).
merged_data_np_3d.shape

(536753, 10, 3)

In [None]:
# Constants for the GAN cells.
# NOTE: these rebind `time_steps`, `features` and `latent_dim`, replacing the
# values assigned earlier in the notebook for the loaded dataset.
latent_dim = 64    # Dimension of latent space
features = 10      # Number of features in each timestep
time_steps = 100   # Number of timesteps in input data

# Encoder function
def build_encoder(input_shape, latent_dim):
    """Build the LSTM encoder that maps a window to a latent vector.

    Layer stack: LSTM(128, returning sequences) -> LSTM(64) -> Dense(64) ->
    BatchNormalization -> LeakyReLU -> Dense(latent_dim, relu).

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        latent_dim: Width of the final dense layer.

    Returns:
        keras Model named ``"Encoder"``.
    """
    inputs = keras.layers.Input(shape=input_shape)

    # Layers are listed in the order they are applied.
    stack = (
        keras.layers.LSTM(128, return_sequences=True),
        keras.layers.LSTM(64),
        keras.layers.Dense(64),
        keras.layers.BatchNormalization(),
        keras.layers.LeakyReLU(),
        keras.layers.Dense(latent_dim, activation='relu'),  # latent representation
    )

    latent = inputs
    for layer in stack:
        latent = layer(latent)

    return keras.models.Model(inputs, latent, name="Encoder")

# Decoder function
def build_decoder(latent_dim, output_shape):
    """Build the dense decoder that maps a latent vector back to a window.

    Layer stack: Dense(64) -> BatchNormalization -> LeakyReLU ->
    Dense(output_shape[0] * output_shape[1]) -> Reshape(output_shape).

    Args:
        latent_dim: Length of the latent input vector.
        output_shape: Target shape of one reconstructed sample; its first two
            entries determine the width of the last dense layer.

    Returns:
        keras Model named ``"Decoder"``.
    """
    flat_units = output_shape[0] * output_shape[1]

    latent_in = keras.layers.Input(shape=(latent_dim,))

    hidden = keras.layers.Dense(64)(latent_in)
    hidden = keras.layers.BatchNormalization()(hidden)
    hidden = keras.layers.LeakyReLU()(hidden)

    # Project to a flat vector, then fold it back into the sample shape.
    flat = keras.layers.Dense(flat_units)(hidden)
    reconstruction = keras.layers.Reshape(output_shape)(flat)

    return keras.models.Model(latent_in, reconstruction, name="Decoder")


In [None]:
# Generator (Encoder + Latent Space + Decoder)
def build_generator(input_shape, latent_dim):
    """Chain a fresh encoder and decoder into an autoencoder-style generator.

    Args:
        input_shape: Shape of one input sample; also the decoder's output shape.
        latent_dim: Size of the latent vector between encoder and decoder.

    Returns:
        keras Model named ``"Generator"`` whose output has the input's shape.
        The encoder and decoder are nested inside it as sub-models.
    """
    encoder = build_encoder(input_shape, latent_dim)
    decoder = build_decoder(latent_dim, input_shape)

    window_in = keras.layers.Input(shape=input_shape)
    # Encode to the latent space and decode straight back.
    reconstruction = decoder(encoder(window_in))

    return keras.models.Model(window_in, reconstruction, name="Generator")

# Discriminator (Encoder + Dense Layer)
def build_discriminator(input_shape, latent_dim):
    """Build the real-vs-generated classifier (encoder + one dense unit).

    The model returns a raw, un-activated logit. Both loss functions in this
    notebook create ``BinaryCrossentropy(from_logits=True)``, so the final
    layer must not apply a sigmoid itself; pass the output through
    ``tf.sigmoid`` when a probability is needed.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        latent_dim: Size of the encoder output fed to the classification unit.

    Returns:
        keras Model named ``"Discriminator"`` with output shape ``(batch, 1)``.
    """
    encoder = build_encoder(input_shape, latent_dim)

    inputs = keras.layers.Input(shape=input_shape)

    # Use encoder and add classification layer (linear: emits a logit)
    latent = encoder(inputs)
    outputs = keras.layers.Dense(1)(latent)

    return keras.models.Model(inputs, outputs, name="Discriminator")


In [None]:
# Define loss functions
def generator_loss(fake_output, reconstructed_input, real_latent, reconstructed_latent, lambdas,
                   real_input=None):
    """Weighted sum of reconstruction, latent and adversarial generator losses.

    Args:
        fake_output: Discriminator output for generated samples; treated as
            logits (the cross-entropy is created with ``from_logits=True``).
        reconstructed_input: Generator output in input space.
        real_latent: Latent code of the real input.
        reconstructed_latent: Latent code of the generated sample.
        lambdas: Three weights, applied in order to the input-space L1 term,
            the latent L2 term and the adversarial term.
        real_input: Optional real input batch used as the target of the
            input-space L1 term. When omitted, the term falls back to the
            previous behaviour of comparing ``reconstructed_input`` with
            ``real_latent``.

    Returns:
        Scalar tensor holding the combined loss.
    """
    bce_loss = keras.losses.BinaryCrossentropy(from_logits=True)

    # Reconstruction loss (input space, L1 loss). The target should be the
    # real input; `real_latent` is only used when no real input is supplied.
    lx_target = real_latent if real_input is None else real_input
    lx = tf.reduce_mean(tf.abs(reconstructed_input - lx_target))

    # Latent space reconstruction loss (L2 loss) between the two latent vectors
    lz = tf.reduce_mean(tf.square(reconstructed_latent - real_latent))

    # Adversarial loss: the generator wants fakes classified as real (ones)
    adversarial_loss = bce_loss(tf.ones_like(fake_output), fake_output)

    # Combine losses
    loss = lambdas[0] * lx + lambdas[1] * lz + lambdas[2] * adversarial_loss
    return loss

def discriminator_loss(real_output, fake_output):
    """Binary cross-entropy of the discriminator on real and generated batches.

    Real samples are scored against a target of ones and generated samples
    against zeros; both outputs are treated as logits (``from_logits=True``).

    Args:
        real_output: Discriminator output for real samples.
        fake_output: Discriminator output for generated samples.

    Returns:
        Sum of the two cross-entropy terms.
    """
    criterion = keras.losses.BinaryCrossentropy(from_logits=True)

    loss_on_real = criterion(tf.ones_like(real_output), real_output)
    loss_on_fake = criterion(tf.zeros_like(fake_output), fake_output)

    return loss_on_real + loss_on_fake


In [None]:
# Build and compile the models
input_shape = (time_steps, features)
latent_dim = 64

# Networks (construction order kept: generator, second encoder, discriminator)
generator = build_generator(input_shape=input_shape, latent_dim=latent_dim)
encoder2 = build_encoder(input_shape=input_shape, latent_dim=latent_dim)  # re-encodes generated samples
discriminator = build_discriminator(input_shape=input_shape, latent_dim=latent_dim)

# One Adam optimizer per network being trained
gen_optimizer = keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
disc_optimizer = keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)

# Symbolic wiring of the networks on a placeholder input.
# NOTE(review): train_step defines its own locals with these names, so these
# module-level tensors appear to be unused by it - confirm before relying on them.
real_data = keras.layers.Input(shape=input_shape)
fake_data = generator(real_data)

real_output = discriminator(real_data)
fake_output = discriminator(fake_data)


In [None]:

# Custom training loop
@tf.function
def train_step(real_data):
    """Run one adversarial update of the generator, ``encoder2`` and discriminator.

    Uses the module-level ``generator``, ``encoder2``, ``discriminator``,
    ``gen_optimizer``, ``disc_optimizer`` and ``discriminator_loss``.

    The generator objective is ``0.1 * lx + 0.1 * lz + 0.8 * adversarial``:

    * ``lx``  - L1 distance between the reconstruction and the real input,
    * ``lz``  - L2 distance between the latent code of the real input and the
      latent code ``encoder2`` produces for the reconstruction,
    * ``adversarial`` - cross-entropy of the discriminator logits on the
      reconstruction against a target of ones.

    The terms are assembled here because ``lx`` and ``lz`` need two
    differently shaped targets (the real input and the real latent code).

    Args:
        real_data: Batch of real windows, shape ``(batch, time_steps, features)``.

    Returns:
        tuple: ``(gen_loss, disc_loss)`` scalar tensors for this batch.
    """
    # The generator nests its encoder/decoder as sub-models with these names.
    encoder1 = generator.get_layer("Encoder")
    decoder = generator.get_layer("Decoder")

    # encoder2 is optimised together with the generator; otherwise it would
    # keep its random initial weights.
    gen_variables = generator.trainable_variables + encoder2.trainable_variables
    disc_variables = discriminator.trainable_variables

    bce_loss = keras.losses.BinaryCrossentropy(from_logits=True)

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        # Encode once and decode that code, so the latent vector and the
        # reconstruction come from the same forward pass.
        real_latent = encoder1(real_data, training=True)
        fake_data = decoder(real_latent, training=True)

        # Discriminator outputs
        real_output = discriminator(real_data, training=True)
        fake_output = discriminator(fake_data, training=True)

        # Latent space reconstruction using Encoder2
        reconstructed_latent = encoder2(fake_data, training=True)

        # Inputs may arrive as float64 while the model emits its compute
        # dtype; align them before subtracting.
        real_input = tf.cast(real_data, fake_data.dtype)

        # Calculate losses
        lx = tf.reduce_mean(tf.abs(fake_data - real_input))
        lz = tf.reduce_mean(tf.square(reconstructed_latent - real_latent))
        adversarial_loss = bce_loss(tf.ones_like(fake_output), fake_output)
        gen_loss = 0.1 * lx + 0.1 * lz + 0.8 * adversarial_loss
        disc_loss = discriminator_loss(real_output, fake_output)

    # Update gradients (two single-use tapes, released automatically)
    gradients_of_generator = gen_tape.gradient(gen_loss, gen_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, disc_variables)

    gen_optimizer.apply_gradients(zip(gradients_of_generator, gen_variables))
    disc_optimizer.apply_gradients(zip(gradients_of_discriminator, disc_variables))

    return gen_loss, disc_loss


In [None]:

# Training loop
def train(dataset, epochs):
    """Train for ``epochs`` passes over ``dataset`` and report mean losses.

    Args:
        dataset: Iterable of batches; each batch is passed to ``train_step``.
        epochs: Number of passes over ``dataset``.

    Returns:
        list: One ``(mean_gen_loss, mean_disc_loss)`` tuple of floats per epoch.

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    history = []
    for epoch in range(epochs):
        gen_total = 0.0
        disc_total = 0.0
        n_batches = 0

        for real_data in dataset:
            gen_loss, disc_loss = train_step(real_data)
            gen_total += float(gen_loss)
            disc_total += float(disc_loss)
            n_batches += 1

        if n_batches == 0:
            # Without this guard the report below would fail on unbound names.
            raise ValueError("dataset yielded no batches; nothing to train on")

        # Report the epoch average rather than only the last batch's loss.
        gen_loss = gen_total / n_batches
        disc_loss = disc_total / n_batches
        history.append((gen_loss, disc_loss))

        print(f'Epoch {epoch + 1}, Gen Loss: {gen_loss}, Disc Loss: {disc_loss}')

    return history

# Anomaly score and labeling
def compute_anomaly_score(real_data):
    """Score each sample by the distance between its two latent codes.

    The real input is encoded by the generator's own encoder, the generator's
    reconstruction is encoded by ``encoder2``, and the score is the mean
    squared difference between the two latent vectors.

    Uses the module-level ``generator`` and ``encoder2``.

    Args:
        real_data: Array of windows, shape ``(samples, time_steps, features)``.

    Returns:
        numpy.ndarray: One score per sample, shape ``(samples,)``.
    """
    # The generator nests its encoder as a sub-model named "Encoder"; calling
    # the full generator would return a reconstruction, not a latent code.
    encoder1 = generator.get_layer("Encoder")

    fake_data = generator.predict(real_data)

    # Latent space reconstruction loss using Encoder2
    latent_real = encoder1.predict(real_data)
    latent_fake = encoder2.predict(fake_data)

    anomaly_score = np.mean(np.square(latent_real - latent_fake), axis=1)

    return anomaly_score

def label_data(anomaly_score, thresholds):
    """Bucket anomaly scores into four severity levels.

    Levels: 0 "Good" (score < t0), 1 "Watching" (t0 <= score < t1),
    2 "Warning" (t1 <= score < t2), 3 "Fault" (score >= t2), where
    ``t0, t1, t2`` are the first three entries of ``thresholds``.

    Args:
        anomaly_score: Array of scores.
        thresholds: Sequence holding at least three cut-off values.

    Returns:
        numpy.ndarray: Labels with the same shape and dtype as ``anomaly_score``.
    """
    low, mid, high = thresholds[0], thresholds[1], thresholds[2]

    # (mask, label) pairs, applied in order so later bands win on overlap.
    bands = (
        (anomaly_score < low, 0),                                # "Good"
        ((anomaly_score >= low) & (anomaly_score < mid), 1),     # "Watching"
        ((anomaly_score >= mid) & (anomaly_score < high), 2),    # "Warning"
        (anomaly_score >= high, 3),                              # "Fault"
    )

    labels = np.zeros_like(anomaly_score)
    for mask, severity in bands:
        labels[mask] = severity
    return labels

# Model Evaluation
def evaluate_model(real_data, true_labels, thresholds):
    """Print weighted precision, recall, F1 and the confusion matrix.

    Scores are computed with ``compute_anomaly_score``, converted to severity
    labels with ``label_data`` and compared against ``true_labels``.

    Args:
        real_data: Samples to score.
        true_labels: Ground-truth severity label per sample.
        thresholds: Cut-offs forwarded to ``label_data``.
    """
    predicted_labels = label_data(compute_anomaly_score(real_data), thresholds)

    # Same weighted averaging for all three scalar metrics.
    precision, recall, f1 = (
        metric(true_labels, predicted_labels, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )
    cm = confusion_matrix(true_labels, predicted_labels)

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)


In [None]:

# Sample dataset and training example
# Synthetic stand-in data. A seeded generator makes the samples and labels
# repeatable between runs (network weight initialisation is not seeded here).
rng = np.random.default_rng(42)
n_samples = 1000

train_data = rng.standard_normal((n_samples, time_steps, features)).astype(np.float32)
train_dataset = tf.data.Dataset.from_tensor_slices(train_data).batch(32)

# Train the model
train(train_dataset, epochs=50)

# Evaluate the model (replace `true_labels` with actual labels)
true_labels = rng.integers(0, 4, size=n_samples)
evaluate_model(train_data, true_labels, thresholds=[0.2, 0.5, 0.7])


In [12]:
# # Generator (Autoencoder with Encoder1 and Decoder)
# def build_generator():
#     inputs = keras.layers.Input(shape=(time_steps, features))
    
#     # Encoder 1 (LSTM layers + Fully Connected Layers)
#     x = keras.layers.LSTM(128, return_sequences=True)(inputs)
#     x = keras.layers.LSTM(64)(x)
#     x = keras.layers.Dense(64)(x)
#     x = keras.layers.BatchNormalization()(x)
#     x = keras.layers.LeakyReLU()(x)
    
#     # Latent representation (z)
#     latent = keras.layers.Dense(latent_dim, activation='relu')(x)

#     # Decoder (Symmetrical to Encoder 1)
#     x = keras.layers.Dense(64)(latent)
#     x = keras.layers.BatchNormalization()(x)
#     x = keras.layers.LeakyReLU()(x)
#     x = keras.layers.Dense(time_steps * features)(x)
#     outputs = keras.layers.Reshape((time_steps, features))(x)
    
#     model = keras.models.Model(inputs, outputs, name="Generator")
#     return model


# # Discriminator (used to classify real vs fake)
# def build_discriminator():
#     inputs = keras.layers.Input(shape=(time_steps, features))
    
#     x = keras.layers.LSTM(128, return_sequences=True)(inputs)
#     x = keras.layers.LSTM(64)(x)
#     x = keras.layers.Dense(64)(x)
#     x = keras.layers.BatchNormalization()(x)
#     x = keras.layers.LeakyReLU()(x)
    
#     # Flatten and classify real vs fake
#     x = keras.layers.Flatten()(x)
#     x = keras.layers.Dense(1, activation='sigmoid')(x)
    
#     model = keras.models.Model(inputs, x, name="Discriminator")
#     return model

In [13]:
# # Define loss functions
# def generator_loss(fake_output, reconstructed_input, real_latent, reconstructed_latent, lambdas):
#     bce_loss = keras.losses.BinaryCrossentropy(from_logits=True)
    
#     # Reconstruction loss (input space, L1 loss)
#     lx = tf.reduce_mean(tf.abs(reconstructed_input - real_latent))
    
#     # Latent space reconstruction loss (L2 loss)
#     lz = tf.reduce_mean(tf.square(reconstructed_latent - real_latent))
    
#     # Adversarial loss (Binary Crossentropy)
#     adversarial_loss = bce_loss(tf.ones_like(fake_output), fake_output)
    
#     # Combine losses
#     loss = lambdas[0] * lx + lambdas[1] * lz + lambdas[2] * adversarial_loss
#     return loss


# def discriminator_loss(real_output, fake_output):
#     bce_loss = keras.losses.BinaryCrossentropy(from_logits=True)
    
#     # Binary crossentropy loss for real and fake samples
#     real_loss = bce_loss(tf.ones_like(real_output), real_output)
#     fake_loss = bce_loss(tf.zeros_like(fake_output), fake_output)
    
#     # Total discriminator loss
#     return real_loss + fake_loss

In [5]:
# # Build and compile the models
# generator = build_generator()
# discriminator = build_discriminator()

# # Optimizers
# gen_optimizer = keras.optimizers.Adam(0.0002, beta_1=0.5)
# disc_optimizer = keras.optimizers.Adam(0.0002, beta_1=0.5)

# # Placeholders for real and fake inputs
# real_data = keras.layers.Input(shape=(time_steps, features))
# fake_data = generator(real_data)

# # Discriminator training
# real_output = discriminator(real_data)
# fake_output = discriminator(fake_data)

In [6]:
# @tf.function
# def train_step(real_data):
#     # Latent vectors for reconstruction
#     real_latent = generator(real_data)
    
#     with tf.GradientTape(persistent=True) as tape:
#         # Generate fake data
#         fake_data = generator(real_data)
        
#         # Discriminator outputs
#         real_output = discriminator(real_data)
#         fake_output = discriminator(fake_data)
        
#         # Latent space reconstruction
#         reconstructed_latent = generator(fake_data)
        
#         # Calculate losses
#         gen_loss = generator_loss(fake_output, fake_data, real_latent, reconstructed_latent, lambdas=[0.1, 0.1, 0.8])
#         disc_loss = discriminator_loss(real_output, fake_output)
    
#     # Update gradients
#     gradients_of_generator = tape.gradient(gen_loss, generator.trainable_variables)
#     gradients_of_discriminator = tape.gradient(disc_loss, discriminator.trainable_variables)
    
#     gen_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
#     disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    
#     return gen_loss, disc_loss

In [7]:
# # Training loop
# def train(dataset, epochs):
#     for epoch in range(epochs):
#         for real_data in dataset:
#             gen_loss, disc_loss = train_step(real_data)
        
#         print(f'Epoch {epoch + 1}, Gen Loss: {gen_loss}, Disc Loss: {disc_loss}')

# # Anomaly score and labeling
# def compute_anomaly_score(real_data):
#     fake_data = generator.predict(real_data)
    
#     # Latent space reconstruction loss
#     latent_real = generator(real_data)
#     latent_fake = generator(fake_data)
    
#     anomaly_score = np.mean(np.square(latent_real - latent_fake), axis=1)
    
#     return anomaly_score

# def label_data(anomaly_score, thresholds):
#     labels = np.zeros_like(anomaly_score)
#     labels[anomaly_score < thresholds[0]] = 0  # "Good"
#     labels[(anomaly_score >= thresholds[0]) & (anomaly_score < thresholds[1])] = 1  # "Watching"
#     labels[(anomaly_score >= thresholds[1]) & (anomaly_score < thresholds[2])] = 2  # "Warning"
#     labels[anomaly_score >= thresholds[2]] = 3  # "Fault"
#     return labels

# # Model Evaluation
# def evaluate_model(real_data, true_labels, thresholds):
#     # Compute anomaly scores
#     anomaly_scores = compute_anomaly_score(real_data)
    
#     # Label data based on thresholds
#     predicted_labels = label_data(anomaly_scores, thresholds)
    
#     # Calculate evaluation metrics
#     precision = precision_score(true_labels, predicted_labels, average='weighted')
#     recall = recall_score(true_labels, predicted_labels, average='weighted')
#     f1 = f1_score(true_labels, predicted_labels, average='weighted')
#     cm = confusion_matrix(true_labels, predicted_labels)
    
#     print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
#     print("Confusion Matrix:")
#     print(cm)

In [8]:
# # Sample dataset and training example
# train_data = np.random.randn(1000, time_steps, features)
# train_dataset = tf.data.Dataset.from_tensor_slices(train_data).batch(32)

In [9]:
# # Train the model
# train(train_dataset, epochs=50)


  output, from_logits = _get_logits(


Epoch 1, Gen Loss: 0.5941934585571289, Disc Loss: 1.27066969871521
Epoch 2, Gen Loss: 2.3630566596984863, Disc Loss: 1.4189093112945557
Epoch 3, Gen Loss: 1.8198363780975342, Disc Loss: 1.0383702516555786
Epoch 4, Gen Loss: 1.8360899686813354, Disc Loss: 0.5423786044120789
Epoch 5, Gen Loss: 2.1641626358032227, Disc Loss: 0.2957163453102112
Epoch 6, Gen Loss: 2.6850733757019043, Disc Loss: 0.15120694041252136
Epoch 7, Gen Loss: 0.7025512456893921, Disc Loss: 1.4290196895599365
Epoch 8, Gen Loss: 1.349381446838379, Disc Loss: 0.2759851813316345
Epoch 9, Gen Loss: 0.18524624407291412, Disc Loss: 1.8066127300262451
Epoch 10, Gen Loss: 0.5883899331092834, Disc Loss: 1.359813928604126
Epoch 11, Gen Loss: 0.6414827704429626, Disc Loss: 1.188891887664795
Epoch 12, Gen Loss: 0.6848341226577759, Disc Loss: 1.198060393333435
Epoch 13, Gen Loss: 0.6785936951637268, Disc Loss: 1.2069724798202515
Epoch 14, Gen Loss: 0.70448899269104, Disc Loss: 1.21113121509552
Epoch 15, Gen Loss: 0.778784155845642

In [13]:
# fake_data = generator.predict(train_data)

# # Latent space reconstruction loss
# latent_real = generator(train_data)
# latent_fake = generator(fake_data)

# anomaly_score = np.mean(np.square(latent_real - latent_fake), axis=1)




In [None]:

# # Evaluate the model (replace `true_labels` with actual labels)
# true_labels = np.random.randint(0, 4, size=1000)
# evaluate_model(train_data, true_labels, thresholds=[0.2, 0.5, 0.7])