# Creating an Autoencoder 

## Setting up dependencies + loading the data from csv files 

In [7]:
# Imports 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import joblib
import warnings
import tensorflow as tf
warnings.filterwarnings('ignore')

# Path to the raw training CSV, relative to this notebook.
file_path = '../data/raw/KDDTrain+.csv'

# read_csv already returns a DataFrame; `data` is kept as the untouched raw frame.
data = pd.read_csv(file_path)

# Work on an explicit copy so that the header clean-up below can never
# propagate back into `data` (pd.DataFrame(data) does not guarantee that).
df = data.copy()

# Strip any single quotes surrounding the column names.
df.columns = df.columns.str.strip("'")

## Setting up a feature scaler, so that differences in numerical feature scale aren't a problem (it is fitted later, on the training split only)

In [8]:
from sklearn.preprocessing import StandardScaler

# Scaler instance only: nothing is fitted in this cell. The actual fit happens
# further down, after the split, on the training rows.
scaler = StandardScaler()

# View of df restricted to its float64 / int64 columns.
numerical_features = df.select_dtypes(include=['float64', 'int64'])

# List every column name of df, one per line.
for column_name in df.columns.tolist():
    print(column_name)


duration
protocol_type
service
flag
src_bytes
dst_bytes
land
wrong_fragment
urgent
hot
num_failed_logins
logged_in
num_compromised
root_shell
su_attempted
num_root
num_file_creations
num_shells
num_access_files
num_outbound_cmds
is_host_login
is_guest_login
count
srv_count
serror_rate
srv_serror_rate
rerror_rate
srv_rerror_rate
same_srv_rate
diff_srv_rate
srv_diff_host_rate
dst_host_count
dst_host_srv_count
dst_host_same_srv_rate
dst_host_diff_srv_rate
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
dst_host_serror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
dst_host_srv_rerror_rate
class


Next, dropping the `src_bytes` column and converting the categorical variables with one-hot encoding

In [9]:
# Remove `src_bytes` and one-hot encode the remaining categorical columns.
# The drop is non-in-place and tolerant of a missing column, so re-running this
# cell on the already-processed frame is a no-op instead of a KeyError.
# NOTE(review): the notebook does not say why `src_bytes` is excluded - confirm.
df = pd.get_dummies(df.drop(columns=['src_bytes'], errors='ignore'))


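Now some more data processing. Here I:

- create two dataframes, one containing the normal connections, and one for the anomalies
- perform the data splitting of the normal connections for training (80%), testing (10%) and validation (10%)
- fit the scaler on the training split only, apply it to the other splits, and run a per-feature KS test to check that the training and validation sets follow the same distribution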

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Assumes one-hot encoding produced the binary 'class_normal' and
# 'class_anomaly' indicator columns.
label_columns = ['class_normal', 'class_anomaly']
normal_data = df[df["class_normal"] == 1]
anomalies = df[df['class_anomaly'] == 1]

# Split the normal rows 80 / 10 / 10: first hold out 20%, then halve the
# hold-out into validation and normal-test parts.
X_train, X_temp = train_test_split(normal_data, test_size=0.2, random_state=42)
X_val, X_test_normal = train_test_split(X_temp, test_size=0.5, random_state=42)

# The label indicators must not be fed to the model as features.
X_train = X_train.drop(label_columns, axis=1)
X_val = X_val.drop(label_columns, axis=1)
X_test_normal = X_test_normal.drop(label_columns, axis=1)

# Fit the scaler on the training rows only, then reuse its statistics for
# every other split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_normal_scaled = scaler.transform(X_test_normal)

# Draw as many anomalies as there are normal test rows. The seed is fixed so
# the test set, and every metric computed on it, is identical across runs.
X_test_anomaly = anomalies.sample(n=len(X_test_normal), random_state=42)
X_test_anomaly_scaled = scaler.transform(X_test_anomaly.drop(label_columns, axis=1))

# Final test set: scaled normal rows first, scaled anomaly rows second.
X_test_scaled = np.concatenate([X_test_normal_scaled, X_test_anomaly_scaled])

# Ground truth in the same order: 0 = normal, 1 = anomaly.
y_true_normal = np.zeros(len(X_test_normal))
y_true_anomaly = np.ones(len(X_test_anomaly))
y_true = np.concatenate([y_true_normal, y_true_anomaly])

from scipy.stats import ks_2samp
import numpy as np

# Number of feature columns in the scaled training array.
num_features = X_train_scaled.shape[1]

# Two-sample Kolmogorov-Smirnov test per feature column: training vs. validation.
ks_results = [
    ks_2samp(X_train_scaled[:, column], X_val_scaled[:, column])
    for column in range(num_features)
]
ks_statistics = [result.statistic for result in ks_results]
p_values = [result.pvalue for result in ks_results]

# Significance level below which a feature counts as differently distributed.
alpha = 0.05

# Number of features whose p-value falls below alpha.
different_distribution_count = sum(1 for p_value in p_values if p_value < alpha)

print(f"Features with distribution differences: {different_distribution_count} out of {num_features}")

# Per-feature detail.
for i, (statistic, p_value) in enumerate(zip(ks_statistics, p_values)):
    print(f"Feature {i}: KS Statistic={statistic}, P-Value={p_value}")


Features with distribution differences: 0 out of 121
Feature 0: KS Statistic=0.004177817408689388, P-Value=0.9999331339965543
Feature 1: KS Statistic=0.016295775734466056, P-Value=0.08232760559006458
Feature 2: KS Statistic=0.00012993280617734637, P-Value=1.0
Feature 3: KS Statistic=0.0, P-Value=1.0
Feature 4: KS Statistic=0.00011137648959225199, P-Value=1.0
Feature 5: KS Statistic=0.0012062818611432435, P-Value=1.0
Feature 6: KS Statistic=0.00020415255964878476, P-Value=1.0
Feature 7: KS Statistic=0.006767443453043198, P-Value=0.9450731593592951
Feature 8: KS Statistic=0.0004270984892434937, P-Value=1.0
Feature 9: KS Statistic=0.00011145366975640059, P-Value=1.0
Feature 10: KS Statistic=9.276504431865451e-05, P-Value=1.0
Feature 11: KS Statistic=0.0007240491704241236, P-Value=1.0
Feature 12: KS Statistic=0.00040823345200224903, P-Value=1.0
Feature 13: KS Statistic=0.00035270783683749585, P-Value=1.0
Feature 14: KS Statistic=0.0008352768125572396, P-Value=1.0
Feature 15: KS Statistic=0

In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.optimizers import Adam

# Seed TensorFlow's global random generator so repeated runs of this cell
# start from the same state.
tf.random.set_seed(42)

# Cast every array to float32 before handing it to Keras.
X_train = np.asarray(X_train).astype(np.float32)
X_val = np.asarray(X_val).astype(np.float32)
X_train_scaled = np.asarray(X_train_scaled).astype(np.float32)
X_val_scaled = np.asarray(X_val_scaled).astype(np.float32)
X_test_scaled = np.asarray(X_test_scaled).astype(np.float32)

input_dim = X_train_scaled.shape[1]
# Width of the bottleneck layer. Previously this variable was set to 16 but
# never used while the layer was hard-coded to 64; the architecture is unchanged.
encoding_dim = 64

# Encoder: input_dim -> 128 -> encoding_dim
input_layer = Input(shape=(input_dim,))
encoder = Dense(128, activation="relu", activity_regularizer=l2(1e-3))(input_layer)
encoder = Dense(encoding_dim, activation="relu")(encoder)  # Bottleneck layer

# Decoder: encoding_dim -> 128 -> input_dim
decoder = Dense(128, activation="relu", activity_regularizer=l2(1e-3))(encoder)
# The targets are StandardScaler outputs, i.e. unbounded and frequently
# negative. A sigmoid output can only produce values in (0, 1) and therefore
# cannot reconstruct them; a linear output can.
decoder = Dense(input_dim, activation='linear')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

# Adam optimizer with an explicit learning rate.
learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

autoencoder.compile(optimizer=optimizer, loss='mean_squared_error')

# Train the network to reproduce its own (scaled) input; the validation split
# is only used to monitor the loss.
history = autoencoder.fit(X_train_scaled, X_train_scaled,
                epochs=100,
                batch_size=256,
                shuffle=True,
                validation_data=(X_val_scaled, X_val_scaled))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [None]:

# Reconstruct the test set and score each row by the mean of its squared
# reconstruction residuals.
X_test_pred = autoencoder.predict(X_test_scaled)
reconstruction_error = np.square(X_test_scaled - X_test_pred).mean(axis=1)

# Rows whose error exceeds the cut-off are labelled 1 (anomaly), all others 0.
# NOTE(review): 0.95 is a fixed value on the error scale - presumably it should
# be derived from the validation errors; confirm.
threshold = 0.95
y_pred = (reconstruction_error > threshold).astype(int).tolist()


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the thresholded predictions against the ground-truth labels.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Report the four summary metrics.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Confusion matrix (scikit-learn convention: rows = true class, columns = predicted class).
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))


In [None]:
# Candidate cut-offs on the reconstruction error: 100 evenly spaced values in [0, 1].
thresholds = np.linspace(0, 1, 100)

# One entry per candidate cut-off.
accuracies = []
precisions = []
recalls = []
f1_scores = []
false_negatives = []
false_positives = []

# The loop uses its own names (`candidate`, `candidate_pred`) so that the
# `threshold` and `y_pred` chosen in the evaluation cell are not overwritten.
for candidate in thresholds:
    candidate_pred = (reconstruction_error > candidate).astype(int)

    accuracies.append(accuracy_score(y_true, candidate_pred))
    precisions.append(precision_score(y_true, candidate_pred))
    recalls.append(recall_score(y_true, candidate_pred))
    f1_scores.append(f1_score(y_true, candidate_pred))

    # Flattened 2x2 confusion matrix: tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, candidate_pred).ravel()
    false_negatives.append(fn)
    false_positives.append(fp)

# Create the figure once, with the intended size. A separate plt.figure() call
# before plt.subplots() would leave an empty extra figure behind.
fig, ax1 = plt.subplots(figsize=(12, 8))

# One line per metric, all against the same threshold axis.
accuracy_line, = ax1.plot(thresholds, accuracies, label='Accuracy', color='g')
precision_line, = ax1.plot(thresholds, precisions, label='Precision', color='b')
recall_line, = ax1.plot(thresholds, recalls, label='Recall', color='r')
f1_score_line, = ax1.plot(thresholds, f1_scores, label='F1 Score', color='c')

# Labels and title
ax1.set_xlabel('Threshold')
ax1.set_ylabel('Metrics')
ax1.set_title('Model Performance Across Different Thresholds')

# Single figure-level legend built from the plotted lines.
handles = [accuracy_line, precision_line, recall_line, f1_score_line]
labels = [h.get_label() for h in handles]
fig.legend(handles, labels, loc='lower right', bbox_to_anchor=(0.9, 0.11))

plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import numpy as np

# ROC analysis needs no fixed cut-off: roc_curve works directly on the raw
# reconstruction errors, scored against y_true (0 = normal, 1 = anomaly).
# Note that this rebinds `thresholds` to the cut-offs returned by roc_curve.
roc_points = roc_curve(y_true, reconstruction_error)
fpr, tpr, thresholds = roc_points

# Area under the (fpr, tpr) curve.
roc_auc = auc(x=fpr, y=tpr)

print(f"AUC: {roc_auc}")

import matplotlib.pyplot as plt

# ROC curve drawn with the explicit figure/axes interface.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch, as recorded by autoencoder.fit.
loss_fig, loss_ax = plt.subplots(figsize=(8, 6))
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss Over Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(loc='upper right')
plt.show()
