In [6]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Load the dataset
data = pd.read_csv('../data/DoS_dataset.csv')  # Replace with your actual dataset path

# Data Preprocessing
def preprocess_data(data):
    scaler = StandardScaler()
    return scaler.fit_transform(data)

# Assuming the dataset doesn't have labels
X = preprocess_data(data.values)

# Split the data into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Build the autoencoder model
def build_autoencoder(input_dim):
    # Encoder
    input_layer = layers.Input(shape=(input_dim,))
    encoder = layers.Dense(64, activation="relu")(input_layer)
    encoder = layers.Dense(32, activation="relu")(encoder)
    encoder = layers.Dense(16, activation="relu")(encoder)
    
    # Decoder
    decoder = layers.Dense(32, activation="relu")(encoder)
    decoder = layers.Dense(64, activation="relu")(decoder)
    decoder = layers.Dense(input_dim, activation="sigmoid")(decoder)
    
    # Full model
    autoencoder = models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    
    return autoencoder

# Initialize the model
input_dim = X_train.shape[1]  # Number of features in the dataset
autoencoder = build_autoencoder(input_dim)

# Train the model
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

# Anomaly Detection
def detect_anomalies(data, model, threshold=0.1):
    reconstructions = model.predict(data)
    reconstruction_loss = np.mean(np.abs(data - reconstructions), axis=1)
    anomalies = reconstruction_loss > threshold
    return anomalies

# Detect anomalies in the test data
anomalies = detect_anomalies(X_test, autoencoder)

print("Detected anomalies:", anomalies)


ValueError: could not convert string to float: '018f'

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

# Load the dataset (replace with your dataset path)
data = pd.read_csv('../data/DoS_dataset.csv')

# Check data types to identify non-numeric columns
print(data.dtypes)

# Assuming that 'message_id' is the non-numeric column and needs encoding
# Replace 'message_id' with the actual non-numeric column in your dataset
label_encoder = LabelEncoder()
data['message_id_encoded'] = label_encoder.fit_transform(data['message_id'])

# Drop the original non-numeric column (optional)
data = data.drop(columns=['message_id'])

# Preprocess the numeric data (standard scaling)
scaler = StandardScaler()
X = scaler.fit_transform(data.values)

# Split the data into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Build the autoencoder model
def build_autoencoder(input_dim):
    # Encoder
    input_layer = layers.Input(shape=(input_dim,))
    encoder = layers.Dense(64, activation="relu")(input_layer)
    encoder = layers.Dense(32, activation="relu")(encoder)
    encoder = layers.Dense(16, activation="relu")(encoder)
    
    # Decoder
    decoder = layers.Dense(32, activation="relu")(encoder)
    decoder = layers.Dense(64, activation="relu")(decoder)
    decoder = layers.Dense(input_dim, activation="sigmoid")(decoder)
    
    # Full model
    autoencoder = models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    
    return autoencoder

# Initialize the model
input_dim = X_train.shape[1]  # Number of features in the dataset
autoencoder = build_autoencoder(input_dim)

# Train the model
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

# Anomaly Detection based on reconstruction loss
def detect_anomalies(data, model, threshold=0.1):
    reconstructions = model.predict(data)
    reconstruction_loss = np.mean(np.abs(data - reconstructions), axis=1)
    anomalies = reconstruction_loss > threshold
    return anomalies, reconstruction_loss

# Detect anomalies in the test data
anomalies, reconstruction_loss = detect_anomalies(X_test, autoencoder)

# Display anomalies and corresponding reconstruction loss
print("Anomalies detected:", anomalies)
print("Reconstruction loss:", reconstruction_loss)


1478198376.389427    float64
0316                  object
8                      int64
05                    object
21                    object
68                    object
09                    object
21.1                  object
21.2                  object
00                    object
6f                    object
R                     object
dtype: object


KeyError: 'message_id'

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

# Load the dataset (replace with your dataset path)
data = pd.read_csv('../data/DoS_dataset.csv')

# Step 1: Inspect the column names to find non-numeric columns
print("Columns in the dataset:", data.columns)

# Step 2: Identify and handle non-numeric columns
# For example, let's assume the non-numeric column is 'ID' (replace 'ID' with the actual column name you want to encode)

# Check the data types of columns
print(data.dtypes)

# Use Label Encoding for non-numeric columns if any
# Replace 'ID' with the actual non-numeric column name
if 'ID' in data.columns:
    label_encoder = LabelEncoder()
    data['ID_encoded'] = label_encoder.fit_transform(data['ID'])
    
    # Drop the original non-numeric column
    data = data.drop(columns=['ID'])

# Step 3: Preprocess the data (scaling)
scaler = StandardScaler()
X = scaler.fit_transform(data.values)

# Split the data into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Step 4: Build the autoencoder model
def build_autoencoder(input_dim):
    # Encoder
    input_layer = layers.Input(shape=(input_dim,))
    encoder = layers.Dense(64, activation="relu")(input_layer)
    encoder = layers.Dense(32, activation="relu")(encoder)
    encoder = layers.Dense(16, activation="relu")(encoder)
    
    # Decoder
    decoder = layers.Dense(32, activation="relu")(encoder)
    decoder = layers.Dense(64, activation="relu")(decoder)
    decoder = layers.Dense(input_dim, activation="sigmoid")(decoder)
    
    # Full model
    autoencoder = models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    
    return autoencoder

# Initialize the model
input_dim = X_train.shape[1]  # Number of features in the dataset
autoencoder = build_autoencoder(input_dim)

# Step 5: Train the model
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

# Step 6: Anomaly Detection based on reconstruction loss
def detect_anomalies(data, model, threshold=0.1):
    reconstructions = model.predict(data)
    reconstruction_loss = np.mean(np.abs(data - reconstructions), axis=1)
    anomalies = reconstruction_loss > threshold
    return anomalies, reconstruction_loss

# Detect anomalies in the test data
anomalies, reconstruction_loss = detect_anomalies(X_test, autoencoder)

# Step 7: Display the detected anomalies and corresponding reconstruction loss
print("Anomalies detected:", anomalies)
print("Reconstruction loss:", reconstruction_loss)


Columns in the dataset: Index(['1478198376.389427', '0316', '8', '05', '21', '68', '09', '21.1',
       '21.2', '00', '6f', 'R'],
      dtype='object')
1478198376.389427    float64
0316                  object
8                      int64
05                    object
21                    object
68                    object
09                    object
21.1                  object
21.2                  object
00                    object
6f                    object
R                     object
dtype: object


ValueError: could not convert string to float: '018f'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

# Load the dataset (replace with your dataset path)
data = pd.read_csv('../data/DoS_dataset.csv')

# Step 1: Inspect the column names and data types
print("Columns in the dataset:", data.columns)
print(data.dtypes)

# Step 2: Handle non-numeric columns by applying Label Encoding
for column in data.columns:
    if data[column].dtype == 'object':  # Check if the column is non-numeric
        print(f"Encoding column: {column}")
        label_encoder = LabelEncoder()
        data[column] = label_encoder.fit_transform(data[column])

# Step 3: Preprocess the data (scaling)
scaler = StandardScaler()
X = scaler.fit_transform(data.values)  # Since there are no labels, all data is features (X)

# Split the data into training and testing sets
# Note: This split is still important even in unsupervised learning, to test the model on unseen data
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Step 4: Build the autoencoder model for unsupervised anomaly detection
def build_autoencoder(input_dim):
    # Encoder
    input_layer = layers.Input(shape=(input_dim,))
    encoder = layers.Dense(64, activation="relu")(input_layer)
    encoder = layers.Dense(32, activation="relu")(encoder)
    encoder = layers.Dense(16, activation="relu")(encoder)
    
    # Decoder
    decoder = layers.Dense(32, activation="relu")(encoder)
    decoder = layers.Dense(64, activation="relu")(decoder)
    decoder = layers.Dense(input_dim, activation="sigmoid")(decoder)
    
    # Full model
    autoencoder = models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')
    
    return autoencoder

# Initialize the autoencoder model
input_dim = X_train.shape[1]  # Number of features in the dataset
autoencoder = build_autoencoder(input_dim)

# Step 5: Train the model
# In unsupervised learning, we train the autoencoder only on the features (X_train)
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

# Step 6: Anomaly Detection based on reconstruction loss
def detect_anomalies(data, model, threshold=0.1):
    reconstructions = model.predict(data)
    reconstruction_loss = np.mean(np.abs(data - reconstructions), axis=1)
    anomalies = reconstruction_loss > threshold
    return anomalies, reconstruction_loss

# Detect anomalies in the test data
anomalies, reconstruction_loss = detect_anomalies(X_test, autoencoder)

# Step 7: Display the detected anomalies and corresponding reconstruction loss
print("Anomalies detected:", anomalies)
print("Reconstruction loss:", reconstruction_loss)


Columns in the dataset: Index(['1478198376.389427', '0316', '8', '05', '21', '68', '09', '21.1',
       '21.2', '00', '6f', 'R'],
      dtype='object')
1478198376.389427    float64
0316                  object
8                      int64
05                    object
21                    object
68                    object
09                    object
21.1                  object
21.2                  object
00                    object
6f                    object
R                     object
dtype: object
Encoding column: 0316
Encoding column: 05
Encoding column: 21
Encoding column: 68
Encoding column: 09
Encoding column: 21.1
Encoding column: 21.2
Encoding column: 00
Encoding column: 6f
Encoding column: R
Epoch 1/50
[1m91645/91645[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 3ms/step - loss: 0.5786 - val_loss: 0.5714
Epoch 2/50
[1m91645/91645[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 3ms/step - loss: 0.5702 - val_loss: 0.5703
Epoch 3/50
[1m91645/91645[0