# Description
This notebook validates the synthetic data generation by training a random forest classifier on the simulated data and testing it on real data.

In [92]:
# Python
import os

# Plotting
import matplotlib.pyplot as plt

# Data handling
import pandas as pd
import numpy as np

# Machine learning modeling
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Classification Metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Load & prepare data

In [87]:
# === Load Data ===

# Paths are assembled with os.path.join; the resulting strings are the same
# as joining the parts with os.sep by hand.
df_simulated = pd.read_parquet(os.path.join('data', 'output', 'simulated_data.parquet'))
df_real = pd.read_parquet(os.path.join('data', 'input', 'kddcup99_data_smurf_&_normal_&_neptune.parquet'))

# Use only the columns of the simulated data that also exist in the real data.
# This assumes the relevant columns were determined previously (feature selection).
# reset_index() moves the index levels back into regular columns, after which
# the simulation bookkeeping columns 'Step' and 'AgentID' are dropped.
# Reassigning (instead of inplace=True) keeps every step an explicit new frame.
df_simulated = df_simulated.reset_index().drop(columns=['Step', 'AgentID'])

# Column names of the simulated data after dropping the bookkeeping columns.
# Captured before 'data_type' is added, so it lists the shared columns only.
X_cols = df_simulated.columns

# Fail early, naming the offending columns, if the real data cannot provide
# everything the simulated data has (plain indexing would also raise KeyError,
# but with a less specific message).
missing_cols = X_cols.difference(df_real.columns)
if len(missing_cols) > 0:
    raise KeyError(
        f"Real data is missing columns present in the simulated data: {missing_cols.tolist()}"
    )

# Keep only the shared columns in the real data. The explicit .copy() makes
# df_real an independent frame, so adding 'data_type' below does not assign
# into a selection of the loaded frame (SettingWithCopyWarning).
df_real = df_real[X_cols].copy()

# Add a column to distinguish between simulated and real data.
df_simulated['data_type'] = 'simulated'
df_real['data_type'] = 'real'

# Combine the real and simulated data into a single DataFrame so both parts
# go through exactly the same encoding steps.
df = pd.concat([df_real, df_simulated])

# === Target Variable Encoding ===

# Encode the target variable ('target') into numerical labels. The encoder is
# fitted on the combined data, so real and simulated rows share one label mapping.
le = LabelEncoder()
df['target_encoded'] = le.fit_transform(df['target'])

# === Feature Encoding ===

# One-hot encode the categorical columns 'service' and 'protocol_type', then
# drop the original 'target' column as it has been encoded above.
df_encoded = pd.get_dummies(df, columns=['service', 'protocol_type']).drop(columns='target')

# === Data Separation ===

# Split the encoded data back into real and simulated parts.
is_real = df_encoded['data_type'] == 'real'
is_simulated = df_encoded['data_type'] == 'simulated'
df_encoded_real = df_encoded[is_real]
df_encoded_sim = df_encoded[is_simulated]

# Every row must land in exactly one of the two parts.
assert len(df_encoded_real) + len(df_encoded_sim) == len(df_encoded)

# Separate features (X) and target (y) for both real and simulated data.
non_feature_cols = ['target_encoded', 'data_type']

X_real = df_encoded_real.drop(columns=non_feature_cols)
y_real = df_encoded_real['target_encoded']

X_sim = df_encoded_sim.drop(columns=non_feature_cols)
y_sim = df_encoded_sim['target_encoded']

# Classifier

In [88]:
# Unfitted estimator.
# NOTE(review): nothing in the visible cells uses `classifier`; only `clf`
# below is trained and evaluated — confirm whether this name is still needed.
classifier = RandomForestClassifier(random_state=42)

# Split into training and testing sets.
# Real data: half of the rows are held out; X_test_real / y_test_real are the
# parts used by the evaluation cells.
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    X_real,
    y_real,
    test_size=0.5,
    random_state=42,
)

# Simulated data: 95% of the rows go to training, 5% to the simulated test split.
X_train_sim, X_test_sim, y_train_sim, y_test_sim = train_test_split(
    X_sim,
    y_sim,
    test_size=0.05,
    random_state=42,
)

# Train the model on simulated data only (fit returns the fitted estimator).
clf = RandomForestClassifier(random_state=42).fit(X_train_sim, y_train_sim)

# Evaluation

In [89]:
# Predict encoded class labels for the real-data test split using the model
# that was fitted on the simulated training split.
y_pred = clf.predict(X_test_real)

In [90]:
# Per-class F1 on the real test split: average=None yields one score per
# label rather than a single aggregated value.
f1 = f1_score(y_true=y_test_real, y_pred=y_pred, average=None)
print(f1)

[0.9997951  0.99992869]
