<a href="https://colab.research.google.com/github/ericyoc/malicious_js_id_proc/blob/main/malicious_js_id_poc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install numpy pandas scikit-learn tensorflow matplotlib prettytable

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from prettytable import PrettyTable
import random
import string
import os
from scipy import stats

In [3]:
# Set the content directory
content_dir = '/content'

In [4]:
def load_category_data(category_file):
    print("Loading category data...")
    category_df = pd.read_csv(category_file)
    print(f"Category file columns: {category_df.columns}")

    X = []
    y = []
    for i, (_, row) in enumerate(category_df.iterrows()):
        features = extract_advanced_features(row)
        X.append(features)
        y.append(1 if row['category'] == 'worm' else 0)
        if i % 100 == 0:
            print(f"Processing sample {i}/{len(category_df)}", end="\r")

    print("Data loading complete.")
    return np.array(X), np.array(y)

In [5]:
def load_metadata_data(metadata_file):
    print("Loading metadata data...")
    metadata_df = pd.read_csv(metadata_file)
    print(f"Metadata file columns: {metadata_df.columns}")

    X = []
    for i, (_, row) in enumerate(metadata_df.iterrows()):
        features = extract_advanced_features(row)
        X.append(features)
        if i % 100 == 0:
            print(f"Processing sample {i}/{len(metadata_df)}", end="\r")

    print("Data loading complete.")
    return np.array(X)

In [6]:
def extract_advanced_features(row):
    features = {
        'sha_length': len(str(row.get('sha256', row.get('sha', '')))),
        'category_length': len(str(row.get('category', ''))),
        'timestamp_length': len(str(row.get('timestamp', ''))),
        'family_length': len(str(row.get('family', ''))),
        'num_unique_chars': len(set(str(row))),
        'total_string_length': len(str(row))
    }
    return list(features.values())

In [7]:
def apply_obfuscation(data, obfuscation_level):
    if obfuscation_level == 'low':
        noise = np.random.normal(0, 0.1, data.shape).astype(data.dtype)
        return data.astype(np.float64) + noise
    elif obfuscation_level == 'medium':
        X_obf = []
        for row in data:
            obfuscated_row = []
            for feature in row:
                if isinstance(feature, str):
                    new_feature = ''.join(random.choices(string.ascii_letters + string.digits, k=int(feature * 1.5)))
                else:
                    new_feature = feature
                obfuscated_row.append(new_feature)
            X_obf.append(obfuscated_row)
        return np.array(X_obf, dtype=object)
    elif obfuscation_level == 'high':
        return data[:, np.random.permutation(data.shape[1])]
    else:
        return data

In [8]:
def deobfuscate_data(X, obfuscation_level):
    if obfuscation_level == 'none':
        return X
    elif obfuscation_level == 'low':
        return X  # No real deobfuscation for low level
    elif obfuscation_level == 'medium':
        return X  # No real deobfuscation for medium level
    elif obfuscation_level == 'high':
        return X[:, np.argsort(range(X.shape[1]))]  # Attempt to reverse the column shuffle
    return X

In [9]:
def preprocess_data(X, y=None, obfuscation_level='none'):
    print(f"Preprocessing data with obfuscation level: {obfuscation_level}")
    X_obf = apply_obfuscation(X, obfuscation_level)
    X_deobf = deobfuscate_data(X_obf, obfuscation_level)
    if y is not None:
        return X_obf, X_deobf, y
    else:
        return X_obf, X_deobf

In [10]:
def create_model(input_dim, layers=[64, 32], learning_rate=0.001):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(layers[0], activation='relu'))
    for layer_size in layers[1:]:
        model.add(Dense(layer_size, activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [11]:
def optimize_hyperparameters(X, y):
    best_params = {}
    best_score = 0

    layers = [[64, 32], [128, 64], [256, 128, 64]]
    learning_rates = [0.001, 0.01]
    epochs = [10]
    batch_sizes = [32, 64]

    total_combinations = len(layers) * len(learning_rates) * len(epochs) * len(batch_sizes)
    combination_count = 0

    for layer_config in layers:
        for lr in learning_rates:
            for epoch in epochs:
                for batch_size in batch_sizes:
                    combination_count += 1
                    print(f"Testing configuration {combination_count}/{total_combinations}: layers={layer_config}, learning_rate={lr}, epochs={epoch}, batch_size={batch_size}")
                    model = create_model(input_dim=X.shape[1], layers=layer_config, learning_rate=lr)

                    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

                    history = model.fit(X_train, y_train, epochs=epoch, batch_size=batch_size, validation_data=(X_val, y_val), verbose=1)
                    val_accuracy = history.history['val_accuracy'][-1]

                    if val_accuracy > best_score:
                        best_score = val_accuracy
                        best_params = {
                            'layers': layer_config,
                            'learning_rate': lr,
                            'epochs': epoch,
                            'batch_size': batch_size
                        }

    return best_params

In [12]:
def plot_results(results_dict, title, filename):
    plt.figure(figsize=(12, 6))

    for level, data in results_dict.items():
        plt.plot(data['obfuscated']['history'].history['accuracy'], label=f"Obfuscated - {level}")
        plt.plot(data['deobfuscated']['history'].history['accuracy'], label=f"Deobfuscated - {level}")

    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    full_path = os.path.join(content_dir, filename)
    plt.savefig(full_path)
    plt.close()
    print(f"Plot saved to {full_path}")

In [13]:
def save_model(model, filename):
    full_path = os.path.join(content_dir, filename)
    model.save(full_path, save_format='keras')
    print(f"Model saved to {full_path}")

In [14]:
def save_metrics(metrics, filename):
    full_path = os.path.join(content_dir, filename)
    with open(full_path, 'w') as f:
        for metric_name, metric_value in metrics.items():
            f.write(f"{metric_name}: {metric_value}\n")
    print(f"Metrics saved to {full_path}")

In [15]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_binary = (y_pred > 0.5).astype(int)

    accuracy = model.evaluate(X_test, y_test)[1]
    roc_auc = roc_auc_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)

    return {
        'accuracy': accuracy,
        'roc_auc': roc_auc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [16]:
class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, build_fn, **kwargs):
        self.build_fn = build_fn
        self.kwargs = kwargs
        self.model = None

    def fit(self, X, y, **kwargs):
        if self.model is None:
            self.model = self.build_fn()
        self.model.fit(X, y, **kwargs)
        return self

    def predict(self, X):
        return (self.model.predict(X) > 0.5).astype(int)

    def predict_proba(self, X):
        proba = self.model.predict(X)
        return np.column_stack([1 - proba, proba])

    def get_params(self, deep=True):
        return {'build_fn': self.build_fn, **self.kwargs}

    def set_params(self, **params):
        self.kwargs.update(params)
        return self

In [17]:
from sklearn.model_selection import cross_val_score

def cross_validate_model(X, y, model_builder, cv=5):
    keras_classifier = KerasClassifierWrapper(build_fn=model_builder)
    cv_scores = cross_val_score(keras_classifier, X, y, cv=cv, scoring='accuracy')
    return cv_scores.mean(), cv_scores.std()

In [18]:
def create_model_wrapper(input_dim, layers, learning_rate):
    def model_builder():
        return create_model(input_dim, layers, learning_rate)
    return model_builder

In [19]:
def statistical_significance_test(results):
    obfuscation_levels = list(results.keys())
    obfuscated_accuracies = [results[level]['obfuscated']['metrics']['accuracy'] for level in obfuscation_levels]
    deobfuscated_accuracies = [results[level]['deobfuscated']['metrics']['accuracy'] for level in obfuscation_levels]

    f_statistic, p_value = stats.f_oneway(obfuscated_accuracies, deobfuscated_accuracies)
    return f_statistic, p_value

In [20]:
def main():
    category_file = 'bodmas_malware_category.csv'
    metadata_file = 'bodmas_metadata.csv'

    # Load category data
    X_category, y_category = load_category_data(category_file)

    # Load metadata data
    X_metadata = load_metadata_data(metadata_file)

    print(f"Shape of X_category: {X_category.shape}")
    print(f"Shape of X_metadata: {X_metadata.shape}")

    # Ensure X_category and X_metadata have the same number of samples
    if X_category.shape[0] != X_metadata.shape[0]:
        min_samples = min(X_category.shape[0], X_metadata.shape[0])
        X_category = X_category[:min_samples]
        X_metadata = X_metadata[:min_samples]
        y_category = y_category[:min_samples]
        print(f"Adjusted shapes to {min_samples} samples")

    # Concatenate the category and metadata features
    X = np.hstack([X_category, X_metadata])

    obfuscation_levels = ['none', 'low', 'medium', 'high']
    results = {}

    for level in obfuscation_levels:
        print(f"\nProcessing obfuscation level: {level}")
        X_obf, X_deobf, y_obf = preprocess_data(X, y_category, level)

        results[level] = {'obfuscated': {}, 'deobfuscated': {}}

        for data_type, X_processed in [('obfuscated', X_obf), ('deobfuscated', X_deobf)]:
            X_train, X_test, y_train, y_test = train_test_split(X_processed, y_obf, test_size=0.2, random_state=42)

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            print(f"Optimizing hyperparameters for {data_type} data...")
            best_params = optimize_hyperparameters(X_train_scaled, y_train)

            print(f"Best parameters for {data_type}: {best_params}")

            model_builder = create_model_wrapper(input_dim=X_train_scaled.shape[1],
                                                 layers=best_params['layers'],
                                                 learning_rate=best_params['learning_rate'])

            # Perform cross-validation
            cv_mean, cv_std = cross_validate_model(X_train_scaled, y_train, model_builder)
            print(f"Cross-validation accuracy for {data_type}: {cv_mean:.4f} (+/- {cv_std:.4f})")

            print(f"Training model on {data_type} data...")
            model = create_model(input_dim=X_train_scaled.shape[1],
                                 layers=best_params['layers'],
                                 learning_rate=best_params['learning_rate'])

            history = model.fit(X_train_scaled, y_train,
                                epochs=best_params['epochs'],
                                batch_size=best_params['batch_size'],
                                validation_split=0.2,
                                verbose=1)

            metrics = evaluate_model(model, X_test_scaled, y_test)
            results[level][data_type] = {'metrics': metrics, 'history': history}

            save_model(model, f'model_{level}_{data_type}.keras')
            save_metrics(metrics, f'metrics_{level}_{data_type}.txt')

    # Perform statistical significance test
    f_statistic, p_value = statistical_significance_test(results)
    print(f"\nStatistical Significance Test:")
    print(f"F-statistic: {f_statistic}")
    print(f"p-value: {p_value}")

    # Plot results
    plot_results(results, "Model Performance Across Obfuscation Levels", "performance_plot.png")

    # Summary table
    table = PrettyTable()
    table.field_names = ["Obfuscation Level", "Data Type", "Accuracy", "ROC AUC", "Precision", "Recall", "F1 Score"]
    for level, data in results.items():
        for data_type in ['obfuscated', 'deobfuscated']:
            metrics = data[data_type]['metrics']
            table.add_row([
                level,
                data_type,
                f"{metrics['accuracy']:.4f}",
                f"{metrics['roc_auc']:.4f}",
                f"{metrics['precision']:.4f}",
                f"{metrics['recall']:.4f}",
                f"{metrics['f1']:.4f}"
            ])
    print("\nPerformance Summary:")
    print(table)

    # Save the summary table
    table_str = table.get_string()
    with open(os.path.join(content_dir, 'summary_table.txt'), 'w') as f:
        f.write(table_str)
    print(f"Summary table saved to {os.path.join(content_dir, 'summary_table.txt')}")

In [21]:
if __name__ == "__main__":
    main()

Loading category data...
Category file columns: Index(['sha256', 'category'], dtype='object')
Data loading complete.
Loading metadata data...
Metadata file columns: Index(['sha', 'timestamp', 'family'], dtype='object')
Data loading complete.
Shape of X_category: (57293, 6)
Shape of X_metadata: (134435, 6)
Adjusted shapes to 57293 samples

Processing obfuscation level: none
Preprocessing data with obfuscation level: none
Optimizing hyperparameters for obfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9478 - loss: 0.1594 - val_accuracy: 0.9992 - val_loss: 0.0123
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9993 - loss: 0.0076 - val_accuracy: 0.9992 - val_loss: 0.0131
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9993 

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9907 - loss: 0.0493
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9863 - loss: 0.0466
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9925 - loss: 0.0391
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9912 - loss: 0.0524
Cross-validation accuracy for obfuscated: nan (+/- nan)
Training model on obfuscated data...
Epoch 1/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9784 - loss: 0.0664 - val_accuracy: 0.9993 - val_loss: 0.0057
Epoch 2/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0055 - val_accuracy: 0.9993 - val_loss: 0.0045
Epoch 3/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0044 - val_accuracy: 0.9



Model saved to /content/model_none_obfuscated.keras
Metrics saved to /content/metrics_none_obfuscated.txt
Optimizing hyperparameters for deobfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9449 - loss: 0.1660 - val_accuracy: 0.9992 - val_loss: 0.0240
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0055 - val_accuracy: 0.9992 - val_loss: 0.0262
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9997 - loss: 0.0036 - val_accuracy: 0.9992 - val_loss: 0.0308
Epoch 4/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0043 - val_accuracy: 0.9992 - val_loss: 0.0347
Epoch 5/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accur

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9863 - loss: 0.0438
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9906 - loss: 0.0442
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9924 - loss: 0.0363
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9904 - loss: 0.0453
Cross-validation accuracy for deobfuscated: nan (+/- nan)
Training model on deobfuscated data...
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9891 - loss: 0.0593 - val_accuracy: 0.9975 - val_loss: 0.0131
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9992 - loss: 0.0118 - val_accuracy: 0.9892 - val_loss: 0.0510
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9984 - loss: 0.0225 - val_acc



Model saved to /content/model_none_deobfuscated.keras
Metrics saved to /content/metrics_none_deobfuscated.txt

Processing obfuscation level: low
Preprocessing data with obfuscation level: low
Optimizing hyperparameters for obfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9510 - loss: 0.1412 - val_accuracy: 0.9992 - val_loss: 0.0138
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0053 - val_accuracy: 0.9992 - val_loss: 0.0155
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0046 - val_accuracy: 0.9992 - val_loss: 0.0149
Epoch 4/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0051 - val_accuracy: 0.9992 - val_loss: 0.0143
Epoch 5/10


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9848 - loss: 0.0406
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9868 - loss: 0.0434
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9881 - loss: 0.0417
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9884 - loss: 0.0430
Cross-validation accuracy for obfuscated: nan (+/- nan)
Training model on obfuscated data...
Epoch 1/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9722 - loss: 0.0726 - val_accuracy: 0.9935 - val_loss: 0.0211
Epoch 2/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9982 - loss: 0.0283 - val_accuracy: 0.9993 - val_loss: 0.0056
Epoch 3/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0042 - val_accuracy: 0.9



Model saved to /content/model_low_obfuscated.keras
Metrics saved to /content/metrics_low_obfuscated.txt
Optimizing hyperparameters for deobfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9638 - loss: 0.1493 - val_accuracy: 0.9992 - val_loss: 0.0147
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0040 - val_accuracy: 0.9992 - val_loss: 0.0155
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0062 - val_accuracy: 0.9992 - val_loss: 0.0162
Epoch 4/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0043 - val_accuracy: 0.9992 - val_loss: 0.0165
Epoch 5/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accurac

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9879 - loss: 0.0446
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9847 - loss: 0.0507
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9842 - loss: 0.0469
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9923 - loss: 0.0364
Cross-validation accuracy for deobfuscated: nan (+/- nan)
Training model on deobfuscated data...
Epoch 1/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9755 - loss: 0.0614 - val_accuracy: 0.9993 - val_loss: 0.0063
Epoch 2/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0063 - val_accuracy: 0.9993 - val_loss: 0.0070
Epoch 3/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0046 - val_accuracy:



Model saved to /content/model_low_deobfuscated.keras
Metrics saved to /content/metrics_low_deobfuscated.txt

Processing obfuscation level: medium
Preprocessing data with obfuscation level: medium
Optimizing hyperparameters for obfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9511 - loss: 0.1510 - val_accuracy: 0.9992 - val_loss: 0.0080
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0070 - val_accuracy: 0.9995 - val_loss: 0.0056
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0058 - val_accuracy: 0.9992 - val_loss: 0.0268
Epoch 4/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0067 - val_accuracy: 0.9992 - val_loss: 0.0283
Epoch 5

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9587 - loss: 0.1175
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9686 - loss: 0.1012
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9780 - loss: 0.1000
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9804 - loss: 0.0969
Cross-validation accuracy for obfuscated: nan (+/- nan)
Training model on obfuscated data...
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9621 - loss: 0.1236 - val_accuracy: 0.9993 - val_loss: 0.0072
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0060 - val_accuracy: 0.9993 - val_loss: 0.0063
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9992 - loss: 0.0083 - val_accurac



Model saved to /content/model_medium_obfuscated.keras
Metrics saved to /content/metrics_medium_obfuscated.txt
Optimizing hyperparameters for deobfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9620 - loss: 0.1421 - val_accuracy: 0.9992 - val_loss: 0.0172
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0054 - val_accuracy: 0.9992 - val_loss: 0.0189
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9996 - loss: 0.0046 - val_accuracy: 0.9992 - val_loss: 0.0212
Epoch 4/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0046 - val_accuracy: 0.9992 - val_loss: 0.0215
Epoch 5/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - a

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9605 - loss: 0.1115
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9768 - loss: 0.0994
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9753 - loss: 0.0981
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9538 - loss: 0.1192
Cross-validation accuracy for deobfuscated: nan (+/- nan)
Training model on deobfuscated data...
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9763 - loss: 0.1038 - val_accuracy: 0.9993 - val_loss: 0.0069
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0054 - val_accuracy: 0.9993 - val_loss: 0.0066
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9993 - loss: 0.0093 - val_acc



Model saved to /content/model_medium_deobfuscated.keras
Metrics saved to /content/metrics_medium_deobfuscated.txt

Processing obfuscation level: high
Preprocessing data with obfuscation level: high
Optimizing hyperparameters for obfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9458 - loss: 0.1638 - val_accuracy: 0.9992 - val_loss: 0.0149
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9994 - loss: 0.0064 - val_accuracy: 0.9992 - val_loss: 0.0157
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9992 - loss: 0.0078 - val_accuracy: 0.9992 - val_loss: 0.0171
Epoch 4/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0050 - val_accuracy: 0.9992 - val_loss: 0.0173
Epoch

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9876 - loss: 0.0414
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9867 - loss: 0.0467
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9925 - loss: 0.0404
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9831 - loss: 0.0595
Cross-validation accuracy for obfuscated: nan (+/- nan)
Training model on obfuscated data...
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9882 - loss: 0.0421 - val_accuracy: 0.9993 - val_loss: 0.0060
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9997 - loss: 0.0032 - val_accuracy: 0.9992 - val_loss: 0.0051
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9994 - loss: 0.0049 - val_accurac



Model saved to /content/model_high_obfuscated.keras
Metrics saved to /content/metrics_high_obfuscated.txt
Optimizing hyperparameters for deobfuscated data...
Testing configuration 1/12: layers=[64, 32], learning_rate=0.001, epochs=10, batch_size=32
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9500 - loss: 0.1467 - val_accuracy: 0.9992 - val_loss: 0.0106
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0052 - val_accuracy: 0.9992 - val_loss: 0.0109
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0053 - val_accuracy: 0.9992 - val_loss: 0.0121
Epoch 4/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9996 - loss: 0.0037 - val_accuracy: 0.9992 - val_loss: 0.0126
Epoch 5/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accur

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'KerasClassifierWrapper' object has no attribute 'classes_'



[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9901 - loss: 0.0360
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9918 - loss: 0.0489
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9837 - loss: 0.0567
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9890 - loss: 0.0437
Cross-validation accuracy for deobfuscated: nan (+/- nan)
Training model on deobfuscated data...
Epoch 1/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9915 - loss: 0.0389 - val_accuracy: 0.9993 - val_loss: 0.0097
Epoch 2/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0063 - val_accuracy: 0.9993 - val_loss: 0.0049
Epoch 3/10
[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0031 - val_acc



Model saved to /content/model_high_deobfuscated.keras
Metrics saved to /content/metrics_high_deobfuscated.txt

Statistical Significance Test:
F-statistic: 1.5881502188669827
p-value: 0.25438489745449483
Plot saved to /content/performance_plot.png

Performance Summary:
+-------------------+--------------+----------+---------+-----------+--------+----------+
| Obfuscation Level |  Data Type   | Accuracy | ROC AUC | Precision | Recall | F1 Score |
+-------------------+--------------+----------+---------+-----------+--------+----------+
|        none       |  obfuscated  |  0.9999  |  1.0000 |   0.9997  | 1.0000 |  0.9998  |
|        none       | deobfuscated |  0.9995  |  0.9999 |   0.9982  | 1.0000 |  0.9991  |
|        low        |  obfuscated  |  0.9999  |  0.9999 |   0.9997  | 1.0000 |  0.9998  |
|        low        | deobfuscated |  0.9999  |  0.9999 |   0.9997  | 1.0000 |  0.9998  |
|       medium      |  obfuscated  |  0.9999  |  0.9999 |   0.9997  | 1.0000 |  0.9998  |
|       med

The results suggest that the model is highly effective and robust, maintaining near-perfect performance across various levels of obfuscation and both types of data.

The statistical test reinforces this conclusion, indicating that any minor variations in the metrics are likely due to random chance rather than a true difference in performance.

The significance of these results to cybersecurity is that they demonstrate a highly effective model for detecting obfuscated threats, ensuring that security measures remain robust even as attackers employ advanced techniques to hide their malicious activities. This contributes to the overall strengthening of cybersecurity defenses, making systems more resilient against both known and emerging threats.