<a href="https://colab.research.google.com/github/codexnyctis/104520751_concept4/blob/dev%2Fnur%2FML_training/classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup and Imports

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
import joblib
import json
from tensorflow.keras.models import Sequential
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import os

# Data Loading and Preprocessing

In [43]:
# Load the data
df = pd.read_csv('Obfuscated-MalMem2022.csv')

In [44]:
# Extract malware type and family from the 'category' column
def extract_malware_info(category):
    if category == 'Benign':
        return 'Benign', 'Benign'
    parts = category.split('-')
    if len(parts) >= 2:
        return parts[0], parts[1]
    return 'Unknown', 'Unknown'

df['Malware_Type'], df['Malware_Family'] = zip(*df['Category'].apply(extract_malware_info))

Presentation Set

In [45]:
# Extract presentation set using stratified sampling
df_main, df_present = train_test_split(
    df, test_size=0.002, stratify=df['Class'], random_state=42
)

# Save the presentation set to a new CSV file
presentation_file = 'presentation_samples.csv'
df_present.to_csv(presentation_file, index=False)

print(f"Presentation set saved to {presentation_file}")
print(f"Number of samples in presentation set: {len(df_present)}")

# Display class distribution in the presentation set
print("\nClass distribution in presentation set:")
print(df_present['Class'].value_counts(normalize=True))

print("\nClass distribution in main dataset:")
print(df_main['Class'].value_counts(normalize=True))

Presentation set saved to presentation_samples.csv
Number of samples in presentation set: 118

Class distribution in presentation set:
Class
Benign     0.5
Malware    0.5
Name: proportion, dtype: float64

Class distribution in main dataset:
Class
Benign     0.5
Malware    0.5
Name: proportion, dtype: float64


In [46]:
# Preprocess the data
def preprocess_data(df):
    # Separate features and targets from the file: We need to distinguish between input features and output labels
    X = df.drop(['Category', 'Class', 'Malware_Type', 'Malware_Family'], axis=1)
    y_binary = df['Class']
    y_4class = df['Malware_Type']
    y_16class = df['Malware_Family']

    # Label encode the targets: Converts categorical labels into numerical format
    le_binary = LabelEncoder()
    le_4class = LabelEncoder()
    le_16class = LabelEncoder()
    y_binary = le_binary.fit_transform(y_binary)
    y_4class = le_4class.fit_transform(y_4class)
    y_16class = le_16class.fit_transform(y_16class)

    # Remove constant features: Features that don't vary across samples don't provide useful information
    variance_selector = VarianceThreshold()
    X_var = variance_selector.fit_transform(X)
    X_var = pd.DataFrame(X_var, columns=X.columns[variance_selector.get_support()])

    print("\nShape after removing constant features:", X_var.shape)

    # Select top 16 features: Helps in dimensionality reduction
    feature_selector = SelectKBest(score_func=f_classif, k=min(16, X_var.shape[1]))
    try:
        X_selected = feature_selector.fit_transform(X_var, y_binary)
        selected_features = X_var.columns[feature_selector.get_support()].tolist()
    except:
        print("Error in feature selection. Using all non-constant features.")
        X_selected = X_var
        selected_features = X_var.columns.tolist()

    X_new = pd.DataFrame(X_selected, columns=selected_features)

    print("\nShape after feature selection:", X_new.shape)
    print("Selected features:", selected_features)

    # Standardise the features: Ensures all features are on the same scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_new)
    X_scaled = pd.DataFrame(X_scaled, columns=X_new.columns)

    print(f"\nFinal number of features after preprocessing: {X_scaled.shape[1]}")

    return X_scaled, y_binary, y_4class, y_16class, le_binary, le_4class, le_16class, scaler, feature_selector, variance_selector

# This is to take the variables to the global scope
X_scaled, y_binary, y_4class, y_16class, le_binary, le_4class, le_16class, scaler, feature_selector, variance_selector = preprocess_data(df_main)



Shape after removing constant features: (58478, 52)

Shape after feature selection: (58478, 16)
Selected features: ['pslist.avg_threads', 'dlllist.ndlls', 'dlllist.avg_dlls_per_proc', 'handles.nevent', 'handles.nkey', 'handles.nthread', 'handles.nsemaphore', 'handles.ntimer', 'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load', 'ldrmodules.not_in_init', 'ldrmodules.not_in_mem', 'svcscan.process_services', 'svcscan.shared_process_services', 'svcscan.nactive']

Final number of features after preprocessing: 16


**Data Analysis**

In [47]:
#To see if the preprocessing went correct and to observe class imbalance
print("\nData Analysis on Main Dataset (After Extracting Presentation Set):")
print("\nDistribution of binary classes:")
print(df_main['Class'].value_counts())

print("\nDistribution of 4-class categories:")
print(df_main['Malware_Type'].value_counts())

print("\nDistribution of 16-class categories:")
print(df_main['Malware_Family'].value_counts())

benign_percentage = (df_main['Class'] == 'Benign').mean() * 100
print(f"\nPercentage of benign samples: {benign_percentage:.2f}%")
print(f"Percentage of malicious samples: {100 - benign_percentage:.2f}%")


Data Analysis on Main Dataset (After Extracting Presentation Set):

Distribution of binary classes:
Class
Benign     29239
Malware    29239
Name: count, dtype: int64

Distribution of 4-class categories:
Malware_Type
Benign        29239
Spyware        9998
Ransomware     9770
Trojan         9471
Name: count, dtype: int64

Distribution of 16-class categories:
Malware_Family
Benign          29239
Transponder      2402
Gator            2194
Shade            2125
Refroso          2000
Ako              1998
180solutions     1997
CWS              1997
Scar             1993
Conti            1979
Emotet           1963
Maze             1954
Zeus             1946
Pysa             1714
Reconyc          1569
TIBS             1408
Name: count, dtype: int64

Percentage of benign samples: 50.00%
Percentage of malicious samples: 50.00%


# Model Creation

In [48]:
# Create base models: Three different types of ensemble classifiers
def create_base_models():
    return [
        # Random Forest with 100 trees
        RandomForestClassifier(n_estimators=100, random_state=42),
        # XGBoost with 100 boosting rounds
        XGBClassifier(n_estimators=100, random_state=42),
        # Extra Trees with 100 trees
        ExtraTreesClassifier(n_estimators=100, random_state=42)
    ]

# Create meta-learner: A neural network to combine predictions from base models
def create_meta_learner(n_classes, n_features):
    model = Sequential([
        Input(shape=(n_features,)),
        # First hidden layer with 256 neurons and ReLU activation
        Dense(256, activation='relu'),
        # Second hidden layer with 128 neurons and ReLU activation
        Dense(128, activation='relu')
    ])
    # Output layer based on the number of classes
    if n_classes == 2:
        # Binary classification: single neuron with sigmoid activation
        model.add(Dense(1, activation='sigmoid'))
    else:
        # Multi-class classification: softmax activation for multiple classes
        model.add(Dense(n_classes, activation='softmax'))

    return model

# Experiment

In [49]:
# Run a single experiment with cross-validation
def run_experiment(X, y, n_classes, seed):
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Initialize stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)
    results = []
    best_accuracy = 0
    best_models = None

    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        print(f"Fold {fold}")
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Train and predict with base models
        base_models = create_base_models()
        base_predictions = []
        for model in base_models:
            model.fit(X_train, y_train)
            if n_classes == 2:
                base_predictions.append(model.predict_proba(X_val)[:, 1].reshape(-1, 1))
            else:
                base_predictions.append(model.predict_proba(X_val))

        meta_features = np.hstack(base_predictions)

        # Train meta-learner on base model predictions
        meta_learner = create_meta_learner(n_classes, meta_features.shape[1])
        meta_learner.compile(
            optimizer='adam',
            loss='binary_crossentropy' if n_classes == 2 else 'sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        meta_learner.fit(meta_features, y_val, epochs=100, batch_size=32, verbose=0)

        # Make final predictions using the trained meta-learner
        final_predictions = meta_learner.predict(meta_features)
        if n_classes == 2:
            final_predictions = (final_predictions > 0.5).astype(int).flatten()
        else:
            final_predictions = np.argmax(final_predictions, axis=1)

        # Calculate performance metrics
        accuracy = accuracy_score(y_val, final_predictions)
        precision = precision_score(y_val, final_predictions, average='binary' if n_classes == 2 else 'weighted')
        recall = recall_score(y_val, final_predictions, average='binary' if n_classes == 2 else 'weighted')
        f1 = f1_score(y_val, final_predictions, average='binary' if n_classes == 2 else 'weighted')

        results.append((accuracy, precision, recall, f1))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_models = (base_models, meta_learner)

    return np.mean(results, axis=0), best_models

In [50]:
# Run multiple experiments and average the results
def run_multiple_experiments(X, y, n_classes, n_runs=5):
    all_results = []
    best_overall_accuracy = 0
    best_overall_models = None
    for run in range(n_runs):
        print(f"\nRun {run + 1}")
        results, models = run_experiment(X, y, n_classes, seed=run)
        all_results.append(results)
        if results[0] > best_overall_accuracy:
            best_overall_accuracy = results[0]
            best_overall_models = models
    return np.mean(all_results, axis=0), best_overall_models

# Model Training and Evaluation

In [51]:
# Print final results
def print_results(results, classification_type):
    print(f"\n{classification_type} Classification Results:")
    print(f"Accuracy: {results[0]:.4f}")
    print(f"Precision: {results[1]:.4f}")
    print(f"Recall: {results[2]:.4f}")
    print(f"F1-score: {results[3]:.4f}")

In [52]:
print("Binary Classification")
binary_results, binary_models = run_multiple_experiments(X_scaled, pd.Series(y_binary), 2)
print_results(binary_results, "Binary")

Binary Classification

Run 1
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Run 2
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Run 3
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [53]:
print("\n4-Class Classification")
results_4class, models_4class = run_multiple_experiments(X_scaled, pd.Series(y_4class), 4)
print_results(results_4class, "4-Class")


4-Class Classification

Run 1
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Run 2
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Run 3
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [54]:
print("\n16-Class Classification")
results_16class, models_16class = run_multiple_experiments(X_scaled, pd.Series(y_16class), 16)
print_results(results_16class, "16-Class")


16-Class Classification

Run 1
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step

Run 2
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Run 3
Fold 1
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 2
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 3
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Fold 4
[1m457/457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

# Model Saving

In [59]:
# Model Saving
def save_model_components(base_models, meta_learner, le_binary, le_4class, le_16class, scaler, feature_selector, variance_selector, selected_features, save_path):
    os.makedirs(save_path, exist_ok=True)

    # Save base models
    for i, model in enumerate(base_models):
        joblib.dump(model, f"{save_path}/base_model_{i}.joblib")

    # Save meta-learner with .keras extension
    tf.keras.models.save_model(meta_learner, f"{save_path}/meta_learner.keras")

    # Save preprocessors
    joblib.dump(le_binary, f"{save_path}/le_binary.joblib")
    joblib.dump(le_4class, f"{save_path}/le_4class.joblib")
    joblib.dump(le_16class, f"{save_path}/le_16class.joblib")
    joblib.dump(scaler, f"{save_path}/scaler.joblib")
    joblib.dump(feature_selector, f"{save_path}/feature_selector.joblib")
    joblib.dump(variance_selector, f"{save_path}/variance_selector.joblib")

    # Save selected features
    with open(f"{save_path}/selected_features.json", 'w') as f:
        json.dump(selected_features, f)

    print(f"Model components saved to {save_path}")

In [60]:
# Save models
save_model_components(
    binary_models[0],  # base models
    binary_models[1],  # meta learner
    le_binary,
    le_4class,
    le_16class,
    scaler,
    feature_selector,
    variance_selector,
    X_scaled.columns.tolist(),  # selected features
    save_path="saved_model_binary"
)

save_model_components(
    models_4class[0],
    models_4class[1],
    le_binary,
    le_4class,
    le_16class,
    scaler,
    feature_selector,
    variance_selector,
    X_scaled.columns.tolist(),
    save_path="saved_model_4class"
)

save_model_components(
    models_16class[0],
    models_16class[1],
    le_binary,
    le_4class,
    le_16class,
    scaler,
    feature_selector,
    variance_selector,
    X_scaled.columns.tolist(),
    save_path="saved_model_16class"
)

print("All models have been saved.")

# Download the saved models from Google Colab:
from google.colab import files

!zip -r saved_models.zip saved_model_binary saved_model_4class saved_model_16class
files.download('saved_models.zip')

Model components saved to saved_model_binary
Model components saved to saved_model_4class
Model components saved to saved_model_16class
All models have been saved.
  adding: saved_model_binary/ (stored 0%)
  adding: saved_model_binary/base_model_1.joblib (deflated 82%)
  adding: saved_model_binary/le_16class.joblib (deflated 30%)
  adding: saved_model_binary/feature_selector.joblib (deflated 44%)
  adding: saved_model_binary/base_model_0.joblib (deflated 76%)
  adding: saved_model_binary/base_model_2.joblib (deflated 74%)
  adding: saved_model_binary/selected_features.json (deflated 59%)
  adding: saved_model_binary/le_binary.joblib (deflated 28%)
  adding: saved_model_binary/variance_selector.joblib (deflated 46%)
  adding: saved_model_binary/scaler.joblib (deflated 31%)
  adding: saved_model_binary/le_4class.joblib (deflated 28%)
  adding: saved_model_binary/meta_learner.keras (deflated 25%)
  adding: saved_model_4class/ (stored 0%)
  adding: saved_model_4class/base_model_1.joblib (d

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>