In [None]:
# !pip install sagemaker --quiet
# !pip install tf-keras --quiet
# !pip install tensorflow --quiet
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --quiet

In [None]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
import time
import tempfile
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout, SimpleRNN, Bidirectional, GRU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Reading & Preparing Embeddings Data

In [None]:
# SageMaker session wrapper used to look up the default bucket
sagemaker_session = sagemaker.Session()
# Default S3 bucket of the session. NOTE: `bucket` is reassigned to a
# hard-coded project bucket in the data-loading cell further down.
bucket = sagemaker_session.default_bucket()
# Low-level S3 client (boto3.client, not a boto3 resource); load_csv_from_s3 reads through it
s3 = boto3.client("s3")
# AWS region name of the default boto3 session
region = boto3.Session().region_name
# Execution role reported by SageMaker for this environment
role = sagemaker.get_execution_role()
# Low-level SageMaker client (not referenced by the cells visible in this notebook section)
smclient = boto3.Session().client("sagemaker")

In [None]:
# Locations of the embedding CSV files in S3 (they are CSV objects, not zip archives)

# Project bucket; this overrides the session default bucket assigned earlier
bucket = "rg-dsa4266241005-dsa4266241005st-220"

# Object keys of the ASM embedding train/test splits
asm_train_key = "Shared/asm_embeddings_train.csv"
asm_test_key = "Shared/asm_embeddings_test.csv"

# Object keys of the bytes embedding train/test splits
bytes_train_key = "Shared/bytes_embeddings_train.csv"
bytes_test_key = "Shared/bytes_embeddings_test.csv"

# Number of target classes: width of the one-hot label vectors and of the RNN softmax layer
num_class = 8

# Helper function to load a CSV file from S3
def load_csv_from_s3(bucket, key):
    """
    Fetches one CSV object from S3 and parses it into a DataFrame.

    Parameters:
    - bucket: Name of the S3 bucket that holds the object.
    - key: Key of the CSV object inside the bucket.

    Returns:
    - pandas.DataFrame parsed from the object's body.

    Relies on the notebook-level `s3` client.
    """
    response = s3.get_object(Bucket=bucket, Key=key)
    csv_stream = response['Body']
    return pd.read_csv(csv_stream)

# Read the four embedding tables (ASM train/test, then bytes train/test) in one pass
(
    asm_embeddings_train_df,
    asm_embeddings_test_df,
    bytes_embeddings_train_df,
    bytes_embeddings_test_df,
) = (
    load_csv_from_s3(bucket, object_key)
    for object_key in (asm_train_key, asm_test_key, bytes_train_key, bytes_test_key)
)

In [None]:
# Show the ASM training table as the cell output (embedding columns plus the 'Label' column used below)
asm_embeddings_train_df

In [None]:
# Split every table into a feature matrix and one-hot targets.

# Feature matrices: all columns except 'Label'
bytes_embeddings_train = bytes_embeddings_train_df.drop(columns='Label').to_numpy()
bytes_embeddings_test = bytes_embeddings_test_df.drop(columns='Label').to_numpy()
asm_embeddings_train = asm_embeddings_train_df.drop(columns='Label').to_numpy()
asm_embeddings_test = asm_embeddings_test_df.drop(columns='Label').to_numpy()

# Targets: 'Label' column one-hot encoded to `num_class` columns
bytes_label_train = to_categorical(bytes_embeddings_train_df['Label'], num_classes=num_class)
bytes_label_test = to_categorical(bytes_embeddings_test_df['Label'], num_classes=num_class)
asm_label_train = to_categorical(asm_embeddings_train_df['Label'], num_classes=num_class)
asm_label_test = to_categorical(asm_embeddings_test_df['Label'], num_classes=num_class)

In [None]:
# Number of feature columns in the ASM training matrix (the embedding size fed to the models)
asm_embeddings_train.shape[1]

# Visualising embedding clusters

In [None]:
def cluster(X, y, dim_reduction, dim, feature_type):
    """
    Projects embeddings to 2 or 3 dimensions and shows a scatter plot coloured by class.

    Parameters:
    - X: 2-D array-like of embeddings, one row per sample (numpy array or tensor).
    - y: 1-D array-like of class labels, one entry per row of X.
    - dim_reduction: Projection method, either 'PCA' or 'TSNE'.
    - dim: Number of projected dimensions, either 2 or 3.
    - feature_type: Text used as the prefix of the plot title.

    Returns:
    - None (the figure is displayed with plt.show()).

    Raises:
    - ValueError: If `dim_reduction` or `dim` is not one of the supported values.
    """
    # Validate first so a typo fails immediately instead of after a slow projection
    # (or, previously, as an unbound-variable error / a silently missing plot).
    if dim_reduction not in ('PCA', 'TSNE'):
        raise ValueError(f"dim_reduction must be 'PCA' or 'TSNE', got {dim_reduction!r}")
    if dim not in (2, 3):
        raise ValueError(f"dim must be 2 or 3, got {dim!r}")

    if dim_reduction == 'PCA':
        reduction = PCA(n_components=dim)
    else:
        reduction = TSNE(n_components=dim, random_state=42)

    # np.asarray lets callers pass tensors (e.g. the result of tf.concat) as well as arrays.
    X_reduced = reduction.fit_transform(np.asarray(X))

    # Plain label array plus integer codes 0..k-1 (sorted class order) for colouring.
    labels = np.asarray(y).ravel()
    classes, y_encoded = np.unique(labels, return_inverse=True)
    palette = sns.color_palette("colorblind", n_colors=len(classes))

    if dim == 2:
        plt.figure(figsize=(10, 8))
        # hue is passed as a positional array so it cannot be re-aligned by a pandas index.
        sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=labels, palette=palette, s=100, alpha=0.7)

        plt.title(f"{feature_type} Embedding Clusters", fontsize=16)
        plt.xlabel("Reduced Dimension 1", fontsize=12)
        plt.ylabel("Reduced Dimension 2", fontsize=12)
        plt.legend(title="Classes", loc="upper right")
        plt.show()
    else:
        # Local import: only the 3-D branch needs to turn the colour list into a Colormap.
        from matplotlib.colors import ListedColormap

        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(111, projection='3d')

        sc = ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2],
                        c=y_encoded, cmap=ListedColormap(palette), s=100, alpha=0.7)

        ax.set_title(f"{feature_type} 3D Embedding Clusters", fontsize=16)
        ax.set_xlabel("Reduced Dimension 1", fontsize=12)
        ax.set_ylabel("Reduced Dimension 2", fontsize=12)
        ax.set_zlabel("Reduced Dimension 3", fontsize=12)

        # num=None yields one handle per distinct code; relabel them with the original classes.
        handles, _ = sc.legend_elements(num=None)
        ax.legend(handles, [str(c) for c in classes], title="Classes")

        plt.show()

In [None]:
# 2-D t-SNE view of the bytes test embeddings, coloured by label
cluster(
    X=bytes_embeddings_test,
    y=bytes_embeddings_test_df['Label'],
    dim_reduction='TSNE',
    dim=2,
    feature_type='Bytes',
)

In [None]:
# 2-D t-SNE view of the ASM test embeddings, coloured by label
cluster(
    X=asm_embeddings_test,
    y=asm_embeddings_test_df['Label'],
    dim_reduction='TSNE',
    dim=2,
    feature_type='ASM',
)

In [None]:
# Place bytes and ASM test features side by side (column-wise), then plot them with the ASM labels
combined_embeddings_test = tf.concat(values=[bytes_embeddings_test, asm_embeddings_test], axis=1)
cluster(
    X=combined_embeddings_test,
    y=asm_embeddings_test_df['Label'],
    dim_reduction='TSNE',
    dim=2,
    feature_type='Combined',
)

# Training

In [None]:
def RNN_initialisation(regu_weight, embedding_size):
    """
    Builds and compiles the stacked SimpleRNN classifier.

    Parameters:
    - regu_weight: L2 penalty applied to the kernel of every SimpleRNN layer.
    - embedding_size: Length of one embedding; the model input shape is (embedding_size, 1).

    Returns:
    - Compiled Sequential model with a softmax output of `num_class` units
      (categorical cross-entropy loss, Adam with learning rate 0.0005, accuracy metric).
    """
    # (units, return the full sequence?, dropout rate applied after the layer)
    layer_specs = [(64, True, 0.1), (32, True, 0.2), (16, False, 0.3)]

    stack = [Input(shape=(embedding_size, 1))]
    for n_units, emit_sequence, drop_rate in layer_specs:
        stack.append(
            SimpleRNN(
                units=n_units,
                activation='leaky_relu',
                return_sequences=emit_sequence,
                kernel_regularizer=l2(regu_weight),
            )
        )
        stack.append(Dropout(drop_rate))
    stack.append(Dense(num_class, activation='softmax'))

    model = Sequential(stack)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.0005),
        metrics=['accuracy'],
    )
    return model

In [None]:
def LSTM_initialisation(regu_weight, embedding_size):
    """
    Builds and compiles the stacked LSTM classifier.

    Parameters:
    - regu_weight: L2 penalty applied to the kernel of every LSTM layer.
    - embedding_size: Length of one embedding; the model input shape is (embedding_size, 1).

    Returns:
    - Compiled Sequential model with a softmax output of `num_class` units
      (categorical cross-entropy loss, Adam with learning rate 0.002, accuracy metric).
    """
    model = Sequential([
        Input(shape=(embedding_size, 1)),
        LSTM(units=64, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, kernel_regularizer=l2(regu_weight)),
        Dropout(0.1),
        LSTM(units=32, activation='tanh', recurrent_activation='sigmoid', return_sequences=True, kernel_regularizer=l2(regu_weight)),
        Dropout(0.2),
        # Last recurrent layer returns only its final state, which feeds the classifier head.
        LSTM(units=16, activation='tanh', recurrent_activation='sigmoid', kernel_regularizer=l2(regu_weight)),
        Dropout(0.3),
        # Output width follows `num_class` (as in RNN_initialisation) so it always
        # matches the one-hot label width instead of a hard-coded 8.
        Dense(num_class, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.002), metrics=['accuracy'])
    return model

In [None]:
def train_plot(history, feature_type, model_type):
    """
    Plots training and validation loss per epoch for one fitted model.

    Parameters:
    - history: Object returned by `model.fit`; its `.history` dict must hold 'loss' and 'val_loss'.
    - feature_type: Feature-set name shown in the title.
    - model_type: Model name shown in the title.

    Returns:
    - None (the figure is displayed with plt.show()).
    """
    curves = history.history
    ax = plt.gca()

    # One line per recorded series, drawn in the order training -> validation
    for series_key, legend_text in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
        ax.plot(curves[series_key], label=legend_text)

    ax.set_title(f'{feature_type} {model_type} Model Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend(loc='upper right')
    plt.show()

In [None]:
def save_model_to_s3(model, bucket_name, s3_folder_path, filename):
    """
    Saves a TensorFlow model to a specified S3 bucket folder with a given filename.

    The model is first written to a temporary local directory (removed when the
    upload finishes or fails) and then uploaded with the S3 client.

    Parameters:
    - model: The TensorFlow model to be saved.
    - bucket_name: The name of the S3 bucket.
    - s3_folder_path: The folder path (key prefix) within the S3 bucket to save the model.
    - filename: The base name for the model in S3, with `.keras` or `.h5` extension.

    Returns:
    - None

    Raises:
    - ValueError: If `filename` does not end with `.keras` or `.h5`.
    """
    # Local import keeps this fix self-contained; posixpath always joins with '/'.
    import posixpath

    # Ensure filename ends with `.keras` or `.h5`
    if not filename.endswith(('.keras', '.h5')):
        raise ValueError("Filename must end with '.keras' or '.h5'")

    # S3 object keys always use '/', so the key must not be built with os.path.join
    # (which would insert '\\' on Windows and create a wrongly named object).
    s3_file_path = posixpath.join(s3_folder_path, filename)

    # Save the model locally in a temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        # Local filesystem path: os.path.join is the right tool here
        model_path = os.path.join(temp_dir, filename)
        # The file extension selects the serialisation format
        model.save(model_path)

        # Initialize the S3 client and upload while the temporary file still exists
        s3_client = boto3.client('s3')
        s3_client.upload_file(model_path, bucket_name, s3_file_path)

    print(f"Model saved and uploaded to S3 bucket '{bucket_name}' in folder '{s3_folder_path}/{filename}'")

## Fitting bytes only (RNN)

In [None]:
%%time
embedding_size = bytes_embeddings_train.shape[1]
# NOTE: the test split is also passed as validation_data below, so early stopping
# (and the restored best weights) are selected on the same data the test metrics
# are computed on; treat those metrics as optimistic.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define the RNN for the bytes embeddings (L2 weight 0.01)
RNN_model_bytes = RNN_initialisation(0.01, embedding_size)

# Fit the bytes embeddings to the RNN
RNN_model_bytes_history = RNN_model_bytes.fit(bytes_embeddings_train, bytes_label_train,
                                              validation_data=(bytes_embeddings_test, bytes_label_test),
                                              epochs=100,
                                              callbacks=[early_stopping],
                                              verbose=1)

# Predict class probabilities and classes for the test set
y_pred_proba = RNN_model_bytes.predict(bytes_embeddings_test)
y_pred = np.argmax(y_pred_proba, axis=1)  # Convert probabilities to class predictions
y_true = np.argmax(bytes_label_test, axis=1)  # Convert one-hot encoded labels back to integers

# Evaluate the RNN model on training data
loss, accuracy = RNN_model_bytes.evaluate(bytes_embeddings_train, bytes_label_train, verbose=1)
print(f"Training Loss: {loss}, Training Accuracy: {accuracy}")

# Evaluate the RNN model on test data
test_loss, test_accuracy = RNN_model_bytes.evaluate(bytes_embeddings_test, bytes_label_test, verbose=1)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Pin the label set for the sklearn metrics: without it, a class that is absent
# from the test split makes log_loss / classification_report fail on a
# class-count mismatch and shrinks the confusion matrix.
class_labels = list(range(num_class))

# Calculate the multiclass log loss
mlogloss = log_loss(y_true, y_pred_proba, labels=class_labels)
print(f"Multiclass Log Loss: {mlogloss}")

# Generate the confusion matrix (always num_class x num_class)
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Display a classification report for additional metrics
class_report = classification_report(y_true, y_pred, labels=class_labels,
                                     target_names=[str(c) for c in class_labels],
                                     zero_division=0)
print("Classification Report:")
print(class_report)

In [None]:
# Loss curves of the bytes-only RNN
train_plot(
    history=RNN_model_bytes_history,
    feature_type='Bytes',
    model_type='RNN',
)

In [None]:
# Persist the bytes-only RNN to the shared model folder in S3
save_model_to_s3(
    model=RNN_model_bytes,
    bucket_name=bucket,
    s3_folder_path='Shared/RNN_saved_models',
    filename='RNN_model_bytes_69.keras',
)

## Fitting asm only (RNN)

In [None]:
%%time
embedding_size = asm_embeddings_train.shape[1]
# NOTE: the test split is also passed as validation_data below, so early stopping
# (and the restored best weights) are selected on the same data the test metrics
# are computed on; treat those metrics as optimistic.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define RNN model for ASM embeddings (L2 weight 0.001)
RNN_model_asm = RNN_initialisation(0.001, embedding_size)

# Fit ASM embeddings to RNN model
RNN_model_asm_history = RNN_model_asm.fit(asm_embeddings_train, asm_label_train,
                                          validation_data=(asm_embeddings_test, asm_label_test),
                                          epochs=100,
                                          callbacks=[early_stopping],
                                          verbose=1)

# Predict class probabilities and classes for the ASM test set
y_pred_proba_asm = RNN_model_asm.predict(asm_embeddings_test)
y_pred_asm = np.argmax(y_pred_proba_asm, axis=1)  # Convert probabilities to class predictions
y_true_asm = np.argmax(asm_label_test, axis=1)  # Convert one-hot encoded labels back to integers

# Evaluate the RNN model on ASM training data
loss_asm, accuracy_asm = RNN_model_asm.evaluate(asm_embeddings_train, asm_label_train, verbose=1)
print(f"Training Loss: {loss_asm}, Training Accuracy: {accuracy_asm}")

# Evaluate the RNN model on ASM test data
test_loss_asm, test_accuracy_asm = RNN_model_asm.evaluate(asm_embeddings_test, asm_label_test, verbose=1)
print(f"Test Loss: {test_loss_asm}, Test Accuracy: {test_accuracy_asm}")

# Pin the label set for the sklearn metrics: without it, a class that is absent
# from the test split makes log_loss / classification_report fail on a
# class-count mismatch and shrinks the confusion matrix.
class_labels = list(range(num_class))

# Calculate the multiclass log loss for ASM
mlogloss_asm = log_loss(y_true_asm, y_pred_proba_asm, labels=class_labels)
print(f"Multiclass Log Loss: {mlogloss_asm}")

# Generate the confusion matrix for ASM (always num_class x num_class)
conf_matrix_asm = confusion_matrix(y_true_asm, y_pred_asm, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix_asm)

# Display a classification report for additional metrics for ASM
class_report_asm = classification_report(y_true_asm, y_pred_asm, labels=class_labels,
                                         target_names=[str(c) for c in class_labels],
                                         zero_division=0)
print("Classification Report:")
print(class_report_asm)

In [None]:
# Loss curves of the ASM-only RNN
train_plot(
    history=RNN_model_asm_history,
    feature_type='Asm',
    model_type='RNN',
)

In [None]:
# Persist the ASM-only RNN to the shared model folder in S3
save_model_to_s3(
    model=RNN_model_asm,
    bucket_name=bucket,
    s3_folder_path='Shared/RNN_saved_models',
    filename='RNN_model_asm_78.keras',
)

## Fitting bytes + asm (RNN)

In [None]:
# Ensure both sources have the same number of samples in each split
assert bytes_embeddings_train.shape[0] == asm_embeddings_train.shape[0], \
    "Mismatch in the number of samples between bytes and ASM embeddings."
assert bytes_embeddings_test.shape[0] == asm_embeddings_test.shape[0], \
    "Mismatch in the number of test samples between bytes and ASM embeddings."

# Concatenation pairs rows by position and the combined model is trained on the
# ASM labels, so both sources must list the same samples in the same order.
# Identical label arrays are the alignment check available from the loaded data.
assert np.array_equal(bytes_label_train, asm_label_train), \
    "Bytes and ASM training labels differ; rows are not aligned."
assert np.array_equal(bytes_label_test, asm_label_test), \
    "Bytes and ASM test labels differ; rows are not aligned."

# Concatenate training embeddings along the feature axis (axis=1)
combined_embeddings_train = tf.concat([bytes_embeddings_train, asm_embeddings_train], axis=1)
print("Combined train embeddings shape:", combined_embeddings_train.shape)

# Concatenate testing embeddings along the feature axis (axis=1)
combined_embeddings_test = tf.concat([bytes_embeddings_test, asm_embeddings_test], axis=1)
print("Combined test embeddings shape:", combined_embeddings_test.shape)

# Width of the combined feature vector (bytes columns followed by ASM columns)
combined_embedding_size = combined_embeddings_train.shape[1]
print("Embedding size:", combined_embedding_size)

In [None]:
%%time
# Define RNN model for the combined (bytes + ASM) embeddings (L2 weight 0.001)
RNN_model_combined = RNN_initialisation(0.001, combined_embedding_size)
# NOTE: the test split is also passed as validation_data below, so early stopping
# (and the restored best weights) are selected on the same data the test metrics
# are computed on; treat those metrics as optimistic.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Fit combined embeddings to RNN model (the ASM labels serve as the targets)
RNN_model_combined_history = RNN_model_combined.fit(combined_embeddings_train, asm_label_train,
                                                    validation_data=(combined_embeddings_test, asm_label_test),
                                                    epochs=100,
                                                    callbacks=[early_stopping],
                                                    verbose=1)

# Predict class probabilities and classes for the combined test set
y_pred_proba_combined = RNN_model_combined.predict(combined_embeddings_test)
y_pred_combined = np.argmax(y_pred_proba_combined, axis=1)  # Convert probabilities to class predictions
y_true_combined = np.argmax(asm_label_test, axis=1)  # Convert one-hot encoded labels back to integers

# Evaluate the RNN model on combined training data
loss_combined, accuracy_combined = RNN_model_combined.evaluate(combined_embeddings_train, asm_label_train, verbose=1)
print(f"Training Loss: {loss_combined}, Training Accuracy: {accuracy_combined}")

# Evaluate the RNN model on combined test data
test_loss_combined, test_accuracy_combined = RNN_model_combined.evaluate(combined_embeddings_test, asm_label_test, verbose=1)
print(f"Test Loss: {test_loss_combined}, Test Accuracy: {test_accuracy_combined}")

# Pin the label set for the sklearn metrics: without it, a class that is absent
# from the test split makes log_loss / classification_report fail on a
# class-count mismatch and shrinks the confusion matrix.
class_labels = list(range(num_class))

# Calculate the multiclass log loss for combined embeddings
mlogloss_combined = log_loss(y_true_combined, y_pred_proba_combined, labels=class_labels)
print(f"Multiclass Log Loss: {mlogloss_combined}")

# Generate the confusion matrix for combined embeddings (always num_class x num_class)
conf_matrix_combined = confusion_matrix(y_true_combined, y_pred_combined, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix_combined)

# Display a classification report for additional metrics for combined embeddings
class_report_combined = classification_report(y_true_combined, y_pred_combined, labels=class_labels,
                                              target_names=[str(c) for c in class_labels],
                                              zero_division=0)
print("Classification Report:")
print(class_report_combined)


In [None]:
# Loss curves of the combined-features RNN
train_plot(
    history=RNN_model_combined_history,
    feature_type='Combined',
    model_type='RNN',
)

In [None]:
# Persist the combined-features RNN to the shared model folder in S3
save_model_to_s3(
    model=RNN_model_combined,
    bucket_name=bucket,
    s3_folder_path='Shared/RNN_saved_models',
    filename='RNN_model_combined_80_new.keras',
)

## Fitting asm only (LSTM)

In [None]:
%%time
embedding_size = asm_embeddings_train.shape[1]
# NOTE: the test split is also passed as validation_data below, so early stopping
# (and the restored best weights) are selected on the same data the test metrics
# are computed on; treat those metrics as optimistic.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define LSTM model for ASM embeddings (L2 weight 0.0001)
LSTM_model_asm = LSTM_initialisation(0.0001, embedding_size)

# Fit ASM embeddings to LSTM model
LSTM_model_asm_history = LSTM_model_asm.fit(asm_embeddings_train, asm_label_train,
                                            validation_data=(asm_embeddings_test, asm_label_test),
                                            epochs=100,
                                            callbacks=[early_stopping],
                                            verbose=1)

# Predict class probabilities and classes for the ASM test set
y_pred_proba_asm = LSTM_model_asm.predict(asm_embeddings_test)
y_pred_asm = np.argmax(y_pred_proba_asm, axis=1)  # Convert probabilities to class predictions
y_true_asm = np.argmax(asm_label_test, axis=1)  # Convert one-hot encoded labels back to integers

# Evaluate the LSTM model on ASM training data
loss_asm, accuracy_asm = LSTM_model_asm.evaluate(asm_embeddings_train, asm_label_train, verbose=1)
print(f"Training Loss: {loss_asm}, Training Accuracy: {accuracy_asm}")

# Evaluate the LSTM model on ASM test data
test_loss_asm, test_accuracy_asm = LSTM_model_asm.evaluate(asm_embeddings_test, asm_label_test, verbose=1)
print(f"Test Loss: {test_loss_asm}, Test Accuracy: {test_accuracy_asm}")

# Pin the label set for the sklearn metrics: without it, a class that is absent
# from the test split makes log_loss / classification_report fail on a
# class-count mismatch and shrinks the confusion matrix.
class_labels = list(range(num_class))

# Calculate the multiclass log loss for ASM
mlogloss_asm = log_loss(y_true_asm, y_pred_proba_asm, labels=class_labels)
print(f"Multiclass Log Loss: {mlogloss_asm}")

# Generate the confusion matrix for ASM (always num_class x num_class)
conf_matrix_asm = confusion_matrix(y_true_asm, y_pred_asm, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix_asm)

# Display a classification report for additional metrics for ASM
class_report_asm = classification_report(y_true_asm, y_pred_asm, labels=class_labels,
                                         target_names=[str(c) for c in class_labels],
                                         zero_division=0)
print("Classification Report:")
print(class_report_asm)

In [None]:
# Loss curves of the ASM-only LSTM
train_plot(
    history=LSTM_model_asm_history,
    feature_type='ASM',
    model_type='LSTM',
)