In [1]:
# HTML scraper
import requests
from bs4 import BeautifulSoup

# Standard libraries
import pandas as pd
import numpy as np

# String manipulation - from string to vector
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import re
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Neural network requirements
import tensorflow as tf
import joblib
import transformers

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gltut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gltut\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gltut\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Import HTML file and splitting it in sentences

In [2]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')

# Function to get HTML content and tokenize into sentences
def fetch_html_and_tokenize(url):
    response = requests.get(url)
    html_content = response.text

    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()

    # Tokenize into sentences
    sentences = nltk.sent_tokenize(text)
    # Remove newline characters from each sentence
    sentences = [sentence.replace('\n', '') for sentence in sentences]
    df = pd.DataFrame(data = sentences, columns = ['text'])


    return df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gltut\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Example URL
url = r'https://www.trendmicro.com/en_us/research/19/b/trickbot-adds-remote-application-credential-grabbing-capabilities-to-its-repertoire.html'
df = fetch_html_and_tokenize(url)

# Preprocessing detector

In [4]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize (batch):
    return tokenizer(batch["text"], max_length=512, padding='max_length', truncation=True, return_tensors="tf")

# Convert the DataFrame to a datasets.Dataset
dataset = Dataset.from_pandas(df)

# Create a DatasetDict with a single dataset
dataset_dict = DatasetDict({'my_dataset': dataset})
ds_encoded = dataset_dict.map(tokenize, batched= True, batch_size= None)

# Tokenization
X = [tokenizer(text, padding="max_length",max_length = 512, truncation=True)['input_ids'] for text in ds_encoded["my_dataset"]['text']]
X = np.array(X, dtype='int32')

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [5]:
my_reloaded_model = tf.keras.models.load_model(
       'Detector/output/detector-bert.keras',
       custom_objects={"TFBertModel": transformers.TFBertModel}
)



In [6]:
def confidence_col(num : float):
    if num <= 0.5:
        return np.round(1 - num, 4)
    else:
        return np.round(num, 4)

def detection_col(num : float):
    return bool(np.round(num, 0))


prediction_sample= my_reloaded_model.predict(X)

# Building the final detection dataset
df['prediction'] = prediction_sample
df['detection'] = df['prediction'].apply(detection_col)
df['confidence'] = df['prediction'].apply(confidence_col)
df = df.drop('prediction', axis = 1)
df.head()



Unnamed: 0,text,detection,confidence
0,Trickbot Adds Credential-Grabbing Capabilities...,False,0.7257
1,Respond Faster.,True,0.682
2,See More.,True,0.7959
3,Respond Faster.,True,0.682
4,Move faster than your adversaries with powerfu...,True,0.6758


# Preprocessing classifier

In [7]:
classifier_path = r'Classifier\output\classifier.keras'
classifier = tf.keras.models.load_model(classifier_path)
classifier.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 100)          934200    
                                                                 
 dropout (Dropout)           (None, 512, 100)          0         
                                                                 
 lstm (LSTM)                 (None, 200)               240800    
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense (Dense)               (None, 190)               38190     
                                                                 
Total params: 1,213,190
Trainable params: 1,213,190
Non-trainable params: 0
_________________________________________________________________


In [8]:
lemma = WordNetLemmatizer()
s_words = stopwords.words('english')

def cleaning(row):
    row = re.sub(r'http\S+', '', row)
    row = re.sub("[^a-zA-Z0-9]", " ", row)
    row = nltk.word_tokenize(row.lower())
    row = [lemma.lemmatize(word) for word in row]
    row = [word for word in row if word not in s_words]
    row = " ".join(row)
    return row

df['MB'] = df['text'].apply(cleaning)
            
voc_size = 9000
max_sent_length = 512

one_hot = [one_hot(words, voc_size) for words in df['MB']]
pad= pad_sequences(one_hot, padding = 'pre', maxlen = max_sent_length)

In [9]:
encoder = joblib.load(r'Classifier\label_encoder.pkl')

# Build the dataset with the classification of the sentences in MITRE

_ = [[] for i in range(df.shape[0])]
df['classification'], df['classification_conf'] = _, _
prediction_sample= classifier(pad)
predictions_confidences = []
prob_tensor = tf.keras.activations.softmax(prediction_sample, axis = -1)


for index in range(df.shape[0]):
        
        if df['detection'][index]:
            top_k_probabilities, top_k_classes = tf.math.top_k(prob_tensor[index], k=5)
            attack_ids=[]
            probabilities=[]

            for classy in top_k_classes:
                attack_ids.append(str(encoder.inverse_transform([classy])[0]))

            for probability in top_k_probabilities:
                probabilities.append(np.round(float(probability), 4))

            df['classification'][index] = attack_ids
            df['classification_conf'][index] = probabilities
    
df_result = df.drop('MB', axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['classification'][index] = attack_ids
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['classification_conf'][index] = probabilities


In [10]:
df_result.to_csv('result.csv')