# Embeddings

> Giovanni Foletto - May 30, 2024

In this notebook I investigate methods for extracting information from logs by passing them through an *embedding* layer.

The similarity of both results is basically the same. The distribution is not very good.

Super-close to 80%: pretrain that as benign. Then run it again, since this will concentrate the data more and condense the graph. Then check the results.

Retry with: `benign`, `undecided`, `malicious`.

Try the data as `key:value` pairs. Do not remove the data.
Distribution between 3 variables.

In [2]:
import torch
import torch.nn
from torch.utils.data import Dataset, DataLoader
import torchtext as text
from torchtext.data import get_tokenizer

import numpy as np
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

import gc
from copy import deepcopy as dc
import datetime


The following required CPU features were not detected:
    ssse3, sse4.1, sse4.2, popcnt
Continuing to use this version of Polars on this processor will likely result in a crash.
Install the `polars-lts-cpu` package instead of `polars` to run Polars with better compatibility.

Hint: If you are on an Apple ARM machine (e.g. M1) this is likely due to running Python under Rosetta.
It is recommended to install a native version of Python that does not run under Rosetta x86-64 emulation.




In [15]:
class LogDataset(Dataset):
	"""Dataset pairing every raw log line of a file with its token embeddings.

	Args:
		input_file: Path of the text file to read, one log entry per line.
		tokenizer: Callable that splits a line into a list of tokens.
		vec: Object exposing ``get_vecs_by_tokens(tokens, lower_case_backup=...)``
			(the notebook passes a torchtext GloVe instance).
	"""

	def __init__(self, input_file, tokenizer, vec):
		self.tokenizer = tokenizer
		self.vec = vec
		with open(input_file) as of:
			self.lines = of.readlines()

		# Embeddings live in their own list, index-aligned with `self.lines`.
		# Appending them to `self.lines` while iterating over it would never
		# terminate and would leave `self.embeddings` empty.
		self.embeddings = [self.calculate_embedding(line) for line in self.lines]

	def calculate_embedding(self, input):
		"""Tokenize ``input`` and return what ``vec.get_vecs_by_tokens`` yields for its tokens."""
		# Stored on the instance as before, so `self.tokens` still holds the
		# tokens of the most recently embedded line.
		self.tokens = self.tokenizer(input)
		return self.vec.get_vecs_by_tokens(self.tokens, lower_case_backup=True)

	def __len__(self):
		"""Number of log lines read from the file."""
		return len(self.lines)

	def __getitem__(self, i):
		"""Return the ``(raw line, embedding)`` pair stored at index ``i``."""
		return self.lines[i], self.embeddings[i]

In [16]:
# GloVe vectors ('6B' set, 50 dimensions) used to embed the tokens.
vec = text.vocab.GloVe(name='6B', dim=50)
tokenizer = get_tokenizer("basic_english")

# Build the dataset from the raw log file with the tokenizer and vectors above.
train_dataset = LogDataset(
    input_file="../data/raw/unificated.ndjson",
    tokenizer=tokenizer,
    vec=vec,
)

In [None]:
def pad_collate(batch):
    """Collate ``(line, embedding)`` pairs, zero-padding embeddings to a common length.

    NOTE(review): assumes log lines can tokenize to different lengths, which
    the default collate function cannot stack — confirm against the data.
    """
    lines, embeddings = zip(*batch)
    return list(lines), torch.nn.utils.rnn.pad_sequence(embeddings, batch_first=True)


# `batch_size` was not defined anywhere in the notebook, so it is set here.
batch_size = 32
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate
)

In [None]:
# `device` was not defined anywhere in the notebook, so it is chosen here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Peek at the first batch only.
for batch in train_loader:
    # Each batch is (raw log lines, embeddings). The raw lines are strings and
    # have no `.to()`, so only the embedding tensor is moved to the device.
    x_batch, y_batch = batch[0], batch[1].to(device)
    print(len(x_batch), y_batch.shape)
    break

# Perplexity Solution

In [28]:
class TxtEmbedSimplify(torch.nn.Module):
    """Embedding lookup followed by two linear layers that project every token to 2 values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        tokenizer: Optional tokenizer kept on the instance; not used by ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, tokenizer=None):
        super().__init__()
        # Optional, so the model can be built when only index tensors are fed in.
        self.tokenizer = tokenizer
        # `torch.nn` is spelled out because the notebook's first import cell
        # runs `import torch.nn`, which does not bind the short name `nn`.
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc = torch.nn.Linear(embedding_dim, embedding_dim)
        self.red = torch.nn.Linear(embedding_dim, 2)

    def forward(self, input_text):
        """Map a tensor of token indices to a tensor whose last dimension is 2."""
        embeddings = self.embedding(input_text)
        output = self.fc(embeddings)
        output = self.red(output)
        return output
    
# The constructor takes a `tokenizer` argument; None is passed explicitly
# because this demo feeds index tensors directly and never tokenizes.
model = TxtEmbedSimplify(vocab_size=50000, embedding_dim=128, tokenizer=None)
input_text = torch.tensor([[1, 2, 3, 4, 5, 1235, 19999]])  # Assuming the text is represented as a sequence of indices
output = model(input_text)
print(output)

tensor([[[ 0.1306, -0.1011],
         [ 0.6891,  0.3014],
         [ 0.0328, -0.2631],
         [ 0.0926,  0.3759],
         [ 0.5456,  0.0521],
         [ 0.1061, -0.0774],
         [-0.3428,  0.2210]]], grad_fn=<ViewBackward0>)


In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.neighbors import NearestNeighbors

class KNNClassifier:
    def __init__(self, k):
        self.k = k
        self.X_train = None
        self.y_train = None
        self.neigh = None

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        self.neigh = NearestNeighbors(n_neighbors=self.k, algorithm='auto', metric='euclidean')
        self.neigh.fit(self.X_train)

    def predict(self, X_test):
        distances, indices = self.neigh.kneighbors(X_test)
        y_pred = self.y_train[indices].mode(dim=1)[0]
        return y_pred

# Example usage: five 2-D training points with labels 0/1/2, classified with k=3.
X_train = torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y_train = torch.tensor([0, 0, 1, 1, 2])

# Query points to classify.
X_test = torch.tensor([[2, 3], [6, 7], [11, 12]])

knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Predicted labels:", y_pred)

Predicted labels: tensor([0, 1, 1])


## Using Transformers

In [3]:
from transformers import BertModel, BertTokenizer
import torch

# Load the pre-trained BERT model and tokenizer
# NOTE: this rebinds `model` and `tokenizer`, replacing the objects that
# earlier cells stored under the same names.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tensor([[[-0.2163, -0.3068, -0.1451,  ..., -0.5404,  0.1290,  0.8808],
         [-0.3963, -0.5435, -0.0127,  ..., -0.6086,  0.2866,  0.3548],
         [-0.1151, -0.5401,  0.4744,  ..., -0.1545, -0.3873,  1.0624],
         ...,
         [-0.2552, -0.5139, -0.2054,  ..., -0.2215, -0.6791,  0.2645],
         [ 0.4151,  0.1741, -0.6553,  ...,  0.4056, -0.2988, -0.4242],
         [ 0.7307,  0.2165, -0.5562,  ...,  0.4656, -0.6639, -0.2994]]])


In [5]:
# Define the text input.
# Named `log_text` (not `text`) so that it does not shadow the `torchtext`
# module, which this notebook imports under the alias `text`.
log_text = '{"userAgent": "Boto3/1.9.201 Python/2.7.12 Linux/4.4.0-157-generic Botocore/1.12.201", "eventID": "40422e90-d6ec-4c33-9ed3-e206107", "errorMessage": "Request limit exceeded.", "userIdentity": {"type": "IAMUser", "principalId": "AIDA9BO36HFBHKGJAO9C1", "arn": "arn:aws:iam::811596193553:user/backup", "accountId": "811596193553", "accessKeyId": "ASIARF55FBMFZBXLKDFW", "userName": "backup", "sessionContext": {"sessionIssuer": {}, "webIdFederationData": {}, "attributes": {"mfaAuthenticated": "false", "creationDate": "2019-08-21T07:41:25Z"}}}, "eventType": "AwsApiCall", "errorCode": "Client.RequestLimitExceeded", "sourceIPAddress": "5.205.62.253", "eventName": "RunInstances", "eventSource": "ec2.amazonaws.com", "recipientAccountId": "811596193553", "requestParameters": {"instancesSet": {"items": [{"imageId": "ami-afde8862bc169b8d2", "minCount": 1, "maxCount": 10}]}, "userData": "<sensitiveDataRemoved>", "instanceType": "r4.16xlarge", "blockDeviceMapping": {}, "monitoring": {"enabled": false}, "disableApiTermination":'

# Encode the text using the tokenizer
input_ids = torch.tensor([tokenizer.encode(log_text, add_special_tokens=True)])

# Pass the input through the BERT model to get the embeddings
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]  # First output: last-layer hidden state of every token (not the pooled output)

# The embeddings are now stored in last_hidden_states
print(last_hidden_states)  # Tensor of shape (1, number of tokens, 768)

tensor([[[-8.0356e-01, -5.0567e-02, -1.7274e-02,  ..., -1.7778e-01,
          -4.0623e-01,  6.8475e-01],
         [-9.0496e-01,  3.2348e-01, -2.4660e-01,  ..., -4.0812e-01,
          -3.7556e-01,  4.3471e-01],
         [-1.3830e+00, -9.5320e-02,  2.8186e-01,  ..., -9.2735e-02,
           2.9880e-01,  3.5581e-01],
         ...,
         [-2.8888e-01, -2.0850e-01, -7.5742e-03,  ..., -2.3136e-01,
           3.8235e-01, -9.2748e-01],
         [-5.1660e-01,  2.3752e-01, -2.1572e-01,  ..., -1.1691e-01,
          -1.2893e-01,  2.0560e-02],
         [ 3.3490e-02,  2.8451e-01,  7.5735e-04,  ...,  1.0637e-01,
          -3.3073e-01,  2.7184e-01]]])


In [6]:
# Shape of the per-token hidden states produced by the model call above.
last_hidden_states.shape

torch.Size([1, 494, 768])

# New (Embedding => KNN)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext.data.utils import get_tokenizer
import torchtext as text

from sklearn.cluster import DBSCAN
from tqdm import tqdm

from numpy import unique, where
import polars as pl
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns



The following required CPU features were not detected:
    ssse3, sse4.1, sse4.2, popcnt
Continuing to use this version of Polars on this processor will likely result in a crash.
Install the `polars-lts-cpu` package instead of `polars` to run Polars with better compatibility.

Hint: If you are on an Apple ARM machine (e.g. M1) this is likely due to running Python under Rosetta.
It is recommended to install a native version of Python that does not run under Rosetta x86-64 emulation.




In [2]:
class TextEmbeddingModel(nn.Module):
    """Turn a raw text string into the vectors its vocabulary returns for its tokens.

    Args:
        tokenizer: Callable that splits a string into a list of tokens.
        vocab: Object exposing ``get_vecs_by_tokens(tokens, lower_case_backup=...)``
            (the notebook passes a torchtext GloVe instance).
        translation: Mapping handed to ``str.translate`` before tokenizing.
    """

    def __init__(self, tokenizer, vocab, translation):
        super().__init__()

        self.tokenizer = tokenizer
        self.vocab = vocab
        self.translation = translation

    def forward(self, input_text):
        """Clean ``input_text``, tokenize it and look the tokens up in ``self.vocab``."""
        input_text = input_text.translate(self.translation)
        tokens = self.tokenizer(input_text)
        # Use the vocabulary given to the constructor, not the module-level
        # `vec`, so the model does not depend on a global variable.
        output = self.vocab.get_vecs_by_tokens(tokens, lower_case_backup=True)

        return output

# Punctuation, separators and newlines that are deleted from a log line
# before it is tokenized (each code point maps to the empty string).
translation = {ord(char): "" for char in '.{}:/-_",;\n'}

# 300-dimensional GloVe vectors and the tokenizer used by the model.
vec = text.vocab.GloVe(name='6B', dim=300)
tokenizer = get_tokenizer("basic_english")

model = TextEmbeddingModel(
    tokenizer=tokenizer,
    vocab=vec,
    translation=translation,
)

In [3]:
# Characters removed from every log line before tokenizing; each code point
# maps to the empty string, so `str.translate` deletes it.
translation = {ord(char): "" for char in '.{}:/-_",;\n'}

# 300-dimensional GloVe vectors and the tokenizer used by `compute_embeddings`.
vec = text.vocab.GloVe(name='6B', dim=300)
tokenizer = get_tokenizer("basic_english")

def compute_embeddings(input_text):
    """Clean ``input_text`` with ``translation``, tokenize it and look the tokens up in ``vec``.

    Relies on the module-level ``translation``, ``tokenizer`` and ``vec``.
    """
    cleaned = input_text.translate(translation)
    return vec.get_vecs_by_tokens(tokenizer(cleaned), lower_case_backup=True)

In [4]:
from itertools import islice

dataset = []
N_SAMPLE = 150000

# Read only the first N_SAMPLE lines instead of loading the whole file, and
# release the file handle before the slow embedding loop starts.
with open("../../data/raw/unificated.ndjson") as of:
	lines = list(islice(of, N_SAMPLE))

for line in tqdm(lines):
	tt = compute_embeddings(line)
	dataset.append(tt.tolist())

df = pl.DataFrame(dataset)

# Save the frame: `read_csv` is a module-level polars reader, not a DataFrame
# method, so the original call could not persist anything.
# NOTE(review): every entry of `dataset` is a nested list (one vector per
# token); confirm that this layout is what should be stored in a flat CSV.
df.write_csv("../../data/prepared/1500m_embeddings.csv")


  9%|▉         | 13753/150000 [02:11<108:55:21,  2.88s/it]

In [13]:
# One entry is appended to `dataset` per embedded log line.
len(dataset)

105283

In [None]:
# Every entry of `dataset` holds one 300-dimensional vector per token (it is
# built with `tolist()`, so entries are nested lists, not tensors), while
# DBSCAN needs a single fixed-length feature vector per sample.
# NOTE(review): averaging over tokens is an assumption — confirm that mean
# pooling is the intended way to summarise a log line.
features = torch.stack(
    [torch.as_tensor(item, dtype=torch.float32).mean(dim=0) for item in dataset]
).detach().numpy()

# eps=1: maximum distance for two samples to count as neighbours;
# min_samples=500: neighbourhood size required for a core point.
dbscan_model = DBSCAN(eps=1, min_samples=500)
dbscan_model.fit(features)