In [3]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel

class MatchingNetworkBERT(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased'):
        super(MatchingNetworkBERT, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # You can add more layers here if needed
        self.fc = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size) 

    def forward(self, support_set, query_set):
        # Embed support set using BERT
        support_embeddings = self.bert(**support_set).last_hidden_state[:, 0, :]  # Take the [CLS] token embedding
        support_embeddings = F.relu(self.fc(support_embeddings)) 

        # Embed query set using BERT
        query_embeddings = self.bert(**query_set).last_hidden_state[:, 0, :]  # Take the [CLS] token embedding
        query_embeddings = F.relu(self.fc(query_embeddings)) 

        # Calculate similarity between query and support embeddings
        similarity = torch.matmul(query_embeddings, support_embeddings.transpose(0, 1))

        # Softmax to get probabilities
        probabilities = F.softmax(similarity, dim=1)

        return probabilities

# Example usage
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example input data
support_texts = ['{"userAgent": "aws-cli/1.11.2 Python/2.7.10 Darwin/16.4.0 botocore/1.4.60", "eventID": "58aa82e7-2941-45fc-8b87-4610a108640c3", "userIdentity": {"type": "IAMUser", "principalId": "AIDA9BO36HFBHKGJAO9C1", "arn": "arn:aws:iam::811596193553:user/backup", "accountId": "811596193553", "accessKeyId": "AKIA1ZBTOEKWKVHP6GHZ", "userName": "backup"}}', 
                 '"{"userAgent": "aws-cli/1.11.2 Python/2.7.10 Darwin/16.4.0 botocore/1.4.60", "eventID": "430b263c-9431-472b-b8d5-cf5df767dd18", "userIdentity": {"type": "IAMUser", "principalId": "AIDA9BO36HFBHKGJAO9C1", "arn": "arn:aws:iam::811596193553:user/backup", "accountId": "811596193553", "accessKeyId": "AKIA1ZBTOEKWKVHP6GHZ", "userName": "backup"}}']
query_text = ['{"userAgent": "aws-cli/1.11.2 Python/2.7.10 Darwin/16.4.0 botocore/1.4.60", "eventID": "a5041a9d-d61c-4727-aa41-65d9c75e924a", "userIdentity": {"type": "IAMUser", "principalId": "AIDA9BO36HFBHKGJAO9C1", "arn": "arn:aws:iam::811596193553:user/backup", "accountId": "811596193553", "accessKeyId": "AKIA1ZBTOEKWKVHP6GHZ", "userName": "backup"}}']

# Tokenize the input texts
support_inputs = tokenizer(support_texts, return_tensors="pt", padding=True, truncation=True)
query_inputs = tokenizer(query_text, return_tensors="pt", padding=True, truncation=True)

# Create the model
model = MatchingNetworkBERT()

# Get the output
output = model(support_inputs, query_inputs)

print(output)


  from .autonotebook import tqdm as notebook_tqdm


tensor([[0.7976, 0.2024]], grad_fn=<SoftmaxBackward0>)


In [4]:
import json
with open("../../data/raw/flaws_cloudtrail02.ndjson") as file:
    lines = file.readlines()
    
    support_input = tokenizer(lines, return_tensors="pt", padding=True, truncation=True)
    query_lines = []
    with open("../../data/raw/flaws_cloudtrail02.ndjson") as query_file:
        read_lines = query_file.readlines()
        for l in read_lines:
            query_lines.append(l)
    
    query_lines = tokenizer(query_lines, return_tensors="pt", padding=True, truncation=True)
    model = MatchingNetworkBERT()
    output = model(support_input, query_inputs)

    print(output)

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer, AdamW
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class BERTFineTuner(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_labels=2):
        super(BERTFineTuner, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

# Example usage
# 1. Prepare your data
texts = ["This is a positive example.", "This is a negative example."]
labels = [1, 0]  # 1 for positive, 0 for negative
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

# 2. Create dataset and dataloader
dataset = CustomDataset(texts, labels, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# 3. Initialize model, optimizer, and loss function
model = BERTFineTuner()
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# 4. Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 3
for epoch in range(epochs):
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')