In [1]:
import pandas as pd
input_csv_path = "microsoft/guide_alerts.csv"
df = pd.read_csv(input_csv_path)
df = df.head(5_000_000)

event_counts = df["event"].value_counts()
df["event_count"] = df["event"].map(event_counts)
df["event"] = df.apply(lambda row: 0 if row["event_count"] < 3 else row["event"], axis=1)
df.drop(columns=["event_count"], inplace=True)

df_filtered = df[df['label'] == 0]
df_sorted = df_filtered.sort_values(by=['machine', 'timestamp'])

event_sequences = (
    df_sorted
    .groupby('machine')
    .agg({
        'event': list,
        'timestamp': list   # keep timestamps as list
    })
    .reset_index()
)

event_sequences['event_count'] = event_sequences['event'].apply(len)

event_sequences = event_sequences[event_sequences['event_count'] >= 5]

event_sequences

Unnamed: 0,machine,event,timestamp,event_count
0,0,"[2180, 1497, 5653, 2180, 2284, 46, 5653, 4083,...","[15723664, 15778333, 16480956, 16496454, 16502...",1163
1,1,"[534, 1063, 3188, 3387, 9654, 1462, 303, 537, ...","[16873871, 16874794, 16877089, 16878734, 16880...",1190
2,2,"[347, 584, 63, 28, 63, 46, 993, 46, 28, 28, 28...","[16903523, 16905797, 16905877, 16906056, 16907...",2167
3,5,"[28, 913, 63, 28, 11186, 584, 28, 5680, 28, 46...","[16907322, 16910168, 16910331, 16910693, 16910...",1369
4,6,"[2559, 2559, 2559, 2559, 2722, 2559, 2559, 255...","[16874400, 16874400, 16875628, 16875628, 16876...",2222
...,...,...,...,...
12698,22448,"[772, 772, 772, 772, 772]","[17065915, 17065915, 17077248, 17094401, 17094...",5
12699,22449,"[4, 4, 4, 4, 4]","[16973806, 16973806, 16973806, 16973806, 16973...",5
12700,22450,"[4, 4, 4, 4, 4]","[16971756, 16971756, 16971756, 16971756, 16971...",5
12705,22462,"[4, 2, 2, 2, 4]","[16926797, 16928932, 16939825, 16943246, 16944...",5


In [2]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors
from sklearn.model_selection import train_test_split

def create_event_tokenizer(df, vocab_size=10000):
    """
    Create a tokenizer specifically for event sequences in log data
    """
    tokenizer = Tokenizer(models.WordPiece())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    
    # Special tokens for BERT-like models
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    event_counts = df['event'].value_counts()
    print(f"Found {len(event_counts)} unique events")
    top_events = event_counts.head(min(vocab_size - len(special_tokens) - 1000, len(event_counts))).index.tolist()
    priority_tokens = [str(event) for event in top_events]
    all_special_tokens = special_tokens + priority_tokens
    
    trainer = trainers.WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=all_special_tokens,
        min_frequency=1
    )
    
    return tokenizer, trainer, priority_tokens

# Use only benign sequences (label == 0) for training
benign_sequences = event_sequences.copy()

logs = []

for _, row in benign_sequences.iterrows():
    machine_id = row['machine']
    event_list = row['event']
    ts_list = row['timestamp']
    
    if len(event_list) >= 5:  # Only use sequences with at least 5 events
        event_text = event_list
        timestamp_text = ts_list
        logs.append((machine_id, event_text, timestamp_text))

train_logs, test_logs = train_test_split(logs, test_size=0.2, random_state=42)

train_texts_sequence = [' '.join(map(str, event)) for _, event, _ in train_logs]
print(train_texts_sequence[:2])

tokenizer, trainer, priority_events = create_event_tokenizer(df, vocab_size=15000)

tokenizer.train_from_iterator(train_texts_sequence, trainer)

# BERT format
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
hf_tokenizer.add_special_tokens({
    'pad_token': '[PAD]',
    'unk_token': '[UNK]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'mask_token': '[MASK]'
})

print(f"Vocabulary size: {hf_tokenizer.vocab_size}")

hf_tokenizer.save_pretrained('./event_tokenizer')


['453 2 4 453 2', '4 4 2 2 2 4 2']
Found 30905 unique events



Vocabulary size: 15000


('./event_tokenizer/tokenizer_config.json',
 './event_tokenizer/special_tokens_map.json',
 './event_tokenizer/tokenizer.json')

In [3]:
from torch.utils.data import Dataset, DataLoader
import random

class LogSequenceDataset(Dataset):
    """
    Dataset for creating windowed event sequences for BERT training
    """
    def __init__(self, event_sequences, tokenizer, window_size=5, stride=3, mask_prob=0.15):
        self.tokenizer = tokenizer
        self.window_size = window_size
        self.stride = stride
        self.mask_prob = mask_prob
        self.windows = []
        
        print(f"Creating windowed sequences with window_size={window_size}, stride={stride}")
        
        for machine_id, events in event_sequences.items():
            if len(events) >= window_size:
                for i in range(0, len(events) - window_size + 1, stride):
                    window = events[i:i + window_size]
                    self.windows.append({
                        'machine_id': machine_id,
                        'events': window,
                        'position': i
                    })
        
        print(f"Created {len(self.windows)} windows from {len(event_sequences)} machines")
    
    def __len__(self):
        return len(self.windows)
    
    def __getitem__(self, idx):
        window = self.windows[idx]
        events = window['events']
        
        event_text = ' '.join(map(str, events))
        
        # Tokenize
        encoding = self.tokenizer(
            event_text,
            padding='max_length',
            truncation=True,
            max_length=self.window_size + 2,  # +2 for [CLS] and [SEP]
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        labels = input_ids.clone()
        special_tokens = {
            self.tokenizer.cls_token_id,
            self.tokenizer.sep_token_id,
            self.tokenizer.pad_token_id
        }
        
        maskable_positions = []
        for i, token_id in enumerate(input_ids):
            if token_id.item() not in special_tokens:
                maskable_positions.append(i)
        
        num_mask = max(1, int(len(maskable_positions) * self.mask_prob))
        mask_positions = random.sample(maskable_positions, min(num_mask, len(maskable_positions)))
        
        for pos in mask_positions:
            rand = random.random()
            if rand < 0.8:
                input_ids[pos] = self.tokenizer.mask_token_id
            elif rand < 0.9:
                input_ids[pos] = random.randint(0, self.tokenizer.vocab_size - 1)
        
        # Set labels to -100 for non-masked positions (ignore in loss)
        for i in range(len(labels)):
            if i not in mask_positions:
                labels[i] = -100
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
            'machine_id': window['machine_id']
        }

event_sequences_dict = {}
for _, row in event_sequences.iterrows():
    machine_id = row["machine"]
    events = row["event"]
    if len(events) >= 5:
        event_sequences_dict[machine_id] = events

window_size = 5
stride = 3

dataset = LogSequenceDataset(
    event_sequences_dict,
    hf_tokenizer,
    window_size=window_size,
    stride=stride,
    mask_prob=0.15
)

batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

Creating windowed sequences with window_size=5, stride=3
Created 1026803 windows from 5393 machines


In [4]:
import numpy as np
import torch
from transformers import BertConfig, BertForMaskedLM, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch.nn.functional as F
from tqdm import tqdm
import os
import math

class LogBERT:
    """
    BERT-based model for log anomaly detection using masked language modeling
    """
    def __init__(self, vocab_size, tokenizer, device='cuda'):
        self.tokenizer = tokenizer
        self.device = device
        
        # BERT configuration
        self.config = BertConfig(
            vocab_size=vocab_size,
            hidden_size=512,
            num_hidden_layers=6,
            num_attention_heads=8,
            intermediate_size=2048,
            max_position_embeddings=128,
            type_vocab_size=2,
            pad_token_id=tokenizer.pad_token_id,
            sep_token_id=tokenizer.sep_token_id,
            cls_token_id=tokenizer.cls_token_id,
            mask_token_id=tokenizer.mask_token_id
        )
        
        self.model = BertForMaskedLM(self.config)
        self.model.to(device)
            
    def train_model(self, dataloader, epochs=3, learning_rate=5e-5, save_path='./logbert_model'):
        """
        Train the LogBERT model using masked language modeling
        """
        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        total_steps = len(dataloader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps
        )
        
        self.model.train()
        total_loss = 0
                
        for epoch in range(epochs):
            epoch_loss = 0
            progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}')
            
            for batch_idx, batch in enumerate(progress_bar):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)
                
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                loss = outputs.loss
                
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                
                epoch_loss += loss.item()
                total_loss += loss.item()
                
                progress_bar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'avg_loss': f'{epoch_loss/(batch_idx+1):.4f}'
                })

                # if (batch_idx + 1) % 20000 == 0:
                #     checkpoint_path = f"{save_path}_checkpoint_{epoch}_{batch_idx}"
                #     os.makedirs(checkpoint_path, exist_ok=True)
                #     self.model.save_pretrained(checkpoint_path)
                #     self.tokenizer.save_pretrained(checkpoint_path)
                #     print(f"Checkpoint saved at {checkpoint_path}")
                            
            avg_epoch_loss = epoch_loss / len(dataloader)
            print(f"Epoch {epoch+1} completed. Average loss: {avg_epoch_loss:.4f}")
        
        os.makedirs(save_path, exist_ok=True)
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
        print(f"Final model saved to {save_path}")

        return total_loss / epochs * len(dataloader)

    def calculate_sequence_probability(self, event_sequence, window_size=5):
        """
        Calculate the probability of an event sequence using the trained model
        This is used for anomaly detection - low probability indicates anomaly
        """
        self.model.eval()
        
        # Convert sequence to string
        if isinstance(event_sequence, list):
            event_text = ' '.join(map(str, event_sequence))
        else:
            event_text = event_sequence
        encoding = self.tokenizer(
            event_text,
            padding='max_length',
            truncation=True,
            max_length=window_size + 2,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            predictions = outputs.logits
            probabilities = F.softmax(predictions, dim=-1)
            

            log_prob_sum = 0.0
            token_count = 0
            
            for i in range(1, input_ids.size(1) - 1):
                if attention_mask[0, i] == 1:
                    token_id = input_ids[0, i].item()
                    if token_id not in [self.tokenizer.pad_token_id, 
                                      self.tokenizer.cls_token_id, 
                                      self.tokenizer.sep_token_id]:
                        token_prob = probabilities[0, i, token_id].item()
                        log_prob_sum += math.log(token_prob + 1e-12)
                        token_count += 1
            
            if token_count > 0:
                avg_log_prob = log_prob_sum / token_count
                return avg_log_prob
            else:
                return float('-inf')
logbert = LogBERT(
    vocab_size=hf_tokenizer.vocab_size,
    tokenizer=hf_tokenizer,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

In [5]:
avg_loss = logbert.train_model(
    dataloader, 
    epochs=1,
    learning_rate=5e-5,
    save_path='./logbert_model_alert'
)
print(f"Training completed! Average loss: {avg_loss:.4f}")

Epoch 1/1: 100%|██████████| 16044/16044 [10:14<00:00, 26.13it/s, loss=1.1441, avg_loss=1.7788]


Epoch 1 completed. Average loss: 1.7788
Final model saved to ./logbert_model_alert
Training completed! Average loss: 457884294.8290


In [6]:
import torch
import torch.nn.functional as F

class BERTCandidateClassifier:
    def __init__(self, model, tokenizer, device='cuda', top_k=5):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.top_k = top_k
        self.model.eval()
        
    def create_timestamp_windows(self, df, machine_id, target_event, target_timestamp, window_size=5):
        machine_data = df[df['machine'] == machine_id].sort_values('timestamp')
        target_row = machine_data[
            (machine_data['timestamp'] == target_timestamp) & 
            (machine_data['event'] == target_event)
        ]
        if target_row.empty:
            return []
        
        target_index = machine_data.index.get_loc(target_row.index[0])
        
        windows = []
        events = machine_data['event'].tolist()
        timestamps = machine_data['timestamp'].tolist()
        labels = machine_data['label'].tolist()
        
        for start_offset in range(window_size - 1, -1, -1):
            start_idx = max(0, target_index - start_offset)
            end_idx = min(len(events), start_idx + window_size)
            
            if end_idx - start_idx == window_size and target_index >= start_idx and target_index < end_idx:
                window_events = events[start_idx:end_idx]
                window_timestamps = timestamps[start_idx:end_idx]
                window_labels = labels[start_idx:end_idx]
                target_position = target_index - start_idx
                
                windows.append({
                    'events': window_events,
                    'timestamps': window_timestamps,
                    'labels': window_labels,
                    'target_position': target_position,
                    'start_idx': start_idx,
                    'end_idx': end_idx - 1
                })
        
        return windows
    
    def get_top_k_candidates(self, masked_sequence, mask_position):
        """
        Use BERT classification head to get top K candidates for the masked position
        Assumes only 1 masked position per inference
        """
        event_text = ' '.join(map(str, masked_sequence))
        
        encoding = self.tokenizer(
            event_text,
            padding='max_length',
            truncation=True,
            max_length=10,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            predictions = outputs.logits
            probabilities = F.softmax(predictions, dim=-1)
            
            # Find the position of [MASK] token in the tokenized sequence
            mask_token_id = self.tokenizer.mask_token_id
            mask_token_position = None
            
            for i, token_id in enumerate(input_ids[0]):
                if token_id == mask_token_id:
                    mask_token_position = i
                    break
            
            if mask_token_position is None:
                print("Warning: No [MASK] token found in sequence")
                return [], []
            
            top_k_probs, top_k_ids = torch.topk(probabilities[0, mask_token_position], self.top_k)
            candidates, probs = [], []
            
            for prob, tid in zip(top_k_probs, top_k_ids):
                token = self.tokenizer.convert_ids_to_tokens([tid.item()])[0]
                if token.startswith('##'):
                    token = token[2:]
                try:
                    candidate = int(token)
                except ValueError:
                    candidate = token
                candidates.append(candidate)
                probs.append(prob.item())

            return candidates, probs
                
    def detect_malicious_with_candidates(self, df, target_event, machine_id, target_timestamp, 
                                       window_size=5, min_windows=1):
        """
        Detect if target event is malicious using top-K candidate approach with majority voting
        
        Args:
            df: Full dataset
            target_event: Event to analyze for maliciousness
            machine_id: Machine ID to analyze
            target_timestamp: Specific timestamp of the event
            window_size: Size of sliding windows (default 5)
            min_windows: Minimum number of windows required for analysis
            
        Returns:
            Dictionary with analysis results including majority vote decision
        """
        windows = self.create_timestamp_windows(df, machine_id, target_event, target_timestamp, window_size)
        
        if len(windows) < min_windows:
            print(f"Insufficient windows ({len(windows)}) for analysis. Need at least {min_windows}.")
            return None
                
        window_results = []
        malicious_votes = 0
        benign_votes = 0
        
        for i, window in enumerate(windows):
            masked_events = window['events'].copy()
            original_event = masked_events[window['target_position']]
            masked_events[window['target_position']] = '[MASK]'
            
            # Get top-K candidates
            candidates, probabilities = self.get_top_k_candidates(masked_events, window['target_position'])
            
            is_in_candidates = original_event in candidates
            candidate_rank = candidates.index(original_event) + 1 if is_in_candidates else None
            candidate_probability = probabilities[candidates.index(original_event)] if is_in_candidates else 0.0
            
            # print(f"  Original event in top-{self.top_k}? {'YES' if is_in_candidates else 'NO'}")
            # if is_in_candidates:
            #     print(f"  Rank: {candidate_rank}, Probability: {candidate_probability:.6f}")
            
            # Vote: malicious if NOT in top-K candidates
            # if candidate_probability < 0.3:
            #     malicious_votes += 1
            #     is_malicious_vote = True
            #     print(f"  Vote: MALICIOUS (not in top-{self.top_k} candidates)")
            # else:
            #     benign_votes += 1
            #     is_malicious_vote = False
            #     print(f"  Vote: BENIGN (in top-{self.top_k} candidates)")
            is_malicious_vote = not is_in_candidates
            if is_malicious_vote:
                malicious_votes += 1
                # print(f"  Vote: MALICIOUS (not in top-{self.top_k} candidates)")
            else:
                benign_votes += 1
                # print(f"  Vote: BENIGN (in top-{self.top_k} candidates)")
            
            window_results.append({
                'window_index': i,
                'window_events': window['events'],
                'masked_events': masked_events,
                'original_event': original_event,
                'target_position': window['target_position'],
                'candidates': candidates,
                'probabilities': probabilities,
                'is_in_candidates': is_in_candidates,
                'candidate_rank': candidate_rank,
                'candidate_probability': candidate_probability,
                'malicious_vote': is_malicious_vote,
                'timestamps': window['timestamps'],
                'labels': window['labels']
            })
        
        # Majority rule decision
        total_votes = malicious_votes + benign_votes
        majority_threshold = total_votes / 2

        final_decision_malicious = 1 if malicious_votes > majority_threshold else 0
        confidence = max(malicious_votes, benign_votes) / total_votes
                
        result = {
            'machine_id': machine_id,
            'target_event': target_event,
            'target_timestamp': target_timestamp,
            'window_size': window_size,
            'top_k': self.top_k,
            'num_windows': len(windows),
            'malicious_votes': malicious_votes,
            'benign_votes': benign_votes,
            'total_votes': total_votes,
            'final_decision_malicious': final_decision_malicious,
            'confidence': confidence,
            'window_results': window_results,
            'approach': 'bert_candidate_classification'
        }
        
        return result
    
    def batch_analyze_with_candidates(self, df, events_to_analyze):
        """
        Analyze multiple events using the candidate approach
        
        Args:
            events_to_analyze: List of tuples (event, machine_id, timestamp)
        """
        results = []
        predictions = []
        
        for event_info in events_to_analyze:
            
            if len(event_info) != 3:
                print(f"Invalid event format: {event_info}. Expected (event, machine_id, timestamp)")
                continue
                
            event, machine_id, timestamp = event_info
            result = self.detect_malicious_with_candidates(
                df, event, machine_id, timestamp
            )
            
            if result:
                results.append(result)
                predictions.append(result['final_decision_malicious'])
            else:
                predictions.append(None)

        return results, predictions

candidate_classifier = BERTCandidateClassifier(
    model=logbert.model,
    tokenizer=hf_tokenizer,
    device=logbert.device,
    top_k=4
)

In [7]:
# OPTIMIZED BATCHED VERSION OF BERTCandidateClassifier
import torch
import torch.nn.functional as F
import numpy as np

class OptimizedBERTCandidateClassifier(BERTCandidateClassifier):
    """
    Optimized version of BERTCandidateClassifier with batched processing
    """
    
    def get_top_k_candidates_batch(self, masked_sequences_batch, max_length=10):
        """
        Batch process multiple masked sequences to get top K candidates for all of them at once
        
        Args:
            masked_sequences_batch: List of masked sequences (each is a list of events with '[MASK]')
            max_length: Maximum sequence length for padding
            
        Returns:
            candidates_batch: List of candidate lists for each sequence
            probabilities_batch: List of probability lists for each sequence
        """
        if not masked_sequences_batch:
            return [], []
        
        event_texts = []
        for masked_sequence in masked_sequences_batch:
            event_text = ' '.join(map(str, masked_sequence))
            event_texts.append(event_text)
        
        encodings = self.tokenizer(
            event_texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        
        input_ids = encodings['input_ids'].to(self.device)
        attention_mask = encodings['attention_mask'].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            predictions = outputs.logits
            probabilities = F.softmax(predictions, dim=-1)
            
            mask_token_id = self.tokenizer.mask_token_id
            
            candidates_batch = []
            probabilities_batch = []
            
            for seq_idx in range(input_ids.size(0)):
                mask_position = None
                for token_idx, token_id in enumerate(input_ids[seq_idx]):
                    if token_id == mask_token_id:
                        mask_position = token_idx
                        break
                
                if mask_position is None:
                    candidates_batch.append([])
                    probabilities_batch.append([])
                    continue
                
                top_k_probs, top_k_ids = torch.topk(
                    probabilities[seq_idx, mask_position], 
                    self.top_k
                )
                
                candidates, probs = [], []
                for prob, tid in zip(top_k_probs, top_k_ids):
                    token = self.tokenizer.convert_ids_to_tokens([tid.item()])[0]
                    if token.startswith('##'):
                        token = token[2:]
                    try:
                        candidate = int(token)
                    except ValueError:
                        candidate = token
                    candidates.append(candidate)
                    probs.append(prob.item())
                
                candidates_batch.append(candidates)
                probabilities_batch.append(probs)
        
        return candidates_batch, probabilities_batch
    
    def detect_malicious_with_candidates_batch(self, df, events_to_analyze, window_size=5, min_windows=1):
        """
        Batch process multiple events for malicious detection using candidate approach
        
        Args:
            df: Full dataset
            events_to_analyze: List of tuples (event, machine_id, timestamp)
            window_size: Size of sliding windows
            min_windows: Minimum number of windows required for analysis
            
        Returns:
            results: List of analysis results
            predictions: List of final predictions
        """
        
        # Prepare all windows and masked sequences for batch processing
        batch_data = []
        event_indices = []
        
        for event_idx, event_info in enumerate(events_to_analyze):
            event, machine_id, timestamp = event_info
            windows = self.create_timestamp_windows(df, machine_id, event, timestamp, window_size)
            
            if len(windows) < min_windows:
                print(f"Event {event_idx}: Insufficient windows ({len(windows)}) for analysis")
                continue
            for window_idx, window in enumerate(windows):
                masked_events = window['events'].copy()
                original_event = masked_events[window['target_position']]
                masked_events[window['target_position']] = '[MASK]'
                
                batch_data.append({
                    'event_idx': event_idx,
                    'window_idx': window_idx,
                    'masked_sequence': masked_events,
                    'original_event': original_event,
                    'window': window,
                    'event_info': event_info
                })
                event_indices.append(event_idx)
        
        if not batch_data:
            print("No valid data to process")
            return [], []
        masked_sequences = [item['masked_sequence'] for item in batch_data]
        candidates_batch, probabilities_batch = self.get_top_k_candidates_batch(
            masked_sequences, max_length=window_size + 2
        )
        
        event_results = {}
        for item, candidates, probs in zip(batch_data, candidates_batch, probabilities_batch):
            event_idx = item['event_idx']
            if event_idx not in event_results:
                event_results[event_idx] = {
                    'event_info': item['event_info'],
                    'windows': [],
                    'window_results': []
                }
            original_event = item['original_event']
            is_in_candidates = original_event in candidates
            candidate_rank = candidates.index(original_event) + 1 if is_in_candidates else None
            candidate_probability = probs[candidates.index(original_event)] if is_in_candidates else 0.0
            
            window_result = {
                'window_index': item['window_idx'],
                'window_events': item['window']['events'],
                'masked_events': item['masked_sequence'],
                'original_event': original_event,
                'target_position': item['window']['target_position'],
                'candidates': candidates,
                'probabilities': probs,
                'is_in_candidates': is_in_candidates,
                'candidate_rank': candidate_rank,
                'candidate_probability': candidate_probability,
                'malicious_vote': not is_in_candidates,
                'timestamps': item['window']['timestamps'],
                'labels': item['window']['labels']
            }
            
            event_results[event_idx]['windows'].append(item['window'])
            event_results[event_idx]['window_results'].append(window_result)
        
        # Process final results for each event
        results = []
        predictions = []
        
        for event_idx in sorted(event_results.keys()):
            event_data = event_results[event_idx]
            event, machine_id, timestamp = event_data['event_info']
            window_results = event_data['window_results']
            
            if not window_results:
                predictions.append(None)
                continue
            
            malicious_votes = sum(1 for wr in window_results if wr['malicious_vote'])
            benign_votes = len(window_results) - malicious_votes
            total_votes = len(window_results)
            majority_threshold = total_votes / 2
            
            final_decision_malicious = 1 if malicious_votes > majority_threshold else 0
            confidence = max(malicious_votes, benign_votes) / total_votes
            
            result = {
                'machine_id': machine_id,
                'target_event': event,
                'target_timestamp': timestamp,
                'window_size': window_size,
                'top_k': self.top_k,
                'num_windows': len(window_results),
                'malicious_votes': malicious_votes,
                'benign_votes': benign_votes,
                'total_votes': total_votes,
                'final_decision_malicious': final_decision_malicious,
                'confidence': confidence,
                'window_results': window_results,
                'approach': 'bert_candidate_classification_batch'
            }
            
            results.append(result)
            predictions.append(final_decision_malicious)
        
        return results, predictions
    
    def batch_analyze_with_candidates_optimized(self, df, events_to_analyze, batch_size=256):
        """
        Optimized batch analysis that processes events in smaller chunks for memory efficiency
        
        Args:
            df: Full dataset
            events_to_analyze: List of tuples (event, machine_id, timestamp)
            batch_size: Number of events to process in each batch
            
        Returns:
            results: List of analysis results
            predictions: List of final predictions
        """
        
        all_results = []
        all_predictions = []
        
        for i in range(0, len(events_to_analyze), batch_size):
            batch_events = events_to_analyze[i:i + batch_size]
            
            batch_results, batch_predictions = self.detect_malicious_with_candidates_batch(
                df, batch_events
            )
            
            all_results.extend(batch_results)
            all_predictions.extend(batch_predictions)
            
            while len(all_predictions) < i + len(batch_events):
                all_predictions.append(None)
                
        return all_results, all_predictions

# Initialize optimized classifier
optimized_candidate_classifier = OptimizedBERTCandidateClassifier(
    model=logbert.model,
    tokenizer=hf_tokenizer,
    device=logbert.device,
    top_k=4
)

In [8]:
benign_test = []

for machine, events, timestamp in test_logs:
    for event, timestamp in zip(events, timestamp):
        benign_test.append((event, machine, timestamp))


malicious_logs = list(df[df['label'] == 1][['event', 'machine', 'timestamp']].itertuples(index=False, name=None))

events_to_analyze = benign_test + malicious_logs

truth_label = [0] * len(benign_test) + [1] * len(malicious_logs)


import random
paired = list(zip(events_to_analyze, truth_label))
sampled = random.sample(paired, 500)
sampled_events, sampled_labels = zip(*sampled)
sampled_events = list(sampled_events)
sampled_labels = list(sampled_labels)

results, predictions = optimized_candidate_classifier.batch_analyze_with_candidates_optimized(df, sampled_events, batch_size=8192)

Event 107: Insufficient windows (0) for analysis
Event 120: Insufficient windows (0) for analysis
Event 250: Insufficient windows (0) for analysis
Event 490: Insufficient windows (0) for analysis


In [9]:
none_count = len(list(filter(lambda x: x is None, predictions)))
print(none_count)
predictions = [1 if x is None else x for x in predictions]

4


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(sampled_labels, predictions)
precision = precision_score(sampled_labels, predictions)
recall = recall_score(sampled_labels, predictions)
f1 = f1_score(sampled_labels, predictions)
conf_matrix = confusion_matrix(sampled_labels, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")


Accuracy: 0.1920
Precision: 1.0000
Recall: 0.1871
F1 Score: 0.3153
Confusion Matrix:
[[  3   0]
 [404  93]]
