# Chapter 35: Vector Database Optimization & Log Analysis

Run this notebook directly in Google Colab - no local Python needed!

**Full code**: [GitHub](https://github.com/eduardd76/AI_for_networking_and_security_engineers/tree/main/CODE/Volume-3-Production-Systems/Chapter-35-Vector-Databases)

## Setup

Install dependencies and configure API keys.

In [None]:
# Install dependencies
!pip install -q chromadb sentence-transformers anthropic python-dotenv

# Import and configure API key
import os
from getpass import getpass

# Prefer a key stored in Colab secrets; otherwise fall back to an existing
# environment variable or, failing that, an interactive prompt.
try:
    from google.colab import userdata
    os.environ['ANTHROPIC_API_KEY'] = userdata.get('ANTHROPIC_API_KEY')
    print('✓ Using API keys from Colab secrets')
except Exception:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate. It stays broad on purpose: outside Colab
    # the import fails, and inside Colab the secret lookup itself can fail
    # (e.g. secret missing or access not granted).
    if 'ANTHROPIC_API_KEY' not in os.environ:
        os.environ['ANTHROPIC_API_KEY'] = getpass('Enter ANTHROPIC_API_KEY: ')
    print('✓ API keys configured')

print('\n✅ Setup complete! Ready to run examples.')

## Example 1: Embedding Model Comparison for Network Logs

Test different embedding models to find the best balance of speed and accuracy for network logs.

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import time

# Candidate embedding models. They are instantiated up front, so model
# loading is not part of the timings measured below.
models = {
    'all-MiniLM-L6-v2': SentenceTransformer('all-MiniLM-L6-v2'),  # 384 dims, fast
    'all-mpnet-base-v2': SentenceTransformer('all-mpnet-base-v2'),  # 768 dims, accurate
    'paraphrase-MiniLM-L3-v2': SentenceTransformer('paraphrase-MiniLM-L3-v2')  # 384 dims, very fast
}

# Sample network logs
logs = [
    "BGP peer 10.1.1.1 down - connection timeout",
    "Interface GigabitEthernet0/1 changed state to down",
    "OSPF neighbor 10.2.2.2 state changed from FULL to DOWN",
    "Authentication failed for user admin from 192.168.1.50",
    "High CPU utilization detected: 95% for 5 minutes"
]

# Measure embedding speed and report the embedding dimensionality per model
print("Embedding Model Performance for Network Logs:\n")
for name, model in models.items():
    # Untimed warm-up: the first encode() call on a model may include
    # one-time initialization that would otherwise skew the comparison.
    model.encode(logs[:1])

    # perf_counter() is the monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump with clock adjustments.
    start = time.perf_counter()
    embeddings = model.encode(logs)
    elapsed = time.perf_counter() - start

    print(f"{name}:")
    print(f"  Dimensions: {embeddings.shape[1]}")
    print(f"  Time for {len(logs)} logs: {elapsed*1000:.2f}ms")
    print(f"  Per-log: {elapsed*1000/len(logs):.2f}ms")
    print(f"  Throughput: {len(logs)/elapsed:.1f} logs/sec\n")

## Example 2: ChromaDB Setup with Network Logs

Build a vector database for semantic search across network security logs.

In [None]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import uuid

# In-memory (non-persistent) ChromaDB client with anonymized telemetry off;
# nothing is written to disk, which suits a Colab session.
chroma_settings = Settings(anonymized_telemetry=False, is_persistent=False)
client = chromadb.Client(chroma_settings)

# Embeddings are computed explicitly with this model and handed to ChromaDB;
# the model name is only recorded in the collection metadata for reference.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

collection_metadata = {
    "description": "Network and security logs with semantic search",
    "embedding_model": "all-MiniLM-L6-v2"
}
collection = client.get_or_create_collection(
    name="network_security_logs",
    metadata=collection_metadata
)

# Sample rows: (log text, severity, category, device, timestamp)
_sample_rows = [
    ("authentication failed for user admin from 192.168.1.50 after 3 attempts",
     3, "security", "firewall1", 1705320000),
    ("bgp peer 10.1.1.1 down due to hold timer expired",
     2, "routing", "router1", 1705320120),
    ("interface gigabitethernet0/1 excessive input errors detected",
     4, "interface", "switch1", 1705320240),
    ("failed login attempt rejected invalid password from remote host 192.168.1.50",
     3, "security", "firewall1", 1705320360),
    ("ospf neighbor 10.2.2.2 state change full to down adjacency lost",
     2, "routing", "router2", 1705320480),
]

# Expand the rows into the {"log", "metadata"} records used below
logs_data = [
    {
        "log": text,
        "metadata": {"severity": severity, "category": category, "device": device, "timestamp": timestamp}
    }
    for text, severity, category, device, timestamp in _sample_rows
]

# Embed every log text, then store text + vector + metadata under a random UUID
log_texts = [entry["log"] for entry in logs_data]
embeddings = embedding_model.encode(log_texts).tolist()
log_metadatas = [entry["metadata"] for entry in logs_data]
log_ids = [str(uuid.uuid4()) for _ in logs_data]

collection.add(
    ids=log_ids,
    embeddings=embeddings,
    documents=log_texts,
    metadatas=log_metadatas
)

print(f"Added {len(logs_data)} logs to ChromaDB")
print(f"Collection size: {collection.count()} documents\n")

# Semantic query, restricted by metadata to logs whose category is "security"
query_text = "access denied wrong credentials from 192.168.1.50"
query_embedding = embedding_model.encode([query_text]).tolist()

security_only = {"category": "security"}
results = collection.query(
    query_embeddings=query_embedding,
    n_results=3,
    where=security_only
)

print(f"Query: '{query_text}'")
print("\nTop 3 similar security events:")

# Each result field holds one list per query; a single query was sent, so
# index 0 selects its hits.
hits = zip(results['documents'][0], results['metadatas'][0], results['distances'][0])
for i, hit in enumerate(hits):
    doc, metadata, distance = hit
    print(f"\n{i+1}. Distance: {distance:.4f}")
    print(f"   Log: {doc}")
    print(f"   Device: {metadata['device']}, Severity: {metadata['severity']}")

## Example 3: Batch Processing Large Log Volumes

Efficiently ingest millions of network logs with batching and progress tracking.

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
import numpy as np
import time
from typing import List, Dict
import uuid

class NetworkLogIngestor:
    """Batch-ingest network logs into an in-memory ChromaDB collection.

    Each batch is embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
    model and stored with its per-log metadata under freshly generated UUIDs.
    """

    def __init__(self, collection_name: str, batch_size: int = 1000):
        """Create the ChromaDB client, the collection and the embedding model.

        Args:
            collection_name: Name of the collection to create or reuse.
            batch_size: Intended number of logs per batch. It is stored for
                callers to consult; ``ingest_batch`` processes whatever list
                it is handed.
        """
        # Local import: this cell's own imports do not bring in Settings, so
        # relying on an earlier cell having imported it would raise NameError
        # when this example is run on its own.
        from chromadb.config import Settings

        self.batch_size = batch_size
        # Same settings as the Example 2 client. ChromaDB rejects a second
        # in-process ephemeral client whose settings differ from the first.
        self.client = chromadb.Client(Settings(
            anonymized_telemetry=False,
            is_persistent=False
        ))
        self.collection = self.client.get_or_create_collection(name=collection_name)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Running totals. 'total_processed' and 'total_time' are maintained by
        # the caller; 'batch_times' is appended to by ingest_batch.
        self.stats = {
            'total_processed': 0,
            'total_time': 0,
            'batch_times': []
        }

    def ingest_batch(self, logs: List[Dict]) -> Dict:
        """Embed and store one batch of logs, returning timing metrics.

        Args:
            logs: Records with a ``'text'`` string and a ``'metadata'`` dict.

        Returns:
            A dict with ``'batch_size'``, ``'total_time'``,
            ``'embedding_time'``, ``'db_time'`` (all times in seconds) and
            ``'throughput'`` (logs per second). An empty batch returns
            all-zero metrics without touching the model or the database.
        """
        if not logs:
            # Nothing to embed or store; also avoids handing ChromaDB an
            # empty add() call.
            return {
                'batch_size': 0,
                'total_time': 0.0,
                'embedding_time': 0.0,
                'db_time': 0.0,
                'throughput': 0.0
            }

        # perf_counter() is monotonic and high resolution, so the measured
        # durations are never negative.
        start_time = time.perf_counter()

        # Extract texts for embedding
        texts = [log['text'] for log in logs]

        # Generate embeddings
        embed_start = time.perf_counter()
        embeddings = self.embedding_model.encode(
            texts,
            batch_size=32,
            show_progress_bar=False
        ).tolist()
        embed_time = time.perf_counter() - embed_start

        # Add to ChromaDB under fresh random ids
        db_start = time.perf_counter()
        self.collection.add(
            embeddings=embeddings,
            documents=texts,
            metadatas=[log['metadata'] for log in logs],
            ids=[str(uuid.uuid4()) for _ in logs]
        )
        db_time = time.perf_counter() - db_start

        batch_time = time.perf_counter() - start_time
        # Keep the per-batch duration so 'batch_times' is actually populated.
        self.stats['batch_times'].append(batch_time)

        return {
            'batch_size': len(logs),
            'total_time': batch_time,
            'embedding_time': embed_time,
            'db_time': db_time,
            'throughput': len(logs) / batch_time
        }

# Simulate large log stream
def generate_sample_logs(count: int):
    """Lazily yield ``count`` synthetic network log records.

    Every record is a dict with a ``'text'`` entry (a randomly chosen
    template filled with random values) and a ``'metadata'`` entry holding a
    random severity in 1-7, a timestamp of 1705320000 plus the record index,
    and a random device name ``device1`` .. ``device50``.
    """
    import random

    templates = (
        "interface {iface} changed state to {state}",
        "bgp peer {ip} connection {status}",
        "authentication {result} for user {user} from {ip}",
        "cpu utilization {percent}% threshold exceeded",
    )
    first_timestamp = 1705320000

    for index in range(count):
        chosen = random.choice(templates)

        # Every placeholder value is drawn on each iteration, in this fixed
        # order, even though a given template uses only some of them.
        iface = f"GigabitEthernet0/{random.randint(1,48)}"
        state = random.choice(['up', 'down'])
        ip = f"10.{random.randint(1,255)}.{random.randint(1,255)}.{random.randint(1,255)}"
        status = random.choice(['established', 'timeout', 'reset'])
        result = random.choice(['succeeded', 'failed', 'rejected'])
        user = random.choice(['admin', 'operator', 'guest'])
        percent = random.randint(50, 99)

        text = chosen.format(
            iface=iface,
            state=state,
            ip=ip,
            status=status,
            result=result,
            user=user,
            percent=percent,
        )

        severity = random.randint(1, 7)
        device = f"device{random.randint(1,50)}"

        record = {
            'text': text,
            'metadata': {
                'severity': severity,
                'timestamp': first_timestamp + index,
                'device': device,
            },
        }
        yield record

# Test ingestion with 5,000 logs (reduced for Colab)
print("Testing batch ingestion with 5,000 logs...\n")

ingestor = NetworkLogIngestor(
    collection_name="large_log_collection",
    batch_size=1000
)


def _ingest_and_report(pending):
    """Ingest one batch, fold its numbers into ingestor.stats, print progress."""
    batch_stats = ingestor.ingest_batch(pending)
    ingestor.stats['total_processed'] += batch_stats['batch_size']
    ingestor.stats['total_time'] += batch_stats['total_time']

    print(f"Processed {ingestor.stats['total_processed']} logs | "
          f"Batch throughput: {batch_stats['throughput']:.1f} logs/sec | "
          f"Embedding: {batch_stats['embedding_time']:.2f}s | "
          f"DB: {batch_stats['db_time']:.2f}s")
    return batch_stats


# Accumulate the stream into batches and flush whenever one is full. The
# threshold comes from the ingestor so it cannot drift from its configured
# batch size.
batch = []
for log in generate_sample_logs(5000):
    batch.append(log)
    if len(batch) >= ingestor.batch_size:
        stats = _ingest_and_report(batch)
        batch = []

# Flush the final partial batch. It goes through the same accounting as full
# batches, so its time is included in the totals and the average throughput.
if batch:
    stats = _ingest_and_report(batch)

print("\n=== Ingestion Complete ===")
print(f"Total logs: {ingestor.stats['total_processed']}")
print(f"Total time: {ingestor.stats['total_time']:.2f}s")

# Guard the division: no measured time means no meaningful average.
if ingestor.stats['total_time'] > 0:
    print(f"Average throughput: {ingestor.stats['total_processed']/ingestor.stats['total_time']:.1f} logs/sec")
else:
    print("Average throughput: n/a")

## Example 4: Security Event Correlation

Use semantic search to find related security events and reconstruct attack chains.

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
from datetime import datetime

class SecurityEventCorrelator:
    """Semantic search for security event correlation.

    Events are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
    model and stored in an in-memory ChromaDB collection configured for
    cosine distance.
    """

    def __init__(self):
        """Create the ChromaDB client, the collection and the embedding model."""
        # Local import: this cell's own imports do not bring in Settings, so
        # relying on an earlier cell having imported it would raise NameError
        # when this example is run on its own.
        from chromadb.config import Settings

        # Same settings as the Example 2 client. ChromaDB rejects a second
        # in-process ephemeral client whose settings differ from the first.
        self.client = chromadb.Client(Settings(
            anonymized_telemetry=False,
            is_persistent=False
        ))
        self.collection = self.client.get_or_create_collection(
            name="security_events",
            metadata={"hnsw:space": "cosine"}
        )
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    @staticmethod
    def _event_id(timestamp, log_text: str) -> str:
        """Build a deterministic id from an event's timestamp and text.

        A SHA-256 digest is used instead of the built-in ``hash()``: string
        hashes are randomized per Python process, so they would give the same
        event a different id in every session, and reducing them modulo
        1,000,000 makes collisions between distinct events likely.
        """
        import hashlib

        digest = hashlib.sha256(log_text.encode('utf-8')).hexdigest()[:16]
        return f"event_{timestamp}_{digest}"

    def ingest_event(self, log_text: str, metadata: dict):
        """Embed and store a single security event.

        Args:
            log_text: The raw log line.
            metadata: Event metadata. It must contain a ``'timestamp'`` entry,
                which becomes part of the stored id.
        """
        embedding = self.model.encode([log_text]).tolist()
        self.collection.add(
            embeddings=embedding,
            documents=[log_text],
            metadatas=[metadata],
            ids=[self._event_id(metadata['timestamp'], log_text)]
        )

    def find_related_events(self, incident_description: str, min_severity: int = 3, top_k: int = 10):
        """Find stored events semantically related to an incident description.

        Args:
            incident_description: Free-text description used as the query.
            min_severity: Only events whose ``severity`` metadata is less than
                or equal to this value are considered.
                NOTE(review): this presumably follows the syslog convention
                where a lower number means a more severe event - confirm.
            top_k: Maximum number of events requested from the collection.

        Returns:
            The raw ChromaDB query result.
        """
        query_embedding = self.model.encode([incident_description]).tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            where={"severity": {"$lte": min_severity}}
        )
        return results

# Build the correlator that will hold the sample attack sequence
correlator = SecurityEventCorrelator()

# Attack timeline rows: (log text, severity, device, timestamp, event type)
_attack_timeline = [
    ("failed ssh login attempt from 203.0.113.45 username admin",
     3, "firewall1", 1705320000, "auth_failure"),
    ("port scan detected from 203.0.113.45 targeting ports 22,23,80,443",
     2, "ids1", 1705320120, "scan"),
    ("multiple authentication failures from 203.0.113.45 threshold exceeded",
     2, "firewall1", 1705320240, "brute_force"),
    ("successful ssh login from 203.0.113.45 username admin after failed attempts",
     1, "server1", 1705320360, "auth_success"),
]

# Expand the rows into the {"log", "metadata"} records that get ingested
sample_events = [
    {
        "log": text,
        "metadata": {"severity": severity, "device": device, "timestamp": timestamp, "event_type": event_type}
    }
    for text, severity, device, timestamp, event_type in _attack_timeline
]

print("Ingesting security events...")
for event in sample_events:
    correlator.ingest_event(log_text=event['log'], metadata=event['metadata'])

print(f"Total events: {correlator.collection.count()}\n")

# Query the stored events with a free-text description of the incident
print("=== Investigating Suspected Server Compromise ===\n")
incident = "server was compromised after successful authentication from suspicious IP"

related = correlator.find_related_events(incident, 3, 5)

print(f"Query: '{incident}'\n")
print("Related events (ordered by relevance):\n")

# Each result field holds one list per query; a single query was sent, so
# index 0 selects its hits.
hits = zip(related['documents'][0], related['metadatas'][0], related['distances'][0])
for i, hit in enumerate(hits):
    doc, metadata, distance = hit
    # Turn the distance reported by the query into a percentage-style score
    relevance = (1 - distance) * 100
    timestamp = datetime.fromtimestamp(metadata['timestamp'])

    print(f"{i+1}. [{timestamp.strftime('%H:%M:%S')}] Relevance: {relevance:.1f}%")
    print(f"   Event: {doc}")
    print(f"   Device: {metadata['device']}, Type: {metadata['event_type']}, Severity: {metadata['severity']}")
    print()

## Interactive Section

Try your own vector database experiments here!

In [None]:
# Your code here
# Experiment with different queries and log patterns


## Next Steps

- Full code: [Chapter 35 on GitHub](https://github.com/eduardd76/AI_for_networking_and_security_engineers/tree/main/CODE/Volume-3-Production-Systems/Chapter-35-Vector-Databases)
- Learn more: [vExpertAI.com](https://vexpertai.com)
- Author: Eduard Dulharu ([@eduardd76](https://github.com/eduardd76))