In [None]:
pip install -U langchain langchain-community faiss-cpu ollama openai tiktoken


In [None]:
pip install notebook ipywidgets


In [1]:
# Cell 1: Imports and paths
import json
from pathlib import Path
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain.docstore.document import Document
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

import os

# Location of the pre-chunked JSONL corpus. The original workstation path stays
# the default; set the CYOTE_CHUNK_FILE environment variable to run this notebook
# from another machine or checkout without editing the cell.
CHUNK_FILE = Path(
    os.environ.get(
        "CYOTE_CHUNK_FILE",
        r"C:\Projects\cyote_tool_wiz\data\chunks\RAG_Ready_Test_chunks.jsonl",
    )
)
# Fail fast with a readable message before any loading or embedding work starts.
assert CHUNK_FILE.exists(), f"Chunk file not found: {CHUNK_FILE}"


In [2]:
# Cell 2: Load chunks
# The chunk file is JSON Lines: one JSON object per line.
with open(CHUNK_FILE, "r", encoding="utf-8") as f:
    # Skip blank/whitespace-only lines (e.g. a stray empty line at the end of the
    # file) so json.loads is never handed an empty string.
    raw_chunks = [json.loads(line) for line in f if line.strip()]

# Wrap every chunk in a Document. "content", "tool_name" and "section" are
# required keys (a missing one raises KeyError); the two ATT&CK lists are
# optional and default to an empty list.
documents = [
    Document(
        page_content=chunk["content"],
        metadata={
            "tool_name": chunk["tool_name"],
            "section": chunk["section"],
            "tactics_supported": chunk.get("tactics_supported", []),
            "techniques_supported": chunk.get("techniques_supported", [])
        }
    )
    for chunk in raw_chunks
]

print(f"✅ Loaded {len(documents)} chunks for retrieval.")


✅ Loaded 179 chunks for retrieval.


In [4]:
# Cell 3: Build FAISS vector store using embedding-compatible Ollama model with progress bar
from tqdm.notebook import tqdm
from langchain_community.vectorstores import FAISS

# ✅ Initialize Ollama embedding model
embedding = OllamaEmbeddings(model="nomic-embed-text")

# 🧠 Extract raw text and metadata from documents (parallel lists, same order)
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]

# 🚀 Generate embeddings with progress bar
# Corpus text goes through embed_documents, the Embeddings method meant for stored
# documents; embed_query is reserved for search queries, and an embedding backend
# may treat the two differently. One text per call keeps the per-chunk progress bar.
print("🔍 Generating embeddings...")
vectors = []
for text in tqdm(texts, desc="Embedding Chunks"):
    vectors.append(embedding.embed_documents([text])[0])

# 🏗️ Build FAISS index from the precomputed embeddings
# from_embeddings expects (text, vector) pairs; the embedding object is still
# passed so the store can embed incoming queries at search time.
print("🔧 Building FAISS index...")
text_embedding_pairs = list(zip(texts, vectors))
vectorstore = FAISS.from_embeddings(text_embedding_pairs, embedding=embedding, metadatas=metadatas)

# 🔍 Create retriever interface (top 5 chunks per query)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

print("✅ Vector index built and retriever ready.")

🔍 Generating embeddings...


Embedding Chunks:   0%|          | 0/179 [00:00<?, ?it/s]

🔧 Building FAISS index...
✅ Vector index built and retriever ready.


In [9]:
from langchain.callbacks.base import BaseCallbackHandler
from IPython.display import display, Markdown
from IPython.display import clear_output

class NotebookStreamHandler(BaseCallbackHandler):
    """Callback handler that shows a streamed LLM answer as Markdown in the cell output.

    Every new token is appended to ``self.tokens`` and the cell output is then
    replaced with the Markdown rendering of all tokens received so far.
    """

    def __init__(self):
        # Tokens received so far, in arrival order.
        self.tokens = []

    def _render(self) -> None:
        """Replace the current cell output with the text accumulated so far."""
        # wait=True defers clearing until the replacement output is ready.
        clear_output(wait=True)
        display(Markdown("".join(self.tokens)))

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Record ``token`` and refresh the rendered answer."""
        self.tokens.append(token)
        self._render()


In [12]:
# Cell 4: Setup LLM-powered Q&A with streaming using ChatOllama
from langchain_community.chat_models import ChatOllama
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA

# 🧠 Chat model served through Ollama, with a stdout streaming callback attached
llm = ChatOllama(model="gemma3:12b", callbacks=[StreamingStdOutCallbackHandler()])  # ensure this is a chat-friendly model

# 🔍 RetrievalQA wired to the retriever; return_source_documents=True makes the
# result expose the retrieved chunks under 'source_documents'
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
)



In [27]:
# Cell 5: Ask a CyOTE-aware threat hunt question (Havex context)
query = (
    "Tell me about havex malware. If you don't have access to information about it tell me. What are the MITRE ATT&CK for ICS Techniques associated with it? Which OT protocols does it use?"
)

result = qa_chain.invoke({"query": query})  # use `.invoke()` for streaming-enabled chains

# ✅ After answer is streamed, print sources
# Metadata is read with .get() and the same fallbacks as the other question cells,
# so a chunk without tool_name/section cannot abort the listing with a KeyError.
print("\n📚 Sources:")
for doc in result['source_documents']:
    print(f"→ {doc.metadata.get('tool_name', 'Unknown Tool')} ({doc.metadata.get('section', 'unknown section')})")



Okay, let's break down what's known about Havex, its associated MITRE ATT&CK techniques for ICS/OT, and the OT protocols it's known to interact with. I'll also address the limitations regarding my current information access.

**What is Havex? (And What We *Do* Know - With Caveats)**

Havex is a sophisticated, modular malware platform primarily targeting Industrial Control Systems (ICS) and Operational Technology (OT) environments.  Here's a summary of what has been publicly reported, bearing in mind that information has been fragmented and sometimes difficult to fully confirm due to the nature of these attacks:

*   **Discovery & Attribution (Uncertainties):** Havex was initially identified in 2016. Its origins are murky. While initially attributed to Russia/APT28, this attribution is not universally accepted and is controversial within the security community. It's possible multiple actors are using or have used the framework.  The uncertainty in attribution makes it challenging to def

In [None]:
# Cell 1: Imports and paths
import json
from pathlib import Path
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain.docstore.document import Document
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

# 🧠 New: Custom loader
from utils.db_loader import load_jsonl_chunks, load_db_entries

# Input sources, given as relative paths (resolved against the current working directory)
CHUNK_FILE = Path("data", "chunks", "RAG_Ready_Test_chunks.jsonl")
DB_PATH = Path("data", "tool_knowledge.db")

# Stop immediately if either input is missing
assert CHUNK_FILE.exists(), f"❌ Chunk file not found: {CHUNK_FILE}"
assert DB_PATH.exists(), f"❌ Database not found: {DB_PATH}"


In [None]:
# Cell 2: Load documents from .jsonl and tool_knowledge.db
# Both loaders are imported from utils.db_loader and are not shown in this notebook.
# NOTE(review): presumably each returns a list of Document objects — confirm, since
# the `+` below and the later .page_content / .metadata accesses rely on that.
jsonl_docs = load_jsonl_chunks(CHUNK_FILE)
db_docs = load_db_entries(DB_PATH)

# 🧩 Combine them for embedding (JSONL documents first, then database entries)
documents = jsonl_docs + db_docs
print(f"📚 Total documents: {len(documents)}")


In [None]:
# Cell 3: Build FAISS vector store using embedding-compatible Ollama model with progress bar
from tqdm.notebook import tqdm
from langchain_community.vectorstores import FAISS

embedding = OllamaEmbeddings(model="nomic-embed-text")

# Parallel lists: texts[i] and metadatas[i] belong to the same document.
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]

# Corpus text goes through embed_documents, the Embeddings method meant for stored
# documents; embed_query is reserved for search queries, and an embedding backend
# may treat the two differently. One text per call keeps the per-chunk progress bar.
print("🔍 Generating embeddings...")
vectors = []
for text in tqdm(texts, desc="Embedding Chunks"):
    vectors.append(embedding.embed_documents([text])[0])

# FAISS.from_embeddings takes an iterable of (text, vector) pairs as its first
# argument and the embedding object as `embedding` (used to embed queries at
# search time). Passing the bare vector list and the text list is not accepted.
print("🔧 Building FAISS index...")
text_embedding_pairs = list(zip(texts, vectors))
vectorstore = FAISS.from_embeddings(text_embedding_pairs, embedding=embedding, metadatas=metadatas)

# Retrieve the 5 closest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
print("✅ Vector index built and retriever ready.")


In [None]:
# Cell 4: Setup LLM-powered Q&A (streaming optional)
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Ollama-served model with a stdout streaming callback attached
llm = Ollama(model="llama3", callbacks=[StreamingStdOutCallbackHandler()])  # Change to "mistral" or another if needed

# Retrieval-backed QA chain; return_source_documents=True makes the result expose
# the retrieved chunks under 'source_documents'
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
)

print("🧠 QA chain ready.")


In [None]:
# Cell 5: Ask a contextual threat hunt question
query = "Which CyOTE tools can help in a threat hunt involving the FrostyGoop malware and the associated MITRE ATT&CK for ICS tactics and techniques?"

# Run the chain through `.invoke()` (not `qa_chain(query)`); the result holds the
# answer under "result" and the retrieved chunks under "source_documents"
result = qa_chain.invoke({"query": query})

# Heading, a blank line, then the answer text
print("🔎 Answer:\n", result["result"], sep="\n")

# One line per retrieved chunk, with placeholders for missing metadata
print("\n📚 Sources:")
for doc in result["source_documents"]:
    tool_name = doc.metadata.get("tool_name", "Unknown Tool")
    section = doc.metadata.get("section", "unknown section")
    print(f"→ {tool_name} ({section})")


In [None]:
# Follow-up question. Each invoke() call is passed only the query text, so the
# malware is named explicitly; a bare "this malware" would give retrieval nothing
# to match on.
query = "What observable types are most useful for detecting the FrostyGoop malware?"
result = qa_chain.invoke({"query": query})

print("🔎 Answer:\n")
print(result['result'])

# List the chunks the answer was grounded on, with placeholders for missing metadata.
print("\n📚 Sources:")
for doc in result['source_documents']:
    print(f"→ {doc.metadata.get('tool_name', 'Unknown Tool')} ({doc.metadata.get('section', 'unknown section')})")


In [None]:
# Cell 7: Browse document metadata (for debugging or fine-tuning)
from collections import Counter

# Collect the tool name of every document ("unknown" when the metadata has none),
# then report how many documents each tool contributes.
tool_names = [entry.metadata.get("tool_name", "unknown") for entry in documents]
tool_counts = Counter(tool_names)
print(f"🛠️ Tools Represented: {tool_counts}")
