# CVE to CWE Linker
This notebook provides a tool to link a CVE ID to its associated CWEs and display their names and descriptions using the local CVE and CWE databases.

In [1]:
import os
import json
import re
import math
import shutil
import subprocess
from pathlib import Path

import numpy as np
import scipy.sparse as sp
import xml.etree.ElementTree as ET

# Project-relative paths (assumes you run the notebook from the repo root)
# PROJECT_ROOT is the kernel's current working directory, so both data paths
# below only resolve correctly when the notebook is launched from the repo root.
PROJECT_ROOT = Path.cwd()
# CWE catalog XML that parse_cwe_database() reads.
CWE_XML_PATH = PROJECT_ROOT / "data" / "cwec_v4.19.xml"
# Root of the cvelistV5 tree; get_cve_path() appends <year>/<bucket>/<CVE-ID>.json.
CVE_BASE_DIR = PROJECT_ROOT / "data" / "cvelistV5-main" / "cves"

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print(f"Project root: {PROJECT_ROOT}")
print(f"CWE Database: {CWE_XML_PATH}")
print(f"CVE Database Directory: {CVE_BASE_DIR}")

Project root: /home/dnfy/Desktop/Fortiss
CWE Database: /home/dnfy/Desktop/Fortiss/data/cwec_v4.19.xml
CVE Database Directory: /home/dnfy/Desktop/Fortiss/data/cvelistV5-main/cves


## 1. Parse CWE Database
We parse the CWE XML file to create a mapping from CWE ID to its name and description.

In [2]:
def parse_cwe_database(xml_path: Path):
    """Read the CWE catalog XML and build two views of its weaknesses.

    Returns a pair ``(cwe_map, cwe_corpus)``:
    - cwe_map: "CWE-<id>" -> {"name", "description"} for direct lookups
    - cwe_corpus: list of {"id", "name", "description", "text"} records, where
      ``text`` is the single string fed to the retriever

    The catalog holds roughly a thousand weaknesses, so the whole file is
    parsed in memory.
    """
    xml_path = Path(xml_path)
    root = ET.parse(xml_path).getroot()

    # ElementTree spells a namespaced tag as "{uri}Tag"; pick the matching
    # query strings once instead of branching inside the loop.
    if "}" in root.tag:
        ns = {"cwe": root.tag.split("}")[0].strip("{")}
        weakness_query = ".//cwe:Weakness"
        description_query = "cwe:Description"
    else:
        ns = {}
        weakness_query = ".//Weakness"
        description_query = "Description"

    cwe_map = {}
    cwe_corpus = []

    for node in root.findall(weakness_query, ns):
        name = node.get("Name")
        cwe_id = f"CWE-{node.get('ID')}"

        description = ""
        description_node = node.find(description_query, ns)
        if description_node is not None:
            description = (description_node.text or "").strip()
        if not description:
            description = "No description available."

        cwe_map[cwe_id] = {"name": name, "description": description}
        cwe_corpus.append(
            {
                "id": cwe_id,
                "name": name,
                "description": description,
                # Retrieval text: id, name and description in one dense line
                "text": f"{cwe_id}: {name}. {description}",
            }
        )

    print(f"Successfully parsed {len(cwe_map)} CWEs from {xml_path.name}.")
    return cwe_map, cwe_corpus

# Parse the catalog once: cwe_map backs the ID -> name/description lookups used
# when printing results, cwe_corpus is the document list the retriever indexes.
cwe_map, cwe_corpus = parse_cwe_database(CWE_XML_PATH)

Successfully parsed 969 CWEs from cwec_v4.19.xml.


## 1b. Hybrid RAG: Build a Retriever Index (offline fallback)

This notebook supports two retriever backends:
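- **Preferred**: `sentence-transformers/all-mpnet-base-v2` embeddings, compared with an in-memory cosine similarity (used automatically if `sentence-transformers` is installed; a FAISS/Chroma index is optional and not needed for a corpus of this size).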
- **Fallback (works offline here)**: lightweight **TF‑IDF + cosine similarity** using `scipy`.

The output is the same: given a CVE description, retrieve the top‑k closest CWE definitions.

In [3]:
def _normalize_text(s: str) -> str:
    s = (s or "").lower()
    # keep letters/digits, turn the rest into spaces
    s = re.sub(r"[^a-z0-9]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


def _tokenize(s: str):
    """Split ``s`` into normalised tokens (simple, fast and deterministic).

    The text is normalised first, then split on single spaces; empty pieces
    are discarded, so blank input yields an empty list.
    """
    pieces = _normalize_text(s).split(" ")
    return list(filter(None, pieces))


def build_tfidf_index(corpus_texts):
    """Turn a list of documents into a row-normalised TF-IDF sparse matrix.

    Returns: (X, vocab)
      - X: sparse matrix of shape (n_docs, n_terms); rows are scaled to unit
        L2 norm (all-zero rows are left as zeros)
      - vocab: token -> column index, columns assigned in sorted token order
    """
    n_docs = len(corpus_texts)

    # Pass 1: term counts per document plus corpus-wide document frequency.
    per_doc_counts = []
    doc_freq = {}
    for document in corpus_texts:
        term_counts = {}
        for token in _tokenize(document):
            term_counts[token] = term_counts.get(token, 0) + 1
        per_doc_counts.append(term_counts)
        for token in term_counts:
            doc_freq[token] = doc_freq.get(token, 0) + 1

    # Sorted token order makes the column layout deterministic.
    vocab = {token: col for col, token in enumerate(sorted(doc_freq))}
    n_terms = len(vocab)

    # Pass 2: (row, col, value) triplets for the raw term-frequency matrix.
    row_ids, col_ids, values = [], [], []
    for row, term_counts in enumerate(per_doc_counts):
        for token, count in term_counts.items():
            row_ids.append(row)
            col_ids.append(vocab[token])
            values.append(float(count))

    tf_matrix = sp.csr_matrix(
        (values, (row_ids, col_ids)), shape=(n_docs, n_terms), dtype=np.float32
    )

    # Smoothed inverse document frequency: idf = log((1+n)/(1+df)) + 1
    idf = np.empty(n_terms, dtype=np.float32)
    for token, col in vocab.items():
        idf[col] = math.log((1.0 + n_docs) / (1.0 + doc_freq[token])) + 1.0

    weighted = tf_matrix.multiply(idf)

    # Unit-length rows turn a dot product into cosine similarity; zero rows
    # get a divisor of 1 so they stay zero instead of producing 0/0.
    row_norm = np.sqrt(weighted.multiply(weighted).sum(axis=1)).A1
    row_norm[row_norm == 0] = 1.0
    normalised = sp.diags(1.0 / row_norm).dot(weighted)

    return normalised, vocab


def tfidf_query(text: str, vocab, idf_vec):
    """Project a query string into the TF-IDF space of an existing index.

    Tokens that are not in ``vocab`` are ignored. The result is a sparse
    1 x len(vocab) row: all zeros when no query token is known, otherwise
    IDF-weighted term counts scaled to unit L2 norm.
    """
    n_terms = len(vocab)

    term_counts = {}
    for token in _tokenize(text):
        if token not in vocab:
            continue
        term_counts[token] = term_counts.get(token, 0) + 1

    # Nothing in common with the index vocabulary: return an empty row.
    if not term_counts:
        return sp.csr_matrix((1, n_terms), dtype=np.float32)

    col_ids = [vocab[token] for token in term_counts]
    values = [float(count) for count in term_counts.values()]
    row_ids = [0] * len(col_ids)

    raw_tf = sp.csr_matrix((values, (row_ids, col_ids)), shape=(1, n_terms), dtype=np.float32)
    weighted = raw_tf.multiply(idf_vec)

    norm = np.sqrt(weighted.multiply(weighted).sum(axis=1)).A1
    norm[norm == 0] = 1.0
    return weighted.multiply(1.0 / norm[0])


# Build the fallback retriever index now
# _cwe_texts is also what the optional SBERT backend embeds.
_cwe_texts = [c["text"] for c in cwe_corpus]
X_tfidf, vocab = build_tfidf_index(_cwe_texts)

# Precompute idf vector in vocab order for queries
idf_vec = np.ones(len(vocab), dtype=np.float32)
# build_tfidf_index() returns only (X, vocab), so the document frequencies are
# re-derived here from the (small) corpus.
# NOTE: the formula below has to stay identical to the one inside
# build_tfidf_index(), otherwise queries and documents are weighted differently.
_df = {tok: 0 for tok in vocab.keys()}
for text in _cwe_texts:
    # set(): each token counts once per document (document frequency, not term frequency)
    seen = set(_tokenize(text))
    for tok in seen:
        if tok in _df:
            _df[tok] += 1
n_docs = len(_cwe_texts)
for tok, c in vocab.items():
    # smoothed idf = log((1+n)/(1+df)) + 1, written to the token's column index
    idf_vec[c] = math.log((1.0 + n_docs) / (1.0 + _df[tok])) + 1.0

print(f"TF-IDF index built: X={X_tfidf.shape}, vocab={len(vocab)} tokens")


def retrieve_cwe_candidates(query_text: str, top_k: int = 5):
    """Retriever step: rank CWE entries against ``query_text``.

    Default backend: TF-IDF cosine (works offline). When sentence-transformers
    is installed, a later definition replaces this one with SBERT cosine.

    Returns up to ``top_k`` dicts with keys cwe_id, score, name and
    description, best match first.
    """
    query_vec = tfidf_query(query_text, vocab, idf_vec)

    # Index rows and the query are unit-normalised, so the product is cosine similarity.
    scores = (X_tfidf @ query_vec.T).toarray().ravel()
    if scores.size == 0:
        return []

    best_first = np.argsort(-scores)[:top_k]
    return [
        {
            "cwe_id": cwe_corpus[int(i)]["id"],
            "score": float(scores[int(i)]),
            "name": cwe_corpus[int(i)]["name"],
            "description": cwe_corpus[int(i)]["description"],
        }
        for i in best_first
    ]


# --- Optional: preferred semantic retriever (SBERT) ---
# RETRIEVER_BACKEND records which implementation of retrieve_cwe_candidates is
# active: "tfidf" (the definition above) or "sbert" (the override below).
RETRIEVER_BACKEND = "tfidf"

try:
    # Optional dependency, imported here on purpose so the notebook still runs
    # without it.
    from sentence_transformers import SentenceTransformer

    _sbert_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    # Embeddings are L2-normalised, so a dot product below is cosine similarity.
    _cwe_emb = _sbert_model.encode(_cwe_texts, normalize_embeddings=True, show_progress_bar=True)
    RETRIEVER_BACKEND = "sbert"

    def retrieve_cwe_candidates(query_text: str, top_k: int = 5):
        """Retriever step (SBERT): embed query, cosine against CWE embeddings.

        Same return shape as the TF-IDF version: up to ``top_k`` dicts with
        keys cwe_id, score, name and description, best match first.
        """
        q = _sbert_model.encode([query_text], normalize_embeddings=True)[0]
        sims = _cwe_emb @ q
        top_idx = np.argsort(-sims)[:top_k]
        results = []
        for i in top_idx:
            c = cwe_corpus[int(i)]
            results.append({
                "cwe_id": c["id"],
                "score": float(sims[int(i)]),
                "name": c["name"],
                "description": c["description"],
            })
        return results

except ImportError:
    # sentence-transformers not installed: the expected offline case, keep TF-IDF silently.
    pass
except Exception as exc:
    # Package present but the model could not be loaded or the corpus could not
    # be encoded. Still fall back to TF-IDF, but say why instead of hiding it.
    print(f"SBERT retriever unavailable ({type(exc).__name__}: {exc}); falling back to TF-IDF.")

print(f"Retriever backend: {RETRIEVER_BACKEND}")

TF-IDF index built: X=(969, 3681), vocab=3681 tokens
Retriever backend: tfidf


## 2. Link CVE to CWE
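Functions to locate a CVE's JSON record on disk, extract its explicit CWE IDs and description, and fall back to description-based retrieval of candidate CWEs.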

In [None]:
def normalize_cve_id(cve_input: str):
    """Pull the first CVE identifier out of an ID, URL or free text.

    Matching is case-insensitive; the result is upper-cased
    (e.g. "CVE-2024-0001"). Returns None when no identifier is present.
    """
    haystack = (cve_input or "").upper()
    found = re.search(r"(CVE-\d{4}-\d+)", haystack)
    if found is None:
        return None
    return found.group(1)


def get_cve_path(cve_id: str) -> Path | None:
    """Construct the CVE JSON path for cvelistV5 layout: <year>/<prefix>xxx/CVE-YYYY-NNNN.json"""
    cve_id = (cve_id or "").upper()
    match = re.match(r"CVE-(\d{4})-(\d+)$", cve_id)
    if not match:
        return None

    year = match.group(1)
    number = match.group(2)

    # Directory is the number with last 3 digits replaced by 'xxx'
    # 0001 -> 0xxx, 1234 -> 1xxx, 12345 -> 12xxx
    if len(number) < 4:
        dir_name = "0xxx"
    else:
        dir_name = number[:-3] + "xxx"

    return CVE_BASE_DIR / year / dir_name / f"{cve_id}.json"


def read_cve_record(cve_input: str):
    """Load the CVE JSON record for a CVE ID, or for a URL/text containing one.

    Returns:
        (cve_id, data_dict) on success.
        (cve_id, error_str) on failure; ``cve_id`` is None when no CVE ID
        could be found in ``cve_input``.

    Failures are always reported through the error string. That includes a
    record file that cannot be read, is not valid UTF-8/JSON, or does not
    contain a JSON object, so callers only need the ``isinstance(..., str)``
    check.
    """
    cve_id = normalize_cve_id(cve_input)
    if not cve_id:
        return None, "Invalid CVE ID or link."

    cve_path = get_cve_path(cve_id)
    if not cve_path:
        return cve_id, f"Could not map CVE to path: {cve_id}"

    if not cve_path.exists():
        return cve_id, f"CVE file not found at {cve_path}"

    try:
        with cve_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # OSError covers permission problems or the path being a directory.
        return cve_id, f"Could not read CVE file {cve_path}: {exc}"

    # Downstream helpers call .get() on the record, so anything but a JSON
    # object is unusable.
    if not isinstance(data, dict):
        return cve_id, f"Unexpected CVE record format in {cve_path} (expected a JSON object)"

    return cve_id, data


def extract_cves_explicit_cwes(cve_data: dict):
    """Collect the CWE IDs listed in the CNA container of a CVE V5 record.

    Reads containers.cna.problemTypes[*].descriptions[*].cweId and keeps only
    string values that start with "CWE-". Returns them de-duplicated and
    sorted (plain string order).
    """
    cna = cve_data.get("containers", {}).get("cna", {})

    found = set()
    for problem_type in cna.get("problemTypes", []):
        raw_ids = (entry.get("cweId") for entry in problem_type.get("descriptions", []))
        found.update(
            value for value in raw_ids
            if isinstance(value, str) and value.startswith("CWE-")
        )
    return sorted(found)


def extract_cve_description(cve_data: dict) -> str:
    """Best-effort lookup of a human-readable description in a CVE V5 record.

    Prefers the first containers.cna.descriptions entry whose lang is exactly
    "en" and whose value is non-empty; otherwise takes the first entry with
    any non-empty value. Returns the stripped text, or "" if there is none.
    """
    descs = cve_data.get("containers", {}).get("cna", {}).get("descriptions", [])

    # Lazy generators: entries are inspected in order, only as far as needed.
    english = (d for d in descs if d.get("lang") == "en" and d.get("value"))
    any_language = (d for d in descs if d.get("value"))

    chosen = next(english, None)
    if chosen is None:
        chosen = next(any_language, None)
    if chosen is None:
        return ""
    return str(chosen.get("value")).strip()


def retrieve_cwes_for_cve(cve_data: dict, top_k: int = 5, use_hyde: bool = False, ollama_model: str = "mistral:7b-instruct"):
    """Hybrid step: report explicit CWEs when present, otherwise retrieve by description.

    Args:
        cve_data: parsed CVE V5 record.
        top_k: number of retrieved candidates to return.
        use_hyde: (Phase 3 - HyDE) when True and the record has no explicit
            CWE, generate a hypothetical CWE definition and retrieve with
            that text instead of the raw description.
        ollama_model: model name passed to the HyDE generator.

    Returns a dict with keys:
        mode: "explicit" (record lists CWEs), "none" (no CWEs and no
            description), "rag_hyde" (retrieved with a generated HyDE
            document) or "rag" (retrieved with the CVE description, including
            when HyDE was requested but produced nothing).
        cve_description, explicit_cwes, retrieved, hyde_document.
    """
    explicit = extract_cves_explicit_cwes(cve_data)
    desc = extract_cve_description(cve_data)

    if explicit:
        # Candidates are still retrieved so they can be compared with the
        # explicit labels; HyDE is not used on this path.
        return {
            "mode": "explicit",
            "cve_description": desc,
            "explicit_cwes": explicit,
            "retrieved": retrieve_cwe_candidates(desc, top_k=top_k) if desc else [],
            "hyde_document": None,
        }

    if not desc:
        return {"mode": "none", "cve_description": "", "explicit_cwes": [], "retrieved": [], "hyde_document": None}

    # HyDE step - generate hypothetical CWE definition
    query_for_retrieval = desc
    hyde_document = None

    if use_hyde:
        hyde_def, _ = generate_hyde_document(desc, model=ollama_model)
        if hyde_def:
            hyde_document = hyde_def
            query_for_retrieval = hyde_def
            print(f"[HyDE] CVE: {desc[:80]}...")
            print(f"[HyDE] Generated CWE: {hyde_def[:80]}...")

    return {
        # Label the mode by what was actually used for retrieval: if HyDE was
        # requested but generation failed, this was a plain "rag" lookup.
        "mode": "rag_hyde" if hyde_document else "rag",
        "cve_description": desc,
        "explicit_cwes": [],
        "retrieved": retrieve_cwe_candidates(query_for_retrieval, top_k=top_k),
        "hyde_document": hyde_document,
    }

## 2b. HyDE Step (NEW - Phase 3): Hypothetical Document Embeddings

**HyDE (Hypothetical Document Embeddings)** approach:
- Instead of abstracting the CVE, generate a **hypothetical CWE definition**
- LLM writes what a CWE entry would look like for this vulnerability
- Use that generated definition for retrieval

**Why this should work better than abstraction:**
- Matches CWE writing style and vocabulary
- Uses CWE-specific terminology
- Same document type (definition → definition) instead of (description → definition)

**Example:**
- CVE: "Buffer overflow in libpng 1.2.3 allows remote code execution"
- HyDE generates: "The product writes data past the end of a buffer, allowing attackers to execute arbitrary code..."
- This matches how CWE-787 is actually written!

## 2c. Reasoner Step (Optional): Local LLM

If you have a local LLM runner (e.g., **Ollama**) you can let it choose the best CWE among the retrieved candidates.

Prompt template:

"Given this vulnerability description and these 5 potential weakness definitions, which one fits best? If none fit well, look at the parents of these CWEs."

In [None]:
def build_hyde_prompt(cve_description: str) -> str:
    """Compose the HyDE prompt asking an LLM for a CWE-style definition.

    The CVE description is stripped and embedded under "VULNERABILITY
    INSTANCE:"; a missing description is rendered as "(missing)". The prompt
    ends with "WEAKNESS DEFINITION:" so the model continues with the text only.
    """
    instance_text = cve_description.strip() if cve_description else "(missing)"

    prompt_lines = [
        "You are a CWE (Common Weakness Enumeration) author writing weakness definitions.",
        "Return ONLY the weakness definition text. Do not add explanations or markdown.",
        "",
        "TASK: Given this vulnerability instance, write a CWE-style weakness definition.",
        "",
        "VULNERABILITY INSTANCE:",
        instance_text,
        "",
        "INSTRUCTIONS:",
        "- Write in CWE style: describe the weakness TYPE, not the specific instance",
        "- Start with 'The product...' or 'The software...' (like real CWE definitions)",
        "- Remove specific product names and versions",
        "- Focus on what the SOFTWARE does wrong (not what attackers do)",
        "- Use CWE terminology: 'improper validation', 'insufficient verification', etc.",
        "- Keep it 2-4 sentences",
        "",
        "Example:",
        "  CVE: 'SQL injection in login.php via username parameter'",
        "  CWE-style: 'The product constructs SQL queries using externally-influenced input without proper neutralization of special elements, allowing attackers to modify the intended SQL command structure.'",
        "",
        "WEAKNESS DEFINITION:",
    ]
    return "\n".join(prompt_lines)


def _strip_markdown_fences(text: str) -> str:
    """Drop ``` fence lines when the model wrapped its whole answer in a code block."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        kept = [line for line in cleaned.split("\n") if not line.strip().startswith("```")]
        cleaned = "\n".join(kept)
    return cleaned.strip()


def generate_hyde_document(cve_description: str, model: str = "mistral:7b-instruct", timeout_s: int = 60):
    """Use Ollama to generate hypothetical CWE definition from CVE description (HyDE).

    Args:
        cve_description: CVE text to abstract into a CWE-style definition.
        model: Ollama model name passed to ``ollama run``.
        timeout_s: seconds to wait for the model before giving up.

    Returns: (hyde_definition, raw_output), or (None, None) when Ollama is not
    installed, exits non-zero, times out or cannot be run. ``hyde_definition``
    is ``raw_output`` with surrounding markdown code fences removed.
    """
    # Probe the PATH directly rather than calling ollama_available(): that
    # helper is defined in a later cell, so referencing it from this cell
    # raises NameError on a fresh "Restart Kernel -> Run All".
    if shutil.which("ollama") is None:
        return None, None

    prompt = build_hyde_prompt(cve_description)

    try:
        # Non-interactive: the prompt goes in via stdin, the answer comes back on stdout.
        proc = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_s,
        )
        if proc.returncode != 0:
            print(f"Ollama error: {proc.stderr.decode('utf-8', errors='ignore')}")
            return None, None

        raw_output = proc.stdout.decode("utf-8", errors="ignore").strip()

        # Clean up: if the LLM added markdown code blocks, keep only the content
        hyde_def = _strip_markdown_fences(raw_output)

        return hyde_def, raw_output

    except subprocess.TimeoutExpired:
        print("Ollama timeout during HyDE generation")
        return None, None
    except Exception as e:
        # Best effort: HyDE is optional, so any other failure just disables it.
        print(f"Ollama HyDE generation failed: {e}")
        return None, None


# Test example (only runs if ollama is available)
# Uses shutil.which() for the same reason as above: ollama_available() does not
# exist yet when this cell runs in notebook order.
if shutil.which("ollama") is not None:
    test_cve_desc = "A buffer overflow vulnerability in libpng version 1.2.3 allows remote attackers to execute arbitrary code via a crafted PNG file."
    hyde_def, raw = generate_hyde_document(test_cve_desc)

    if hyde_def:
        print("=== HyDE TEST ===")
        print("CVE Description:")
        print(test_cve_desc)
        print("\nGenerated CWE-style Definition:")
        print(hyde_def)
        print("\n=== END TEST ===")
else:
    print("Ollama not available; HyDE will be skipped.")

In [5]:
def build_reasoner_prompt(cve_description: str, candidates: list[dict], top_k: int = 5) -> str:
    """Assemble one prompt for an instruction-tuned LLM (e.g., mistral:7b-instruct).

    The prompt contains the stripped CVE description ("(missing)" when
    absent), the first ``top_k`` candidates as numbered "<id> — <name>" /
    "Definition: ..." pairs, and a task block that asks for a JSON answer.
    """
    description_line = cve_description.strip() if cve_description else "(missing)"

    header = [
        "You are a security analyst.",
        "Return ONLY valid JSON. Do not wrap it in markdown.",
        "",
        "VULNERABILITY DESCRIPTION:",
        description_line,
        "",
        f"TOP {top_k} RETRIEVED CWE DEFINITIONS:",
    ]

    candidate_lines = []
    for rank, cand in enumerate(candidates[:top_k], start=1):
        candidate_lines += [
            "",
            f"{rank}. {cand['cwe_id']} — {cand.get('name','')}",
            f"Definition: {cand.get('description','')}",
        ]

    task = (
        "Task: Given the vulnerability description and the candidate CWE definitions, choose the SINGLE best CWE. "
        "If none fit well, output best_cwe as 'NONE'. "
        "If NONE, suggest which parent(s) to check and why (in the rationale).\n\n"
        "Output schema (JSON): {\"best_cwe\": <CWE-XXX or NONE>, \"confidence\": <0..1>, \"rationale\": <string>}"
    )

    return "\n".join(header + candidate_lines + ["", task])


def ollama_available() -> bool:
    """Report whether an ``ollama`` executable can be found on the PATH."""
    executable = shutil.which("ollama")
    return executable is not None


def run_ollama_reasoner(prompt: str, model: str = "mistral:7b-instruct", timeout_s: int = 180):
    """Send ``prompt`` to a local Ollama model and return its reply.

    Args:
        prompt: full prompt text, passed to the model on stdin.
        model: Ollama model name passed to ``ollama run``.
        timeout_s: seconds to wait for the model before giving up.

    Returns the model's stdout (decoded and stripped), or None when Ollama is
    not installed, cannot be started, exits non-zero or exceeds ``timeout_s``.
    """
    if not ollama_available():
        return None

    try:
        # Non-interactive: pass prompt via stdin
        proc = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        # Same contract as the HyDE generator: a slow model is a soft failure,
        # reported as None so the caller can fall back to showing the prompt.
        print(f"Ollama reasoner timed out after {timeout_s}s")
        return None
    except OSError as exc:
        # e.g. the executable vanished or is not runnable
        print(f"Ollama reasoner could not be started: {exc}")
        return None

    if proc.returncode != 0:
        print(proc.stderr.decode("utf-8", errors="ignore"))
        return None

    return proc.stdout.decode("utf-8", errors="ignore").strip()

## 3. Query Tool
Input a CVE link or ID to see the results.

In [None]:
def pretty_print_cwe(cwe_id: str):
    """Print one CWE as a bullet with its name and description.

    IDs missing from ``cwe_map`` are printed with a "(not found in CWE
    catalog)" note instead.
    """
    info = cwe_map.get(cwe_id)
    if info:
        print(f"- {cwe_id}: {info['name']}")
        print(f"  Description: {info['description']}")
    else:
        print(f"- {cwe_id}: (not found in CWE catalog)")


def lookup_cve_hybrid(
    cve_input: str,
    top_k: int = 5,
    run_llm: bool = False,
    use_hyde: bool = False,
    ollama_model: str = "mistral:7b-instruct",
):
    """Print a hybrid CVE->CWE report, with optional HyDE (Phase 3).

    - Explicit CWE(s) found in the CVE record are listed with their names.
    - With use_hyde=True, a hypothetical CWE definition may be generated and
      used as the retrieval query.
    - The top-k retrieved candidates are listed with their similarity scores.
    - With run_llm=True the reasoner prompt is sent to a local LLM (Ollama);
      otherwise, or if the LLM is unavailable, the prompt itself is printed.

    Everything is printed; the function returns None.
    """
    cve_id, record = read_cve_record(cve_input)
    print(f"### Hybrid results for {cve_id} ###\n")

    # read_cve_record signals failure by returning an error string.
    if isinstance(record, str):
        print(f"Error: {record}")
        return

    desc = extract_cve_description(record)
    if desc:
        print("CVE description:")
        print(desc)
        print("")

    out = retrieve_cwes_for_cve(record, top_k=top_k, use_hyde=use_hyde, ollama_model=ollama_model)

    hyde_text = out.get("hyde_document")
    if hyde_text:
        print("HyDE-generated CWE definition:")
        print(hyde_text)
        print("")

    explicit_ids = out["explicit_cwes"]
    if explicit_ids:
        print("Explicit CWEs in CVE record:")
        for cwe_id in explicit_ids:
            pretty_print_cwe(cwe_id)
        print("")

    candidates = out["retrieved"]
    if not candidates:
        print("No candidates retrieved (missing description or empty query).")
        return

    print(f"Retriever top-{top_k} CWE candidates (cosine similarity):")
    for cand in candidates:
        print(f"- {cand['cwe_id']} (score={cand['score']:.4f}) — {cand.get('name','')}")
    print("")

    # The reasoner always sees the original CVE description, not the HyDE text.
    prompt = build_reasoner_prompt(desc, candidates, top_k=top_k)

    if not run_llm:
        print("Reasoner prompt (copy/paste into your local LLM):\n")
        print(prompt)
        return

    resp = run_ollama_reasoner(prompt, model=ollama_model)
    if resp is None:
        print("LLM reasoner not available (ollama missing or failed). Showing prompt instead:\n")
        print(prompt)
    else:
        print("LLM reasoner output:\n")
        print(resp)


# Example usage (link or raw CVE ID both work)
# run_llm=True sends the reasoner prompt to Ollama + mistral:7b-instruct; if
# Ollama is missing or fails, the prompt is printed instead.
# use_hyde=True enables Phase 3 HyDE (Hypothetical Document Embeddings). It only
# affects records WITHOUT explicit CWEs: when the record already lists a CWE,
# candidates are retrieved from the plain CVE description.
lookup_cve_hybrid(
    "https://www.cve.org/CVERecord?id=CVE-2024-0001",
    top_k=5,
    run_llm=True,
    use_hyde=True,  # Enable HyDE (ignored when the record has explicit CWEs)
    ollama_model="mistral:7b-instruct",
)

### Hybrid results for CVE-2024-0001 ###

CVE description:
A condition exists in FlashArray Purity whereby a local account intended for initial array configuration remains active potentially allowing a malicious actor to gain elevated privileges.

Explicit CWEs in CVE record:
- CWE-1188: Initialization of a Resource with an Insecure Default
  Description: The product initializes or sets a resource with a default that is intended to be changed by the product's installer, administrator, or maintainer, but the default is not secure.

Retriever top-5 CWE candidates (cosine similarity):
- CWE-496 (score=0.1729) — Public Data Assigned to Private Array-Typed Field
- CWE-489 (score=0.1692) — Active Debug Code
- CWE-648 (score=0.1597) — Incorrect Use of Privileged APIs
- CWE-582 (score=0.1594) — Array Declared Public, Final, and Static
- CWE-129 (score=0.1524) — Improper Validation of Array Index

LLM reasoner output:

{
  "best_cwe": "CWE-648",
  "confidence": 0.9,
  "rationale": "The vulnerab