diff --git a/scripts/es-sarif/.gitignore b/scripts/es-sarif/.gitignore new file mode 100644 index 000000000..d65a23223 --- /dev/null +++ b/scripts/es-sarif/.gitignore @@ -0,0 +1,181 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.env.local +.env.*.local +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be added to the global gitignore or merged into this project gitignore. For a PyCharm +# project, it is recommended to include/store project specific gitignore file(s) within +# the project root. +# https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +.idea/ + +# IDE specific files +.vscode/ +*.swp +*.swo +*~ + +# OS specific files +.DS_Store +.DS_Store? 
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+elastic-start-local/
+mrva/
diff --git a/scripts/es-sarif/Makefile b/scripts/es-sarif/Makefile
new file mode 100644
index 000000000..231691ec4
--- /dev/null
+++ b/scripts/es-sarif/Makefile
@@ -0,0 +1,20 @@
+.PHONY: format check install help compile
+
+check:
+	black --check --diff *.py
+
+compile:
+	python3 -m py_compile *.py
+
+format:
+	black *.py
+
+help:
+	@echo "Available targets:"
+	@echo "  install - Install black formatter"
+	@echo "  format  - Format all Python files in this directory"
+	@echo "  check   - Check formatting without making changes"
+
+install:
+	pip install -r requirements.txt
+
diff --git a/scripts/es-sarif/activate.sh b/scripts/es-sarif/activate.sh
new file mode 100755
index 000000000..70eb02ce2
--- /dev/null
+++ b/scripts/es-sarif/activate.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Convenience script to activate the SARIF Elasticsearch Indexer environment
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV_DIR="$SCRIPT_DIR/.venv"
+
+if [ ! -d "$VENV_DIR" ]; then
+    echo "Virtual environment not found. Run setup.sh first."
+    exit 1
+fi
+
+echo "Activating SARIF Elasticsearch Indexer environment..."
+echo "Python version: $($VENV_DIR/bin/python --version)"
+echo "To deactivate, run: deactivate"
+echo
+
+# Start a new shell with the virtual environment activated
+exec bash --rcfile <(echo "source $VENV_DIR/bin/activate; PS1='(es-sarif) \u@\h:\w\$ '")
diff --git a/scripts/es-sarif/index-sarif-results-in-elasticsearch.md b/scripts/es-sarif/index-sarif-results-in-elasticsearch.md
new file mode 100644
index 000000000..5984c6b4c
--- /dev/null
+++ b/scripts/es-sarif/index-sarif-results-in-elasticsearch.md
@@ -0,0 +1,88 @@
+# SARIF Files Elasticsearch Indexer
+
+This script creates a fresh Elasticsearch index and indexes SARIF 2.1.0 results from multiple SARIF files into it.
+
+## Requirements
+
+- Python 3.11+
+- SARIF files conforming to the version 2.1.0 specification (such as those produced by `gh mrva`)
+- Accessible URLs for running instances of Elasticsearch (aka "es") and Kibana (e.g. via `Quick Setup` below)
+
+## Usage
+
+```bash
+python index-sarif-results-in-elasticsearch.py <sarif_files_list.txt> <elasticsearch_index_name>
+```
+
+## Input File Format
+
+The SARIF files list should be a plain text file with one relative file path per line:
+
+```text
+output_misra-c-and-cpp-default_top-1000/solvespace/solvespace/solvespace_solvespace_18606.sarif
+output_misra-c-and-cpp-default_top-1000/solvespace/solvespace/solvespace_solvespace_18607.sarif
+# Comments starting with # are ignored
+```
+
+**Note**: Paths are resolved relative to the directory containing the list file.
+
+## Quick Setup
+
+1. **Set up the Python environment:**
+
+```bash
+## Change to the directory that contains this document
+cd scripts/es-sarif
+bash setup-venv.sh
+source .venv/bin/activate
+```
+
+1. **Set up Elasticsearch and Kibana with Docker:**
+
+```bash
+curl -fsSL https://elastic.co/start-local | sh
+```
+
+1. 
**Run the indexer:** + +```bash +## from the `scripts/es-sarif` directory +python index-sarif-results-in-elasticsearch.py mrva/sessions/sarif-files.txt codeql-coding-standards-misra-sarif +``` + +The `elastic-start-local` setup provides: + +- Elasticsearch at `http://localhost:9200` +- Kibana at `http://localhost:5601` +- API key stored in `elastic-start-local/.env` as `ES_LOCAL_API_KEY` + +## Example Queries + +Search for high-severity results: + +```json +GET /codeql-coding-standards-misra-sarif/_search +{ + "query": { "term": { "level": "error" } } +} +``` + +Find results for a specific rule: + +```json +GET /codeql-coding-standards-misra-sarif/_search +{ + "query": { "term": { "ruleId": "CERT-C-MSC30-C" } } +} +``` + +## Managing Elasticsearch Services + +Control the Docker services: + +```bash +cd elastic-start-local +./start.sh # Start services +./stop.sh # Stop services +./uninstall.sh # Remove everything (deletes all data) +``` diff --git a/scripts/es-sarif/index-sarif-results-in-elasticsearch.py b/scripts/es-sarif/index-sarif-results-in-elasticsearch.py new file mode 100644 index 000000000..39d97f05d --- /dev/null +++ b/scripts/es-sarif/index-sarif-results-in-elasticsearch.py @@ -0,0 +1,661 @@ +#!/usr/bin/env python3 +""" +SARIF Results Elasticsearch Indexer + +This script creates a fresh Elasticsearch index and indexes individual SARIF results +from multiple SARIF files into it. Each result from runs[].results[] becomes a +separate document in the Elasticsearch index, allowing for granular querying and +analysis of code scanning findings. + +The script reads a list of SARIF file paths from a text file and bulk indexes all +results into a single Elasticsearch index. Each result document includes: +- All original SARIF result fields (ruleId, message, locations, etc.) +- Derived fields (ruleGroup, ruleLanguage) parsed from ruleId +- versionControlProvenance from run, OR derived from filename pattern +- repositoryUri (flattened from versionControlProvenance for easier querying) +- Source file tracking metadata + +Repository URI Derivation from Filename: +If versionControlProvenance is missing or lacks repositoryUri, it will be derived +from SARIF filenames matching: [-_]_[_].sarif +Example: "cpp-misra_nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime" + +This approach keeps documents minimal by indexing ONLY the result objects to avoid +Elasticsearch size limits. Tool info and automation details are NOT included. 
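+
+For illustration only (the field values below are hypothetical), a result indexed from a
+file such as "cpp-misra_nasa_fprime_18795.sarif" yields a document shaped roughly like:
+
+    {
+      "ruleId": "cpp/misra/function-like-macros-defined",
+      "ruleGroup": "cpp/misra",
+      "ruleLanguage": "cpp",
+      "level": "error",
+      "message": {"text": "..."},
+      "locations": [...],
+      "versionControlProvenance": [{"repositoryUri": "https://github.com/nasa/fprime"}],
+      "repositoryUri": "https://github.com/nasa/fprime",
+      "_sarif_source": {
+        "file_path": ".../cpp-misra_nasa_fprime_18795.sarif",
+        "file_name": "cpp-misra_nasa_fprime_18795.sarif",
+        "indexed_at": "2024-01-01T00:00:00"
+      }
+    }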
+ +Usage: + python index-sarif-results-in-elasticsearch.py + +Environment Variables: + ES_LOCAL_URL - Elasticsearch host URL (default: http://localhost:9200) + ES_LOCAL_API_KEY - API key for authentication (optional, enables API key auth) + ES_LOCAL_USERNAME - Username for basic authentication (optional) + ES_LOCAL_PASSWORD - Password for basic authentication (optional) + ES_BULK_DELAY - Delay in seconds between bulk indexing chunks (default: 1) + +Requirements: + - Python 3.11+ + - elasticsearch (pip install elasticsearch) + - Elasticsearch instance accessible at configured host +""" + +import sys +import json +import os +import time +from pathlib import Path +from elasticsearch import Elasticsearch, helpers +from elasticsearch.exceptions import ConnectionError, RequestError + + +def load_env_file(env_file_path): + """Load environment variables from a .env file.""" + if not os.path.exists(env_file_path): + return + + with open(env_file_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, value = line.split("=", 1) + # Handle variable substitution + if "${" in value and "}" in value: + # Simple variable substitution for ${VAR} patterns + import re + + def replace_var(match): + var_name = match.group(1) + return os.environ.get(var_name, match.group(0)) + + value = re.sub(r"\$\{([^}]+)\}", replace_var, value) + os.environ[key] = value + + +# --- Configuration --- +DEFAULT_ELASTIC_HOST = "http://localhost:9200" +SARIF_VERSION = "2.1.0" + +# Elasticsearch mapping optimized for SARIF result documents +# Minimal mapping - only results with versionControlProvenance enrichment +SARIF_MAPPING = { + "mappings": { + "dynamic": True, # Allow dynamic field mapping for any unmapped fields + "properties": { + # Core SARIF result fields + "ruleId": {"type": "keyword"}, + "ruleIndex": {"type": "integer"}, + "kind": {"type": "keyword"}, + "level": {"type": "keyword"}, + # Derived fields from ruleId parsing + "ruleGroup": {"type": "keyword"}, + "ruleLanguage": {"type": "keyword"}, + # Message object + "message": { + "properties": { + "text": {"type": "text"}, + "markdown": {"type": "text"}, + "id": {"type": "keyword"}, + "arguments": {"type": "keyword"}, + } + }, + # Locations array - each result can have multiple locations + "locations": { + "type": "nested", + "properties": { + "id": {"type": "integer"}, + "physicalLocation": { + "properties": { + "artifactLocation": { + "properties": { + "uri": {"type": "keyword"}, + "uriBaseId": {"type": "keyword"}, + "index": {"type": "integer"}, + } + }, + "region": { + "properties": { + "startLine": {"type": "integer"}, + "startColumn": {"type": "integer"}, + "endLine": {"type": "integer"}, + "endColumn": {"type": "integer"}, + "charOffset": {"type": "integer"}, + "charLength": {"type": "integer"}, + "byteOffset": {"type": "integer"}, + "byteLength": {"type": "integer"}, + } + }, + "contextRegion": { + "properties": { + "startLine": {"type": "integer"}, + "endLine": {"type": "integer"}, + "snippet": {"properties": {"text": {"type": "text"}}}, + } + }, + } + }, + "logicalLocations": { + "type": "nested", + "properties": { + "name": {"type": "keyword"}, + "fullyQualifiedName": {"type": "keyword"}, + "decoratedName": {"type": "keyword"}, + "kind": {"type": "keyword"}, + }, + }, + }, + }, + # Rule reference + "rule": {"properties": {"id": {"type": "keyword"}, "index": {"type": "integer"}}}, + # Fingerprints for deduplication + "partialFingerprints": {"type": "object"}, + "fingerprints": {"type": "object"}, 
+ # Analysis and classification + "analysisTarget": { + "properties": { + "uri": {"type": "keyword"}, + "uriBaseId": {"type": "keyword"}, + "index": {"type": "integer"}, + } + }, + "guid": {"type": "keyword"}, + "correlationGuid": {"type": "keyword"}, + "occurrenceCount": {"type": "integer"}, + "rank": {"type": "float"}, + "baselineState": {"type": "keyword"}, + # ONLY versionControlProvenance from run-level (minimal enrichment) + "versionControlProvenance": { + "type": "nested", + "properties": { + "repositoryUri": {"type": "keyword"}, + "revisionId": {"type": "keyword"}, + "branch": {"type": "keyword"}, + "revisionTag": {"type": "keyword"}, + }, + }, + # Flattened repositoryUri for easier querying (extracted from versionControlProvenance) + "repositoryUri": {"type": "keyword"}, + # Metadata for tracking source SARIF file + "_sarif_source": { + "properties": { + "file_path": {"type": "keyword"}, + "file_name": {"type": "keyword"}, + "indexed_at": {"type": "date"}, + } + }, + } + }, + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "analysis": {"analyzer": {"sarif_text": {"type": "standard", "stopwords": "_none_"}}}, + }, +} +def create_elasticsearch_client(host, api_key=None, username=None, password=None): + """Create Elasticsearch client with optional API key or basic authentication.""" + if api_key and api_key.strip(): + return Elasticsearch( + hosts=[host], + api_key=api_key.strip(), + verify_certs=False, # For local development + ssl_show_warn=False, + ) + elif username and password: + return Elasticsearch( + hosts=[host], + basic_auth=(username, password), + verify_certs=False, # For local development + ssl_show_warn=False, + ) + else: + return Elasticsearch(hosts=[host]) + + +def validate_elasticsearch_connection(es_client, host): + """Test connection to Elasticsearch server.""" + try: + if not es_client.ping(): + print("Error: Could not connect to Elasticsearch. Please check your host and port.") + return False + print(f"✓ Connected to Elasticsearch at {host}") + return True + except ConnectionError: + print(f"Error: Could not connect to Elasticsearch at {host}") + print("Please ensure Elasticsearch is running and accessible.") + return False + + +def validate_index_name(es_client, index_name): + """Validate that the index doesn't already exist.""" + if es_client.indices.exists(index=index_name): + print(f"Error: Index '{index_name}' already exists.") + print( + "This script requires a fresh index. Please choose a different name or delete the existing index." 
+ ) + return False + print(f"✓ Index name '{index_name}' is available") + return True + + +def create_index_with_mapping(es_client, index_name): + """Create the Elasticsearch index with SARIF 2.1.0 mapping.""" + try: + es_client.indices.create(index=index_name, body=SARIF_MAPPING) + print(f"✓ Created index '{index_name}' with SARIF 2.1.0 mapping") + return True + except RequestError as e: + print(f"Error creating index: {e}") + return False + + +def read_sarif_files_list(list_file_path): + """Read the list of SARIF files from the specified text file.""" + list_file = Path(list_file_path) + if not list_file.exists(): + print(f"Error: SARIF files list not found at {list_file_path}") + return None + + base_dir = list_file.parent + sarif_files = [] + + with open(list_file, "r") as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line or line.startswith("#"): + continue + + # Resolve relative paths relative to the list file + sarif_path = base_dir / line + if not sarif_path.exists(): + print(f"Warning: SARIF file not found: {sarif_path} (line {line_num})") + continue + + if not sarif_path.suffix.lower() == ".sarif": + print( + f"Warning: File does not have .sarif extension: {sarif_path} (line {line_num})" + ) + continue + + sarif_files.append(sarif_path) + + print(f"✓ Found {len(sarif_files)} valid SARIF files to process") + return sarif_files + + +def parse_rule_id(rule_id): + """ + Parse a ruleId to extract ruleGroup and ruleLanguage. + + Examples: + - "cpp/misra/function-like-macros-defined" -> ruleGroup="cpp/misra", ruleLanguage="cpp" + - "c/misra/cast-between-pointer-to-object-and-non-int-arithmetic-type" -> ruleGroup="c/misra", ruleLanguage="c" + - "py/baseline/expected-extracted-files" -> ruleGroup="py/baseline", ruleLanguage="py" + """ + if not rule_id: + return None, None + + parts = rule_id.split("/") + if len(parts) < 2: + return None, None + + # First part is the language + rule_language = parts[0] + + # Rule group is language + first category (e.g., "cpp/misra", "py/baseline") + if len(parts) >= 2: + rule_group = f"{parts[0]}/{parts[1]}" + else: + rule_group = rule_language + + return rule_group, rule_language + + +def parse_repository_uri_from_filename(filename): + """ + Parse repository URI from SARIF filename following the pattern: + [-_]_[_].sarif + + Examples: + - "nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime" + - "cpp-misra_nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime" + - "tmux_tmux.sarif" -> "https://github.com/tmux/tmux" + + Returns: + str or None: The repository URI if parsing succeeds, None otherwise + """ + # Remove .sarif extension + name = filename.replace('.sarif', '') + + # Split by underscore + parts = name.split('_') + + # Need at least org_repo (2 parts) + if len(parts) < 2: + return None + + # Check if first part contains a hyphen (lang-framework pattern) + if '-' in parts[0]: + # Pattern: lang-framework_org_repo[_id] + # Skip the lang-framework prefix + if len(parts) < 3: + return None + org = parts[1] + repo = parts[2] + else: + # Pattern: org_repo[_id] + org = parts[0] + repo = parts[1] + + return f"https://github.com/{org}/{repo}" + + +def sarif_results_generator(sarif_files, index_name): + """ + Generator that yields Elasticsearch bulk actions for individual SARIF results. + + For each SARIF file: + 1. Processes each run in runs[] + 2. Extracts each result from runs[].results[] + 3. Creates a separate Elasticsearch document per result + 4. 
Adds derived fields (ruleGroup, ruleLanguage) from ruleId parsing + 5. Enriches with versionControlProvenance from run, or derives repositoryUri from filename + 6. Adds source file tracking metadata + + Filename Pattern for Repository URI Derivation: + - [-_]_[_].sarif + - Examples: "nasa_fprime_18795.sarif" -> "https://github.com/nasa/fprime" + - Examples: "cpp-misra_tmux_tmux.sarif" -> "https://github.com/tmux/tmux" + + This approach keeps document sizes minimal by ONLY indexing the result objects + themselves plus minimal enrichment data, avoiding the overhead of tool info, + automation details, and other run-level data. + """ + from datetime import datetime + + indexed_at = datetime.utcnow().isoformat() + + total_results = 0 + + for sarif_file in sarif_files: + print(f"Processing {sarif_file.name}...") + + # Parse repository URI from filename + repo_uri_from_filename = parse_repository_uri_from_filename(sarif_file.name) + if repo_uri_from_filename: + print(f" → Derived repository URI: {repo_uri_from_filename}") + + try: + with open(sarif_file, "r", encoding="utf-8") as f: + sarif_data = json.load(f) + + # Validate SARIF version + if sarif_data.get("version") != SARIF_VERSION: + print( + f"Warning: {sarif_file.name} has version {sarif_data.get('version')}, expected {SARIF_VERSION}" + ) + + # Extract metadata from the SARIF file + runs = sarif_data.get("runs", []) + if not runs: + print(f"Warning: No runs found in {sarif_file.name}") + continue + + file_results_count = 0 + for run_index, run in enumerate(runs): + results = run.get("results", []) + if not results: + print(f"Warning: No results found in run {run_index} of {sarif_file.name}") + continue + + file_results_count += len(results) + + # Extract ONLY versionControlProvenance from run (minimal enrichment) + version_control_provenance = run.get("versionControlProvenance", []) + + # If no versionControlProvenance in run, create from filename + if not version_control_provenance and repo_uri_from_filename: + version_control_provenance = [{ + "repositoryUri": repo_uri_from_filename + }] + # If versionControlProvenance exists but missing repositoryUri, add it from filename + elif version_control_provenance and repo_uri_from_filename: + # Check if repositoryUri is missing from the first entry + if not version_control_provenance[0].get("repositoryUri"): + version_control_provenance[0]["repositoryUri"] = repo_uri_from_filename + + for result_index, result in enumerate(results): + # Create a document that includes ONLY the result fields + document = dict(result) # Copy all result fields + + # Add derived fields from ruleId parsing + rule_id = document.get("ruleId") + if rule_id: + rule_group, rule_language = parse_rule_id(rule_id) + if rule_group: + document["ruleGroup"] = rule_group + if rule_language: + document["ruleLanguage"] = rule_language + + # Add ONLY versionControlProvenance (not tool, automationDetails, etc.) 
+ if version_control_provenance: + document["versionControlProvenance"] = version_control_provenance + # Also add flattened repositoryUri for easier querying + if version_control_provenance[0].get("repositoryUri"): + document["repositoryUri"] = version_control_provenance[0]["repositoryUri"] + + # Add source file metadata + document["_sarif_source"] = { + "file_path": str(sarif_file), + "file_name": sarif_file.name, + "indexed_at": indexed_at, + } + + yield { + "_index": index_name, + "_source": document, + } + + total_results += 1 + + print(f" → Found {file_results_count} results in {sarif_file.name}") + + except FileNotFoundError: + print(f"Error: SARIF file not found: {sarif_file}") + continue + except json.JSONDecodeError as e: + print(f"Error: Could not decode JSON from {sarif_file}: {e}") + continue + except Exception as e: + print(f"Error processing {sarif_file}: {e}") + continue + + print( + f"✓ Prepared {total_results} individual results for indexing from {len(sarif_files)} SARIF files" + ) + + +def index_sarif_files(sarif_files, index_name, host, api_key=None, username=None, password=None, bulk_delay=1): + """ + Connect to Elasticsearch and bulk index all SARIF results with progress logging. + + Args: + sarif_files: List of SARIF file paths to index + index_name: Name of the Elasticsearch index to create + host: Elasticsearch host URL + api_key: Optional API key for authentication + username: Optional username for basic auth + password: Optional password for basic auth + bulk_delay: Delay in seconds between bulk indexing chunks (default: 1) + """ + es_client = create_elasticsearch_client(host, api_key, username, password) + + # Validate connection + if not validate_elasticsearch_connection(es_client, host): + return False + + # Validate index name + if not validate_index_name(es_client, index_name): + return False + + # Create index with mapping + if not create_index_with_mapping(es_client, index_name): + return False + + print(f"Indexing results from {len(sarif_files)} SARIF files...") + if bulk_delay > 0: + print(f"Bulk delay: {bulk_delay} second(s) between chunks") + print() + + try: + # Track progress during bulk indexing + documents_indexed = 0 + last_progress_update = 0 + progress_interval = 100 # Update every 100 documents + chunks_processed = 0 + + def progress_callback(success, info): + """Callback to track progress during bulk indexing.""" + nonlocal documents_indexed, last_progress_update, chunks_processed + documents_indexed += 1 + + # Print progress updates periodically + if documents_indexed - last_progress_update >= progress_interval: + print(f" → Indexed {documents_indexed} documents so far...") + last_progress_update = documents_indexed + + if not success: + print(f" ✗ Failed to index document: {info}") + + # Use bulk helper to index all documents with progress tracking + print("Starting bulk indexing...") + for success, info in helpers.streaming_bulk( + es_client, + sarif_results_generator(sarif_files, index_name), + chunk_size=500, + request_timeout=60, + raise_on_error=False, + ): + progress_callback(success, info) + + # Check if we just completed a chunk and should sleep + # streaming_bulk yields one result per document, so we track chunks + if documents_indexed > 0 and documents_indexed % 500 == 0: + chunks_processed += 1 + if bulk_delay > 0: + print(f" → Sleeping {bulk_delay}s after chunk {chunks_processed}...") + time.sleep(bulk_delay) + + print(f" → Indexed {documents_indexed} documents (final)") + print() + print("-" * 50) + print(f"✓ Bulk indexing 
complete") + print(f"✓ Total documents indexed: {documents_indexed}") + if chunks_processed > 0: + print(f"✓ Total chunks processed: {chunks_processed}") + + # Get final index stats to verify + stats = es_client.indices.stats(index=index_name) + doc_count = stats["indices"][index_name]["total"]["docs"]["count"] + print(f"✓ Final document count in index: {doc_count}") + + if doc_count != documents_indexed: + print(f"⚠ Warning: Document count mismatch (indexed: {documents_indexed}, in index: {doc_count})") + + return True + + except Exception as e: + print(f"Error during bulk indexing: {e}") + import traceback + traceback.print_exc() + return False + + +def main(): + if len(sys.argv) != 3: + print(f"Usage: python {sys.argv[0]} ") + print() + print("Arguments:") + print( + " sarif_files_list.txt - Text file containing relative paths to SARIF files (one per line)" + ) + print(" elasticsearch_index_name - Name for the new Elasticsearch index to create") + print() + print("Environment Variables:") + print(" ES_LOCAL_URL - Elasticsearch host URL (default: http://localhost:9200)") + print(" ES_LOCAL_API_KEY - API key for authentication (optional)") + print(" ES_LOCAL_USERNAME - Username for basic authentication (optional)") + print(" ES_LOCAL_PASSWORD - Password for basic authentication (optional)") + print(" ES_BULK_DELAY - Delay in seconds between bulk chunks (default: 1)") + print() + print("Example:") + print(f" python {sys.argv[0]} sarif-files.txt sarif_results_2024") + print(" ES_LOCAL_URL=https://my-cluster.elastic.co:9243 \\") + print(" ES_LOCAL_API_KEY=your_api_key \\") + print(" ES_BULK_DELAY=1 \\") + print(f" python {sys.argv[0]} sarif-files.txt sarif_results_2024") + sys.exit(1) + + sarif_files_list = sys.argv[1] + index_name = sys.argv[2] + + # Load environment variables from .env file if it exists + script_dir = Path(__file__).parent + env_file = script_dir / "elastic-start-local" / ".env" + load_env_file(env_file) + + # Get configuration from environment variables + elastic_host = os.getenv("ES_LOCAL_URL", DEFAULT_ELASTIC_HOST) + elastic_api_key = os.getenv("ES_LOCAL_API_KEY") + elastic_username = os.getenv("ES_LOCAL_USERNAME") + elastic_password = os.getenv("ES_LOCAL_PASSWORD") + bulk_delay = float(os.getenv("ES_BULK_DELAY", "1")) + + # Handle variable substitution in ES_LOCAL_URL if needed + if elastic_host and "${ES_LOCAL_PORT}" in elastic_host: + es_local_port = os.getenv("ES_LOCAL_PORT", "9200") + elastic_host = elastic_host.replace("${ES_LOCAL_PORT}", es_local_port) + + # Treat empty string or literal "None" as None for API key + if elastic_api_key == "" or elastic_api_key == "None": + elastic_api_key = None + + # Treat empty strings as None for username/password + if elastic_username == "" or elastic_username == "None": + elastic_username = None + if elastic_password == "" or elastic_password == "None": + elastic_password = None + + # Determine authentication method + auth_method = "None" + if elastic_api_key: + auth_method = "API Key" + elif elastic_username and elastic_password: + auth_method = "Basic Auth (Username/Password)" + + print(f"SARIF Files Elasticsearch Indexer") + print(f"==================================") + print(f"SARIF files list: {sarif_files_list}") + print(f"Elasticsearch index: {index_name}") + print(f"Elasticsearch host: {elastic_host}") + print(f"Authentication: {auth_method}") + if bulk_delay > 0: + print(f"Bulk delay: {bulk_delay} second(s) between chunks") + print() + + # Read and validate SARIF files list + sarif_files = 
read_sarif_files_list(sarif_files_list) + if not sarif_files: + print("No valid SARIF files found. Exiting.") + sys.exit(1) + + # Index the files + if index_sarif_files(sarif_files, index_name, elastic_host, elastic_api_key, elastic_username, elastic_password, bulk_delay): + print(f"\n✓ Successfully created and populated index '{index_name}'") + print(f"You can now query the index using Elasticsearch APIs or Kibana.") + sys.exit(0) + else: + print(f"\n✗ Failed to create or populate index '{index_name}'") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/es-sarif/pyproject.toml b/scripts/es-sarif/pyproject.toml new file mode 100644 index 000000000..eea3d934c --- /dev/null +++ b/scripts/es-sarif/pyproject.toml @@ -0,0 +1,19 @@ +[tool.black] +line-length = 100 +target-version = ['py311'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist +)/ +''' diff --git a/scripts/es-sarif/requirements.txt b/scripts/es-sarif/requirements.txt new file mode 100644 index 000000000..241829792 --- /dev/null +++ b/scripts/es-sarif/requirements.txt @@ -0,0 +1,8 @@ +# Required for index-sarif-results-in-elasticsearch.py +elasticsearch>=7.0.0,<9.0.0 + +# Code formatter +black>=24.0.0 + +# Note: run-gh-mrva-for-query-suite.py only uses Python standard library modules +# and does not require any external dependencies \ No newline at end of file diff --git a/scripts/es-sarif/run-gh-mrva-for-query-suite.md b/scripts/es-sarif/run-gh-mrva-for-query-suite.md new file mode 100644 index 000000000..3401f6956 --- /dev/null +++ b/scripts/es-sarif/run-gh-mrva-for-query-suite.md @@ -0,0 +1,191 @@ +# MRVA Query Suite Runner + +This script automates the execution of Multi-Repository Vulnerability Analysis (MRVA) sessions for all queries in a CodeQL query suite. It manages the complete lifecycle: submission, monitoring, and downloading results. + +## Features + +- **Automated Query Resolution**: Uses CodeQL CLI to resolve all queries in a suite +- **Concurrent Session Management**: Submits and monitors multiple MRVA sessions simultaneously +- **State Persistence**: Saves progress to resume interrupted runs +- **Status Monitoring**: Periodically checks session status and downloads completed results +- **Error Handling**: Robust error handling with detailed logging + +## Prerequisites + +1. **GitHub CLI with MRVA extension**: + + ```bash + gh extension install github/gh-mrva + ``` + +2. **CodeQL CLI**: Must be available in your PATH + + ```bash + codeql --version + ``` + +3. 
**Authentication**: GitHub CLI must be authenticated with appropriate permissions + +## Usage + +### Basic Usage + +```bash +python run-gh-mrva-for-query-suite.py \ + --query-suite codeql/misra-cpp-coding-standards@2.50.0 \ + --output-base-dir ./mrva/sessions \ + --session-prefix t1-misra-cpp-default \ + --language cpp \ + --repository-list cpp_top_1000 +``` + +### With Custom Concurrency + +```bash +python run-gh-mrva-for-query-suite.py \ + --query-suite codeql/misra-c-coding-standards@2.50.0 \ + --output-base-dir ./mrva/sessions \ + --session-prefix t1-misra-c-default \ + --language c \ + --repository-list cpp_top_1000 \ + --max-concurrent 10 \ + --check-interval 600 +``` + +## Arguments + +### Required Arguments + +- `--query-suite`: CodeQL query suite to analyze (e.g., `codeql/misra-cpp-coding-standards@2.50.0`) +- `--output-base-dir`: Base directory for output files and session state +- `--session-prefix`: Prefix for MRVA session names (e.g., `t1-misra-cpp-default`) +- `--language`: Programming language (`cpp`, `c`, `java`, `javascript`, `python`, `go`, `csharp`) +- `--repository-list`: Repository list for MRVA analysis (e.g., `top_1000`) + +### Optional Arguments + +- `--max-concurrent`: Maximum concurrent MRVA sessions (default: 20) +- `--check-interval`: Status check interval in seconds (default: 300) + +## Output Structure + +The script creates the following output structure: + +```text +/ +├── _state.json # Session state and progress +└── sessions/ # Downloaded SARIF results + ├── -query1/ + │ ├── owner1_repo1_12345.sarif + │ ├── owner2_repo2_12346.sarif + │ └── ... + ├── -query2/ + │ ├── owner1_repo1_12347.sarif + │ └── ... + └── ... +``` + +## State Management + +The script maintains a state file (`_state.json`) that tracks: + +- Query resolution and session mapping +- Submission timestamps +- Current status of each session +- Completion timestamps +- Download locations +- Error messages + +This allows the script to be safely interrupted and resumed. + +## Session Naming Convention + +MRVA sessions are named using the pattern: + +```text +- +``` + +For example: + +- Query: `cpp/misra/function-like-macros-defined.ql` +- Session: `t1-misra-cpp-default-function-like-macros-defined` + +## Workflow + +1. **Query Resolution**: Resolves all `.ql` files in the specified query suite +2. **Initial Submission**: Submits up to `max-concurrent` sessions +3. **Monitoring Loop**: + - Check status of active sessions every `check-interval` seconds + - Download results for completed sessions + - Submit new sessions when capacity is available + - Continue until all queries are processed +4. 
**Completion**: Provides final summary of results + +## Status Tracking + +Sessions progress through these states: + +- `not_started`: Session not yet submitted +- `submitted`: Session submitted to MRVA +- `in_progress`: Session is running +- `completed`: Session finished successfully +- `failed`: Session encountered an error + +## Integration with Elasticsearch + +After running this script, you can index the SARIF results into Elasticsearch using the companion script: + +```bash +# Create a file list of all SARIF files +find ./mrva/sessions -name "*.sarif" > results/misra/sarif-files.txt + +# Index results into Elasticsearch +python index-sarif-results-in-elasticsearch.py results/misra/sarif-files.txt misra-mrva-results-top-1000-repos +``` + +## Error Handling + +- **Network Issues**: Retries status checks and downloads +- **API Rate Limits**: Respects GitHub API limits with appropriate delays +- **Partial Failures**: Continues processing other sessions if individual sessions fail +- **Interruption**: Saves state and allows for clean resumption + +## Monitoring + +The script provides real-time status updates: + +```text +Status: {'submitted': 5, 'in_progress': 10, 'completed': 25, 'failed': 1} | Active: 15 | Remaining: 12 +Waiting 300 seconds before next check... +``` + +## Troubleshooting + +### Common Issues + +1. **MRVA Extension Not Found**: + + ```bash + gh extension install github/gh-mrva + ``` + +2. **CodeQL Not in PATH**: + + ```bash + export PATH="/path/to/codeql:$PATH" + ``` + +3. **Authentication Issues**: + + ```bash + gh auth login + ``` + +4. **Session Limits**: Reduce `--max-concurrent` if hitting API limits + +### Logs and Debugging + +- Session state is saved in `_state.json` +- Check GitHub CLI logs: `gh mrva list --json` +- Verify query resolution: `codeql resolve queries -- ` diff --git a/scripts/es-sarif/run-gh-mrva-for-query-suite.py b/scripts/es-sarif/run-gh-mrva-for-query-suite.py new file mode 100644 index 000000000..ff23e4334 --- /dev/null +++ b/scripts/es-sarif/run-gh-mrva-for-query-suite.py @@ -0,0 +1,889 @@ +#!/usr/bin/env python3 +""" +github/codeql-coding-standards:scripts/es-sarif/run-gh-mrva-for-query-suite.py + +MRVA = Multi-Repository Vulnerability Analysis => scale one CodeQL query to many (1000s of) repos. + +This script creates a MRVA session for each query in a given query suite and manages +the execution lifecycle including submission, monitoring, and downloading of results. +The script uses a pipeline-based approach where each query is processed asynchronously +through its complete lifecycle: submit -> monitor -> download -> copy results. + +This script expects that the `mrva` extension has already been installed for the +`gh` CLI tool, such that `gh mrva` commands can be executed. 
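+
+For reference, each pipeline stage wraps one of the following `gh mrva` invocations
+(the session, list, and query values shown here are placeholders):
+
+    gh mrva submit --language cpp --session <session-name> --list cpp_top_1000 --query <query.ql>
+    gh mrva status --json --session <session-name>
+    gh mrva download --session <session-name> --output-dir <session-output-dir>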
+ +Usage: + python run-gh-mrva-for-query-suite.py --query-suite --output-base-dir [options] + +Requirements: + - GitHub CLI with mrva extension installed + - CodeQL CLI available in PATH + - Python 3.11+ +""" + +import sys +import json +import os +import argparse +import subprocess +import time +import re +import threading +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple +from datetime import datetime +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from enum import Enum + + +class PipelineState(Enum): + """States in the MRVA processing pipeline.""" + + NOT_STARTED = "not_started" + SUBMITTING = "submitting" + SUBMITTED = "submitted" + MONITORING = "monitoring" + COMPLETED = "completed" + DOWNLOADING = "downloading" + DOWNLOADED = "downloaded" + COPYING = "copying" + FINISHED = "finished" + FAILED = "failed" + + +@dataclass +class MRVAPipeline: + """Represents the processing pipeline for a single query.""" + + query_path: str + session_name: str + language: str + state: PipelineState = PipelineState.NOT_STARTED + submitted_at: Optional[datetime] = None + completed_at: Optional[datetime] = None + downloaded_at: Optional[datetime] = None + finished_at: Optional[datetime] = None + error_message: Optional[str] = None + run_count: int = 0 + succeeded_runs: int = 0 + failed_runs: int = 0 + + def to_dict(self) -> Dict: + """Convert pipeline to dictionary for JSON serialization.""" + return { + "query_path": self.query_path, + "session_name": self.session_name, + "language": self.language, + "state": self.state.value, + "submitted_at": self.submitted_at.isoformat() if self.submitted_at else None, + "completed_at": self.completed_at.isoformat() if self.completed_at else None, + "downloaded_at": self.downloaded_at.isoformat() if self.downloaded_at else None, + "finished_at": self.finished_at.isoformat() if self.finished_at else None, + "error_message": self.error_message, + "run_count": self.run_count, + "succeeded_runs": self.succeeded_runs, + "failed_runs": self.failed_runs, + } + + @classmethod + def from_dict(cls, data: Dict) -> "MRVAPipeline": + """Create pipeline from dictionary.""" + pipeline = cls( + query_path=data["query_path"], + session_name=data["session_name"], + language=data["language"], + state=PipelineState(data["state"]), + ) + pipeline.submitted_at = ( + datetime.fromisoformat(data["submitted_at"]) if data.get("submitted_at") else None + ) + pipeline.completed_at = ( + datetime.fromisoformat(data["completed_at"]) if data.get("completed_at") else None + ) + pipeline.downloaded_at = ( + datetime.fromisoformat(data["downloaded_at"]) if data.get("downloaded_at") else None + ) + pipeline.finished_at = ( + datetime.fromisoformat(data["finished_at"]) if data.get("finished_at") else None + ) + pipeline.error_message = data.get("error_message") + pipeline.run_count = data.get("run_count", 0) + pipeline.succeeded_runs = data.get("succeeded_runs", 0) + pipeline.failed_runs = data.get("failed_runs", 0) + return pipeline + + +class MRVAManager: + """Manages MRVA pipelines for a query suite using concurrent processing.""" + + def __init__( + self, + query_suite: str, + output_base_dir: Path, + session_prefix: str, + language: str, + repository_list: str, + max_concurrent: int = 20, + check_interval: int = 300, + dry_run: bool = False, + ): + self.query_suite = query_suite + self.output_base_dir = output_base_dir + self.session_prefix = session_prefix + self.language = language + self.repository_list = 
repository_list + self.max_concurrent = max_concurrent + self.check_interval = check_interval + self.dry_run = dry_run + + # Directories + self.sessions_dir = output_base_dir / "sessions" + self.status_dir = output_base_dir / "sessions" / "status" + self.results_dir = output_base_dir / "results" / session_prefix + self.state_file = output_base_dir / f"{session_prefix}_state.json" + self.sarif_list_file = self.results_dir / "sarif-files.txt" + + # Pipeline tracking + self.pipelines: Dict[str, MRVAPipeline] = {} + self.pipelines_lock = threading.Lock() + + # Create directories (only in non-dry-run mode) + if not self.dry_run: + self.sessions_dir.mkdir(parents=True, exist_ok=True) + self.status_dir.mkdir(parents=True, exist_ok=True) + self.results_dir.mkdir(parents=True, exist_ok=True) + + # Load existing state if available (both modes) + self._load_state() + + def _load_state(self): + """Load existing pipeline state from file.""" + if self.state_file.exists(): + try: + with open(self.state_file, "r") as f: + state_data = json.load(f) + + for pipeline_data in state_data.get("pipelines", []): + pipeline = MRVAPipeline.from_dict(pipeline_data) + self.pipelines[pipeline.session_name] = pipeline + + print(f"✓ Loaded state for {len(self.pipelines)} pipelines from {self.state_file}") + except Exception as e: + print(f"Warning: Could not load state file: {e}") + + def _save_state(self): + """Save current pipeline state to file.""" + # Don't save state in dry-run mode to avoid creating directories + if self.dry_run: + return + + state_data = { + "query_suite": self.query_suite, + "session_prefix": self.session_prefix, + "language": self.language, + "repository_list": self.repository_list, + "updated_at": datetime.utcnow().isoformat(), + "pipelines": [p.to_dict() for p in self.pipelines.values()], + } + + # Ensure parent directory exists + self.state_file.parent.mkdir(parents=True, exist_ok=True) + + with open(self.state_file, "w") as f: + json.dump(state_data, f, indent=2) + + def resolve_queries(self) -> List[str]: + """Resolve query files from the query suite using CodeQL CLI.""" + print(f"Resolving queries from suite: {self.query_suite}") + + try: + result = subprocess.run( + ["codeql", "resolve", "queries", "--", self.query_suite], + capture_output=True, + text=True, + check=True, + ) + + query_paths = [] + for line in result.stdout.strip().split("\n"): + line = line.strip() + if line and line.endswith(".ql"): + query_paths.append(line) + + print(f"✓ Found {len(query_paths)} queries in suite") + return query_paths + + except subprocess.CalledProcessError as e: + print(f"Error resolving queries: {e}") + print(f"STDOUT: {e.stdout}") + print(f"STDERR: {e.stderr}") + sys.exit(1) + + def _generate_session_name(self, query_path: str) -> str: + """Generate a session name from query path.""" + query_name = Path(query_path).stem + sanitized = re.sub(r"[^a-zA-Z0-9\-_]", "-", query_name) + return f"{self.session_prefix}-{sanitized}" + + def get_existing_sessions(self) -> Dict[str, Dict]: + """Get existing MRVA sessions from gh mrva list.""" + # Save list output to file for performance and debugging + list_file = self.sessions_dir / "mrva-sessions-list.json" + + try: + result = subprocess.run( + ["gh", "mrva", "list", "--json"], capture_output=True, text=True, check=True + ) + + # Save output to file (only if not in dry-run to avoid creating directories) + if not self.dry_run: + list_file.write_text(result.stdout) + + sessions_data = json.loads(result.stdout) + sessions_by_name = {} + + for session in 
sessions_data: + session_name = session.get("name", "") + if session_name.startswith(self.session_prefix): + sessions_by_name[session_name] = session + + return sessions_by_name + + except subprocess.CalledProcessError as e: + print(f"Warning: Could not list existing sessions: {e}") + return {} + except json.JSONDecodeError as e: + print(f"Warning: Could not parse session list: {e}") + return {} + + def get_session_status(self, session_name: str) -> Optional[Dict]: + """Get detailed status for a specific session.""" + # Save status output to file for performance and debugging (only in non-dry-run) + status_file = self.status_dir / f"{session_name}_status.json" + + try: + result = subprocess.run( + ["gh", "mrva", "status", "--json", "--session", session_name], + capture_output=True, + text=True, + check=True, + ) + + # Save output to file (only if not in dry-run to avoid creating directories) + if not self.dry_run: + status_file.write_text(result.stdout) + + status_data = json.loads(result.stdout) + if status_data and len(status_data) > 0: + return status_data[0] + return None + + except subprocess.CalledProcessError: + return None + except json.JSONDecodeError: + return None + + def check_downloads_exist(self, session_name: str) -> Tuple[bool, List[str]]: + """Check if downloads already exist for a session.""" + session_dir = self.sessions_dir / session_name + + if not session_dir.exists(): + return False, [] + + # Look for SARIF files + sarif_files = list(session_dir.glob("*.sarif")) + return len(sarif_files) > 0, [str(f) for f in sarif_files] + + def submit_session(self, pipeline: MRVAPipeline) -> bool: + """Submit a new MRVA session.""" + try: + cmd = [ + "gh", + "mrva", + "submit", + "--language", + self.language, + "--session", + pipeline.session_name, + "--list", + self.repository_list, + "--query", + pipeline.query_path, + ] + + if self.dry_run: + # In dry run, just mark as submitted without printing per-session details + pipeline.state = PipelineState.SUBMITTED + pipeline.submitted_at = datetime.utcnow() + return True + + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + + pipeline.state = PipelineState.SUBMITTED + pipeline.submitted_at = datetime.utcnow() + return True + + except subprocess.CalledProcessError as e: + pipeline.state = PipelineState.FAILED + pipeline.error_message = f"Submit failed: {e.stderr if e.stderr else str(e)}" + return False + + def monitor_session(self, pipeline: MRVAPipeline) -> bool: + """Monitor a session until it completes.""" + pipeline.state = PipelineState.MONITORING + + # In dry-run, just check status once and return without waiting + if self.dry_run: + session_status = self.get_session_status(pipeline.session_name) + + if not session_status: + # Session doesn't exist yet, would need to be submitted + pipeline.state = PipelineState.NOT_STARTED + pipeline.error_message = "Session does not exist (would need to be submitted)" + return False + + status = session_status.get("status", "").lower() + runs = session_status.get("runs", []) + + pipeline.run_count = len(runs) + pipeline.succeeded_runs = len([r for r in runs if r.get("status") == "succeeded"]) + pipeline.failed_runs = len([r for r in runs if r.get("status") == "failed"]) + in_progress = len([r for r in runs if r.get("status") in ["pending", "in_progress"]]) + + if status in ["completed", "succeeded"] and in_progress == 0: + pipeline.state = PipelineState.COMPLETED + pipeline.completed_at = datetime.utcnow() + return True + elif status == "failed": + pipeline.state = 
PipelineState.FAILED + pipeline.error_message = f"MRVA session failed (status: {status})" + return False + else: + # Still in progress + pipeline.state = PipelineState.SUBMITTED + pipeline.error_message = ( + f"Session still in progress (status: {status}, runs: {pipeline.run_count})" + ) + return False + + # Non-dry-run mode: actually monitor until completion + while True: + session_status = self.get_session_status(pipeline.session_name) + + if not session_status: + time.sleep(self.check_interval) + continue + + status = session_status.get("status", "").lower() + runs = session_status.get("runs", []) + + # Count run statuses + pipeline.run_count = len(runs) + pipeline.succeeded_runs = len([r for r in runs if r.get("status") == "succeeded"]) + pipeline.failed_runs = len([r for r in runs if r.get("status") == "failed"]) + in_progress = len([r for r in runs if r.get("status") in ["pending", "in_progress"]]) + + if status in ["completed", "succeeded"] and in_progress == 0: + pipeline.state = PipelineState.COMPLETED + pipeline.completed_at = datetime.utcnow() + return True + elif status == "failed": + pipeline.state = PipelineState.FAILED + pipeline.error_message = "Session failed" + return False + + time.sleep(self.check_interval) + + def download_session(self, pipeline: MRVAPipeline) -> bool: + """Download results for a completed session.""" + pipeline.state = PipelineState.DOWNLOADING + + session_dir = self.sessions_dir / pipeline.session_name + + # Only create directory in non-dry-run mode + if not self.dry_run: + session_dir.mkdir(parents=True, exist_ok=True) + + # In dry-run, just mark as would-be-downloaded + if self.dry_run: + pipeline.state = PipelineState.DOWNLOADED + pipeline.downloaded_at = datetime.utcnow() + return True + + try: + # Download all results for the session at once + cmd = [ + "gh", + "mrva", + "download", + "--session", + pipeline.session_name, + "--output-dir", + str(session_dir), + ] + + subprocess.run(cmd, capture_output=True, text=True, check=True) + + pipeline.state = PipelineState.DOWNLOADED + pipeline.downloaded_at = datetime.utcnow() + return True + + except subprocess.CalledProcessError as e: + pipeline.state = PipelineState.FAILED + pipeline.error_message = f"Download failed: {e.stderr if e.stderr else str(e)}" + return False + + def copy_results(self, pipeline: MRVAPipeline) -> bool: + """Copy downloaded SARIF files to results directory and update sarif-files.txt.""" + pipeline.state = PipelineState.COPYING + + session_dir = self.sessions_dir / pipeline.session_name + + # In dry-run, skip the actual file operations + if self.dry_run: + pipeline.state = PipelineState.FINISHED + pipeline.finished_at = datetime.utcnow() + return True + + if not session_dir.exists(): + pipeline.state = PipelineState.FAILED + pipeline.error_message = "Session directory does not exist" + return False + + # Find all SARIF files in session directory + sarif_files = list(session_dir.glob("*.sarif")) + + if not sarif_files: + # No SARIF files to copy (could be zero results) + pipeline.state = PipelineState.FINISHED + pipeline.finished_at = datetime.utcnow() + return True + + try: + copied_files = [] + + for sarif_file in sarif_files: + # Parse filename to extract org/repo info + # Expected format: __.sarif + filename = sarif_file.name + parts = filename.replace(".sarif", "").split("_") + + if len(parts) >= 3: + org = parts[0] + repo = "_".join(parts[1:-1]) # Handle repo names with underscores + + # Create destination directory + dest_dir = self.results_dir / org / repo + 
dest_dir.mkdir(parents=True, exist_ok=True) + + # Copy file + dest_file = dest_dir / filename + import shutil + + shutil.copy2(sarif_file, dest_file) + + # Record relative path for sarif-files.txt + relative_path = f"{org}/{repo}/{filename}" + copied_files.append(relative_path) + + # Append to sarif-files.txt + if copied_files: + with open(self.sarif_list_file, "a") as f: + for relative_path in copied_files: + f.write(f"{relative_path}\n") + + pipeline.state = PipelineState.FINISHED + pipeline.finished_at = datetime.utcnow() + return True + + except Exception as e: + pipeline.state = PipelineState.FAILED + pipeline.error_message = f"Copy failed: {str(e)}" + return False + + def process_pipeline(self, pipeline: MRVAPipeline) -> bool: + """Process a single pipeline through all stages.""" + try: + # Submit + if pipeline.state == PipelineState.NOT_STARTED: + if not self.submit_session(pipeline): + return False + with self.pipelines_lock: + self._save_state() + + # Monitor + if pipeline.state in [PipelineState.SUBMITTED, PipelineState.MONITORING]: + if not self.monitor_session(pipeline): + return False + with self.pipelines_lock: + self._save_state() + + # Download + if pipeline.state == PipelineState.COMPLETED: + if not self.download_session(pipeline): + return False + with self.pipelines_lock: + self._save_state() + + # Copy results + if pipeline.state == PipelineState.DOWNLOADED: + if not self.copy_results(pipeline): + return False + with self.pipelines_lock: + self._save_state() + + return pipeline.state == PipelineState.FINISHED + + except Exception as e: + pipeline.state = PipelineState.FAILED + pipeline.error_message = f"Pipeline error: {str(e)}" + with self.pipelines_lock: + self._save_state() + return False + + def restore_existing_pipelines(self, query_paths: List[str]): + """Restore pipelines from existing MRVA sessions.""" + print("\nChecking for existing MRVA sessions...") + existing_sessions = self.get_existing_sessions() + + if existing_sessions: + print( + f"Found {len(existing_sessions)} existing sessions with prefix '{self.session_prefix}'" + ) + + restored_count = 0 + + for query_path in query_paths: + session_name = self._generate_session_name(query_path) + + # Skip if already in state + if session_name in self.pipelines: + continue + + # Check if session exists in MRVA + if session_name in existing_sessions: + session_data = existing_sessions[session_name] + + # Create pipeline + pipeline = MRVAPipeline( + query_path=query_path, session_name=session_name, language=self.language + ) + + # Determine state based on session status and downloads + session_status = self.get_session_status(session_name) + downloads_exist, sarif_files = self.check_downloads_exist(session_name) + + if session_status: + status = session_status.get("status", "").lower() + runs = session_status.get("runs", []) + + pipeline.run_count = len(runs) + pipeline.succeeded_runs = len( + [r for r in runs if r.get("status") == "succeeded"] + ) + pipeline.failed_runs = len([r for r in runs if r.get("status") == "failed"]) + + if status in ["completed", "succeeded"]: + if downloads_exist: + # Check if results have been copied + # For simplicity, assume if downloaded, we need to copy + pipeline.state = PipelineState.DOWNLOADED + pipeline.submitted_at = datetime.utcnow() # Unknown actual time + pipeline.completed_at = datetime.utcnow() + pipeline.downloaded_at = datetime.utcnow() + else: + pipeline.state = PipelineState.COMPLETED + pipeline.submitted_at = datetime.utcnow() + pipeline.completed_at = datetime.utcnow() 
+ elif status == "failed": + pipeline.state = PipelineState.FAILED + pipeline.error_message = "Session previously failed" + else: + # In progress + pipeline.state = PipelineState.SUBMITTED + pipeline.submitted_at = datetime.utcnow() + + self.pipelines[session_name] = pipeline + restored_count += 1 + + if restored_count > 0: + print(f"✓ Restored {restored_count} pipelines from existing sessions") + self._save_state() + + def get_status_summary(self) -> Dict[str, int]: + """Get summary of pipeline states.""" + summary = {} + for pipeline in self.pipelines.values(): + state = pipeline.state.value + summary[state] = summary.get(state, 0) + 1 + return summary + + def print_progress(self, total_queries: int): + """Print detailed progress information.""" + summary = self.get_status_summary() + + not_started = total_queries - len(self.pipelines) + submitting = summary.get(PipelineState.SUBMITTING.value, 0) + submitted = summary.get(PipelineState.SUBMITTED.value, 0) + monitoring = summary.get(PipelineState.MONITORING.value, 0) + completed = summary.get(PipelineState.COMPLETED.value, 0) + downloading = summary.get(PipelineState.DOWNLOADING.value, 0) + downloaded = summary.get(PipelineState.DOWNLOADED.value, 0) + copying = summary.get(PipelineState.COPYING.value, 0) + finished = summary.get(PipelineState.FINISHED.value, 0) + failed = summary.get(PipelineState.FAILED.value, 0) + + print("\n" + "=" * 70) + print(f"PROGRESS REPORT - {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}") + print("=" * 70) + print(f"Total queries in suite: {total_queries}") + print(f" Not yet started: {not_started}") + print(f" Submitting: {submitting}") + print(f" Submitted (monitoring): {submitted + monitoring}") + print(f" Completed (awaiting download): {completed}") + print(f" Downloading: {downloading}") + print(f" Downloaded (copying results): {downloaded + copying}") + print(f" Finished: {finished}") + print(f" Failed/Errored: {failed}") + print("-" * 70) + print( + f"Completion rate: {(finished / total_queries * 100) if total_queries > 0 else 0:.1f}%" + ) + print("=" * 70) + print() + + def run_until_complete(self) -> bool: + """Run all pipelines until complete using concurrent processing.""" + # Resolve queries + query_paths = self.resolve_queries() + total_queries = len(query_paths) + + print( + f"\n{'[DRY RUN] ' if self.dry_run else ''}Planning to process {total_queries} queries..." 
+ ) + print(f"Session prefix: {self.session_prefix}") + print(f"Max concurrent pipelines: {self.max_concurrent}") + print(f"Monitor interval: {self.check_interval} seconds") + print() + + # Restore existing pipelines + self.restore_existing_pipelines(query_paths) + + # Create pipelines for new queries + for query_path in query_paths: + session_name = self._generate_session_name(query_path) + if session_name not in self.pipelines: + pipeline = MRVAPipeline( + query_path=query_path, session_name=session_name, language=self.language + ) + self.pipelines[session_name] = pipeline + + # Get pipelines that need processing + pipelines_to_process = [ + p for p in self.pipelines.values() if p.state != PipelineState.FINISHED + ] + + if not pipelines_to_process: + print("\n✓ All pipelines already finished!") + return True + + print(f"\nProcessing {len(pipelines_to_process)} pipelines...") + + if self.dry_run: + print("\nDRY RUN: Simulating all pipeline operations (no actual API calls)...") + + # Process pipelines concurrently + completed = 0 + failed = 0 + + with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor: + # Submit all pipelines + future_to_pipeline = { + executor.submit(self.process_pipeline, pipeline): pipeline + for pipeline in pipelines_to_process + } + + # Process completions + for future in as_completed(future_to_pipeline): + pipeline = future_to_pipeline[future] + try: + success = future.result() + if success: + completed += 1 + if not self.dry_run: + print( + f"✓ Finished: {pipeline.session_name} ({completed + failed}/{len(pipelines_to_process)})" + ) + else: + failed += 1 + error_msg = pipeline.error_message or "Unknown error" + print( + f"✗ Failed: {pipeline.session_name} - {error_msg} ({completed + failed}/{len(pipelines_to_process)})" + ) + except Exception as e: + failed += 1 + print( + f"✗ Exception in {pipeline.session_name}: {e} ({completed + failed}/{len(pipelines_to_process)})" + ) + + # Print progress every 10 completions (or every 50 in dry-run) + progress_interval = 50 if self.dry_run else 10 + if (completed + failed) % progress_interval == 0: + self.print_progress(total_queries) + + # Final progress report + self.print_progress(total_queries) + + print(f"\n{'=' * 70}") + print(f"Processing complete!") + print(f" Finished: {completed}") + print(f" Failed: {failed}") + print(f"{'=' * 70}") + + if not self.dry_run: + print(f"\nResults directory: {self.results_dir}") + print(f"SARIF file list: {self.sarif_list_file}") + print(f"State file: {self.state_file}") + + return failed == 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Run MRVA sessions for all queries in a CodeQL query suite (Pipeline-based v2)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + python run-gh-mrva-for-query-suite.py \\ + --query-suite codeql/misra-cpp-coding-standards@2.50.0 \\ + --output-base-dir ./mrva \\ + --session-prefix t1-misra-cpp-default \\ + --language cpp \\ + --repository-list cpp_top_1000 + + # With local query suite file + python run-gh-mrva-for-query-suite.py \\ + --query-suite ../../cpp/misra/src/codeql-suites/misra-cpp-default.qls \\ + --output-base-dir ./mrva \\ + --session-prefix t1-misra-cpp-default \\ + --language cpp \\ + --repository-list cpp_top_1000 \\ + --max-concurrent 10 + + # Dry run + python run-gh-mrva-for-query-suite.py \\ + --query-suite ../../cpp/autosar/src/codeql-suites/autosar-default.qls \\ + --output-base-dir ./mrva \\ + --session-prefix t1-autosar-cpp-default \\ + --language 
cpp \\ + --repository-list cpp_top_1000 \\ + --dry-run + """, + ) + + parser.add_argument( + "--query-suite", + required=True, + help="CodeQL query suite - either a pack reference or path to .qls file", + ) + parser.add_argument( + "--output-base-dir", + required=True, + type=Path, + help="Base directory for output files and session state", + ) + parser.add_argument( + "--session-prefix", + required=True, + help="Prefix for MRVA session names (e.g., t1-misra-cpp-default)", + ) + parser.add_argument( + "--language", + required=True, + choices=["cpp", "c", "java", "javascript", "python", "go", "csharp"], + help="Programming language for analysis", + ) + parser.add_argument( + "--repository-list", required=True, help="Repository list for MRVA (e.g., cpp_top_1000)" + ) + parser.add_argument( + "--max-concurrent", + type=int, + default=20, + help="Maximum number of concurrent pipeline workers (default: 20)", + ) + parser.add_argument( + "--check-interval", + type=int, + default=300, + help="Interval in seconds between status checks (default: 300)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands that would be executed without actually running them", + ) + + args = parser.parse_args() + + print("MRVA Query Suite Runner (Pipeline-based v2)") + print("=" * 70) + print(f"Run started: {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}") + print(f"Query suite: {args.query_suite}") + print(f"Output directory: {args.output_base_dir}") + print(f"Session prefix: {args.session_prefix}") + print(f"Language: {args.language}") + print(f"Repository list: {args.repository_list}") + print(f"Max concurrent: {args.max_concurrent}") + print(f"Monitor interval: {args.check_interval} seconds") + if args.dry_run: + print("DRY RUN MODE: No submissions or downloads will occur") + print() + + # Validate dependencies + if not args.dry_run: + try: + subprocess.run(["gh", "mrva", "--help"], capture_output=True, check=True) + print("✓ GitHub CLI with mrva extension is available") + except (subprocess.CalledProcessError, FileNotFoundError): + print("✗ Error: GitHub CLI with mrva extension not found") + sys.exit(1) + + try: + subprocess.run(["codeql", "--version"], capture_output=True, check=True) + print("✓ CodeQL CLI is available") + except (subprocess.CalledProcessError, FileNotFoundError): + print("✗ Error: CodeQL CLI not found") + sys.exit(1) + else: + print("✓ [DRY RUN] Skipping dependency validation") + + print() + + # Create and run manager + manager = MRVAManager( + query_suite=args.query_suite, + output_base_dir=args.output_base_dir, + session_prefix=args.session_prefix, + language=args.language, + repository_list=args.repository_list, + max_concurrent=args.max_concurrent, + check_interval=args.check_interval, + dry_run=args.dry_run, + ) + + try: + success = manager.run_until_complete() + sys.exit(0 if success else 1) + + except KeyboardInterrupt: + print(f"\n\nInterrupted by user") + print(f"Session state saved to: {manager.state_file}") + print(f"You can resume by running the script again with the same parameters") + sys.exit(130) + + +if __name__ == "__main__": + main() diff --git a/scripts/es-sarif/sarif-infer-versionControlProvenance.md b/scripts/es-sarif/sarif-infer-versionControlProvenance.md new file mode 100644 index 000000000..95fdb8828 --- /dev/null +++ b/scripts/es-sarif/sarif-infer-versionControlProvenance.md @@ -0,0 +1,43 @@ +# sarif-infer-versionControlProvenance.py + +Pre-indexing script that infers `versionControlProvenance.repositoryUri` from SARIF 
filenames.
+
+## Usage
+
+```bash
+python sarif-infer-versionControlProvenance.py <sarif-files-list>
+```
+
+## Purpose
+
+Ensures SARIF files have `versionControlProvenance.repositoryUri` set before indexing into Elasticsearch. This is required for the indexing script to properly enrich results with repository information.
+
+## File Naming Convention
+
+Files must follow the pattern: `<lang>-<standard>_<org>_<repo>.sarif`
+
+Examples:
+
+- `c-cert_nasa_fprime.sarif` → `https://github.com/nasa/fprime`
+- `cpp-misra_bitcoin_bitcoin.sarif` → `https://github.com/bitcoin/bitcoin`
+- `c-misra_zeromq_libzmq.sarif` → `https://github.com/zeromq/libzmq`
+
+Files with exactly 2 underscores are processed; others are skipped with an error.
+
+## Behavior
+
+- **Modifies files in-place** - consider backing up first
+- Skips files that already have `repositoryUri` set
+- Only infers `repositoryUri` (not `revisionId` or `branch`)
+- Adds to the first run in each SARIF file
+
+## Example
+
+```bash
+# Process all files in the list
+python sarif-infer-versionControlProvenance.py mrva/sarif-files_results-1.txt
+
+# Output:
+# [OK] c-cert_nasa_fprime.sarif: Added repositoryUri: https://github.com/nasa/fprime
+# [SKIP] cpp-cert_nasa_fprime.sarif: Already has repositoryUri (skipped)
+```
diff --git a/scripts/es-sarif/sarif-infer-versionControlProvenance.py b/scripts/es-sarif/sarif-infer-versionControlProvenance.py
new file mode 100755
index 000000000..3b28121fb
--- /dev/null
+++ b/scripts/es-sarif/sarif-infer-versionControlProvenance.py
@@ -0,0 +1,293 @@
+#!/usr/bin/env python3
+"""
+SARIF Version Control Provenance Inference Script
+
+This script processes SARIF files and infers the versionControlProvenance.repositoryUri
+field for each run based on the file naming convention: <lang>-<standard>_<org>_<repo>.sarif
+
+The script reads a list of SARIF file paths from a text file (similar to the indexing script)
+and modifies each SARIF file in-place to add the inferred versionControlProvenance information.
+
+Usage:
+    python sarif-infer-versionControlProvenance.py <sarif-files-list>
+
+File Naming Convention:
+    Files should follow the pattern: <lang>-<standard>_<org>_<repo>.sarif
+    Examples:
+        - c-cert_nasa_fprime.sarif -> https://github.com/nasa/fprime
+        - cpp-misra_bitcoin_bitcoin.sarif -> https://github.com/bitcoin/bitcoin
+        - c-misra_curl_curl.sarif -> https://github.com/curl/curl
+
+Notes:
+    - This is a pre-indexing step to ensure versionControlProvenance is present
+    - Only repositoryUri is inferred; revisionId and branch cannot be derived from the filename
+    - Files are modified in-place; consider backing up before running
+    - Files already containing versionControlProvenance are skipped unless they lack repositoryUri
+"""
+
+import sys
+import os
+import json
+import re
+from pathlib import Path
+
+
+def parse_filename_for_repo_info(filename):
+    """
+    Parse a SARIF filename to extract organization and repository information.
+
+    Expected filename pattern: <lang>-<standard>_<org>_<repo>.sarif
+    Files conforming to our naming scheme have exactly 2 underscores.
+ + Examples: + c-cert_nasa_fprime.sarif -> ('nasa', 'fprime') + cpp-misra_bitcoin_bitcoin.sarif -> ('bitcoin', 'bitcoin') + c-misra_curl_curl.sarif -> ('curl', 'curl') + + Args: + filename: The SARIF filename (without path) + + Returns: + Tuple of (org, repo) if successful, None otherwise + """ + # Remove .sarif extension + name_without_ext = filename + if name_without_ext.endswith('.sarif'): + name_without_ext = name_without_ext[:-6] + + # Count underscores - should be exactly 2 for conforming files + underscore_count = name_without_ext.count('_') + if underscore_count != 2: + return None + + # Split by underscore: __ + # The value between underscores is org, value after last underscore is repo + first_underscore = name_without_ext.index('_') + last_underscore = name_without_ext.rindex('_') + + org = name_without_ext[first_underscore + 1:last_underscore] + repo = name_without_ext[last_underscore + 1:] + + if org and repo: + return (org, repo) + + return None + + +def infer_repository_uri(org, repo): + """ + Construct a GitHub repository URI from organization and repository names. + + Args: + org: GitHub organization name + repo: GitHub repository name + + Returns: + Full GitHub repository URI + """ + return f"https://github.com/{org}/{repo}" + + +def process_sarif_file(sarif_path): + """ + Process a single SARIF file to add inferred versionControlProvenance. + + Args: + sarif_path: Path to the SARIF file + + Returns: + Tuple of (success: bool, message: str) + """ + try: + # Read the SARIF file + with open(sarif_path, 'r', encoding='utf-8') as f: + sarif_data = json.load(f) + + # Extract filename for parsing + filename = Path(sarif_path).name + + # Parse filename to get org and repo + repo_info = parse_filename_for_repo_info(filename) + if not repo_info: + return (False, f"Could not parse repository info from filename: {filename}") + + org, repo = repo_info + repository_uri = infer_repository_uri(org, repo) + + # Track whether we modified anything + modified = False + + # Process each run in the SARIF file + if 'runs' not in sarif_data or not isinstance(sarif_data['runs'], list): + return (False, "SARIF file does not contain valid 'runs' array") + + for run_index, run in enumerate(sarif_data['runs']): + # Check if versionControlProvenance already exists + if 'versionControlProvenance' not in run: + # Create new versionControlProvenance array + run['versionControlProvenance'] = [ + { + "repositoryUri": repository_uri + } + ] + modified = True + else: + # Check if it's an array + if not isinstance(run['versionControlProvenance'], list): + return (False, f"Run {run_index} has invalid versionControlProvenance (not an array)") + + # Check if any entry already has repositoryUri set + has_repository_uri = False + for vcp in run['versionControlProvenance']: + if 'repositoryUri' in vcp and vcp['repositoryUri']: + has_repository_uri = True + break + + # If no repositoryUri found, add one + if not has_repository_uri: + if len(run['versionControlProvenance']) == 0: + # Empty array, add new entry + run['versionControlProvenance'].append({ + "repositoryUri": repository_uri + }) + else: + # Array has entries but no repositoryUri, add to first entry + run['versionControlProvenance'][0]['repositoryUri'] = repository_uri + modified = True + + # Write back to file if modified + if modified: + with open(sarif_path, 'w', encoding='utf-8') as f: + json.dump(sarif_data, f, indent=2) + return (True, f"Added repositoryUri: {repository_uri}") + else: + return (True, f"Already has repositoryUri (skipped)") + + 
except json.JSONDecodeError as e: + return (False, f"JSON decode error: {e}") + except Exception as e: + return (False, f"Error processing file: {e}") + + +def read_sarif_files_list(list_file_path): + """ + Read the list of SARIF files from the specified text file. + + Args: + list_file_path: Path to the text file containing SARIF file paths + + Returns: + List of absolute paths to SARIF files + """ + list_file = Path(list_file_path) + if not list_file.exists(): + print(f"Error: File list '{list_file_path}' does not exist.") + return [] + + base_dir = list_file.parent + sarif_files = [] + + with open(list_file, 'r') as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + + # Skip empty lines and comments + if not line or line.startswith('#'): + continue + + # Resolve relative paths + if not os.path.isabs(line): + sarif_path = base_dir / line + else: + sarif_path = Path(line) + + # Check if file exists + if not sarif_path.exists(): + print(f"Warning: File not found (line {line_num}): {line}") + continue + + # Check if it's a .sarif file + if not sarif_path.suffix == '.sarif': + print(f"Warning: Not a .sarif file (line {line_num}): {line}") + continue + + sarif_files.append(sarif_path) + + return sarif_files + + +def main(): + if len(sys.argv) != 2: + print("SARIF Version Control Provenance Inference Script") + print("=" * 50) + print() + print("Usage:") + print(f" python {sys.argv[0]} ") + print() + print("Description:") + print(" Infers and adds versionControlProvenance.repositoryUri to SARIF files") + print(" based on the filename pattern: -__.sarif") + print() + print("Example:") + print(f" python {sys.argv[0]} sarif-files_results-1.txt") + print() + print("Note:") + print(" Files are modified in-place. Consider backing up before running.") + sys.exit(1) + + sarif_files_list = sys.argv[1] + + print("SARIF Version Control Provenance Inference Script") + print("=" * 50) + print(f"SARIF files list: {sarif_files_list}") + print() + + # Read and validate SARIF files list + sarif_files = read_sarif_files_list(sarif_files_list) + if not sarif_files: + print("Error: No valid SARIF files found in the list.") + sys.exit(1) + + print(f"Found {len(sarif_files)} SARIF files to process") + print() + + # Process each file + success_count = 0 + skip_count = 0 + error_count = 0 + + for sarif_file in sarif_files: + filename = sarif_file.name + success, message = process_sarif_file(sarif_file) + + if success: + if "skipped" in message.lower(): + skip_count += 1 + print(f" [SKIP] {filename}: {message}") + else: + success_count += 1 + print(f" [OK] {filename}: {message}") + else: + error_count += 1 + print(f" [ERROR] {filename}: {message}") + + # Summary + print() + print("=" * 50) + print("Processing Summary:") + print(f" Total files: {len(sarif_files)}") + print(f" Modified: {success_count}") + print(f" Skipped: {skip_count}") + print(f" Errors: {error_count}") + print() + + if error_count > 0: + print(f"⚠️ {error_count} file(s) had errors. 
Please review the output above.") + sys.exit(1) + else: + print(f"✓ Successfully processed all files!") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/scripts/es-sarif/setup-venv.sh b/scripts/es-sarif/setup-venv.sh new file mode 100755 index 000000000..0545ca9bf --- /dev/null +++ b/scripts/es-sarif/setup-venv.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Setup script for SARIF analysis environment (MRVA + Elasticsearch indexing) + +set -e # Exit on any error + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENV_DIR="$SCRIPT_DIR/.venv" + +echo "SARIF Analysis Environment Setup" +echo "================================" +echo "Setting up Python virtual environment in: $VENV_DIR" +echo + +# Check Python 3.11 availability +if ! command -v python3.11 &> /dev/null; then + echo "Error: Python 3.11 is required but not found" + echo "Please install Python 3.11 first" + exit 1 +fi + +python_version=$(python3.11 --version 2>&1 | grep -oE '[0-9]+\.[0-9]+' | head -1) +echo "Python version: $python_version" + +# Create virtual environment +echo "Creating virtual environment with Python 3.11..." +python3.11 -m venv "$VENV_DIR" + +# Activate virtual environment +echo "Activating virtual environment..." +source "$VENV_DIR/bin/activate" + +# Upgrade pip +echo "Upgrading pip..." +pip install --upgrade pip + +# Install dependencies +echo "Installing dependencies from requirements.txt..." +pip install -r "$SCRIPT_DIR/requirements.txt" + +# Verify installation +echo +echo "Verifying installation..." +python -c "import elasticsearch; print(f'✓ Elasticsearch client version: {elasticsearch.__version__}')" + +echo +echo "✓ Setup complete!" +echo +echo "Source the python virtual environment within scripts/es-sarif:" +echo " source .venv/bin/activate" +echo +echo "To run MRVA for a query suite:" +echo " python run-gh-mrva-for-query-suite.py --query-suite codeql/misra-cpp-coding-standards@2.50.0 --output-base-dir ./mrva --session-prefix t1-misra-cpp-default --language cpp --repository-list top_1000" +echo +echo "To run the indexer:" +echo " python index-sarif-results-in-elasticsearch.py ../../mrva/sessions/sarif-files.txt sarif_results_2024" +echo +echo "To deactivate the virtual environment:" +echo " deactivate" \ No newline at end of file
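
Putting the pieces together: the scripts added above form one workflow: set up the Python environment, run MRVA for a query suite, infer `versionControlProvenance.repositoryUri` on the downloaded SARIF files, and index them into Elasticsearch. The sketch below simply chains the example invocations already shown by `setup-venv.sh` and the docs above; the SARIF list path and the `sarif_results_2024` index name are illustrative and will differ per run, and a reachable Elasticsearch instance is assumed.

```bash
# Assumed end-to-end flow; paths, session prefix, and index name are examples only.
cd scripts/es-sarif
bash setup-venv.sh
source .venv/bin/activate

# 1. Run MRVA for every query in the suite (writes results and a SARIF file list under ./mrva)
python run-gh-mrva-for-query-suite.py \
    --query-suite codeql/misra-cpp-coding-standards@2.50.0 \
    --output-base-dir ./mrva \
    --session-prefix t1-misra-cpp-default \
    --language cpp \
    --repository-list cpp_top_1000

# 2. Infer versionControlProvenance.repositoryUri from the SARIF filenames (modifies files in-place)
python sarif-infer-versionControlProvenance.py mrva/sarif-files_results-1.txt

# 3. Index the enriched SARIF results into a fresh Elasticsearch index
python index-sarif-results-in-elasticsearch.py mrva/sarif-files_results-1.txt sarif_results_2024
```

Step 2 is only needed when the SARIF files lack `repositoryUri`; files that already carry it are skipped.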