# Data Explorer Notebook

## This notebook shows how to use [nvidia/llama-3.3-nemotron-super-49b-v1.5](https://build.nvidia.com/nvidia/llama-3_3-nemotron-super-49b-v1_5) to generate a full report from any tabular dataset. Get your free API key for NVIDIA NIM LLM inference by registering at https://build.nvidia.com/.


## Setup

Import required libraries for running the CLI data explorer in a notebook environment.

In [None]:
from __future__ import annotations

import io
import re
import sys
from contextlib import redirect_stderr, redirect_stdout
from pathlib import Path
from typing import Iterable, List, Sequence


## Helper Functions

These utilities handle environment setup and output parsing.

In [None]:
def patch_httpx_for_openai() -> None:
    """Let httpx clients tolerate the legacy `proxies` keyword argument.

    This is a no-op when httpx is not installed. Otherwise, for
    `httpx.Client` and (when present) `httpx.AsyncClient`, an `__init__`
    whose code object lists `proxy` but not `proxies` among its variable
    names is replaced by a thin wrapper that renames a `proxies` keyword to
    `proxy` before delegating to the original initializer.
    """
    try:
        import httpx  # type: ignore
    except ImportError:
        # Without httpx there is nothing to patch.
        return

    def _lacks_proxies(init) -> bool:
        # co_varnames covers the initializer's parameters and its locals.
        names = init.__code__.co_varnames
        return "proxies" not in names and "proxy" in names

    def _aliasing_init(wrapped):
        """Build an initializer that forwards `proxies` as `proxy`."""

        def init_with_alias(self, *args, **kwargs):  # type: ignore[override]
            # An explicit `proxy` wins; `proxies` is then passed through as-is.
            if "proxies" in kwargs and "proxy" not in kwargs:
                kwargs["proxy"] = kwargs.pop("proxies")
            return wrapped(self, *args, **kwargs)

        return init_with_alias

    sync_init = httpx.Client.__init__
    if _lacks_proxies(sync_init):  # type: ignore[attr-defined]
        httpx.Client.__init__ = _aliasing_init(sync_init)  # type: ignore[assignment]

    if hasattr(httpx, "AsyncClient"):
        async_init = httpx.AsyncClient.__init__
        if _lacks_proxies(async_init):  # type: ignore[attr-defined]
            httpx.AsyncClient.__init__ = _aliasing_init(async_init)  # type: ignore[assignment]


def resolve_repo_root() -> Path:
    """Return a best-effort base directory for locating the repository.

    When the module globals define a non-empty `__file__`, the resolved
    parent directory of that file is returned; otherwise (for example in a
    notebook kernel without `__file__`) the current working directory is used.
    """
    module_file = globals().get("__file__")
    if not module_file:
        return Path.cwd()
    return Path(module_file).resolve().parent


def ensure_pkg_on_path(repo_root: Path | None = None) -> Path:
    """Make `<repo_root>/src` importable and return that directory.

    Args:
        repo_root: Base directory; falls back to `resolve_repo_root()` when
            omitted.

    Returns:
        The `src` directory path, whether or not it exists. It is appended to
        `sys.path` only when it exists and is not already listed.
    """
    root = repo_root if repo_root else resolve_repo_root()
    candidate = root / "src"
    entry = str(candidate)
    if not candidate.exists() or entry in sys.path:
        return candidate
    sys.path.append(entry)
    return candidate


def build_cli_argv(data_files: Iterable[Path], api_key: str | None, passthrough: List[str]) -> List[str]:
    """Assemble the argv list consumed by `cli_data_explorer.cli.main`.

    The result is the program name, the `workflow` subcommand, `--data`
    followed by every data file as a string, an optional `--api-key` pair
    (only when `api_key` is non-empty), and finally the passthrough arguments.
    """
    key_args = ["--api-key", api_key] if api_key else []
    return [
        "cli-data-explorer",
        "workflow",
        "--data",
        *(str(item) for item in data_files),
        *key_args,
        *passthrough,
    ]


# Matches a report path of the form app_notebooks/<...>/report/report.md.
REPORT_PATH_PATTERN = re.compile(r"(app_notebooks/\S+/report/report\.md)")
# Report path detected by the most recent run_data_explorer() call, if any.
last_report_path: Path | None = None


def extract_report_path(output: str) -> Path | None:
    """Find the final markdown report path that appears in CLI output.

    Returns the last match of `REPORT_PATH_PATTERN` as a `Path`, or None when
    the output mentions no report.
    """
    found = REPORT_PATH_PATTERN.findall(output)
    if not found:
        return None
    return Path(found[-1])


## Main Runner

The `run_data_explorer()` function invokes the CLI workflow programmatically, capturing output and extracting the report path.

In [None]:
def run_data_explorer(
    data: Sequence[str | Path] | str | Path,
    pkg_path: str = "/kaggle/input/cli-data-explorer",
    api_key: str | None = None,
    vision_model: str | None = None,
    vision_disable: bool = False,
    extra_args: Sequence[str] | None = None,
) -> tuple[int, Path | None]:
    """Run the `cli-data-explorer` workflow in-process and report its outcome.

    The CLI's stdout and stderr are captured, echoed once the run ends (also
    when the CLI raises an unexpected exception), and scanned for the path of
    the generated markdown report.

    Args:
        data: A single data file or a sequence of data files.
        pkg_path: Directory whose `src` subdirectory is added to `sys.path`
            so that `cli_data_explorer` can be imported.
        api_key: Forwarded as `--api-key` when non-empty.
        vision_model: Forwarded as `--vision-model` when non-empty.
        vision_disable: Adds `--no-vision` unless `extra_args` already has it.
        extra_args: Extra CLI arguments, placed after the vision flags.

    Returns:
        `(exit_code, report_path)`. `report_path` is None when the captured
        output mentions no report.

    Raises:
        RuntimeError: `cli_data_explorer` cannot be imported.
        FileNotFoundError: One or more data files do not exist.

    Side effects:
        Updates the module-level `last_report_path` and temporarily replaces
        `sys.argv` for the duration of the CLI call.
    """
    global last_report_path

    patch_httpx_for_openai()
    ensure_pkg_on_path(Path(pkg_path))

    try:
        from cli_data_explorer import cli as cli_module
    except ImportError as exc:
        raise RuntimeError("Unable to import cli_data_explorer. Check your environment.") from exc

    # Accept a single path as well as a sequence of paths.
    if isinstance(data, (str, Path)):
        data_iterable: Sequence[str | Path] = [data]
    else:
        data_iterable = data

    data_paths = [Path(item).expanduser().resolve() for item in data_iterable]
    missing = [str(path) for path in data_paths if not path.exists()]
    if missing:
        raise FileNotFoundError(f"Data file(s) not found: {', '.join(missing)}")

    passthrough = list(extra_args or [])
    if vision_model:
        passthrough = ["--vision-model", vision_model, *passthrough]
    if vision_disable and "--no-vision" not in passthrough:
        passthrough = ["--no-vision", *passthrough]

    cli_argv = build_cli_argv(data_paths, api_key, passthrough)
    original_argv = sys.argv.copy()

    buffer = io.StringIO()
    exit_code = 0
    sys.argv = cli_argv
    try:
        with redirect_stdout(buffer), redirect_stderr(buffer):
            try:
                cli_module.main()
            except SystemExit as exc:
                code = exc.code
                if code is None:
                    exit_code = 0
                elif isinstance(code, int):
                    exit_code = code
                else:
                    # sys.exit("message") convention: the message goes to
                    # stderr (captured here) and the status is 1.
                    print(code, file=sys.stderr)
                    exit_code = 1
    finally:
        # Restore argv and echo whatever was captured, even if the CLI raised,
        # so diagnostics are never silently lost.
        sys.argv = original_argv
        output = buffer.getvalue()
        print(output, end="")

    report_path = extract_report_path(output)
    last_report_path = report_path

    return exit_code, report_path


## Usage

1. Update the list of data files to point to your dataset(s).
2. Optionally supply an API key, vision overrides, or extra CLI flags.
3. Run the helper function below; it returns the exit code and the detected markdown report path.


## Configuration

Get your free API key for NVIDIA NIM LLM inference by registering at https://build.nvidia.com/. Save your API key in Kaggle secrets under the name `NVIDIA_API_KEY`.

In [None]:
# Read the NVIDIA API key from the Kaggle secret named "NVIDIA_API_KEY".
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("NVIDIA_API_KEY")

## Install Dependencies

Install `pandoc` and LaTeX packages for PDF generation.

In [None]:
# Refresh the package index first, then install everything non-interactively (-y).
!apt-get update && apt-get install -y pandoc texlive-latex-base texlive-latex-recommended texlive-fonts-recommended texlive-xetex

## Run the Explorer

Execute the data explorer on your dataset. The workflow will analyze the data and generate a markdown report.

In [None]:
# Example invocation: edit `data_files` and the prompt to match your dataset.
data_files = ["/kaggle/input/global-earthquake-tsunami-risk-assessment-dataset/earthquake_data_tsunami.csv"]
exit_code, report_path = run_data_explorer(
    data_files,
    api_key=api_key, 
    extra_args=["--prompt", "Explore this dataset"],
)
print(f"cli-data-explorer exited with status {exit_code}")
print(f"Markdown report path: {report_path}")
# Bare expression: the notebook shows the module-level report path as the cell output.
last_report_path


## Generate PDF

Convert the markdown report to PDF format using `pandoc`.

In [None]:
import os
import shlex
import subprocess

# Fail early with a clear message when the explorer run produced no report.
if report_path is None:
    raise RuntimeError("No markdown report path was detected; run the explorer cell first.")

# Pass the arguments as a list (no shell) so paths containing spaces or shell
# metacharacters reach pandoc unchanged.
pandoc_args = [
    "pandoc",
    str(report_path),
    f"--resource-path={report_path.parent}",
    "--pdf-engine=xelatex",
    "-o",
    "report.pdf",
    "-V",
    "geometry:margin=1in",
]
# Shell-quoted rendering of the command, printed for reference only.
cmd = shlex.join(pandoc_args)
print(cmd)
# The cell output is pandoc's exit status (0 on success).
subprocess.run(pandoc_args, check=False).returncode

## View Report

Display the generated PDF report inline.

In [None]:
# Embed ./report.pdf (the file pandoc writes via `-o report.pdf`) in a 900x600 inline frame.
from IPython.display import IFrame
IFrame(
  src="./report.pdf",
  width=900,
  height=600,
)