# Scanner

> Scan nbdev notebooks for exported functions and classes

In [None]:
#| default_exp scanner

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import ast
import importlib
from execnb.nbio import read_nb
from fastcore.basics import AttrDict
from nbdev.config import get_config
import re

In [None]:
#| export
def get_export_cells(
    nb_path: Path    # Path to the notebook file
) -> List[Dict[str, Any]]:  # List of cells with export directives
    "Extract all code cells from a notebook that have export directives"
    nb = read_nb(nb_path)
    export_cells = []

    for cell in nb.cells:
        # Only non-empty code cells can carry an export directive
        if cell.cell_type != 'code' or not cell.source:
            continue

        for line in cell.source.split('\n'):
            stripped = line.strip()
            # Accept both `#| export` and `#|export`: whitespace after the
            # `#|` marker is optional. As before, any directive whose name
            # starts with "export" matches.
            if stripped.startswith('#|') and stripped[2:].lstrip().startswith('export'):
                export_cells.append({
                    'cell_id': cell.get('id', None),
                    'source': cell.source,
                    # `idx_` may be absent on the cell; fall back to None
                    'idx': getattr(cell, 'idx_', None)
                })
                break  # one entry per cell, however many directives it has

    return export_cells

In [None]:
#| export
def extract_definitions(
    source: str  # Python source code
) -> List[Dict[str, Any]]:  # List of function/class definitions with metadata
    "Extract function and class definitions (including nested ones) from source code"
    definitions = []

    # Drop `#| ...` directive lines before parsing. Line numbers reported
    # below are therefore relative to the cleaned source, not the raw input.
    clean_lines = [line for line in source.split('\n') if not line.strip().startswith('#| ')]

    try:
        tree = ast.parse('\n'.join(clean_lines))
    except SyntaxError as e:
        # Best effort: unparsable source yields no definitions instead of raising
        print(f"Syntax error parsing source: {e}")
        return definitions

    function_types = (ast.FunctionDef, ast.AsyncFunctionDef)

    # ast.walk visits every node, so methods and nested functions are reported too
    for node in ast.walk(tree):
        if not isinstance(node, function_types + (ast.ClassDef,)):
            continue

        # Slice the definition's own lines out of the cleaned source
        # (starts at the `def`/`class` line, so decorators are not included)
        start_line = node.lineno - 1
        end_line = getattr(node, 'end_lineno', start_line + 1)

        definition = {
            'name': node.name,
            'type': type(node).__name__,
            'source': '\n'.join(clean_lines[start_line:end_line]),
            'lineno': node.lineno,
            'is_async': isinstance(node, ast.AsyncFunctionDef)
        }

        if isinstance(node, function_types):
            # Collect every parameter in signature order: positional-only,
            # regular, *args, keyword-only, **kwargs. Reading only
            # `node.args.args` would miss all but the regular ones.
            signature = node.args
            params = [*signature.posonlyargs, *signature.args]
            if signature.vararg is not None:
                params.append(signature.vararg)
            params.extend(signature.kwonlyargs)
            if signature.kwarg is not None:
                params.append(signature.kwarg)

            definition['args'] = [
                {
                    'name': param.arg,
                    'annotation': ast.unparse(param.annotation) if param.annotation else None
                }
                for param in params
            ]

            # Return annotation as source text, or None when absent
            definition['returns'] = ast.unparse(node.returns) if node.returns else None

        definitions.append(definition)

    return definitions

In [None]:
#| export
def scan_notebook(
    nb_path: Path,  # Path to the notebook to scan
    nbs_root: Optional[Path] = None  # Root notebooks directory (for relative paths)
) -> List[Dict[str, Any]]:  # List of exported definitions with metadata
    "Scan a notebook and extract all exported function/class definitions"
    # Pair every definition with the export cell it came from
    found = [
        (cell, defn)
        for cell in get_export_cells(nb_path)
        for defn in extract_definitions(cell['source'])
    ]
    if not found:
        # Nothing exported: skip config lookup and module import entirely
        return []

    # If nbs_root is provided, use it; otherwise get it from the nbdev config
    if nbs_root is None:
        cfg = get_config()
        nbs_root = Path(cfg.config_path) / cfg.nbs_path

    # Relative path from the notebooks directory, so nested folders are kept.
    # Raises ValueError if nb_path is not located under nbs_root.
    relative_path = nb_path.relative_to(nbs_root)
    notebook_label = str(relative_path)

    # Resolve the exported module once per notebook rather than once per
    # definition; a failing import is then attempted a single time.
    try:
        module_name = nb_path.stem
        # Handle numbered notebooks like 00_core -> core (any numeric prefix,
        # e.g. 10_utils -> utils)
        if module_name[:1].isdigit() and '_' in module_name:
            module_name = module_name.split('_', 1)[1]

        # Take the package name from the project's configured library path so
        # other projects can be scanned; fall back to this package's own name
        # when the configuration is unavailable.
        try:
            package_name = Path(get_config().lib_path).name
        except Exception:
            package_name = 'cjm_nbdev_docments'

        # Sub-folders of the notebooks directory become sub-packages; `.parts`
        # is empty for a notebook that sits directly in the root.
        full_module_name = '.'.join([package_name, *relative_path.parent.parts, module_name])
        module = importlib.import_module(full_module_name)
    except Exception:
        # Best effort: definitions are still reported, just without live objects
        module = None

    all_definitions = []
    for cell, defn in found:
        defn['notebook'] = notebook_label  # Store full relative path
        defn['cell_id'] = cell['cell_id']
        # Live function/class object, or None when the module or name is missing
        defn['func_obj'] = getattr(module, defn['name'], None) if module is not None else None
        all_definitions.append(defn)

    return all_definitions

In [None]:
#| export
def scan_project(
    nbs_path: Optional[Path] = None,  # Path to notebooks directory (defaults to config.nbs_path)
    pattern: str = "*.ipynb"  # Pattern for notebook files to scan
) -> List[Dict[str, Any]]:  # All exported definitions found in the project
    "Scan all notebooks in a project for exported definitions"
    if nbs_path is None:
        cfg = get_config()
        nbs_path = Path(cfg.config_path) / cfg.nbs_path

    nbs_path = Path(nbs_path)
    all_definitions = []

    # rglob finds notebooks in subdirectories too. Its order depends on the
    # filesystem, so sort to make the result reproducible between runs.
    for nb_path in sorted(nbs_path.rglob(pattern)):
        # Skip private notebooks (leading underscore in the file name)
        if nb_path.name.startswith('_'):
            continue
        # Skip checkpoint copies. Compare path components rather than the raw
        # string so unrelated names containing the text are not skipped.
        if '.ipynb_checkpoints' in nb_path.parts:
            continue

        try:
            # Pass nbs_path along so scan_notebook knows the root directory
            all_definitions.extend(scan_notebook(nb_path, nbs_root=nbs_path))
        except Exception as e:
            # One unreadable notebook should not abort the whole scan
            print(f"Error scanning {nb_path}: {e}")

    return all_definitions

In [None]:
# Smoke test: scan the current project and preview the first few results
definitions = scan_project()
print(f"Found {len(definitions)} exported definitions")

preview = definitions[:5]  # Only the first 5 are shown
for defn in preview:
    print(f"- {defn['type']}: {defn['name']} in {defn['notebook']}")

Found 58 exported definitions
- ClassDef: DocmentsCheckResult in 00_core.ipynb
- FunctionDef: __post_init__ in 00_core.ipynb
- FunctionDef: extract_param_docs_from_func in 00_core.ipynb
- FunctionDef: extract_param_docs in 00_core.ipynb
- FunctionDef: check_return_doc in 00_core.ipynb


In [None]:
# Check that notebooks inside nested folders are picked up by the scanner
import json
import shutil
import tempfile
from pathlib import Path

# Minimal notebook containing a single exported function
dummy_nb = {
    "cells": [
        {"cell_type": "code", "source": "#| export\ndef test_func(): pass", "id": "cell-1"}
    ],
    "metadata": {},
    "nbformat": 4,
    "nbformat_minor": 5
}

# Build a throwaway directory tree; it is removed when the block exits
with tempfile.TemporaryDirectory() as tmpdir:
    tmp_path = Path(tmpdir)

    # Nested folders
    for folder in ("actions", "data_display"):
        (tmp_path / folder).mkdir()

    # One notebook in the root and one in each nested folder
    targets = (
        tmp_path / "index.ipynb",
        tmp_path / "actions" / "button.ipynb",
        tmp_path / "data_display" / "table.ipynb",
    )
    for target in targets:
        target.write_text(json.dumps(dummy_nb))

    # Confirm what was created on disk
    print("Testing nested folder scanning:")
    notebooks = list(tmp_path.rglob("*.ipynb"))
    print(f"Created {len(notebooks)} test notebooks:")
    for nb in sorted(notebooks):
        print(f"  - {nb.relative_to(tmp_path)}")

    # Run the scanner against the temporary tree
    definitions = scan_project(nbs_path=tmp_path)
    print(f"\nscan_project found {len(definitions)} definitions")
    for defn in definitions:
        print(f"  - {defn['name']} in {defn['notebook']}")

Testing nested folder scanning:
Created 3 test notebooks:
  - actions/button.ipynb
  - data_display/table.ipynb
  - index.ipynb

scan_project found 3 definitions
  - test_func in index.ipynb
  - test_func in data_display/table.ipynb
  - test_func in actions/button.ipynb


In [None]:
# Debug: list the notebooks under the configured notebooks directory
# (including nested folders) and how many export cells each one has
from nbdev.config import get_config

cfg = get_config()
nbs_path = Path(cfg.config_path) / cfg.nbs_path
notebooks = list(nbs_path.rglob("*.ipynb"))

print(f"Looking in: {nbs_path}")
print(f"Found {len(notebooks)} notebooks (including nested):")

for nb in sorted(notebooks):
    # Same filter as scan_project: no private notebooks, no checkpoint copies
    if nb.name.startswith('_') or '.ipynb_checkpoints' in str(nb):
        continue

    relative_path = nb.relative_to(nbs_path)
    print(f"  - {relative_path}")

    # Only report a count for notebooks that actually export something
    cells = get_export_cells(nb)
    if len(cells) > 0:
        print(f"    Export cells: {len(cells)}")

Looking in: /mnt/SN850X_8TB_EXT4/Projects/GitHub/cj-mills/cjm-nbdev-docments/nbs
Found 12 notebooks (including nested):
  - 00_core.ipynb
    Export cells: 14
  - 01_scanner.ipynb
    Export cells: 5
  - 02_report.ipynb
    Export cells: 8
  - 03_autofix.ipynb
    Export cells: 29
  - 04_cli.ipynb
    Export cells: 6
  - index.ipynb


In [None]:
#| hide
# Run nbdev's export step for this project
import nbdev
nbdev.nbdev_export()