codatio · pj-simpson · Sep 5, 2025 · Sep 5, 2025
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,21 @@
 src/components/page/reference/ReleaseNotes/release-notes.json
 /.vs
 
+# Code utilities - generated files
+code_utils/files_with_code.txt
+code_utils/temp/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info/
+.venv/
+venv/
+env/
+dist/
+build/
+
 
 # Misc
 .DS_Store

diff --git a/code_utils/.python-version b/code_utils/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/code_utils/README.md b/code_utils/README.md
@@ -0,0 +1,33 @@
+# Codat Documentation Code Utils
+
+Utilities for extracting and managing code snippets from Codat documentation.
+Currently consists of a single script `extract_code_from_files.py` which will find every markdown file
+in the docs directory containing a code snippet. It will then extract those snippets into files under a `temp/` directory. 
+
+
+## Usage
+
+```bash
+# Run the code extractor
+uv run extract_code_from_files.py
+```
+
+## Development
+
+This project uses [uv](https://astral.sh/uv) for dependency management.
+
+```bash
+# Install dependencies
+uv sync
+
+# Install development dependencies
+uv sync --extra dev
+```
+
+## Structure
+
+- `code_finder.py` - Main CodeFinder class
+- `extract_code_from_files.py` - Entrypoint script. 
+- `temp/` - Generated code snippets (gitignored)
+- `files_with_code.txt` - List of files containing code (gitignored)
diff --git a/code_utils/__init__.py b/code_utils/__init__.py
diff --git a/code_utils/code_finder.py b/code_utils/code_finder.py
@@ -0,0 +1,166 @@
+import os
+import sys
+import re
+import shutil
+from pathlib import Path
+
+
+class CodeFinder:
+    """Find fenced code snippets in the docs tree and extract them to files.
+
+    Intended flow: call ``find_files_with_code()`` to populate
+    ``matching_files`` (and write that list to ``output_file``), then call
+    ``extract_code()`` to write every snippet to ``temp_dir/<language>/``.
+    """
+
+    # Union of the target and deprecated languages below.
+    all_languages = {'python', 'javascript', 'csharp', 'go'}
+    # Languages whose snippets are searched for and extracted.
+    target_languages = {'python', 'javascript', 'csharp'}
+    # Languages that are recognised but neither listed nor extracted.
+    deprecated_languages = {'go'}
+
+    def __init__(self, output_file_name: str = "files_with_code.txt"):
+        """
+        Set up the paths used by the finder, relative to this module's location.
+
+        Args:
+            output_file_name (str): Name of the file, created next to this
+                module, that receives the list of matching markdown files
+        """
+        self.matching_files = []
+
+        self.script_dir = Path(__file__).resolve().parent
+        self.temp_dir = self.script_dir / 'temp'
+        self.project_root = self.script_dir.parent
+        self.docs_path = self.project_root / 'docs'
+        self.output_file = self.script_dir / output_file_name
+
+    @staticmethod
+    def _language_alternation(languages):
+        """Return a regex alternation (``a|b|c``) for the given language names."""
+        # Sorted so the generated pattern does not depend on set ordering.
+        return '|'.join(re.escape(lang) for lang in sorted(languages))
+
+    def has_code_snippets(self, file_path: "str | Path", target_languages: set) -> bool:
+        """
+        Check if a markdown file contains code snippets with specified languages.
+
+        Args:
+            file_path (str | Path): Path to the markdown file
+            target_languages (set): Set of programming languages to look for
+
+        Returns:
+            bool: True if the file contains an opening code fence for one of
+                ``target_languages``. False if it does not, if
+                ``target_languages`` is empty, or if the file cannot be read.
+        """
+        # An empty alternation would match any fence, so bail out early.
+        if not target_languages:
+            return False
+
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+        except OSError as e:
+            print(f"Error reading file {file_path}: {e}")
+            return False
+
+        # Matches an opening fence such as ```python or ```csharp.
+        pattern = r'```(?:' + self._language_alternation(target_languages) + r')\b'
+        return re.search(pattern, content, re.IGNORECASE) is not None
+
+    def find_files_with_code(self):
+        """
+        Find markdown files in the docs directory that contain target code snippets.
+
+        Scans every ``.md`` and ``.mdx`` file below ``docs_path``, stores the
+        sorted docs-relative paths of the matching files in ``matching_files``
+        and writes them, one per line, to ``output_file``.
+
+        Exits the process with status 1 if the docs directory is missing or
+        the output file cannot be written.
+        """
+        # Check if docs directory exists
+        if not self.docs_path.exists():
+            print(f"Error: docs directory not found at {self.docs_path}")
+            sys.exit(1)
+
+        print(f"Searching for markdown files with code snippets in: {self.docs_path}")
+        print(f"Looking for languages: {', '.join(sorted(CodeFinder.target_languages))}")
+
+        # Start from a clean list so a second call does not duplicate entries.
+        self.matching_files = []
+
+        # Find all markdown files recursively using pathlib
+        markdown_files = []
+        for glob_pattern in ('*.md', '*.mdx'):
+            markdown_files.extend(
+                path for path in self.docs_path.rglob(glob_pattern) if path.is_file()
+            )
+
+        for file_path in markdown_files:
+            if self.has_code_snippets(file_path, CodeFinder.target_languages):
+                # Relative path from docs directory for output (forward slashes)
+                rel_path = file_path.relative_to(self.docs_path).as_posix()
+                self.matching_files.append(rel_path)
+                print(f"Found: {rel_path}")
+
+        # Sort the paths for better readability
+        self.matching_files.sort()
+
+        # Write to output file
+        try:
+            with open(self.output_file, 'w', encoding='utf-8') as f:
+                f.writelines(f"{path}\n" for path in self.matching_files)
+        except OSError as e:
+            print(f"Error writing to output file: {e}")
+            sys.exit(1)
+
+        print("\nSummary:")
+        print(f"- Total markdown files processed: {len(markdown_files)}")
+        print(f"- Files with target code snippets: {len(self.matching_files)}")
+        print(f"- Results saved to: {self.output_file}")
+
+    def extract_code(self):
+        """
+        Extract code snippets from matching files and save them to temp directory.
+
+        Creates one subdirectory per target language. Each non-empty snippet is
+        written to ``<docs-relative path>_snippet_<n><ext>``, where path
+        separators are replaced by underscores so that files sharing a name in
+        different directories do not overwrite each other's snippets. Files
+        that cannot be read or written are reported and skipped.
+        """
+        if not self.matching_files:
+            print("No matching files found. Run find_files_with_code() first.")
+            return
+
+        # Language to file extension mapping
+        # NOTE(review): javascript snippets get a .ts extension; confirm intended.
+        lang_extensions = {
+            'python': '.py',
+            'javascript': '.ts',
+            'csharp': '.cs',
+        }
+
+        # Create subdirectories for each target language
+        for lang in CodeFinder.target_languages:
+            (self.temp_dir / lang).mkdir(parents=True, exist_ok=True)
+
+        print(f"Created temp directory structure at: {self.temp_dir}")
+
+        # Counters for summary
+        total_snippets = 0
+        snippets_by_lang = {lang: 0 for lang in CodeFinder.target_languages}
+
+        # Opening fence with language, the rest of that line (info string such
+        # as title="..." is not code), then the body up to the closing fence.
+        fence_pattern = re.compile(
+            r'```('
+            + self._language_alternation(CodeFinder.target_languages)
+            + r')\b[^\n]*\n(.*?)```',
+            re.DOTALL | re.IGNORECASE,
+        )
+
+        # Process each matching file
+        for file_path in self.matching_files:
+            full_file_path = self.docs_path / file_path
+
+            # Build the name from the whole relative path, not just the stem,
+            # so a/intro.md and b/intro.md produce different snippet files.
+            relative_stem = Path(file_path).with_suffix('').as_posix()
+            source_name = re.sub(r'[^\w\-]', '_', relative_stem)
+
+            try:
+                with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+
+                matches = fence_pattern.findall(content)
+                for index, (language, code_content) in enumerate(matches, start=1):
+                    language = language.lower()
+                    extension = lang_extensions.get(language)
+                    code_content = code_content.strip()
+
+                    # Skip empty snippets and languages with no known extension
+                    if extension is None or not code_content:
+                        continue
+
+                    filename = f"{source_name}_snippet_{index}{extension}"
+                    snippet_path = self.temp_dir / language / filename
+
+                    # Save the code snippet
+                    with open(snippet_path, 'w', encoding='utf-8') as snippet_file:
+                        snippet_file.write(code_content)
+
+                    total_snippets += 1
+                    snippets_by_lang[language] = snippets_by_lang.get(language, 0) + 1
+
+                    print(f"Extracted {language} snippet: {Path(language) / filename}")
+
+            except OSError as e:
+                print(f"Error processing file {file_path}: {e}")
+                continue
+
+        # Print summary
+        print("\nExtraction Summary:")
+        print(f"- Total code snippets extracted: {total_snippets}")
+        for lang, count in snippets_by_lang.items():
+            if count > 0:
+                print(f"- {lang.capitalize()} snippets: {count}")
+        print(f"- Snippets saved to: {self.temp_dir}")
diff --git a/code_utils/extract_code_from_files.py b/code_utils/extract_code_from_files.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+"""
+Script to walk through the 'docs' directory and find markdown (.md/.mdx) files
+with fenced code snippets. Saves matching file paths to files_with_code.txt,
+then extracts the python, javascript and csharp snippets into per-language
+folders under temp/.
+"""
+from code_finder import CodeFinder
+
+def main():
+    """List the docs files that contain code snippets, then extract the snippets."""
+    code_finder = CodeFinder()
+    code_finder.find_files_with_code()
+    code_finder.extract_code()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/code_utils/pyproject.toml b/code_utils/pyproject.toml
@@ -0,0 +1,37 @@
+[project]
+name = "code-utils"
+version = "0.1.0"
+description = "Utilities for extracting and managing code snippets from Codat documentation"
+authors = [
+    {name = "Codat Documentation Team"}
+]
+readme = "README.md"
+requires-python = ">=3.8"
+dependencies = []
+
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "black",
+    "ruff",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
+
+[tool.black]
+line-length = 88
+target-version = ['py38']
+
+[tool.ruff]
+target-version = "py38"
+line-length = 88
+select = ["E", "F", "W", "I"]
+ignore = []
+
+[tool.ruff.isort]
+known-first-party = ["code_finder"]