docling-project
diff --git a/‎docling_core/transforms/chunker/__init__.py‎
Lines changed: 10 additions & 4 deletions b/‎docling_core/transforms/chunker/__init__.py‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎docling_core/transforms/chunker/base_code_chunker.py‎
Lines changed: 1 addition & 1 deletion b/‎docling_core/transforms/chunker/base_code_chunker.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docling_core/transforms/chunker/code_chunk_utils/types.py‎
Lines changed: 0 additions & 35 deletions b/‎docling_core/transforms/chunker/code_chunk_utils/types.py‎
Lines changed: 0 additions & 35 deletions
diff --git a/‎docling_core/transforms/chunker/code_chunk_utils/utils.py‎
Lines changed: 12 additions & 0 deletions b/‎docling_core/transforms/chunker/code_chunk_utils/utils.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docling_core/transforms/chunker/code_chunking_strategy.py‎
Lines changed: 287 additions & 0 deletions b/‎docling_core/transforms/chunker/code_chunking_strategy.py‎
Lines changed: 287 additions & 0 deletions
@@ -13,15 +13,21 @@
     ChunkSizeProcessor,
     RangeTracker,
 )
-from docling_core.transforms.chunker.code_chunk_utils.types import (
+from docling_core.transforms.chunker.code_chunk_utils.utils import Language
+from docling_core.transforms.chunker.code_chunking_strategy import (
+    CodeChunkingStrategyFactory,
+    DefaultCodeChunkingStrategy,
+    LanguageDetector,
+    NoOpCodeChunkingStrategy,
+)
+from docling_core.transforms.chunker.hierarchical_chunker import (
     ChunkType,
     CodeChunk,
+    CodeChunkingStrategy,
     CodeDocMeta,
-)
-from docling_core.transforms.chunker.code_chunk_utils.utils import Language
-from docling_core.transforms.chunker.hierarchical_chunker import (
     DocChunk,
     DocMeta,
     HierarchicalChunker,
 )
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 from docling_core.transforms.chunker.page_chunker import PageChunker
@@ -8,12 +8,12 @@
     ChunkSizeProcessor,
     RangeTracker,
 )
-from docling_core.transforms.chunker.code_chunk_utils.types import CodeChunk
 from docling_core.transforms.chunker.code_chunk_utils.utils import (
     Language,
     get_children,
     to_str,
 )
+from docling_core.transforms.chunker.hierarchical_chunker import CodeChunk
 from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
 from docling_core.types import DoclingDocument as DLDocument
 from docling_core.types.doc.labels import DocItemLabel
 
@@ -3,7 +3,7 @@
 
 from tree_sitter import Node
 
-from docling_core.transforms.chunker.code_chunk_utils.types import (
+from docling_core.transforms.chunker.hierarchical_chunker import (
     ChunkType,
     CodeChunk,
     CodeDocMeta,
 
@@ -10,6 +10,7 @@
 from tree_sitter import Node, Tree
 
 from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
+from docling_core.types.doc.labels import CodeLanguageLabel
 
 
 class Language(str, Enum):
@@ -47,6 +48,17 @@ def get_tree_sitter_language(self):
         else:
             return None
 
+    def to_code_language_label(self):
+        """Return the ``CodeLanguageLabel`` for this language.
+
+        Languages without an entry in the table below map to
+        ``CodeLanguageLabel.UNKNOWN``.
+        """
+        label_for = {
+            Language.PYTHON: CodeLanguageLabel.PYTHON,
+            Language.JAVA: CodeLanguageLabel.JAVA,
+            Language.C: CodeLanguageLabel.C,
+            Language.TYPESCRIPT: CodeLanguageLabel.TYPESCRIPT,
+            Language.JAVASCRIPT: CodeLanguageLabel.JAVASCRIPT,
+        }
+        if self in label_for:
+            return label_for[self]
+        return CodeLanguageLabel.UNKNOWN
+
     def get_import_query(self) -> Optional[str]:
         if self == Language.PYTHON:
             return """
 
@@ -0,0 +1,287 @@
+from typing import Any, Dict, Iterator, Optional
+
+from docling_core.transforms.chunker.base_code_chunker import CodeChunker
+from docling_core.transforms.chunker.code_chunk_utils.utils import Language
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    ChunkType,
+    CodeChunk,
+    CodeDocMeta,
+)
+from docling_core.transforms.chunker.language_code_chunkers import (
+    CFunctionChunker,
+    JavaFunctionChunker,
+    JavaScriptFunctionChunker,
+    PythonFunctionChunker,
+    TypeScriptFunctionChunker,
+)
+from docling_core.types.doc.base import Size
+from docling_core.types.doc.document import (
+    CodeItem,
+    DoclingDocument,
+    DocumentOrigin,
+    PageItem,
+)
+from docling_core.utils.legacy import _create_hash
+
+
+class LanguageDetector:
+    """Utility class for detecting programming languages from code content and file extensions.
+
+    Content detection is a best-effort substring heuristic: the code is
+    lower-cased and searched for language-typical fragments, checking the
+    languages in a fixed order and returning the first match.
+    """
+
+    # Every pattern below is lower-case because it is matched against a
+    # lower-cased copy of the code; a mixed-case pattern could never match.
+    _PYTHON_PATTERNS = (
+        "def ",
+        "import ",
+        "from ",
+        'if __name__ == "__main__"',
+        "print(",
+        "lambda ",
+        "yield ",
+        "async def",
+    )
+    _PYTHON_EXCLUSIONS = ("public class", "private ", "protected ", "package ")
+
+    _GO_PATTERNS = (
+        "package main",
+        "func main()",
+        'import "fmt"',
+        'import "os"',
+        "chan ",
+        "interface{}",
+        "go func",
+        "defer ",
+        ":= ",
+    )
+    _GO_EXCLUSIONS = (
+        "public class",
+        "import java.",
+        "system.out.println",
+        "extends ",
+        "implements ",
+    )
+
+    _JAVA_PATTERNS = (
+        "public class",
+        "package ",
+        "import java.",
+        "public static void main",
+        "extends ",
+        "implements ",
+        "string[]",
+        "system.out.println",
+    )
+    _JAVA_EXCLUSIONS = ("package main", "func main()", "chan ", "interface{}")
+
+    _TYPESCRIPT_PATTERNS = (
+        ": string",
+        ": number",
+        ": boolean",
+        "interface ",
+        "type ",
+        "enum ",
+        "public ",
+        "private ",
+        "protected ",
+    )
+
+    _JAVASCRIPT_PATTERNS = (
+        "function ",
+        "const ",
+        "let ",
+        "var ",
+        "=>",
+        "require(",
+        "module.exports",
+        "export ",
+        "import ",
+        "console.log",
+    )
+
+    _C_PATTERNS = (
+        "#include",
+        "int main(",
+        "void ",
+        "char ",
+        "float ",
+        "double ",
+        "struct ",
+        "#define",
+        "printf(",
+        "scanf(",
+    )
+
+    @staticmethod
+    def _contains_any(text: str, patterns: tuple) -> bool:
+        """Return True if at least one of ``patterns`` is a substring of ``text``."""
+
+        return any(pattern in text for pattern in patterns)
+
+    @staticmethod
+    def detect_from_extension(filename: Optional[str]) -> Optional[Language]:
+        """Detect language from file extension.
+
+        The comparison is case-insensitive. Returns the first ``Language``
+        whose ``file_extensions()`` contains a suffix of ``filename``, or
+        ``None`` when ``filename`` is empty or no extension matches.
+        """
+
+        if not filename:
+            return None
+
+        filename_lower = filename.lower()
+
+        for language in Language:
+            if any(filename_lower.endswith(ext) for ext in language.file_extensions()):
+                return language
+        return None
+
+    @staticmethod
+    def detect_from_content(code_text: str) -> Optional[Language]:
+        """Detect language from code content using heuristics.
+
+        Languages are tried in the order Python, Go, Java, TypeScript,
+        JavaScript, C; the first set of patterns that matches decides the
+        result. Returns ``None`` for empty input, for Go-looking code, and
+        when nothing matches.
+        """
+
+        if not code_text:
+            return None
+
+        code_lower = code_text.lower().strip()
+        contains_any = LanguageDetector._contains_any
+
+        if contains_any(
+            code_lower, LanguageDetector._PYTHON_PATTERNS
+        ) and not contains_any(code_lower, LanguageDetector._PYTHON_EXCLUSIONS):
+            return Language.PYTHON
+
+        # These patterns look like Go. Returning None here also keeps such code
+        # from reaching the Java/TypeScript/JavaScript checks below.
+        # NOTE(review): presumably None because there is no Go chunker - confirm.
+        if contains_any(
+            code_lower, LanguageDetector._GO_PATTERNS
+        ) and not contains_any(code_lower, LanguageDetector._GO_EXCLUSIONS):
+            return None
+
+        if contains_any(
+            code_lower, LanguageDetector._JAVA_PATTERNS
+        ) and not contains_any(code_lower, LanguageDetector._JAVA_EXCLUSIONS):
+            return Language.JAVA
+
+        if contains_any(code_lower, LanguageDetector._TYPESCRIPT_PATTERNS):
+            return Language.TYPESCRIPT
+
+        if contains_any(code_lower, LanguageDetector._JAVASCRIPT_PATTERNS):
+            return Language.JAVASCRIPT
+
+        if contains_any(code_lower, LanguageDetector._C_PATTERNS):
+            return Language.C
+
+        return None
+
+    @staticmethod
+    def detect_language(
+        code_text: str, filename: Optional[str] = None
+    ) -> Optional[Language]:
+        """Detect language from both filename and content.
+
+        A recognised file extension takes precedence. When no filename is
+        given, or its extension is not recognised, the content heuristics
+        are used instead.
+        """
+
+        if filename:
+            lang = LanguageDetector.detect_from_extension(filename)
+            if lang:
+                return lang
+
+        # No usable extension: fall back to inspecting the code itself.
+        return LanguageDetector.detect_from_content(code_text)
+
+
+class CodeChunkingStrategyFactory:
+    """Factory for creating language-specific code chunking strategies."""
+
+    @staticmethod
+    def create_chunker(language: Language, **kwargs: Any) -> CodeChunker:
+        """Instantiate the chunker class registered for ``language``.
+
+        Args:
+            language: Language whose chunker should be built.
+            **kwargs: Forwarded unchanged to the chunker constructor.
+
+        Raises:
+            ValueError: If no chunker class is registered for ``language``.
+        """
+
+        registry = {
+            Language.PYTHON: PythonFunctionChunker,
+            Language.TYPESCRIPT: TypeScriptFunctionChunker,
+            Language.JAVASCRIPT: JavaScriptFunctionChunker,
+            Language.C: CFunctionChunker,
+            Language.JAVA: JavaFunctionChunker,
+        }
+
+        if language not in registry:
+            raise ValueError(f"No chunker available for language: {language}")
+
+        return registry[language](**kwargs)
+
+
+class DefaultCodeChunkingStrategy:
+    """Default implementation of CodeChunkingStrategy that uses language detection and appropriate chunkers."""
+
+    def __init__(self, **chunker_kwargs: Any):
+        """Store the keyword arguments passed to every chunker this strategy creates."""
+
+        self.chunker_kwargs = chunker_kwargs
+        # One chunker instance per language, created on first use.
+        self._chunker_cache: Dict[Language, CodeChunker] = {}
+
+    def _get_chunker(self, language: Language) -> CodeChunker:
+        """Return the cached chunker for ``language``, creating it on first request."""
+
+        cached = self._chunker_cache.get(language)
+        if cached is None:
+            cached = CodeChunkingStrategyFactory.create_chunker(
+                language, **self.chunker_kwargs
+            )
+            self._chunker_cache[language] = cached
+        return cached
+
+    def chunk_code_item(
+        self,
+        code_text: str,
+        language: Language,
+        original_doc=None,
+        original_item=None,
+        **kwargs: Any,
+    ) -> Iterator[CodeChunk]:
+        """Chunk a single code item using the appropriate language chunker.
+
+        The code is wrapped in a one-item ``DoclingDocument`` and handed to
+        the chunker for ``language``. Filename and mimetype are taken from
+        ``original_doc.origin`` when available, and the item reference from
+        ``original_item.self_ref``; otherwise fixed placeholders are used.
+        Whitespace-only code yields nothing.
+        """
+
+        if not code_text.strip():
+            return
+
+        chunker = self._get_chunker(language)
+
+        # The hash depends only on the code text, whatever the origin is.
+        binary_hash = _create_hash(code_text)
+        filename = "code_chunk"
+        mimetype = "text/plain"
+        if original_doc and original_doc.origin:
+            source_origin = original_doc.origin
+            filename = source_origin.filename or filename
+            mimetype = source_origin.mimetype or mimetype
+
+        self_ref = (
+            original_item.self_ref
+            if original_item and hasattr(original_item, "self_ref")
+            else "#/texts/0"
+        )
+
+        origin = DocumentOrigin(
+            filename=filename, mimetype=mimetype, binary_hash=binary_hash
+        )
+        single_page = PageItem(page_no=0, size=Size(width=612.0, height=792.0))
+        wrapper_doc = DoclingDocument(
+            name=filename,
+            texts=[CodeItem(text=code_text, self_ref=self_ref, orig=code_text)],
+            pages={0: single_page},
+            origin=origin,
+        )
+
+        yield from chunker.chunk(wrapper_doc, **kwargs)
+
+
+class NoOpCodeChunkingStrategy:
+    """No-operation code chunking strategy that returns the original code as a single chunk."""
+
+    def chunk_code_item(
+        self,
+        code_text: str,
+        language: Language,
+        original_doc=None,
+        original_item=None,
+        **kwargs: Any,
+    ) -> Iterator[CodeChunk]:
+        """Yield ``code_text`` unchanged as one ``CODE_BLOCK`` chunk.
+
+        The chunk spans line 1 through the number of lines in ``code_text``.
+        Whitespace-only input yields nothing. ``language``, ``original_doc``,
+        ``original_item`` and ``kwargs`` are accepted but not used.
+        """
+
+        if code_text.strip():
+            line_count = len(code_text.splitlines())
+            yield CodeChunk(
+                text=code_text,
+                meta=CodeDocMeta(
+                    chunk_type=ChunkType.CODE_BLOCK,
+                    start_line=1,
+                    end_line=line_count,
+                ),
+            )
Original file line number	Diff line number	Diff line change
`@@ -8,12 +8,12 @@`
`8`	`8`	`ChunkSizeProcessor,`
`9`	`9`	`RangeTracker,`
`10`	`10`	`)`
`11`		`-from docling_core.transforms.chunker.code_chunk_utils.types import CodeChunk`
`12`	`11`	`from docling_core.transforms.chunker.code_chunk_utils.utils import (`
`13`	`12`	`Language,`
`14`	`13`	`get_children,`
`15`	`14`	`to_str,`
`16`	`15`	`)`
	`16`	`+from docling_core.transforms.chunker.hierarchical_chunker import CodeChunk`
`17`	`17`	`from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer`
`18`	`18`	`from docling_core.types import DoclingDocument as DLDocument`
`19`	`19`	`from docling_core.types.doc.labels import DocItemLabel`