diff --git a/Cargo.toml b/Cargo.toml index 93866065..d7f2e075 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,11 +49,34 @@ console-subscriber = "0.4.1" env_logger = "0.11.7" reqwest = { version = "0.12.13", features = ["json"] } async-openai = "0.28.0" - tree-sitter = "0.25.3" tree-sitter-language = "0.1.5" -tree-sitter-python = "0.23.6" +# Per language tree-sitter parsers +tree-sitter-c = "0.23.4" +tree-sitter-cpp = "0.23.4" +tree-sitter-c-sharp = "0.23.1" +tree-sitter-css = "0.23.2" +tree-sitter-fortran = "0.5.0" +tree-sitter-go = "0.23.4" +tree-sitter-html = "0.23.2" +tree-sitter-java = "0.23.5" tree-sitter-javascript = "0.23.1" -tree-sitter-typescript = "0.23.2" +tree-sitter-json = "0.24.8" tree-sitter-md = "0.3.2" +tree-sitter-pascal = "0.10.0" +tree-sitter-php = "0.23.11" +tree-sitter-python = "0.23.6" +tree-sitter-r = "1.1.0" +tree-sitter-ruby = "0.23.1" +tree-sitter-rust = "0.23.2" +tree-sitter-scala = "0.23.4" +tree-sitter-scss = "1.0.0" +tree-sitter-sequel = "0.3.8" +tree-sitter-swift = "0.7.0" +tree-sitter-toml-ng = "0.7.0" +tree-sitter-typescript = "0.23.2" +tree-sitter-xml = "0.7.0" +tree-sitter-yaml = "0.7.0" + globset = "0.4.16" +unicase = "2.8.1" diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py index be41b2d7..e06c089e 100644 --- a/examples/code_embedding/main.py +++ b/examples/code_embedding/main.py @@ -16,8 +16,9 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind Define an example flow that embeds files into a vector database. 
""" data_scope["files"] = flow_builder.add_source( - cocoindex.sources.LocalFile(path="../../python", included_patterns=["**/*.py"])) - + cocoindex.sources.LocalFile(path="../..", + included_patterns=["*.py"], + excluded_patterns=[".*"])) code_embeddings = data_scope.add_collector() with data_scope["files"].row() as file: diff --git a/src/ops/functions/split_recursively.rs b/src/ops/functions/split_recursively.rs index 8288c864..150109be 100644 --- a/src/ops/functions/split_recursively.rs +++ b/src/ops/functions/split_recursively.rs @@ -4,6 +4,7 @@ use regex::{Matches, Regex}; use std::collections::HashSet; use std::sync::LazyLock; use std::{collections::HashMap, sync::Arc}; +use unicase::UniCase; use crate::base::field_attrs; use crate::{fields_value, ops::sdk::*}; @@ -31,10 +32,10 @@ struct LanguageConfig { } fn add_language<'a>( - output: &'a mut HashMap<&'static str, Arc>, + output: &'a mut HashMap, Arc>, name: &'static str, aliases: impl IntoIterator, - lang_fn: tree_sitter_language::LanguageFn, + lang_fn: impl Into, terminal_node_kinds: impl IntoIterator, ) { let tree_sitter_lang: tree_sitter::Language = lang_fn.into(); @@ -58,49 +59,143 @@ fn add_language<'a>( terminal_node_kind_ids, }); for name in std::iter::once(name).chain(aliases.into_iter()) { - if output.insert(name, config.clone()).is_some() { + if output.insert(name.into(), config.clone()).is_some() { panic!("Language `{name}` already exists"); } } } -static TREE_SITTER_LANGUAGE_BY_LANG: LazyLock>> = +static TREE_SITTER_LANGUAGE_BY_LANG: LazyLock, Arc>> = LazyLock::new(|| { let mut map = HashMap::new(); + add_language(&mut map, "C", [".c"], tree_sitter_c::LANGUAGE, []); add_language( &mut map, - "Python", - ["py", "python"], - tree_sitter_python::LANGUAGE, + "C++", + [".cpp", ".cc", ".cxx", ".h", ".hpp", "cpp"], + tree_sitter_cpp::LANGUAGE, + [], + ); + add_language( + &mut map, + "C#", + [".cs", "cs"], + tree_sitter_c_sharp::LANGUAGE, + [], + ); + add_language(&mut map, "CSS", [".css"], 
tree_sitter_css::LANGUAGE, []); + add_language( + &mut map, + "Fortran", + [".f", ".f90", ".f95", ".f03", "f", "f90", "f95", "f03"], + tree_sitter_fortran::LANGUAGE, + [], + ); + add_language( + &mut map, + "Go", + [".go", "golang"], + tree_sitter_go::LANGUAGE, + [], + ); + add_language( + &mut map, + "HTML", + [".html", ".htm"], + tree_sitter_html::LANGUAGE, [], ); + add_language(&mut map, "Java", [".java"], tree_sitter_java::LANGUAGE, []); add_language( &mut map, "JavaScript", - ["JS", "js", "Javascript", "javascript"], + [".js", "js"], tree_sitter_javascript::LANGUAGE, [], ); + add_language(&mut map, "JSON", [".json"], tree_sitter_json::LANGUAGE, []); add_language( &mut map, - "TypeScript", - ["TS", "ts", "Typescript", "typescript"], - tree_sitter_typescript::LANGUAGE_TYPESCRIPT, + "Markdown", + [".md", "md"], + tree_sitter_md::LANGUAGE, + ["inline"], + ); + add_language( + &mut map, + "Pascal", + [".pas", "pas", ".dpr", "dpr", "Delphi"], + tree_sitter_pascal::LANGUAGE, + [], + ); + add_language(&mut map, "PHP", [".php"], tree_sitter_php::LANGUAGE_PHP, []); + add_language( + &mut map, + "Python", + [".py"], + tree_sitter_python::LANGUAGE, + [], + ); + add_language(&mut map, "R", [".r"], tree_sitter_r::LANGUAGE, []); + add_language(&mut map, "Ruby", [".rb"], tree_sitter_ruby::LANGUAGE, []); + add_language( + &mut map, + "Rust", + [".rs", "rs"], + tree_sitter_rust::LANGUAGE, + [], + ); + add_language( + &mut map, + "Scala", + [".scala"], + tree_sitter_scala::LANGUAGE, + [], + ); + add_language( + &mut map, + "SCSS", + [".scss"], + tree_sitter_scss::language(), + [], + ); + add_language(&mut map, "SQL", [".sql"], tree_sitter_sequel::LANGUAGE, []); + add_language( + &mut map, + "Swift", + [".swift"], + tree_sitter_swift::LANGUAGE, + [], + ); + add_language( + &mut map, + "TOML", + [".toml"], + tree_sitter_toml_ng::LANGUAGE, [], ); add_language( &mut map, "TSX", - ["tsx"], + [".tsx"], tree_sitter_typescript::LANGUAGE_TSX, [], ); add_language( &mut map, - "Markdown", 
- ["md", "markdown"], - tree_sitter_md::LANGUAGE.into(), - ["inline"], + "TypeScript", + [".ts", "ts"], + tree_sitter_typescript::LANGUAGE_TYPESCRIPT, + [], + ); + add_language(&mut map, "XML", [".xml"], tree_sitter_xml::LANGUAGE_XML, []); + add_language(&mut map, "DTD", [".dtd"], tree_sitter_xml::LANGUAGE_DTD, []); + add_language( + &mut map, + "YAML", + [".yaml", ".yml"], + tree_sitter_yaml::LANGUAGE, + [], ); map }); @@ -416,7 +511,7 @@ impl SimpleFunctionExecutor for Executor { .optional() .map(|v| anyhow::Ok(v.as_str()?.as_ref())) .transpose()? - .and_then(|lang| TREE_SITTER_LANGUAGE_BY_LANG.get(lang)) + .and_then(|lang| TREE_SITTER_LANGUAGE_BY_LANG.get(&UniCase::new(lang))) }; let recursive_chunker = RecursiveChunker {