diff --git a/docs/docs/sources/localfile.md b/docs/docs/sources/localfile.md index 3351d8ae..4db51ad0 100644 --- a/docs/docs/sources/localfile.md +++ b/docs/docs/sources/localfile.md @@ -23,6 +23,10 @@ The spec takes the following fields: ::: +* `max_file_size` (`int`, optional): if provided, files exceeding this size in bytes will be treated as non-existent and skipped during processing. + This is useful to avoid processing large files that are not relevant to your use case, such as videos or backups. + If not specified, no size limit is applied. + ### Schema The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields: diff --git a/python/cocoindex/sources/_engine_builtin_specs.py b/python/cocoindex/sources/_engine_builtin_specs.py index 6c90307b..0bd9c6d5 100644 --- a/python/cocoindex/sources/_engine_builtin_specs.py +++ b/python/cocoindex/sources/_engine_builtin_specs.py @@ -23,6 +23,9 @@ class LocalFile(op.SourceSpec): # See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns. excluded_patterns: list[str] | None = None + # If provided, files exceeding this size in bytes will be treated as non-existent. + max_file_size: int | None = None + class GoogleDrive(op.SourceSpec): """Import data from Google Drive.""" diff --git a/src/ops/sources/local_file.rs b/src/ops/sources/local_file.rs index 6701f99e..332a23cc 100644 --- a/src/ops/sources/local_file.rs +++ b/src/ops/sources/local_file.rs @@ -13,12 +13,14 @@ pub struct Spec { binary: bool, included_patterns: Option>, excluded_patterns: Option>, + max_file_size: Option, } struct Executor { root_path: PathBuf, binary: bool, pattern_matcher: PatternMatcher, + max_file_size: Option, } #[async_trait] @@ -49,6 +51,14 @@ impl SourceExecutor for Executor { new_dirs.push(Cow::Owned(path)); } } else if self.pattern_matcher.is_file_included(relative_path) { + // Check file size limit + if let Some(max_size) = self.max_file_size { + if let Ok(metadata) = path.metadata() { + if metadata.len() > max_size as u64 { + continue; + } + } + } let ordinal: Option = if options.include_ordinal { Some(path.metadata()?.modified()?.try_into()?) } else { @@ -86,6 +96,18 @@ impl SourceExecutor for Executor { }); } let path = self.root_path.join(path); + // Check file size limit + if let Some(max_size) = self.max_file_size { + if let Ok(metadata) = path.metadata() { + if metadata.len() > max_size as u64 { + return Ok(PartialSourceRowData { + value: Some(SourceValue::NonExistence), + ordinal: Some(Ordinal::unavailable()), + content_version_fp: None, + }); + } + } + } let ordinal = if options.include_ordinal { Some(path.metadata()?.modified()?.try_into()?) } else { @@ -172,6 +194,7 @@ impl SourceFactoryBase for Factory { root_path: PathBuf::from(spec.path), binary: spec.binary, pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?, + max_file_size: spec.max_file_size, })) } }