High-performance PDF text extraction for vectorization pipelines.
- Fast: 40-134 MiB/s throughput (15-143x faster than pdf-extract)
- Parallel: Multi-threaded page extraction with rayon
- Streaming: Constant memory usage for large documents
- Chunking: Built-in text chunking for RAG/embedding pipelines
- Metadata: Extract title, author, dates without parsing pages
[dependencies]
pdfvec = "0.1"

// One-liner extraction
let text = pdfvec::extract(&std::fs::read("document.pdf")?)?;

use pdfvec::{Extractor, Result};

fn main() -> Result<()> {
    let bytes = std::fs::read("document.pdf")?;

    // Default settings: one call, whole document returned as a single string.
    let text = Extractor::new().extract(&bytes)?;

    // Builder-style configuration: parallel page extraction, a custom
    // separator between pages, and whitespace normalization.
    let text = Extractor::new()
        .parallel(true)
        .page_separator("\n---\n")
        .normalize_whitespace(true)
        .extract(&bytes)?;

    println!("{text}");
    Ok(())
}

use pdfvec::{Extractor, Result};
fn main() -> Result<()> {
    let bytes = std::fs::read("document.pdf")?;

    // Structured extraction: keeps per-page boundaries and counts
    // instead of flattening everything into one string.
    let document = Extractor::new().extract_document(&bytes)?;

    println!("Pages: {}", document.page_count());
    println!("Total chars: {}", document.total_chars());
    for page in document.pages() {
        println!("Page {}: {} chars", page.number(), page.char_count());
    }
    Ok(())
}

use pdfvec::{Extractor, Result};
fn main() -> Result<()> {
    let bytes = std::fs::read("large.pdf")?;

    // Streaming iteration: each page is yielded as a Result and can be
    // processed (and dropped) before the next one is parsed, keeping
    // memory usage flat for large documents.
    for result in Extractor::new().pages(&bytes) {
        let page = result?;
        println!("Page {}: {} chars", page.number(), page.char_count());
    }
    Ok(())
}

use pdfvec::{Chunker, ChunkStrategy};
let text = "First sentence. Second sentence.\n\nNew paragraph here.";

// Fixed-size windows of 100 characters with a 20-character overlap.
let fixed_chunks: Vec<_> = Chunker::new(ChunkStrategy::Fixed)
    .chunk_size(100)
    .overlap(20)
    .chunks(text)
    .collect();

// One chunk per paragraph.
let paragraph_chunks: Vec<_> = Chunker::new(ChunkStrategy::Paragraph)
    .chunks(text)
    .collect();

// One chunk per sentence.
let chunks: Vec<_> = Chunker::new(ChunkStrategy::Sentence)
    .chunks(text)
    .collect();

for chunk in chunks {
    println!("[{}] {}", chunk.index(), chunk.text());
}

use pdfvec::{Extractor, Result};
fn main() -> Result<()> {
    let bytes = std::fs::read("document.pdf")?;

    // Reads title/author/page count without extracting any page text.
    let info = Extractor::new().extract_metadata(&bytes)?;

    println!("Title: {:?}", info.title());
    println!("Author: {:?}", info.author());
    println!("Pages: {}", info.page_count());
    if let Some(date) = info.creation_date() {
        println!("Created: {}", date.format("%Y-%m-%d"));
    }
    Ok(())
}

# Install
cargo install pdfvec
# Extract text
pdfvec extract document.pdf
# Extract to file
pdfvec extract document.pdf -o output.txt
# Show metadata
pdfvec metadata document.pdf
# Process directory
pdfvec extract ./papers/ -o ./output/

Benchmarked against pdf-extract on academic papers:
| File Size | pdfvec | pdf-extract | Speedup |
|---|---|---|---|
| 33 KB | 818 µs | 12.7 ms | 15x |
| 94 KB | 1.5 ms | 83 ms | 55x |
| 422 KB | 3.1 ms | 439 ms | 143x |
Throughput: 40-134 MiB/s vs 0.9-2.6 MiB/s
Licensed under either of:
- Apache License, Version 2.0 (LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0)
- MIT license (LICENSE-MIT or http://opensource.org/licenses/MIT)
at your option.