emstar is a high-performance Rust library for reading and writing STAR files.
emstar provides fast, memory-efficient parsing, reading, and writing of the STAR file format commonly used in cryo-EM workflows.
- ⚡ High Performance: Written in Rust with zero-copy parsing where possible
- 🎯 Type-Safe: Strongly typed API with comprehensive error handling
- 📊 Flexible Data: Support for both simple blocks (key-value) and loop blocks (tabular data)
- 🗂️ CRUD Operations: Create, Read, Update, Delete APIs for easy data manipulation
- 📈 Statistics API: Analyze STAR file contents with detailed statistics
- 🧪 Well-Tested: Comprehensive test suite with integration tests
- 📊 Benchmarks: Performance benchmarks for large-scale data processing
- 🔧 Easy to Use: Simple, intuitive API similar to the Python starfile package
Add to your Cargo.toml:
[dependencies]
emstar = "0.1"
use emstar::{read, write, DataBlock, DataValue, LoopBlock};
use std::collections::HashMap;
// Read a STAR file
let data_blocks = read("particles.star", None)?;
// Access a data block
if let Some(DataBlock::Loop(particles)) = data_blocks.get("particles") {
println!("Found {} particles", particles.row_count());
println!("Columns: {:?}", particles.columns());
}
// Write modified data
write(&data_blocks, "modified_particles.star", None)?;
# Ok::<(), emstar::Error>(())
use emstar::{write, DataBlock, DataValue, LoopBlock, SimpleBlock};
use std::collections::HashMap;
// Create a simple block using array initialization
let simple: SimpleBlock = [
("rlnImageSize", DataValue::Integer(256)),
("rlnPixelSize", DataValue::Float(1.23)),
].into();
// Create a loop block using the builder pattern
let particles = LoopBlock::builder()
.columns(&["rlnCoordinateX", "rlnCoordinateY"])
.rows(vec![
vec![DataValue::Float(91.7987), DataValue::Float(83.6226)],
vec![DataValue::Float(97.6358), DataValue::Float(80.4370)],
])
.build()?;
// Combine into data blocks
let mut data = HashMap::new();
data.insert("general".to_string(), DataBlock::Simple(simple));
data.insert("particles".to_string(), DataBlock::Loop(particles));
// Write to file
write(&data, "output.star", None)?;
# Ok::<(), emstar::Error>(())
emstar provides simple read/write operations. For file management, use the Rust standard library:
use emstar::{read, write, stats};
use std::path::Path;
use std::fs;
// Read a file
let data = read("particles.star", None)?;
// Write a file (creates new or overwrites existing)
write(&data, "output.star", None)?;
// Check if file exists
if Path::new("particles.star").exists() {
println!("File exists!");
}
// Delete a file
fs::remove_file("old.star")?;
// Get file statistics
let file_stats = stats("particles.star")?;
println!("Total blocks: {}", file_stats.n_blocks);
println!("Total particles: {}", file_stats.total_loop_rows);
emstar provides optional configuration for reading and writing:
Filter and customize reading behavior:
use emstar::{read, ReadOptions};
// Read only specific blocks
let options = ReadOptions {
blocks_to_read: Some(vec!["particles".to_string()]),
..Default::default()
};
let data = read("particles.star", Some(options))?;
// Skip loop blocks (metadata-only mode)
let options = ReadOptions {
skip_loop_blocks: true,
..Default::default()
};
let metadata = read("particles.star", Some(options))?;
Customize output format:
use emstar::{write, WriteOptions};
let options = WriteOptions {
float_precision: Some(4), // 4 decimal places for floats
header_comment: Some("Generated by my pipeline".to_string()),
exclude_blocks: Some(vec!["temp_data".to_string()]),
..Default::default()
};
write(&data, "output.star", Some(options))?;
use emstar::{merge_with_file, DataBlock, LoopBlock, DataValue};
use std::collections::HashMap;
let mut new_blocks = HashMap::new();
let particles = LoopBlock::builder()
.columns(&["rlnCoordinateX", "rlnCoordinateY"])
.rows(vec![vec![DataValue::Float(100.0), DataValue::Float(200.0)]])
.build()?;
new_blocks.insert("new_particles".to_string(), DataBlock::Loop(particles));
merge_with_file(&new_blocks, "existing.star")?;
Quick validation without loading all data:
use emstar::validate;
match validate("large_file.star") {
Ok(details) => {
println!("File is valid! Blocks: {}", details.n_blocks);
}
Err(e) => println!("Validation failed: {}", e),
}
use emstar::merge;
merge(vec!["file1.star", "file2.star", "file3.star"], "merged.star")?;
Calculate statistics without loading all data:
use emstar::stats_streaming;
let stats = stats_streaming("huge_file.star")?;
println!("Total blocks: {}", stats.n_blocks);
println!("Total rows: {}", stats.total_loop_rows);
use emstar::{SimpleBlock, DataValue};
let mut block = SimpleBlock::new();
// Create
block.set("rlnImageSize", DataValue::Integer(256));
block.set("rlnPixelSize", DataValue::Float(1.06));
// Read
if let Some(value) = block.get("rlnImageSize") {
println!("Image size: {:?}", value);
}
// Update
block.set("rlnPixelSize", DataValue::Float(1.12));
// Delete
block.remove("rlnPixelSize");
block.clear(); // Remove all
// Utilities
let has_key = block.contains_key("rlnImageSize");
let count = block.len();
use emstar::{LoopBlock, DataValue};
let mut block = LoopBlock::new();
// Create - add columns and rows
block.add_column("rlnCoordinateX");
block.add_column("rlnCoordinateY");
block.add_row(vec![
DataValue::Float(100.0),
DataValue::Float(200.0),
])?;
// Read
let value = block.get(0, 0); // row 0, col 0
let value = block.get_by_name(0, "rlnCoordinateX");
let column = block.get_column("rlnCoordinateX");
// Update
block.set_by_name(0, "rlnCoordinateX", DataValue::Float(150.0))?;
// Delete
block.remove_row(0)?;
block.remove_column("rlnCoordinateY")?;
block.clear_rows(); // Keep columns, remove all rows
block.clear(); // Remove everything
// Utilities
let has_col = block.columns().contains(&"rlnCoordinateX");
let n_rows = block.row_count();
let n_cols = block.column_count();
// Iterate
for row in block.iter_rows() {
println!("{:?}", row);
}
For more ergonomic LoopBlock creation, use the builder pattern:
use emstar::{LoopBlock, DataValue};
// Create a LoopBlock using the builder
let particles = LoopBlock::builder()
.columns(&["rlnCoordinateX", "rlnCoordinateY", "rlnAnglePsi"])
.rows(vec![
vec![DataValue::Float(100.0), DataValue::Float(200.0), DataValue::Float(45.0)],
vec![DataValue::Float(150.0), DataValue::Float(250.0), DataValue::Float(90.0)],
])
.build()?;
assert_eq!(particles.row_count(), 2);
assert_eq!(particles.column_count(), 3);
Builder methods:
- .columns(&["col1", "col2"]) - Set all column names at once
- .rows(vec![vec![...], vec![...]]) - Add multiple rows at once
- .build() - Construct the LoopBlock
Access blocks without verbose pattern matching:
use emstar::{DataBlock, SimpleBlock, LoopBlock};
let data_blocks = emstar::read("particles.star", None)?;
// Using expect_simple/expect_loop (panics with message if wrong type)
if let Some(block) = data_blocks.get("general") {
let simple_block = block.expect_simple("general should be a SimpleBlock");
}
if let Some(block) = data_blocks.get("particles") {
let loop_block = block.expect_loop("particles should be a LoopBlock");
}
// Using as_simple/as_loop (returns Option)
if let Some(simple) = data_blocks.get("general").and_then(|b| b.as_simple()) {
// Work with SimpleBlock
}
// Check block type
if data_blocks.get("particles").is_some_and(|b| b.is_loop()) {
// It's a LoopBlock
}
Create a SimpleBlock from an array of key-value pairs:
use emstar::{SimpleBlock, DataValue};
let block: SimpleBlock = [
("rlnImageSize", DataValue::Integer(256)),
("rlnPixelSize", DataValue::Float(1.06)),
("rlnVoltage", DataValue::Float(300.0)),
].into();
assert_eq!(block.len(), 3);
Analyze STAR file contents:
use emstar::{read, stats, DataBlock, DataBlockStats, StarStats};
use std::collections::HashMap;
// Get statistics from file (loads entire file into memory)
let file_stats = stats("particles.star")?;
println!("Total blocks: {}", file_stats.n_blocks);
println!("SimpleBlocks: {}", file_stats.n_simple_blocks);
println!("LoopBlocks: {}", file_stats.n_loop_blocks);
println!("Total particles: {}", file_stats.total_loop_rows);
// Get specific block stats
if let Some(DataBlockStats::Loop(l)) = file_stats.get_block_stats("particles") {
println!("Particles: {} rows x {} cols", l.n_rows, l.n_cols);
}
// Get stats from in-memory data
let blocks: HashMap<String, DataBlock> = read("particles.star", None)?;
let mem_stats = StarStats::from_blocks(&blocks);
emstar provides strongly typed data representations:
Represents a single value in a STAR file:
- DataValue::String(String) - String values
- DataValue::Integer(i64) - Integer values
- DataValue::Float(f64) - Floating-point values
- DataValue::Null - Null/NA values
Represents a data block in a STAR file:
- DataBlock::Simple(SimpleBlock) - Key-value pairs
- DataBlock::Loop(LoopBlock) - Tabular data with columns and rows
Key-value pairs for simple data blocks:
let mut block = SimpleBlock::new();
block.set("key", DataValue::String("value".into()));
let value = block.get("key");
// Statistics
let stats = block.stats();
println!("Entries: {}", stats.n_entries);
Tabular data for loop blocks:
// Using the builder pattern (recommended)
let block = LoopBlock::builder()
.columns(&["col1", "col2"])
.rows(vec![vec![DataValue::Integer(1), DataValue::Integer(2)]])
.build()?;
// Or using traditional methods
let mut block = LoopBlock::new();
block.add_column("col1");
block.add_column("col2");
block.add_row(vec![DataValue::Integer(1), DataValue::Integer(2)])?;
let n_rows = block.row_count();
let n_cols = block.column_count();
let value = block.get(0, 0); // Get value at row 0, col 0
let value = block.get_by_name(0, "col2"); // Get value by column name (row first!)
// Statistics
let stats = block.stats();
println!("Rows: {}, Cols: {}, Cells: {}", stats.n_rows, stats.n_cols, stats.n_cells);
| Function | Description |
|---|---|
| read(path, options) | Read a STAR file from disk |
| write(&data, path, options) | Write data to a STAR file (creates or overwrites) |
| to_string(&data) | Convert data to STAR format string |
| list_blocks(&blocks) | List all blocks with their names and types |
| merge_with_file(&blocks, path) | Merge blocks with an existing file |
| validate(path) | Validate a STAR file without loading all data |
| merge(paths, output_path) | Merge multiple STAR files |
| stats_streaming(path) | Calculate statistics without loading all data |
For file management (delete, exists), use std::fs and std::path::Path.
| Function | Description |
|---|---|
| stats(path) | Calculate statistics for a STAR file |
| Method | Description |
|---|---|
| new() | Create a new empty SimpleBlock |
| get(key) | Read value by key |
| set(key, value) | Create/Update value |
| remove(key) | Delete key-value pair |
| contains_key(key) | Check if key exists |
| contains_value(value) | Check if value exists |
| keys() | Get all keys |
| values() | Get iterator over all values |
| iter() | Iterate over key-value pairs |
| len() | Get number of entries |
| is_empty() | Check if block is empty |
| clear() | Remove all entries |
| drain() | Remove and return all entries |
| retain(f) | Retain entries matching predicate |
| first_value() | Get the first value |
| get_or_insert(key) | Get value or insert Null |
| stats() | Get block statistics |
| Method | Description |
|---|---|
| new() | Create a new empty LoopBlock |
| with_columns(names) | Create with specified columns |
| get(row, col) | Read value at position |
| get_by_name(row, col_name) | Read value by column name |
| get_f64(row, col_name) | Get value as f64 (auto-converts) |
| get_i64(row, col_name) | Get value as i64 (auto-converts) |
| get_string(row, col_name) | Get value as string |
| get_column(name) | Get entire column as `Vec<DataValue>` |
| iter_rows() | Iterate over all rows |
| column_iter_f64(name) | Iterate column as `Option<f64>` |
| column_iter_i64(name) | Iterate column as `Option<i64>` |
| column_iter_str(name) | Iterate column as `Option<&str>` |
| column_metadata(name) | Get column metadata (type, len, nulls) |
| add_column(name) | Add a column |
| add_row(values) | Add a row |
| set(row, col, value) | Update cell by indices |
| set_by_name(row, col_name, value) | Update cell by name |
| remove_row(idx) | Delete a row |
| remove_column(name) | Delete a column |
| clear_rows() | Remove all rows |
| clear() | Remove all data |
| row_count() | Get number of rows |
| column_count() | Get number of columns |
| stats() | Get block statistics |
| builder() | Create a LoopBlockBuilder (fluent API) |
| as_dataframe() | Access underlying Polars DataFrame |
| from_dataframe(df) | Create LoopBlock from Polars DataFrame |
| Method | Description |
|---|---|
| new() | Create a new builder |
| columns(names) | Set column names |
| rows(data) | Set all rows |
| validate() | Validate builder state |
| build() | Build the LoopBlock |
| build_validated() | Build with validation |
| Method | Description |
|---|---|
| block_type() | Get type name ("SimpleBlock" or "LoopBlock") |
| is_simple() | Check if block is a SimpleBlock |
| is_loop() | Check if block is a LoopBlock |
| as_simple() | Get SimpleBlock reference (returns Option) |
| as_loop() | Get LoopBlock reference (returns Option) |
| as_simple_mut() | Get mutable SimpleBlock reference |
| as_loop_mut() | Get mutable LoopBlock reference |
| expect_simple(msg) | Get SimpleBlock or panic with message |
| expect_loop(msg) | Get LoopBlock or panic with message |
| count() | Get entry count (Simple) or row count (Loop) |
| stats() | Get block statistics |
| Type | Description |
|---|---|
| StarStats | Comprehensive STAR file statistics |
| StarStats::from_blocks(&blocks) | Create stats from in-memory blocks |
| StarStats::get_block_stats(name) | Get stats for specific block |
| StarStats::has_loop_blocks() | Check if file has any LoopBlocks |
| StarStats::has_simple_blocks() | Check if file has any SimpleBlocks |
| DataBlockStats | Block-level statistics (enum) |
| LoopBlockStats | LoopBlock statistics (rows, cols, cells) |
| SimpleBlockStats | SimpleBlock statistics (entries) |
| Type | Fields | Description |
|---|---|---|
| ReadOptions | blocks_to_read, skip_loop_blocks, skip_simple_blocks | Options for reading STAR files |
| WriteOptions | float_precision, header_comment, exclude_blocks | Options for writing STAR files |
| ValidationDetails | n_blocks, estimated_size_bytes, block_names, is_empty | File validation results |
| ColumnMetadata | name, dtype, len, null_count | Column metadata for inspection |
| Method | Description |
|---|---|
| as_integer() | Convert to i64 (returns Option) |
| as_float() | Convert to f64 (returns Option) |
| as_string() | Convert to &str (returns Option) |
| as_bool() | Convert to bool (returns Option) |
| is_null() | Check if value is Null |
| is_nan() | Check if Float is NaN |
| is_infinite() | Check if Float is infinite |
| type_name() | Get type name as string |
See the examples/ directory for comprehensive examples:
- basic_usage.rs - Basic read/write operations with Polars integration
- crud_operations.rs - Complete CRUD operations demonstration
- statistics.rs - Statistics API usage
Run examples:
cargo run --example basic_usage
cargo run --example crud_operations
cargo run --example statistics
emstar is designed for high performance:
- Zero-copy parsing where possible using SmartString
- Efficient numeric parsing using lexical
- Optimized memory layout for loop blocks using Polars DataFrames
- Streaming I/O for large files
For best performance with emstar:
Fast Column Access
// Slow - allocates a Vec<DataValue>
let col = block.get_column("x").unwrap();
// Fast - zero-copy iteration
for val in block.column_iter_f64("x").unwrap() {
// val is Option<f64>
}
Batch Updates
// Slow - O(n) per call, each call recreates the column
for (i, val) in updates {
block.set_by_name(i, "col", val)?;
}
// Fast - rebuild once using the builder pattern
let mut rows = Vec::new();
for val in updates {
rows.push(vec![val]);
}
let new_block = LoopBlock::builder()
.columns(&["col"])
.rows(rows)
.build()?;
Row Iteration
// Slow - allocates Vec for each row
for row in block.iter_rows() {
// row is Vec<DataValue>
}
// Fast - use column iterators when possible
if let (Some(x_iter), Some(y_iter)) = (
block.column_iter_f64("rlnCoordinateX"),
block.column_iter_f64("rlnCoordinateY")
) {
for (x, y) in x_iter.zip(y_iter) {
// x, y are Option<f64>
}
}
For complex operations, access the underlying Polars DataFrame:
use emstar::{read, DataBlock, LoopBlock};
use polars::prelude::*;
let data_blocks = read("particles.star", None)?;
if let Some(DataBlock::Loop(particles)) = data_blocks.get("particles") {
// Get reference to the underlying Polars DataFrame
let df: &DataFrame = particles.as_dataframe();
// Use standard Polars operations
// Example: Filter rows
let mask = df.column("rlnCoordinateX")?.f64()?.gt(100.0)?;
let filtered = df.filter(&mask)?;
// Example: Select columns
let selected = df.select(["rlnCoordinateX", "rlnCoordinateY"])?;
// Example: Add computed column (requires clone for ownership)
let mut df_clone = df.clone();
let x = df_clone.column("rlnCoordinateX")?.f64()?;
let y = df_clone.column("rlnCoordinateY")?.f64()?;
let distance = (x.pow(2.0)? + y.pow(2.0)?)?.sqrt()?;
df_clone.with_column(distance.rename("rlnDistance"))?;
// Convert back to LoopBlock
let modified = LoopBlock::from_dataframe(df_clone);
}
Key Points:
- as_dataframe() returns &DataFrame for read-only access
- Use .clone() if you need ownership for modifications
- Convert back with LoopBlock::from_dataframe(df)
Run benchmarks:
cargo bench
Benchmark results (on typical hardware):
- Parse 10,000 rows: ~5-10ms
- Write 10,000 rows: ~2-5ms
- Parse 100,000 rows: ~50-100ms
- Write 100,000 rows: ~20-50ms
Run tests:
cargo test
Run tests with coverage:
cargo tarpaulin --out Html
emstar uses thiserror for comprehensive error types:
use emstar::{read, Error};
match read("file.star", None) {
Ok(data) => { /* process data */ }
Err(Error::FileNotFound(path)) => eprintln!("File not found: {:?}", path),
Err(Error::Parse { line, message }) => eprintln!("Parse error at line {}: {}", line, message),
Err(e) => eprintln!("Error: {}", e),
}
- default - Core functionality
- serde - Optional serde support for (de)serialization (planned)
Contributions are welcome! Please feel free to submit a Pull Request.
This project is licensed under the MIT License - see the LICENSE file for details.