fix the readme

dmyersturnbull · Jan 22, 2024 · 1cc912e · 1cc912e
1 parent ebf8d28
commit 1cc912e
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 67 deletions.
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ Efficient, high-quality
 [streaming](https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/util/stream/Stream.html)
 parsers and writers for 16 text-based formats used in bioinformatics.
 
-The goal is to have the best possible parsers for the most problematic ancient formats.
+The goal is to have the best possible parsers for the most hated and problematic formats.
 
 **Supported formats:**
 
@@ -35,22 +35,23 @@ The goal is to have the best possible parsers for the most problematic ancient f
 
 - Reads and writes Java Streams, keeping only essential metadata in memory.
 - Parses every part of a format, leaving nothing as text unnecessarily.
-- Has a consistent API. Coordinates are always 0-indexed and text is always escaped as per the specification.
+- Has a consistent API.
+  Coordinates are always 0-indexed and text is always escaped as per the specification.
 - Immutable, thread-safe, null-pointer-safe (`Optional<>`), and arbitrary-precision.
-- All methods are either exposed through interfaces, or reside in records, enums, and final classes
+- All methods are in interfaces, or in records, enums, or final classes.
 
 #### Example:
 
 This example reads, filters, and writes a VCF file.
 
 ```java
-import org.pharmgkb.parsers.vcf;
+import org.pharmgkb.parsers.vcf.*;
+import org.pharmgkb.parsers.vcf.model.*;
 
-Stream<VcfPosition> goodMitochondrialCalls = new VcfDataParser().parseFile(path)
-	.filter(p -> p.chromosome.isMitochondial())
-	.filter(VcfFilters.qualityAtLeast(10)); // converts to BigDecimal
+Stream<VcfPosition> mitochondrialCalls = new VcfDataParser().parseFile(path)
+	.filter(p -> p.chromosome().isMitochondial());
 
-new VcfDataWriter().writeToFile(goodMitochondrialCalls, filteredPath);
+new VcfDataWriter().writeToFile(mitochondrialCalls, filteredPath);
 ```
 
 ## Build/install
@@ -100,20 +101,22 @@ functions (`parallel()`, `collect`, `flatMap`, etc.)
 
 ```java
 // Store GFF3 (or GVF, or GTF) features into a list
-List<Gff3Feature> features = new Gff3Parser().collectAll(inputFile);
-features.get(0).getType(); // the parser unescaped this string
+List<Gff3Feature> features = new GffParser.Builder().build().collectAll(inputFile);
+features.get(0).type(); // the parser unescaped this string
 
 // Now write the lines:
-new Gff3Writer().writeToFile(outputFile);
+new Gff3Writer.Builder().build().writeToFile(outputFile);
 // The writer percent-encodes GFF3 fields as necessary
 ```
 
 ```java
 // From a BED file, get distinct chromosome names that start with "chr", in parallel
-Files.lines(file).map(new BedParser())
+Files.lines(file)
+  .map(new BedParser())
 	.parallel()
-	.map(BedFeature::getChromosome).distinct()
-	.filter(chr -> chr.startsWith("chr"))
+	.map(BedFeature::chromosome)
+  .distinct()
+	.filter(chr -> chr.startsWith("chr"));
 // You can also use new BedParser().parseAll(file)
 ```
 
@@ -122,14 +125,14 @@ Files.lines(file).map(new BedParser())
 Pedigree pedigree = new PedigreeParser.Builder().build().apply(Files.lines(file));
 NavigableSet<Individual> children = pedigree.getFamily("Johnsons")
 	.find("Harry Johnson")
-	.getChildren();
+	.children();
 ```
 
 ```java
 // Traverse through a family pedigree in topological order
 Pedigree pedigree = new PedigreeParser.Builder().build().apply(Files.lines(file));
-Stream<Individual> = pedigree.getFamily("Johnsons")
-	.topologicalOrderStream();
+Stream<Individual> individuals = pedigree.family("Johnsons")
+	.topologicalOrder();
 ```
 
 ```java
@@ -139,46 +142,31 @@ GenomeChain chain = new GenomeChainParser().apply(Files.lines(hg19ToGrch38ChainF
 List<Locus> liftedOver = lociList.parallelStream()
 	.map(chain)
 	.filter(Optional::isPresent)
-	.collect(Collectors.toList());
+	.toList();
 // You can also use new GenomeChainParser().parse(hg19ToGrch38ChainFile)
 ```
 
 ```java
 // Print formal species names from a GenBank file
 Path input = Paths.get("plasmid.genbank");
-properties = new GenbankParser().parseAll(input)
+new GenbankParser().parseAll(input)
 	.filter(record -> record instanceof SourceAnnotation)
-	.map(record -> record.getFormalName())
-	.forEach(System.out::println)
+	.map(record -> record.formalName())
+	.forEach(System.out::println);
 ```
 
 ```java
 // Parse a GenBank file
 // Get the set of "color" properties of features on the complement starting before the sequence
 Set<String> properties = new GenbankParser().parseAll(input)
 	.filter(record -> record instanceof FeaturesAnnotation)
-	.flatMap(record -> record.getFeatures())
+	.flatMap(record -> record.features())
 	.filter(feature -> feature.range.isComplement())
 	.filter(feature -> feature.range.start() < 0)
-	.flatMap(feature -> feature.getProperties().entrySet().stream())
+	.flatMap(feature -> feature.properties().entrySet().stream())
 	.filter(prop -> prop.getKey().equals("color"))
 	.map(prop -> prop.getValue())
-	.collect(Collectors.toSet())
-```
-
-```java
-// Parse a GenBank file
-// Get the set of "color" properties of features on the complement starting before the sequence
-Path input = Paths.get("plasmid.genbank");
-Set<String> properties = new GenbankParser().parseAll(input)
-	.filter(record -> record instanceof FeaturesAnnotation)
-	.flatMap(record -> record.getFeatures())
-	.filter(feature -> record.range.isComplement());
-	.filter(feature -> record.range.start() < 0);
-	.flatMap(feature -> feature.getProperties().entrySet().stream())
-	.filter(prop -> prop.getKey().equals("color"))
-	.map(prop -> prop.getValue())
-	.collect(Collectors.toSet())
+	.collect(Collectors.toSet());
 ```
 
 ```java
@@ -190,13 +178,14 @@ char base = stream.read("gene_1", 58523);
 ```
 
 ```java
-// Suppose you have a 2GB FASTA file and a method smithWaterman that returns AlignmentResults
+// Suppose you have a 2GB FASTA file
+// and a method smithWaterman that returns AlignmentResults
 // Align each sequence and get the top 10 results, in parallel
 MultilineFastaSequenceParser parser = new MultilineFastaSequenceParser.Builder().build();
 List<AlignmentResult> topScores = parser.parseAll(Files.lines(fastaFile))
 	.parallel()
-	.peek(sequence -> logger.info("Aligning {}", sequence.getHeader())
-	.map(sequence -> smithWaterman(sequence.getSequence(), reference))
+	.peek(sequence -> logger.info("Aligning {}", sequence.header()))
+	.map(sequence -> smithWaterman(sequence.sequence(), reference))
 	.sorted() // assuming AlignmentResult implements Comparable
 	.limit(10);
 }
@@ -210,60 +199,73 @@ List<AlignmentResult> topScores = parser.parseAll(Files.lines(fastaFile))
 	"hasSynonym" <https://abc#feline> .
  */
 Stream<String> input = null;
-try (BufferedReader reader = new BufferedReader(new InputStreamReader((HttpURLConnection) myUrl.openConnection()).getInputStream()))) {
+try (
+  BufferedReader reader = new BufferedReader(
+    new InputStreamReader(((HttpURLConnection) myUrl.openConnection()).getInputStream())
+  )
+) {
 	input = reader.lines();
 }
-TripleParser parser = new TripleParser(true);  // usePrefixes=true will replace prefixes
+// usePrefixes=true will replace prefixes
+TripleParser parser = new TripleParser(true);
 Stream<Triple> stream = input.map(parser);
-// contains:  List[ https://abc#cat belongsTo https://abc#owner , https://abc#cat hasSynonym https://abc#feline ]
-List<Prefix> prefixes = parser.getPrefixes();
+// contains:  List[ https://abc#cat belongsTo https://abc#owner , \
+// https://abc#cat hasSynonym https://abc#feline ]
+List<Prefix> prefixes = parser.prefixes();
 ```
 
 ```java
-// Parse VCF, validate it, and write a new VCF file containing only positions whose QUAL field
+// Parse VCF, validate it,
+// and write a new VCF file containing only positions whose QUAL field
 // is at least 10, each with its FILTER field cleared
-VcfMetadataCollection metadata = new VcfMetadataParser().parse(input); // short-circuits during read
+// short-circuits during read:
+VcfMetadataCollection metadata = new VcfMetadataParser().parse(input);
 Stream<VcfPosition> data = new VcfDataParser().parseAll(input)
-	.filter(p -> p.getQuality().isPresent() && p.getQuality().get().greaterThanOrEqual("10"))
-	.map(p -> new VcfPosition.Builder(p).clearFilters().build())
-	.peek(new VcfValidator.Builder(metadata).warnOnly().build()); // verify consistent with metadata
-new VcfMetadataWriter().writeToFile(metadata.getLines(), output);
+	.filter(p ->
+    p.quality().stream().anyMatch(q -> q.greaterThanOrEqual("10"))
+  ).map(p -> new VcfPosition.Builder(p).clearFilters().build())
+  // verify consistent with metadata:
+	.peek(new VcfValidator.Builder(metadata).warnOnly().build());
+new VcfMetadataWriter().writeToFile(metadata.lines(), output);
 new VcfDataWriter().appendToFile(data, output);
 ```
 
 ```java
 // From a VCF file, associate every GT with its number of occurrences, in parallel
 Map<String, Long> genotypeCounts = new VcfDataParser().parseAll(input)
 	.parallel()
-	.flatMap(p -> p.getSamples().stream())
+	.flatMap(p -> p.samples().stream())
 	.filter(s -> s.containsKey(ReservedFormatProperty.Genotype))
 	.map(s -> s.get(ReservedFormatProperty.Genotype).get())
 	.collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
 ```
 
 ```java
-Stream<BigDecimal> org.pharmgkb.parsers.text.MatrixParserI.tabs().parseAll(file).map(BigDecimal::new);
+Stream<GeneralizedBigDecimal> values = MatrixParserI.tabs().parseAll(file).map(GeneralizedBigDecimal::new);
 ```
 
-### Guiding principles
+### Principles
 
 1. Where possible, a parser is a `Function<String, R>` or `Function<Stream<String>, R>`,
    and writer is a `Function<R, String>` or `Function<R, Stream<String>>`.
    [Java 8+ Streams](https://www.oracle.com/technetwork/articles/java/ma14-java-se-8-streams-2177646.html)
    are expected to be used.
-2. Null values are generally banned from public methods in favor of
+2. Null values are banned from public methods in favor of
    [`Optional`](https://download.java.net/java/early_access/jdk16/docs/api/java.base/java/util/Optional.html).
    See https://www.oracle.com/technetwork/articles/java/java8-optional-2175753.html for more information.
-3. Most operations are thread-safe. Thread safety is annotated using `javax.annotation.concurrent`.
-4. Top-level data classes are immutable, as annotated by or `javax.annotation.concurrent.Immutable`.
-5. The builder pattern is used for non-trivial classes. Each builder has a copy constructor.
-6. Links to specifications are provided. Any choice made in an ambiguous specification is documented.
-7. Parsing and writing is _moderately_ strict. Severe violations throw a `BadDataFormatException`,
-   and milder violations are logged as warnings using SLF4J.
+3. Most operations are thread-safe.
+   Thread safety is annotated using `javax.annotation.concurrent`.
+4. Top-level data classes are immutable, as annotated by `javax.annotation.concurrent.Immutable`.
+5. The builder pattern is used for non-trivial classes.
+   Each builder has a copy constructor.
+6. Links to specifications are provided.
+   Any choice made in an ambiguous specification is documented.
+7. Parsing and writing are _moderately_ strict.
+   Severe violations throw a `BadDataFormatException`, and milder violations are logged as SLF4J warnings.
    Not every aspect of a specification is validated.
 8. For specification-mandated escape sequences, encoding and decoding is automatic.
 9. Coordinates are _always 0-based_, even for 1-based formats.
-   This is to ensure consistency as well as arithmetic simplicity.
+   This is to ensure consistency and arithmetic simplicity.
 
 ### Pitfalls
 

diff --git a/vcf/src/main/java/org/pharmgkb/parsers/vcf/VcfMetadataParser.java b/vcf/src/main/java/org/pharmgkb/parsers/vcf/VcfMetadataParser.java
@@ -40,10 +40,7 @@ public VcfMetadataParser() {
 
     @Nonnull
 	@Override
-	public VcfMetadataCollection apply(
-        @Nonnull
-        Stream<String> stream
-    ) throws BadDataFormatException {
+	public VcfMetadataCollection apply(@Nonnull Stream<String> stream) throws BadDataFormatException {
 		Objects.requireNonNull(stream, "Stream cannot be null");
 		final VcfMetadataCollection.Builder builder = new VcfMetadataCollection.Builder();
 		stream.takeWhile(s -> s.startsWith("#"))