Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

capnp de/serialization of unified sites #114

Merged
merged 1 commit into from Jun 21, 2017
Jump to file or symbol
Failed to load files and symbols.
+217 −3
Split
View
@@ -3,6 +3,8 @@
using Cxx = import "/capnp/c++.capnp";
$Cxx.namespace("GLnexus::capnp");
+### discovered alleles
+
struct DiscoveredAlleleInfo {
isRef @0 :Bool = false;
@@ -38,3 +40,29 @@ struct DiscoveredAlleles {
contigs @1 : List(Contig);
aips @2 :List(AlleleInfoPair);
}
+
+### unified sites
+
+struct OriginalAllele {
+ pos @0 :Range;
+ dna @1 :Text;
+ unifiedAllele @2 :Int64;
+}
+
+struct UnifiedSite {
+ pos @0 :Range;
+ containingTargetOption :union {
+ noContainingTarget @1 :Void;
+ containingTarget @2 :Range;
+ }
+ alleles @3 :List(Text);
+ unification @4 :List(OriginalAllele);
+ alleleFrequencies @5 :List(Float32);
+ lostAlleleFrequency @6 :Float32;
+ qual @7 :Int64;
+ monoallelic @8 :Bool;
+}
+
+struct UnifiedSites {
+ sites @0 :List(UnifiedSite);
+}
View
@@ -120,6 +120,12 @@ static int all_steps(const vector<string> &vcf_files,
console->info() << "Writing unified sites as YAML to " << filename;
H("write unified sites to file",
GLnexus::cli::utils::write_unified_sites_to_file(sites, contigs, filename));
+
+ string filename_cflat("/tmp/sites.cflat");
+ console->info() << "Writing unified sites in serialized capnproto to " << filename
+ << ", verifying that it is readable";
+ H("Verify that the cflat file is readable",
+ GLnexus::capnp_unified_sites_verify(sites, filename_cflat));
}
// genotype
View
@@ -520,6 +520,21 @@ struct unified_site {
unified_site& ans);
};
+// write vector<unified_site> structure to a file, with cap'n proto serialization
+Status capnp_of_unified_sites(const std::vector<unified_site> &sites, const std::string &filename);
+
+// write unified sites capnp to a file descriptor.
+Status capnp_of_unified_sites_fd(const std::vector<unified_site> &sites, int fd);
+
+// read unified sites from a capnp file
+Status unified_sites_of_capnp(const std::string &filename, std::vector<unified_site>& sites);
+
+// Verify that we can serialize and deserialize unified sites
+//
+// Note: this is a debugging function
+Status capnp_unified_sites_verify(const std::vector<unified_site> &sites, const std::string &filename);
+
+
// Statistics collected during range queries
struct StatsRangeQuery {
int64_t nBCFRecordsRead; // how many BCF records were read from the DB
View
@@ -600,6 +600,167 @@ Status unified_site::of_yaml(const YAML::Node& yaml, const vector<pair<string,si
return Status::OK();
}
+// write unified sites list to a file-descriptor, with cap'n proto serialization
+static Status capnp_write_unified_sites_fd(const std::vector<unified_site>& sites, int fd) {
+ ::capnp::MallocMessageBuilder message;
+ auto us_b = message.initRoot<capnp::UnifiedSites>();
+
+ auto sites_b = us_b.initSites(sites.size());
+ for (int i = 0; i < sites.size(); i++) {
+ const auto& site = sites[i];
+ auto site_b = sites_b[i];
+ int j;
+
+ auto pos_b = site_b.initPos();
+ pos_b.setRid(site.pos.rid);
+ pos_b.setBeg(site.pos.beg);
+ pos_b.setEnd(site.pos.end);
+
+ if (site.containing_target.rid > -1) {
+ auto ct_b = site_b.getContainingTargetOption().initContainingTarget();
+ ct_b.setRid(site.containing_target.rid);
+ ct_b.setBeg(site.containing_target.beg);
+ ct_b.setEnd(site.containing_target.end);
+ } else {
+ site_b.getContainingTargetOption().setNoContainingTarget(::capnp::VOID);
+ }
+
+ auto alleles_b = site_b.initAlleles(site.alleles.size());
+ for (j = 0; j < site.alleles.size(); j++) {
+ alleles_b.set(j, site.alleles[j]);
+ }
+
+ auto unification_b = site_b.initUnification(site.unification.size());
+ j = 0;
+ for (const auto& p : site.unification) {
+ auto oa_b = unification_b[j++];
+ auto oa_pos_b = oa_b.initPos();
+ oa_pos_b.setRid(p.first.pos.rid);
+ oa_pos_b.setBeg(p.first.pos.beg);
+ oa_pos_b.setEnd(p.first.pos.end);
+ oa_b.setDna(p.first.dna);
+ oa_b.setUnifiedAllele(p.second);
+ }
+
+ auto allele_frequencies_b = site_b.initAlleleFrequencies(site.allele_frequencies.size());
+ for (j = 0; j < site.allele_frequencies.size(); j++) {
+ allele_frequencies_b.set(j, site.allele_frequencies[j]);
+ }
+
+ site_b.setLostAlleleFrequency(site.lost_allele_frequency);
+ site_b.setQual(site.qual);
+ site_b.setMonoallelic(site.monoallelic);
+ }
+
+ // Capnp throws exceptions on errors, and only in extreme cases.
+ writePackedMessageToFd(fd, message);
+ return Status::OK();
+}
+
+
+Status capnp_of_unified_sites_fd(const vector<unified_site> &sites, int fd) {
+ try {
+ return capnp_write_unified_sites_fd(sites, fd);
+ } catch (exception &e) {
+ return Status::IOError("Capnproto error during de-serialization", e.what());
+ }
+}
+
+Status capnp_of_unified_sites(const vector<unified_site>& sites, const std::string &filename) {
+ int fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_TRUNC, S_IRWXU);
+ if (fd < 0) {
+ return Status::IOError("Could not open file for writing", filename);
+ }
+ Status s = capnp_of_unified_sites_fd(sites, fd);
+ if (close(fd) != 0) {
+ return Status::IOError("file close failed", filename);
+ }
+ return s;
+}
+
+// read unified_site list from a file-descriptor, as serialized by cap'n proto
+static Status _capnp_read_unified_sites_fd(int fd, vector<unified_site>& sites) {
+ ::capnp::ReaderOptions ropt;
+ ropt.traversalLimitInWords = CAPNP_TRAVERSAL_LIMIT;
+ ::capnp::PackedFdMessageReader message(fd, ropt);
+ auto sites_r = message.getRoot<capnp::UnifiedSites>();
+
+ sites.clear();
+ for(const auto site_r : sites_r.getSites()) {
+ range pos(-1,-1,-1);
+ const auto pos_r = site_r.getPos();
+ pos.rid = pos_r.getRid();
+ pos.beg = pos_r.getBeg();
+ pos.end = pos_r.getEnd();
+
+ unified_site site(pos);
+
+ const auto cto_r = site_r.getContainingTargetOption();
+ if (cto_r.hasContainingTarget()) {
+ const auto ct_r = cto_r.getContainingTarget();
+ site.containing_target.rid = ct_r.getRid();
+ site.containing_target.beg = ct_r.getBeg();
+ site.containing_target.end = ct_r.getEnd();
+ }
+
+ for (const auto& al : site_r.getAlleles()) {
+ site.alleles.push_back(string(al));
+ }
+
+ for (const auto oa_r : site_r.getUnification()) {
+ range oa_pos(-1,-1,-1);
+ auto oa_pos_r = oa_r.getPos();
+ oa_pos.rid = oa_pos_r.getRid();
+ oa_pos.beg = oa_pos_r.getBeg();
+ oa_pos.end = oa_pos_r.getEnd();
+
+ site.unification[allele(oa_pos, oa_r.getDna())] = oa_r.getUnifiedAllele();
+ }
+
+ for (const auto x : site_r.getAlleleFrequencies()) {
+ site.allele_frequencies.push_back(x);
+ }
+
+ site.lost_allele_frequency = site_r.getLostAlleleFrequency();
+ site.qual = site_r.getQual();
+ site.monoallelic = site_r.getMonoallelic();
+
+ sites.push_back(std::move(site));
+ }
+
+ return Status::OK();
+}
+
+Status unified_sites_of_capnp(const std::string &filename, vector<unified_site>& sites) {
+ int fd = open(filename.c_str(), O_RDONLY);
+ if (fd < 0) {
+ return Status::IOError("Could not open file for reading", filename);
+ }
+ Status s;
+ try {
+ // Capnp throws exceptions on errors, and only in extreme cases.
+ s = _capnp_read_unified_sites_fd(fd, sites);
+ } catch (exception &e) {
+ return Status::IOError("Capnproto error during de-serialization", e.what());
+ }
+ if (close(fd) != 0) {
+ return Status::IOError("file close failed", filename);
+ }
+ return s;
+}
+
+// verify roundtrip of sites
+Status capnp_unified_sites_verify(const vector<unified_site>& sites, const string& filename) {
+ Status s;
+ vector<unified_site> sites2;
+ S(capnp_of_unified_sites(sites, filename));
+ S(unified_sites_of_capnp(filename, sites2));
+ if (sites == sites2) {
+ return Status::OK();
+ }
+ return Status::Invalid("capnp serialization/deserialization of unified sites does not return original value");
+}
+
Status unifier_config::of_yaml(const YAML::Node& yaml, unifier_config& ans) {
Status s;
View
@@ -231,6 +231,7 @@ class GVCFTestCase {
const auto n_unified_sites = yaml["truth_unified_sites"];
REQUIRE(n_unified_sites);
S(parse_unified_sites(n_unified_sites, contigs));
+ REQUIRE(GLnexus::capnp_unified_sites_verify(truth_sites, "/tmp/unified_sites_capnp_check").ok());
}
// write truth gvcf if test_genotypes is set to true
View
@@ -345,13 +345,14 @@ quality: 100
SECTION("snp+del") {
vector<YAML::Node> ns = YAML::LoadAll("---\n" + string(snp) + "\n---\n" + string(del) + "\n...");
REQUIRE(ns.size() == 2);
- unified_site us(range(-1,-1,-1));
+ unified_site us(range(-1,-1,-1)), us2(range(-1,-1,-1));
Status s = unified_site::of_yaml(ns[0], contigs, us);
REQUIRE(s.ok());
VERIFY_SNP(us);
- s = unified_site::of_yaml(ns[1], contigs, us);
+ s = unified_site::of_yaml(ns[1], contigs, us2);
REQUIRE(s.ok());
- VERIFY_DEL(us);
+ VERIFY_DEL(us2);
+ REQUIRE(GLnexus::capnp_unified_sites_verify({us, us2}, "/tmp/unified_sites.capnp").ok());
}
SECTION("optional ref") {
@@ -497,6 +498,8 @@ monoallelic: false
unified_site us2(range(-1,-1,-1));
REQUIRE(unified_site::of_yaml(n, contigs, us2).ok());
REQUIRE(us == us2);
+
+ REQUIRE(GLnexus::capnp_unified_sites_verify({us}, "/tmp/unified_sites.capnp").ok());
}
}