[MRG] remove redundant ACGT-checking code. #1590

Merged
Merged 55 commits on Jun 2, 2017
Commits
+146 −212
Split
View
@@ -7,12 +7,15 @@ The khmer project's command line scripts adhere to
under semantic versioning, but will be in future versions of khmer.
## [Unreleased]
+
### Added
- Cython wrapper for liboxli.
- Cython containers for parsing, assembly, and hashing.
- Header install for liboxli.
### Changed
+- Non-ACGT handling significantly changed so that only bulk-loading functions
+  "clean" sequences of non-DNA characters. See #1590 for details.
- Split CPython wrapper file into per-class files under src/khmer and include/khmer.
- Moved liboxli headers to include/oxli and implementations to src/oxli.
- Removed CPython assembler wrappers.
@@ -260,13 +260,6 @@ public:
// count every k-mer in the string.
unsigned int consume_string(const std::string &s);
- // checks each read for non-ACGT characters
- bool check_and_normalize_read(std::string &read) const;
-
- // check each read for non-ACGT characters, and then consume it.
- unsigned int check_and_process_read(std::string &read,
- bool &is_valid);
-
// Count every k-mer in a file containing nucleotide sequences.
template<typename SeqIO>
void consume_seqfile(
View
@@ -309,13 +309,12 @@ void Hashgraph::consume_seqfile_and_tag(
break;
}
- if (check_and_normalize_read( read.sequence )) {
- unsigned long long this_n_consumed = 0;
- consume_sequence_and_tag( read.sequence, this_n_consumed );
+ read.set_clean_seq();
+ unsigned long long this_n_consumed = 0;
+ consume_sequence_and_tag(read.cleaned_seq, this_n_consumed);
- __sync_add_and_fetch( &n_consumed, this_n_consumed );
- __sync_add_and_fetch( &total_reads, 1 );
- }
+ __sync_add_and_fetch(&n_consumed, this_n_consumed);
+ __sync_add_and_fetch(&total_reads, 1);
} // while reads left for parser
}
@@ -373,21 +372,20 @@ void Hashgraph::consume_partitioned_fasta(
} catch (NoMoreReadsAvailable &exc) {
break;
}
- seq = read.sequence;
+ read.set_clean_seq();
+ seq = read.cleaned_seq;
- if (check_and_normalize_read(seq)) {
- // First, figure out what the partition is (if non-zero), and save that.
- PartitionID p = _parse_partition_id(read.name);
+ // First, figure out what the partition is (if non-zero), and save that.
+ PartitionID p = _parse_partition_id(read.name);
- // Then consume the sequence
- n_consumed += consume_string(seq); // @CTB why are we doing this?
+ // Then consume the sequence
+ n_consumed += consume_string(seq); // @CTB why are we doing this?
- // Next, compute the tag & set the partition, if nonzero
- HashIntoType kmer = hash_dna(seq.c_str());
- all_tags.insert(kmer);
- if (p > 0) {
- partition->set_partition_id(kmer, p);
- }
+ // Next, compute the tag & set the partition, if nonzero
+ HashIntoType kmer = hash_dna(seq.c_str());
+ all_tags.insert(kmer);
+ if (p > 0) {
+ partition->set_partition_id(kmer, p);
}
// reset the sequence info, increment read number
@@ -467,10 +465,6 @@ unsigned int Hashgraph::kmer_degree(const char * kmer_s)
size_t Hashgraph::trim_on_stoptags(std::string seq) const
{
- if (!check_and_normalize_read(seq)) {
- return 0;
- }
-
KmerIterator kmers(seq.c_str(), _ksize);
size_t i = _ksize - 2;
View
@@ -58,51 +58,6 @@ using namespace oxli::read_parsers;
//
-// check_and_process_read: checks for non-ACGT characters before consuming
-//
-
-unsigned int Hashtable::check_and_process_read(std::string &read,
- bool &is_valid)
-{
- is_valid = check_and_normalize_read(read);
-
- if (!is_valid) {
- return 0;
- }
-
- return consume_string(read);
-}
-
-//
-// check_and_normalize_read: checks for non-ACGT characters
-// converts lowercase characters to uppercase one
-// Note: Usually it is desirable to keep checks and mutations separate.
-// However, in the interests of efficiency (we are potentially working
-// with TB of data), a check and mutation have been placed inside the
-// same loop. Potentially trillions fewer fetches from memory would
-// seem to be a worthwhile goal.
-//
-
-bool Hashtable::check_and_normalize_read(std::string &read) const
-{
- bool rc = true;
-
- if (read.length() < _ksize) {
- return false;
- }
-
- for (unsigned int i = 0; i < read.length(); i++) {
- read[ i ] &= 0xdf; // toupper - knock out the "lowercase bit"
- if (!is_valid_dna( read[ i ] )) {
- rc = false;
- break;
- }
- }
-
- return rc;
-}
-
-//
// consume_seqfile: consume a file of reads
//
@@ -136,8 +91,8 @@ void Hashtable::consume_seqfile(
break;
}
- unsigned int this_n_consumed =
- check_and_process_read(read.sequence, is_valid);
+ read.set_clean_seq();
+ unsigned int this_n_consumed = consume_string(read.cleaned_seq);
__sync_add_and_fetch( &n_consumed, this_n_consumed );
__sync_add_and_fetch( &total_reads, 1 );
@@ -350,20 +305,19 @@ uint64_t * Hashtable::abundance_distribution(
} catch (NoMoreReadsAvailable &exc) {
break;
}
- seq = read.sequence;
+ read.set_clean_seq();
+ seq = read.cleaned_seq;
- if (check_and_normalize_read(seq)) {
- KmerHashIteratorPtr kmers = new_kmer_iterator(seq);
+ KmerHashIteratorPtr kmers = new_kmer_iterator(seq);
- while(!kmers->done()) {
- HashIntoType kmer = kmers->next();
+ while(!kmers->done()) {
+ HashIntoType kmer = kmers->next();
- if (!tracking->get_count(kmer)) {
- tracking->count(kmer);
+ if (!tracking->get_count(kmer)) {
+ tracking->count(kmer);
- BoundedCounterType n = get_count(kmer);
- dist[n]++;
- }
+ BoundedCounterType n = get_count(kmer);
+ dist[n]++;
}
name.clear();
@@ -387,10 +341,6 @@ unsigned long Hashtable::trim_on_abundance(
BoundedCounterType min_abund)
const
{
- if (!check_and_normalize_read(seq)) {
- return 0;
- }
-
KmerHashIteratorPtr kmers = new_kmer_iterator(seq);
HashIntoType kmer;
@@ -422,12 +372,7 @@ unsigned long Hashtable::trim_below_abundance(
BoundedCounterType max_abund)
const
{
- if (!check_and_normalize_read(seq)) {
- return 0;
- }
-
KmerHashIteratorPtr kmers = new_kmer_iterator(seq);
-
HashIntoType kmer;
if (kmers->done()) {
@@ -458,10 +403,6 @@ std::vector<unsigned int> Hashtable::find_spectral_error_positions(
const
{
std::vector<unsigned int> posns;
- if (!check_and_normalize_read(seq)) {
- throw oxli_exception("invalid read");
- }
-
KmerHashIteratorPtr kmers = new_kmer_iterator(seq);
HashIntoType kmer = kmers->next();
View
@@ -106,21 +106,21 @@ void LabelHash::consume_seqfile_and_tag_with_labels(
break;
}
- if (graph->check_and_normalize_read( read.sequence )) {
- // TODO: make threadsafe!
- unsigned long long this_n_consumed = 0;
- consume_sequence_and_tag_with_labels( read.sequence,
- this_n_consumed,
- the_label );
- the_label++;
+ read.set_clean_seq();
+
+ // TODO: make threadsafe!
+ unsigned long long this_n_consumed = 0;
+ consume_sequence_and_tag_with_labels( read.cleaned_seq,
+ this_n_consumed,
+ the_label );
+ the_label++;
#if (0) // Note: Used with callback - currently disabled.
- n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed );
+ n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed );
#else
- __sync_add_and_fetch( &n_consumed, this_n_consumed );
+ __sync_add_and_fetch( &n_consumed, this_n_consumed );
#endif
- __sync_add_and_fetch( &total_reads, 1 );
- }
+ __sync_add_and_fetch( &total_reads, 1 );
// TODO: Figure out alternative to callback into Python VM
// Cannot use in multi-threaded operation.
@@ -169,19 +169,19 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(
PartitionID p;
while(!parser->is_complete()) {
read = parser->get_next_read();
- seq = read.sequence;
-
- if (graph->check_and_normalize_read(seq)) {
- // First, figure out what the partition is (if non-zero), and
- // save that.
- printdbg(parsing partition id)
- p = _parse_partition_id(read.name);
- printdbg(consuming sequence and tagging)
- consume_sequence_and_tag_with_labels( seq,
- n_consumed,
- p );
- printdbg(back in consume_partitioned)
- }
+
+ read.set_clean_seq();
+ seq = read.cleaned_seq;
+
+ // First, figure out what the partition is (if non-zero), and
+ // save that.
+ printdbg(parsing partition id)
+ p = _parse_partition_id(read.name);
+ printdbg(consuming sequence and tagging)
+ consume_sequence_and_tag_with_labels( seq,
+ n_consumed,
+ p );
+ printdbg(back in consume_partitioned)
// reset the sequence info, increment read number
total_reads++;
View
@@ -147,61 +147,59 @@ size_t SubsetPartition::output_partitioned_file(
break;
}
- seq = read.sequence;
-
- if (_ht->check_and_normalize_read(seq)) {
- const char * kmer_s = seq.c_str();
-
- bool found_tag = false;
- for (unsigned int i = 0; i < seq.length() - ksize + 1; i++) {
- kmer = _ht->hash_dna(kmer_s + i);
-
- // is this a known tag?
- if (set_contains(partition_map, kmer)) {
- found_tag = true;
- break;
- }
+ read.set_clean_seq();
+ seq = read.cleaned_seq;
+
+ bool found_tag = false;
+ KmerHashIteratorPtr kmers = _ht->new_kmer_iterator(read.cleaned_seq);
+ while (!kmers->done()) {
+ kmer = kmers->next();
+
+ // is this a known tag?
+ if (set_contains(partition_map, kmer)) {
+ found_tag = true;
+ break;
}
+ }
- // all sequences should have at least one tag in them.
- // assert(found_tag); @CTB currently breaks tests. give fn flag
- // to disable.
+ // all sequences should have at least one tag in them.
+ // assert(found_tag); @CTB currently breaks tests. give fn flag
+ // to disable.
- PartitionID partition_id = 0;
- if (found_tag) {
- PartitionID * partition_p = partition_map[kmer];
- if (partition_p == NULL ) {
- partition_id = 0;
- n_singletons++;
- } else {
- partition_id = *partition_p;
- partitions.insert(partition_id);
- }
+ PartitionID partition_id = 0;
+ if (found_tag) {
+ PartitionID * partition_p = partition_map[kmer];
+ if (partition_p == NULL ) {
+ partition_id = 0;
+ n_singletons++;
+ } else {
+ partition_id = *partition_p;
+ partitions.insert(partition_id);
}
+ }
- if (partition_id > 0 || output_unassigned) {
- if (read.quality.length()) { // FASTQ
- outfile << "@" << read.name << "\t" << partition_id
- << "\n";
- outfile << seq << "\n+\n";
- outfile << read.quality << "\n";
- } else { // FASTA
- outfile << ">" << read.name << "\t" << partition_id;
- outfile << "\n" << seq << "\n";
- }
+ if (partition_id > 0 || output_unassigned) {
+ if (read.quality.length()) { // FASTQ
+ outfile << "@" << read.name << "\t" << partition_id
+ << "\n";
+ outfile << seq << "\n+\n";
+ outfile << read.quality << "\n";
+ } else { // FASTA
+ outfile << ">" << read.name << "\t" << partition_id;
+ outfile << "\n" << seq << "\n";
}
+ }
- total_reads++;
+ total_reads++;
- // run callback, if specified
- if (total_reads % CALLBACK_PERIOD == 0 && callback) {
- try {
- callback("output_partitions", callback_data,
- total_reads, reads_kept);
- } catch (...) {
- outfile.close();
- throw;
- }
+ // run callback, if specified
+ if (total_reads % CALLBACK_PERIOD == 0 && callback) {
+ try {
+ callback("output_partitions", callback_data,
+ total_reads, reads_kept);
+ } catch (...) {
+ outfile.close();
+ throw;
}
}
}
(End of captured diff — the remainder of the pull-request page failed to load.)