Skip to content

Commit

Permalink
[Update] Basic unit test for SNP class done
Browse files Browse the repository at this point in the history
Still got clumping and thresholding related function to test. Also reduced total number of test case in other unit tests to reduce run time requirement
  • Loading branch information
choishingwan committed Apr 14, 2020
1 parent e8857d7 commit 836bbb9
Show file tree
Hide file tree
Showing 17 changed files with 455 additions and 366 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,5 @@ build_test/
coverage/

build_quick/

mac_build/
18 changes: 17 additions & 1 deletion inc/genotype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1160,7 +1160,23 @@ class Genotype
*/
std::unordered_set<std::string>
load_ref(std::unique_ptr<std::istream> input, bool ignore_fid);

bool check_rs(std::string& rsid, std::string& snpid,
std::unordered_set<std::string>& processed_snps,
std::unordered_set<std::string>& duplicated_snps,
Genotype* genotype);
bool check_chr(const std::string& chr_str, std::string& prev_chr,
size_t& chr_num, bool& chr_error, bool& sex_error);
void process_snp(
const std::vector<IITree<size_t, size_t>>& exclusion_regions,
const std::string& chr_str, const std::string& mismatch_snp_record_name,
const std::string& mismatch_source, const size_t& bp,
const size_t file_idx, const std::streampos byte_pos, std::string& a1,
std::string& a2, std::string& rsid, std::string& snpid,
std::unordered_set<std::string>& processed_snps,
std::unordered_set<std::string>& duplicated_snps,
std::vector<bool>& retain_snp, std::string& prev_chr, size_t& chr_num,
size_t ref_target_match, bool& chr_error, bool& sex_error,
Genotype* genotype);
void shrink_snp_vector(const std::vector<bool>& retain)
{
m_existed_snps.erase(
Expand Down
79 changes: 24 additions & 55 deletions inc/snp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@
#include <numeric>
#include <stdexcept>
#include <string>
static_assert(sizeof(std::streamsize) <= sizeof(unsigned long long),
"streampos larger than long long, don't know how to proceed. "
"Please use PRSice on another machine");
static_assert(
sizeof(std::streamsize) <= sizeof(unsigned long long),
"streampos larger than unsigned long long, don't know how to proceed. "
"Please use PRSice on another machine");
class Genotype;
class SNP
{
Expand Down Expand Up @@ -60,18 +61,7 @@ class SNP
target.name_idx = idx;
target.byte_pos = byte_pos;
}
void update_file(const size_t& idx, const std::streampos byte_pos,
const bool is_ref, const bool flip)
{
auto&& target = is_ref ? m_reference : m_target;
target.name_idx = idx;
target.byte_pos = byte_pos;
if (is_ref) { m_ref_flipped = flip; }
else
{
m_flipped = flip;
}
}

void add_snp_info(const size_t& idx, const std::streampos byte_pos,
const size_t chr, const size_t loc,
const std::string& ref, const std::string& alt,
Expand All @@ -83,9 +73,9 @@ class SNP
m_target.byte_pos = byte_pos;
m_chr = chr;
m_loc = loc;
m_flipped = flipping;
m_ref = ref;
m_alt = alt;
m_flipped = flipping;
}
else
{
Expand All @@ -109,7 +99,7 @@ class SNP
* \param chr is the chromosome encoding of the other SNP
* \param loc is the coordinate of the other SNP
* \param ref is the reference allele of the other SNP
* \param alt is the alternative allele of teh other SNP
* \param alt is the alternative allele of the other SNP
* \param flipped is used as a return value. If flipping is required,
* flipped = true
* \return true if it is a match
Expand All @@ -136,14 +126,17 @@ class SNP
else
return true;
}
else if (!m_alt.empty() && !alt.empty())
else if (!alt.empty())
{
if ((m_ref == alt) && (m_alt == ref))
// here, we already know the refs don't match so we want it to match
// with alt
if ((m_ref == alt) && (m_alt.empty() || m_alt == ref))
{
flipped = true;
return true;
}
if ((complement(m_ref) == alt) && (complement(m_alt) == ref))
if ((complement(m_ref) == alt)
&& (m_alt.empty() || complement(m_alt) == ref))
{
flipped = true;
return true;
Expand Down Expand Up @@ -372,29 +365,6 @@ class SNP
target.has_count = true;
}

std::vector<size_t> get_set_idx(const size_t num_sets) const
{
std::vector<uintptr_t> flags = m_clump_info.flags;
uintptr_t bitset;
std::vector<size_t> out;
out.reserve(num_sets);
for (size_t k = 0; k < m_clump_info.max_flag_idx; ++k)
{
bitset = m_clump_info.flags[k];
while (bitset != 0)
{
uint64_t t = bitset & -bitset;
// TODO: Potential bug here. CTZLU seems to only take 32bit on
// none-64bit window according to plink (NOTE: PLINK also used
// this in their score calculation. Maybe ask Chris about it?)
int r = CTZLU(bitset);
out.push_back(k * BITCT + static_cast<size_t>(r));
bitset ^= t;
}
}
return out;
}


/*!
* \brief Obtain the upper bound of the clump region correspond to this SNP
Expand Down Expand Up @@ -422,6 +392,17 @@ class SNP
m_genotype = genotype;
}
std::vector<uintptr_t> get_genotype() const { return m_genotype; }
static std::string complement(const std::string& allele)
{
// assume capitalized
if (allele == "A") return "T";
if (allele == "T") return "A";
if (allele == "G") return "C";
if (allele == "C")
return "G";
else
return allele; // Cannot flip, so will just return it as is
}

private:
AlleleCounts m_ref_count;
Expand All @@ -446,18 +427,6 @@ class SNP
bool m_flipped = false;
bool m_ref_flipped = false;
bool m_is_valid = true;

inline std::string complement(const std::string& allele) const
{
// assume capitalized
if (allele == "A") return "T";
if (allele == "T") return "A";
if (allele == "G") return "C";
if (allele == "C")
return "G";
else
return allele; // Cannot flip, so will just return it as is
}
};

#endif // SNP_H
123 changes: 7 additions & 116 deletions src/binarygen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,21 +358,15 @@ void BinaryGen::gen_snp_vector(
std::string file_name;
std::string error_message = "";
std::string A1, A2, prefix;
std::streampos byte_pos, start;
std::streampos byte_pos;
size_t total_unfiltered_snps = 0;
size_t ref_target_match = 0;
size_t num_snp;
size_t chr_num = 0;
uint32_t SNP_position = 0;
uint32_t offset;
int chr_code = 0;
bool exclude_snp = false;
bool chr_sex_error = false;
bool chr_error = false;
bool prev_chr_sex_error = false;
bool prev_chr_error = false;
bool flipping = false;
bool to_remove = false;
for (size_t i = 0; i < m_genotype_file_names.size(); ++i)
{
// get the total unfiltered snp size so that we can initalize the vector
Expand Down Expand Up @@ -422,124 +416,21 @@ void BinaryGen::gen_snp_vector(
fprintf(stderr, "\r%zu SNPs processed in %s\r", i_snp,
bgen_name.c_str());
}
m_unfiltered_marker_ct++;
start = bgen_file.tellg();
++m_unfiltered_marker_ct;
// directly use the library without decompressing the genotype
read_snp_identifying_data(bgen_file, context, &SNPID, &RSID,
&chromosome, &SNP_position, &A1, &A2);
exclude_snp = false;
if (chromosome != prev_chr)
{
chr_code = get_chrom_code_raw(chromosome.c_str());
if (chr_code_check(chr_code, chr_sex_error, chr_error,
error_message))
{
if (chr_error && !prev_chr_error)
{
std::cerr << error_message << "\n";
prev_chr_error = chr_error;
}
if (chr_sex_error && !prev_chr_sex_error)
{
std::cerr << error_message << "\n";
prev_chr_sex_error = chr_sex_error;
}
exclude_snp = true;
}
chr_num = ~size_t(0);
if (!exclude_snp)
{
// only update prev_chr if we want to include this SNP
prev_chr = chromosome;
chr_num = static_cast<size_t>(chr_code);
}
}

if (RSID == "." && SNPID == ".")
{
// when both rs id and SNP id isn't available, just skip
exclude_snp = true;
}
// default to RS
cur_id = RSID;
// by this time point, we should always have the
// m_existed_snps_index propagated with SNPs from the base. So we
// can first check if the SNP are presented in base
auto&& find_rs = genotype->m_existed_snps_index.find(RSID);
auto&& find_snp = genotype->m_existed_snps_index.find(SNPID);
if (find_rs == genotype->m_existed_snps_index.end()
&& find_snp == genotype->m_existed_snps_index.end())
{
// this is the reference panel, and the SNP wasn't found in the
// target doens't matter if we use RSID or SNPID
++m_base_missed;
exclude_snp = true;
}
else if (find_snp != genotype->m_existed_snps_index.end())
{
// we found the SNPID
cur_id = SNPID;
}
else if (find_rs != genotype->m_existed_snps_index.end())
{
// we found the RSID
cur_id = RSID;
}

bool ambig = ambiguous(A1, A2);
if (processed_snps.find(cur_id) != processed_snps.end())
{
duplicated_snps.insert(cur_id);
exclude_snp = true;
}
// perform check on ambiguousity
else if (ambig)
{
++m_num_ambig;
if (!m_keep_ambig) exclude_snp = true;
}

to_remove = Genotype::within_region(exclusion_regions, chr_num,
SNP_position);
if (to_remove)
{
++m_num_xrange;
exclude_snp = true;
}
// get the current location of bgen file, this will be used to skip
// to current location later on
byte_pos = bgen_file.tellg();
process_snp(exclusion_regions, chromosome, mismatch_snp_record_name,
mismatch_source, SNP_position, file_idx, byte_pos, A1,
A2, RSID, SNPID, processed_snps, duplicated_snps,
retain_snp, prev_chr, chr_num, ref_target_match,
chr_error, chr_sex_error, genotype);
// read in the genotype data block so that we advance the ifstream
// pointer to the next SNP entry
read_genotype_data_block(bgen_file, context, &m_buffer1);
// if we want to exclude this SNP, we will not perform
// decompression
if (!exclude_snp)
{
// A1 = alleles.front();
// A2 = alleles.back();
auto&& target_index = genotype->m_existed_snps_index[cur_id];
if (!genotype->m_existed_snps[target_index].matching(
chr_num, SNP_position, A1, A2, flipping))
{
genotype->print_mismatch(
mismatch_snp_record_name, mismatch_source,
genotype->m_existed_snps[target_index], cur_id, A1, A2,
chr_num, SNP_position);
++m_num_ref_target_mismatch;
}
else
{
processed_snps.insert(cur_id);
if (ambig)
{ flipping = (A1 != m_existed_snps[target_index].ref()); }
genotype->m_existed_snps[target_index].add_snp_info(
file_idx, byte_pos, chr_num, SNP_position, A1, A2,
flipping, m_is_ref);
retain_snp[target_index] = true;
++ref_target_match;
}
}
}
if (num_snp % 1000 == 0)
{
Expand Down
Loading

0 comments on commit 836bbb9

Please sign in to comment.