Skip to content

Commit

Permalink
[Update] Completed unit test for sample generation
Browse files Browse the repository at this point in the history
  • Loading branch information
choishingwan committed Apr 13, 2020
1 parent 52782ab commit 98f03f9
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 89 deletions.
10 changes: 7 additions & 3 deletions src/genotype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,12 @@ void Genotype::gen_sample(const size_t fid_idx, const size_t iid_idx,
const std::string fid = (m_ignore_fid) ? "" : token[fid_idx];
const std::string id =
(m_ignore_fid) ? token[iid_idx] : fid + m_delim + token[iid_idx];
// end immediately if duplicated samples are found
if (sample_in_file.find(id) != sample_in_file.end())
{
duplicated_sample_id.push_back(id);
return;
}
auto&& find_id =
m_sample_selection_list.find(id) != m_sample_selection_list.end();
bool inclusion = m_remove_sample ^ find_id;
Expand Down Expand Up @@ -493,9 +499,7 @@ void Genotype::gen_sample(const size_t fid_idx, const size_t iid_idx,
++m_num_ambig_sex;
}
// this must be incremented within each loop
if (sample_in_file.find(id) != sample_in_file.end())
duplicated_sample_id.push_back(id);
else if (inclusion && !m_is_ref)
if (inclusion && !m_is_ref)
{
sample_storage.emplace_back(
Sample_ID(fid, token[iid_idx], pheno, in_regression));
Expand Down
2 changes: 1 addition & 1 deletion test/csrc/genotype_read_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ TEST_CASE("base file read")
base_qc.maf_case = 0.04;
PThresholding threshold_info;
threshold_info.no_full = true;
threshold_info.fastscore = true;
threshold_info.fastscore = GENERATE(true, false);
threshold_info.bar_levels = {0.5};
// won't have header as read_base should have dealt with the header
std::vector<std::string> base = {
Expand Down
222 changes: 137 additions & 85 deletions test/csrc/genotype_read_sample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,99 +42,151 @@ TEST_CASE("Sample object generation")
geno.test_init_sample_vectors();
// test handling of ignore fid
auto ignore_fid = GENERATE(true, false);
auto mock_file = std::make_unique<std::istringstream>(std::ios_base::ate
| std::ios_base::app);
geno.set_ignore_fid(ignore_fid);
auto keep_nonfounder = GENERATE(true, false);
geno.set_keep_nonfounder(keep_nonfounder);
auto delim = GENERATE(" ", "-", "\t");
geno.set_delim(delim);
geno.set_ignore_fid(ignore_fid);
SECTION("without family")
// select
// extract
// duplicates
// test different file format
auto n_col = GENERATE(range(5, 20));
// test handling of sex chromosome
auto has_sex = GENERATE(true, false);
auto has_father_id = GENERATE(true, false);
auto has_mother_id = GENERATE(true, false);
auto remove_samples = GENERATE(true, false);
std::random_device rd;
std::mt19937 gen(rd());
// make fake id
std::uniform_int_distribution<> dis(1, static_cast<int>(n_sample) * 1000);
std::uniform_int_distribution<> sex_rand(1, 3);
std::uniform_int_distribution<> remove_rand(0, 1);
std::uniform_int_distribution<> has_father(0, 1);
std::uniform_int_distribution<> has_mother(0, 1);
std::uniform_int_distribution<> is_duplicate(0, 100);
std::unordered_set<int> dup_gen;
std::vector<std::string> included_samples;
std::string file_input;
size_t exp_male = 0, exp_female = 0, exp_ambig = 0;
const size_t fid_idx = 0;
const size_t iid_idx = ignore_fid ? 0 : 1;
size_t sex_idx = 0;
geno.change_sample_selection(remove_samples);
for (size_t i = 0; i < static_cast<uintptr_t>(n_sample); ++i)
{
// select
// extract
// duplicates
SECTION("all included")
auto b_in_prs = geno.calculate_prs();
auto b_in_ld = geno.sample_for_ld();
REQUIRE_FALSE(IS_SET(b_in_prs.data(), i));
REQUIRE_FALSE(IS_SET(b_in_ld.data(), i));
std::vector<std::string> mock_input(static_cast<uintptr_t>(n_col),
std::string("@"));
int temp = dis(gen);
// generate non-duplicated samples
while (dup_gen.find(temp) != dup_gen.end()) temp = dis(gen);
dup_gen.insert(temp);
// around 2% duplication?
auto duplicated = is_duplicate(gen) < 2;

mock_input[0] = "F" + std::to_string(temp);
std::string exp = mock_input[0];
if (!ignore_fid) { mock_input[1] = exp; }
if (duplicated)
{ processed_samples.insert((ignore_fid ? "" : exp + delim) + exp); }
included_samples.push_back(exp);
// always include this info
mock_input[3] = "Dad" + std::to_string(dis(gen));
mock_input[4] = "Mum" + std::to_string(dis(gen));
if (has_sex)
{
// test different file format
auto n_col = GENERATE(range(3, 10));
// test handling of sex chromosome
auto has_sex = GENERATE(true, false);
std::random_device rd;
std::mt19937 gen(rd());
// make fake id
std::uniform_int_distribution<> dis(1, static_cast<int>(n_sample)
* 1000);
std::uniform_int_distribution<> sex_rand(1, 3);
std::unordered_set<int> dup_gen;
std::vector<std::string> included_samples;
std::string file_input;
size_t exp_male = 0, exp_female = 0, exp_ambig = 0;
const size_t fid_idx = 0;
const size_t iid_idx = ignore_fid ? 0 : 1;
size_t sex_idx = 0;
for (size_t i = 0; i < static_cast<uintptr_t>(n_sample); ++i)
auto sex = sex_rand(gen);
switch (sex)
{
case 1:
mock_input[2] = "1";
if (!duplicated) ++exp_male;
break;
case 2:
mock_input[2] = "2";
if (!duplicated) ++exp_female;
break;
default:
mock_input[2] = "NA";
if (!duplicated) ++exp_ambig;
break;
}
sex_idx = 2;
}
else
{
// when there's no sex information, we assume ambiguous
sex_idx = ~size_t(0);
if (!duplicated) ++exp_ambig;
}
auto has_dad = has_father(gen) == 1;
auto has_mum = has_mother(gen) == 1;
if (has_dad)
{
founder_info.insert((ignore_fid ? "" : exp + delim)
+ mock_input[3]);
}
if (has_mum)
{
founder_info.insert((ignore_fid ? "" : exp + delim)
+ mock_input[4]);
}
// sample selection
auto remove = remove_rand(gen);
if (remove == 1)
{
auto id = ((ignore_fid) ? "" : exp + delim) + exp;
geno.add_select_sample(id);
}
auto dad_idx = has_father_id ? 3 : ~size_t(0);
auto mum_idx = has_mother_id ? 4 : ~size_t(0);
auto dup_size = duplicated_sample_names.size();
REQUIRE_NOTHROW(
geno.test_gen_sample(fid_idx, iid_idx, sex_idx, dad_idx, mum_idx, i,
founder_info, "", mock_input, sample_storage,
processed_samples, duplicated_sample_names));
REQUIRE(duplicated_sample_names.size() == dup_size + duplicated);
auto in_prs = geno.calculate_prs();
auto in_ld = geno.sample_for_ld();
if (remove_samples ^ (remove == 1) && !duplicated)
{
REQUIRE(IS_SET(in_prs.data(), i));
// never use non-founder for LD calculation
if (!ignore_fid
&& ((has_dad && has_father_id) || (has_mum && has_mother_id)))
{ REQUIRE_FALSE(IS_SET(in_ld.data(), i)); }
else
{
auto b_in_prs = geno.calculate_prs();
auto b_in_ld = geno.sample_for_ld();
REQUIRE_FALSE(IS_SET(b_in_prs.data(), i));
REQUIRE_FALSE(IS_SET(b_in_ld.data(), i));
std::vector<std::string> mock_input(
static_cast<uintptr_t>(n_col), std::string("@"));
int temp = dis(gen);
// generate non-duplicated samples
while (dup_gen.find(temp) != dup_gen.end()) temp = dis(gen);
dup_gen.insert(temp);
mock_input[0] = "F" + std::to_string(temp);
std::string exp = mock_input[0];
if (!ignore_fid) { mock_input[1] = exp; }
included_samples.push_back(exp);
if (has_sex)
{
auto sex = sex_rand(gen);
switch (sex)
{
case 1:
mock_input[2] = "1";
++exp_male;
break;
case 2:
mock_input[2] = "2";
++exp_female;
break;
default:
mock_input[2] = "NA";
++exp_ambig;
break;
}
sex_idx = 2;
}
else
{
// when there's no sex information, we assume ambiguous
sex_idx = ~size_t(0);
++exp_ambig;
}
REQUIRE_NOTHROW(geno.test_gen_sample(
fid_idx, iid_idx, sex_idx, ~size_t(0), ~size_t(0), i,
founder_info, "", mock_input, sample_storage,
processed_samples, duplicated_sample_names));
REQUIRE(duplicated_sample_names.empty());
auto in_prs = geno.calculate_prs();
auto in_ld = geno.sample_for_ld();
REQUIRE(IS_SET(in_prs.data(), i));
REQUIRE(IS_SET(in_ld.data(), i));
REQUIRE(geno.num_male() == exp_male);
REQUIRE(geno.num_female() == exp_female);
REQUIRE(geno.num_ambig_sex() == exp_ambig);
// always have IID
REQUIRE(sample_storage.back().IID == exp);
}
// always have IID
REQUIRE(sample_storage.back().IID == exp);
// only use non-founder for regression if asked to
if (!ignore_fid && !keep_nonfounder
&& ((has_dad && has_father_id) || (has_mum && has_mother_id)))
{ REQUIRE_FALSE(sample_storage.back().in_regression); }
else
{
REQUIRE(sample_storage.back().in_regression);
if (!ignore_fid) { REQUIRE(sample_storage.back().FID == exp); }
else
{
REQUIRE(sample_storage.back().FID.empty());
}
}
if (!ignore_fid) { REQUIRE(sample_storage.back().FID == exp); }
else
{
REQUIRE(sample_storage.back().FID.empty());
}
}
else
{
REQUIRE_FALSE(IS_SET(in_prs.data(), i));
REQUIRE_FALSE(IS_SET(in_ld.data(), i));
}
REQUIRE(geno.num_male() == exp_male);
REQUIRE(geno.num_female() == exp_female);
REQUIRE(geno.num_ambig_sex() == exp_ambig);
}
SECTION("with family") {}
}
9 changes: 9 additions & 0 deletions test/inc/mock_genotype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,20 @@ class mockGenotype : public Genotype
founder_info, pheno, token, sample_storage, sample_in_file,
duplicated_sample_id);
}
/// Test hook: register one sample ID in the sample selection list.
/// @param in ID to insert into m_sample_selection_list.
// NOTE(review): gen_sample appears to look IDs up as "FID<delim>IID", or the
// bare IID when FID is ignored, so callers presumably pass the same form
// -- confirm against the caller.
void add_select_sample(const std::string& in)
{
    m_sample_selection_list.insert(in);
}
/// Test hook: overwrite the m_remove_sample flag.
/// @param remove new value stored in m_remove_sample.
// NOTE(review): gen_sample XORs this flag with "ID found in the selection
// list" to decide inclusion, so true presumably means the list is a removal
// list and false a keep list -- confirm.
void change_sample_selection(bool remove)
{
    m_remove_sample = remove;
}
/// Test hook: register one SNP ID in the SNP selection list and set the
/// exclusion flag in the same call.
/// @param in      ID to insert into m_snp_selection_list.
/// @param exclude value stored in m_exclude_snp; it overwrites whatever a
///                previous call set, so the flag applies to the whole list.
// NOTE(review): presumably exclude == true means listed SNPs are dropped and
// false means only listed SNPs are kept -- confirm against the Genotype code.
void add_select_snp(const std::string& in, bool exclude)
{
// insert first, then update the flag (order kept as in the original)
m_snp_selection_list.insert(in);
m_exclude_snp = exclude;
}
/// Test hook: overwrite the m_keep_nonfounder flag.
/// @param keep_nonfounder new value stored in m_keep_nonfounder.
// NOTE(review): the sample-generation test expects non-founders to be kept
// in the regression only when this is true -- confirm against gen_sample.
void set_keep_nonfounder(bool keep_nonfounder)
{
    m_keep_nonfounder = keep_nonfounder;
}
uint32_t num_auto() const { return m_autosome_ct; }
std::vector<int32_t> xymt_codes() const { return m_xymt_codes; }
std::vector<uintptr_t> haploid_mask() const { return m_haploid_mask; }
Expand Down

0 comments on commit 98f03f9

Please sign in to comment.