Skip to content

Commit

Permalink
[Update] Implemented unit test for load_sample for different file types
Browse files Browse the repository at this point in the history
  • Loading branch information
choishingwan committed Apr 13, 2020
1 parent 98f03f9 commit 8d6fb3b
Show file tree
Hide file tree
Showing 10 changed files with 475 additions and 96 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ endif()

add_subdirectory(src)

option (BUILD_TESTING "Build the testing tree." OFF)
option (BUILD_TESTING "Build the unit test." OFF)
# Only build tests if we are the top-level project
# Allows this to be used by super projects with `add_subdirectory`
if (BUILD_TESTING AND (PROJECT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR))
Expand Down
3 changes: 2 additions & 1 deletion inc/binarygen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class BinaryGen : public Genotype
* or just a simple text file
* \return
*/
static bool check_is_sample_format(const std::string& input);
static bool check_is_sample_format(std::unique_ptr<std::istream>& input);

protected:
typedef std::vector<std::vector<double>> Data;
Expand All @@ -59,6 +59,7 @@ class BinaryGen : public Genotype
* \return Vector containing the sample information
*/
std::vector<Sample_ID> gen_sample_vector();
void handle_pheno_header(std::unique_ptr<std::istream>& sample);
void
gen_snp_vector(const std::vector<IITree<size_t, size_t>>& exclusion_regions,
const std::string& out_prefix, Genotype* target = nullptr);
Expand Down
3 changes: 2 additions & 1 deletion inc/binaryplink.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ class BinaryPlink : public Genotype
bool force_cal = false);
void check_bed(const std::string& bed_name, size_t num_marker,
uintptr_t& bed_offset);
std::unordered_set<std::string> get_founder_info(std::ifstream& famfile);
std::unordered_set<std::string>
get_founder_info(std::unique_ptr<std::istream>& famfile);
inline void read_genotype(uintptr_t* __restrict genotype,
const std::streampos byte_pos,
const size_t& file_idx)
Expand Down
140 changes: 62 additions & 78 deletions src/binarygen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,37 @@ size_t BinaryGen::get_sex_col(const std::string& header,
return sex_col;
}

void BinaryGen::handle_pheno_header(std::unique_ptr<std::istream>& sample)
{
std::string line;
bool have_header = false;
size_t num_line = 0;
while (std::getline(*sample, line))
{
misc::trim(line);
if (!line.empty()) ++num_line;
}
if (num_line == m_unfiltered_sample_ct + 1) { have_header = true; }
else if (num_line != m_unfiltered_sample_ct)
{
throw std::runtime_error(
"Error: Number of sample in phenotype file does not match "
"number of samples specified in bgen file. Please check "
"you "
"have the correct phenotype file input. Note: Phenotype "
"file "
"should have the same number of samples as the bgen file "
"and "
"they should appear in the same order");
}
(*sample).clear();
(*sample).seekg(0);
if (have_header)
{
std::getline(*sample, line);
m_reporter->report("Assume phenotype file has header line: " + line);
}
}
std::vector<Sample_ID> BinaryGen::gen_sample_vector()
{
// this is the first time we do something w.r.t bgen file
Expand All @@ -77,48 +108,39 @@ std::vector<Sample_ID> BinaryGen::gen_sample_vector()
// we always know the sample size from context
m_unfiltered_sample_ct = m_context_map[0].number_of_samples;
init_sample_vectors();
if (m_is_ref)
if (m_is_ref && m_sample_selection_list.empty())
{
for (size_t i = 0; i < m_unfiltered_sample_ct; ++i)
{
++m_sample_ct;
SET_BIT(i, m_calculate_prs.data());
// we assume all bgen samples to be founder
SET_BIT(i, m_sample_for_ld.data());
}
}
else if (!m_is_ref || !m_sample_selection_list.empty())
{
// don't bother with sample check if we are using bgen as reference and
// not exclude/extracting samples
if ((!m_keep_file.empty() || !m_remove_file.empty())
&& (m_sample_file.empty()))
// this is the target, where the m_sample_file must be correct, or
// this is the reference, which we asked for --keep or --remove and
// an external sample file was provided (that's why we don't get
// into the runtime_error)
if (m_is_ref && m_sample_file.empty())
{
throw std::runtime_error("Error: Cannot perform sample "
"filtering on the LD reference "
"file without the sample file!");
}
else
{
for (size_t i = 0; i < m_unfiltered_sample_ct; ++i)
{
++m_sample_ct;
SET_BIT(i, m_calculate_prs.data());
// we assume all bgen samples to be founder
SET_BIT(i, m_sample_for_ld.data());
}
}
}
else if (!m_is_ref || (!m_keep_file.empty() || !m_remove_file.empty()))
{
// this is the target, where the m_sample_file must be correct, or this
// is the reference, which we asked for --keep or --remove and an
// external sample file was provided (that's why we don't get into the
// runtime_error)
const bool is_sample_format = check_is_sample_format(m_sample_file);
std::ifstream sample_file(m_sample_file.c_str());
// don't need to check again as the check_is_sample_format function
// already checked if the file is opened
auto sample = misc::load_stream(m_sample_file);
const bool is_sample_format = check_is_sample_format(sample);
std::string line;
size_t sex_col = ~size_t(0);

// now check if there's a sex information
if (is_sample_format)
{
// only do this if the file is sample format
std::getline(sample_file, line);
std::getline(*sample, line);
std::string format;
std::getline(sample_file, format);
std::getline(*sample, format);
sex_col = get_sex_col(line, format);
}
// now start reading the file
Expand All @@ -127,44 +149,11 @@ std::vector<Sample_ID> BinaryGen::gen_sample_vector()
std::vector<std::string> duplicated_sample_id;
std::vector<std::string> token;
const size_t required_column =
((sex_col != ~size_t(0)) ? (sex_col) : (1 + !m_ignore_fid));
((sex_col != ~size_t(0)) ? (sex_col + 1) : (1 + !m_ignore_fid));
const size_t iid_idx = (is_sample_format || !m_ignore_fid) ? 1 : 0;
const size_t fid_idx = 0;
// more robust header check, only remove header if sample size = line +
// 1
// first, get number of lines in the file
if (!is_sample_format)
{
bool have_header = false;
size_t num_line = 0;
while (std::getline(sample_file, line))
{
misc::trim(line);
if (!line.empty()) ++num_line;
}
if (num_line == m_unfiltered_sample_ct + 1) { have_header = true; }
else if (num_line != m_unfiltered_sample_ct)
{
throw std::runtime_error(
"Error: Number of sample in phenotype file does not match "
"number of samples specified in bgen file. Please check "
"you "
"have the correct phenotype file input. Note: Phenotype "
"file "
"should have the same number of samples as the bgen file "
"and "
"they should appear in the same order");
}
sample_file.clear();
sample_file.seekg(0);
if (have_header)
{
std::getline(sample_file, line);
m_reporter->report("Assume phenotype file has header line: "
+ line);
}
}
while (std::getline(sample_file, line))
if (!is_sample_format) { handle_pheno_header(sample); }
while (std::getline(*sample, line))
{
misc::trim(line);
if (line.empty()) continue;
Expand All @@ -179,8 +168,8 @@ std::vector<Sample_ID> BinaryGen::gen_sample_vector()
+ " columns! Number of column="
+ misc::to_string(token.size()));
}
gen_sample(fid_idx, iid_idx, sex_col, 0, 0, line_id,
std::unordered_set<std::string> {}, "", token,
gen_sample(fid_idx, iid_idx, sex_col, ~size_t(0), ~size_t(0),
line_id, std::unordered_set<std::string> {}, "", token,
sample_name, sample_in_file, duplicated_sample_id);
++line_id;
}
Expand All @@ -194,29 +183,24 @@ std::vector<Sample_ID> BinaryGen::gen_sample_vector()
"have an "
"unique identifier");
}
sample_file.close();
sample.reset();
}
post_sample_read_init();
return sample_name;
}

bool BinaryGen::check_is_sample_format(const std::string& input)
bool BinaryGen::check_is_sample_format(std::unique_ptr<std::istream>& input)
{
// read the sample file
// might want to change it according to the new sample file,
// which only mandate the first column
std::ifstream sample_file(input.c_str());
if (!sample_file.is_open())
{
std::string error_message = "Error: Cannot open sample file: " + input;
throw std::runtime_error(error_message);
}
// get the first two line of input
std::string first_line, second_line;
std::getline(sample_file, first_line);
std::getline(*input, first_line);
// we must have at least 2 row for a sample file
if (!std::getline(sample_file, second_line)) { return false; }
sample_file.close();
if (!std::getline(*input, second_line)) { return false; }
(*input).clear();
(*input).seekg(0);
// split the first two lines
const std::vector<std::string_view> first_row = misc::tokenize(first_line);
const std::vector<std::string_view> second_row =
Expand Down
21 changes: 7 additions & 14 deletions src/binaryplink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ BinaryPlink::BinaryPlink(const GenoFile& geno, const Phenotype& pheno,
}

std::unordered_set<std::string>
BinaryPlink::get_founder_info(std::ifstream& famfile)
BinaryPlink::get_founder_info(std::unique_ptr<std::istream>& famfile)
{
std::string line;
std::vector<std::string> token;
std::unordered_set<std::string> founder_info;
while (std::getline(famfile, line))
while (std::getline(*famfile, line))
{
misc::trim(line);
if (line.empty()) continue;
Expand All @@ -47,20 +47,14 @@ BinaryPlink::get_founder_info(std::ifstream& famfile)
founder_info.insert(token[+FAM::FID] + m_delim + token[+FAM::IID]);
++m_unfiltered_sample_ct;
}
famfile.clear();
famfile.seekg(0);
(*famfile).clear();
(*famfile).seekg(0);
return founder_info;
}
std::vector<Sample_ID> BinaryPlink::gen_sample_vector()
{
assert(m_genotype_file_names.size() > 0);
std::ifstream famfile;
famfile.open(m_sample_file.c_str());
if (!famfile.is_open())
{
throw std::runtime_error("Error: Cannot open fam file: "
+ m_sample_file);
}
auto famfile = misc::load_stream(m_sample_file);
m_unfiltered_sample_ct = 0;
// will also count number of samples here. Which initialize the important
// m_unfiltered_sample_ct
Expand All @@ -74,7 +68,7 @@ std::vector<Sample_ID> BinaryPlink::gen_sample_vector()
uintptr_t sample_index = 0; // this is just for error message
std::vector<std::string> token;
std::string line;
while (std::getline(famfile, line))
while (std::getline(*famfile, line))
{
misc::trim(line);
if (line.empty()) continue;
Expand All @@ -93,8 +87,7 @@ std::vector<Sample_ID> BinaryPlink::gen_sample_vector()
+ " duplicated samples detected!\n"
+ "Please ensure all samples have an unique identifier");
}

famfile.close();
famfile.reset();
post_sample_read_init();
return sample_name;
}
Expand Down
4 changes: 3 additions & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ add_executable(tests ${TEST_SOURCES}
${TEST_SRC_DIR}/misc_test.cpp
${TEST_SRC_DIR}/genotype_basic.cpp
${TEST_SRC_DIR}/genotype_read_base.cpp
${TEST_SRC_DIR}/genotype_read_sample.cpp)
${TEST_SRC_DIR}/genotype_read_sample.cpp
${TEST_SRC_DIR}/binaryplink_sample_load.cpp
${TEST_SRC_DIR}/binarygen_sample_load.cpp)
target_link_libraries(tests PUBLIC
Catch
genotyping
Expand Down

0 comments on commit 8d6fb3b

Please sign in to comment.