diff --git a/inc/genotype.hpp b/inc/genotype.hpp index 7354b292..040134b6 100644 --- a/inc/genotype.hpp +++ b/inc/genotype.hpp @@ -100,7 +100,10 @@ class Genotype void load_snps(const std::string& out, const std::vector>& exclusion_regions, bool verbose, Genotype* target = nullptr); - + std::tuple get_max_id_length() const + { + return {m_max_fid_length, m_max_iid_length}; + } void calc_freqs_and_intermediate(const QCFiltering& filter_info, const std::string& prefix, bool verbose, Genotype* target = nullptr); @@ -685,6 +688,8 @@ class Genotype size_t m_num_female = 0; size_t m_num_ambig_sex = 0; size_t m_num_non_founder = 0; + size_t m_max_fid_length = 3; + size_t m_max_iid_length = 3; uintptr_t m_unfiltered_sample_ct = 0; // number of unfiltered samples uintptr_t m_unfiltered_marker_ct = 0; uintptr_t m_sample_ct = 0; diff --git a/inc/prsice.hpp b/inc/prsice.hpp index bbc29bb7..534a9d61 100644 --- a/inc/prsice.hpp +++ b/inc/prsice.hpp @@ -72,14 +72,17 @@ class PRSice public: PRSice() {} PRSice(const CalculatePRS& prs_info, const PThresholding& p_info, - const Phenotype& pheno, const Permutations& perm, - const std::string& output, Reporter* reporter) + const Permutations& perm, const std::string& output, + const size_t max_fid, const size_t max_iid, const bool binary, + Reporter* reporter) : m_prefix(output) + , m_max_fid_length(max_fid) + , m_max_iid_length(max_iid) + , m_binary_trait(binary) , m_reporter(reporter) , m_prs_info(prs_info) , m_p_info(p_info) , m_perm_info(perm) - , m_pheno_info(pheno) { } @@ -93,10 +96,7 @@ class PRSice */ static void pheno_check(const bool no_regress, Phenotype& pheno, Reporter& reporter); - bool pheno_skip(size_t idx) const - { - return m_pheno_info.skip_pheno.at(idx); - } + // init_matrix whenever phenotype changes /*! * \brief init_matrix will initialize the independent and dependent matrix @@ -108,15 +108,6 @@ class PRSice */ void init_matrix(const size_t pheno_index, const std::string& delim, Genotype& target); - /*! - * \brief Return the total number of phenotype involved - * \return the total number of phenotype to process - */ - size_t num_phenotype() const { return m_pheno_info.binary.size(); } - std::string pheno_name(const size_t i) const - { - return m_pheno_info.pheno_col.at(i); - } void new_phenotype(Genotype& target); bool run_prsice(const size_t pheno_index, const size_t region_index, const std::vector>& region_membership, @@ -191,9 +182,6 @@ class PRSice // now calculate the number of competitive permutation we need // we only use the best threshold for set based permutation m_total_competitive_process = (num_region - 2) * num_set_perm; - } - void reset_progress() - { m_analysis_done = 0; m_total_competitive_perm_done = 0; } @@ -231,14 +219,6 @@ class PRSice const std::vector::const_iterator& bk_start_idx, const std::vector::const_iterator& bk_end_idx, const size_t pheno_index); - bool valid_pheno(const size_t idx) const - { - return !m_pheno_info.skip_pheno.at(idx); - } - std::vector get_prevalence() const - { - return m_pheno_info.prevalence; - } /*! * \brief Function responsible to generate the best score file @@ -295,18 +275,22 @@ class PRSice processed_threshold = 0; } }; - // struct Pheno_Info - // { - // std::vector col; - // std::vector name; - // std::vector order; - // std::vector binary; - // bool use_pheno = false; - // } m_pheno_info; // store the number of non-sig, margin sig, and sig pathway & phenotype static std::mutex lock_guard; - + // As R has a default precision of 7, we will go a bit + // higher to ensure we use up all precision + const std::string m_prefix; + const long long m_precision = 9; + // the 7 are: + // 1 for sign + // 1 for dot + // 2 for e- (scientific) + // 3 for exponent (max precision is somewhere around +-e297, so 3 is enough + const long long m_numeric_width = m_precision + 7; + const size_t m_max_fid_length = 3; + const size_t m_max_iid_length = 3; + const bool m_binary_trait = true; Eigen::MatrixXd m_independent_variables; // TODO: Use other method for faster best output Eigen::MatrixXd m_fast_best_output; @@ -334,21 +318,10 @@ class PRSice uint32_t m_num_snp_included = 0; uint32_t m_analysis_done = 0; - // As R has a default precision of 7, we will go a bit - // higher to ensure we use up all precision - const long long m_precision = 9; - // the 7 are: - // 1 for sign - // 1 for dot - // 2 for e- (scientific) - // 3 for exponent (max precision is somewhere around +-e297, so 3 is enough - const long long m_numeric_width = m_precision + 7; - long long m_max_fid_length = 3; - long long m_max_iid_length = 3; + int m_best_index = -1; bool m_quick_best = true; bool m_printed_warning = false; - const std::string m_prefix; Reporter* m_reporter; CalculatePRS m_prs_info; PThresholding m_p_info; @@ -384,6 +357,22 @@ class PRSice */ void gen_pheno_vec(Genotype& target, const size_t pheno_index, const std::string& delim); + void process_phenotype_file(const std::string& file_name, + const std::string& pheno_name, + const std::string& delim, + const std::size_t pheno_idx, + const bool ignore_fid, Genotype& target); + void process_phenotype_info(const std::string& pheno_name, + const std::string& delim, const bool ignore_fid, + Genotype& target); + std::tuple + binary_pheno_is_valid(const int max_pheno_code, + std::vector& pheno_store); + bool quantitative_pheno_is_valid(const std::vector& pheno_store); + void print_pheno_log(const std::string& name, const size_t sample_ct, + const size_t num_not_found, const size_t invalid_pheno, + const int max_pheno_code, const bool ignore_fid, + std::vector& pheno_store); /*! * \brief Function to generate the m_independent_variable matrix * \param c_cov_file is the name of the covariate file @@ -542,13 +531,14 @@ class PRSice Pmat, const Eigen::MatrixXd& R, const bool run_glm); - void parse_pheno(const bool binary, const std::string& pheno, - std::vector& pheno_store, double& first_pheno, - bool& more_than_one_pheno, size_t& num_case, - size_t& num_control, int& max_pheno_code); + void parse_pheno(const std::string& pheno, std::vector& pheno_store, + int& max_pheno_code); std::unordered_map - load_pheno_map(const size_t idx, const std::string& delim); + load_pheno_map(const std::string& delim, const size_t idx, + const bool ignore_fid, + std::unique_ptr pheno_file); + void reset_result_containers(const Genotype& target, const size_t region_idx); diff --git a/src/genotype.cpp b/src/genotype.cpp index 354d9d65..aff404cf 100644 --- a/src/genotype.cpp +++ b/src/genotype.cpp @@ -575,6 +575,11 @@ void Genotype::gen_sample(const size_t fid_idx, const size_t iid_idx, const std::string fid = (m_ignore_fid) ? "" : token[fid_idx]; const std::string id = (m_ignore_fid) ? token[iid_idx] : fid + m_delim + token[iid_idx]; + if (m_max_fid_length < token[fid_idx].length()) + m_max_fid_length = token[fid_idx].length(); + if (m_max_iid_length < token[iid_idx].length()) + m_max_iid_length = token[iid_idx].length(); + // end immediately if duplicated samples are found if (sample_in_file.find(id) != sample_in_file.end()) { diff --git a/src/main.cpp b/src/main.cpp index 38a1098b..697b4c36 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -140,16 +140,11 @@ int main(int argc, char* argv[]) // one extra progress bar for competitive permutation assert(target_file->get_set_thresholds().size() == num_regions); - + const auto [max_fid, max_iid] = target_file->get_max_id_length(); const size_t num_pheno = pheno_info.pheno_col_idx.size(); for (size_t i_pheno = 0; i_pheno < num_pheno; ++i_pheno) { - PRSice prsice(commander.get_prs_instruction(), - commander.get_p_threshold(), pheno_info, - commander.get_perm(), commander.out(), &reporter); - prsice.init_progress_count(target_file->get_set_thresholds()); - - if (prsice.pheno_skip(i_pheno)) + if (pheno_info.skip_pheno[i_pheno]) { reporter.simple_report("Skipping the " + std::to_string(i_pheno + 1) @@ -159,6 +154,11 @@ int main(int argc, char* argv[]) reporter.simple_report("Processing the " + std::to_string(i_pheno + 1) + " th phenotype"); + PRSice prsice(commander.get_prs_instruction(), + commander.get_p_threshold(), commander.get_perm(), + commander.out(), max_fid, max_iid, + pheno_info.binary[i_pheno], &reporter); + prsice.init_progress_count(target_file->get_set_thresholds()); prsice.new_phenotype(*target_file); if (!commander.get_prs_instruction().no_regress) { diff --git a/src/prsice.cpp b/src/prsice.cpp index a8ca9529..2a212a1a 100644 --- a/src/prsice.cpp +++ b/src/prsice.cpp @@ -168,6 +168,8 @@ void PRSice::new_phenotype(Genotype& target) // only need to give the target file to calculate the FID and IID length if (m_prs_info.no_regress) { update_sample_included("", false, target); } } + + void PRSice::init_matrix(const size_t pheno_index, const std::string& delim, Genotype& target) { @@ -219,12 +221,6 @@ void PRSice::init_matrix(const size_t pheno_index, const std::string& delim, void PRSice::update_sample_included(const std::string& delim, const bool binary, Genotype& target) { - // this is a bit tricky. The reason we need to calculate the max fid and - // iid length is so that we can generate the best file and all score - // file vertically by replacing pre-placed space characters with the - // desired number - m_max_fid_length = 3; - m_max_iid_length = 3; // we also want to avoid always having to search from // m_sample_with_phenotypes. Therefore, we push in the sample index to // m_matrix_index @@ -234,7 +230,6 @@ void PRSice::update_sample_included(const std::string& delim, const bool binary, // correspond to the phenotype vector and covariance matrix's order // correctly m_matrix_index.clear(); - long long fid_length, iid_length; const bool ctrl_std = binary && m_prs_info.scoring_method == SCORING::CONTROL_STD; for (size_t i_sample = 0; i_sample < target.num_sample(); ++i_sample) @@ -249,10 +244,6 @@ void PRSice::update_sample_included(const std::string& delim, const bool binary, throw std::runtime_error( "Error: FID / IID are pathologically long"); } - fid_length = static_cast(target.fid(i_sample).length()); - iid_length = static_cast(target.iid(i_sample).length()); - if (m_max_fid_length < fid_length) m_max_fid_length = fid_length; - if (m_max_iid_length < iid_length) m_max_iid_length = iid_length; // update the in regression flag according to covariate if (!m_sample_with_phenotypes.empty()) { @@ -279,27 +270,18 @@ void PRSice::update_sample_included(const std::string& delim, const bool binary, } } -void PRSice::parse_pheno(const bool binary, const std::string& pheno, - std::vector& pheno_store, double& first_pheno, - bool& more_than_one_pheno, size_t& num_case, - size_t& num_control, int& max_pheno_code) +void PRSice::parse_pheno(const std::string& pheno, + std::vector& pheno_store, int& max_pheno_code) { - if (binary) + if (m_binary_trait) { - // if trait is binary - // we first convert it to a temporary int temp = misc::convert(pheno); - // so taht we can check if the input is valid if (temp >= 0 && temp <= 2) { pheno_store.push_back(temp); if (max_pheno_code < temp) max_pheno_code = temp; - if (temp == 1) - ++num_case; - else - ++num_control; } - else + else if (temp != -9) { throw std::runtime_error("Invalid binary phenotype format!"); } @@ -307,48 +289,34 @@ void PRSice::parse_pheno(const bool binary, const std::string& pheno, else { pheno_store.push_back(misc::convert(pheno)); - if (pheno_store.size() == 1) { first_pheno = pheno_store[0]; } - else if (!more_than_one_pheno - && !misc::logically_equal(first_pheno, pheno_store.back())) - { - more_than_one_pheno = true; - } } } std::unordered_map -PRSice::load_pheno_map(const size_t idx, const std::string& delim) +PRSice::load_pheno_map(const std::string& delim, const size_t idx, + const bool ignore_fid, + std::unique_ptr pheno_file) { - const size_t pheno_col_index = m_pheno_info.pheno_col_idx[idx]; - std::ifstream pheno_file; - pheno_file.open(m_pheno_info.pheno_file.c_str()); - if (!pheno_file.is_open()) - { - throw std::runtime_error("Cannot open phenotype file: " - + m_pheno_info.pheno_file); - } - // we first store everything into a map. This allow the phenotype and - // genotype file to have completely different ordering and allow - // different samples to be included in each file std::unordered_map phenotype_info; - std::vector token; + std::vector token; std::string line, id; - while (std::getline(pheno_file, line)) + while (std::getline(*pheno_file, line)) { misc::trim(line); if (line.empty()) continue; - token = misc::split(line); + token = misc::tokenize(line); // Check if we have the minimal required column number - if (token.size() < pheno_col_index + 1) + if (token.size() < idx + 1) { throw std::runtime_error( "Malformed pheno file, should contain at least " - + misc::to_string(pheno_col_index + 1) + + misc::to_string(idx + 1) + " columns. " "Have you use the --ignore-fid option?"); } - id = (m_pheno_info.ignore_fid) ? token[0] : token[0] + delim + token[1]; - // and store the information into the map + id = (ignore_fid) + ? std::string(token[0]) + : std::string(token[0]) + delim + std::string(token[1]); if (phenotype_info.find(id) != phenotype_info.end()) { throw std::runtime_error("Error: Duplicated sample ID in " @@ -357,95 +325,123 @@ PRSice::load_pheno_map(const size_t idx, const std::string& delim) + ". Please " "check if your input is correct!"); } - phenotype_info[id] = token[pheno_col_index]; + phenotype_info[id] = std::string(token[idx]); } - pheno_file.close(); + pheno_file.reset(); return phenotype_info; } -void PRSice::gen_pheno_vec(Genotype& target, const size_t pheno_index, - const std::string& delim) +std::tuple +PRSice::binary_pheno_is_valid(const int max_pheno_code, + std::vector& pheno_store) { - const bool binary = m_pheno_info.binary[pheno_index]; - const size_t sample_ct = target.num_sample(); - std::string line; - int max_pheno_code = 0; + size_t min_pheno = max_pheno_code == 1 ? 0 : 1; size_t num_case = 0; size_t num_control = 0; + bool valid = true; + for (auto&& pheno : pheno_store) + { + if (pheno < min_pheno) + { + valid = false; + break; + } + pheno -= min_pheno; + misc::logically_equal(pheno, 1) ? ++num_case : ++num_control; + } + return {valid, num_case, num_control}; +} +bool PRSice::quantitative_pheno_is_valid(const std::vector& pheno_store) +{ + // assume pheno_store is valid + double prev = pheno_store.front(); + for (auto&& p : pheno_store) + { + if (!misc::logically_equal(p, prev)) { return true; } + } + return false; +} +void PRSice::process_phenotype_info(const std::string& pheno_name, + const std::string& delim, + const bool ignore_fid, Genotype& target) +{ + const size_t sample_ct = target.num_sample(); + std::vector pheno_store; + pheno_store.reserve(sample_ct); size_t invalid_pheno = 0; size_t num_not_found = 0; size_t sample_index_ct = 0; - // we will first store the phenotype into the double vector and then - // later use this to construct the matrix - std::vector pheno_store; - pheno_store.reserve(sample_ct); - std::string pheno_name = "Phenotype"; - std::string id; - - // check if input is sensible - double first_pheno = 0.0; - bool more_than_one_pheno = false; - - if (!m_pheno_info.pheno_file.empty()) // use phenotype file + int max_pheno_code = 0; + for (size_t i_sample = 0; i_sample < sample_ct; ++i_sample) { - // read in the phenotype index - pheno_name = m_pheno_info.pheno_col[pheno_index]; - std::unordered_map phenotype_info = - load_pheno_map(pheno_index, delim); - for (size_t i_sample = 0; i_sample < sample_ct; ++i_sample) + if (target.pheno_is_na(i_sample) || !target.in_regression(i_sample)) + { continue; } + try { - id = target.sample_id(i_sample, delim); - if (phenotype_info.find(id) != phenotype_info.end() - && phenotype_info[id] != "NA" && target.in_regression(i_sample)) - { - try - { - parse_pheno(binary, phenotype_info[id], pheno_store, - first_pheno, more_than_one_pheno, num_case, - num_control, max_pheno_code); - m_sample_with_phenotypes[id] = sample_index_ct; - ++sample_index_ct; - } - catch (...) - { - ++invalid_pheno; - } - } - else - { - ++num_not_found; - } + parse_pheno(target.pheno(i_sample), pheno_store, max_pheno_code); + m_sample_with_phenotypes[target.sample_id(i_sample, delim)] = + sample_index_ct; + ++sample_index_ct; + } + catch (const std::runtime_error&) + { + ++invalid_pheno; } } - else + print_pheno_log(pheno_name, sample_ct, num_not_found, invalid_pheno, + max_pheno_code, ignore_fid, pheno_store); +} +void PRSice::process_phenotype_file(const std::string& file_name, + const std::string& pheno_name, + const std::string& delim, + const std::size_t pheno_idx, + const bool ignore_fid, Genotype& target) +{ + auto pheno_stream = misc::load_stream(file_name); + auto phenotype_info = + load_pheno_map(delim, pheno_idx, ignore_fid, std::move(pheno_stream)); + const size_t sample_ct = target.num_sample(); + std::string id; + int max_pheno_code = 0; + size_t invalid_pheno = 0; + size_t num_not_found = 0; + size_t sample_index_ct = 0; + std::vector pheno_store; + pheno_store.reserve(sample_ct); + for (size_t i_sample = 0; i_sample < sample_ct; ++i_sample) { - // No phenotype file is provided - // Use information from the fam file directly - for (size_t i_sample = 0; i_sample < sample_ct; ++i_sample) + id = target.sample_id(i_sample, delim); + auto pheno_tmp = phenotype_info[id]; + misc::to_lower(pheno_tmp); + if (phenotype_info.find(id) != phenotype_info.end() && pheno_tmp != "na" + && phenotype_info[id] != "nan" && target.in_regression(i_sample)) { - if (target.pheno_is_na(i_sample) || !target.in_regression(i_sample)) - { - // it is ok to skip NA as default = sample.has_pheno = false - continue; - } try { - parse_pheno(binary, target.pheno(i_sample), pheno_store, - first_pheno, more_than_one_pheno, num_case, - num_control, max_pheno_code); - m_sample_with_phenotypes[target.sample_id(i_sample, delim)] = - sample_index_ct; + parse_pheno(phenotype_info[id], pheno_store, max_pheno_code); + m_sample_with_phenotypes[id] = sample_index_ct; ++sample_index_ct; } - catch (const std::runtime_error&) + catch (...) { ++invalid_pheno; } } + else + { + ++num_not_found; + } } - - std::string message = ""; - message = pheno_name + " is a "; - if (binary) { message.append("binary phenotype\n"); } + print_pheno_log(pheno_name, sample_ct, num_not_found, invalid_pheno, + max_pheno_code, ignore_fid, pheno_store); +} +void PRSice::print_pheno_log(const std::string& name, const size_t sample_ct, + const size_t num_not_found, + const size_t invalid_pheno, + const int max_pheno_code, const bool ignore_fid, + std::vector& pheno_store) +{ + std::string message = name + " is a "; + if (m_binary_trait) { message.append("binary phenotype\n"); } else { message.append("continuous phenotype\n"); @@ -460,12 +456,11 @@ void PRSice::gen_pheno_vec(Genotype& target, const size_t pheno_index, message.append(std::to_string(invalid_pheno) + " sample(s) with invalid phenotype\n"); } - if (num_not_found == sample_ct) { message.append("None of the target samples were found in the " "phenotype file. "); - if (m_pheno_info.ignore_fid) + if (ignore_fid) { message.append("Maybe the first column of your phenotype file " "is the FID?"); @@ -473,11 +468,12 @@ void PRSice::gen_pheno_vec(Genotype& target, const size_t pheno_index, else { message.append( - "Maybe your phenotype file doesn not contain the FID?\n"); - message.append("Might want to consider using --ignore-fid\n"); + "Maybe your phenotype file doesn not contain the FID?\n" + "Might want to consider using --ignore-fid\n"); } - message.append("Or it is possible that only non-founder sample contain " - "the phenotype information and you did not use " + message.append("Or it is possible that only non-founder sample have " + "phenotype information " + " and you did not use " "--nonfounders?\n"); m_reporter->report(message); throw std::runtime_error("Error: No sample left"); @@ -488,58 +484,69 @@ void PRSice::gen_pheno_vec(Genotype& target, const size_t pheno_index, m_reporter->report(message); throw std::runtime_error("Error: No sample left"); } - if (!binary && !more_than_one_pheno) - { - message.append("Only one phenotype value detected"); - if (misc::logically_equal(first_pheno, -9)) - { message.append(" and they are all -9"); } - m_reporter->report(message); - throw std::runtime_error("Not enough valid phenotype"); - } - // finished basic logs - // we now check if the binary encoding is correct - bool error = false; - if (max_pheno_code > 1 && binary) + if (pheno_store.empty()) + { throw std::runtime_error("No phenotype presented"); } + if (m_binary_trait) { - num_case = 0; - num_control = 0; - for (auto&& pheno : pheno_store) + auto [valid, num_case, num_control] = + binary_pheno_is_valid(max_pheno_code, pheno_store); + if (!valid) { - --pheno; - if (pheno < 0) { error = true; } - else - (misc::logically_equal(pheno, 1)) ? ++num_case : ++num_control; + throw std::runtime_error( + "Mixed encoding! Both 0/1 and 1/2 encoding found!"); } - } - if (error) - { - m_reporter->report(message); - throw std::runtime_error( - "Mixed encoding! Both 0/1 and 1/2 encoding found!"); - } - if (pheno_store.size() == 0) - { - m_reporter->report(message); - throw std::runtime_error("No phenotype presented"); - } - // now store the vector into the m_phenotype vector - m_phenotype = Eigen::Map( - pheno_store.data(), static_cast(pheno_store.size())); - if (binary) - { message.append(std::to_string(num_control) + " control(s)\n"); message.append(std::to_string(num_case) + " case(s)\n"); if (num_control == 0) + { + m_reporter->report(message); throw std::runtime_error("There are no control samples"); - if (num_case == 0) throw std::runtime_error("There are no cases"); + } + if (num_case == 0) + { + m_reporter->report(message); + throw std::runtime_error("There are no cases"); + } } else { - message.append(std::to_string(m_phenotype.rows()) - + " sample(s) with valid phenotype\n"); + bool valid = quantitative_pheno_is_valid(pheno_store); + if (!valid) + { + m_reporter->report(message); + std::string error_message = "Only one phenotype value detected"; + if (misc::logically_equal(pheno_store.front(), -9)) + { error_message.append(" and they are all -9"); } + throw std::runtime_error(error_message + + ". Not enough valid phenotype"); + } + else + { + message.append(std::to_string(m_phenotype.rows()) + + " sample(s) with valid phenotype\n"); + } } m_reporter->report(message); } +void PRSice::gen_pheno_vec(Genotype& target, const size_t pheno_index, + const std::string& delim) +{ + // we will first store the phenotype into the double vector and then + // later use this to construct the matrix + if (!m_pheno_info.pheno_file.empty()) // use phenotype file + { + process_phenotype_file(m_pheno_info.pheno_file, + m_pheno_info.pheno_col[pheno_index], delim, + pheno_index, m_pheno_info.ignore_fid, target); + } + else + { + // No phenotype file is provided + // Use information from the fam file directly + process_phenotype_info(m_pheno_info.pheno_col[pheno_index], delim, + m_pheno_info.ignore_fid, target); + } +} bool PRSice::validate_covariate(const std::string& covariate, const size_t num_factors, const size_t idx, diff --git a/test/csrc/prsice_pheno.cpp b/test/csrc/prsice_pheno.cpp index 553f506c..2a742f49 100644 --- a/test/csrc/prsice_pheno.cpp +++ b/test/csrc/prsice_pheno.cpp @@ -1,13 +1,15 @@ #include "catch.hpp" #include "mock_prsice.hpp" #include "prsice.hpp" +#include +#include + TEST_CASE("Simple Phenotype check") { Reporter reporter("log", 60, true); Phenotype pheno; SECTION("Get Phenotype index") { - mock_prsice prsice(&reporter); std::string input = "FID IID SCZ MDD Bipolar SCZ"; std::vector col_sv = misc::tokenize(input); SECTION("With duplicated target phenotype column") @@ -33,7 +35,6 @@ TEST_CASE("Simple Phenotype check") SECTION("Parse phenotype file header") { PThresholding pthres; - auto output = "PRSice"; Permutations perm; CalculatePRS prs_info; Phenotype pheno; @@ -77,16 +78,12 @@ TEST_CASE("Simple Phenotype check") SECTION("Non of the column were found") { pheno.pheno_col = {"ALZ", "ADD", "ANX"}; - mock_prsice prsice(prs_info, pthres, pheno, perm, output, - &reporter); REQUIRE_THROWS(mock_prsice::test_parse_pheno_header( std::move(input), pheno, reporter)); } SECTION("Some column were found") { pheno.pheno_col = {"ALZ", "MDD"}; - mock_prsice prsice(prs_info, pthres, pheno, perm, output, - &reporter); REQUIRE_NOTHROW(mock_prsice::test_parse_pheno_header( std::move(input), pheno, reporter)); REQUIRE_THAT(pheno.pheno_col, @@ -166,3 +163,208 @@ TEST_CASE("Simple Phenotype check") } } } + +TEST_CASE("load_pheno_map") +{ + std::string delim = " "; + auto ignore_fid = GENERATE(true, false); + Reporter reporter("log", 60, true); + mock_prsice prsice; + size_t idx = 2; + SECTION("Without duplicated samples") + { + auto pheno_file = std::make_unique("FID IID Pheno\n" + "ID1 ID1 1\n" + "ID2 ID2 2\n" + "ID3 ID3 3\n" + "ID4 ID4 4\n" + "ID5 ID5 5\n" + "ID6 ID6 6\n"); + + SECTION("Wrong idx size") + { + REQUIRE_THROWS(prsice.test_load_pheno_map(delim, 4, ignore_fid, + std::move(pheno_file))); + } + SECTION("Correct idx") + { + auto res = prsice.test_load_pheno_map(delim, idx, ignore_fid, + std::move(pheno_file)); + std::unordered_map expected; + for (size_t i = 0; i < 6; ++i) + { + std::string id = "ID" + std::to_string(i + 1); + if (!ignore_fid) id = id + delim + id; + expected[id] = std::to_string(i + 1); + } + // we also read in the header as we don't bother differentiating the + // header line from the phenotype line + auto header = "FID" + (ignore_fid ? "" : delim + "IID"); + expected[header] = "Pheno"; + REQUIRE(expected.size() == res.size()); + for (auto&& e : expected) + { + auto check = res.find(e.first); + REQUIRE(check != res.end()); + REQUIRE(check->second == e.second); + } + } + } + SECTION("With duplicated FID") + { + + // should be valid if we don't ignore FID + auto pheno_file = std::make_unique("FID IID Pheno\n" + "ID1 ID1 1\n" + "ID2 ID2 2\n" + "ID3 ID3 3\n" + "ID4 ID4 4\n" + "ID5 ID5 5\n" + "ID3 ID6 6\n"); + if (ignore_fid) + { + REQUIRE_THROWS(prsice.test_load_pheno_map(delim, idx, ignore_fid, + std::move(pheno_file))); + } + else + { + // don't bother to do additional check + REQUIRE_NOTHROW(prsice.test_load_pheno_map(delim, idx, ignore_fid, + std::move(pheno_file))); + } + } +} + +TEST_CASE("Generate pheno vector") +{ + Reporter reporter("log", 60, true); + SECTION("Check phenotype") + { + std::vector pheno_store; + SECTION("Quantitative trait") + { + mock_prsice prsice(false, &reporter); + SECTION("One phenotype only") + { + pheno_store.resize(1000, -9); + REQUIRE_FALSE( + prsice.test_quantitative_pheno_is_valid(pheno_store)); + } + SECTION("Valid") + { + std::random_device rnd_device; + std::mt19937 mersenne_engine {rnd_device()}; + std::uniform_real_distribution dist {-1.0, 1.0}; + + auto gen = [&dist, &mersenne_engine]() { + return dist(mersenne_engine); + }; + pheno_store.resize(1000); + std::generate(std::begin(pheno_store), std::end(pheno_store), + gen); + REQUIRE(prsice.test_quantitative_pheno_is_valid(pheno_store)); + } + } + SECTION("Binary trait") + { + mock_prsice prsice(false, &reporter); + SECTION("Invalid pheno ") + { + // mixing 0 1 2 + std::random_device rnd_device; + std::mt19937 mersenne_engine {rnd_device()}; + std::uniform_int_distribution dist {0, 2}; + + auto gen = [&dist, &mersenne_engine]() { + return dist(mersenne_engine); + }; + pheno_store.resize(1000); + std::generate(std::begin(pheno_store), std::end(pheno_store), + gen); + auto [valid, ncase, ncontrol] = + prsice.test_binary_pheno_is_valid(2, pheno_store); + REQUIRE_FALSE(valid); + // don't need to check case and control + } + SECTION("Valid") + { + // just 1 and 2 or 0 /1 + auto base = GENERATE(0ul, 1ul); + std::random_device rnd_device; + std::mt19937 mersenne_engine {rnd_device()}; + std::uniform_int_distribution dist {0, 1}; + + auto gen = [&dist, &mersenne_engine]() { + return dist(mersenne_engine); + }; + pheno_store.resize(1000); + std::generate(std::begin(pheno_store), std::end(pheno_store), + gen); + std::vector expected = pheno_store; + auto expect_case = + std::count(expected.begin(), expected.end(), 1.0); + auto expect_control = + std::count(expected.begin(), expected.end(), 0.0); + if (base == 1) + { + for (auto&& p : pheno_store) ++p; + } + auto [valid, ncase, ncontrol] = + prsice.test_binary_pheno_is_valid(1 + base, pheno_store); + REQUIRE(valid); + REQUIRE(ncase == Approx(expect_case)); + REQUIRE(ncontrol == Approx(expect_control)); + REQUIRE_THAT(pheno_store, Catch::Equals(expected)); + // don't need to check case and control + } + } + } + SECTION("Parse phenotype") + { + std::vector pheno_store; + int max_pheno = 0; + SECTION("bianry trait") + { + mock_prsice prsice(true, &reporter); + SECTION("Invalid inputs") + { + std::string pheno = GENERATE("3", "String", "Case", "Control"); + REQUIRE_THROWS( + prsice.test_parse_pheno(pheno, pheno_store, max_pheno)); + } + SECTION("Valid inputs") + { + std::string pheno = GENERATE("0", "1", "2"); + REQUIRE_NOTHROW( + prsice.test_parse_pheno(pheno, pheno_store, max_pheno)); + REQUIRE(max_pheno == pheno_store.back()); + } + SECTION("Missing data") + { + std::string pheno = "-9"; + REQUIRE_NOTHROW( + prsice.test_parse_pheno(pheno, pheno_store, max_pheno)); + REQUIRE(pheno_store.empty()); + } + } + SECTION("Quantitative trait") + { + mock_prsice prsice(false, &reporter); + SECTION("Valid inputs") + { + std::string pheno = GENERATE("-1.96", "0", "1", "2"); + REQUIRE_NOTHROW( + prsice.test_parse_pheno(pheno, pheno_store, max_pheno)); + REQUIRE(misc::convert(pheno) == pheno_store.back()); + } + SECTION("Invalid input") + { + std::string pheno = GENERATE("string", "nan"); + REQUIRE_THROWS( + prsice.test_parse_pheno(pheno, pheno_store, max_pheno)); + } + } + } + SECTION("Quantitative trait") {} + SECTION("Binary trait") {} +} diff --git a/test/inc/mock_prsice.hpp b/test/inc/mock_prsice.hpp index 2848cbf3..17678038 100644 --- a/test/inc/mock_prsice.hpp +++ b/test/inc/mock_prsice.hpp @@ -6,11 +6,18 @@ class mock_prsice : public PRSice { public: mock_prsice(Reporter* reporter) { set_reporter(reporter); } + mock_prsice(const bool binary, Reporter* reporter) + : PRSice(CalculatePRS(), PThresholding(), Permutations(), "PRSice", 3, + 3, binary, reporter) + { + } mock_prsice() {}; mock_prsice(const CalculatePRS& prs_info, const PThresholding& p_info, - const Phenotype& pheno, const Permutations& perm, - const std::string& output, Reporter* reporter) - : PRSice(prs_info, p_info, pheno, perm, output, reporter) + const Permutations& perm, const std::string& output, + const size_t max_fid, const size_t max_iid, const bool binary, + Reporter* reporter) + : PRSice(prs_info, p_info, perm, output, max_fid, max_iid, binary, + reporter) { } void set_reporter(Reporter* reporter) { m_reporter = reporter; } @@ -41,6 +48,30 @@ class mock_prsice : public PRSice m_analysis_done = analysis_done; m_total_competitive_perm_done = competitive_done; } + std::unordered_map + test_load_pheno_map(const std::string& delim, const size_t idx, + const bool ignore_fid, + std::unique_ptr pheno_file) + { + return load_pheno_map(delim, idx, ignore_fid, std::move(pheno_file)); + } + void test_parse_pheno(const std::string& pheno, + std::vector& pheno_store, int& max_pheno_code) + { + parse_pheno(pheno, pheno_store, max_pheno_code); + } + bool + test_quantitative_pheno_is_valid(const std::vector& pheno_store) + { + return quantitative_pheno_is_valid(pheno_store); + } + + std::tuple + test_binary_pheno_is_valid(const int max_pheno_code, + std::vector& pheno_store) + { + return binary_pheno_is_valid(max_pheno_code, pheno_store); + } }; #endif // MOCK_PRSICE_HPP