Skip to content

Commit

Permalink
[Update] Add test for phenotype check
Browse files Browse the repository at this point in the history
  • Loading branch information
choishingwan committed May 15, 2020
1 parent 3e82e90 commit 369894b
Show file tree
Hide file tree
Showing 7 changed files with 469 additions and 229 deletions.
7 changes: 6 additions & 1 deletion inc/genotype.hpp
Expand Up @@ -100,7 +100,10 @@ class Genotype
void load_snps(const std::string& out,
const std::vector<IITree<size_t, size_t>>& exclusion_regions,
bool verbose, Genotype* target = nullptr);

std::tuple<size_t, size_t> get_max_id_length() const
{
return {m_max_fid_length, m_max_iid_length};
}
void calc_freqs_and_intermediate(const QCFiltering& filter_info,
const std::string& prefix, bool verbose,
Genotype* target = nullptr);
Expand Down Expand Up @@ -685,6 +688,8 @@ class Genotype
size_t m_num_female = 0;
size_t m_num_ambig_sex = 0;
size_t m_num_non_founder = 0;
size_t m_max_fid_length = 3;
size_t m_max_iid_length = 3;
uintptr_t m_unfiltered_sample_ct = 0; // number of unfiltered samples
uintptr_t m_unfiltered_marker_ct = 0;
uintptr_t m_sample_ct = 0;
Expand Down
96 changes: 43 additions & 53 deletions inc/prsice.hpp
Expand Up @@ -72,14 +72,17 @@ class PRSice
public:
PRSice() {}
PRSice(const CalculatePRS& prs_info, const PThresholding& p_info,
const Phenotype& pheno, const Permutations& perm,
const std::string& output, Reporter* reporter)
const Permutations& perm, const std::string& output,
const size_t max_fid, const size_t max_iid, const bool binary,
Reporter* reporter)
: m_prefix(output)
, m_max_fid_length(max_fid)
, m_max_iid_length(max_iid)
, m_binary_trait(binary)
, m_reporter(reporter)
, m_prs_info(prs_info)
, m_p_info(p_info)
, m_perm_info(perm)
, m_pheno_info(pheno)
{
}

Expand All @@ -93,10 +96,7 @@ class PRSice
*/
static void pheno_check(const bool no_regress, Phenotype& pheno,
Reporter& reporter);
bool pheno_skip(size_t idx) const
{
return m_pheno_info.skip_pheno.at(idx);
}

// init_matrix whenever phenotype changes
/*!
* \brief init_matrix will initialize the independent and dependent matrix
Expand All @@ -108,15 +108,6 @@ class PRSice
*/
void init_matrix(const size_t pheno_index, const std::string& delim,
Genotype& target);
/*!
* \brief Return the total number of phenotype involved
* \return the total number of phenotype to process
*/
size_t num_phenotype() const { return m_pheno_info.binary.size(); }
std::string pheno_name(const size_t i) const
{
return m_pheno_info.pheno_col.at(i);
}
void new_phenotype(Genotype& target);
bool run_prsice(const size_t pheno_index, const size_t region_index,
const std::vector<std::vector<size_t>>& region_membership,
Expand Down Expand Up @@ -191,9 +182,6 @@ class PRSice
// now calculate the number of competitive permutation we need
// we only use the best threshold for set based permutation
m_total_competitive_process = (num_region - 2) * num_set_perm;
}
void reset_progress()
{
m_analysis_done = 0;
m_total_competitive_perm_done = 0;
}
Expand Down Expand Up @@ -231,14 +219,6 @@ class PRSice
const std::vector<size_t>::const_iterator& bk_start_idx,
const std::vector<size_t>::const_iterator& bk_end_idx,
const size_t pheno_index);
bool valid_pheno(const size_t idx) const
{
return !m_pheno_info.skip_pheno.at(idx);
}
std::vector<double> get_prevalence() const
{
return m_pheno_info.prevalence;
}

/*!
* \brief Function responsible to generate the best score file
Expand Down Expand Up @@ -295,18 +275,22 @@ class PRSice
processed_threshold = 0;
}
};
// struct Pheno_Info
// {
// std::vector<int> col;
// std::vector<std::string> name;
// std::vector<int> order;
// std::vector<bool> binary;
// bool use_pheno = false;
// } m_pheno_info;

// store the number of non-sig, margin sig, and sig pathway & phenotype
static std::mutex lock_guard;

// As R has a default precision of 7, we will go a bit
// higher to ensure we use up all precision
const std::string m_prefix;
const long long m_precision = 9;
// the 7 are:
// 1 for sign
// 1 for dot
// 2 for e- (scientific)
// 3 for exponent (max precision is somewhere around +-e297, so 3 is enough
const long long m_numeric_width = m_precision + 7;
const size_t m_max_fid_length = 3;
const size_t m_max_iid_length = 3;
const bool m_binary_trait = true;
Eigen::MatrixXd m_independent_variables;
// TODO: Use other method for faster best output
Eigen::MatrixXd m_fast_best_output;
Expand Down Expand Up @@ -334,21 +318,10 @@ class PRSice
uint32_t m_num_snp_included = 0;
uint32_t m_analysis_done = 0;

// As R has a default precision of 7, we will go a bit
// higher to ensure we use up all precision
const long long m_precision = 9;
// the 7 are:
// 1 for sign
// 1 for dot
// 2 for e- (scientific)
// 3 for exponent (max precision is somewhere around +-e297, so 3 is enough
const long long m_numeric_width = m_precision + 7;
long long m_max_fid_length = 3;
long long m_max_iid_length = 3;

int m_best_index = -1;
bool m_quick_best = true;
bool m_printed_warning = false;
const std::string m_prefix;
Reporter* m_reporter;
CalculatePRS m_prs_info;
PThresholding m_p_info;
Expand Down Expand Up @@ -384,6 +357,22 @@ class PRSice
*/
void gen_pheno_vec(Genotype& target, const size_t pheno_index,
const std::string& delim);
void process_phenotype_file(const std::string& file_name,
const std::string& pheno_name,
const std::string& delim,
const std::size_t pheno_idx,
const bool ignore_fid, Genotype& target);
void process_phenotype_info(const std::string& pheno_name,
const std::string& delim, const bool ignore_fid,
Genotype& target);
std::tuple<bool, size_t, size_t>
binary_pheno_is_valid(const int max_pheno_code,
std::vector<double>& pheno_store);
bool quantitative_pheno_is_valid(const std::vector<double>& pheno_store);
void print_pheno_log(const std::string& name, const size_t sample_ct,
const size_t num_not_found, const size_t invalid_pheno,
const int max_pheno_code, const bool ignore_fid,
std::vector<double>& pheno_store);
/*!
* \brief Function to generate the m_independent_variable matrix
* \param c_cov_file is the name of the covariate file
Expand Down Expand Up @@ -542,13 +531,14 @@ class PRSice
Pmat,
const Eigen::MatrixXd& R, const bool run_glm);

void parse_pheno(const bool binary, const std::string& pheno,
std::vector<double>& pheno_store, double& first_pheno,
bool& more_than_one_pheno, size_t& num_case,
size_t& num_control, int& max_pheno_code);
void parse_pheno(const std::string& pheno, std::vector<double>& pheno_store,
int& max_pheno_code);

std::unordered_map<std::string, std::string>
load_pheno_map(const size_t idx, const std::string& delim);
load_pheno_map(const std::string& delim, const size_t idx,
const bool ignore_fid,
std::unique_ptr<std::istream> pheno_file);

void reset_result_containers(const Genotype& target,
const size_t region_idx);

Expand Down
5 changes: 5 additions & 0 deletions src/genotype.cpp
Expand Up @@ -575,6 +575,11 @@ void Genotype::gen_sample(const size_t fid_idx, const size_t iid_idx,
const std::string fid = (m_ignore_fid) ? "" : token[fid_idx];
const std::string id =
(m_ignore_fid) ? token[iid_idx] : fid + m_delim + token[iid_idx];
if (m_max_fid_length < token[fid_idx].length())
m_max_fid_length = token[fid_idx].length();
if (m_max_iid_length < token[iid_idx].length())
m_max_iid_length = token[iid_idx].length();

// end immediately if duplicated samples are found
if (sample_in_file.find(id) != sample_in_file.end())
{
Expand Down
14 changes: 7 additions & 7 deletions src/main.cpp
Expand Up @@ -140,16 +140,11 @@ int main(int argc, char* argv[])
// one extra progress bar for competitive permutation
assert(target_file->get_set_thresholds().size() == num_regions);


const auto [max_fid, max_iid] = target_file->get_max_id_length();
const size_t num_pheno = pheno_info.pheno_col_idx.size();
for (size_t i_pheno = 0; i_pheno < num_pheno; ++i_pheno)
{
PRSice prsice(commander.get_prs_instruction(),
commander.get_p_threshold(), pheno_info,
commander.get_perm(), commander.out(), &reporter);
prsice.init_progress_count(target_file->get_set_thresholds());

if (prsice.pheno_skip(i_pheno))
if (pheno_info.skip_pheno[i_pheno])
{
reporter.simple_report("Skipping the "
+ std::to_string(i_pheno + 1)
Expand All @@ -159,6 +154,11 @@ int main(int argc, char* argv[])
reporter.simple_report("Processing the "
+ std::to_string(i_pheno + 1)
+ " th phenotype");
PRSice prsice(commander.get_prs_instruction(),
commander.get_p_threshold(), commander.get_perm(),
commander.out(), max_fid, max_iid,
pheno_info.binary[i_pheno], &reporter);
prsice.init_progress_count(target_file->get_set_thresholds());
prsice.new_phenotype(*target_file);
if (!commander.get_prs_instruction().no_regress)
{
Expand Down

0 comments on commit 369894b

Please sign in to comment.