Skip to content

Commit

Permalink
[Update] Change pheno check to static
Browse files Browse the repository at this point in the history
this allow us to initialize a new PRSice object for each phenotype
  • Loading branch information
choishingwan committed May 15, 2020
1 parent 154ec4e commit 3e82e90
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 79 deletions.
10 changes: 6 additions & 4 deletions inc/prsice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ class PRSice
* \param is the column name of the desired phenotypes
* \param reporter is the logger
*/
void pheno_check();
static void pheno_check(const bool no_regress, Phenotype& pheno,
Reporter& reporter);
bool pheno_skip(size_t idx) const
{
return m_pheno_info.skip_pheno.at(idx);
Expand Down Expand Up @@ -251,10 +252,11 @@ class PRSice
const size_t pheno_index);

protected:
void parse_pheno_header(std::unique_ptr<std::istream> pheno_file);
std::tuple<size_t, bool>
static void parse_pheno_header(std::unique_ptr<std::istream> pheno_file,
Phenotype& pheno_info, Reporter& reporter);
static std::tuple<size_t, bool>
get_pheno_idx(const std::vector<std::string_view>& column,
const std::string& pheno);
const Phenotype& pheno_info, const std::string& pheno);

struct prsice_result
{
Expand Down
26 changes: 16 additions & 10 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,10 @@ int main(int argc, char* argv[])
}
const auto [region_names, num_regions] =
add_gene_set_info(commander, target_file, reporter);

PRSice prsice(commander.get_prs_instruction(),
commander.get_p_threshold(), commander.get_pheno(),
commander.get_perm(), commander.out(), &reporter);
prsice.pheno_check();
auto prs_instruction = commander.get_prs_instruction();
auto pheno_info = commander.get_pheno();
PRSice::pheno_check(prs_instruction.no_regress, pheno_info,
reporter);

if (!commander.get_clump_info().no_clump)
{
Expand Down Expand Up @@ -140,11 +139,16 @@ int main(int argc, char* argv[])
// one progress bar per phenotype
// one extra progress bar for competitive permutation
assert(target_file->get_set_thresholds().size() == num_regions);
prsice.init_progress_count(target_file->get_set_thresholds());
const size_t num_pheno = prsice.num_phenotype();


const size_t num_pheno = pheno_info.pheno_col_idx.size();
for (size_t i_pheno = 0; i_pheno < num_pheno; ++i_pheno)
{
prsice.reset_progress();
PRSice prsice(commander.get_prs_instruction(),
commander.get_p_threshold(), pheno_info,
commander.get_perm(), commander.out(), &reporter);
prsice.init_progress_count(target_file->get_set_thresholds());

if (prsice.pheno_skip(i_pheno))
{
reporter.simple_report("Skipping the "
Expand Down Expand Up @@ -195,11 +199,13 @@ int main(int argc, char* argv[])
}
}
}
prsice.print_progress(true);
fprintf(stderr, "\n");
// prsice.print_progress(true);
// fprintf(stderr, "\n");
/*
if (!commander.get_prs_instruction().no_regress)
// now generate the summary file
prsice.summarize();
*/
}
catch (const std::invalid_argument& ia)
{
Expand Down
73 changes: 37 additions & 36 deletions src/prsice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@
std::mutex PRSice::lock_guard;
std::tuple<size_t, bool>
PRSice::get_pheno_idx(const std::vector<std::string_view>& column,
const std::string& pheno)
const Phenotype& pheno_info, const std::string& pheno)
{
if (pheno.empty()) return {1 + !m_pheno_info.ignore_fid, false};
if (pheno.empty()) return {1 + !pheno_info.ignore_fid, false};
// there should not be nay duplicated column in the pheno_col
bool duplicated_file_column = false;
size_t col_idx = 0;
// don't start from 0 as we expect those to be the FID / IID
for (size_t i_col = 1 + !m_pheno_info.ignore_fid; i_col < column.size();
for (size_t i_col = 1 + !pheno_info.ignore_fid; i_col < column.size();
++i_col)
{
if (column[i_col] == pheno)
Expand All @@ -45,7 +45,8 @@ PRSice::get_pheno_idx(const std::vector<std::string_view>& column,
}
return {col_idx, duplicated_file_column};
}
void PRSice::parse_pheno_header(std::unique_ptr<std::istream> pheno_file)
void PRSice::parse_pheno_header(std::unique_ptr<std::istream> pheno_file,
Phenotype& pheno_info, Reporter& reporter)
{
std::string line;
std::getline(*pheno_file, line);
Expand All @@ -57,49 +58,47 @@ void PRSice::parse_pheno_header(std::unique_ptr<std::istream> pheno_file)
pheno_file.reset();
misc::trim(line);
std::vector<std::string_view> col = misc::tokenize(line);
if (col.size() < 2ul + !m_pheno_info.ignore_fid)
if (col.size() < 2ul + !pheno_info.ignore_fid)
{
throw std::runtime_error(
"Error: Not enough column in Phenotype file. "
"If the phenotype does not contain the FID, use --ignore-fid");
}
std::string message = "";
std::string sample_id = std::string(col[0]);
if (!m_pheno_info.ignore_fid)
{ sample_id.append("+" + std::string(col[1])); }
message.append("Phenotype file: " + m_pheno_info.pheno_file + "\n");
if (!pheno_info.ignore_fid) { sample_id.append("+" + std::string(col[1])); }
message.append("Phenotype file: " + pheno_info.pheno_file + "\n");
message.append("Column Name of Sample ID: " + sample_id + "\n");
message.append("Note: If the phenotype file does not contain a header, "
"the column name will be displayed as the Sample ID "
"which is expected.\n");
// no user input
if (m_pheno_info.pheno_col.empty())
if (pheno_info.pheno_col.empty())
{
m_pheno_info.pheno_col_idx.push_back(1 + !m_pheno_info.ignore_fid);
m_pheno_info.pheno_col = {
std::string(col[1 + !m_pheno_info.ignore_fid])};
if (isdigit(col[1 + !m_pheno_info.ignore_fid].at(0)))
{ m_pheno_info.pheno_col = {"Phenotype"}; }
m_pheno_info.skip_pheno = {false};
pheno_info.pheno_col_idx.push_back(1 + !pheno_info.ignore_fid);
pheno_info.pheno_col = {std::string(col[1 + !pheno_info.ignore_fid])};
if (isdigit(col[1 + !pheno_info.ignore_fid].at(0)))
{ pheno_info.pheno_col = {"Phenotype"}; }
pheno_info.skip_pheno = {false};
}
else
{
m_pheno_info.skip_pheno.resize(m_pheno_info.pheno_col.size());
pheno_info.skip_pheno.resize(pheno_info.pheno_col.size());
bool has_valid_pheno = false;
for (size_t i_pheno = 0; i_pheno < m_pheno_info.pheno_col.size();
for (size_t i_pheno = 0; i_pheno < pheno_info.pheno_col.size();
++i_pheno)
{
auto [idx, found] =
get_pheno_idx(col, m_pheno_info.pheno_col[i_pheno]);
get_pheno_idx(col, pheno_info, pheno_info.pheno_col[i_pheno]);
if (found) { has_valid_pheno = true; }
else
{
message.append("Warning: Phenotype: "
+ m_pheno_info.pheno_col[i_pheno]
+ pheno_info.pheno_col[i_pheno]
+ " cannot be found in the phenotype file\n");
m_pheno_info.skip_pheno[i_pheno] = true;
pheno_info.skip_pheno[i_pheno] = true;
}
m_pheno_info.pheno_col_idx.push_back(idx);
pheno_info.pheno_col_idx.push_back(idx);
}
if (!has_valid_pheno)
{
Expand All @@ -108,44 +107,46 @@ void PRSice::parse_pheno_header(std::unique_ptr<std::istream> pheno_file)
throw std::runtime_error(message);
}
}
m_reporter->report(message);
reporter.report(message);
}

void PRSice::pheno_check()
void PRSice::pheno_check(const bool no_regress, Phenotype& pheno,
Reporter& reporter)
{
// don't bother to check anything, just add a place holder in binary
if (m_prs_info.no_regress)
if (no_regress)
{
m_pheno_info.binary = {true};
m_pheno_info.pheno_col = {"PlaceHolder"};
m_pheno_info.skip_pheno = {false};
pheno.binary = {true};
pheno.pheno_col = {"PlaceHolder"};
pheno.skip_pheno = {false};
pheno.pheno_col_idx = {~size_t(0)};
return;
}
if (m_pheno_info.binary.empty())
if (pheno.binary.empty())
{ throw std::runtime_error("Error: No phenotype provided"); }

// want to update the binary and prevalence vector by removing
// phenotypes not found / duplicated
m_pheno_info.skip_pheno.resize(m_pheno_info.binary.size(), false);
if (!m_pheno_info.pheno_file.empty())
pheno.skip_pheno.resize(pheno.binary.size(), false);
if (!pheno.pheno_file.empty())
{
auto pheno = misc::load_stream(m_pheno_info.pheno_file);
parse_pheno_header(std::move(pheno));
auto pheno_file = misc::load_stream(pheno.pheno_file);
parse_pheno_header(std::move(pheno_file), pheno, reporter);
}
else
{
m_pheno_info.pheno_col = {"Phenotype"};
pheno.pheno_col = {"Phenotype"};
pheno.pheno_col_idx = {~size_t(0)};
}
assert(m_pheno_info.pheno_col_idx.size() == m_pheno_info.pheno_col.size());
auto message = "There are a total of "
+ std::to_string(m_pheno_info.pheno_col.size())
+ std::to_string(pheno.pheno_col.size())
+ " phenotype to process\n";
m_reporter->report(message);
reporter.report(message);
}

void PRSice::new_phenotype(Genotype& target)
{

// remove all content from phenotype vector and independent matrix so
// that we don't get into trouble also clean out the dictionary which
// indicate which sample contain valid phenotype
Expand Down

0 comments on commit 3e82e90

Please sign in to comment.