Skip to content

Commit

Permalink
[Update] New function for factor count
Browse files Browse the repository at this point in the history
  • Loading branch information
choishingwan committed May 17, 2020
1 parent e2a0af5 commit f4203b8
Show file tree
Hide file tree
Showing 7 changed files with 619 additions and 46 deletions.
12 changes: 12 additions & 0 deletions inc/genotype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,18 @@ class Genotype
misc::to_lower(pheno);
return pheno == "na" || pheno == "nan";
}
void update_valid_sample(size_t idx, bool in_regression)
{
m_sample_id.at(idx).in_regression = in_regression;
}
bool sample_selected_for_prs(size_t idx)
{
return IS_SET(m_calculate_prs.data(), idx);
}
bool sample_valid_for_regress(size_t idx)
{
return m_sample_id.at(idx).in_regression;
}
/*!
* \brief Return the fid of the i th sample
* \param i is the sample index
Expand Down
19 changes: 19 additions & 0 deletions inc/prsice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,23 @@ class PRSice
static void pheno_check(const bool no_regress, Phenotype& pheno,
Reporter& reporter);

bool is_valid_covariate(const std::set<size_t>& factor_idx,
const std::vector<size_t>& cov_idx,
std::vector<std::string>& cov_line,
std::vector<size_t>& missing_count);
std::string output_missing(const std::set<size_t>& factor_idx,
const std::vector<std::string>& cov_names,
const std::vector<size_t>& cov_idx,
const std::vector<size_t>& factor_levels,
const std::vector<size_t>& missing_count);
std::vector<std::unordered_map<std::string, size_t>>
cov_check_and_factor_level_count(const std::set<size_t>& factor_idx,
const std::vector<std::string>& cov_names,
const std::vector<size_t>& cov_idx,
const std::string& delim,
const bool ignore_fid,
std::unique_ptr<std::istream>& cov_file,
Genotype& target);
void init_matrix(const std::string& file_name,
const std::string& pheno_name, const std::string& delim,
const size_t file_idx, const bool ignore_fid,
Expand Down Expand Up @@ -402,6 +419,8 @@ class PRSice
const size_t num_factors, const size_t idx,
size_t& factor_level_idx,
std::vector<size_t>& missing_count);
void update_phenotype_matrix(const std::vector<bool>& valid_samples,
const size_t num_valid, Genotype& target);
void update_sample_matrix(
const std::vector<size_t>& missing_count,
std::vector<std::pair<std::string, size_t>>& valid_sample_index);
Expand Down
208 changes: 198 additions & 10 deletions src/prsice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -376,20 +376,21 @@ PRSice::process_phenotype_info(const std::string& delim, const bool ignore_fid,
std::vector<double> pheno_store;
pheno_store.reserve(sample_ct);
size_t invalid_pheno = 0;
size_t sample_index_ct = 0;
int max_pheno_code = 0;
for (size_t i_sample = 0; i_sample < sample_ct; ++i_sample)
{
if (target.pheno_is_na(i_sample) || !target.in_regression(i_sample)
target.update_valid_sample(i_sample, false);
if (target.pheno_is_na(i_sample)
|| !target.sample_selected_for_prs(i_sample)
|| (m_binary_trait && target.pheno(i_sample) == "-9"))
{ continue; }
try
{
parse_pheno(target.pheno(i_sample), pheno_store, max_pheno_code);
auto id = ignore_fid ? target.iid(i_sample)
: target.sample_id(i_sample, delim);
m_sample_with_phenotypes[id] = sample_index_ct;
++sample_index_ct;
m_sample_with_phenotypes[id] = i_sample;
target.update_valid_sample(i_sample, true);
}
catch (const std::runtime_error&)
{
Expand All @@ -414,11 +415,11 @@ PRSice::process_phenotype_file(const std::string& file_name,
int max_pheno_code = 0;
size_t invalid_pheno = 0;
size_t num_not_found = 0;
size_t sample_index_ct = 0;
std::vector<double> pheno_store;
pheno_store.reserve(sample_ct);
for (size_t i_sample = 0; i_sample < sample_ct; ++i_sample)
{
target.update_valid_sample(i_sample, false);
id = (ignore_fid) ? target.iid(i_sample)
: target.sample_id(i_sample, delim);
auto pheno_in_file = phenotype_info.find(id);
Expand All @@ -427,15 +428,15 @@ PRSice::process_phenotype_file(const std::string& file_name,
auto pheno_tmp = pheno_in_file->second;
misc::to_lower(pheno_tmp);
if (pheno_tmp != "na" && phenotype_info[id] != "nan"
&& target.in_regression(i_sample)
&& target.sample_selected_for_prs(i_sample)
&& !(m_binary_trait && target.pheno(i_sample) == "-9"))
{
try
{
parse_pheno(phenotype_info[id], pheno_store,
max_pheno_code);
m_sample_with_phenotypes[id] = sample_index_ct;
++sample_index_ct;
m_sample_with_phenotypes[id] = i_sample;
target.update_valid_sample(i_sample, true);
}
catch (...)
{
Expand Down Expand Up @@ -556,6 +557,9 @@ void PRSice::gen_pheno_vec(const std::string& pheno_file,
size_t invalid_pheno = 0;
int max_pheno_code = 0;
std::vector<double> pheno_store;
// m_sample_with_phenotype will have ID as key and idx on the vector
// SampleID as value because we know the phenobtype are stored w.r.t the
// order of vector SampleID
if (!pheno_file.empty()) // use phenotype file
{
std::tie(pheno_store, num_not_found, invalid_pheno, max_pheno_code) =
Expand Down Expand Up @@ -872,6 +876,184 @@ void PRSice::gen_cov_matrix(const std::string& cov_file,
return;
}
}*/

bool PRSice::is_valid_covariate(const std::set<size_t>& factor_idx,
const std::vector<size_t>& cov_idx,
std::vector<std::string>& cov_line,
std::vector<size_t>& missing_count)
{
bool valid = true;
for (size_t idx = 0; idx < cov_idx.size(); ++idx)
{
auto cur_cov_idx = cov_idx[idx];
auto cur_cov = cov_line[cur_cov_idx];
misc::to_upper(cur_cov);
if (cur_cov == "NAN" || cur_cov == "NA")
{
++missing_count[idx];
valid = false;
}
else if (factor_idx.find(cur_cov_idx) == factor_idx.end())
{
// not factor
try
{
misc::convert<double>(cur_cov);
}
catch (...)
{
++missing_count[idx];
valid = false;
}
}
}
return valid;
}
std::string PRSice::output_missing(const std::set<size_t>& factor_idx,
const std::vector<std::string>& cov_names,
const std::vector<size_t>& cov_idx,
const std::vector<size_t>& factor_levels,
const std::vector<size_t>& missing_count)
{
std::string message =
"Include Covariates:\nName\tMissing\tNumber of levels\n";
size_t cur_name_idx = 0;
size_t i_factor = 0;
for (auto&& cov : cov_idx)
{
if (factor_idx.find(cov) == factor_idx.end())
{
// non-factor
message.append(cov_names[cur_name_idx] + "\t"
+ std::to_string(missing_count[cur_name_idx])
+ "\t-\n");
}
else
{
message.append(cov_names[cur_name_idx] + "\t"
+ std::to_string(missing_count[cur_name_idx]) + "\t"
+ std::to_string(factor_levels[i_factor++]) + "\n");
}
++cur_name_idx;
}
return message;
}
void PRSice::update_phenotype_matrix(const std::vector<bool>& valid_samples,
const size_t num_valid, Genotype& target)
{
Eigen::VectorXd new_pheno = Eigen::VectorXd::Zero(num_valid);
const size_t num_sample = target.num_sample();
size_t new_matrix_idx = 0, pheno_matrix_idx = 0;
for (size_t i = 0; i < num_sample; ++i)
{
if (target.sample_valid_for_regress(i))
{
if (valid_samples[i])
{
new_pheno(new_matrix_idx, 0) = m_phenotype(pheno_matrix_idx, 0);
++new_matrix_idx;
}
else
{
target.update_valid_sample(i, false);
}
++pheno_matrix_idx;
}
}
m_phenotype = new_pheno;
}

std::vector<std::unordered_map<std::string, size_t>>
PRSice::cov_check_and_factor_level_count(
const std::set<size_t>& factor_idx,
const std::vector<std::string>& cov_names,
const std::vector<size_t>& cov_idx, const std::string& delim,
const bool ignore_fid, std::unique_ptr<std::istream>& cov_file,
Genotype& target)
{
const size_t max_idx = cov_idx.back() + 1;
std::vector<size_t> missing_count(cov_idx.size(), 0);
std::vector<size_t> current_factor_level(factor_idx.size(), 0);
std::unordered_set<std::string> duplicated_id;
std::string line, id;
std::vector<std::string> token;
size_t num_duplicated_id = 0;
// indicate if the sample is valid after covariate read
// we need this as the covariate file might not follow order of the
// fam file. Size should be total number of samples in vector not in the
// matrix as we don't know the order
std::vector<bool> valid_samples(target.num_sample(), false);
bool valid;
size_t num_valid = 0;
std::vector<std::unordered_map<std::string, size_t>> factor_levels(
factor_idx.size());
while (std::getline(*cov_file, line))
{
misc::trim(line);
if (line.empty()) continue;
token = misc::split(line);
if (token.size() < max_idx)
{
throw std::runtime_error(
"Error: Malformed covariate file, should have at least "
+ std::to_string(max_idx) + " columns");
}
id = (ignore_fid) ? token[0] : token[0] + delim + token[1];
auto sample_idx = m_sample_with_phenotypes.find(id);
if (sample_idx != m_sample_with_phenotypes.end())
{
valid =
is_valid_covariate(factor_idx, cov_idx, token, missing_count);
if (!valid) continue;
if (duplicated_id.find(id) != duplicated_id.end())
{
++num_duplicated_id;
continue;
}
duplicated_id.insert(id);
// sample is valid
valid_samples[sample_idx->second] = true;
++num_valid;
size_t i_factor = 0;
for (auto&& f : factor_idx)
{
auto&& cur_level = factor_levels[i_factor];
auto&& cur_cov = token[f];
if (cur_level.find(cur_cov) == cur_level.end())
{ cur_level[cur_cov] = current_factor_level[i_factor]++; }
++i_factor;
}
}
}
if (num_duplicated_id != 0)
{
throw std::runtime_error("Error: " + std::to_string(num_duplicated_id)
+ " duplicated IDs in covariate file!\n");
}
else if (num_valid == 0)
{
throw std::runtime_error("Error: All samples removed due to "
"missingness in covariate file!");
}
auto message = output_missing(factor_idx, cov_names, cov_idx,
current_factor_level, missing_count);
double ratio = static_cast<double>(num_valid)
/ static_cast<double>(m_sample_with_phenotypes.size());
if (ratio < 0.95)
{
message.append(
"Warning: More than " + std::to_string((1.0 - ratio) * 100)
+ "% of your samples were removed! "
"You should check if your covariate file is correct\n");
}
m_reporter->report(message);
update_phenotype_matrix(valid_samples, num_valid, target);
// update phenotype matrix here
cov_file->clear();
cov_file->seekg(0, cov_file->beg);
return factor_levels;
}

void PRSice::gen_cov_matrix(const std::string& delim)
{
// The size of the map should be informative of the number of sample
Expand All @@ -883,8 +1065,14 @@ void PRSice::gen_cov_matrix(const std::string& delim)
m_independent_variables = Eigen::MatrixXd::Ones(num_sample, 2);
return;
}
// obtain the index of each covariate
// the key is the variable name and the value is the index on the matrix
// convert factor idx to unordered_set for more elegant way of determining
// if idx is factor
std::set<size_t> is_factor;
for (auto&& f : m_pheno_info.col_index_of_factor_cov)
{ is_factor.insert(f); }

// obtain the index of each covariate the key is the
// variable name and the value is the index on the matrix

// need to account for the situation where the same variable name can
// occur in different covariates
Expand Down

0 comments on commit f4203b8

Please sign in to comment.