Skip to content

Commit

Permalink
[Update] Start working on bgen
Browse files Browse the repository at this point in the history
  • Loading branch information
choishingwan committed Apr 23, 2020
1 parent 49161da commit 325483c
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 80 deletions.
157 changes: 80 additions & 77 deletions inc/binarygen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,12 +439,6 @@ class BinaryGen : public Genotype
bool m_centre = false;
};

/*!
* \brief The PLINK_generator struct. This will be passed into the bgen
* library and used for parsing the BGEN data. This will generate a
* plink binary as an output (stored in the genotype vector passed
* during construction)
*/
struct PLINK_generator
{
PLINK_generator(std::vector<uintptr_t>* sample, uintptr_t* genotype,
Expand All @@ -467,16 +461,11 @@ class BinaryGen : public Genotype
m_homrar_ct = 0;
m_het_ct = 0;
m_missing_ct = 0;
// we also clean the running stat (not RS ID), so that we can go
m_impute2 = 0;
// we also clean the running stat, so that we can go
// through another round of calculation of mean and sd
rs.clear();
statistic.clear();
}
/*!
* \brief set_min_max_ploidy is a function required by BGEN. As we
* only allow diploid, this function should not concern us (as we
* will error out otherwise
*
*/
void set_min_max_ploidy(uint32_t, uint32_t, uint32_t, uint32_t) {}
/*!
* \brief set_sample is the function to be called when BGEN start to
Expand All @@ -489,14 +478,10 @@ class BinaryGen : public Genotype
{
// we set the sample index to i
m_sample_i = i;
// set the genotype binary representation to 2 (missing)
m_geno = 2;
// we also reset the hard_prob to 0
m_hard_prob = 0.0;
// and expected value to 0
m_exp_value = 0.0;
m_missing = false;
std::fill(m_prob.begin(), m_prob.end(), 0.0);
// then we determine if we want to include sample using by
// consulting the flag on m_sample
return IS_SET(m_sample->data(), m_sample_i);
Expand All @@ -505,7 +490,9 @@ class BinaryGen : public Genotype
genfile::OrderType phased,
genfile::ValueType)
{
m_phased = phased;
m_phased =
(phased == genfile::OrderType::ePerPhasedHaplotypePerAllele);
m_prob.resize(3 + m_phased, 0.0);
}

// Called once for each genotype (or haplotype) probability per
Expand All @@ -515,7 +502,7 @@ class BinaryGen : public Genotype
* probability per sample \param value where value is the
* probability of the genotype
*/
void set_value(uint32_t geno, double value)
void set_value(uint32_t idx, double value)
{
// if the current probability is the highest and the value is
// higher than the required threshold, we will assign it
Expand All @@ -538,32 +525,17 @@ class BinaryGen : public Genotype
// }
// TODO: To account for situation where there are more than 3
// genotype (which shouldn't happen to be honest)
if (m_phased == genfile::OrderType::ePerPhasedHaplotypePerAllele
&& geno > 1)
{
switch (geno)
{
case 3: geno = 2; break;
default: geno = 0;
}
}
m_prob[geno] = value;
// when we calculate the expected value, we want to multiply the
// probability with our coding instead of just using byte
// representation
m_exp_value += static_cast<double>(geno) * value;

// when data is phased, BGEN Lib will give us 4 probabbilities:
// prob of allele 1 in hap 1, prob of allele 2 in hap 1 and then
// prob of allele 1 in hap 2 and then prob of allele 2 in hap 2,
// which is slightly different from what was stated in the bgen
// format as the bgen lib api automatically do the 1-prob allele 1
// to give us the prob of allele 2
m_prob[idx] = value;
}
/*!
* \brief set_value will capture the missingness signature and do
* nothing
*
* \param value this is the missing signature used by bgen v1.2+
*/

void set_value(uint32_t, genfile::MissingValue) { m_missing = true; }
/*!
* \brief finalise This function is called when the SNP is
* processed. Do nothing here
*/
void finalise() {}
/*!
* \brief sample_completed is called when each sample is completed.
Expand All @@ -574,39 +546,54 @@ class BinaryGen : public Genotype
// if m_shift is zero, it is when we meet the index for the
// first time, therefore we want to initialize it
if (m_shift == 0) m_genotype[m_index] = 0;
// check if it is missing in addition to original
double prob1 = m_prob[0] * 2 + m_prob[1];
double prob2 = m_prob[2] * 2 + m_prob[1];
double missing_check = m_prob[0] + m_prob[1] + m_prob[2];
if ((std::fabs(prob1 - std::round(prob1))
+ std::fabs(prob2 - std::round(prob2)))
* 0.5
> m_hard_threshold
|| misc::logically_equal(missing_check, 0.0) || m_missing)
// check missing not required for phased data, as phased data only
// supported in 1.2+, which represent missing value differently
if (!m_phased && !m_missing)
{
// set to missing
m_geno = 1;
m_missing = misc::logically_equal(
m_prob[0] + m_prob[1] + m_prob[2], 0.0);
m_geno_prob = m_prob;
}
else
{
// convert haplotype prob into genotype prob
m_geno_prob.resize(3, 0.0);
m_geno_prob[0] = m_prob[0] * m_prob[2];
m_geno_prob[1] = m_prob[0] * m_prob[3] + m_prob[1] * m_prob[2];
m_geno_prob[2] = m_prob[1] * m_prob[3];
}
double impute2_tmp = 0.0;
m_exp_value = 0;
for (size_t i = 0; i < 3; ++i)
{
m_exp_value += m_geno_prob[i] * i;
impute2_tmp += m_geno_prob[i] * i * i;
}
m_impute2 += impute2_tmp - m_exp_value;

const double prob1 = m_geno_prob[0] * 2 + m_geno_prob[1];
const double prob2 = m_geno_prob[2] * 2 + m_geno_prob[1];
uintptr_t obs_genotype = 1;
const double hard_score = (std::fabs(prob1 - std::round(prob1))
+ std::fabs(prob2 - std::round(prob2)))
* 0.5;
if (!m_missing && (hard_score <= m_hard_threshold))
{
m_hard_prob = 0;
for (size_t geno = 0; geno < 3; ++geno)
{
if (m_prob[geno] > m_hard_prob)
if (m_prob[geno] > m_hard_prob
&& m_prob[geno] >= m_dose_threshold)
{
m_geno = (geno == 0) ? geno : geno + 1;
// +1 because geno ==1 represents missing
obs_genotype = (geno == 0) ? geno : geno + 1;
m_hard_prob = m_prob[geno];
}
}
if (m_hard_prob < m_dose_threshold)
{
// set to missing
m_geno = 1;
}
}
// we now add the genotype to the vector
m_genotype[m_index] |= m_geno << m_shift;
switch (m_geno)
m_genotype[m_index] |= obs_genotype << m_shift;
switch (obs_genotype)
{
case 3: ++m_homrar_ct; break;
case 2: ++m_het_ct; break;
Expand All @@ -624,20 +611,27 @@ class BinaryGen : public Genotype
}
// we can now push in the expected value for this sample. This
// can then use for the calculation of the info score
// always calculate for any inclusion
if (IS_SET(m_sample->data(), m_sample_i)) rs.push(m_exp_value);
// only calculate for included samples
if (IS_SET(m_sample->data(), m_sample_i))
statistic.push(m_exp_value);
}
/*!
* \brief info_score is the function use to calculate the info score
* \return return the MaCH INFO score
*/
double info_score() const
{
double p = rs.mean() / 2.0;
double p = statistic.mean() / 2.0;
double p_all = 2.0 * p * (1.0 - p);
return (rs.var() / p_all);
return (statistic.var() / p_all);
}
double expected() const { return rs.mean(); }
double impute_info_score() const
{
double p = statistic.mean() / 2.0;
double p_all = 2.0 * p * (1.0 - p) * statistic.get_n();
return 1 - (m_impute2 / p_all);
}
double expected() const { return statistic.mean(); }
void get_count(size_t& homcom_ct, size_t& het_ct, size_t& homrar_ct,
size_t& missing_ct) const
{
Expand All @@ -646,28 +640,37 @@ class BinaryGen : public Genotype
homrar_ct = m_homrar_ct;
missing_ct = m_missing_ct;
}
void get_count(uint32_t& homcom_ct, uint32_t& het_ct,
uint32_t& homrar_ct, uint32_t& missing_ct) const
{
homcom_ct = m_homcom_ct;
het_ct = m_het_ct;
homrar_ct = m_homrar_ct;
missing_ct = m_missing_ct;
}
virtual ~PLINK_generator() {}

private:
// is the sample inclusion vector, if bit is set, sample is required
// the sample inclusion vector, if bit is set, sample is required
std::vector<uintptr_t>* m_sample;
std::vector<double> m_prob;
std::vector<double> m_geno_prob;
// is the genotype vector
uintptr_t* m_genotype;
misc::RunningStat rs;
misc::RunningStat statistic;
double m_hard_threshold = 0.0;
double m_dose_threshold = 0.0;
double m_hard_prob = 0;
double m_exp_value = 0.0;
uintptr_t m_geno = 1;
double m_impute2 = 0.0;
uint32_t m_shift = 0;
uint32_t m_index = 0;
size_t m_sample_i = 0;
size_t m_homcom_ct = 0;
size_t m_homrar_ct = 0;
size_t m_het_ct = 0;
size_t m_missing_ct = 0;
genfile::OrderType m_phased = genfile::OrderType::ePerUnorderedGenotype;
uint32_t m_homcom_ct = 0;
uint32_t m_homrar_ct = 0;
uint32_t m_het_ct = 0;
uint32_t m_missing_ct = 0;
bool m_phased = false;
bool m_missing = false;
};
};
Expand Down
2 changes: 1 addition & 1 deletion inc/commander.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ class Commander
if (value <= 0)
{
m_error_message.append("Error: Non-zero positive number required. "
+ misc::to_string(target)
+ input
+ " provided, please check if you have "
"provided the correct input\n");
return false;
Expand Down
43 changes: 43 additions & 0 deletions inc/genotype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1216,6 +1216,49 @@ class Genotype
m_existed_snps.end());
m_existed_snps.shrink_to_fit();
}
bool filter_snp(const uint32_t ref_ct, const uint32_t het_ct,
const uint32_t alt_ct, const uint32_t ref_founder_ct,
const uint32_t het_founder_ct,
const uint32_t alt_founder_ct, const double geno,
const double maf, uint32_t& missing_founder_ct)
{
uint32_t total_alleles = ref_ct + het_ct + alt_ct;
double cur_geno =
1.0 - total_alleles / (static_cast<double>(m_sample_ct));
uint32_t missing_ct =
static_cast<uint32_t>(m_sample_ct) - total_alleles;
if (missing_ct == m_sample_ct)
{
++m_num_miss_filter;
return true;
}
if (geno < cur_geno)
{
++m_num_geno_filter;
return true;
}

uint32_t total_founder_alleles =
ref_founder_ct + het_founder_ct + alt_founder_ct;
missing_founder_ct =
static_cast<uint32_t>(m_founder_ct) - total_founder_alleles;
double cur_maf =
(static_cast<double>(2.0 * alt_founder_ct + het_founder_ct))
/ (static_cast<double>(2.0 * total_founder_alleles));
cur_maf = (cur_maf > 0.5) ? 1 - cur_maf : cur_maf;
if (missing_founder_ct == m_founder_ct)
{
++m_num_miss_filter;
return true;
}
if (alt_founder_ct == total_founder_alleles
|| ref_founder_ct == total_founder_alleles || cur_maf < maf)
{
++m_num_maf_filter;
return true;
}
return false;
}
/*!
* \brief Function to load in SNP extraction exclusion list
* \param input the file name of the SNP list
Expand Down
9 changes: 7 additions & 2 deletions inc/snp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,8 +391,13 @@ class SNP
* \return the lower bound of the region
*/
size_t low_bound() const { return m_clump_info.low_bound; }
void set_expected(double expected) { m_expected_value = expected; }
void set_ref_expected(double expected) { m_ref_expected_value = expected; }
void set_expected(double expected, bool is_ref = false)
{
if (is_ref)
m_ref_expected_value = expected;
else
m_expected_value = expected;
}
bool has_expected() const { return m_has_expected; }
bool has_ref_expected() const { return m_has_ref_expected; }
double get_expected(bool use_ref_maf) const
Expand Down

0 comments on commit 325483c

Please sign in to comment.