Skip to content
This repository was archived by the owner on Oct 29, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,12 @@
<artifactId>protobuf-java</artifactId>
<version>3.0.0-alpha-3</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.2</version>
<type>jar</type>
</dependency>
</dependencies>

<profiles>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
/*
* Copyright 2015 Google.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.genomics.dataflow.functions;

import com.google.api.services.genomics.model.Position;
import com.google.cloud.genomics.dataflow.model.ReadCounts;
import com.google.cloud.genomics.dataflow.model.ReadQualityCount;
import com.google.cloud.genomics.dataflow.model.ReadQualityCount.Base;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import org.apache.commons.math3.analysis.UnivariateFunction;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;

/**
* Implementation of the likelihood function in equation (2) in
* G. Jun, M. Flickinger, K. N. Hetrick, Kurt, J. M. Romm, K. F. Doheny,
* G. Abecasis, M. Boehnke,and H. M. Kang, Detecting and Estimating
* Contamination of Human DNA Samples in Sequencing and Array-Based Genotype
* Data, American journal of human genetics doi:10.1016/j.ajhg.2012.09.004
* (volume 91 issue 5 pp.839 - 848)
* http://www.sciencedirect.com/science/article/pii/S0002929712004788
*/
public class LikelihoodFn implements UnivariateFunction {

/** Possible genotypes for a SNP with a single alternate */
enum Genotype {
REF_HOMOZYGOUS, HETEROZYGOUS, NONREF_HOMOZYGOUS
}
/** Possible error statuses for a base in a read */
enum ReadStatus {
CORRECT, ERROR
}

static int toTableIndex(Base observed, Genotype trueGenotype, ReadStatus status) {
return observed.ordinal()
+ Base.values().length * (status.ordinal()
+ ReadStatus.values().length * trueGenotype.ordinal());
}

/*
* P_OBS_GIVEN_TRUTH contains the probability of observing a particular
* base (reference, non-reference, or other) given the true genotype
* and the error status of the read. See Table 1 of Jun et al.
*/
private static final ImmutableList<Double> P_OBS_GIVEN_TRUTH;
static {
final ImmutableList<Double> pTable = ImmutableList.of(
// Observed base
// REF NONREF OTHER
1.0, 0.0, 0.0, // P(base | REF_HOMOZYGOUS, CORRECT)
0.0, 1.0 / 3.0, 2.0 / 3.0, // P(base | REF_HOMOZYGOUS, ERROR)
0.5, 0.5, 0.0, // P(base | HETEROZYGOUS, CORRECT)
1.0 / 6.0, 1.0 / 6.0, 2.0 / 3.0, // P(base | HETEROZYGOUS, ERROR)
0.0, 1.0, 0.0, // P(base | NONREF_HOMOZYGOUS, CORRECT)
1.0 / 3.0, 0.0, 2.0 / 3.0); // P(base | NONREF_HOMOZYGOUS, ERROR)
Iterator<Double> itProb = pTable.iterator();
ArrayList<Double> pCond = new ArrayList<>();
pCond.addAll(Collections.nCopies(
Base.values().length * ReadStatus.values().length * Genotype.values().length, 0.0));
for (
Genotype g : ImmutableList.of(Genotype.REF_HOMOZYGOUS, Genotype.HETEROZYGOUS,
Genotype.NONREF_HOMOZYGOUS)) {
for (ReadStatus r : ImmutableList.of(ReadStatus.CORRECT, ReadStatus.ERROR)) {
for (Base b : ImmutableList.of(Base.REF, Base.NONREF, Base.OTHER)) {
pCond.set(toTableIndex(b, g, r), itProb.next());
}
}
}
P_OBS_GIVEN_TRUTH = ImmutableList.copyOf(pCond);
}

private final Map<Position, ReadCounts> readCounts;

/**
* Create a new LikelihoodFn instance for a given set of read counts.
*
* @param readCounts counts of reads by quality for each position of interest
*/
public LikelihoodFn(Map<Position, ReadCounts> readCounts) {
// copy the map so the counts don't get changed out from under us
this.readCounts = ImmutableMap.copyOf(readCounts);
}

/**
* Compute the probability of a genotype given the reference allele probability.
*/
private static double pGenotype(Genotype g, double refProb) {
switch(g) {
case REF_HOMOZYGOUS:
return refProb * refProb;
case HETEROZYGOUS:
return refProb * (1.0 - refProb);
case NONREF_HOMOZYGOUS:
return (1.0 - refProb) * (1.0 - refProb);
default:
throw new IllegalArgumentException("Illegal genotype");
}
}

/**
* Look up the probability of an observation conditioned on the underlying state.
*/
private static double probObsGivenTruth(Base observed, Genotype trueGenotype,
ReadStatus trueStatus) {
return P_OBS_GIVEN_TRUTH.get(toTableIndex(observed, trueGenotype, trueStatus));
}

/**
* Compute the likelihood of a contaminant fraction alpha.
*
* <p>See equation (2) in Jun et al.
*/
@Override
public double value(double alpha) {
double logLikelihood = 0.0;
for (ReadCounts rc : readCounts.values()) {
double refProb = rc.getRefFreq();

double pPosition = 0.0;
for (Genotype trueGenotype1 : Genotype.values()) {
double pGenotype1 = pGenotype(trueGenotype1, refProb);
for (Genotype trueGenotype2 : Genotype.values()) {
double pGenotype2 = pGenotype(trueGenotype2, refProb);

double pObsGivenGenotype = 1.0;

for (ReadQualityCount rqc : rc.getReadQualityCounts()) {
Base base = rqc.getBase();
double pErr = phredToProb(rqc.getQuality());
double pObs
= ((1.0 - alpha)
* probObsGivenTruth(base, trueGenotype1, ReadStatus.CORRECT)
+ (alpha)
* probObsGivenTruth(base, trueGenotype2, ReadStatus.CORRECT)
) * (1.0 - pErr)
+ ((1.0 - alpha)
* probObsGivenTruth(base, trueGenotype1, ReadStatus.ERROR)
+ (alpha)
* probObsGivenTruth(base, trueGenotype2, ReadStatus.ERROR)
) * pErr;
pObsGivenGenotype *= Math.pow(pObs, rqc.getCount());
}
pPosition += pObsGivenGenotype * pGenotype1 * pGenotype2;
}
}
logLikelihood += Math.log(pPosition);
}
return logLikelihood;
}

/**
* Convert a Phred score to a probability.
*/
private static double phredToProb(int phred) {
return Math.pow(10.0, -(double) phred / 10.0);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Copyright 2015 Google.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.genomics.dataflow.model;

import com.google.api.client.json.GenericJson;
import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;

import java.util.List;

/**
* Contains frequency for a set of alleles for a single position on a single chromosome.
* Used in VerifyBamId.
*/
@DefaultCoder(GenericJsonCoder.class)
public class AlleleFreq extends GenericJson {
// Strings of length 1 of one of the following bases: ['A', 'C', 'T', 'G'].
private String refBases;
// List of length 1 of a String of length 1 of one of the following bases: ['A', 'C', 'T', 'G'].
private List<String> altBases;
// Frequency for a set of alleles for the given position on the given chromosome
// in the range [0,1].
private double refFreq;

public String getRefBases() {
return refBases;
}

public void setRefBases(String refBases) {
this.refBases = refBases;
}

public List<String> getAltBases() {
return altBases;
}

public void setAltBases(List<String> altBases) {
this.altBases = altBases;
}

public double getRefFreq() {
return refFreq;
}

public void setRefFreq(double refFreq) {
this.refFreq = refFreq;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright 2015 Google.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.genomics.dataflow.model;

import com.google.api.client.json.GenericJson;
import com.google.api.client.util.Lists;
import com.google.cloud.dataflow.sdk.coders.DefaultCoder;
import com.google.cloud.genomics.dataflow.coders.GenericJsonCoder;
import com.google.cloud.genomics.dataflow.model.ReadQualityCount.Base;

import java.util.List;

/**
* Counts of reads for a single SNP with a single alternate value for use in the
* VerifyBamId pipeline. For each SNP, we accumulate counts of bases and quality scores
* for associated aligned reads.
*/
@DefaultCoder(GenericJsonCoder.class)
public class ReadCounts extends GenericJson {
/**
* The count for a single base and quality score is stored in a ReadQualityCount object.
*/
private List<ReadQualityCount> readQualityCounts = Lists.newArrayList();
/**
* refFreq contains the population frequency of the reference allele.
*/
private double refFreq;

public List<ReadQualityCount> getReadQualityCounts() {
return readQualityCounts;
}

public void setReadQualityCounts(List<ReadQualityCount> readQualityCounts) {
this.readQualityCounts = readQualityCounts;
}

public void addReadQualityCount(Base base, int quality, long count) {
ReadQualityCount rqc = new ReadQualityCount();
rqc.setBase(base);
rqc.setCount(count);
rqc.setQuality(quality);
this.readQualityCounts.add(rqc);
}

public void addReadQualityCount(ReadQualityCount rqc) {
this.readQualityCounts.add(rqc);
}

public double getRefFreq() {
return refFreq;
}

public void setRefFreq(double refFreq) {
this.refFreq = refFreq;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright 2015 Google.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.genomics.dataflow.model;

/**
* This class is used to count the number of reads aligned to a SNP that show the reference base,
* the non-reference base, some other base, or an unknown base. Within each category, we count the
* number with each quality score.
*
* For example, we might have 2 reads that show the reference base with quality 10, 5 reads that
* show the non-reference base with quality 60, and 1 read that shows a different nucleotide with
* quality 0.
*/
public class ReadQualityCount {

private Base base;
private int quality;
private long count;

/**
* Which type of Base this ReadQualityCount represents.
*/
public enum Base {
UNKNOWN, REF, NONREF, OTHER
};

public Base getBase() {
return base;
}

public void setBase(Base base) {
this.base = base;
}

public int getQuality() {
return quality;
}

public void setQuality(int quality) {
this.quality = quality;
}

public long getCount() {
return count;
}

public void setCount(long count) {
this.count = count;
}
}
Loading