Skip to content

Commit

Permalink
Merge pull request #319 from sambrightman/uniquesampleids
Browse files Browse the repository at this point in the history
Ensure FreeBayes fails on multiple BAMs with conflicting read groups
  • Loading branch information
ekg committed Sep 29, 2016
2 parents a5f71a9 + 53f594b commit d57be62
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 9 deletions.
44 changes: 36 additions & 8 deletions src/AlleleParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ void AlleleParser::openBams(void) {

// report differently if we have one or many bam files
if (parameters.bams.size() == 1) {
DEBUG("Opening BAM fomat alignment input file: " << parameters.bams.front() << " ...");
DEBUG("Opening BAM format alignment input file: " << parameters.bams.front() << " ...");
} else if (parameters.bams.size() > 1) {
DEBUG("Opening " << parameters.bams.size() << " BAM fomat alignment input files");
DEBUG("Opening " << parameters.bams.size() << " BAM format alignment input files");
for (vector<string>::const_iterator b = parameters.bams.begin();
b != parameters.bams.end(); ++b) {
DEBUG2(*b);
Expand Down Expand Up @@ -84,13 +84,21 @@ void AlleleParser::openBams(void) {
}
}


// retrieve header information
bamHeader = bamMultiReader.GetHeaderText();
bamHeaderLines = split(bamHeader, '\n');
if (!parameters.useStdin) {
BamReader reader;
for (vector<string>::const_iterator b = parameters.bams.begin();
b != parameters.bams.end(); ++b) {
reader.Open(*b);
string bamHeader = reader.GetHeaderText();
vector<string> headerLines = split(bamHeader, '\n');
bamHeaderLines.insert(bamHeaderLines.end(), headerLines.begin(), headerLines.end());
reader.Close();
}
} else {
bamHeaderLines = split(bamMultiReader.GetHeaderText(), '\n');
}

DEBUG(" done");

}

void AlleleParser::openOutputFile(void) {
Expand Down Expand Up @@ -141,6 +149,26 @@ void AlleleParser::getSequencingTechnologies(void) {
cerr << "no sequencing technology specified in @RG tag (no PL: in @RG tag) " << endl << headerLine << endl;
}
} else {
map<string, string>::iterator s = readGroupToTechnology.find(readGroupID);
if (s != readGroupToTechnology.end()) {
if (s->second != tech) {
ERROR("multiple technologies (PL) map to the same read group (RG)" << endl
<< endl
<< "technologies " << tech << " and " << s->second << " map to " << readGroupID << endl
<< endl
<< "As freebayes operates on a virtually merged stream of its input files," << endl
<< "it will not be possible to determine what technology an alignment belongs to" << endl
<< "at runtime." << endl
<< endl
<< "To resolve the issue, ensure that RG ids are unique to one technology" << endl
<< "across all the input files to freebayes." << endl
<< endl
<< "See bamaddrg (https://github.com/ekg/bamaddrg) for a method which can" << endl
<< "add RG tags to alignments." << endl);
exit(1);
}
// if it's the same technology and RG combo, no worries
}
readGroupToTechnology[readGroupID] = tech;
technologies[tech] = true;
}
Expand Down Expand Up @@ -267,7 +295,7 @@ void AlleleParser::getSampleNames(void) {
map<string, string>::iterator s = readGroupToSampleNames.find(readGroupID);
if (s != readGroupToSampleNames.end()) {
if (s->second != name) {
ERROR("ERROR: multiple samples (SM) map to the same read group (RG)" << endl
ERROR("multiple samples (SM) map to the same read group (RG)" << endl
<< endl
<< "samples " << name << " and " << s->second << " map to " << readGroupID << endl
<< endl
Expand Down
1 change: 0 additions & 1 deletion src/AlleleParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@ class AlleleParser {
//RefLength; //!< Length of reference sequence
//RefHasAlignments; //!< True if BAM file contains alignments mapped to reference sequence

string bamHeader;
vector<string> bamHeaderLines;

void openBams(void);
Expand Down
88 changes: 88 additions & 0 deletions test/t/02_multi_bam.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env bash

BASH_TAP_ROOT=bash-tap
source ./bash-tap/bash-tap-bootstrap

PATH=../bin:$PATH # for freebayes

plan tests 7

ref=$(basename $0).ref

trap 'rm -f ${ref}* $(basename $0)*.bam*' EXIT

cat >${ref} <<REF
>ref
AATCGGCTA
REF
samtools faidx ${ref}

function run_freebayes() {
freebayes "$@" \
--haplotype-length 0 --min-alternate-count 1 \
--min-alternate-fraction 0 --pooled-continuous --report-monomorphic \
--ploidy 1 \
-f $ref $bam \
2>&1 \
| grep -vE "^#" | cut -f1-5
}

function make_bam() {
local id=$1 && shift
local sm=$1 && shift
local pl=$1 && shift
local suffix=${1:-} && shift
local first_snp=${1:-G} && shift

local bam="$(basename $0).${id}.${sm}.${pl}${suffix}.bam"
samtools view -S -b - >${bam} <<SAM
@HD VN:1.5 SO:coordinate
@SQ SN:ref LN:9
@RG ID:${id} SM:${sm} PL:${pl}
alt 0 ref 1 30 1=1X1=2X1=1X1=1X * 0 0 A${first_snp}TTAGGTT * RG:Z:${id}
SAM
samtools index ${bam}
echo ${bam}
}

expected=$(cat <<END
ref 2 . A G
ref 4 . CG TA
ref 7 . C G
ref 9 . A T
END
)
bam1=$(make_bam "id1" "sample1" "platform1" "a")
bam2=$(make_bam "id1" "sample1" "platform1" "b")
is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls from two identical BAMs"

bam1=$(make_bam "id1" "sample1" "platform1")
bam2=$(make_bam "id2" "sample2" "platform2")
is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls from two BAMs with different samples for different read groups"

bam1=$(make_bam "id1" "sample1" "platform1")
bam2=$(make_bam "id2" "sample1" "platform2")
is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls from two BAMs with different technologies for different read groups"

bam1=$(make_bam "id1" "sample1" "platform1")
bam2=$(make_bam "id1" "sample2" "platform1")
expected='ERROR\(freebayes\): multiple samples \(SM\) map to the same read group \(RG\)'
like "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes rejects two BAMs with different samples for same read groups"

bam1=$(make_bam "id1" "sample1" "platform1")
bam2=$(make_bam "id1" "sample1" "platform2")
expected='ERROR\(freebayes\): multiple technologies \(PL\) map to the same read group \(RG\)'
like "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes rejects two BAMs with different technologies for same read groups"

bam1=$(make_bam "id1" "sample1" "platform1")
bam2=$(make_bam "id2" "sample2" "platform2" "" "C")
expected=$(cat <<END
ref 2 . A C,G
ref 4 . CG TA
ref 7 . C G
ref 9 . A T
END
)
is "$(run_freebayes -f ${ref} ${bam1} ${bam2})" "${expected}" "freebayes calls multiple alts from two different BAMs"

is "$(run_freebayes -f ${ref} ${bam1} ${bam1})" "Error: Duplicate bam file '02_multi_bam.t.id1.sample1.platform1.bam'" "freebayes rejects two BAMs with the same name"

0 comments on commit d57be62

Please sign in to comment.