src/AlleleParser.cpp

#include "AlleleParser.h"
#include "multichoose.h" // includes generic functions, so it must be included here
                         // otherwise we will get a linker error
                         // see: http://stackoverflow.com/questions/36039/templates-spread-across-multiple-files
                         // http://www.cplusplus.com/doc/tutorial/templates/ "Templates and Multi-file projects"
#include "multipermute.h"

// Local helper logging macros to improve code readability.
//
// DEBUG and DEBUG2 read a variable named `parameters` at the point of
// expansion, so they can only be used where such a name is in scope.  `msg`
// is pasted into a stream expression and may itself be a chain of `<<`
// operands.
//
// NOTE(review): DEBUG/DEBUG2 expand to a bare `if` statement and ERROR/WARNING
// already end in a semicolon, so none of them behaves like a single
// statement; keep them inside braces when they appear under an if/else.
#define DEBUG(msg) \
    if (parameters.debug) { cerr << msg << endl; }

// lower-priority messages; expands to nothing unless VERBOSE_DEBUG is defined
#ifdef VERBOSE_DEBUG
#define DEBUG2(msg) \
    if (parameters.debug2) { cerr << msg << endl; }
#else
#define DEBUG2(msg)
#endif

// must-see error messages, written unconditionally to stderr with an
// "ERROR(freebayes): " prefix
#define ERROR(msg) \
    cerr << "ERROR(freebayes): " << msg << endl;

// must-see warning messages, written unconditionally to stderr with a
// "WARNING(freebayes): " prefix
#define WARNING(msg) \
    cerr << "WARNING(freebayes): " << msg << endl;

using namespace std;


// Open the BAM alignment input named in parameters.bams.
//
// On success the header text of the opened input is stored in bamHeader and
// split on newlines into bamHeaderLines.  Failure to open the input is fatal:
// an error is printed to stderr and the process exits with status 1.
//
// When not reading from stdin, the BAM index files are located as well.  A
// missing index is only fatal when targets have been specified; otherwise it
// is reported and processing continues.
void AlleleParser::openBams(void) {

    // report differently if we have one or many bam files
    if (parameters.bams.size() == 1) {
        DEBUG("Opening BAM format alignment input file: " << parameters.bams.front() << " ...");
    } else if (parameters.bams.size() > 1) {
        DEBUG("Opening " << parameters.bams.size() << " BAM format alignment input files");
        for (vector<string>::const_iterator b = parameters.bams.begin(); 
                b != parameters.bams.end(); ++b) {
            DEBUG2(*b);
        }
    }
    
    if (parameters.useStdin) {
        // stdin input: no index lookup and no explicit merge order is requested
        if (!bamMultiReader.Open(parameters.bams)) {
            ERROR("Could not read BAM data from stdin");
            cerr << bamMultiReader.GetErrorString() << endl;
            exit(1);
        }
    } else {
        if (!bamMultiReader.Open(parameters.bams)) {
            ERROR("Could not open input BAM files");
            cerr << bamMultiReader.GetErrorString() << endl;
            exit(1);
        } else {
            if (!bamMultiReader.LocateIndexes()) {
                ERROR("Opened BAM reader without index file, jumping is disabled.");
                cerr << bamMultiReader.GetErrorString() << endl;
                // without an index we can still stream through the input, but
                // we cannot honor a request for specific targets
                if (!targets.empty()) {
                    ERROR("Targets specified but no BAM index file provided.");
                    ERROR("FreeBayes cannot jump through targets in BAM files without BAM index files, exiting.");
                    ERROR("Please generate a BAM index file, e.g.:");
                    ERROR("    % bamtools index -in <bam_file>");
                    ERROR("    % samtools index <bam_file>");
                    exit(1);
                }
            }
        }
        if (!bamMultiReader.SetExplicitMergeOrder(bamMultiReader.MergeByCoordinate)) {
            ERROR("could not set sort order to coordinate");
            cerr << bamMultiReader.GetErrorString() << endl;
            exit(1);
        }
    }


    // retrieve header information
    bamHeader = bamMultiReader.GetHeaderText();
    bamHeaderLines = split(bamHeader, '\n');

    DEBUG(" done");

}

// Open the trace output file named by parameters.traceFile, but only when
// tracing has been requested via parameters.trace.  Exits with status 1 if
// the file cannot be opened.
void AlleleParser::openTraceFile(void) {

    // tracing disabled: nothing to open
    if (!parameters.trace) {
        return;
    }

    traceFile.open(parameters.traceFile.c_str(), ios::out);
    DEBUG("Opening trace file: " << parameters.traceFile << " ...");

    if (!traceFile) {
        ERROR(" unable to open trace file: " << parameters.traceFile );
        exit(1);
    }
}

// Open the failed-alleles output file when parameters.failedFile names one.
// Exits with status 1 if the file cannot be opened.
void AlleleParser::openFailedFile(void) {

    // no file name given: nothing to open
    if (parameters.failedFile.empty()) {
        return;
    }

    failedFile.open(parameters.failedFile.c_str(), ios::out);
    DEBUG("Opening failed alleles file: " << parameters.failedFile << " ...");

    if (!failedFile) {
        ERROR(" unable to open failed alleles file: " << parameters.failedFile );
        exit(1);
    }
}

// Select the destination for output: the file named by parameters.outputFile
// when one is given, standard output otherwise.  `output` is pointed at the
// chosen stream.  Exits with status 1 if the named file cannot be opened.
void AlleleParser::openOutputFile(void) {

    // no output file requested: write to stdout
    if (parameters.outputFile.empty()) {
        output = &cout;
        return;
    }

    outputFile.open(parameters.outputFile.c_str(), ios::out);
    DEBUG("Opening output file: " << parameters.outputFile << " ...");

    if (!outputFile) {
        ERROR(" unable to open output file: " << parameters.outputFile);
        exit(1);
    }

    output = &outputFile;
}

void AlleleParser::getSequencingTechnologies(void) {

    map<string, bool> technologies;

    for (vector<string>::const_iterator it = bamHeaderLines.begin(); it != bamHeaderLines.end(); ++it) {

        // get next line from header, skip if empty
        string headerLine = *it;
        if ( headerLine.empty() ) { continue; }

        // lines of the header look like:
        // "@RG     ID:-    SM:NA11832      CN:BCM  PL:454"
        //                     ^^^^^^^\ is our sample name
        if ( headerLine.find("@RG") == 0 ) {
            vector<string> readGroupParts = split(headerLine, "\t ");
            string tech;
            string readGroupID;
            for (vector<string>::const_iterator r = readGroupParts.begin(); r != readGroupParts.end(); ++r) {
                size_t colpos = r->find(":");
                if (colpos != string::npos) {
                    string fieldname = r->substr(0, colpos);
                    if (fieldname == "PL") {
                        tech = r->substr(colpos+1);
                    } else if (fieldname == "ID") {
                        readGroupID = r->substr(colpos+1);
                    }
                }
            }

            if (tech.empty()) {
                if (!sequencingTechnologies.empty()) {
                    cerr << "no sequencing technology specified in @RG tag (no PL: in @RG tag) " << endl << headerLine << endl;
                }
            } else {
                readGroupToTechnology[readGroupID] = tech;
                technologies[tech] = true;
            }

            if (readGroupID.empty()) {
                cerr << "could not find ID: in @RG tag " << endl << headerLine << endl;
                continue;
            }
            //string name = nameParts.back();
            //mergedHeader.append(1, '\n');
            //cerr << "found read group id " << readGroupID << " containing sample " << name << endl;
        }
    }

    for (map<string, bool>::iterator st = technologies.begin(); st != technologies.end(); ++st) {
        sequencingTechnologies.push_back(st->first);
    }

}

void AlleleParser::getPopulations(void) {

    map<string, string> allSamplePopulation;

    if (!parameters.populationsFile.empty()) {
        ifstream populationsFile(parameters.populationsFile.c_str(), ios::in);
        if (!populationsFile) {
            cerr << "unable to open population file: " << parameters.populationsFile << endl;
            exit(1);
        }
        string line;
        while (getline(populationsFile, line)) {
            DEBUG2("found sample-population mapping: " << line);
            vector<string> popsample = split(line, "\t ");
            if (popsample.size() == 2) {
                string& sample = popsample.front();
                string& population = popsample.back();
                DEBUG2("sample: " << sample << " population: " << population);
                allSamplePopulation[sample] = population;
            } else {
                cerr << "malformed population/sample pair, " << line << endl;
                exit(1);
            }
        }
    }

    // XXX
    // TODO now, assign a default population to all the rest of the samples...
    // XXX
    for (vector<string>::iterator s = sampleList.begin(); s != sampleList.end(); ++s) {
        if (!allSamplePopulation.count(*s)) {
            samplePopulation[*s] = "DEFAULT";
        } else {
            samplePopulation[*s] = allSamplePopulation[*s];
        }
    }

    // now, only keep the samples we are using for processing


    for (map<string, string>::iterator s = samplePopulation.begin(); s != samplePopulation.end(); ++s) {
        populationSamples[s->second].push_back(s->first);
    }

}

// Determine the sample names to analyze and the read group -> sample mapping.
//
// Sample names are read from the file named by parameters.samples when one is
// given (one name per line, blank lines ignored); otherwise they are taken
// from the SM: fields of the @RG lines in the BAM header.
//
// Every @RG line must carry both an SM: and an ID: field, and a read group ID
// may only map to a single sample; violations are fatal (exit status 1).
// When a sample file is given, every sample listed in it must also appear in
// the BAM header, otherwise the process exits with status 1.
//
// If no samples are found at all, a single sample named "unknown" is used and
// oneSampleAnalysis is set.
void AlleleParser::getSampleNames(void) {

    // If a sample file is given, use it.  But otherwise process the bam file
    // header to get the sample names.

    if (!parameters.samples.empty()) {
        ifstream sampleFile(parameters.samples.c_str(), ios::in);
        if (! sampleFile) {
            cerr << "unable to open file: " << parameters.samples << endl;
            exit(1);
        }
        string line;
        while (getline(sampleFile, line)) {
            // a blank line cannot name a sample (an empty SM: is rejected
            // below), so skip it rather than failing the header check later
            // with an error about a sample with no name
            if (line.empty()) { continue; }
            DEBUG2("found sample " << line);
            sampleList.push_back(line);
        }
    }

    for (vector<string>::const_iterator it = bamHeaderLines.begin(); it != bamHeaderLines.end(); ++it) {

        // get next line from header, skip if empty
        string headerLine = *it;
        if ( headerLine.empty() ) { continue; }

        // lines of the header look like:
        // "@RG     ID:-    SM:NA11832      CN:BCM  PL:454"
        //                     ^^^^^^^\ is our sample name
        if ( headerLine.find("@RG") == 0 ) {
            vector<string> readGroupParts = split(headerLine, "\t ");
            string name = "";
            string readGroupID = "";
            for (vector<string>::const_iterator r = readGroupParts.begin(); r != readGroupParts.end(); ++r) {
                size_t colpos = r->find(":");
                if (colpos != string::npos) {
                    string fieldname = r->substr(0, colpos);
                    if (fieldname == "SM") {
                        name = r->substr(colpos+1);
                    } else if (fieldname == "ID") {
                        readGroupID = r->substr(colpos+1);
                    }
                }
            }
            if (name == "") {
                ERROR(" could not find SM: in @RG tag " << endl << headerLine);
                exit(1);
            }
            if (readGroupID == "") {
                ERROR(" could not find ID: in @RG tag " << endl << headerLine);
                exit(1);
            }
            DEBUG2("found read group id " << readGroupID << " containing sample " << name);
            sampleListFromBam.push_back(name);

            // a read group seen before must map to the same sample as before
            map<string, string>::iterator s = readGroupToSampleNames.find(readGroupID);
            if (s != readGroupToSampleNames.end()) {
                if (s->second != name) {
                    // the ERROR macro already supplies the "ERROR(freebayes): " prefix
                    ERROR("multiple samples (SM) map to the same read group (RG)" << endl
                       << endl
                       << "samples " << name << " and " << s->second << " map to " << readGroupID << endl
                       << endl
                       << "As freebayes operates on a virtually merged stream of its input files," << endl 
                       << "it will not be possible to determine what sample an alignment belongs to" << endl
                       << "at runtime." << endl
                       << endl
                       << "To resolve the issue, ensure that RG ids are unique to one sample" << endl
                       << "across all the input files to freebayes." << endl
                       << endl
                       << "See bamaddrg (https://github.com/ekg/bamaddrg) for a method which can" << endl
                       << "add RG tags to alignments." << endl);
                    exit(1);
                }
                // if it's the same sample name and RG combo, no worries
            }
            readGroupToSampleNames[readGroupID] = name;
        }
    }

    // no samples file given, read from BAM file header for sample names
    if (sampleList.empty()) {
        DEBUG("no sample list file given, reading sample names from bam file");
        for (vector<string>::const_iterator s = sampleListFromBam.begin(); s != sampleListFromBam.end(); ++s) {
            DEBUG2("found sample " << *s);
            // several read groups may share a sample; list each sample once
            if (!stringInVector(*s, sampleList)) {
                sampleList.push_back(*s);
            }
        }
        DEBUG("found " << sampleList.size() << " samples in BAM file");
    } else {
        // verify that the samples in the sample list are present in the bam,
        // and raise an error and exit if not
        for (vector<string>::const_iterator s = sampleList.begin(); s != sampleList.end(); ++s) {
            bool inBam = false;
            bool inReadGroup = false;
            for (vector<string>::const_iterator b = sampleListFromBam.begin(); b != sampleListFromBam.end(); ++b) {
                if (*s == *b) { inBam = true; break; }
            }
            for (map<string, string>::const_iterator p = readGroupToSampleNames.begin(); p != readGroupToSampleNames.end(); ++p) {
                if (*s == p->second) { inReadGroup = true; break; }
            }
            if (!inBam) {
                ERROR("sample " << *s << " listed in sample file "
                    << parameters.samples.c_str() << " is not listed in the header of BAM file(s) "
                    << parameters.bam);
                exit(1);
            }
            if (!inReadGroup) {
                ERROR("sample " << *s << " listed in sample file "
                    << parameters.samples.c_str() << " is not associated with any read group in the header of BAM file(s) "
                    << parameters.bam);
                exit(1);
            }
        }
    }

    // Neither the sample file nor the BAM header yielded a sample name: fall
    // back to a single sample called "unknown".
    if (sampleList.empty()) {
        sampleList.push_back("unknown");
        readGroupToSampleNames["unknown"] = "unknown";
        oneSampleAnalysis = true;
    }

}

// Build and return the complete VCF (v4.1) header text.
//
// The header consists of, in order: the file-level meta lines (format, date,
// source version, reference file name, phasing, command line), the fixed INFO
// field definitions, one "technology.<name>" INFO definition per entry in
// sequencingTechnologies, the REPEAT INFO definition when
// parameters.showReferenceRepeats is set, the FORMAT field definitions, and
// finally the #CHROM column line followed by the tab-joined sampleList.
//
// The commented-out definitions are retained as a record of fields that are
// not currently declared in the header.
string AlleleParser::vcfHeader() {

    stringstream headerss;
    headerss
        << "##fileformat=VCFv4.1" << endl
        << "##fileDate=" << dateStr() << endl
        << "##source=freeBayes " << VERSION_GIT << endl
        << "##reference=" << reference.filename << endl
        << "##phasing=none" << endl
        << "##commandline=\"" << parameters.commandline << "\"" << endl
        << "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">" << endl
        << "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth at the locus\">" << endl
        << "##INFO=<ID=DPB,Number=1,Type=Float,Description=\"Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype\">" << endl

        // allele frequency metrics
        << "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">" << endl
        << "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">" << endl
        << "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1]\">" << endl

        // observation counts
        // RO/AO are declared as Integer, so they must not be described as
        // fractional; the fractional counts are the Float fields PRO/PAO
        << "##INFO=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count\">" << endl
        << "##INFO=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">" << endl
        << "##INFO=<ID=PRO,Number=1,Type=Float,Description=\"Reference allele observation count, with partial observations recorded fractionally\">" << endl
        << "##INFO=<ID=PAO,Number=A,Type=Float,Description=\"Alternate allele observations, with partial observations recorded fractionally\">" << endl

        // qualities
        << "##INFO=<ID=QR,Number=1,Type=Integer,Description=\"Reference allele quality sum in phred\">" << endl
        << "##INFO=<ID=QA,Number=A,Type=Integer,Description=\"Alternate allele quality sum in phred\">" << endl
        << "##INFO=<ID=PQR,Number=1,Type=Float,Description=\"Reference allele quality sum in phred for partial observations\">" << endl
        << "##INFO=<ID=PQA,Number=A,Type=Float,Description=\"Alternate allele quality sum in phred for partial observations\">" << endl


        // binomial balance metrics
        << "##INFO=<ID=SRF,Number=1,Type=Integer,Description=\"Number of reference observations on the forward strand\">" << endl
        << "##INFO=<ID=SRR,Number=1,Type=Integer,Description=\"Number of reference observations on the reverse strand\">" << endl
        << "##INFO=<ID=SAF,Number=A,Type=Integer,Description=\"Number of alternate observations on the forward strand\">" << endl
        << "##INFO=<ID=SAR,Number=A,Type=Integer,Description=\"Number of alternate observations on the reverse strand\">" << endl
        //<< "##INFO=<ID=SRB,Number=1,Type=Float,Description=\"Strand bias for the reference allele: SRF / ( SRF + SRR )\">" << endl
        //<< "##INFO=<ID=SAB,Number=1,Type=Float,Description=\"Strand bias for the alternate allele: SAF / ( SAF + SAR )\">" << endl
        << "##INFO=<ID=SRP,Number=1,Type=Float,Description=\"Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        << "##INFO=<ID=SAP,Number=A,Type=Float,Description=\"Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        //<< "##INFO=<ID=ABR,Number=1,Type=Integer,Description=\"Reference allele balance count: the number of sequence reads from apparent heterozygotes supporting the reference allele\">" << endl
        //<< "##INFO=<ID=ABA,Number=1,Type=Integer,Description=\"Alternate allele balance count: the number of sequence reads from apparent heterozygotes supporting the alternate allele\">" << endl
        << "##INFO=<ID=AB,Number=A,Type=Float,Description=\"Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous\">" << endl
        << "##INFO=<ID=ABP,Number=A,Type=Float,Description=\"Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        << "##INFO=<ID=RUN,Number=A,Type=Integer,Description=\"Run length: the number of consecutive repeats of the alternate allele in the reference genome\">" << endl
        //<< "##INFO=<ID=RL,Number=1,Type=Integer,Description=\"Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele\">" << endl
        //<< "##INFO=<ID=RR,Number=1,Type=Integer,Description=\"Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele\">" << endl
        << "##INFO=<ID=RPP,Number=A,Type=Float,Description=\"Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        << "##INFO=<ID=RPPR,Number=1,Type=Float,Description=\"Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        << "##INFO=<ID=RPL,Number=A,Type=Float,Description=\"Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele\">" << endl
        //<< "##INFO=<ID=RPLR,Number=A,Type=Float,Description=\"Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele\">" << endl
        << "##INFO=<ID=RPR,Number=A,Type=Float,Description=\"Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele\">" << endl
        //<< "##INFO=<ID=RPRR,Number=A,Type=Float,Description=\"Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele\">" << endl
        //<< "##INFO=<ID=EL,Number=1,Type=Integer,Description=\"Allele End Left: number of observations of the alternate where the alternate occurs in the left end of the read\">" << endl
        //<< "##INFO=<ID=ER,Number=1,Type=Integer,Description=\"Allele End Right: number of observations of the alternate where the alternate occurs in the right end of the read\">" << endl
        << "##INFO=<ID=EPP,Number=A,Type=Float,Description=\"End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        << "##INFO=<ID=EPPR,Number=1,Type=Float,Description=\"End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        //<< "##INFO=<ID=BL,Number=1,Type=Integer,Description=\"Base Pairs Left: number of base pairs in reads supporting the alternate to the left (5') of the alternate allele\">" << endl
        //<< "##INFO=<ID=BR,Number=1,Type=Integer,Description=\"Base Pairs Right: number of base pairs in reads supporting the alternate to the right (3') of the alternate allele\">" << endl
        //<< "##INFO=<ID=LRB,Number=1,Type=Float,Description=\"((max(BR, BL) / (BR + BL)) - 0.5) * 2 : The proportion of base pairs in reads on one side of the alternate allele relative to total bases, scaled from [0.5,1] to [0,1]\">" << endl
        //<< "##INFO=<ID=LRBP,Number=1,Type=Float,Description=\"Left-Right Balance Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between BL and BR given E(BR/BL) ~ 0.5, derived using Hoeffding's inequality\">" << endl
        << "##INFO=<ID=DPRA,Number=A,Type=Float,Description=\"Alternate allele depth ratio.  Ratio between depth in samples with each called alternate allele and those without.\">" << endl

        // error rates
        /*
        << "##INFO=<ID=XRM,Number=1,Type=Float,Description=\"Reference allele read mismatch rate: The rate of SNPs + MNPs + INDELs in reads supporting the reference allele.\">" << endl
        << "##INFO=<ID=XRS,Number=1,Type=Float,Description=\"Reference allele read SNP rate: The rate of per-base mismatches (SNPs + MNPs) in reads supporting the reference allele.\">" << endl
        << "##INFO=<ID=XRI,Number=1,Type=Float,Description=\"Reference allele read INDEL rate: The rate of INDELs (gaps) in reads supporting the reference allele.\">" << endl
        << "##INFO=<ID=XAM,Number=A,Type=Float,Description=\"Alternate allele read mismatch rate: The rate of SNPs + MNPs + INDELs in reads supporting the alternate allele, excluding the called variant.\">" << endl
        << "##INFO=<ID=XAS,Number=A,Type=Float,Description=\"Alternate allele read SNP rate: The rate of per-base mismatches (SNPs + MNPs) in reads supporting the alternate allele, excluding the called variant.\">" << endl
        << "##INFO=<ID=XAI,Number=A,Type=Float,Description=\"Alternate allele read INDEL rate: The rate of INDELs (gaps) in reads supporting the alternate allele, excluding the called variant.\">" << endl
        */
        // error rate ratios
        //<< "##INFO=<ID=ARM,Number=A,Type=Float,Description=\"Alternate allele / reference allele read mismatch ratio: The rate of SNPs + MNPs + INDELs in reads supporting the alternate allele versus reads supporting the reference allele, excluding the called variant.\">" << endl
        //<< "##INFO=<ID=ARS,Number=A,Type=Float,Description=\"Alternate allele / reference allele read SNP ratio: The rate of per-base mismatches (SNPs + MNPs) in reads supporting the alternate allele versus reads supporting the reference allele, excluding the called variant.\">" << endl
        //<< "##INFO=<ID=ARI,Number=A,Type=Float,Description=\"Alternate allele / reference allele read INDEL ratio: The ratio in rate rate of INDELs (gaps) in reads supporting the alternate allele versus reads supporting the reference allele, excluding the called variant.\">" << endl

        // supplementary information about the site
        << "##INFO=<ID=ODDS,Number=1,Type=Float,Description=\"The log odds ratio of the best genotype combination to the second-best.\">" << endl
        << "##INFO=<ID=GTI,Number=1,Type=Integer,Description=\"Number of genotyping iterations required to reach convergence or bailout.\">" << endl
        //<< "##INFO=<ID=TS,Number=0,Type=Flag,Description=\"site has transition SNP\">" << endl
        //<< "##INFO=<ID=TV,Number=0,Type=Flag,Description=\"site has transversion SNP\">" << endl
        //<< "##INFO=<ID=CpG,Number=0,Type=Flag,Description=\"CpG site (either CpG, TpG or CpA)\">" << endl
        << "##INFO=<ID=TYPE,Number=A,Type=String,Description=\"The type of allele, either snp, mnp, ins, del, or complex.\">" << endl
        << "##INFO=<ID=CIGAR,Number=A,Type=String,Description=\"The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing.  Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.\">" << endl
        //<< "##INFO=<ID=SNP,Number=0,Type=Flag,Description=\"SNP allele at site\">" << endl
        //<< "##INFO=<ID=MNP,Number=0,Type=Flag,Description=\"MNP allele at site\">" << endl
        //<< "##INFO=<ID=INS,Number=0,Type=Flag,Description=\"insertion allele at site\">" << endl
        //<< "##INFO=<ID=DEL,Number=0,Type=Flag,Description=\"deletion allele at site\">" << endl
        //<< "##INFO=<ID=COMPLEX,Number=0,Type=Flag,Description=\"complex allele (insertion/deletion/substitution composite) at site\">" << endl
        << "##INFO=<ID=NUMALT,Number=1,Type=Integer,Description=\"Number of unique non-reference alleles in called genotypes at this position.\">" << endl
        << "##INFO=<ID=MEANALT,Number=A,Type=Float,Description=\"Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.\">" << endl
        //<< "##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Phred-scaled discrete HWE prior probability of the genotyping across all samples.\">" << endl
        << "##INFO=<ID=LEN,Number=A,Type=Integer,Description=\"allele length\">" << endl
        << "##INFO=<ID=MQM,Number=A,Type=Float,Description=\"Mean mapping quality of observed alternate alleles\">" << endl
        << "##INFO=<ID=MQMR,Number=1,Type=Float,Description=\"Mean mapping quality of observed reference alleles\">" << endl
        << "##INFO=<ID=PAIRED,Number=A,Type=Float,Description=\"Proportion of observed alternate alleles which are supported by properly paired read fragments\">" << endl
        << "##INFO=<ID=PAIREDR,Number=1,Type=Float,Description=\"Proportion of observed reference alleles which are supported by properly paired read fragments\">" << endl;

    // sequencing technology tags, which vary according to input data
    for (vector<string>::iterator st = sequencingTechnologies.begin(); st != sequencingTechnologies.end(); ++st) {
        string& tech = *st;
        headerss << "##INFO=<ID=technology." << tech << ",Number=A,Type=Float,Description=\"Fraction of observations supporting the alternate observed in reads from " << tech << "\">" << endl;
    }

    if (parameters.showReferenceRepeats) {
        headerss << "##INFO=<ID=REPEAT,Number=1,Type=String,Description=\"Description of the local repeat structures flanking the current position\">" << endl;
    }

    // format fields for genotypes
    headerss << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl
        << "##FORMAT=<ID=GQ,Number=1,Type=Float,Description=\"Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype\">" << endl
        // this can be regenerated with RA, AA, QR, QA
        << "##FORMAT=<ID=GL,Number=G,Type=Float,Description=\"Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy\">" << endl
        //<< "##FORMAT=<ID=GLE,Number=1,Type=String,Description=\"Genotype Likelihood Explicit, same as GL, but with tags to indicate the specific genotype.  For instance, 0^-75.22|1^-223.42|0/0^-323.03|1/0^-99.29|1/1^-802.53 represents both haploid and diploid genotype likilehoods in a biallelic context\">" << endl
        << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">" << endl
        << "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count\">" << endl
        << "##FORMAT=<ID=QR,Number=1,Type=Integer,Description=\"Sum of quality of the reference observations\">" << endl
        << "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">" << endl
        << "##FORMAT=<ID=QA,Number=A,Type=Integer,Description=\"Sum of quality of the alternate observations\">" << endl
        //<< "##FORMAT=<ID=SRF,Number=1,Type=Integer,Description=\"Number of reference observations on the forward strand\">" << endl
        //<< "##FORMAT=<ID=SRR,Number=1,Type=Integer,Description=\"Number of reference observations on the reverse strand\">" << endl
        //<< "##FORMAT=<ID=SAF,Number=1,Type=Integer,Description=\"Number of alternate observations on the forward strand\">" << endl
        //<< "##FORMAT=<ID=SAR,Number=1,Type=Integer,Description=\"Number of alternate observations on the reverse strand\">" << endl
        //<< "##FORMAT=<ID=LR,Number=1,Type=Integer,Description=\"Number of reference observations placed left of the loci\">" << endl
        //<< "##FORMAT=<ID=LA,Number=1,Type=Integer,Description=\"Number of alternate observations placed left of the loci\">" << endl
        //<< "##FORMAT=<ID=ER,Number=1,Type=Integer,Description=\"Number of reference observations overlapping the loci in their '3 end\">" << endl
        //<< "##FORMAT=<ID=EA,Number=1,Type=Integer,Description=\"Number of alternate observations overlapping the loci in their '3 end\">" << endl
        // column header line: the fixed VCF columns followed by one column per sample
        << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"
        << join(sampleList, "\t") << endl;

    return headerss.str();

}


void AlleleParser::setupVCFOutput(void) {
    string vcfheader = vcfHeader();
    variantCallFile.openForOutput(vcfheader);
}

// Open the optional VCF inputs named in the parameters.
//
//  - parameters.variantPriorsFile: opened as variantCallInputFile; a
//    vcf::Variant bound to it is allocated into currentVariant and
//    usingVariantInputAlleles is switched on.
//  - parameters.haplotypeVariantFile: opened as haplotypeVariantInputFile and
//    usingHaplotypeBasisAlleles is switched on.
//
// Neither open() result is checked here.
void AlleleParser::setupVCFInput(void) {

    // input variants used for analysis and targeting
    const bool haveVariantInput = !parameters.variantPriorsFile.empty();
    if (haveVariantInput) {
        variantCallInputFile.open(parameters.variantPriorsFile);
        currentVariant = new vcf::Variant(variantCallInputFile);
        usingVariantInputAlleles = true;

        // Sample names from the VCF input are deliberately NOT added to
        // sampleList.  Enabling the stanza below changes how the VCF output
        // describes alternates, present observations, etc., because the
        // samples of the VCF input are then included as well.  The output
        // becomes confusing, though it could be useful in some situations.
        //
        // TODO optionally include this (via command-line parameter)
        //
        //for (vector<string>::iterator s = variantCallInputFile.sampleNames.begin(); s != variantCallInputFile.sampleNames.end(); ++s) {
        //    sampleList.push_back(*s);
        //}
    }

    // basis alleles used when constructing haplotype alleles
    const bool haveHaplotypeBasis = !parameters.haplotypeVariantFile.empty();
    if (haveHaplotypeBasis) {
        haplotypeVariantInputFile.open(parameters.haplotypeVariantFile);
        usingHaplotypeBasisAlleles = true;
    }
}

void AlleleParser::loadBamReferenceSequenceNames(void) {

    //--------------------------------------------------------------------------
    // read reference sequences from input file
    //--------------------------------------------------------------------------

    // store the names of all the reference sequences in the BAM file
    referenceSequences = bamMultiReader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    DEBUG("Number of ref seqs: " << bamMultiReader.GetReferenceCount());

}


// Open the FASTA reference named by parameters.fasta.
void AlleleParser::loadFastaReference(void) {

    DEBUG("loading fasta reference " << parameters.fasta);

    // This call loads the reference and reads any index file it can find.  If
    // it can't find an index file for the reference, it will attempt to
    // generate one alongside it.  Note that this only loads the reference.
    // Sequence data is obtained by progressive calls to
    // reference.getSubSequence(..), thus keeping our memory requirements low.
    //
    // NOTE(review): loadReferenceSequence() in this file fetches whole
    // sequences with reference.getSequence(), so the low-memory remark above
    // may be out of date -- confirm.

    reference.open(parameters.fasta);

}

bool AlleleParser::hasMoreInputVariants(void) {
    pair<int, long> next = nextInputVariantPosition();
    return next.first != -1;
}

// Move to whichever comes first: the next input variant or the given
// alignment.
//
// The input variant wins when one is pending (reference ID != -1) and either
// no alignments remain, or it lies on a lower reference ID, or it lies on the
// same reference ID at a lower position than the alignment.  In that case the
// result of loadNextPositionWithInputVariant() is returned.  Otherwise the
// reference sequence and position are taken from the alignment and true is
// returned.
bool AlleleParser::loadNextPositionWithAlignmentOrInputVariant(BamAlignment& alignment) {
    const pair<int, long> upcoming = nextInputVariantPosition();
    const int variantRefID = upcoming.first;

    if (variantRefID != -1) {
        const bool variantComesFirst =
            !hasMoreAlignments
            || variantRefID < alignment.RefID
            || (variantRefID == alignment.RefID && upcoming.second < alignment.Position);
        if (variantComesFirst) {
            return loadNextPositionWithInputVariant();
        }
    }

    // no pending variant, or the alignment is not behind it
    loadReferenceSequence(alignment);
    return true;
}

bool AlleleParser::loadNextPositionWithInputVariant(void) {
    pair<int, long> next = nextInputVariantPosition();
    if (next.first != -1) {
        //cerr << "Next is " << next.first << ":" << next.second << endl;
        loadReferenceSequence(referenceIDToName[next.first]);
        currentPosition = next.second;
        rightmostHaplotypeBasisAllelePosition = currentPosition;
        return true;
    } else {
        return false;
    }
}

// Alignment-driven entry point: load the reference sequence whose name is
// registered for the alignment's RefID, then set currentPosition to the
// alignment's Position.
void AlleleParser::loadReferenceSequence(BamAlignment& alignment) {
    string& alignedSequenceName = referenceIDToName[alignment.RefID];
    loadReferenceSequence(alignedSequenceName);
    currentPosition = alignment.Position;
}

// Make `seqname` the current reference sequence.
//
// Nothing happens when it already is the current sequence.  Otherwise the
// name, the sequence start offset (reset to 0), the BAM reference ID and the
// upper-cased sequence text are all refreshed.
void AlleleParser::loadReferenceSequence(string& seqname) {
    if (currentSequenceName == seqname) {
        return;  // already loaded
    }

    currentSequenceName = seqname;
    currentSequenceStart = 0;
    currentRefID = bamMultiReader.GetReferenceID(currentSequenceName);
    currentSequence = uppercase(reference.getSequence(currentSequenceName));
}

void AlleleParser::loadTargets(void) {

    // if we have a targets file, use it...
    // if target file specified use targets from file
    if (!parameters.targets.empty()) {

        DEBUG("Making BedReader object for target file: " << parameters.targets << " ...");

        bedReader.openFile(parameters.targets);

        if (!bedReader.is_open()) {
            ERROR("Unable to open target file: " << parameters.targets << "... terminating.");
            exit(1);
        }

        targets = bedReader.targets;

        if (targets.empty()) {
            ERROR("Could not load any targets from " << parameters.targets);
            exit(1);
        }

        bedReader.close();

        DEBUG("done");

    }

    // if we have a region specified, use it to generate a target
    for (vector<string>::iterator r = parameters.regions.begin(); r != parameters.regions.end(); ++r) {
        // drawn from bamtools_utilities.cpp, modified to suit 1-based context, no end sequence

        string region = *r;
        string startSeq;
        int startPos;
        int stopPos;

        size_t foundFirstColon = region.find(":");

        // we only have a single string, use the whole sequence as the target
        if (foundFirstColon == string::npos) {
            startSeq = region;
            startPos = 0;
            stopPos = -1;
        } else {
            startSeq = region.substr(0, foundFirstColon);
            string sep = "..";
            size_t foundRangeSep = region.find(sep, foundFirstColon);
            if (foundRangeSep == string::npos) {
                sep = "-";
                foundRangeSep = region.find("-", foundFirstColon);
            }
            if (foundRangeSep == string::npos) {
                startPos = atoi(region.substr(foundFirstColon + 1).c_str());
                // differ from bamtools in this regard, in that we process only
                // the specified position if a range isn't given
                stopPos = startPos + 1;
            } else {
                startPos = atoi(region.substr(foundFirstColon + 1, foundRangeSep - foundFirstColon).c_str());
                // if we have range sep specified, but no second number, read to the end of sequence
                if (foundRangeSep + sep.size() != region.size()) {
                    stopPos = atoi(region.substr(foundRangeSep + sep.size()).c_str()); // end-exclusive, bed-format
                } else {
                    stopPos = -1;
                }
            }
        }

        //DEBUG("startPos == " << startPos);
        //DEBUG("stopPos == " << stopPos);

        // REAL BED format is 0 based, half open (end base not included)
        BedTarget bd(startSeq,
                    (startPos == 0) ? 0 : startPos,
                    ((stopPos == -1) ? reference.sequenceLength(startSeq) : stopPos) - 1); // internally, we use 0-base inclusive end
        DEBUG("will process reference sequence " << startSeq << ":" << bd.left << ".." << bd.right + 1);
        targets.push_back(bd);
        bedReader.targets.push_back(bd);

    }

    // check validity of targets wrt. reference
    for (vector<BedTarget>::iterator e = targets.begin(); e != targets.end(); ++e) {
        BedTarget& bd = *e;
        // internally, we use 0-base inclusive end
        if (bd.left < 0 || bd.right + 1 > reference.sequenceLength(bd.seq)) {
            ERROR("Target region coordinates (" << bd.seq << " "
                    << bd.left << " " << bd.right + 1
                    << ") outside of reference sequence bounds ("
                    << bd.seq << " " << reference.sequenceLength(bd.seq) << ") terminating.");
            exit(1);
        }
        if (bd.right < bd.left) {
            ERROR("Invalid target region coordinates (" << bd.seq << " " << bd.left << " " << bd.right + 1 << ")"
                    << " right bound is lower than left bound!");
            exit(1);
        }
    }

    bedReader.buildIntervals(); // set up interval tree in the bedreader

    DEBUG("Number of target regions: " << targets.size());

}

void AlleleParser::loadTargetsFromBams(void) {
    // otherwise, if we weren't given a region string or targets file, analyze
    // all reference sequences from BAM file
    DEBUG2("no targets specified, using all targets from BAM files");
    RefVector::iterator refIter = referenceSequences.begin();
    RefVector::iterator refEnd  = referenceSequences.end();
    for( ; refIter != refEnd; ++refIter) {
        RefData refData = *refIter;
        string refName = refData.RefName;
        BedTarget bd(refName, 0, refData.RefLength); // 0-based inclusive internally
        DEBUG2("will process reference sequence " << refName << ":" << bd.left << ".." << bd.right + 1);
        targets.push_back(bd);
    }
}

void AlleleParser::loadSampleCNVMap(void) {
    // set default ploidy
    sampleCNV.setDefaultPloidy(parameters.ploidy);

    // load CNV map if provided
    if (!parameters.cnvFile.empty()) {
        if (!sampleCNV.load(parameters.cnvFile)) {
            ERROR("could not load sample map " << parameters.cnvFile << " ... exiting!");
            exit(1);
        }
    }

    // to assert that the reference is haploid, we can iterate through the BAM
    // header to get the reference names and sizes, and then setPloidy on them
    // in the sampleCNV map.  note that the reference "sample" is named after
    // the current reference sequence.
    if (!parameters.diploidReference) {
        for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
            sampleCNV.setPloidy(referenceSampleName, r->RefName, 0, r->RefLength, 1);
        }
    }

}

// Ploidy of `sample` at the current locus, looked up in sampleCNV with the
// current sequence name and current position.
int AlleleParser::currentSamplePloidy(string const& sample) {
    return sampleCNV.ploidy(sample, currentSequenceName, currentPosition);
}

// Sum of currentSamplePloidy() over every sample name (key) in `samples`.
int AlleleParser::copiesOfLocus(Samples& samples) {
    int total = 0;
    Samples::iterator entry = samples.begin();
    while (entry != samples.end()) {
        total += currentSamplePloidy(entry->first);
        ++entry;
    }
    return total;
}

// Distinct ploidies in play at the current locus: the ploidy of every sample
// in `samples` plus parameters.ploidy, returned in ascending order without
// duplicates.
vector<int> AlleleParser::currentPloidies(Samples& samples) {
    // the map is used purely as an ordered set of ploidy values
    map<int, bool> seen;
    seen[parameters.ploidy] = true;
    for (Samples::iterator entry = samples.begin(); entry != samples.end(); ++entry) {
        seen[currentSamplePloidy(entry->first)] = true;
    }

    vector<int> distinct;
    for (map<int, bool>::iterator p = seen.begin(); p != seen.end(); ++p) {
        distinct.push_back(p->first);
    }
    return distinct;
}

// Meant for stdin input: reports whether the current position is inside a
// target.  With no targets configured every position counts as in-target;
// otherwise the current single base (start == end, 0-based fully-closed as
// bedReader.targetsOverlap expects) must overlap a target.
bool AlleleParser::inTarget(void) {
    return targets.empty()
        || bedReader.targetsOverlap(currentSequenceName, currentPosition, currentPosition);
}

// initialization function
// sets up environment so we can start registering alleles
//
// Builds `parameters` from the command line, resets the parser state to its
// starting values, then runs the setup steps below in a fixed order (trace /
// failed / output files, FASTA reference, BAM input, targets, sample metadata,
// CNV map, VCF output, VCF input).
//
// Note: currentVariant is not assigned here; in this file it is only set by
// setupVCFInput() when parameters.variantPriorsFile is non-empty.
AlleleParser::AlleleParser(int argc, char** argv) : parameters(Parameters(argc, argv))
{

    oneSampleAnalysis = false;
    currentRefID = 0; // will get set properly via toNextRefID
    currentPosition = 0;
    currentTarget = NULL; // to be initialized on first call to getNextAlleles
    currentReferenceAllele = NULL; // same, NULL is brazenly used as an initialization flag
    justSwitchedTargets = false;  // flag to trigger cleanup of Allele*'s and objects after jumping targets
    hasMoreAlignments = true; // flag to track when we run out of alignments in the current target or BAM files
    currentSequenceStart = 0;
    lastHaplotypeLength = 0;
    usingHaplotypeBasisAlleles = false;
    usingVariantInputAlleles = false;
    rightmostHaplotypeBasisAllelePosition = 0;
    rightmostInputAllelePosition = 0;
    nullSample = new Sample(); // owned by this object; released in the destructor
    referenceSampleName = "reference_sample";

    // initialization
    openTraceFile();
    openFailedFile();
    openOutputFile();

    loadFastaReference();
    // when we open the bam files we can use the number of targets to decide if
    // we should load the indexes
    // NOTE(review): loadTargets() runs after openBams() below, so the remark
    // above about the number of targets may be stale -- confirm.
    openBams();
    loadBamReferenceSequenceNames();
    // check how many targets we have specified
    loadTargets();
    getSampleNames();
    getPopulations();
    getSequencingTechnologies();

    // sample CNV
    loadSampleCNVMap();

    // output
    setupVCFOutput();

    // input
    // (now that the VCF file is set up with the samples which are in the input alignments
    // add the samples from the input VCF to the mix)
    setupVCFInput();


}

// Release the heap objects held by the parser: nullSample, the current
// reference allele (if any) and, when the variant input VCF reports open,
// currentVariant.
AlleleParser::~AlleleParser(void) {

    delete nullSample;

    // the trace file is not closed explicitly; it seems to get closed
    // properly on object deletion

    // deleting a null pointer is a no-op, so no guard is needed
    delete currentReferenceAllele;

    if (variantCallInputFile.is_open()) {
        delete currentVariant;
    }

}

// position of alignment relative to current sequence
// (alignment.Position minus currentSequenceStart)
int AlleleParser::currentSequencePosition(const BamAlignment& alignment) {
    return alignment.Position - currentSequenceStart;
}

// relative current position within the cached currentSequence
// (currentPosition minus currentSequenceStart)
int AlleleParser::currentSequencePosition() {
    return currentPosition - currentSequenceStart;
}

// Upper-cased reference base at the current position, read through
// currentReferenceBaseIterator().  No bounds check is performed here.
char AlleleParser::currentReferenceBaseChar(void) {
    return toupper(*currentReferenceBaseIterator());
}

// One-character string holding the base of the cached currentSequence at the
// current position (offset = floor(currentPosition) - currentSequenceStart).
string AlleleParser::currentReferenceBaseString(void) {
    const string::size_type offset = floor(currentPosition) - currentSequenceStart;
    return currentSequence.substr(offset, 1);
}

// Iterator into the cached currentSequence at the current position, i.e.
// begin() advanced by floor(currentPosition) - currentSequenceStart.
// No bounds check is performed here.
string::iterator AlleleParser::currentReferenceBaseIterator(void) {
    return currentSequence.begin() + (floor(currentPosition) - currentSequenceStart);
}

// Substring of the cached currentSequence that starts at the current position
// and spans lastHaplotypeLength characters.
string AlleleParser::currentReferenceHaplotype(void) {
    const string::size_type offset = floor(currentPosition) - currentSequenceStart;
    return currentSequence.substr(offset, lastHaplotypeLength);
}

// Upper-cased subsequence of the current reference sequence, fetched from the
// FASTA reference (not from the cached currentSequence): `len` characters
// starting at `pos`.
string AlleleParser::referenceSubstr(long int pos, unsigned int len) {
    return uppercase(reference.getSubSequence(currentSequenceName, floor(pos), len));
}

// Reports whether substituting `altbase` for the reference base at the
// current position is a CpG <-> TpG change (or its opposite-strand form,
// CpG <-> CpA).
//
// Returns false whenever the current position lacks a neighbour on either
// side within the cached currentSequence.
bool AlleleParser::isCpG(string& altbase) {
    // both neighbouring bases must exist
    if (floor(currentPosition) - currentSequenceStart - 1 < 0
            || floor(currentPosition) - currentSequenceStart + 1 >= currentSequence.size()) {
        return false;
    }

    const string::size_type here = floor(currentPosition) - currentSequenceStart;
    const char prevBase = currentSequence[here - 1];
    const char thisBase = currentSequence[here];
    const char nextBase = currentSequence[here + 1];

    // C<->T directly followed by G
    const bool changeBeforeG = nextBase == 'G'
        && ((thisBase == 'C' && altbase == "T") || (thisBase == 'T' && altbase == "C"));

    // 5'-3' CpG <-> TpG is represented as CpG <-> CpA on the opposite strand:
    // G<->A directly preceded by C
    const bool changeAfterC = prevBase == 'C'
        && ((thisBase == 'G' && altbase == "A") || (thisBase == 'A' && altbase == "G"));

    return changeBeforeG || changeAfterC;
}

// Clamp the base qualities of `alignment` in place: every quality character
// whose value (per qualityChar2ShortInt) exceeds `baseQualityCap` is replaced
// by the character encoding of the cap (per qualityInt2Char).
void capBaseQuality(BamAlignment& alignment, int baseQualityCap) {
    const char cappedChar = qualityInt2Char(baseQualityCap);
    string& quals = alignment.Qualities;
    for (string::size_type i = 0; i < quals.size(); ++i) {
        if (qualityChar2ShortInt(quals[i]) > baseQualityCap) {
            quals[i] = cappedChar;
        }
    }
}

// Append one allele observation to this alignment's allele list, merging it
// with (or splitting) the previously added allele where the rules below apply.
//
//   newAllele      the observation to add (taken by value; it may be trimmed
//                  or extended before being stored)
//   mergeComplex   when true, a non-reference allele that directly abuts the
//                  previous non-null allele is merged into it (MNP or complex)
//   maxComplexGap  longest reference match tolerated inside / adjoining a
//                  merged allele before it is split or left separate
//   boundIndels    when true, an insertion, deletion or null allele arriving
//                  while the list is still empty is dropped
//
// alleleTypes is OR-ed with the type of every allele passed in, including ones
// that end up dropped.  Aborts via assert if the allele's sequence and quality
// vectors differ in length.
void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int maxComplexGap, bool boundIndels) {

    // allele combination rules.  combine the last allele in the list of allele
    // observations according to the following rules
    // 0) reference + SNP, MNP
    // 1) INDEL + (REF <= maxComplexGap) + MNP, INDEL + (REF <= maxComplexGap) + SNP -> complex
    // 2) MNP + SNP, SNP + SNP -> MNP
    // 2) reference + INDEL -> reference.substr(0, reference.size() - 1), reference.at(reference.size()) + INDEL
    if (newAllele.alternateSequence.size() != newAllele.baseQualities.size()) {
        cerr << "new allele qualities not == in length to sequence: " << newAllele << endl;
        assert(false);
    }

    //cerr << "adding allele " << newAllele << " to " << alleles.size() << " alleles" << endl;
    //if (!alleles.empty()) { cerr << "last allele " << alleles.back() << endl; }

    alleleTypes |= newAllele.type;

    if (alleles.empty()) {

        // presently, it's unclear how to handle insertions and deletions
        // reported at the beginning of the read.  are these events actually
        // indicative of longer alleles?
        //
        // Only indels and null alleles are dropped here.  The test used to
        // read `!newAllele.isNull()`, which discarded every non-null first
        // allele (reference, SNP, ...) whenever boundIndels was set.
        if (boundIndels && (newAllele.isInsertion() || newAllele.isDeletion() || newAllele.isNull())) {
            // ignore the allele
        } else {
            alleles.push_back(newAllele);
        }
        // the same goes for insertions and deletions at the end of reads,
        // these must be dealt with elsewhere

    } else {

        Allele& lastAllele = alleles.back();

        if (isEmptyAllele(newAllele) ||
            (newAllele.isReference() && newAllele.referenceLength == 0)) {
            // do nothing
        } else if (newAllele.isReference() && isUnflankedIndel(lastAllele)) {
            // add flanking base to indel, ensuring haplotype length of 2 for all indels
            string seq; vector<pair<int, string> > cig; vector<short> quals;
            //cerr << "subtracting from start " << newAllele << " giving to " << lastAllele << endl;
            newAllele.subtractFromStart(1, seq, cig, quals);
            lastAllele.addToEnd(seq, cig, quals);
            //cerr << "done " << newAllele << " gave to " << lastAllele << " reflen " << lastAllele.referenceLength << endl;
            // check that the new allele still has sequence
            if (!isEmptyAllele(newAllele)) {
                alleles.push_back(newAllele);
            }
        } else if (newAllele.isReference()
                   && (newAllele.referenceLength > maxComplexGap
                       || newAllele.basesRight == 0)) {
            // if the last allele is reference too, we need to combine them!
            if (lastAllele.isReference()) {
                DEBUG2("addAllele: mergeAllele/1:"
                    << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
                    << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);
                lastAllele.mergeAllele(newAllele, ALLELE_REFERENCE);
                assert(lastAllele.alternateSequence.size() == lastAllele.baseQualities.size());
            } else if (lastAllele.isComplex() || lastAllele.isMNP() || lastAllele.isSNP()) {
                // split apart the last allele if it's 'complex' but followed by another reference allele
                // that would cause the reference gap to be greater than the maxComplexGap
                vector<pair<int, string> > cigar = splitCigar(lastAllele.cigar);
                if (cigar.back().second == "M") {
                    int matchlen = cigar.back().first;
                    if (matchlen + newAllele.referenceLength > maxComplexGap) {
                        // break apart the complex allele
                        // (push_back may reallocate; from here on the two
                        // halves are reached through alleles.at()/back())
                        alleles.push_back(lastAllele);
                        Allele& pAllele = alleles.at(alleles.size() - 2);
                        string seq; vector<pair<int, string> > cig; vector<short> quals;
                        pAllele.subtractFromEnd(matchlen, seq, cig, quals);
                        alleles.back().subtractFromStart(pAllele.referenceLength, seq, cig, quals);
                        DEBUG2("addAllele: mergeAllele/2:"
                           << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
                           << " .back() "    << alleles.back().typeStr() << "@" << alleles.back().position << ":" << alleles.back().cigar
                           << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);
                        alleles.back().mergeAllele(newAllele, ALLELE_REFERENCE);
                    } else { // expand the complex allele
                        DEBUG2("addAllele: mergeAllele/3:"
                           << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
                           << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);
                        lastAllele.mergeAllele(newAllele, ALLELE_COMPLEX);
                    }
                } else {
                    alleles.push_back(newAllele);
                }
            } else {
                alleles.push_back(newAllele);
            }
        } else if (lastAllele.isReference()) {
            if (newAllele.isSNP() || newAllele.isMNP() || newAllele.isComplex()) {
                alleles.push_back(newAllele);
            } else if (newAllele.isInsertion() || newAllele.isDeletion()) {
                // move the last reference base onto the front of the indel,
                // so the indel is expected to start one position earlier
                int p = newAllele.position - 1;
                string seq; vector<pair<int, string> > cig; vector<short> quals;
                lastAllele.subtractFromEnd(1, seq, cig, quals);
                if (lastAllele.length == 0) {
                    alleles.pop_back(); // remove 0-length alleles
                }
                newAllele.addToStart(seq, cig, quals);
                if (newAllele.position != p) {
                    cerr << "newAllele.position != p" << endl << newAllele << " != " << p << endl;
                    exit(1);
                }
                alleles.push_back(newAllele);
                assert(newAllele.alternateSequence.size() == newAllele.baseQualities.size());
            } else {
                alleles.push_back(newAllele); // NULL case
            }
        } else if (newAllele.isNull()) {
            if (lastAllele.isComplex()) {
                // split apart the last allele if it's 'complex' but followed by a null allele
                vector<pair<int, string> > cigar = splitCigar(lastAllele.cigar);
                if (cigar.back().second == "M") {
                    int matchlen = cigar.back().first;
                    alleles.push_back(lastAllele);
                    Allele& pAllele = alleles.at(alleles.size() - 2);
                    string seq; vector<pair<int, string> > cig; vector<short> quals;
                    pAllele.subtractFromEnd(matchlen, seq, cig, quals);
                    alleles.back().subtractFromStart(pAllele.referenceLength, seq, cig, quals);
                }
            }
            alleles.push_back(newAllele);
        } else {
            // -> complex event or MNP
            if (mergeComplex && lastAllele.position + lastAllele.referenceLength == newAllele.position
                && !lastAllele.isNull()) {

                vector<pair<int, string> > lastCigar = splitCigar(lastAllele.cigar);

                // If the last allele is complex and ends in a match, we need
                // to check that after merging the then-embedded match won't be
                // longer than maxComplexGap.  We do this for every new allele,
                // since we don't want to allow the complex allele to grow
                // beyond maxComplexGap before splitting.
                if (lastAllele.isComplex()
                    && lastCigar.back().second == "M"
                    && lastCigar.back().first > maxComplexGap)
                {
                    // Break apart the complex allele into one complex and one
                    // reference allele.
                    //
                    // FIXME TODO: The allele may not actually be complex
                    // anymore after splitting, in which case we should demote
                    // its type to SNP/MNP/INDEL.
                    // -trs, 20 Nov 2014
                    alleles.push_back(lastAllele);
                    Allele& pAllele = alleles.at(alleles.size() - 2);
                    string seq; vector<pair<int, string> > cig; vector<short> quals;
                    pAllele.subtractFromEnd(lastCigar.back().first, seq, cig, quals);
                    alleles.back().subtractFromStart(pAllele.referenceLength, seq, cig, quals);

                    if (newAllele.isReference()) {
                        DEBUG2("addAllele: mergeAllele/5:"
                            << " lastAllele " << lastAllele.typeStr()     << "@" << lastAllele.position     << ":" << lastAllele.cigar
                            << " .back() "    << alleles.back().typeStr() << "@" << alleles.back().position << ":" << alleles.back().cigar
                            << " newAllele "  << newAllele.typeStr()      << "@" << newAllele.position      << ":" << newAllele.cigar);
                        alleles.back().mergeAllele(newAllele, ALLELE_REFERENCE);
                    } else {
                        alleles.push_back(newAllele);
                    }
                } else {
                    AlleleType atype = ALLELE_COMPLEX;
                    if (lastAllele.isSNP() || lastAllele.isMNP()) {
                        // Parentheses spell out the evaluation order the
                        // unparenthesised expression already had.
                        // NOTE(review): the intent may have been
                        // `"X" && (isSNP() || isMNP())` -- confirm.
                        if ((lastCigar.back().second == "X" && newAllele.isSNP()) || newAllele.isMNP()) {
                            atype = ALLELE_MNP;
                        }
                    }

                    DEBUG2("addAllele: mergeAllele/4:"
                       << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
                       << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);

                    lastAllele.mergeAllele(newAllele, atype);
                    assert(lastAllele.alternateSequence.size() == lastAllele.baseQualities.size());
                }
            } else {
                alleles.push_back(newAllele);
            }
        }

    }
}

// Extend the cache of haplotype-basis alleles so that it covers
// pos + referenceLength.
//
// Nothing happens while pos + referenceLength is still within the already
// cached range (<= rightmostHaplotypeBasisAllelePosition).  Otherwise the
// haplotype-basis VCF is queried from the current rightmost cached position
// up to pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW, every parsed
// alternate whose ref differs from its alt is appended to
// haplotypeBasisAlleles under its position, and the rightmost cached position
// is advanced to the end of the queried window.
//
// TODO erase alleles which are beyond N bp before the current position on position step
void AlleleParser::updateHaplotypeBasisAlleles(long int pos, int referenceLength) {
    if (pos + referenceLength <= rightmostHaplotypeBasisAllelePosition) {
        return;  // already cached
    }

    // tabix expects 1-based, fully closed regions for ti_parse_region()
    // (which is what setRegion() calls eventually)
    //
    // a false return from setRegion() indicates an empty region and is
    // deliberately not treated as an error
    if (haplotypeVariantInputFile.setRegion(currentSequenceName,
                                            rightmostHaplotypeBasisAllelePosition + 1,
                                            pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW + 1)) {
        // get the variants in the target region
        vcf::Variant var(haplotypeVariantInputFile);
        while (haplotypeVariantInputFile.getNextVariant(var)) {
            // The alternates are taken in parsed (decomposed) form; the
            // alternative of registering each whole ALT string was:
            /*
              for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a) {
              haplotypeBasisAlleles[var.position].insert(AllelicPrimitive(var.ref.size(), *a));
              }
            */
            map<string, vector<vcf::VariantAllele> > variants = var.parsedAlternates();
            for (map<string, vector<vcf::VariantAllele> >::iterator a = variants.begin(); a != variants.end(); ++a) {
                for (vector<vcf::VariantAllele>::iterator v = a->second.begin(); v != a->second.end(); ++v) {
                    // skip reference-matching pieces
                    if (v->ref != v->alt) {
                        haplotypeBasisAlleles[v->position].push_back(AllelicPrimitive(v->ref, v->alt));
                    }
                }
            }
        }
    }

    // set the rightmost haplotype position to trigger the next update
    rightmostHaplotypeBasisAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
}


bool AlleleParser::allowedHaplotypeBasisAllele(long int pos, string& ref, string& alt) {
    // check the haplotypeBasisAllele map for membership of the allele in question in the current sequence
    //cerr << "is allowed: " << pos << " " << ref << "/" << alt << " ?" << endl;
    if (!usingHaplotypeBasisAlleles) {
        return true; // always true if we aren't using the haplotype basis allele system
    } else {
        map<long int, vector<AllelicPrimitive> >::iterator p = haplotypeBasisAlleles.find(pos);
        if (p != haplotypeBasisAlleles.end()) {
            vector<AllelicPrimitive>& alleles = p->second;
            for (vector<AllelicPrimitive>::iterator z = alleles.begin(); z != alleles.end(); ++z) {
                //cerr << "overlapping allele " << z->ref << ":" << z->alt << endl;
                if (z->ref == ref && z->alt == alt) {
                    //cerr << "yess" << endl;
                    return true;
                }
            }
        }
        return false;
    }

}

// Construct an Allele observation for one event found in an alignment.
//
//   ra             registered alignment that will own the allele; its
//                  readgroup and allele list are referenced by the new allele
//   type           kind of event (reference, SNP/MNP, insertion, deletion, null)
//   pos            position of the event; pos - currentSequenceStart is used
//                  as the offset into currentSequence
//   length         length of the event, used to build the allele's cigar
//   basesLeft/     flanking base counts, passed through to the Allele
//   basesRight
//   readSequence   bases observed in the read for this event
//   sampleName,    passed through to the Allele
//   sequencingTech
//   alignment      source alignment (name, strand, mapping quality, pairing
//                  flags and start/end positions are recorded)
//   qual           quality of the observation; negative values are raised to 0
//   qualstr        per-base quality string for the event
//
// NB: readSequence and qualstr are taken by reference and are OVERWRITTEN
// when the allele is rejected by the haplotype basis filter (see below).
//
// For insertions and deletions the function also works out how far to the
// right the surrounding repeat / low-entropy sequence extends, and hands that
// boundary to the Allele.
Allele AlleleParser::makeAllele(RegisteredAlignment& ra,
                                AlleleType type,
                                long int pos,
                                int length,
                                int basesLeft,
                                int basesRight,
                                string& readSequence,
                                string& sampleName,
                                BamAlignment& alignment,
                                string& sequencingTech,
                                long double qual,
                                string& qualstr
    ) {

    // describe the event as a one-operation cigar; insertions are the only
    // type handled here that take no bases from the reference
    string cigar;
    int reflen = length;

    switch (type) {
    case ALLELE_REFERENCE:
        cigar = convert(length) + "M";
        break;
    case ALLELE_SNP:
    case ALLELE_MNP:
        cigar = convert(length) + "X";
        break;
    case ALLELE_INSERTION:
        reflen = 0;
        cigar = convert(length) + "I";
        break;
    case ALLELE_DELETION:
        cigar = convert(length) + "D";
        break;
    case ALLELE_NULL:
        cigar = convert(length) + "N";
        break;
    default:
        // any other type keeps an empty cigar
        break;
    }

    // reference bases spanned by the event; skipped for null alleles to
    // avoid soft clipping edge cases
    string refSequence;
    if (type != ALLELE_NULL) {
        refSequence = currentSequence.substr(pos - currentSequenceStart, reflen);
    }

    // unless an indel pushes it further, the repeat boundary is the allele position
    long int repeatRightBoundary = pos;

    // Haplotype basis filter.
    //
    // Reference and null alleles always pass.  Anything else has to be
    // present in the haplotype basis set; if it is not, the observation is
    // forced into the reference space: it becomes a reference allele spanning
    // the reference length of the original event, with base quality 0.  For
    // an insertion that reference length is 0, so the result is a 0-length
    // reference allele (to be filtered out in another context).
    //
    // NOTE(review): the lookup uses pos + 1; presumably the basis alleles are
    // keyed by 1-based position -- confirm.
    const bool subjectToBasisFilter = (type != ALLELE_REFERENCE && type != ALLELE_NULL);
    if (subjectToBasisFilter
        && !allowedHaplotypeBasisAllele(pos + 1, refSequence, readSequence)) {
        type = ALLELE_REFERENCE;
        length = referenceLengthFromCigar(cigar);
        // changing the cigar implicitly changes allele.referenceLength,
        // which is calculated when the allele is made
        cigar = convert(length) + "M";
        qualstr = string(length, qualityInt2Char(0));
        readSequence = currentSequence.substr(pos - currentSequenceStart, length);
    }

    // Repeat structure around indels is cached so that haplotype
    // construction can be forced to extend across tandem repeats and
    // homopolymers.
    if (type == ALLELE_INSERTION || type == ALLELE_DELETION) {

        // inserted bases come from the read, deleted bases from the reference
        string alleleseq = (type == ALLELE_INSERTION) ? readSequence : refSequence;

        // fetch the repeat counts at this position, computing them on first use
        map<long int, map<string, int> >::iterator cached = cachedRepeatCounts.find(pos);
        if (cached == cachedRepeatCounts.end()) {
            cachedRepeatCounts[pos] = repeatCounts(pos - currentSequenceStart, currentSequence, 12);
            cached = cachedRepeatCounts.find(pos);
        }
        map<string, int>& knownRepeats = cached->second;

        for (map<string, int>::iterator unit = knownRepeats.begin(); unit != knownRepeats.end(); ++unit) {
            const string& motif = unit->first;
            int copies = unit->second;
            string tract = motif * copies;

            // only repeats that are long enough and that the allele sequence
            // is built from are of interest
            // (assumption of left-alignment may be problematic... so this should be updated)
            if (tract.size() < parameters.minRepeatSize || !isRepeatUnit(alleleseq, motif)) {
                continue;
            }

            // locate the repeat tract, starting the search slightly to the
            // left of the allele so that the first copy is hit
            long int offset = pos - currentSequenceStart;
            size_t found = currentSequence.find(tract, max((long int) 0, offset - (long int) tract.size() - 1));
            if (found == string::npos) {
                cerr << "could not find repeat sequence?" << endl;
                cerr << "repeat sequence: " << tract << endl;
                cerr << "currentsequence start: " << currentSequenceStart << endl;
                cerr << currentSequence << endl;
                cerr << "matched repeats:" << endl;
                for (map<string, int>::iterator m = knownRepeats.begin(); m != knownRepeats.end(); ++m) {
                    cerr << m->first  << " : " << m->second << endl;
                    cerr << "... at position " << pos << endl;
                }
                break; // ignore right-repeat boundary in this case
            }
            long int leftbound = found + currentSequenceStart;
            repeatRightBoundary = leftbound + tract.size() + 1; // 1 past edge of repeat
        }

        // a dangerous game: keep pushing the boundary rightwards while the
        // sequence from the allele start to the boundary has entropy below
        // the configured minimum (a minimum of 0 or less disables this)
        // TODO: check first that we are actually in a repeat
        int start = pos - currentSequenceStart;
        double minEntropy = parameters.minRepeatEntropy;
        if (minEntropy > 0) {
            while (repeatRightBoundary - currentSequenceStart < currentSequence.size() // guard: stay inside the cached sequence
                   && entropy(currentSequence.substr(start, repeatRightBoundary - pos)) < minEntropy) {
                ++repeatRightBoundary;
            }
        }

        // edge case, the indel is an insertion and matches the reference to the right
        // this means there is a repeat structure in the read, but not the ref
        if (currentSequence.substr(pos - currentSequenceStart, length) == readSequence) {
            repeatRightBoundary = max(repeatRightBoundary, pos + length + 1);
        }
    }

    return Allele(type,
                  currentSequenceName,
                  pos,
                  &currentPosition,
                  &currentReferenceBase,
                  length,
                  repeatRightBoundary,
                  basesLeft,
                  basesRight,
                  readSequence,
                  sampleName,
                  alignment.Name,
                  ra.readgroup,
                  sequencingTech,
                  !alignment.IsReverseStrand(), // true when the read is on the forward strand
                  max(qual, (long double) 0), // ensure qual is at least 0
                  qualstr,
                  alignment.MapQuality,
                  alignment.IsPaired(),
                  alignment.IsMateMapped(),
                  alignment.IsProperPair(),
                  cigar,
                  &ra.alleles,
                  alignment.Position,
                  alignment.GetEndPosition());

}

// Convert one BAM alignment into allele observations.
//
// The alignment's cigar is walked from left to right while three cursors are
// kept in step:
//   rp   position in the read (0-based, relative to the read)
//   sp   position on the reference sequence
//   csp  position in the cached reference string currentSequence (0-based)
//
// Each cigar operation produces alleles via makeAllele(), which are appended
// to ra with ra.addAllele():
//   M, X, =  stretches that equal the reference become reference alleles;
//            every differing base becomes a 1bp SNP allele, or a null allele
//            when the read base is not A/T/G/C
//   D        a deletion allele (skipped at the very start or end of the cigar)
//   I        an insertion allele
//   S        a null allele covering the soft-clipped bases
//   H        ignored
//   N        advances the reference cursors only
//
// Once the cigar is consumed, ra.start / ra.end are set from the first and
// last allele, and per-read mismatch, SNP and indel rates are stored on every
// allele.  ra.mismatches, ra.snpCount and ra.indelCount are updated on the way.
//
//   alignment       alignment to parse
//   ra              registered alignment to fill in; also the return value
//   sampleName      passed through to every allele
//   sequencingTech  passed through to every allele
//
// Returns ra.
RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, RegisteredAlignment& ra, string& sampleName, string& sequencingTech) {

    string rDna = alignment.QueryBases;
    string rQual = alignment.Qualities;
    int rp = 0;  // read position, 0-based relative to read
    int csp = currentSequencePosition(alignment); // current sequence position, 0-based relative to currentSequence
    int sp = alignment.Position;  // sequence position

    if (usingHaplotypeBasisAlleles) {
        updateHaplotypeBasisAlleles(sp, alignment.AlignedBases.size());
    }

#ifdef VERBOSE_DEBUG
    if (parameters.debug2) {
        DEBUG2("registering alignment " << rp << " " << csp << " " << sp << endl <<
               "alignment readName " << alignment.Name << endl <<
               "alignment isPaired " << alignment.IsPaired() << endl <<
               "alignment isMateMapped " << alignment.IsMateMapped() << endl <<
               "alignment isProperPair " << alignment.IsProperPair() << endl <<
               "alignment mapQual " << alignment.MapQuality << endl <<
               "alignment sampleID " << sampleName << endl << 
               "alignment position " << alignment.Position << endl <<
               "alignment length " << alignment.Length << endl <<
               "alignment AlignedBases.size() " << alignment.AlignedBases.size() << endl <<
               "alignment GetEndPosition() " << alignment.GetEndPosition() << endl <<
               "alignment end position " << alignment.Position + alignment.AlignedBases.size());

        // render the cigar as text for the debug output
        stringstream cigarss;
        for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin(); c != alignment.CigarData.end(); ++c) {
            cigarss << c->Type << c->Length;
        }

        DEBUG2("alignment cigar " << cigarss.str());

        DEBUG2("current sequence pointer: " << csp);

        DEBUG2("read:          " << rDna);
        DEBUG2("aligned bases: " << alignment.AlignedBases);
        DEBUG2("qualities:     " << alignment.Qualities);
        DEBUG2("reference seq: " << currentSequence.substr(csp, alignment.AlignedBases.size()));
    }
#endif

    /*
     * The cigar only records matches for sequences that have embedded
     * mismatches.
     *
     * Also, we don't store the entire underlying sequence; just the
     * subsequence that matches our current target region.
     *
     * As we step through a match sequence, we look for mismatches.  When we
     * see one we set a positional flag indicating the location, and we emit a
     * 'Reference' allele that stretches from the base after the last
     * mismatch to the base before the current one.
     *
     * An example follows:
     *
     * NNNNNNNNNNNMNNNNNNNNNNNNNNNN
     * reference  ^\-snp  reference
     *
     */

    vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin();
    vector<CigarOp>::const_iterator cigarEnd  = alignment.CigarData.end();
    for ( ; cigarIter != cigarEnd; ++cigarIter ) {
        int l = cigarIter->Length;
        char t = cigarIter->Type;
        DEBUG2("cigar item: " << t << l);

        if (t == 'M' || t == 'X' || t == '=') { // match or mismatch
            int firstMatch = csp; // track the first match after a mismatch, for recording 'reference' alleles
            int mismatchStart = -1; // csp at which the current run of mismatches began
            bool inMismatch = false;

            // for each base in the match region
            // increment the csp, sp, and rp
            // if there is a mismatch, record the last matching stretch as a reference allele
            // presently just record one snp per mismatched position, whether or not they are in a series

            for (int i=0; i<l; i++) {

                // extract aligned base; a cigar that runs past the end of the
                // read sequence is treated as fatal
                string b;
                try {
                    b = rDna.at(rp);
                } catch (const std::out_of_range&) {
                    cerr << "Exception: Cannot read past the end of the alignment's sequence." << endl
                         << alignment.Name << endl
                         << currentSequenceName << ":" << (long unsigned int) currentPosition + 1 << endl
                         << alignment.AlignedBases << endl
                         << currentSequence.substr(csp, alignment.AlignedBases.size()) << endl;
                    abort();
                }

                // convert base quality character into a numeric quality
                long double qual = qualityChar2LongDouble(rQual.at(rp));

                // get reference allele; running off the end of the cached
                // reference is reported, and the rest of this cigar operation
                // is abandoned without advancing the cursors
                string sb;
                try {
                    sb = currentSequence.at(csp);
                } catch (const std::out_of_range&) {
                    cerr << "Exception: Unable to read reference sequence base past end of current cached sequence." << endl
                         << currentSequenceName << ":" << (long unsigned int) currentPosition + 1 << endl
                         << alignment.Position << "-" << alignment.GetEndPosition() << endl
                         << "alignment: " << alignment.AlignedBases << endl
                         << "currentSequence: " << currentSequence << endl
                         << "currentSequence matching: " << currentSequence.substr(csp, alignment.AlignedBases.size()) << endl;
                    //abort();
                    break;
                }

                // record mismatch if we have a mismatch here
                if (b != sb || sb == "N") {  // when the reference is N, we should always call a mismatch
                    if (firstMatch < csp) {
                        int length = csp - firstMatch;
                        string readSequence = rDna.substr(rp - length, length);
                        string qualstr = rQual.substr(rp - length, length);
                        // record 'reference' allele for last matching region
                        if (allATGC(readSequence)) {
                            ra.addAllele(
                                makeAllele(ra,
                                           ALLELE_REFERENCE,
                                           sp - length,
                                           length,
                                           rp, // bases left: read offset just past the matching stretch
                                           alignment.QueryBases.size() - rp, // bases right, measured from the same offset
                                           readSequence,
                                           sampleName,
                                           alignment,
                                           sequencingTech,
                                           alignment.MapQuality, // reference allele quality == mapquality
                                           qualstr),
                                parameters.allowComplex, parameters.maxComplexGap);
                        }
                    }

                    // register mismatch; both counters only see mismatches
                    // whose base quality is at least BQL2
                    if (qual >= parameters.BQL2) {
                        ++ra.mismatches;
                        ++ra.snpCount;
                    }

                    // always emit a snp, if we have too many mismatches over
                    // BQL2 then we will discard the registered allele in the
                    // calling context

                    if (!inMismatch) {
                        mismatchStart = csp;
                        inMismatch = true;
                    }
                    firstMatch = csp + 1;

                } else if (inMismatch) {
                    // first matching base after a run of mismatches: emit one
                    // allele per mismatched base of the run that just ended
                    inMismatch = false;
                    int length = csp - mismatchStart;
                    string readSequence = rDna.substr(rp - length, length);
                    string qualstr = rQual.substr(rp - length, length);
                    for (int j = 0; j < length; ++j) {
                        long double lqual = qualityChar2LongDouble(qualstr.at(j));
                        string qualp = qualstr.substr(j, 1);
                        string rs = readSequence.substr(j, 1);
                        // read bases other than A/T/G/C are recorded as null alleles, not SNPs
                        AlleleType mismatchType = allATGC(rs) ? ALLELE_SNP : ALLELE_NULL;
                        // NOTE(review): the read offset of this base is rp - length + j,
                        // but the flank arguments below are rp - length - j and
                        // size - rp + j; kept as-is, confirm this is intended.
                        ra.addAllele(
                            makeAllele(ra,
                                       mismatchType,
                                       sp - length + j,
                                       1,
                                       rp - length - j, // bases left
                                       alignment.QueryBases.size() - rp + j, // bases right
                                       rs,
                                       sampleName,
                                       alignment,
                                       sequencingTech,
                                       lqual,
                                       qualp),
                            parameters.allowComplex, parameters.maxComplexGap);
                    }
                }

                // update positions
                ++sp;
                ++csp;
                ++rp;
            }
            // catch mismatches at the end of the match
            if (inMismatch) {
                inMismatch = false;
                int length = csp - mismatchStart;
                string readSequence = rDna.substr(rp - length, length);
                string qualstr = rQual.substr(rp - length, length);
                for (int j = 0; j < length; ++j) {
                    long double lqual = qualityChar2LongDouble(qualstr.at(j));
                    string qualp = qualstr.substr(j, 1);
                    string rs = readSequence.substr(j, 1);
                    // read bases other than A/T/G/C are recorded as null alleles, not SNPs
                    AlleleType mismatchType = allATGC(rs) ? ALLELE_SNP : ALLELE_NULL;
                    // NOTE(review): same flank arithmetic as for runs that end
                    // inside the match region; kept as-is.
                    ra.addAllele(
                        makeAllele(ra,
                                   mismatchType,
                                   sp - length + j,
                                   1,
                                   rp - length - j, // bases left
                                   alignment.QueryBases.size() - rp + j, // bases right
                                   rs,
                                   sampleName,
                                   alignment,
                                   sequencingTech,
                                   lqual,
                                   qualp),
                        parameters.allowComplex, parameters.maxComplexGap);
                }
                // or, if we are not in a mismatch, construct the last reference allele of the match
            } else if (firstMatch < csp) {
                int length = csp - firstMatch;
                string readSequence = rDna.substr(rp - length, length);
                string qualstr = rQual.substr(rp - length, length);
                if (allATGC(readSequence)) {
                    ra.addAllele(
                        makeAllele(ra,
                                   ALLELE_REFERENCE,
                                   sp - length,
                                   length,
                                   rp, // bases left: read offset just past the matching stretch
                                   alignment.QueryBases.size() - rp, // bases right, measured from the same offset
                                   readSequence,
                                   sampleName,
                                   alignment,
                                   sequencingTech,
                                   alignment.MapQuality, // ... hmm
                                   qualstr),
                        parameters.allowComplex, parameters.maxComplexGap);
                }
            }
        } else if (t == 'D') { // deletion

            // because deletions have no quality information,
            // use the surrounding sequence quality as a proxy
            // to provide quality scores of equivalent magnitude to insertions,
            // take N bp, right-centered on the position of the deletion
            // this logic prevents overflow of the read
            int spanstart;

            // this is used to calculate the quality string adding 2bp grounds
            // the indel in the surrounding sequence, which it is dependent
            // upon
            int L = l + 2;

            if (L > rQual.size()) {
                // the window is longer than the read: use every quality
                L = rQual.size();
                spanstart = 0;
            } else {
                // set lower bound to 0
                if (rp < (L / 2)) {
                    spanstart = 0;
                } else {
                    spanstart = rp - (L / 2);
                }
                // set upper bound to the string length
                if (spanstart + L > rQual.size()) {
                    spanstart = rQual.size() - L;
                }
            }

            string qualstr = rQual.substr(spanstart, L);

            long double qual;
            if (parameters.useMinIndelQuality) {
                qual = minQuality(qualstr);
                //qual = averageQuality(qualstr);
            } else {
                // quality, scaled inversely by the ratio between the quality
                // string length and the length of the event
                qual = sumQuality(qualstr);
                // quality adjustment:
                // scale the quality by the inverse harmonic sum of the length of
                // the quality string X a scaling constant derived from the ratio
                // between the length of the quality string and the length of the
                // allele
                //qual += ln2phred(log((long double) l / (long double) L));
                qual += ln2phred(log((long double) L / (long double) l));
                qual /= harmonicSum(l);
            }

            string refseq = currentSequence.substr(csp, l);
            // some aligners like to report deletions at the beginnings and ends of reads.
            // without any sequence in the read to support this, it is hard to believe
            // that these deletions are real, so we ignore them here.
            if (cigarIter != alignment.CigarData.begin()      // guard against deletion at beginning
                && (cigarIter+1) != alignment.CigarData.end() // and against deletion at end
                && allATGC(refseq)) {
                string nullstr;
                ra.addAllele(
                    makeAllele(ra,
                               ALLELE_DELETION,
                               sp,
                               l,
                               rp, // bases left: read offset at the deletion
                               alignment.QueryBases.size() - rp, // bases right
                               nullstr, // no read sequence for deletions
                               sampleName,
                               alignment,
                               sequencingTech,
                               qual,
                               nullstr), // no qualstr for deletions
                    parameters.allowComplex, parameters.maxComplexGap);
            }
            // counted even when the deletion allele itself was not emitted
            ++ra.indelCount;

            sp += l;  // update sample position
            csp += l;

        } else if (t == 'I') { // insertion

            int spanstart;

            // this is used to calculate the quality string adding 2bp grounds
            // the indel in the surrounding sequence, which it is dependent
            // upon
            int L = l + 2;

            if (L > rQual.size()) {
                // the window is longer than the read: use every quality
                L = rQual.size();
                spanstart = 0;
            } else {
                // start one base before the insertion; set lower bound to 0
                if (rp < 1) {
                    spanstart = 0;
                } else {
                    spanstart = rp - 1;
                }
                // set upper bound to the string length
                if (spanstart + L > rQual.size()) {
                    spanstart = rQual.size() - L;
                }
            }

            // qualities of the inserted bases plus flanking context, used
            // only to derive the allele's overall quality
            string qualstr = rQual.substr(spanstart, L);

            long double qual;
            if (parameters.useMinIndelQuality) {
                qual = minQuality(qualstr);
                //qual = averageQuality(qualstr); // does not work as well as the min
            } else {
                // quality, scaled inversely by the ratio between the quality
                // string length and the length of the event
                qual = sumQuality(qualstr);
                // quality adjustment:
                // scale the quality by the inverse harmonic sum of the length of
                // the quality string X a scaling constant derived from the ratio
                // between the length of the quality string and the length of the
                // allele
                //qual += ln2phred(log((long double) l / (long double) L));
                qual += ln2phred(log((long double) L / (long double) l));
                qual /= harmonicSum(l);
            }

            string readseq = rDna.substr(rp, l);
            if (allATGC(readseq)) {
                // qualities of exactly the inserted bases, stored on the allele
                string insertedQuals = rQual.substr(rp, l);
                ra.addAllele(
                    makeAllele(ra,
                               ALLELE_INSERTION,
                               sp,
                               l,
                               rp - l, // bases left
                               alignment.QueryBases.size() - rp, // bases right
                               readseq,
                               sampleName,
                               alignment,
                               sequencingTech,
                               qual,
                               insertedQuals),
                    parameters.allowComplex, parameters.maxComplexGap);
            }
            // counted even when the insertion allele itself was not emitted
            ++ra.indelCount;

            rp += l;

            // handle other cigar element types
        } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
            if (sp - l < 0) {
                // nothing to do, soft clip is beyond the beginning of the reference
            } else {
                string qualstr = rQual.substr(rp, l);
                string readseq = alignment.QueryBases.substr(rp, l);
                // record the clipped bases as a null allele
                // NOTE(review): the allele is placed at sp - l whether the clip
                // is at the start or the end of the cigar -- confirm this is
                // intended for trailing soft clips.
                ra.addAllele(
                    makeAllele(ra,
                               ALLELE_NULL,
                               sp - l,
                               l,
                               rp - l, // bases left
                               alignment.QueryBases.size() - rp, // bases right
                               readseq,
                               sampleName,
                               alignment,
                               sequencingTech,
                               alignment.MapQuality,
                               qualstr),
                    parameters.allowComplex, parameters.maxComplexGap);
            }
            // skip these bases in the read; the reference cursors do not move
            rp += l;
        } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
            // the alignment position is the first non-clipped base.
            // thus, hard clipping seems to just be an indicator that we clipped something
            // here we do nothing
        } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
            sp += l; csp += l;
        }
        // ignore padding
        //} else if (t == 'P') { // padding, silent deletion from the padded reference sequence
        //    sp += l; csp += l;
        //}
    } // end cigar iter loop

    if (ra.alleles.empty()) {
        DEBUG2("generated no alleles from read");
        return ra;
    }

    // this deals with the case in which we have embedded Ns in the read
    // often this happens at the start or end of reads, thus affecting our RegisteredAlignment::start and ::end
    ra.start = ra.alleles.front().position;
    ra.end = ra.alleles.back().position + ra.alleles.back().referenceLength;

    double alignedBases = 0;
    double mismatchCount = 0;
    double matchCount = 0;
    double indelCount = 0;

    // tally mismatches in two categories, gaps and mismatched bases
    // (indels count once per event, not per base)
    for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
        Allele& allele = *a;
        switch (allele.type) {
        case ALLELE_REFERENCE:
            alignedBases += allele.length;
            matchCount += allele.length;
            break;
        case ALLELE_SNP:
        case ALLELE_MNP:
            alignedBases += allele.length;
            mismatchCount += allele.length;
            break;
        case ALLELE_INSERTION:
        case ALLELE_DELETION:
        case ALLELE_COMPLEX:
            ++indelCount;
            break;
        default:
            break;
        }
    }

    // NOTE(review): alignedBases stays 0 when the read produced only indel
    // and/or null alleles; these floating-point divisions are then by zero.
    double mismatchRate = ( indelCount + mismatchCount ) / alignedBases;
    double snpRate = mismatchCount / alignedBases;
    double indelRate = indelCount / alignedBases;

    // store mismatch information about the alignment in the alleles
    // for each allele, normalize the mismatch rates by ignoring that allele,
    // this allows us to relate the mismatch rate without reference to called alleles
    for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
        Allele& allele = *a;
        // defaults, kept for allele types not listed in the switch below
        allele.readMismatchRate = mismatchRate;
        allele.readSNPRate = snpRate;
        allele.readIndelRate = indelRate;

        switch (allele.type) {
        case ALLELE_REFERENCE:
            allele.readMismatchRate = mismatchRate;
            allele.readSNPRate = snpRate;
            allele.readIndelRate = indelRate;
            break;
        case ALLELE_SNP:
        case ALLELE_MNP:
            // leave this allele's own bases out of the SNP rate
            allele.readSNPRate = ( mismatchCount - allele.length ) / alignedBases;
            allele.readIndelRate = indelRate;
            allele.readMismatchRate = indelRate + allele.readSNPRate;
            break;
        case ALLELE_INSERTION:
        case ALLELE_DELETION:
        case ALLELE_COMPLEX:
            // leave this allele's own event out of the indel rate
            allele.readSNPRate = snpRate;
            allele.readIndelRate = ( indelCount - 1 ) / alignedBases;
            allele.readMismatchRate = allele.readIndelRate + snpRate;
            break;
        default:
            break;
        }
    }

    // ignore insertions, deletions, and N's which occur at the end of the read with
    // no reference-matching bases before the end of the read
    // (only the single last allele is dropped; ra.end was set above and is not recomputed)
    if (parameters.boundIndels &&
        (ra.alleles.back().isInsertion()
         || ra.alleles.back().isDeletion()
         || ra.alleles.back().isNull())) {
        ra.alleles.pop_back();
    }

#ifdef VERBOSE_DEBUG
    if (parameters.debug2) {
        cerr << "alleles:\n" << join(ra.alleles, "\n");
        cerr << endl;
    }
#endif

    return ra;

}


// Consumes alignments from the BAM reader that start at or before `position`
// on the current reference (currentRefID), filters them, decomposes every
// accepted alignment into alleles via registerAlignment(), and appends
// pointers to those alleles to `newAlleles`.
//
//   position        -- reference position up to which alignments are consumed
//   newAlleles      -- output; receives pointers to alleles owned by the
//                      RegisteredAlignment objects stored in registeredAlignments
//   gettingPartials -- when true, alignments that end before `position` are
//                      not rejected as out of order
//
// Side effects: advances currentAlignment / hasMoreAlignments, may modify
// currentAlignment in place (left alignment, quality capping), and calls
// exit(1) on inconsistent read-group information.
void AlleleParser::updateAlignmentQueue(long int position,
                                        vector<Allele*>& newAlleles,
                                        bool gettingPartials) {

    DEBUG2("updating alignment queue");
    DEBUG2("currentPosition = " << position 
           << "; currentSequenceStart = " << currentSequenceStart 
           << "; currentSequence end = " << currentSequence.size() + currentSequenceStart);

    // make sure we have sequence for the *first* alignment
    //extendReferenceSequence(currentAlignment);

    // push to the front until we get to an alignment that doesn't overlap our
    // current position or we reach the end of available alignments
    // filter input reads; only allow mapped reads with a certain quality
    DEBUG2("currentAlignment.Position == " << currentAlignment.Position 
           << ", currentAlignment.AlignedBases.size() == " << currentAlignment.AlignedBases.size()
           << ", currentPosition == " << position
           << ", currentSequenceStart == " << currentSequenceStart
           << " .. + currentSequence.size() == " << currentSequenceStart + currentSequence.size()
        );

    if (hasMoreAlignments
        && currentAlignment.Position <= position
        && currentAlignment.RefID == currentRefID) {
        // NB: every `continue` below jumps to the while-condition at the
        // bottom, which fetches the next alignment before re-testing.
        do {
            DEBUG2("top of alignment parsing loop");
            DEBUG("alignment: " << currentAlignment.Name);
            // get read group, and map back to a sample name
            string readGroup;
            if (!currentAlignment.GetTag("RG", readGroup)) {
                if (!oneSampleAnalysis) {
                    // report the sequence *name*; currentSequence holds the
                    // cached reference bases and would flood the log
                    ERROR("Couldn't find read group id (@RG tag) for BAM Alignment " <<
                          currentAlignment.Name << " at position " << position
                          << " in sequence " << currentSequenceName << " EXITING!");
                    exit(1);
                } else {
                    readGroup = "unknown";
                }
            } else {
                if (oneSampleAnalysis) {
                    ERROR("No read groups specified in BAM header, but alignment " <<
                          currentAlignment.Name << " at position " << position
                          << " in sequence " << currentSequenceName << " has a read group.");
                    exit(1);
                }
            }

            // skip this alignment if we are not analyzing the sample it is drawn from
            if (readGroupToSampleNames.find(readGroup) == readGroupToSampleNames.end()) {
                ERROR("could not find sample matching read group id " << readGroup);
                continue;
            }

            // skip this alignment if we are not using duplicate reads (we remove them by default)
            if (currentAlignment.IsDuplicate() && !parameters.useDuplicateReads) {
                continue;
            }

            // skip unmapped alignments, as they cannot be used in the algorithm
            if (!currentAlignment.IsMapped()) {
                continue;
            }

            // skip alignments which have no aligned bases
            if (currentAlignment.AlignedBases.size() == 0) {
                continue;
            }

            // skip alignments which are non-primary
            if (!currentAlignment.IsPrimaryAlignment()) {
                continue;
            }

            // an alignment that ends before the requested position is out of
            // order unless the caller is explicitly collecting partials
            if (!gettingPartials && currentAlignment.GetEndPosition() < position) {
                cerr << currentAlignment.Name << " at " << currentSequenceName << ":" << currentAlignment.Position << " is out of order!"
                     << " expected after " << position << endl;
                continue;
            }

            // otherwise, get the sample name and register the alignment to generate a sequence of alleles
            // we have to register the alignment to acquire some information required by filters
            // such as mismatches

            // initially skip reads with low mapping quality (what happens if MapQuality is not in the file)
            if (currentAlignment.MapQuality >= parameters.MQL0) {
                // extend our cached reference sequence to allow processing of this alignment
                //extendReferenceSequence(currentAlignment);
                // left realign indels against the reference span covered by the alignment
                if (parameters.leftAlignIndels) {
                    int length = currentAlignment.GetEndPosition() - currentAlignment.Position + 1;
                    stablyLeftAlign(currentAlignment,
                                    currentSequence.substr(currentSequencePosition(currentAlignment), length));
                }
                // get sample name
                string sampleName = readGroupToSampleNames[readGroup];
                // sequencing technology is optional; left empty when the read group has none recorded
                string sequencingTech;
                map<string, string>::iterator t = readGroupToTechnology.find(readGroup);
                if (t != readGroupToTechnology.end()) {
                    sequencingTech = t->second;
                }
                // limit base quality if cap set
                if (parameters.baseQualityCap != 0) {
                    capBaseQuality(currentAlignment, parameters.baseQualityCap);
                }
                // decomposes alignment into a set of alleles
                // here we get the deque of alignments ending at this alignment's end position
                deque<RegisteredAlignment>& rq = registeredAlignments[currentAlignment.GetEndPosition()];
                // and insert the registered alignment into that deque
                rq.push_front(RegisteredAlignment(currentAlignment, parameters));
                RegisteredAlignment& ra = rq.front();
                registerAlignment(currentAlignment, ra, sampleName, sequencingTech);
                // backtracking if we have too many mismatches
                // or if there are no recorded alleles
                if (ra.alleles.empty()
                    || ((float) ra.mismatches / (float) currentAlignment.QueryBases.size()) > parameters.readMaxMismatchFraction
                    || ra.mismatches > parameters.RMU
                    || ra.snpCount > parameters.readSnpLimit
                    || ra.indelCount > parameters.readIndelLimit) {
                    rq.pop_front(); // backtrack
                } else {
                    // push the alleles into our new alleles vector; the
                    // pointers refer to storage owned by `ra` inside the deque
                    for (vector<Allele>::iterator allele = ra.alleles.begin(); allele != ra.alleles.end(); ++allele) {
                        newAlleles.push_back(&*allele);
                    }
                }
            }
        } while ((hasMoreAlignments = bamMultiReader.GetNextAlignment(currentAlignment))
                 && currentAlignment.Position <= position
                 && currentAlignment.RefID == currentRefID);
    }

    DEBUG2("... finished pushing new alignments");

}

// Appends every pointer in `alleles`, in order, to the end of
// registeredAlleles. Only the pointers are copied.
void AlleleParser::addToRegisteredAlleles(vector<Allele*>& alleles) {
    for (vector<Allele*>::const_iterator incoming = alleles.begin();
         incoming != alleles.end(); ++incoming) {
        registeredAlleles.push_back(*incoming);
    }
}

// updates registered alleles and erases the unused portion of our cached reference sequence
void AlleleParser::updateRegisteredAlleles(void) {

    long int lowestPosition = currentSequenceStart + currentSequence.size();

    // remove reference alleles which are no longer overlapping the current position
    // http://stackoverflow.com/questions/347441/erasing-elements-from-a-vector
    vector<Allele*>& alleles = registeredAlleles;


    for (vector<Allele*>::iterator allele = alleles.begin(); allele != alleles.end(); ++allele) {
        long unsigned int position = (*allele)->position;
        // note that this will underflow if currentPosition == 0 and lastHaplotypeLength > 0
        // resolved by setting lastHaplotypeLength = 0 in init, and when we switch targets
        if (currentPosition - lastHaplotypeLength > position + (*allele)->referenceLength) {
            *allele = NULL;
        }
        else {
            if (position < lowestPosition)
                lowestPosition = position;
        }
    }

    alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());

}

// Finds the next position that carries an input variant allele.
//
// Returns (refID, position):
//   * the first position strictly greater than currentPosition on the current
//     reference (currentRefID), if one exists; otherwise
//   * the first stored position on the next reference whose id is greater
//     than currentRefID and which has at least one stored position; otherwise
//   * (-1, 0), which is also returned when input variant alleles are not in use.
//
// The lookup does not modify inputVariantAlleles (operator[] is avoided so no
// empty per-reference maps are created as a side effect of querying).
pair<int, long int> AlleleParser::nextInputVariantPosition(void) {

    typedef map<long, vector<Allele> > PositionMap;
    typedef map<int, PositionMap> ReferenceMap;

    const pair<int, long int> noneFound(-1, 0);

    if (!usingVariantInputAlleles) {
        return noneFound;
    }

    // anything left on the current reference?
    ReferenceMap::iterator here = inputVariantAlleles.find(currentRefID);
    if (here != inputVariantAlleles.end()) {
        PositionMap::iterator ic = here->second.upper_bound(currentPosition);
        if (ic != here->second.end()) {
            return make_pair(currentRefID, ic->first);
        }
    }

    // find next chrom with input alleles, skipping references with no
    // positions so begin() is never dereferenced on an empty map
    for (ReferenceMap::iterator nc = inputVariantAlleles.upper_bound(currentRefID);
         nc != inputVariantAlleles.end(); ++nc) {
        if (!nc->second.empty()) {
            return make_pair(nc->first, nc->second.begin()->first);
        }
    }

    return noneFound;
}

void AlleleParser::getAllInputVariants(void) {
    string nullstr;
    getInputVariantsInRegion(nullstr);
}

// Reads records from the input VCF (variantCallInputFile) into
// *currentVariant until the reader reports no more, converts each parsed
// alternate into an Allele via genotypeAllele(), and stores every
// non-reference allele in
//     inputVariantAlleles[reference id][allele.position].
//
//   seq        -- sequence name; when non-empty the reader is first restricted
//                 with setRegion(seq, start, end), otherwise no region is set
//   start, end -- region bounds forwarded unchanged to setRegion()
//
// Does nothing when usingVariantInputAlleles is false.
void AlleleParser::getInputVariantsInRegion(string& seq, long start, long end) {

    if (!usingVariantInputAlleles) return;

    // get the variants in the target region
    if (!seq.empty()) {
        variantCallInputFile.setRegion(seq, start, end);
    }

    while (variantCallInputFile.getNextVariant(*currentVariant)) {

        // parsed form of each alternate, keyed by the alternate string
        map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
        // TODO this would be a nice option: why does it not work?
        //map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->flatAlternates();

        // visit the alternates in the order they are listed in the record
        for (vector<string>::iterator a = currentVariant->alt.begin(); a != currentVariant->alt.end(); ++a) {

            vector<vcf::VariantAllele>& altAllele = variantAlleles[*a];

            for (vector<vcf::VariantAllele>::iterator v = altAllele.begin(); v != altAllele.end(); ++v) {
                vcf::VariantAllele& variant = *v;
                long int allelePos = variant.position - 1;
                AlleleType type;
                string alleleSequence = variant.alt;

                int len = 0;
                int reflen = 0;
                string cigar;

                // XXX
                // FAIL
                // you need to add in the reference bases between the non-reference ones!
                // to allow for complex events!

                if (variant.ref == variant.alt) {
                    // XXX note that for reference alleles, we only use the first base internally
                    // but this is technically incorrect, so this hack should be noted
                    len = variant.ref.size();
                    reflen = len;
                    //alleleSequence = alleleSequence.at(0); // take only the first base
                    type = ALLELE_REFERENCE;
                    cigar = convert(len) + "M";
                } else if (variant.ref.size() == variant.alt.size()) {
                    // same-length substitution: one base is a SNP, more is an MNP
                    len = variant.ref.size();
                    reflen = len;
                    if (variant.ref.size() == 1) {
                        type = ALLELE_SNP;
                    } else {
                        type = ALLELE_MNP;
                    }
                    cigar = convert(len) + "X";
                } else if (variant.ref.size() > variant.alt.size()) {
                    type = ALLELE_DELETION;
                    len = variant.ref.size() - variant.alt.size();
                    // anchor on the previous base; the allele covers the deleted
                    // bases plus one flanking reference base on each side
                    allelePos -= 1;
                    reflen = len + 2;
                    alleleSequence =
                        uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos, 1))
                        + alleleSequence
                        + uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos+1+len, 1));
                    cigar = "1M" + convert(len) + "D" + "1M";
                } else {
                    // we always include the flanking bases for these elsewhere, so here too in order to be consistent and trigger use
                    type = ALLELE_INSERTION;
                    // add previous base and post base to match format typically used for calling
                    allelePos -= 1;
                    alleleSequence =
                        uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos, 1))
                        + alleleSequence
                        + uppercase(reference.getSubSequence(currentVariant->sequenceName, allelePos+1, 1));
                    // inserted length is measured against this parsed allele's own ref
                    len = variant.alt.size() - variant.ref.size();
                    cigar = "1M" + convert(len) + "I" + "1M";
                    reflen = 2;
                }
                // TODO deal with complex subs

                Allele allele = genotypeAllele(type, alleleSequence, (unsigned int) len, cigar, (unsigned int) reflen, allelePos);
                DEBUG("input allele: " << allele.referenceName << " " << allele);

                // only non-reference alleles are recorded as input variants
                if (allele.type != ALLELE_REFERENCE) {
                    inputVariantAlleles[bamMultiReader.GetReferenceID(currentVariant->sequenceName)][allele.position].push_back(allele);
                }
            }
        }
    }
}

void AlleleParser::updateInputVariants(long int pos, int referenceLength) {

    //cerr << "updating input variants (?) " << pos << " + " << referenceLength << " >? " << rightmostInputAllelePosition << endl;
    if (!usingVariantInputAlleles) return;

    if (pos + referenceLength > rightmostInputAllelePosition) {
        long int start = rightmostInputAllelePosition;
        if (start == 0) {
            start = rightmostHaplotypeBasisAllelePosition;
        }

        /*
        stringstream r;
        r << currentSequenceName << ":" << start
          << "-" << pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
        cerr << "getting variants in " << r.str() << endl;
        */

        // tabix expects 1-based, fully closed regions for ti_parse_region()
        // (which is what setRegion() calls eventually)
        bool gotRegion = false;
        if (referenceLength > 0) {
            gotRegion = variantCallInputFile.setRegion(currentSequenceName,
                                                       start + 1,
                                                       pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW + 1);
        } else {
            // whole chromosome
            gotRegion = variantCallInputFile.setRegion(currentSequenceName);
        }

        if (gotRegion) {

            // get the variants in the target region
            vcf::Variant var(variantCallInputFile);
            bool ok;
            while (ok = variantCallInputFile.getNextVariant(*currentVariant)) {

                DEBUG("getting input alleles from input VCF at position " << currentVariant->sequenceName << ":" << currentVariant->position);
                long int pos = currentVariant->position - 1;
                // get alternate alleles
                bool includePreviousBaseForIndels = true;
                map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
                // TODO this would be a nice option: why does it not work?
                //map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->flatAlternates();
                vector< vector<vcf::VariantAllele> > orderedVariantAlleles;
                for (vector<string>::iterator a = currentVariant->alt.begin(); a != currentVariant->alt.end(); ++a) {
                    orderedVariantAlleles.push_back(variantAlleles[*a]);
                }

                vector<Allele> genotypeAlleles;
                set<long int> alternatePositions;

                for (vector< vector<vcf::VariantAllele> >::iterator g = orderedVariantAlleles.begin(); g != orderedVariantAlleles.end(); ++g) {

                    vector<vcf::VariantAllele>& altAllele = *g;

                    vector<Allele> alleles;

                    for (vector<vcf::VariantAllele>::iterator v = altAllele.begin(); v != altAllele.end(); ++v) {
                        vcf::VariantAllele& variant = *v;
                        long int allelePos = variant.position - 1;
                        AlleleType type;
                        string alleleSequence = variant.alt;

                        int len = 0;
                        int reflen = 0;
                        string cigar;

                        // XXX
                        // FAIL
                        // you need to add in the reference bases between the non-reference ones!
                        // to allow for complex events!

                        if (variant.ref == variant.alt) {
                            // XXX note that for reference alleles, we only use the first base internally
                            // but this is technically incorrect, so this hack should be noted
                            len = variant.ref.size();
                            reflen = len;
                            //alleleSequence = alleleSequence.at(0); // take only the first base
                            type = ALLELE_REFERENCE;
                            cigar = convert(len) + "M";
                        } else if (variant.ref.size() == variant.alt.size()) {
                            len = variant.ref.size();
                            reflen = len;
                            if (variant.ref.size() == 1) {
                                type = ALLELE_SNP;
                            } else {
                                type = ALLELE_MNP;
                            }
                            cigar = convert(len) + "X";
                        } else if (variant.ref.size() > variant.alt.size()) {
                            type = ALLELE_DELETION;
                            len = variant.ref.size() - variant.alt.size();
                            allelePos -= 1;
                            reflen = len + 2;
                            alleleSequence =
                                uppercase(reference.getSubSequence(currentSequenceName, allelePos, 1))
                                + alleleSequence
                                + uppercase(reference.getSubSequence(currentSequenceName, allelePos+1+len, 1));
                            cigar = "1M" + convert(len) + "D" + "1M";
                        } else {
                            // we always include the flanking bases for these elsewhere, so here too in order to be consistent and trigger use
                            type = ALLELE_INSERTION;
                            // add previous base and post base to match format typically used for calling
                            allelePos -= 1;
                            alleleSequence =
                                uppercase(reference.getSubSequence(currentSequenceName, allelePos, 1))
                                + alleleSequence
                                + uppercase(reference.getSubSequence(currentSequenceName, allelePos+1, 1));
                            len = variant.alt.size() - var.ref.size();
                            cigar = "1M" + convert(len) + "I" + "1M";
                            reflen = 2;
                        }
                        // TODO deal woth complex subs

                        Allele allele = genotypeAllele(type, alleleSequence, (unsigned int) len, cigar, (unsigned int) reflen, allelePos);
                        DEBUG("input allele: " << allele.referenceName << " " << allele);

                        //alleles.push_back(allele);
                        genotypeAlleles.push_back(allele);

                        if (allele.type != ALLELE_REFERENCE) {
                            inputVariantAlleles[bamMultiReader.GetReferenceID(allele.referenceName)][allele.position].push_back(allele);
                            alternatePositions.insert(allele.position);
                        }

                    }

                }

                // store the allele counts, if they are provided
                //
            }

            if (!ok) hasMoreVariants = false;
        }
        /*
        for (map<long int, vector<Allele> >::iterator v = inputVariantAlleles.begin(); v != inputVariantAlleles.end(); ++v) {
            vector<Allele>& iv = v->second;
            cerr << "input variants pos = " << v->first << endl;
            for (vector<Allele>::iterator a = iv.begin(); a != iv.end(); ++a) {
                cerr << *a << endl;
            }
        }
        */
        //rightmostHaplotypeBasisAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
        //rightmostInputAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
    }

}

/*
void AlleleParser::addCurrentGenotypeLikelihoods(map<int, vector<Genotype> >& genotypesByPloidy,
    vector<vector<SampleDataLikelihood> >& sampleDataLikelihoods) {

    // check if there are any genotype likelihoods at the current position
    if (inputGenotypeLikelihoods.find(currentPosition) != inputGenotypeLikelihoods.end()) {

        map<string, map<string, long double> >& inputLikelihoodsBySample = inputGenotypeLikelihoods[currentPosition];

        vector<Genotype*> genotypePtrs;
        for (map<int, vector<Genotype> >::iterator gp = genotypesByPloidy.begin(); gp != genotypesByPloidy.end(); ++gp) {
            vector<Genotype>& genotypes = gp->second;
            for (vector<Genotype>::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
                genotypePtrs.push_back(&*g);
            }
        }
        // if there are, add them to the sample data likelihoods
        for (map<string, map<string, long double> >::iterator gls = inputLikelihoodsBySample.begin();
                gls != inputLikelihoodsBySample.end(); ++gls) {
            const string& sampleName = gls->first;
            map<string, long double>& likelihoods = gls->second;
            map<Genotype*, long double> likelihoodsPtr;
            for (map<string, long double>::iterator gl = likelihoods.begin(); gl != likelihoods.end(); ++gl) {
                const string& genotype = gl->first;
                long double l = gl->second;
                for (vector<Genotype*>::iterator g = genotypePtrs.begin(); g != genotypePtrs.end(); ++g) {
                    if (convert(**g) == genotype) {
                        likelihoodsPtr[*g] = l;
                    }
                }
            }

            Result sampleData;
            sampleData.name = sampleName;
            // TODO add null sample object to sampleData
            // do you need to????
            for (map<Genotype*, long double>::iterator p = likelihoodsPtr.begin(); p != likelihoodsPtr.end(); ++p) {
                sampleData.push_back(SampleDataLikelihood(sampleName, nullSample, p->first, p->second, 0));
            }
            sortSampleDataLikelihoods(sampleData);
            if (!sampleData.empty()) {
                sampleDataLikelihoods.push_back(sampleData);
            }
        }
    }
}


void AlleleParser::getInputAlleleCounts(vector<Allele>& genotypeAlleles, map<string, int>& inputACs) {
    // are there input ACs?
    //
    // if so, match them to the genotype alleles
    if (inputAlleleCounts.find(currentPosition) != inputAlleleCounts.end()) {
        map<Allele, int>& inputCounts = inputAlleleCounts[currentPosition];
        // XXX NB. We only use ACs for alleles in genotypeAlleles
        for (vector<Allele>::iterator a = genotypeAlleles.begin(); a != genotypeAlleles.end(); ++a) {
            if (inputCounts.find(*a) != inputCounts.end()) {
                inputACs[a->currentBase] = inputCounts[*a];
            }
        }
    }
}
*/

// Removes from `alleles` those entries that start at currentPosition, have
// referenceLength == haplotypeLength, and whose read does not extend far
// enough on both sides of the allele.
//
//   alleles         -- pointer list filtered in place (pointers only)
//   probeLength     -- required total span; the shortfall relative to the
//                      allele's alternateSequence length is what each side of
//                      the read must supply
//   haplotypeLength -- only alleles with exactly this referenceLength are tested
//
// Alleles at other positions or with other reference lengths are kept.
void AlleleParser::removeAllelesWithoutReadSpan(vector<Allele*>& alleles, int probeLength, int haplotypeLength) {
    for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
        Allele* allele = *a;
        if (!(allele->position == currentPosition && allele->referenceLength == haplotypeLength))
            continue;
        // bases still needed beyond the alternate sequence itself
        int additionalRequiredBases = probeLength - allele->alternateSequence.size();
        DEBUG2(allele << " needs at least " << additionalRequiredBases
              << " bpleft " << allele->read5pNonNullBases() << " bpright " << allele->read3pNonNullBases());
        // NOTE(review): each flank is compared against the full shortfall, not
        // half of it (a per-side value was previously computed here but never
        // used) -- confirm which threshold is intended.
        if (additionalRequiredBases > 0 &&
            (allele->read5pNonNullBases() < additionalRequiredBases
             || allele->read3pNonNullBases() < additionalRequiredBases)) {
            DEBUG("removing " << allele << " as it does not have the required probe length");
            *a = NULL;
        }
    }
    // sweep the entries marked above
    alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
}

// Filters `alleles` in place (pointers only), keeping those that match the
// haplotype window [currentPosition, currentPosition + haplotypeLength).
//
//   * With getAllAllelesInHaplotype set, any allele whose start lies outside
//     that window is dropped first.
//   * A reference allele is kept only if it starts at or before
//     currentPosition and ends at or after the end of the window.
//   * Any other allele is kept only if it starts exactly at currentPosition
//     and its referenceLength equals haplotypeLength.
//
// Some dropped alleles additionally get processed = false (see below).
void AlleleParser::removeNonOverlappingAlleles(vector<Allele*>& alleles, int haplotypeLength, bool getAllAllelesInHaplotype) {
    for (vector<Allele*>::iterator slot = alleles.begin(); slot != alleles.end(); ++slot) {
        Allele* candidate = *slot;

        // test shared by every allele type: must the allele start inside the window?
        if (getAllAllelesInHaplotype
            && !(currentPosition <= candidate->position && candidate->position < currentPosition + haplotypeLength)) {
            *slot = NULL;
            continue;
        }

        if (candidate->type == ALLELE_REFERENCE) {
            // does the reference allele cover the whole haplotype?
            const bool coversHaplotype =
                candidate->position <= currentPosition
                && candidate->position + candidate->referenceLength >= currentPosition + haplotypeLength;
            if (!coversHaplotype) {
                *slot = NULL;
            } else if (currentPosition < candidate->position) { // not there yet
                // NOTE(review): cannot be reached -- coversHaplotype already
                // requires position <= currentPosition.
                candidate->processed = false;
                *slot = NULL;
            }
        } else { // snps, insertions, deletions
            const bool matchesHaplotype =
                currentPosition == candidate->position && candidate->referenceLength == haplotypeLength;
            if (!matchesHaplotype) {
                *slot = NULL;
            } else if (currentPosition + haplotypeLength <= candidate->position) {
                // only possible when haplotypeLength <= 0
                candidate->processed = false;
                *slot = NULL;
            }
        }
    }
    // sweep the entries marked above
    alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
}

// Drops from `alleles` every entry whose quality is below parameters.BQL0 or
// whose currentBase is "N". Each dropped allele has its 'processed' flag
// cleared so that it is evaluated again later.
void AlleleParser::removeFilteredAlleles(vector<Allele*>& alleles) {
    for (vector<Allele*>::size_type i = 0; i < alleles.size(); ++i) {
        Allele* candidate = alleles[i];
        const bool filtered = candidate->quality < parameters.BQL0
                              || candidate->currentBase == "N";
        if (filtered) {
            candidate->processed = false; // force re-processing later
            alleles[i] = NULL;
        }
    }
    // sweep the entries marked above
    alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
}

// Drops from `alleles` every entry that ends at or before currentPosition
// (position + referenceLength <= currentPosition) and marks each such allele
// as processed.
void AlleleParser::removePreviousAlleles(vector<Allele*>& alleles) {
    for (vector<Allele*>::size_type i = 0; i < alleles.size(); ++i) {
        Allele* candidate = alleles[i];
        const bool alreadyPassed =
            candidate->position + candidate->referenceLength <= currentPosition;
        if (alreadyPassed) {
            candidate->processed = true;
            alleles[i] = NULL;
        }
    }
    // sweep the entries marked above
    alleles.erase(remove(alleles.begin(), alleles.end(), (Allele*)NULL), alleles.end());
}

// Advances the parser to the next target (or, without targets / on stdin, to
// the first alignment of the input).
//
// Always clears the registered alignments and resets lastHaplotypeLength
// first. Three cases follow:
//   1. targets present and not reading stdin: load the first target if none
//      is loaded, then step through the remaining targets until one yields a
//      first alignment. If none does, the result of
//      loadNextPositionWithInputVariant() is returned.
//   2. no target loaded yet and (stdin or no targets): optionally load the
//      single region target, fetch the first alignment (false on failure) and
//      position on it.
//   3. a target is already loaded and (stdin or no targets): nothing further
//      to process, return false.
//
// On success the reference sequence for currentSequenceName is loaded,
// justSwitchedTargets is set and true is returned.
bool AlleleParser::toNextTarget(void) {

    clearRegisteredAlignments();

    // reset haplotype length; there is no last call in this sequence; it isn't relevant
    lastHaplotypeLength = 0;

    if (targets.empty() && usingVariantInputAlleles) {
        // we are processing everything, so load the entire input variant allele set
        getAllInputVariants();
    }

    // load first target if we have targets and have not loaded the first
    if (!parameters.useStdin && !targets.empty()) {

        // true once a target has been loaded AND produced a first alignment
        bool ok = false;

        // try to load the first target if we need to
        if (!currentTarget) {
            ok = loadTarget(&targets.front()) && getFirstAlignment();
        }

        // step through targets until we get to one with alignments
        // (currentTarget is advanced as a pointer into `targets`; loadTarget()
        // stores the same pointer back into currentTarget)
        while (!ok && currentTarget != &targets.back()) {
            if (!loadTarget(++currentTarget)) {
                // target could not be loaded; try the next one
                continue;
            }
            // intentional assignment: remember whether this target has alignments
            if (ok = getFirstAlignment()) {
                break;
            }
        }

        // no target with alignments was found; fall back to input variants
        if (!ok) {
            return loadNextPositionWithInputVariant();
        }

    // stdin, no targets cases
    } else if (!currentTarget && (parameters.useStdin || targets.empty())) {
        // if we have a target for limiting the analysis, use it
        // this happens when you specify stdin + a region string
        if (!targets.empty()) {
            currentTarget = &targets.front();
            loadTarget(currentTarget);
        }
        if (!getFirstAlignment()) {
            ERROR("Could not get first alignment from target");
            return false;
        }
        loadNextPositionWithAlignmentOrInputVariant(currentAlignment);
        //loadReferenceSequence(currentAlignment); // this seeds us with new reference sequence
        // however, if we have a target list of variants and we should also respect them
    // we've reached the end of file, or stdin
    // (only reached when currentTarget is already set, since the previous
    // branch tests the same stdin/no-targets condition)
    } else if (parameters.useStdin || targets.empty()) {
        return false;
    }

    loadReferenceSequence(currentSequenceName);

    // flag read elsewhere; set on every successful switch
    justSwitchedTargets = true;
    return true;

}


// TODO refactor this to allow reading from stdin or reading the whole file
// without loading each sequence as a target
//
// Makes `target` the current target and points the readers at it:
//   - loads the reference sequence named by the target
//   - resets currentPosition and rightmostHaplotypeBasisAllelePosition to the
//     target's left bound
//   - restricts the BAM reader to the target's region
//   - if a variant input file is open, tries to restrict it as well (a failure
//     there is only reported as a warning)
//
// Returns false only when the BAM region could not be set; true otherwise.
bool AlleleParser::loadTarget(BedTarget* target) {

    currentTarget = target;

    DEBUG("processing target " << target->desc << " " <<
          target->seq << " " << target->left << " " <<
          target->right + 1);
    DEBUG2("loading target reference subsequence");

    loadReferenceSequence(target->seq);

    DEBUG2("setting new position " << target->left);
    currentPosition = target->left;
    rightmostHaplotypeBasisAllelePosition = target->left;

    // bamtools expects 0-based, half-open coordinates, hence right + 1
    const bool bamRegionSet = bamMultiReader.SetRegion(currentRefID, target->left,
                                                       currentRefID, target->right + 1);
    if (!bamRegionSet) {
        ERROR("Could not SetRegion to " << target->seq << ":" << target->left << ".." << target->right + 1);
        cerr << bamMultiReader.GetErrorString() << endl;
        return false;
    }

    if (variantCallInputFile.is_open()) {
        // tabix expects 1-based, fully closed regions for ti_parse_region()
        // (which is what setRegion() calls eventually)
        stringstream tabixRegion;
        tabixRegion << target->seq << ":" << target->left + 1 << "-" << target->right + 1;
        const bool variantRegionSet = variantCallInputFile.setRegion(tabixRegion.str());
        if (variantRegionSet) {
            DEBUG("set region of variant input file to " << 
                    target->seq << ":" << target->left << ".." <<
                    target->right + 1);
        } else {
            // deliberately not fatal: we carry on without the variant region
            WARNING("Could not set the region of the variants input file to " <<
                    target->seq << ":" << target->left << ".." <<
                    target->right + 1);
        }
    }

    // now that we've jumped, assume the new region has alignments until a
    // read attempt proves otherwise
    hasMoreAlignments = true;

    DEBUG2("set region");

    return true;

}

// Reads alignments from the BAM reader into currentAlignment until a mapped
// one is found.
//
// Returns true when currentAlignment holds a mapped read, false when the
// reader ran out of alignments first (a debug message is emitted in that case).
bool AlleleParser::getFirstAlignment(void) {

    // keep pulling reads while the reader delivers them and they are unmapped
    bool gotRead = bamMultiReader.GetNextAlignment(currentAlignment);
    while (gotRead && !currentAlignment.IsMapped()) {
        gotRead = bamMultiReader.GetNextAlignment(currentAlignment);
    }

    if (gotRead) {
        DEBUG2("got first alignment in target region");
        return true;
    }

    // nothing usable: report with as much location detail as we have
    if (currentTarget) {
        DEBUG("Could not find any mapped reads in target region " << currentSequenceName << ":" << currentTarget->left << ".." << currentTarget->right + 1);
    } else {
        DEBUG("Could not find any mapped reads in target region " << currentSequenceName);
    }

    return false;

}

// Tries to read the next variant from the variant input file into
// *currentVariant and records the outcome in hasMoreVariants.
//
// Returns true when no variant input file is open (hasMoreVariants is left
// false) or when a variant was read; false when the open file yielded nothing.
bool AlleleParser::getFirstVariant(void) {

    hasMoreVariants = false;

    // without an input file there is nothing to fetch, which is not an error
    if (!variantCallInputFile.is_open()) {
        return true;
    }

    const bool gotVariant = variantCallInputFile.getNextVariant(*currentVariant);
    hasMoreVariants = gotVariant;
    if (!gotVariant) {
        return false;
    }

    DEBUG2("got first variant in target region");
    return true;

}

// Empties both the registered allele list and the registered alignment map.
void AlleleParser::clearRegisteredAlignments(void) {
    DEBUG2("clearing registered alignments and alleles");
    // forget the allele pointers first, then release the alignments themselves
    registeredAlleles.clear();
    registeredAlignments.clear();
}

// TODO
// this should be simplified
// there are two modes of operation
// that in which we have targets
// and that without
//
// if we have targets, we need to keep track of which we're in
// and if we're outside of it, try to get to the next one
// and, if we have targets, we will try to jump around the bam file
//
// if we don't have targets we will just GetNextAlignment until we can't
// anymore.  all positionality of the parser will respond to input alignments.
//
// rewrite things so that we aren't strung out between 8 functions
//

// stepping
//
// Advances the parser by one step and refreshes the per-position state.
//
// How the position moves:
//   - if no sequence is loaded yet (currentSequenceName is empty), the first
//     target is loaded through toNextTarget()
//   - otherwise the position advances by one base, unless nothing is
//     registered here and there are no input variant alleles at this position,
//     in which case loadNextPositionWithAlignmentOrInputVariant() picks the
//     next position
//   - if targets are in use and the new position lies past the current
//     target's right bound, toNextTarget() is called
//   - if no targets are in use, unmapped reads are skipped and the reference
//     sequence is switched when required
//
// After the move, the alignment queue and registered alleles are updated and
// registered alignments, haplotype basis alleles and cached repeat counts
// that lie behind the current position are erased.
//
// Returns false when there is nowhere left to go (no further target, no
// further alignments or input variants, or the position could not be
// advanced); true otherwise.
//
bool AlleleParser::toNextPosition(void) {

    // nothing loaded yet: load the first target, or bail out if there is none
    if (currentSequenceName.empty()) {
        DEBUG("loading first target");
        if (!toNextTarget()) {
            return false;
        }
    } 
    // or step to the next position
    else {
        // if there is no data in the pile
        // and the current alignment is far away
        // and there are no more variants right here
        // jump the parser to the next position with an input allele or data
        if (registeredAlignments.empty() && !hasInputVariantAllelesAtCurrentPosition()) {
            // the alignment in hand ends before the current position, so fetch
            // the next mapped one; stop if neither reads nor input variants remain
            if (currentAlignment.GetEndPosition() < currentPosition) {
                hasMoreAlignments = getFirstAlignment();
                if (!hasMoreAlignments && !hasMoreInputVariants()) return false;
            }
            // if this is as far as we can go, bail
            long int lastPosition = currentPosition;
            loadNextPositionWithAlignmentOrInputVariant(currentAlignment);
            // an unchanged position means the jump found nothing further
            if (lastPosition == currentPosition) return false;
            justSwitchedTargets = true;
        } else {
            ++currentPosition;
        }
    }

    // if we've run off the right edge of a target
    if (!targets.empty() && currentPosition > currentTarget->right) { // time to move to a new target
        DEBUG("next position " << (long int) currentPosition <<  " outside of current target right bound " << currentTarget->right + 1);
        // try to get to the next one, and if this fails, bail out
        if (!toNextTarget()) {
            DEBUG("no more targets, finishing");
            return false;
        }
    }

    // in the stdin, or no targets case
    // here we assume we are processing an entire BAM or one contiguous region
    // NOTE(review): this condition is logically equivalent to targets.empty();
    // the useStdin term has no effect on the result
    if ((parameters.useStdin && targets.empty()) || targets.empty()) {
        // here we loop over unaligned reads at the beginning of a target
        // we need to get to a mapped read to figure out where we are
        while (hasMoreAlignments && !currentAlignment.IsMapped()) {
            hasMoreAlignments = bamMultiReader.GetNextAlignment(currentAlignment);
        }
        // now, if the current position of this alignment is outside of the reference sequence length, switch references
        if (hasMoreAlignments) {
            // && binds tighter than ||, so this reads as:
            //   past the end of the sequence
            //   OR (nothing registered AND the alignment is on another reference)
            if (currentPosition >= reference.sequenceLength(currentSequenceName)
                || registeredAlignments.empty() && currentRefID != currentAlignment.RefID) {
                DEBUG("at end of sequence");
                clearRegisteredAlignments();
                loadReferenceSequence(currentAlignment);
                justSwitchedTargets = true;
            }
        // if we have run out of alignments
        // NOTE(review): the !hasMoreAlignments test is always true here, since
        // this is the else of if (hasMoreAlignments)
        } else if (!hasMoreAlignments) {
            if (registeredAlignments.empty()) {
                // keep going only while input variants remain to be processed
                if (hasMoreInputVariants() || hasInputVariantAllelesAtCurrentPosition()) {
                    //loadNextPositionWithAlignmentOrInputVariant(currentAlignment);
                    //justSwitchedTargets = true;
                } else {
                    DEBUG("no more alignments in input");
                    return false;
                }
            // alignments are still registered, but the position has reached the
            // end of the currently loaded sequence
            } else if (currentPosition >= currentSequence.size() + currentSequenceStart) {
                if (hasMoreInputVariants() || hasInputVariantAllelesAtCurrentPosition()) {
                    //loadNextPositionWithAlignmentOrInputVariant(currentAlignment);
                    //justSwitchedTargets = true;
                } else {
                    DEBUG("no more alignments in input");
                    DEBUG("at end of sequence");
                    return false;
                }
            }
        }
    }

    // refresh the cached reference base for the new position
    // (older note: "we have to make sure it's still there; this matters in low-coverage")
    DEBUG2("updating reference sequence cache");
    //preserveReferenceSequenceWindow(CACHED_REFERENCE_WINDOW);
    currentReferenceBase = currentReferenceBaseChar();

    // handle the case in which we don't have targets but in which we've switched reference sequence

    DEBUG("processing position " << (long unsigned int) currentPosition + 1 << " in sequence " << currentSequenceName);
    // pull in alignments for this position and register the alleles they produce
    vector<Allele*> newAlleles;
    updateAlignmentQueue(currentPosition, newAlleles);
    addToRegisteredAlleles(newAlleles);
    DEBUG2("updating variants");
    // done typically at each new read, but this handles the case where there is no data for a while
    //updateInputVariants(currentPosition, 1);

    DEBUG2("updating registered alleles");
    updateRegisteredAlleles(); // this removes unused left-flanking sequence
    //DEBUG2("updating prior variant alleles");
    //updatePriorAlleles();

    // drop registered alignments whose map key lies more than
    // lastHaplotypeLength behind the current position
    // (presumably the key is the alignment end position -- TODO confirm)
    // NOTE(review): the key is unsigned; if currentPosition is a signed type
    // and currentPosition - lastHaplotypeLength can go negative, the comparison
    // would be carried out unsigned -- confirm this cannot happen
    // TODO check that this doesn't leak...
    DEBUG2("erasing old registered alignments");
    map<long unsigned int, deque<RegisteredAlignment> >::iterator f = registeredAlignments.begin();
    while (f != registeredAlignments.end()
           && f->first < currentPosition - lastHaplotypeLength) {
        // post-increment so the iterator has moved on before its element is erased
        registeredAlignments.erase(f++);
    }

    // remove past registered alleles, then sort and de-duplicate what is left
    DEBUG2("marking previous alleles as processed and removing from registered alleles");
    removePreviousAlleles(registeredAlleles);
    sort(registeredAlleles.begin(), registeredAlleles.end());
    registeredAlleles.erase(unique(registeredAlleles.begin(), registeredAlleles.end()), registeredAlleles.end());

    // and do the same for the variants from the input VCF
    /*
    DEBUG2("erasing old input variant alleles");
    if (inputVariantAlleles.find(currentSequenceName) != inputVariantAlleles.end()) {
        map<long int, vector<Allele> >::iterator v = inputVariantAlleles[currentSequenceName].begin();
        while (v != inputVariantAlleles[currentSequenceName].end() && v->first < currentPosition) {
            inputVariantAlleles[currentSequenceName].erase(v++);
        }
    }
    */

    // haplotype basis alleles keyed before the current position are no longer needed
    DEBUG2("erasing old input haplotype basis alleles");
    map<long int, vector<AllelicPrimitive> >::iterator z = haplotypeBasisAlleles.begin();
    while (z != haplotypeBasisAlleles.end() && z->first < currentPosition) {
        haplotypeBasisAlleles.erase(z++);
    }

    // likewise for repeat counts cached for positions we have passed
    DEBUG2("erasing old cached repeat counts");
    map<long int, map<string, int> >::iterator rc = cachedRepeatCounts.begin();
    while (rc != cachedRepeatCounts.end() && rc->first < currentPosition) {
        cachedRepeatCounts.erase(rc++);
    }

    return true;

}

// XXX for testing only: moves to the next target and reads through all of its
// alignments without doing anything with them.
//
// Returns false when toNextTarget() fails, true otherwise.
bool AlleleParser::dummyProcessNextTarget(void) {

    if (toNextTarget()) {
        // drain the reader; each alignment is simply overwritten by the next
        while (bamMultiReader.GetNextAlignment(currentAlignment)) {
        }
        return true;
    }

    DEBUG("no more targets, finishing");
    return false;
}

// Detects allele groups that share an alternate sequence with the reference
// haplotype or with an earlier group and, if any are found, homogenizes the
// alleles and rebuilds `samples` and `alleleGroups`.
//
// Only the first allele of each group is inspected.  When no duplicate is
// found, nothing is modified.
void AlleleParser::removeDuplicateAlleles(Samples& samples, map<string, vector<Allele*> >& alleleGroups, int allowedAlleleTypes, int haplotypeLength, Allele& refallele) {

    string refseq = currentReferenceHaplotype();

    // sequences seen so far; the reference haplotype counts as already seen
    set<string> seenSequences;
    seenSequences.insert(refseq);

    bool foundDuplicate = false;
    for (map<string, vector<Allele*> >::iterator g = alleleGroups.begin();
         !foundDuplicate && g != alleleGroups.end(); ++g) {
        const Allele& representative = *g->second.front();
        // insert() reports whether the sequence was new; "not new" is a duplicate
        foundDuplicate = !seenSequences.insert(representative.alternateSequence).second;
    }

    if (!foundDuplicate) {
        return;
    }

    homogenizeAlleles(alleleGroups, refseq, refallele);
    getAlleles(samples, allowedAlleleTypes, haplotypeLength, false, true);
    alleleGroups.clear();
    groupAlleles(samples, alleleGroups);  // groups by alternate sequence

}

// Adjusts this registered alignment's alleles so that a single allele covers
// the haplotype window [haplotypeStart, haplotypeStart + haplotypeLength).
//
// Alleles overlapping the window are trimmed at its edges (the trimmed-off
// flanks become new alleles) and everything in between is merged into one
// allele.  Alleles consumed by the merge are not explicitly removed but
// 'squashed', and the now-empty ones are erased from `alleles`.
//
// Parameters:
//   haplotypeStart  - first reference position of the window
//   haplotypeLength - window length in reference bases
//   aptr            - out: set to point at the allele in `alleles` that exactly
//                     spans the window, if one exists after fitting; left
//                     untouched otherwise.  It points into `alleles`, so any
//                     later change to that vector invalidates it.
//   allowPartials   - when true, the looser overlap test is used and `alleles`
//                     is NOT restored to its previous contents on failure
//
// Returns true when an allele exactly spanning the window was built (and it is
// not an unflanked indel), and also when nothing was attempted: no alleles,
// no overlap with the window, or non-contiguous alleles.
// Returns false when a NULL allele lies in the window span, or when fitting
// did not produce a usable window-spanning allele.
bool RegisteredAlignment::fitHaplotype(int haplotypeStart, int haplotypeLength, Allele*& aptr, bool allowPartials) {

    // nothing to fit; this also keeps the "a + 1" / "b + 1" iterator
    // arithmetic below from stepping past end() on an empty vector
    if (alleles.empty()) {
        return true;
    }

    const int haplotypeEnd = haplotypeStart + haplotypeLength;

    //if (containedAlleleTypes == ALLELE_REFERENCE) {
    //    return false;
    //}
    /*
    cerr << "start: " << start << " end: " << end << endl;
    cerr << "haplotypestart: " << haplotypeStart << " haplotypeend: " << haplotypeEnd << endl;
    cerr << "registered alignment alleles," << endl << alleles << endl;
    */

    // NOTE(review): with allowPartials the inner test uses ||, which holds for
    // any read with start <= end; callers are presumably expected to pre-filter
    // for overlap.  Left unchanged -- confirm before tightening it to &&.
    const bool overlapsWindow =
        (allowPartials && (start <= haplotypeEnd || end >= haplotypeStart))
        || (start <= haplotypeStart && end >= haplotypeEnd);
    if (!overlapsWindow) {
        // the read does not cover the window; leave it alone
        //cerr << "registered alignment alleles after (pass)," << endl << alleles << endl;
        return true;
    }

    // do not attempt to build haplotype alleles where there are non-contiguous reads
    for (vector<Allele>::iterator p = alleles.begin() + 1; p != alleles.end(); ++p) {
        if (p->position != (p - 1)->position + (p - 1)->referenceLength) {
            //cerr << "non-contiguous reads, cannot construct haplotype allele" << endl;
            return true;
        }
    }

    // find the allele overlapping the window start ("a") and the window end ("b")
    //cerr << "trying to find overlapping haplotype alleles for the range " << haplotypeStart << " to " << haplotypeEnd << endl;
    vector<Allele>::iterator a = alleles.begin();
    while (a + 1 != alleles.end() && a->position + a->referenceLength <= haplotypeStart) {
        ++a;
    }
    vector<Allele>::iterator b = a;
    while (b + 1 != alleles.end() && b->position + b->referenceLength < haplotypeEnd) {
        ++b;
    }

    // conceptually it will be easier to work on the haplotype obs if the reference alleles match the haplotype specification
    //if (a == b && a->isReference()) {
        // break the reference observation
        //cerr << "we just have a reference allele" << endl;
        //return true;
    //}

    // now "a" should overlap the start of the haplotype block, and "b" the end
    //cerr << "block start overlaps: " << *a << endl;
    //cerr << "block end overlaps: " << *b << endl;
    //cerr << "haplotype start: " << haplotypeStart << endl;

    for (vector<Allele>::iterator p = a; p != (b+1); ++p) {
        if (p->isNull()) return false; // can't assemble across NULL alleles
    }

    // from here on `alleles` is modified; keep a copy only when we may have to
    // roll back (partial fits are never rolled back)
    vector<Allele> savedAlleles;
    if (!allowPartials) {
        savedAlleles = alleles;
    }

    // scratch outputs of the subtract* calls; their contents are not used here
    string seq;
    vector<pair<int, string> > cigar;
    vector<short> quals;

    // alleles split off at the window edges, appended once merging is done
    vector<Allele> newAlleles;

    // adjust a to match the start of the haplotype block
    if (a->position < haplotypeStart) {
        // split a at the window start: the copy keeps the part before the
        // window, a keeps the part inside it
        Allele newAllele = *a;
        newAllele.subtractFromEnd(a->position + a->referenceLength - haplotypeStart, seq, cigar, quals);
        a->subtractFromStart(haplotypeStart - a->position, seq, cigar, quals);
        newAlleles.push_back(newAllele);
    }

    // adjust b to match the end of the haplotype block
    if (b->position + b->referenceLength > haplotypeEnd) {
        // the copy is what would remain to the right of the window
        Allele newAllele = *b;
        newAllele.subtractFromStart(haplotypeEnd - b->position, seq, cigar, quals);
        if (isUnflankedIndel(newAllele)) {
            // splitting here would leave an unflanked indel on the right, so
            // do not split; extend the merge by one allele if there is one
            if (b + 1 != alleles.end()) {
                ++b;
            }
        } else {
            b->subtractFromEnd(b->position + b->referenceLength - haplotypeEnd, seq, cigar, quals);
            newAlleles.push_back(newAllele);
        }
    }

    // now, for everything between a and b, merge them into one allele
    while (a != b) {
        vector<pair<int, string> > cigarV = splitCigar(a->cigar);
        vector<Allele>::iterator p = a + 1;
        // update the quality of the merged allele in the same way as we do
        // for complex events
        if (!a->isReference() && !a->isNull())  {
            p->quality = min(a->quality, p->quality);  // note that phred and log are inverted
            p->lnquality = max(a->lnquality, p->lnquality);
        }
        p->addToStart(a->alternateSequence, cigarV, a->baseQualities);
        a->squash();
        ++a;
    }

    // remove any 0-length alleles, these are useless
    // this operation requires independent removal of references to these alleles (e.g. registeredAlleles.clear())
    alleles.erase(remove_if(alleles.begin(), alleles.end(), isEmptyAllele), alleles.end());

    alleles.insert(alleles.end(), newAlleles.begin(), newAlleles.end());

    AllelePositionCompare apcomp;
    sort(alleles.begin(), alleles.end(), apcomp);

    // erasing, appending and sorting moved the elements around, so look up the
    // window-spanning allele again
    //cerr << "registered alignment alleles, after haplotype construction," << endl << alleles << endl;
    bool hasHaplotypeAllele = false;
    for (vector<Allele>::iterator p = alleles.begin(); p != alleles.end(); ++p) {
        // fix the "base"
        if (!p->isReference()) {
            p->update(haplotypeLength);
        }
        //cerr << *p << endl;
        if (p->position == haplotypeStart && p->position + p->referenceLength == haplotypeEnd) {
            aptr = &*p;
            // an unflanked indel spanning the window does not count as a
            // usable haplotype allele
            hasHaplotypeAllele = !isUnflankedIndel(*p);
            break;
        }
    }

    if (hasHaplotypeAllele) {
        //cerr << "registered alignment alleles after (pass)," << endl << alleles << endl;
        return true;
    }

    if (!allowPartials) {
        alleles = savedAlleles; // reset alleles
    }
    //cerr << "registered alignment alleles after (fail)," << endl << alleles << endl;
    return false;
    //assert(hasHaplotypeAllele);

}

void AlleleParser::buildHaplotypeAlleles(
    vector<Allele>& alleles,
    Samples& samples,
    map<string, vector<Allele*> >& alleleGroups,
    // provides observation group counts, counts of partial observations
    map<string, vector<Allele*> >& partialObservationGroups,
    map<Allele*, set<Allele*> >& partialObservationSupport,
    int allowedAlleleTypes) {

    int haplotypeLength = 1;
    for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
        Allele& allele = *a;
        if (allele.isReference()) continue;
        // check if there are any complex alleles
        if (allele.referenceLength > haplotypeLength) {
            DEBUG("reference length of " << allele << " is " << allele.referenceLength
                  << " so extending haplotype");
            haplotypeLength = allele.referenceLength;
        }
        // check if we are embedded in a repeat structure
        if (allele.repeatRightBoundary > currentPosition + haplotypeLength) {
            DEBUG("right boundary " << allele.repeatRightBoundary << " for " << allele << " is past "
                  << currentPosition + haplotypeLength);
            haplotypeLength = allele.repeatRightBoundary - currentPosition;
        }
    }

    // return here if we have no registered alignments
    if (registeredAlignments.empty()) return;

    // always attempt to determine haplotype length in this fashion
    {

        DEBUG("haplotype length is " << haplotypeLength);

        // NB: for indels in tandem repeats, if the indel sequence is
        // derived from the repeat structure, build the haplotype
        // across the entire repeat pattern.  This ensures we actually
        // can discriminate between reference and indel/complex
        // alleles in the most common misalignment case.  For indels
        // that match the repeat structure, we have cached the right
        // boundary of the repeat.  We build the haplotype to the
        // maximal boundary indicated by the present alleles.

        int oldHaplotypeLength = haplotypeLength;
        do {
            oldHaplotypeLength = haplotypeLength;

            // rebuild everything...
            registeredAlleles.clear();
            samples.clear();

            long int maxAlignmentEnd = registeredAlignments.rbegin()->first;
            for (long int i = currentPosition+1; i < maxAlignmentEnd; ++i) {
                deque<RegisteredAlignment>& ras = registeredAlignments[i];
                for (deque<RegisteredAlignment>::iterator r = ras.begin(); r != ras.end(); ++r) {
                    RegisteredAlignment& ra = *r;
                    if (ra.start > currentPosition && ra.start < currentPosition + haplotypeLength
                        || ra.end > currentPosition && ra.end < currentPosition + haplotypeLength) {
                        Allele* aptr;
                        bool allowPartials = true;
                        ra.fitHaplotype(currentPosition, haplotypeLength, aptr, allowPartials);
                        for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
                            registeredAlleles.push_back(&*a);
                        }
                    }
                }
            }

            getAlleles(samples, allowedAlleleTypes, haplotypeLength, true, true);
            alleleGroups.clear();
            groupAlleles(samples, alleleGroups);
            alleles = genotypeAlleles(alleleGroups, samples, parameters.onlyUseInputAlleles);
            for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
                Allele& allele = *a;
                //cerr << "genotype allele, in haplotype length determination " << allele << endl;
                if (!allele.isReference()) {
                    long int alleleend = (allele.position + allele.referenceLength);
                    // this adjustment forces reference observations to overlap the ends of the indels
                    //if (allele.isInsertion() || allele.isDeletion()) {
                    //    alleleend += 1;
                    //}
                    long int hapend = max((long int) alleleend,
                                          allele.repeatRightBoundary);
                    /*
                    cerr << currentPosition + haplotypeLength << " vs " << alleleend
                         << " end " << hapend << " ? " << allele.position + allele.referenceLengthFromCigar()
                         << " hapend for " << allele << endl;
                    */
                    if (hapend > currentPosition + haplotypeLength) {
                        DEBUG("adjusting haplotype length to " << hapend - currentPosition
                              << " to overlap allele end " << alleleend
                              << " or right repeat boundary " << allele.repeatRightBoundary
                              << " " << allele);
                        haplotypeLength = hapend - currentPosition;
                    }
                }
            }
        } while (haplotypeLength != oldHaplotypeLength); // && haplotypeLength < parameters.maxHaplotypeLength);


        // TODO?
        //haplotypeLength = min(parameters.maxHaplotypeLength, haplotypeLength);

        // TODO adjust haplotypes over indels to include +1 bp on 3' end
        // this will force reference observations across the entire allele

        // for each non-reference allele within the haplotype length of this
        // position, adjust the length and reference sequences of the adjacent
        // alleles 
        DEBUG("fitting haplotype block " << currentPosition << " to " << currentPosition + haplotypeLength << ", " << haplotypeLength << "bp");

        lastHaplotypeLength = haplotypeLength;

        registeredAlleles.clear();
        samples.clear();

        vector<Allele*> haplotypeObservations;
        getCompleteObservationsOfHaplotype(samples, haplotypeLength, haplotypeObservations);
        addToRegisteredAlleles(haplotypeObservations);
        DEBUG("added to registered alleles");

        // add partial observations
        // first get all the alleles up to the end of the haplotype window
        vector<Allele*> partialHaplotypeObservations;
        if (parameters.usePartialObservations && haplotypeLength > 1) {
            getPartialObservationsOfHaplotype(samples, haplotypeLength, partialHaplotypeObservations);
        }
        DEBUG("got partial observations of haplotype");
        //addToRegisteredAlleles(partialHaplotypeObservations);
        // now align the sequences of these alleles to the haplotype alleles
        // and put them into the partials bin in each sample

        // correct quality and alternate sequence for reference
        for (vector<Allele*>::iterator h = haplotypeObservations.begin(); h != haplotypeObservations.end(); ++h) {
            if ((*h)->position == currentPosition && (*h)->referenceLength == haplotypeLength) {
                (*h)->currentBase = (*h)->alternateSequence;
                (*h)->setQuality();
                (*h)->update(haplotypeLength);
                if ((*h)->isReference()) {  // HACK.. undoes damage of update() call
                    (*h)->currentBase = (*h)->alternateSequence;
                }
            }
        }
        for (vector<Allele*>::iterator p = partialHaplotypeObservations.begin(); p != partialHaplotypeObservations.end(); ++p) {
            (*p)->currentBase = (*p)->alternateSequence;
            (*p)->setQuality();
            (*p)->update(haplotypeLength);
        }
        DEBUG("done updating");

        // debugging
        /*
        for (vector<Allele*>::iterator h = haplotypeObservations.begin(); h != haplotypeObservations.end(); ++h) {
            cerr << "haplo_obs\t" << *h << endl;
        }
        */

        if (parameters.debug) {
            cerr << "refr_seq\t" << currentPosition << "\t\t" << reference.getSubSequence(currentSequenceName, currentPosition, haplotypeLength) << endl;
            for (vector<Allele*>::iterator h = haplotypeObservations.begin(); h != haplotypeObservations.end(); ++h) {
                if ((*h)->position == currentPosition && (*h)->referenceLength == haplotypeLength) {
                    cerr << "haplo_obs\t" << (*h)->position << "\t" << (*h)->lnquality << "\t"
                        //<< (*h)->currentBase << "\t"
                         << string(max((long int)0,(*h)->position-currentPosition), ' ')
                         << (*h)->alternateSequence << "\t" << *h << endl;
                }
            }
            for (vector<Allele*>::iterator p = partialHaplotypeObservations.begin(); p != partialHaplotypeObservations.end(); ++p) {
                if ((*p)->position >= currentPosition && (*p)->position < currentPosition+haplotypeLength) {
                    cerr << "part_obs\t" << (*p)->position << "\t" << (*p)->lnquality << "\t"
                        //<< (*p)->currentBase << "\t"
                         << string(max((long int)0,(*p)->position-currentPosition), ' ')
                         << (*p)->alternateSequence << "\t" << *p << endl;
                }
            }
        }

        // now re-get the alleles
        getAlleles(samples, allowedAlleleTypes, haplotypeLength, false, true);

        // re-group the alleles using groupAlleles()
        alleleGroups.clear();
        groupAlleles(samples, alleleGroups);

        /*
        for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
            cerr << s->first << endl;
            for (Sample::iterator t = s->second.begin(); t != s->second.end(); ++t) {
                cerr << t->first << " " << t->second << endl << endl;
            }
        }
        */

        Allele refAllele = genotypeAllele(ALLELE_REFERENCE,
                                          uppercase(reference.getSubSequence(currentSequenceName, currentPosition, haplotypeLength)),
                                          haplotypeLength,
                                          convert(haplotypeLength)+"M",
                                          haplotypeLength,
                                          currentPosition);

        // are there two alleles with the same alt sequence?
        // if so, homogenize them, and then re-sort the alleles

        // ensure uniqueness of registered alleles
        sort(registeredAlleles.begin(), registeredAlleles.end());
        registeredAlleles.erase(unique(registeredAlleles.begin(), registeredAlleles.end()), registeredAlleles.end());

        removeDuplicateAlleles(samples, alleleGroups, allowedAlleleTypes, haplotypeLength, refAllele);

        alleles = genotypeAlleles(alleleGroups, samples, parameters.onlyUseInputAlleles, haplotypeLength);

        // require all complete observations to effectively cover the same amount of sequence
        // basically, the "probe" length should be the same or we will incur bias when generating likelihoods
        // should these be put into the partial observations bin?

        int maxAlleleLength = haplotypeLength;
        for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            // get max allele length
            if (a->alternateSequence.size() > maxAlleleLength) maxAlleleLength = a->alternateSequence.size();
        }

        // bound this to 50bp so as to not drop out reference obs when we have long insertions directly encoded in the reads
        maxAlleleLength = min(50, maxAlleleLength);
        //cerr << "max allele length is " << maxAlleleLength << " but haplotype length = " << haplotypeLength << endl;
        // XXX make work for deletions as well
        if (maxAlleleLength > haplotypeLength) {
            //cerr << "max allele length = " << maxAlleleLength << endl;
            removeAllelesWithoutReadSpan(registeredAlleles, maxAlleleLength, haplotypeLength);
            samples.clear();
            // require that reference obs are over an equivalent amount of sequence as the max allele length
            getAlleles(samples, allowedAlleleTypes, haplotypeLength, false, true);
            alleleGroups.clear();
            groupAlleles(samples, alleleGroups);  // groups by alternate sequence
            // establish alleles again, now that we've filtered observations which don't have the required probe length
            alleles = genotypeAlleles(alleleGroups, samples, parameters.onlyUseInputAlleles, haplotypeLength);
        }

        // force the ref allele into the analysis, if it somehow isn't supported
        // this can happen where we don't have sufficient read span, such as in long deletions
        // or where our samples are homozygous for an alternate
        if (!parameters.useRefAllele) {
            vector<Allele> refAlleleVector;
            refAlleleVector.push_back(refAllele);
            alleles = alleleUnion(alleles, refAlleleVector);
        }

        // this is where we have established our genotype alleles
        /*
        for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            cerr << "genotype allele " << &*a << " " << *a << endl;
        }
        */

        // pick up observations that are potentially partial (not unambiguous)
        // the way to do this is to test the full observations as if they are partial, and if they
        // end up partially supporting multiple observations, removing them from the "complete" observations

        if (parameters.usePartialObservations && haplotypeLength > 1) {
            // check this out
            // here we are going to pass a set of full haplotype observations
            // and we'll remove now-partial obs from the full set
            samples.assignPartialSupport(alleles,
                                         haplotypeObservations,
                                         partialObservationGroups,
                                         partialObservationSupport,
                                         currentPosition,
                                         haplotypeLength);

            vector<Allele*> pureHaplotypeObservations;
            for (vector<Allele*>::iterator h = haplotypeObservations.begin(); h != haplotypeObservations.end(); ++h) {
                //if (partialObservationSupport.find(*h) != partialObservationSupport.end())
                //cerr << "partials for " << **h << " are " << partialObservationSupport[*h].size() << endl;
                if (partialObservationSupport.find(*h) != partialObservationSupport.end()
                    && partialObservationSupport[*h].size() > 0) {
                    DEBUG("full obs " << **h << " is actually partial and supports "
                          << partialObservationSupport[*h].size() << " alleles");
                    partialObservationSupport.erase(*h);
                    // and remove from partial observation groups?
                } else {
                    //cerr << "saving " << *h << endl;
                    pureHaplotypeObservations.push_back(*h);
                }
            }

            // now regenerate partial observation groups using updated partial support
            partialObservationGroups.clear();
            for (map<Allele*, set<Allele*> >::iterator p = partialObservationSupport.begin();
                 p != partialObservationSupport.end(); ++p) {
                set<Allele*>& supported = p->second;
                for (set<Allele*>::iterator s = supported.begin(); s != supported.end(); ++s) {
                    partialObservationGroups[(*s)->currentBase].push_back(p->first);
                }
            }

            // and keep only the pure haplotype observations for further use
            haplotypeObservations = pureHaplotypeObservations;

            addToRegisteredAlleles(haplotypeObservations);

            // clean up potential duplicates
            sort(registeredAlleles.begin(), registeredAlleles.end());
            registeredAlleles.erase(unique(registeredAlleles.begin(), registeredAlleles.end()), registeredAlleles.end());

            samples.clearFullObservations();
            getAlleles(samples, allowedAlleleTypes, haplotypeLength, false, true);
            alleleGroups.clear();
            groupAlleles(samples, alleleGroups);

            // stash partials for later
            addToRegisteredAlleles(partialHaplotypeObservations);

            for (vector<Allele*>::iterator p = partialHaplotypeObservations.begin(); p != partialHaplotypeObservations.end(); ++p) {
                (*p)->currentBase = (*p)->alternateSequence;
                (*p)->setQuality();
                (*p)->update(haplotypeLength);
            }
            
            // now add in partial observations collected from partially-overlapping reads
            if (!partialHaplotypeObservations.empty()) {
                samples.assignPartialSupport(alleles,
                                             partialHaplotypeObservations,
                                             partialObservationGroups,
                                             partialObservationSupport,
                                             currentPosition,
                                             haplotypeLength);
            }
        }

        registeredAlleles.clear();

        // reset registered alleles
        for (map<long unsigned int, deque<RegisteredAlignment> >::iterator ras = registeredAlignments.begin(); ras != registeredAlignments.end(); ++ras) {
            deque<RegisteredAlignment>& rq = ras->second;
            for (deque<RegisteredAlignment>::iterator rai = rq.begin(); rai != rq.end(); ++rai) {
                RegisteredAlignment& ra = *rai;
                for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
                    registeredAlleles.push_back(&*a);
                }
            }
        }

        if (!parameters.useRefAllele) {
            vector<Allele> refAlleleVector;
            refAlleleVector.push_back(refAllele);
            alleles = alleleUnion(alleles, refAlleleVector);
        }

        //removeDuplicateAlleles(samples, alleleGroups, allowedAlleleTypes, haplotypeLength);
        //alleles = genotypeAlleles(alleleGroups, samples, parameters.onlyUseInputAlleles, haplotypeLength);

    }

    // hack......... TODO unhack this and set in Sample class
    samples.setSupportedAlleles();

    // processed flag..
    //unsetAllProcessedFlags();

    // redundant?

    // remove alleles which should no longer be considered
    removePreviousAlleles(registeredAlleles);

    lastHaplotypeLength = haplotypeLength;

}

// Collects the allele observations that exactly span the current haplotype
// window, i.e. [currentPosition, currentPosition + haplotypeLength).
//
// Every registered alignment that fully covers the window is passed to
// RegisteredAlignment::fitHaplotype.  When the fit succeeds, pointers to the
// alignment's alleles which start at currentPosition and whose
// referenceLength equals haplotypeLength are appended to
// haplotypeObservations.  Alignments that could not be fit contribute nothing.
//
// samples is not used here.
//
// Always returns true.  The function is declared to return bool but previously
// fell off its end without returning, which is undefined behaviour.
bool AlleleParser::getCompleteObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& haplotypeObservations) {
    for (map<long unsigned int, deque<RegisteredAlignment> >::iterator ras = registeredAlignments.begin(); ras != registeredAlignments.end(); ++ras) {
        deque<RegisteredAlignment>& rq = ras->second;
        for (deque<RegisteredAlignment>::iterator rai = rq.begin(); rai != rq.end(); ++rai) {
            RegisteredAlignment& ra = *rai;
            // this guard prevents trashing allele pointers when getting partial observations:
            // only alignments which completely cover the window are refit
            if (!(ra.start <= currentPosition && ra.end >= currentPosition + haplotypeLength)) {
                continue;
            }
            // handed to fitHaplotype; initialised so it never carries an indeterminate value
            Allele* aptr = NULL;
            if (!ra.fitHaplotype(currentPosition, haplotypeLength, aptr)) {
                // could not fit this observation to the haplotype window; skip it
                continue;
            }
            for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
                if (a->position == currentPosition && a->referenceLength == haplotypeLength) {
                    haplotypeObservations.push_back(&*a);
                }
            }
        }
    }
    DEBUG("got complete observations of haplotype");
    return true;
}

void AlleleParser::unsetAllProcessedFlags(void) {
    for (map<long unsigned int, deque<RegisteredAlignment> >::iterator ras = registeredAlignments.begin(); ras != registeredAlignments.end(); ++ras) {
        deque<RegisteredAlignment>& rq = ras->second;
        for (deque<RegisteredAlignment>::iterator rai = rq.begin(); rai != rq.end(); ++rai) {
            RegisteredAlignment& ra = *rai;
            Allele* aptr;
            for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
                a->processed = false; // re-trigger use of all alleles
            }
        }
    }
}


// Process the next haplotypeLength bp of alignments, so as to get allele
// observations which only partially overlap our calling window
// [currentPosition, currentPosition + haplotypeLength).
//
// The alignment queue is first advanced to the end of the window.  Then every
// registered alignment stored at positions after currentPosition is examined:
//   - alignments which start or end strictly inside the window are refit with
//     partial observations allowed; their non-null alleles which start inside
//     the window are appended to partials, the rest are kept as "other"
//     observations;
//   - all alleles of the remaining alignments are kept as "other" observations.
// The "other" observations are handed to addToRegisteredAlleles at the end.
//
// samples is not used here.
//
// Always returns true.  The function is declared to return bool but previously
// fell off its end without returning, which is undefined behaviour.
bool AlleleParser::getPartialObservationsOfHaplotype(Samples& samples, int haplotypeLength, vector<Allele*>& partials) {
    //cerr << "getting partial observations of haplotype from " << currentPosition << " to " << currentPosition + haplotypeLength << endl;
    vector<Allele*> newAlleles;

    bool gettingPartials = true;
    DEBUG("in AlleleParser::getPartialObservationsOfHaplotype, updating alignment queue");
    updateAlignmentQueue(currentPosition + haplotypeLength, newAlleles, gettingPartials);
    DEBUG("in AlleleParser::getPartialObservationsOfHaplotype, done updating alignment queue");

    vector<Allele*> otherObs;

    // now get the partial obs
    // rbegin() may only be dereferenced when there is at least one registered position
    if (!registeredAlignments.empty()) {
        // get the max alignment end position, iterate to there
        long int maxAlignmentEnd = registeredAlignments.rbegin()->first;
        for (long int i = currentPosition+1; i < maxAlignmentEnd; ++i) {
            DEBUG("getting partial observations of haplotype @" << i);
            // NOTE: operator[] creates an empty entry for positions without alignments;
            // this matches the long-standing behaviour and is kept deliberately
            deque<RegisteredAlignment>& ras = registeredAlignments[i];
            for (deque<RegisteredAlignment>::iterator r = ras.begin(); r != ras.end(); ++r) {
                RegisteredAlignment& ra = *r;
                bool startsInWindow = (ra.start > currentPosition && ra.start < currentPosition + haplotypeLength);
                bool endsInWindow = (ra.end > currentPosition && ra.end < currentPosition + haplotypeLength);
                if (startsInWindow || endsInWindow) {
                    // handed to fitHaplotype; initialised so it never carries an indeterminate value
                    Allele* aptr = NULL;
                    bool allowPartials = true;
                    ra.fitHaplotype(currentPosition, haplotypeLength, aptr, allowPartials);
                    for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
                        if (a->position >= currentPosition
                            && a->position < currentPosition+haplotypeLength
                            && !a->isNull()) {
                            partials.push_back(&*a);
                        } else {
                            otherObs.push_back(&*a);
                        }
                    }
                } else {
                    for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
                        otherObs.push_back(&*a);
                    }
                }
            }
        }
    }

    addToRegisteredAlleles(otherObs);
    return true;
}

// Steps forward past the haplotype that was handled last, refreshing the
// per-sample alleles at every position visited on the way.
//
// Returns false as soon as toNextPosition() reports that it could not advance,
// true otherwise.  On success lastHaplotypeLength is reset to 1.
bool AlleleParser::getNextAlleles(Samples& samples, int allowedAlleleTypes) {
    // first position that lies beyond the previously handled haplotype
    long int stopPosition = currentPosition + lastHaplotypeLength;

    for (;;) {
        if (!(currentPosition < stopPosition)) {
            break;
        }
        if (!toNextPosition()) {
            return false;
        }
        if (justSwitchedTargets) {
            // after a target switch, stop stepping once this position has been handled
            stopPosition = 0;
            justSwitchedTargets = false;
        }
        getAlleles(samples, allowedAlleleTypes);
    }

    lastHaplotypeLength = 1;
    return true;
}

// Rebuilds samples from the registered alleles which are relevant to the
// window [currentPosition, currentPosition + haplotypeLength).
//
//   samples                   cleared, then refilled: sample -> base -> observations
//   allowedAlleleTypes        bitmask; only alleles whose type is in the mask are used
//   haplotypeLength           length of the window on the reference
//   getAllAllelesInHaplotype  when set (and haplotypeLength > 1), also pick up
//                             non-reference alleles that merely start inside the
//                             window; in this mode alleles are not marked processed
//   ignoreProcessedFlag       when set, alleles already marked processed are used again
//
// Groups left without observations, samples left without groups, and samples
// whose current ploidy is 0 are removed from samples before returning.
void AlleleParser::getAlleles(Samples& samples, int allowedAlleleTypes,
                              int haplotypeLength, bool getAllAllelesInHaplotype,
                              bool ignoreProcessedFlag) {

    DEBUG2("getting alleles");

    // start from a clean slate for every sample we already know about
    // TODO this should be optimized for better scanning performance
    for (Samples::iterator entry = samples.begin(); entry != samples.end(); ++entry) {
        entry->second.clear();
    }

    // add the reference allele to the analysis
    if (parameters.useRefAllele) {
        if (currentReferenceAllele != NULL) {
            delete currentReferenceAllele; // clean up after last position
        }
        currentReferenceAllele = referenceAllele(parameters.MQR, parameters.BQR);
        samples[referenceSampleName].clear();
        samples[referenceSampleName][currentReferenceAllele->currentBase].push_back(currentReferenceAllele);
    }

    // get the variant alleles *at* the current position
    // and the reference alleles *overlapping* the current position
    for (vector<Allele*>::const_iterator it = registeredAlleles.begin(); it != registeredAlleles.end(); ++it) {
        Allele* observation = *it;
        Allele& allele = *observation;

        if (allele.processed && !ignoreProcessedFlag) {
            continue;
        }
        if (!(allowedAlleleTypes & allele.type)) {
            continue;
        }

        bool relevant = false;
        if (haplotypeLength > 1) {
            // reference observation covering the whole window
            bool spanningReference = allele.type == ALLELE_REFERENCE
                && allele.position <= currentPosition
                && allele.position + allele.referenceLength >= currentPosition + haplotypeLength;
            // observation which matches the window exactly
            bool exactMatch = allele.position == currentPosition
                && allele.referenceLength == haplotypeLength;
            // any non-reference observation starting inside the window, on request
            bool containedVariant = getAllAllelesInHaplotype
                && allele.type != ALLELE_REFERENCE
                && allele.position >= currentPosition
                && allele.position < currentPosition + haplotypeLength;
            relevant = spanningReference || exactMatch || containedVariant;
        } else if (haplotypeLength == 1) {
            // reference observation overlapping the current position
            bool overlappingReference = allele.type == ALLELE_REFERENCE
                && allele.position <= currentPosition
                && allele.position + allele.referenceLength > currentPosition;
            relevant = overlappingReference || allele.position == currentPosition;
        }
        if (!relevant) {
            continue;
        }

        allele.update(haplotypeLength);

        // filters haplotype construction chaff
        bool usable = allele.quality >= parameters.BQL0
            && allele.currentBase != "N"
            && (allele.isReference() || !allele.alternateSequence.empty());
        if (!usable) {
            continue;
        }

        samples[allele.sampleID][allele.currentBase].push_back(observation);

        // XXX testing
        if (getAllAllelesInHaplotype) {
            continue;
        }
        allele.processed = true;

        // sanity check: outside of "get everything" mode, a non-reference allele
        // must coincide exactly with the haplotype window
        if (haplotypeLength > 1
            && !allele.isReference()
            && !(allele.position == currentPosition && allele.referenceLength == haplotypeLength)) {
            cerr << "non-reference allele should not be added to result alleles because it does not match the haplotype!:" << endl;
            cerr << "haplotype is from " << currentPosition << " to " << currentPosition + haplotypeLength << ", " << haplotypeLength << "bp" << endl;
            cerr << allele << endl;
            assert(false);
        }
    }

    // now remove empty alleles from our return so as to not confuse processing;
    // names are collected first so nothing is erased while it is being iterated
    vector<string> emptySamples;
    for (Samples::iterator entry = samples.begin(); entry != samples.end(); ++entry) {

        const string& sampleName = entry->first;
        Sample& sample = entry->second;

        bool hasObservations = false;
        vector<string> emptyGroups;
        for (Sample::iterator group = sample.begin(); group != sample.end(); ++group) {
            if (group->second.empty()) {
                emptyGroups.push_back(group->first);
            } else {
                hasObservations = true;
            }
        }

        // remove any empty groups which remain
        for (vector<string>::size_type i = 0; i < emptyGroups.size(); ++i) {
            sample.erase(emptyGroups[i]);
        }

        // and remove the entire sample if it has no alleles
        if (!hasObservations || currentSamplePloidy(sampleName) == 0) {
            emptySamples.push_back(sampleName);
        }
    }

    for (vector<string>::size_type i = 0; i < emptySamples.size(); ++i) {
        samples.erase(emptySamples[i]);
    }

    DEBUG2("done getting alleles");

}

// Allocates a new ALLELE_REFERENCE allele describing the reference at the
// current position, carrying the given mapping quality (mapQ) and base
// quality (baseQ).  The returned allele is flagged as a genotype allele.
//
// The allele is created with new; the caller takes over the pointer.
Allele* AlleleParser::referenceAllele(int mapQ, int baseQ) {
    string referenceBases = currentReferenceBaseString();
    // the sequence name doubles as the source name; this behavior matches old bambayes
    string sourceName = currentSequenceName;
    string technology = "reference";
    string qualityString = "";

    Allele* refObservation =
        new Allele(ALLELE_REFERENCE,
                   currentSequenceName,
                   currentPosition,
                   &currentPosition,
                   &currentReferenceBase,
                   1,
                   currentPosition + 1,
                   0,
                   0,
                   referenceBases,
                   sourceName,
                   sourceName,
                   sourceName,
                   technology,
                   true,
                   baseQ,
                   qualityString,
                   mapQ,
                   false,
                   false,
                   false,
                   "1M",
                   NULL,
                   currentPosition,
                   currentPosition+1); // pair information

    refObservation->genotypeAllele = true;
    refObservation->baseQualities.push_back(baseQ);
    refObservation->update();

    return refObservation;
}

// Determines the set of genotype alleles to evaluate at the current position.
//
// The work happens in four stages:
//   1. Every allele group is screened population-wide: its observation count
//      must reach parameters.minAltTotal, and the summed base and mapping
//      qualities of its observations must reach
//      parameters.minSupportingAlleleQualitySum and
//      parameters.minSupportingMappingQualitySum.  A genotype allele is built
//      from the first observation of each surviving group.
//   2. A candidate is kept only if at least one sample supports it with
//      enough quality (minAltQSum), count (minAltCount) and fraction of that
//      sample's observations (minAltFraction).
//   3. Depending on parameters.useBestNAlleles, either all surviving alleles
//      are used, or they are ordered with AllelePairIntCompare and truncated
//      to useBestNAlleles.  With parameters.forceRefAllele the reference
//      allele is put at the front of the list when it is not already present.
//   4. Input variant alleles registered for the current position are appended
//      unless an equivalent allele is already in the result.  When
//      useOnlyInputAlleles is set, the alleles from stages 1-3 are discarded
//      first.
//
// alleleGroups  observations grouped by equivalence
// samples       observations grouped by sample
// Returns the resulting genotype alleles by value.
vector<Allele> AlleleParser::genotypeAlleles(
    map<string, vector<Allele*> >& alleleGroups, // alleles grouped by equivalence
    Samples& samples, // alleles grouped by sample
    bool useOnlyInputAlleles,
    int haplotypeLength
    ) {

    // candidate genotype alleles, each paired with the summed quality of its observations
    vector<pair<Allele, int> > unfilteredAlleles;

    DEBUG("getting genotype alleles");

    // stage 1: population-wide screening of each allele group
    for (map<string, vector<Allele*> >::iterator group = alleleGroups.begin(); group != alleleGroups.end(); ++group) {
        // for each allele that we're going to evaluate, the group has to contain enough
        // observations, and the summed base and mapping qualities of those observations
        // have to reach the configured minimums
        DEBUG("allele group " << group->first);
        vector<Allele*>& alleles = group->second;
        DEBUG(alleles);
        // only the first observation of the group is inspected here
        if (!allATGC(group->second.front()->alternateSequence)) {
            DEBUG("allele group contains partially-null observations, skipping");
            continue;
        }
        if (alleles.size() < parameters.minAltTotal) {
            DEBUG("allele group lacks sufficient observations in the whole population (min-alternate-total)");
            continue;
        }
        // NOTE(review): passesFilters is never read in this function
        bool passesFilters = false;
        int qSum = 0;
        int mqSum = 0;
        for (vector<Allele*>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
            DEBUG2("allele " << **a);
            Allele& allele = **a;
            qSum += allele.quality;
            mqSum += allele.mapQuality;
        }
        if (qSum >= parameters.minSupportingAlleleQualitySum && mqSum >= parameters.minSupportingMappingQualitySum) {
            // the first observation serves as the template for the genotype allele
            Allele& allele = *(alleles.front());
            int length = allele.length;
            int reflength = allele.referenceLength;
            string altseq = allele.alternateSequence;
            if (allele.type == ALLELE_REFERENCE) {
                // reference alleles are normalized to cover exactly the haplotype window
                length = haplotypeLength;
                reflength = haplotypeLength;
                if (haplotypeLength == 1) {
                    altseq = currentReferenceBase;
                } else {
                    altseq = uppercase(reference.getSubSequence(currentSequenceName, currentPosition, haplotypeLength));
                }
            }
            unfilteredAlleles.push_back(make_pair(genotypeAllele(allele.type,
                                                                 altseq,
                                                                 length,
                                                                 allele.cigar,
                                                                 reflength,
                                                                 allele.position,
                                                                 allele.repeatRightBoundary), qSum));
        }
    }
    DEBUG("found genotype alleles");

    // stage 2: candidates which have sufficient support in at least one sample,
    // mapped to the population-wide quality sum computed in stage 1
    map<Allele, int> filteredAlleles;

    DEBUG("filtering genotype alleles which are not supported by at least " << parameters.minAltCount 
           << " observations comprising at least " << parameters.minAltFraction << " of the observations in a single individual");
    for (vector<pair<Allele, int> >::iterator p = unfilteredAlleles.begin();
         p != unfilteredAlleles.end(); ++p) {

        Allele& genotypeAllele = p->first;
        int qSum = p->second;
        DEBUG("genotype allele: " << genotypeAllele << " qsum " << qSum);

        for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
            Sample& sample = s->second; 
            // support for this allele within the sample: number of observations and their summed quality
            int alleleCount = 0;
            int qsum = 0;
            Sample::iterator c = sample.find(genotypeAllele.currentBase);
            if (c != sample.end()) {
                vector<Allele*>& obs = c->second;
                alleleCount = obs.size();
                for (vector<Allele*>::iterator a = obs.begin(); a != obs.end(); ++a) {
                    Allele& allele = **a;
                    qsum += allele.quality;
                }
            }
            int observationCount = sample.observationCount();
            // NOTE(review): the fraction is computed without a check for observationCount == 0
            if (qsum >= parameters.minAltQSum
                && alleleCount >= parameters.minAltCount 
                && ((float) alleleCount / (float) observationCount) >= parameters.minAltFraction) {
                DEBUG(genotypeAllele << " has support of " << alleleCount 
                      << " in individual " << s->first << " (" << observationCount << " obs)" <<  " and fraction " 
                      << (float) alleleCount / (float) observationCount);
                filteredAlleles[genotypeAllele] = qSum;
                // one supporting sample is enough; no need to look at the others
                break;
                //out << *genotypeAllele << endl;
            }
        }
    }
    DEBUG("filtered genotype alleles");


    // stage 3: choose which of the filtered alleles are reported
    vector<Allele> resultAlleles;
    // NOTE(review): nothing is ever added to resultIndelAndMNPAlleles; the code that
    // used to fill it is commented out below, so it stays empty
    vector<Allele> resultIndelAndMNPAlleles;

    //string refBase = currentReferenceBaseString();
    // XXX XXX XXX
    string refBase = currentReferenceHaplotype();

    if (parameters.useBestNAlleles == 0) {
        // this means "use everything"
        bool hasRefAllele = false;
        for (map<Allele, int>::iterator p = filteredAlleles.begin();
             p != filteredAlleles.end(); ++p) {
            if (p->first.currentBase == refBase)
                hasRefAllele = true;
            DEBUG("adding allele to result alleles " << p->first.currentBase);
            resultAlleles.push_back(p->first);
        }
        // and add the reference allele if we need it
        if (parameters.forceRefAllele && !hasRefAllele) {
            DEBUG("including reference allele");
            // XXX TODO change to get the haplotype of the reference sequence
            // (the allele is built with length 1 and cigar "1M" although refBase holds the reference haplotype)
            resultAlleles.insert(resultAlleles.begin(), genotypeAllele(ALLELE_REFERENCE, refBase, 1, "1M", 1, currentPosition));
        }
    } else {
        // this means, use the N best
        vector<pair<Allele, int> > sortedAlleles;
        for (map<Allele, int>::iterator p = filteredAlleles.begin();
             p != filteredAlleles.end(); ++p) {
            sortedAlleles.push_back(make_pair(p->first, p->second));
        }
        DEBUG2("sorting alleles to get best alleles");
        // ordering is defined by AllelePairIntCompare (declared elsewhere)
        AllelePairIntCompare alleleQualityCompare;
        sort(sortedAlleles.begin(), sortedAlleles.end(), alleleQualityCompare);

        DEBUG("getting " << parameters.useBestNAlleles << " best SNP alleles, and all other alleles");
        bool hasRefAllele = false;
        for (vector<pair<Allele, int> >::iterator a = sortedAlleles.begin(); a != sortedAlleles.end(); ++a) {
            Allele& allele = a->first;
            if (allele.currentBase == refBase) {
                hasRefAllele = true;
            }
            /*            if (allele.type & (ALLELE_DELETION | ALLELE_INSERTION | ALLELE_MNP | ALLELE_COMPLEX)) {
                DEBUG("adding allele to result alleles " << allele.currentBase);
                resultIndelAndMNPAlleles.push_back(allele);
            } else {
                DEBUG("adding allele to SNP alleles " << allele.currentBase);
            }
            */
            // every allele type is added to the same list, in sorted order
            DEBUG("adding allele to result alleles " << allele.currentBase);
            resultAlleles.push_back(allele);
            DEBUG("allele quality sum " << a->second);
        }
        DEBUG("found " << sortedAlleles.size() << " SNP/ref alleles of which we now have " << resultAlleles.size() << endl
               << "and " << resultIndelAndMNPAlleles.size() << " INDEL and MNP alleles");

        // if we still haven't included the reference allele, put it at the
        // front so that the truncation below cannot remove it
        if (parameters.forceRefAllele && !hasRefAllele) {
            DEBUG("including reference allele in analysis");
            resultAlleles.insert(resultAlleles.begin(), genotypeAllele(ALLELE_REFERENCE, refBase, 1, "1M", 1, currentPosition));
        }

        // if we now have too many alleles (most likely one too many), get rid of some
        // from the back of the sorted list
        while (resultAlleles.size() > parameters.useBestNAlleles) {
            resultAlleles.pop_back();
        }

        // append the separately-collected indel/MNP alleles
        // (currently a no-op, because resultIndelAndMNPAlleles is never filled)
        for (vector<Allele>::iterator a = resultIndelAndMNPAlleles.begin(); a != resultIndelAndMNPAlleles.end(); ++a) {
            resultAlleles.push_back(*a);
        }

    }

    // stage 4: now add in the alleles from the input variant set

    // when only input alleles are wanted, everything found above is dropped
    if (useOnlyInputAlleles)
        resultAlleles.clear();

    // this needs to be fixed in a big way
    // the alleles have to be put into the local haplotype structure
    if (inputVariantAlleles.find(currentRefID) != inputVariantAlleles.end()) {
        map<long int, vector<Allele> >::iterator v = inputVariantAlleles[currentRefID].find(currentPosition);
        if (v != inputVariantAlleles[currentRefID].end()) {
            vector<Allele>& inputalleles = v->second;
            for (vector<Allele>::iterator a = inputalleles.begin(); a != inputalleles.end(); ++a) {
                DEBUG("evaluating input allele " << *a);
                Allele& allele = *a;
                // check if the allele is already present
                bool alreadyPresent = false;
                for (vector<Allele>::iterator r = resultAlleles.begin(); r != resultAlleles.end(); ++r) {
                    if (r->equivalent(allele)) {
                        alreadyPresent = true;
                        break;
                    }
                }
                if (!alreadyPresent) {
                    resultAlleles.push_back(allele);
                }
            }
        }
    }
    // remove non-unique alleles after

    DEBUG2("found " << resultAlleles.size() << " result alleles");
    return resultAlleles;

}

// homopolymer run length.  number of consecutive nucleotides (prior to this
// position) in the genome reference sequence matching the alternate allele,
// after substituting the alternate in place of the reference sequence allele
//
// altbase is compared against single bases of currentSequence, so a run can
// only be found when altbase is exactly one base long.
//
// The scan starts at the base immediately left of currentPosition and stops at
// the first mismatch or at the start of currentSequence.  Indexes outside of
// currentSequence end the scan, so an out-of-range position yields 0 rather
// than an exception from substr.
int AlleleParser::homopolymerRunLeft(string altbase) {

    const int sequenceLength = static_cast<int>(currentSequence.size());
    int position = currentPosition - 1;
    // index into currentSequence, which starts at currentSequenceStart
    int sequenceposition = position - currentSequenceStart;
    int runlength = 0;
    while (sequenceposition >= 0
           && sequenceposition < sequenceLength
           && currentSequence.substr(sequenceposition, 1) == altbase) {
        ++runlength;
        --position;
        sequenceposition = position - currentSequenceStart;
    }
    return runlength;

}

// homopolymer run length to the right.  number of consecutive nucleotides
// (following this position) in the genome reference sequence matching the
// alternate allele.
//
// altbase is compared against single bases of currentSequence, so a run can
// only be found when altbase is exactly one base long.
//
// The scan starts at the base immediately right of currentPosition and stops
// at the first mismatch or at the end of currentSequence.  The end of the
// sequence is checked explicitly; previously the loop had no upper bound and
// depended on how substr behaves at and beyond the end of the string (it
// throws std::out_of_range once the index exceeds the length).
int AlleleParser::homopolymerRunRight(string altbase) {

    const int sequenceLength = static_cast<int>(currentSequence.size());
    int position = currentPosition + 1;
    // index into currentSequence, which starts at currentSequenceStart
    int sequenceposition = position - currentSequenceStart;
    int runlength = 0;
    while (sequenceposition >= 0
           && sequenceposition < sequenceLength
           && currentSequence.substr(sequenceposition, 1) == altbase) {
        ++runlength;
        ++position;
        sequenceposition = position - currentSequenceStart;
    }
    return runlength;

}

// Counts tandem repeats around position in sequence.
//
// For every unit length from 1 to maxsize, the unit is the substring of that
// length starting at position.  Copies of the unit are counted in steps of the
// unit length, to the left of position and from position to the right.  Units
// with more than one copy in total are reported as unit -> copy count.
//
// When several units are found, a unit which consists purely of repetitions
// of the most recently kept unit is dropped as redundant.
map<string, int> AlleleParser::repeatCounts(long int position, const string& sequence, int maxsize) {
    map<string, int> counts;

    for (int unitLength = 1; unitLength <= maxsize; ++unitLength) {
        const string unit = sequence.substr(position, unitLength);

        // copies directly to the left of position
        int copiesLeft = 0;
        for (int start = position - unitLength;
             start >= 0 && unit == sequence.substr(start, unitLength);
             start -= unitLength) {
            ++copiesLeft;
        }

        // copies from position onwards (the unit at position itself is included)
        int copiesRight = 0;
        for (int start = position;
             start + unitLength <= sequence.size() && unit == sequence.substr(start, unitLength);
             start += unitLength) {
            ++copiesRight;
        }

        // a single copy is not a repeat
        const int copies = copiesLeft + copiesRight;
        if (copies > 1) {
            counts[unit] = copies;
        }
    }

    // nothing to filter unless there are at least two units
    if (counts.size() <= 1) {
        return counts;
    }

    // filter out redundant repeat information
    map<string, int> filtered;
    map<string, int>::iterator entry = counts.begin();
    string lastKept = entry->first;
    filtered[lastKept] = entry->second;  // first unit in map order is always kept
    for (++entry; entry != counts.end(); ++entry) {
        const string& candidate = entry->first;
        // length of the prefix of candidate made up of whole copies of lastKept
        int covered = 0;
        while (covered + lastKept.length() <= candidate.length()
               && candidate.substr(covered, lastKept.length()) == lastKept) {
            covered += lastKept.length();
        }
        // keep the candidate only if it is not just lastKept repeated
        if (covered < candidate.length()) {
            filtered[candidate] = entry->second;
            lastKept = candidate;
        }
    }
    return filtered;
}

// Returns true when seq consists of zero or more back-to-back copies of unit
// and nothing else.
//
// An empty unit is never a repeat unit: false is returned for it.  Previously
// an empty unit led to a modulo by zero.
// An empty seq is a (zero-fold) repeat of any non-empty unit.
bool AlleleParser::isRepeatUnit(const string& seq, const string& unit) {

    if (unit.empty()) {
        return false;
    }

    // the unit has to tile the sequence without a remainder
    if (seq.size() % unit.size() != 0) {
        return false;
    }

    for (string::size_type offset = 0; offset < seq.size(); offset += unit.size()) {
        if (seq.compare(offset, unit.size(), unit) != 0) {
            return false;
        }
    }
    return true;

}

bool AlleleParser::hasInputVariantAllelesAtCurrentPosition(void) {
    if (inputVariantAlleles.find(currentRefID) != inputVariantAlleles.end()) {
        map<long int, vector<Allele> >::iterator v = inputVariantAlleles[currentRefID].find(currentPosition);
        if (v != inputVariantAlleles[currentRefID].end()) {
            return true;
        }
    }
    return false;
}

// Orders allelic primitives lexicographically: first by ref, then by alt.
//
// The previous form (a.ref < b.ref && a.alt < b.alt) was not a strict weak
// ordering: for example {ref 1, alt 2} and {ref 2, alt 1} compared as
// equivalent, and that equivalence was not transitive.  Ordered containers
// and sorting require a strict weak ordering to behave correctly.
// Only operator< of the member types is used.
bool operator<(const AllelicPrimitive& a, const AllelicPrimitive& b) {
    if (a.ref < b.ref) return true;
    if (b.ref < a.ref) return false;
    return a.alt < b.alt;
}