diff --git a/MPI_packing.c b/MPI_packing.c
new file mode 100644
index 0000000..f6008f8
--- /dev/null
+++ b/MPI_packing.c
@@ -0,0 +1,173 @@
+#include "bison.h"
+
+/******************************************************************************
+*
+*   Take a BAM header and pack it into a single contiguous memory block. Store
+*   the resulting block and its size in an MPI_Header structure.
+*
+*   Layout of the block: n_targets (int32_t), every target_name (each with its
+*   terminating NUL, back to back), target_len (n_targets uint32_t values),
+*   l_text (sizeof(int) bytes), then l_text+1 bytes of text.
+*
+*   THE RESULT MUST BE free()d (both output->packed and output itself)
+*
+*   bam_header_t *header: The header to store
+*
+*   Returns NULL if memory could not be allocated.
+*
+*******************************************************************************/
+MPI_Header * pack_header(bam_header_t *header) {
+    size_t size = sizeof(int32_t); //n_targets
+    size_t name_len;
+    unsigned char *p; //Write cursor; a byte pointer so offsets are in bytes
+    int i;
+    MPI_Header *output;
+
+    //target_name, including each terminating NUL
+    for(i=0; i<header->n_targets; i++) {
+        size += strlen(header->target_name[i]) + 1;
+    }
+
+    //target_len
+    size += sizeof(uint32_t) * header->n_targets;
+
+    //l_text
+    size += sizeof(int);
+
+    //text, including the terminating NUL
+    size += (size_t) header->l_text + 1;
+
+    output = malloc(sizeof(MPI_Header));
+    if(output == NULL) return NULL;
+    output->size = (int) size;
+    output->packed = malloc(size);
+    if(output->packed == NULL) {
+        free(output);
+        return NULL;
+    }
+    p = (unsigned char *) output->packed;
+
+    //n_targets
+    memcpy(p, &(header->n_targets), sizeof(int32_t));
+    p += sizeof(int32_t);
+
+    //target_name
+    for(i=0; i<header->n_targets; i++) {
+        name_len = strlen(header->target_name[i]) + 1;
+        memcpy(p, header->target_name[i], name_len);
+        p += name_len;
+    }
+
+    //target_len
+    memcpy(p, header->target_len, sizeof(uint32_t) * header->n_targets);
+    p += sizeof(uint32_t) * header->n_targets;
+
+    //l_text
+    memcpy(p, &(header->l_text), sizeof(int));
+    p += sizeof(int);
+
+    //text
+    memcpy(p, header->text, (size_t) header->l_text + 1);
+
+    return output;
+}
+
+/******************************************************************************
+*
+*   Unpack a header packed by pack_header() into an initialized bam_header_t.
+*   target_name, its strings, target_len and text are newly malloc()ed; any
+*   pointers previously stored in those fields are overwritten, not free()d.
+*
+*   bam_header_t *header: The header to unpack into
+*   void *packed: The packed header
+*
+*******************************************************************************/
+void unpack_header(bam_header_t *header, void *packed) {
+    //Byte cursor. Everything after the target names sits at an arbitrary
+    //offset, so multi-byte fields are copied out with memcpy() rather than
+    //read through casted pointers, which could be misaligned.
+    const unsigned char *p = packed;
+    size_t strlength;
+    int32_t n_targets;
+    int l_text;
+    int i;
+
+    //n_targets
+    memcpy(&n_targets, p, sizeof(int32_t));
+    header->n_targets = n_targets;
+    p += sizeof(int32_t);
+
+    //**target_name
+    //NOTE(review): the allocations in this function are unchecked; with a
+    //void return there is currently no way to report a failure to the caller.
+    header->target_name = malloc(sizeof(char *) * (header->n_targets));
+    for(i=0; i<header->n_targets; i++) {
+        strlength = strlen((const char *) p)+1;
+        header->target_name[i] = malloc(strlength);
+        memcpy(header->target_name[i], p, strlength);
+        p += strlength;
+    }
+
+    //target_len
+    header->target_len = malloc(sizeof(uint32_t) * (header->n_targets));
+    for(i=0; i<header->n_targets; i++) {
+        memcpy(&(header->target_len[i]), p, sizeof(uint32_t));
+        p += sizeof(uint32_t);
+    }
+
+    //l_text
+    memcpy(&l_text, p, sizeof(int));
+    header->l_text = l_text;
+    p += sizeof(int);
+
+    //text, including the terminating NUL
+    header->text = malloc((size_t) header->l_text + 1);
+    memcpy(header->text, p, (size_t) header->l_text + 1);
+}
+
+/******************************************************************************
+*
+*   Take a BAM read and pack it into a single contiguous memory block: a raw
+*   copy of the bam1_t struct followed by m_data bytes of read->data. The
+*   block and its size are kept in an MPI_read structure, whose buffer is
+*   reused between calls and only grown when it is too small.
+*
+*   output->packed MUST BE free()d
+*
+*   bam1_t *read: The read to store
+*   MPI_read *output: Where to store it; output->size == 0 means that no
+*                     buffer has been allocated yet
+*
+*   Returns output, or NULL (with output left unchanged) if memory could not
+*   be allocated.
+*
+*******************************************************************************/
+MPI_read * pack_read(bam1_t *read, MPI_read *output) {
+    int m_data = read->m_data;
+    int needed_size = (int) (sizeof(bam1_t) + m_data);
+    void *grown;
+
+    if(output->size == 0 || needed_size > output->size) {
+        //Keep the old pointer until the allocation succeeds so that the
+        //existing buffer isn't lost if realloc() fails
+        if(output->size == 0) grown = malloc((size_t) needed_size);
+        else grown = realloc(output->packed, (size_t) needed_size);
+        if(grown == NULL) return NULL;
+        output->packed = grown;
+        output->size = needed_size;
+    }
+    memcpy(output->packed, read, sizeof(bam1_t));
+    //The data goes directly after the struct
+    if(m_data > 0) memcpy((bam1_t *) output->packed + 1, read->data, m_data);
+    return output;
+}
+
+/******************************************************************************
+*
+*   Unpack a packed read into a newly allocated bam1_t. If read is not NULL it
+*   is destroyed first, so the caller must use the returned pointer instead.
+*
+*   Note that packed is modified: the data pointer of the bam1_t stored at its
+*   start is set to point at the bytes that follow the struct.
+*
+*   bam1_t *read: A previous read to destroy, or NULL
+*   void *packed: The packed read
+*
+*   Returns the new read, or NULL if memory could not be allocated.
+*
+*******************************************************************************/
+bam1_t *unpack_read(bam1_t *read, void *packed) {
+    bam1_t *pbam1_t = packed;
+    uint8_t *pdata = (uint8_t *) (pbam1_t+1);
+
+    pbam1_t->data = pdata;
+    if(read != NULL) bam_destroy1(read);
+    read = bam_init1();
+    if(read == NULL) return NULL;
+    read->core = pbam1_t->core;
+    read->l_aux = pbam1_t->l_aux;
+    read->m_data = pbam1_t->m_data;
+    read->data_len= pbam1_t->data_len;
+    read->data = malloc(read->m_data);
+    if(read->data != NULL) {
+        memcpy(read->data, pdata, read->m_data);
+    } else if(read->m_data != 0) { //malloc(0) may legitimately return NULL
+        bam_destroy1(read);
+        return NULL;
+    }
+
+    return read;
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2f0363b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,82 @@
+WORK=/home/ryand#This should be changed to match your needs
+PREFIX = $(WORK)/bin
+CC = mpicc
+INCLUDE_DIRS = -I$(WORK)/include #This should be where samtools was compiled -I/path/to/samtools/compilation
+LIB_DIRS = -L$(WORK)/lib #As above, but -L/path/to/samtools/compilation
+OPTS = -Wall -O3 #-DDEBUG #-DNOTHROTTLE -g
+MPI = -lmpich -lmpl #This is usually appropriate for mpich2
+#MPI = #This is appropriate for mvapich2
+#MPI = -lmpi #This is usually appropriate for openmpi
+
+#Don't edit below here unless you know what you're doing!
+
+OBJS = aux.o fastq.o genome.o slurp.o master.o common.o MPI_packing.o worker.o
+HERD_OBJS = herd/fastq.o herd/master.o herd/MPI_packing.o herd/slurp.o herd/worker.o herd/writer.o
+
+.SUFFIXES:.c .o
+
+#These targets name actions rather than the files their recipes create.
+#"herd" and "auxiliary" are also directory names in this tree, so without
+#.PHONY make can decide that they are already up to date.
+.PHONY: all align index extractor mbias markduplicates herd auxiliary aux_python_scripts CpG_coverage merge_CpGs install clean
+
+#Default target: everything except bison_herd and the auxiliary programs
+all: align index extractor mbias markduplicates
+
+#Compile any .c file into the matching .o file
+.c.o:
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) $< -o $@
+
+#bison_markduplicates, built directly from markduplicates.c
+markduplicates:
+	$(CC) $(OPTS) $(INCLUDE_DIRS) $(LIB_DIRS) -o bison_markduplicates markduplicates.c -lpthread -lbam -lz
+
+#bison_mbias, built directly from mbias.c
+mbias:
+	$(CC) $(OPTS) $(INCLUDE_DIRS) $(LIB_DIRS) -o bison_mbias mbias.c -lpthread -lbam -lz
+
+#bison_index, built directly from index.c (no samtools or MPI flags are passed)
+index:
+	$(CC) $(OPTS) -o bison_index index.c -lpthread
+
+#The "bison" aligner: main.c is compiled here and linked with $(OBJS)
+align: $(OBJS)
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) main.c -o main.o
+	$(CC) $(OPTS) $(OBJS) main.o -o bison $(LIB_DIRS) -lm -lpthread $(MPI) -lbam -lz
+
+#bison_methylation_extractor; common.c is compiled explicitly by this recipe
+extractor:
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) common.c -o common.o
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) methylation_extractor.c -o methylation_extractor.o
+	$(CC) $(OPTS) $(LIB_DIRS) common.o methylation_extractor.o -o bison_methylation_extractor -lpthread -lbam -lz
+
+#bison_herd is not part of "all"; build it explicitly with "make herd".
+#It links the herd/ objects together with the regular $(OBJS).
+herd:  $(OBJS) $(HERD_OBJS)
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) herd/main.c -o herd/main.o
+	$(CC) $(OPTS) $(OBJS) $(HERD_OBJS) herd/main.o -o bison_herd $(LIB_DIRS) -lm -lpthread $(MPI) -lbam -lz
+
+#Auxiliary programs, not part of "all"; build them with "make auxiliary"
+auxiliary: merge_CpGs bedGraph2methylKit make_reduced_genome aux_python_scripts CpG_coverage
+
+#Copy the python scripts next to the compiled programs so "install" finds them
+aux_python_scripts:
+	cp -f auxiliary/bedGraph2BSseq.py ./
+	cp -f auxiliary/merge_bedGraphs.py ./
+
+#Produces bison_CpG_coverage
+CpG_coverage: common.o
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) auxiliary/CpG_coverage.c -o auxiliary/CpG_coverage.o
+	$(CC) $(OPTS) $(LIB_DIRS) common.o auxiliary/CpG_coverage.o -o bison_CpG_coverage
+
+#Produces bison_merge_CpGs
+merge_CpGs: common.o
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) auxiliary/merge_CpGs.c -o auxiliary/merge_CpGs.o
+	$(CC) $(OPTS) $(LIB_DIRS) common.o auxiliary/merge_CpGs.o -o bison_merge_CpGs
+
+#The next two targets are named after the file they create, so their source
+#is listed as a prerequisite; otherwise they would never be rebuilt after
+#the source changes.
+bedGraph2methylKit: common.o auxiliary/bedGraph2methylKit.c
+	$(CC) -c $(OPTS) $(INCLUDE_DIRS) auxiliary/bedGraph2methylKit.c -o auxiliary/bedGraph2methylKit.o
+	$(CC) $(OPTS) $(LIB_DIRS) common.o auxiliary/bedGraph2methylKit.o -o bedGraph2methylKit
+
+make_reduced_genome: auxiliary/make_reduced_genome.c
+	$(CC) $(OPTS) $(LIB_DIRS) auxiliary/make_reduced_genome.c -o make_reduced_genome
+
+#Move whatever has been built into $(PREFIX). The optional programs and
+#scripts are only moved if they exist.
+install :
+	mkdir -p $(PREFIX)
+	mv bison_* $(PREFIX)/
+	chmod a+x Rscripts/*
+	cp Rscripts/* $(PREFIX)/
+	if [ -f bison ]; then mv bison $(PREFIX)/ ; fi;
+	if [ -f bedGraph2methylKit ]; then mv bedGraph2methylKit $(PREFIX)/ ; fi;
+	if [ -f bedGraph2BSseq.py ]; then chmod a+x bedGraph2BSseq.py ; mv bedGraph2BSseq.py $(PREFIX)/ ; fi;
+	if [ -f merge_bedGraphs.py ]; then chmod a+x merge_bedGraphs.py ; mv merge_bedGraphs.py $(PREFIX)/ ; fi;
+	if [ -f check_accuracy ]; then mv check_accuracy $(PREFIX)/ ; fi;
+	if [ -f make_reduced_genome ]; then mv make_reduced_genome $(PREFIX)/ ; fi;
+
+#Remove build products, including both python scripts that the
+#aux_python_scripts target copies into this directory
+clean:
+	rm -f *.o bison bison_* bedGraph2methylKit check_accuracy make_reduced_genome bedGraph2BSseq.py merge_bedGraphs.py
+	rm -f herd/*.o
+	rm -f auxiliary/*.o
diff --git a/README b/README
new file mode 100644
index 0000000..e79b788
--- /dev/null
+++ b/README
@@ -0,0 +1,468 @@
+This is Bison, bisulfite alignment on nodes of a cluster.
+
+___________________________________________________________________________
+Prerequisites
+
+This program depends upon the following:
+
+1. A functional MPI implementation, such as mpich
+
+2. The SAMtools library or similar. SAMtools is available here: http://samtools.sourceforge.net/
+
+3. Bowtie2, available here: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
+   The bowtie2 executable MUST be in your PATH.
+
+4. zcat, gzip, and bzcat must also be in your PATH, though this will almost
+   always be the case.
+
+5. To use bison_mbias2pdf (or the -pdf option of bison_mbias), R must be
+   installed and in your PATH. Additionally, the ggplot2 library must be
+   installed.
+
+N.B., the actual SAMtools library and header files are required for the
+   compilation step and can then be removed. The actual samtools executable
+   isn't required.
+
+___________________________________________________________________________
+General setup should go as follows:
+
+0. Download and extract the source code for samtools. Change into the directory
+   containing said code and type "make".
+
+1. Download the source distribution.
+
+2. Unpack, for example: tar zxf bison-0.1.0.tgz
+
+3. Possibly edit the Makefile, to include MPI and SAMtools library and header
+   locations. If these are installed in standard locations, the defaults
+   should suffice. For samtools see example in the Makefile. The default
+   Makefile is suitable for mpich2. If you're using openmpi you'll need to
+   comment out the first MPI line and uncomment the openmpi (-lmpi) MPI line.
+
+4. type "make"
+
+4a. If you would like to use bison_herd, type "make herd".
+
+4b. If you would like the auxiliary tools installed, type "make auxiliary".
+
+5. type "make install"
+
+The install path can be changed easily in the Makefile.
+
+___________________________________________________________________________
+Detailed installation instructions:
+
+1. Download samtools (at least version 0.1.19!).
+
+2. Extract the compressed bzipped tar-ball:
+tar jxf samtools-0.1.19.tar.bz2
+
+3. Change to that directory and type:
+make
+
+4. Similarly download and extract the source code for bison
+
+5. Change the installation target. For example, if you would like bison to be
+   installed under "bin" in your home directory, then the PREFIX line should be:
+PREFIX = ~/bin
+
+6. The default compiler is mpicc, but this can be changed by altering the line
+   beginning with "CC".
+
+7. If you extracted and built samtools in your home directory, then you will
+   likely need to change the INCLUDE_DIRS and LIB_DIRS to something like:
+INCLUDE_DIRS = -I/home/username/samtools-0.1.19
+LIB_DIRS = -L/home/username/samtools-0.1.19
+   If you already have the headers and libbam.a file elsewhere, then change
+   these lines appropriately.
+
+   Likewise, add the location of your MPI headers and libraries, if they're not
+   in the normal search path.
+
+8. You can disable throttling in bison_herd by adding "-DNOTHROTTLE" in the
+   "OPTS" line, though read the "Throttling" section, below. Similarly, both
+   bison and bison_herd can be compiled in a special debug mode by adding
+   "-DDEBUG" to the "OPTS" line. See the "Debug mode" section, below.
+
+9. Continue with step #4 in the preceding section.
+
+___________________________________________________________________________
+Usage
+
+Indexing of a directory of fasta (extension .fa or .fasta) can be performed
+as follows:
+
+bison_index [OPTIONS] directory/
+
+Options that are not specific to bison are simply passed to bowtie2, which must
+be in your PATH. The output is placed under "directory/bisulfite_genome".
+
+Alignment can be performed as follows (bison_herd is the same):
+
+mpiexec bison [OPTIONS] -g directory/ {-1 fastq_1.gz -2 fastq_2.gz | -U fastq.fq}
+
+"directory" is identical to that used for indexing. For further details type
+"bison -h". For non-directional libraries, "mpiexec -N 5" should be used,
+otherwise "mpiexec -N 3". Resource managers, such as slurm, should work in
+an equivalent manner. All options not explicitly mentioned by typing
+"bison -h" are passed to bowtie2. Consequently, using the --very-sensitive or
+--dovetail options will work as expected. Bison already passes the following
+flags to bowtie2:
+-q --reorder --no-mixed --no-discordant
+
+bison_herd is equivalent, except that you can specify more nodes. You may also
+input multiple files (comma-separated, no spaces) to align, in which case
+alignments will be printed to multiple files. Furthermore, you may use
+wild-cards in your file list. For example:
+
+mpiexec -N 17 bison_herd -o Alignments -g directory/ -1 exp1/sample*_1.fq.gz,/some/other/path/foo*_1.fq.gz -2 exp1/sample*_2.fq.gz,/some/other/path/foo*_2.fq.gz
+
+Make sure to not have multiple input files with the same name
+(e.g., sample*/read1.fastq), as they will all be written to the same file
+(each one overwriting the alignments written before it)!
+
+There is also a methylation extractor that produces a bedGraph file, called 
+bison_methylation_extractor. Note, coordinate-sorted BAM files should not 
+be used! The methylation extractor can be told to ignore certain parts of each
+read. This is particularly useful in cases where there is methylation bias
+across the length of reads (i.e., if one plots the average methylation
+percentage summed per position over all reads, the value goes up/down toward the
+5' or 3' end). It is recommended to always run bison_mbias (with the -pdf option
+if you have R and ggplot2 installed) to generate the required information for
+constructing an M-bias plot. The bison_mbias2pdf script can convert this to a
+PDF file (or a series of PNG files) and will also suggest what, if any, regions
+should be ignored. These regions are strand and read number (in the case of
+paired-end reads) dependent. While the suggested regions are often good, they
+should not be blindly accepted (just look at the graph and use your best
+judgement).
+
+See the "Auxiliary files" section, below, for additional files.
+
+___________________________________________________________________________
+Auxiliary files
+
+The following programs and scripts will be available if you type "make auxiliary":
+
+bedGraph2BSseq.py
+This python script can accept a filename prefix and the names of at least 2
+bedGraph files and output 3 files for input into BSseq. A single chromosome can
+be processed at a time, if desired, by using the -chr option. The output files
+will be named $prefix.M, $prefix.Cov, and $prefix.gr. $prefix.M is a matrix with
+a header line that lists the number of reads supporting methylation at each site
+in the bedGraph files. If there is no coverage in a given sample, the value is
+set to 0. $prefix.Cov is the analogous file listing coverage in each sample
+(again, 0 denotes no coverage). $prefix.gr lists the coordinates for each line
+in the .Cov and .M files. Loading these files into R would be performed as
+follows (in this example "Chr17" was the prefix):
+
+M <- as.matrix(read.delim("Chr17.M", header=T))
+Cov <- as.matrix(read.delim("Chr17.Cov", header=T))
+bed <- read.delim("Chr17.bed", header=F)
+#Remember that BED and bedGraph files are 0-based!
+gr <- GRanges(seqnames=Rle(bed$V1),ranges=IRanges(start=bed$V2+1, end=bed$V3), strand=Rle("*", nrow(bed)))
+groups <- data.frame(row.names=colnames(M),
+    var1 <- c(1,1,1,1,2,2,2,2)) #A very simple experiment with 2 groups of 4 samples
+BS1 <- BSseq(M=M, Cov=Cov, gr=gr, pData=groups, sampleNames=colnames(M)) #You'll want to set some of the additional options!
+
+
+bedGraph2methylKit
+As above, but each bedGraph file is converted to a .methylKit file. The
+bedGraphs should be of CpGs and not have had the strands merged (i.e., don't run
+the merge_CpGs command below).
+
+make_reduced_genome.c
+Create a reduced representation genome appropriate for reads of a given size
+($size, default is 36bp). MspI and TaqI libraries are supported. Nucleotides
+greater than $size+10% are converted to N.
+
+merge_bedGraphs.py
+This will merge bedGraphs from technical replicates of a single sample into a
+single bedGraph file, summing the methylation metrics as it goes. The output,
+like the input, is coordinate sorted.
+
+bison_merge_CpGs
+Methylation is usually symmetric at CpG sites. While the output bedGraph files
+have a single-C resolution, this will convert that to single-CpG resolution by
+summing Cs in the same CpG from opposite strands. This saves space and will
+often speed up downstream statistics.
+
+___________________________________________________________________________
+Advanced bison_herd usage
+
+bison_herd has the ability to use a semi-arbitrary number of nodes. In practice,
+if bison_herd is given N nodes, it will effectively use 2*((N-1)/2)+1 or
+4*((N-1)/4)+1 nodes, for directional and non-directional libraries,
+respectively. As an example, if you allot 20 nodes for a directional library,
+bison_herd will only use 19 of them (17 for non-directional reads). The excess
+nodes will exit properly and, unless you specify --quiet, produce an error
+message.
+
+The options -mp, -queue-size, and -@ are bison_herd-specific and deserve further
+description.
+
+-mp sets the number of threads that the master node will use to process
+alignments produced by the worker nodes. Worker nodes are grouped into twos or
+fours, where each group has a number of nodes equal to the number of
+possible bisulfite converted strands. As the number of allocated nodes
+increases, a point is eventually reached where a single thread on the master
+node is unable to keep up with the workers. In my experience, for directional
+libraries, one thread can handle approximately 130 bowtie2 threads (i.e., if
+using -p 11, -mp should be increased once ~12 worker nodes are allocated, since
+that would equate to 132 threads in use by bowtie2). One should keep in mind
+that there are already at least 3 other threads concurrently running on the
+master node (sending and storing fastq reads, receiving alignments, and writing
+alignments). Consequently, the practical limit to the number of nodes is
+determined by how many cores are available on each node.
+
+-queue-size determines the maximum difference between reads sent for alignment
+and reads processed. This option is unavailable if bison_herd was compiled with
+-DNOTHROTTLE. By default, the thread that sends reads for alignment will pause
+if it has sent ~1 million more reads than have been processed. The purpose
+of this is to prevent overwhelming of the MPI unexpected message buffer, since
+the thread on the master node that sends reads can generally process reads
+faster than all of the worker nodes combined can align them. Setting this value
+too high may result in bison_herd crashing with otherwise cryptic messages
+involving MPI_Send. In such cases, decreasing the value used by -queue-size
+should resolve the problem. On the other hand, setting this value too low can
+result in deadlocks, due to buffering at various levels. The default value
+hasn't resulted in deadlocking or crashes on our cluster, but yours may be
+different! This difference is checked every 100000 reads, which can be changed
+by editing the THROTTLE_CHECK_INTERVAL value in bison.h prior to compilation.
+
+-@ specifies the number of compression threads used for writing the output BAM
+file. In practice, a single compression thread can write ~80 million paired-end
+reads per hour (depending on CPU speed). I routinely use -@ 4 when using more
+than ~9 nodes as this allows writing to occur as quickly as reads are processed.
+To determine if the number of compression threads should be increased, note the
+time difference (especially early on) between when each master processor thread
+has processed 100000 reads and when those reads have been written to a file.
+Even when --reorder is used, if there is >1 second between these, then you may
+benefit from increasing the number of compression threads. For those curious,
+this option is identical to that used in samtools.
+
+___________________________________________________________________________
+Throttling
+
+bison_herd generally uses blocking, but not synchronous sends. What this means
+in practice is that many reads will be queued by the master node for sending to
+the worker nodes. Likewise, many alignments can be queued by the worker nodes
+for sending back to the master node. The queue that many MPI implementations use
+for this is relatively small and immutable. While a full queue should cause
+MPI_Send to block until there is sufficient space, occasionally a constellation
+of events can occur that causes this queue to overflow and the master node to
+then crash. This can be alleviated by limiting the possible number of reads that
+could ever possibly be in the queue at any single time. As the queue is not
+directly pollable, the difference between the number of reads sent and written
+is used as a surrogate. The maximum number of reads in the wild is then either
+2x or 4x this difference (since a read is queued per worker node). In reality,
+the queue should be emptier than this as there are normally reads buffered on
+the worker nodes (being fed to bowtie2, being aligned or being sent) and
+elsewhere on the master node (being received, waiting to be processed, being
+processed, waiting to be written, or being written).
+
+Throttling is not always required, particularly as an increasing number of nodes
+are used. Throttling can be disabled altogether by compiling with -DNOTHROTTLE,
+which will remove all related components.
+
+___________________________________________________________________________
+Debug mode
+
+For debugging, a special debug mode is available for both bison and bison_herd
+by compiling with -DDEBUG. Instead of needing multiple nodes, both
+programs will then run as if they were just a single node. Compiling with this
+option adds the -taskid option to both programs. The taskid is equivalent to the
+node number in the bison (or bison_herd) hierarchy. Node 0 is the master node
+and performs the final file writing. For bison, nodes 1-4 are equivalent to the
+worker nodes that align reads to the original top, original bottom,
+complementary to original top and complementary to original bottom strands,
+respectively. For directional libraries, only the first 2 are used. These will
+write alignments to a file for final processing when run as taskid 0. This is
+useful when odd alignments are being output and the source of the error needs to
+be tracked down. The mode for bison_herd is similar, except there are always 8
+theoretical worker nodes (i.e., taskid 1-8 need to be run prior to taskid 0).
+This allows testing multiple master processor threads with both directional and
+non-directional reads.
+
+In general, this mode should not be used unless you are running into extremely
+odd bugs.
+
+___________________________________________________________________________
+Compatibility with Bismark
+
+Bison is generally similar to bismark, however the indexes are incompatible,
+due to bismark renaming contigs. Also, the two will not produce identical 
+output, due to algorithmic differences. Running bison_methylation_extractor
+on the output of bismark will also produce different results, again due to
+algorithmic differences. In addition, bison always outputs BAM files directly.
+
+___________________________________________________________________________
+Other details
+
+Bison needn't be run on multiple computers. You can also use a single
+computer for all compute nodes (e.g. mpiexec -n 5 bison ...). The same holds
+true for bison_herd. Both bison and bison_herd seem to be faster than bismark,
+even when limited to the same resources.
+
+___________________________________________________________________________
+Changes:
+
+0.2.4
+  *  Fixed an off-by-one error in bison_mbias. Also, at some point 1-methylation
+     percentage started getting calculated. That's been fixed.
+
+  *  Added bison_markduplicates, which, as the name implies, marks apparent PCR
+     duplicates. The methylation extractor and m-bias calculator have also been
+     updated to ignore marked duplicates.
+
+  *  Fixed a bug in the CpG coverage program, which wasn't properly handling
+     single-C bedGraph files before (if they were merged, then they were being
+     handled correctly).
+
+0.2.3
+  *  Fix how hard and soft-clipped bases are dealt with (previously, soft-
+     clipped bases resulted in an error and hard-clipped bases in incorrect
+     position assignments!).
+
+  *  Multiple bug fixes related to local alignment, which previously didn't
+     work correctly. These issues seem to generally now be resolved. Many thanks
+     to user mvijayen on seqanswers for providing a perfect usage example for
+     testing (see thread http://seqanswers.com/forums/showthread.php?t=39914).
+
+  *  The maximum length of a single contig is now (2^64)-1 (instead of the
+     previous 2^64). I don't think bowtie2 would even support something that
+     long, but if it did then bison wouldn't (internally, a position of 2^64
+     means a base is inserted, soft, or hard-clipped).
+
+  *  A previously missing "*" caused Bison to use the entirety of the
+     description line in the fasta file as the chromosome name. This caused
+     errors since bowtie2 only uses everything before the first space (the proper
+     method). Bison now does the same.
+
+  *  A note about creating methylation-bias metrics with locally aligned reads
+     is in order. If a read is soft-clipped, that portion is still included in
+     the M-bias metrics. Likewise, if you pass -OT X,X,X,X or similar
+     parameters to the methylation extractor, the soft-clipped area is also
+     included in there.
+
+  *  Another note regarding local alignments is that the XX auxiliary tag
+     (effectively the more verbose version of the MD tag) contains soft-clipped
+     sequences. I could probably have these removed if someone would like.
+
+0.2.2
+  *  Properly fixed some wording on the textual output (i.e., removed the word
+     "unique").
+
+  *  Lowered the default MAPQ and Phred thresholds used by the methylation
+     extractor to 10 each. That the MAPQ threshold was originally
+     20 was an error on my part.
+
+0.2.1
+  *  Added support for file globbing in bison_herd. You may now input multiple
+     files using a combination of wild-cards (*, ?, etc.) and commas. Remember
+     to put these in quotes (e.g., "foo/*1.fq.gz","bar/*1.fq.gz") so the shell
+     doesn't perform the expansion! As before, specifying multiple inputs with
+     the same file name (e.g., sample1/reads.fq,sample2/reads.fq) will cause the
+     output from the first reads.fq alignment to be over-written by the second.
+
+  *  Fixed the text output, since "unique alignments" isn't really correct,
+     given that alignments with scores of 0 or 1 can be output but aren't
+     unique.
+
+  *  Added information in the Makefile and above about compiling with openmpi.
+
+  *  Fixed a bug in bison_herd wherein the -upto option wasn't being handled
+     properly. -upto now accepts an unsigned long in bison_herd.
+
+  *  Fixed a bug in bison_herd when paired-end reads were used. This was due to
+     how bowtie2 reads from FIFOs. Changing how things were written to the FIFOs
+     on the worker nodes resolved the problem.
+
+  *  The bison_mbias program has been heavily revamped. It still outputs the
+     number of methylated or unmethylated CpG calls per position, but now keeps
+     the metrics for each strand (and read, when paired-end reads are used)
+     separate. If R and the ggplot2 library are installed, the program can also
+     run the bison_mbias2pdf program (see below).
+
+  *  Created a bison_mbias2pdf Rscript that will read in the output of
+     bison_mbias and plot the results, indicating the region of each read that
+     should be included in methylation extraction. This script also prints these
+     suggestions in the format used by bison_methylation_extractor, for
+     convenience.
+
+  *  The methylation extractor can now be told to only include certain regions
+     of each read in the output methylation metrics. This is needed when there
+     is apparent bias in the methylation at one or both ends of a read. 
+
+  *  Previously, the recalculated MAPQ was incorrect when only 1 read in a pair
+     had a valid secondary alignment. This has been fixed.
+
+  *  Fixed another MAPQ recalculation bug, affecting reads with MAPQ 2 that
+     have MAPQ=6.
+
+  *  Fixed a bug in writing unmapped reads.
+
+  *  Fixed a bug in bison_herd that allowed early termination without warning.
+
+0.2.0
+  *  Added a note to the methylation summary statistics output at the end of a
+     run that the numbers will include double counting of any site covered by
+     both mates in a pair. These metrics are only meant for general information
+     and not further analysis, so I don't consider that a bug (it's actually a
+     design decision for the sake of performance).
+
+  *  --ignore-quals is no longer passed to bowtie2 by default. Specifying this
+     will marginally decrease both correct and incorrect alignments. It will
+     also generally decrease the alignment rate.
+
+  *  Fixed --unmapped, which are now written to the directory specified by -o
+
+  *  --maxins was already 500 by default, so it is no longer set by default.
+
+  *  Added bison_herd, see above for usage
+
+  *  The methylation extractor now has a -phred option, to exclude methylation
+     calls from low confidence base-calls. The default threshold is 20.
+
+  *  Added a script to convert bedGraph files to a format suitable for BSseq.
+
+  *  Fixed a bug in bison_merge_CpGs
+
+  *  Both bison and bison_herd now check to ensure that the MPI implementation
+     actually supports the level of thread support requested (previously, this
+     was just assumed).
+
+0.1.1
+  *  Fixed a number of minor bugs.
+
+  *  Added support for uncompressed fastq files, as well as bzipped files
+     (previously, only gzipped fastq files worked properly).
+
+  *  --score-min is now parsed by bison prior to being sent to bowtie2,
+     read MAPQ scores are recalculated accordingly by the same algorithm
+     used by bowtie2 (N.B., this only bears a vague correspondence to
+     -10*log10(probability the mapping position is wrong)!).
+
+  *  Added a bison_mbias function, to process the aligned BAM file and
+     create a text file containing the percentage of methylated C's as a
+     function of read position. For the utility of this, see: Hansen KD,
+     Langmead B and Irizarry RA, BSmooth: from whole genome bisulfite
+     sequencing to differentially methylated regions. Genome Biol 2012;
+     13(10):R83.
+
+  *  The methylation extractor now accepts the -q option, which sets the
+     MAPQ threshold for a read to be included in the methylation results.
+     The default is a minimum MAPQ of 20, which seems to be a reasonable
+     threshold from a few simulations.
+
+  *  In DEBUG mode, the output BAM files used to have fixed names. This was
+     a problem in cases where debugging was being performed on multiple
+     input files. Now, the OT/OB/CTOT/CTOB.bam filename is prepended with
+     an appropriate prefix (extracted from the input file name). In
+     addition, the output directory is now respected in DEBUG mode.
+
+  *  Included an "auxiliary" directory, that includes functions for making
+     an RRBS genome and other possibly useful functions.
+
+
+0.1.0
+  Initial release
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b8b62b3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,462 @@
+#Bison: bisulfite alignment on nodes of a cluster.
+
+##Prerequisites
+
+This program depends upon the following:
+
+1. A functional MPI implementation, such as mpich
+
+2. The SAMtools library or similar. SAMtools is available here: http://samtools.sourceforge.net/
+
+3. Bowtie2, available here: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
+   The bowtie2 executable MUST be in your PATH.
+
+4. zcat, gzip, and bzcat must also be in your PATH, though this will almost
+   always be the case.
+
+5. To use bison_mbias2pdf (or the -pdf option of bison_mbias), R must be
+   installed and in your PATH. Additionally, the ggplot2 library must be
+   installed.
+
+N.B., the actual SAMtools library and header files are required for the
+   compilation step and can then be removed. The actual samtools executable
+   isn't required.
+
+##General setup should go as follows:
+
+0. Download and extract the source code for samtools. Change into the directory
+   containing said code and type "make".
+
+1. Download the source distribution.
+
+2. Unpack, for example: tar zxf bison-0.1.0.tgz
+
+3. Possibly edit the Makefile, to include MPI and SAMtools library and header
+   locations. If these are installed in standard locations, the defaults
+   should suffice. For samtools see example in the Makefile. The default
+   Makefile is suitable for mpich2. If you're using openmpi you'll need to
+   comment out the first MPI line and uncomment the second MPI line.
+
+4. type "make"
+
+   * If you would like to use `bison_herd`, type "make herd".
+
+   * If you would like the auxiliary tools installed, type "make auxiliary".
+
+5. type "make install"
+
+The install path can be changed easily in the Makefile.
+
+##Detailed installation instructions
+
+1. Download samtools (at least version 0.1.19!).
+
+2. Extract the compressed bzipped tar-ball:
+tar jxf samtools-0.1.19.tar.bz2
+
+3. Change to that directory and type:
+make
+
+4. Similarly download and extract the source code for bison
+
+5. Change the installation target. For example, if you would like bison to be
+   installed under "bin" in your home directory, then the PREFIX line should be:
+PREFIX = ~/bin
+
+6. The default compiler is mpicc, but this can be changed by altering the line
+   beginning with "CC".
+
+7. If you extracted and built samtools in your home directory, then you will
+   likely need to change the `INCLUDE_DIRS` and `LIB_DIRS` to something like:
+
+    INCLUDE_DIRS = -I/home/username/samtools-0.1.19
+    LIB_DIRS = -L/home/username/samtools-0.1.19
+
+   If you already have the headers and libbam.a file elsewhere, then change
+   these lines appropriately.
+
+   Likewise, add the location of your MPI headers and libraries, if they're not
+   in the normal search path.
+
+8. You can disable throttling in `bison_herd` by adding "-DNOTHROTTLE" in the
+   "OPTS" line, though read the "Throttling" section, below. Similarly, both
+   bison and `bison_herd` can be compiled in a special debug mode by adding
+   "-DDEBUG" to the "OPTS" line. See the "Debug mode" section, below.
+
+9. Continue with step #4 in the preceding section.
+
+##Usage
+
+Indexing of a directory of fasta (extension .fa or .fasta) can be performed
+as follows:
+
+    bison_index [OPTIONS] directory/
+
+Options that are not specific to bison are simply passed to bowtie2, which must
+be in your PATH. The output is placed under `directory/bisulfite_genome`.
+
+Alignment can be performed as follows (`bison_herd` is the same):
+
+    mpiexec bison [OPTIONS] -g directory/ {-1 fastq_1.gz -2 fastq_2.gz | -U fastq.fq}
+
+"directory" is identical to that used for indexing. For further details type
+"bison -h". For non-directional libraries, "mpiexec -N 5" should be used,
+otherwise "mpiexec -N 3". Resource managers, such as slurm, should work in
+an equivalent manner. All options not explicitly mentioned by typing
+"bison -h" are passed to bowtie2. Consequently, using the --very-sensitive or
+--dovetail options will work as expected. Bison already passes the following
+flags to bowtie2:
+
+    -q --reorder --no-mixed --no-discordant
+
+`bison_herd` is equivalent, except that you can specify more nodes. You may also
+input multiple files (comma-separated, no spaces) to align, in which case
+alignments will be printed to multiple files. Furthermore, you may use
+wild-cards in your file list. For example:
+
+    mpiexec -N 17 bison_herd -o Alignments -g directory/ -1 exp1/sample*_1.fq.gz,/some/other/path/foo*_1.fq.gz -2 exp1/sample*_2.fq.gz,/some/other/path/foo*_2.fq.gz
+
+Make sure to not have multiple input files with the same name
+(e.g., `sample*/read1.fastq`), as they will all be written to the same file
+(each overwriting the alignments written before it)!
+
+There is also a methylation extractor that produces a bedGraph file, called 
+`bison_methylation_extractor`. Note, coordinate-sorted BAM files should not 
+be used! The methylation extractor can be told to ignore certain parts of each
+read. This is particularly useful in cases where there is methylation bias
+across the length of reads (i.e., if one plots the average methylation
+percentage summed per position over all reads, the value goes up/down toward the
+5' or 3' end). It is recommended to always run `bison_mbias` (with the -pdf option
+if you have R and ggplot2 installed) to generate the required information for
+constructing an M-bias plot. The `bison_mbias2pdf` script can convert this to a
+PDF file (or a series of PNG files) and will also suggest what, if any, regions
+should be ignored. These regions are strand and read number (in the case of
+paired-end reads) dependent. While the suggested regions are often good, they
+should not be blindly accepted (just look at the graph and use your best
+judgement).
+
+See the "Auxiliary files" section, below, for additional files.
+
+##Auxiliary files
+
+The following programs and scripts will be available if you type "make auxiliary":
+
+###bedGraph2BSseq.py
+This python script can accept a filename prefix and the names of at least 2
+bedGraph files and output 3 files for input into BSseq. A single chromosome can
+be processed at a time, if desired, by using the -chr option. The output files
+will be named $prefix.M, $prefix.Cov, and $prefix.gr. $prefix.M is a matrix with
+a header line that lists the number of reads supporting methylation at each site
+in the bedGraph files. If there is no coverage in a given sample, the value is
+set to 0. $prefix.Cov is the analogous file listing coverage in each sample
+(again, 0 denotes no coverage). $prefix.gr lists the coordinates for each line
+in the .Cov and .M files. Loading these files into R would be performed as
+follows (in this example "Chr17" was the prefix):
+
+```R
+M <- as.matrix(read.delim("Chr17.M", header=T))
+Cov <- as.matrix(read.delim("Chr17.Cov", header=T))
+bed <- read.delim("Chr17.gr", header=F)
+#Remember that BED and bedGraph files are 0-based!
+gr <- GRanges(seqnames=Rle(bed$V1),ranges=IRanges(start=bed$V2+1, end=bed$V3), strand=Rle("*", nrow(bed)))
+groups <- data.frame(row.names=colnames(M),
+    var1 = c(1,1,1,1,2,2,2,2)) #A very simple experiment with 2 groups of 4 samples
+BS1 <- BSseq(M=M, Cov=Cov, gr=gr, pData=groups, sampleNames=colnames(M)) #You'll want to set some of the additional options!
+```
+
+
+###`bedGraph2methylKit`
+As above, but each bedGraph file is converted to a .methylKit file. The
+bedGraphs should be of CpGs and not have had the strands merged (i.e., don't run
+the merge_CpGs command below).
+
+###`make_reduced_genome`
+Create a reduced representation genome appropriate for reads of a given size
+($size, default is 36bp). MspI and TaqI libraries are supported. Nucleotides
+greater than $size+10% are converted to N.
+
+###`merge_bedGraphs.py`
+This will merge bedGraphs from technical replicates of a single sample into a
+single bedGraph file, summing the methylation metrics as it goes. The output,
+like the input is coordinate sorted.
+
+###`bison_merge_CpGs`
+Methylation is usually symmetric at CpG sites. While the output bedGraph files
+have a single-C resolution, this will convert that to single-CpG resolution by
+summing Cs in the same CpG from opposite strands. This saves space and will
+often speed up downstream statistics.
+
+##Advanced bison_herd usage
+
+`bison_herd` has the ability to use a semi-arbitrary number of nodes. In practice,
+if bison is given N nodes, it will effectively use `2*((N-1)/2)+1` or
+`4*((N-1)/4)+1` nodes, for directional and non-directional libraries,
+respectively. As an example, if you allot 20 nodes for a directional library,
+`bison_herd` will only use 19 of them (17 for non-directional reads). The excess
+nodes will exit properly and, unless you specify --quiet, produce an error
+message.
+
+The options -mp, -queue-size, and -@ are `bison_herd`-specific and deserve further
+description.
+
+-mp sets the number of threads that the master node will use to process
+alignments produced by the worker nodes. Worker nodes are grouped into twos or
+fours, where each group has a number of nodes equal to the number of
+possible bisulfite converted strands. As the number of allocated nodes
+increases, a point is eventually reached where a single thread on the master
+node is unable to keep up with the workers. In my experience, for directional
+libraries, one thread can handle approximately 130 bowtie2 threads (i.e., if
+using -p 11, -mp should be increased once ~12 worker nodes are allocated, since
+that would equate to 132 threads in use by bowtie2). One should keep in mind
+that there are already at least 3 other threads concurrently running on the
+master node (sending and storing fastq reads, receiving alignments, and writing
+alignments). Consequently, the practical limit to the number of nodes is
+determined by how many cores are available on each node.
+
+-queue-size determines the maximum difference between reads sent for alignment
+and reads processed. This option is unavailable if `bison_herd` was compiled with
+-DNOTHROTTLE. By default, the thread that sends reads for alignment will pause
+if it has sent ~1 million more reads than have been processed. The purpose
+of this is to prevent overwhelming of the MPI unexpected message buffer, since
+the thread on the master node that sends reads can generally process reads
+faster than all of the worker nodes combined can align them. Setting this value
+too high may result in `bison_herd` crashing with otherwise cryptic messages
+involving `MPI_Send`. In such cases, decreasing the value used by -queue-size
+should resolve the problem. On the other hand, setting this value too low can
+result in deadlocks, due to buffering at various levels. The default value
+hasn't resulted in deadlocking or crashes on our cluster, but yours may be
+different! This difference is checked every 100000 reads, which can be changed by
+editing the `THROTTLE_CHECK_INTERVAL` value in bison.h prior to compilation.
+
+-@ specifies the number of compression threads used for writing the output BAM
+file. In practice, a single compression thread can write ~80 million paired-end
+reads per hour (depending on CPU speed). I routinely use -@ 4 when using more
+than ~9 nodes as this allows writing to occur as quickly as reads are processed.
+To determine if the number of compression threads should be increased, note the
+time difference (especially early on) between when each master processor thread
+has processed 100000 reads and when those reads have been written to a file.
+Even when --reorder is used, if there is >1 second between these, then you may
+benefit from increasing the number of compression threads. For those curious,
+this option is identical to that used in samtools.
+
+##Throttling
+
+`bison_herd` generally uses blocking, but not synchronous sends. What this means
+in practice is that many reads will be queued by the master node for sending to
+the worker nodes. Likewise, many alignments can be queued by the worker nodes
+for sending back to the master node. The queue that many MPI implementations use
+for this is relatively small and immutable. While a full queue should cause
+`MPI_Send` to block until there is sufficient space, occasionally a constellation
+of events can occur that cause this queue to overflow and the master node to
+then crash. This can be alleviated by limiting the possible number of reads that
+could ever possibly be in the queue at any single time. As the queue is not
+directly pollable, the difference between the number of reads sent and written
+is used as a surrogate. The maximum number of reads in the wild is then either
+2x or 4x this difference (since a read is queued per worker node). In reality,
+the queue should be emptier than this as there are normally reads buffered on
+the worker nodes (being fed to bowtie2, being aligned or being sent) and
+elsewhere on the master node (being received, waiting to be processed, being
+processed, waiting to be written, or being written).
+
+Throttling is not always required, particularly as an increasing number of nodes
+are used. Throttling can be disabled altogether by compiling with -DNOTHROTTLE,
+which will remove all related components.
+
+##Debug mode
+
+For debugging, a special debug mode is available for both bison and `bison_herd`
+by compiling with -DDEBUG. Instead of needing multiple nodes, both
+programs will then run as if they were just a single node. Compiling with this
+option adds the -taskid option to both programs. The taskid is equivalent to the
+node number in the bison (or `bison_herd`) hierarchy. Node 0 is the master node
+and performs the final file writing. For bison, nodes 1-4 are equivalent to the
+worker nodes that align reads to the original top, original bottom,
+complementary to original top and complementary to original bottom strands,
+respectively. For directional libraries, only the first 2 are used. These will
+write alignments to a file for final processing when run as taskid 0. This is
+useful when odd alignments are being output and the source of the error needs to
+be tracked down. The mode for `bison_herd` is similar, except there are always 8
+theoretical worker nodes (i.e., taskid 1-8 need to be run prior to taskid 0).
+This allows testing multiple master processor threads with both directional and
+non-directional reads.
+
+In general, this mode should not be used unless you are running into extremely
+odd bugs.
+
+##Compatibility with Bismark
+
+Bison is generally similar to bismark, however the indexes are incompatible,
+due to bismark renaming contigs. Also, the two will not produce identical 
+output, due to algorithmic differences. Running `bison_methylation_extractor`
+on the output of bismark will also produce different results, again due to
+algorithmic differences. In addition, bison always outputs BAM files directly.
+
+##Other details
+
+Bison needn't be run on multiple computers. You can also use a single
+computer for all compute nodes (e.g. mpiexec -n 5 bison ...). The same holds
+true for `bison_herd`. Both bison and `bison_herd` seem to be faster than bismark,
+even when limited to the same resources.
+
+##Changes
+
+###0.2.4
+  *  Fixed an off-by-one error in bison_mbias. Also, at some point 1-methylation
+     percentage started getting calculated. That's been fixed.
+
+  *  Added bison_markduplicates, which, as the name implies, marks apparent PCR
+     duplicates. The methylation extractor and m-bias calculator have also been
+     updated to ignore marked duplicates.
+
+  *  Fixed a bug in the CpG coverage program, which wasn't properly handling
+     single-C bedGraph files before (if they were merged, then they were being
+     handled correctly).
+
+###0.2.3
+  *  Fix how hard and soft-clipped bases are dealt with (previously, soft-
+     clipped bases resulted in an error and hard-clipped bases in incorrect
+     position assignments!).
+
+  *  Multiple bug fixes related to local alignment, which previously didn't
+     work correctly. These issues seem to generally now be resolved. Many thanks
+     to user mvijayen on seqanswers for providing a perfect usage example for
+     testing (see thread http://seqanswers.com/forums/showthread.php?t=39914).
+
+  *  The maximum length of a single contig is now (2^64)-1 (instead of the
+     previous 2^64). I don't think bowtie2 would even support something that
+     long, but if it did then bison wouldn't (internally, a position of 2^64
+     means a base is inserted, soft, or hard-clipped).
+
+  *  A previously missing "*" caused Bison to use the entirety of the
+     description line in the fasta file as the chromosome name. This caused
+     errors since bowtie2 only uses everything before the first space (the proper
+     method). Bison now does the same.
+
+  *  A note about creating methylation-bias metrics with locally aligned reads
+     is in order. If a read is soft-clipped, that portion is still included in
+     the M-bias metrics. Likewise, if you pass -OT X,X,X,X or similar
+     parameters to the methylation extractor, the soft-clipped area is also
+     included in there.
+
+  *  Another note regarding local alignments is that the XX auxiliary tag
+     (effectively the more verbose version of the MD tag) contains soft-clipped
+     sequences. I could probably have these removed if someone would like.
+
+###0.2.2
+  *  Properly fixed some wording on the textual output (i.e., removed the word
+     "unique").
+
+  *  Lowered the default MAPQ and Phred thresholds used by the methylation
+     extractor to 10 each. That the MAPQ threshold was originally
+     20 was an error on my part.
+
+###0.2.1
+  *  Added support for file globbing in bison_herd. You may now input multiple
+     files using a combination of wild-cards (*, ?, etc.) and commas. Remember
+     to put these in quotes (e.g., "foo/*1.fq.gz","bar/*1.fq.gz") so the shell
+     doesn't perform the expansion! As before, specifying multiple inputs with
+     the same file name (e.g., sample1/reads.fq,sample2/reads.fq) will cause the
+     output from the first reads.fq alignment to be over-written by the second.
+
+  *  Fixed the text output, since "unique alignments" isn't really correct,
+     given that alignments with scores of 0 or 1 can be output but aren't
+     unique.
+
+  *  Added information in the Makefile and above about compiling with openmpi.
+
+  *  Fixed a bug in bison_herd wherein the -upto option wasn't being handled
+     properly. -upto now accepts an unsigned long in bison_herd.
+
+  *  Fixed a bug in bison_herd when paired-end reads were used. This was due to
+     how bowtie2 reads from FIFOs. Changing how things were written to the FIFOs
+     on the worker nodes resolved the problem.
+
+  *  The bison_mbias program has been heavily revamped. It still outputs the
+     number of methylated or unmethylated CpG calls per position, but now keeps
+     the metrics for each strand (and read, when paired-end reads are used)
+     separate. If R and the ggplot2 library are installed, the program can also
+     run the bison_mbias2pdf program (see below).
+
+  *  Created a bison_mbias2pdf Rscript that will read in the output of
+     bison_mbias and plot the results, indicating the region of each read that
+     should be included in methylation extraction. This script also prints these
+     suggestions in the format used by bison_methylation_extractor, for
+     convenience.
+
+  *  The methylation extractor can now be told to only include certain regions
+     of each read in the output methylation metrics. This is needed when there
+     is apparent bias in the methylation at one or both ends of a read. 
+
+  *  Previously, the recalculated MAPQ was incorrect when only 1 read in a pair
+     had a valid secondary alignment. This has been fixed.
+
+  *  Fixed another MAPQ recalculation bug, affecting reads with MAPQ 2 that
+     have MAPQ=6.
+
+  *  Fixed a bug in writing unmapped reads.
+
+  *  Fixed a bug in bison_herd that allowed early termination without warning.
+
+###0.2.0
+  *  Added a note to the methylation summary statistics output at the end of a
+     run that the numbers will include double counting of any site covered by
+     both mates in a pair. These metrics are only meant for general information
+     and not further analysis, so I don't consider that a bug (it's actually a
+     design decision for the sake of performance).
+
+  *  --ignore-quals is no longer passed to bowtie2 by default. Specifying this
+     will marginally decrease both correct and incorrect alignments. It will
+     also generally decrease the alignment rate.
+
+  *  Fixed --unmapped, which are now written to the directory specified by -o
+
+  *  --maxins was already 500 by default, so it is no longer set by default.
+
+  *  Added bison_herd, see above for usage
+
+  *  The methylation extractor now has a -phred option, to exclude methylation
+     calls from low confidence base-calls. The default threshold is 20.
+
+  *  Added a script to convert bedGraph files to a format suitable for BSseq.
+
+  *  Fixed a bug in bison_merge_CpGs
+
+  *  Both bison and bison_herd now check to ensure that the MPI implementation
+     actually supports the level of thread support requested (previously, this
+     was just assumed).
+
+###0.1.1
+  *  Fixed a number of minor bugs.
+
+  *  Added support for uncompressed fastq files, as well as bzipped files
+     (previously, only gzipped fastq files worked properly).
+
+  *  --score-min is now parsed by bison prior to being sent to bowtie2,
+     read MAPQ scores are recalculated accordingly by the same algorithm
+     used by bowtie2 (N.B., this only bears a vague correspondence to
+     -10*log10(probability the mapping position is wrong)!).
+
+  *  Added a bison_mbias function, to process the aligned BAM file and
+     create a text file containing the percentage of methylated C's as a
+     function of read position. For the utility of this, see: Hansen KD,
+     Langmead B and Irizarry RA, BSmooth: from whole genome bisulfite
+     sequencing to differentially methylated regions. Genome Biol 2012;
+     13(10):R83.
+
+  *  The methylation extractor now accepts the -q option, which sets the
+     MAPQ threshold for a read to be included in the methylation results.
+     The default is a minimum MAPQ of 20, which seems to be a reasonable
+     threshold from a few simulations.
+
+  *  In DEBUG mode, the output BAM files used to have fixed names. This was
+     a problem in cases where debugging was being performed on multiple
+     input files. Now, the OT/OB/CTOT/CTOB.bam filename is prepended with
+     an appropriate prefix (extracted from the input file name). In
+     addition, the output directory is now respected in DEBUG mode.
+
+  *  Included an "auxiliary" directory, that includes functions for making
+     an RRBS genome and other possibly useful functions.
+
+
+###0.1.0
+  Initial release
diff --git a/Rscripts/bison_mbias2pdf b/Rscripts/bison_mbias2pdf
new file mode 100755
index 0000000..26f0f39
--- /dev/null
+++ b/Rscripts/bison_mbias2pdf
@@ -0,0 +1,228 @@
+#!/usr/bin/env Rscript
+suppressMessages(require(ggplot2))
+
+#Agresti-Coull confidence bound for the methylation level in each row of df.
+CI <- function(df, which = 0) { #which == 0: lower bound, otherwise upper bound
+    z <- qnorm(1-0.5*min_p) #min_p is the global significance level (-p)
+    z_sq <- z*z
+    n_methylated <- df$nMethylated
+    #Adjusted trial count and proportion (add z^2 trials, half of them successes)
+    n_adj <- (df$nMethylated + df$nUnmethylated) + z_sq
+    p_adj <- (1/n_adj)*(n_methylated+0.5*z_sq)
+    half_width <- z*sqrt((p_adj/n_adj)*(1-p_adj))
+    if(which == 0) {
+        return(p_adj - half_width)
+    }
+    p_adj + half_width
+}
+
+usage <- function() { #Print the command-line help text with cat(); takes no arguments
+    cat("Usage: bison_mbias2pdf [OPTIONS] output_from_bison_mbias.txt
+
+    Given the output of bison_mbias, graph it in R and estimate what regions to
+    ignore when extracting methylation. The graph includes the average
+    methylation level at each position as well as the (by default) 99.9%
+    confidence intervals, which are semi-transparent. Changing the -p value
+    changes the confidence intervals accordingly. Vertical lines may be drawn at
+    the bounds of the region suggested for inclusion in the methylation metrics.
+    The appropriate options for the methylation extractor are then printed to
+    the screen, for convenience.
+
+    -5  The 5' most bound on the + strand of the region for calculating the
+        baseline methylation level. The default is 0.2.
+
+    -3  The 3' most bound on the + strand of the region for calculating the
+        baseline methylation level. The default is 0.8.
+
+    -m  Minimum difference from expected methylation level to suggest trimming a
+        base. The default is 0.01 (i.e., 1%). Without a minimum, the script
+        would output spurious results when minimally biased data is processed.
+
+    -p  Minimum p-value for the test of whether a position's methylation is
+        different from expected. The default is 0.001.
+
+    -png Write output to multiple PNG files instead of to PDF.
+
+    -h  Print this message.
+")
+}
+
+#defaults
+left = 0.2 #-5: start of the baseline region, as a fraction of the read length
+right = 0.8 #-3: end of the baseline region, as a fraction of the read length
+do_png = 0 #-png: 1 writes one PNG per strand instead of a single PDF
+f = NULL #Input file: the first argument that isn't an option
+min_percent = 0.01 #-m: minimum methylation difference needed to suggest a cutoff
+min_p = 0.001 #-p: significance level used for the confidence intervals
+cmd = "" #This will hold the options for the methylation extractor
+
+args = commandArgs(trailingOnly=T)
+i=1
+while(i<=length(args)) {
+    if(args[i] == "-5") {
+        i = i+1 #Options taking a value consume the following argument
+        left = as.numeric(args[i])
+    } else if(args[i] == "-3") {
+        i = i+1
+        right = as.numeric(args[i])
+    } else if(args[i] == "-m") {
+        i = i+1
+        min_percent = as.numeric(args[i])
+    } else if(args[i] == "-p") {
+        i = i+1
+        min_p = as.numeric(args[i])
+    } else if(args[i] == "-png") {
+        do_png=1
+    } else if(args[i] == "-h") {
+        usage()
+        stop()
+    } else if(is.null(f)) {
+        f = args[i] #Only the first non-option argument is kept
+    }
+    i = i+1
+}
+
+#CI() needs 0 < min_p < 1: qnorm(1-0.5*0) is Inf, which makes every interval
+#non-finite, and a missing/non-numeric -p value is NA. Use the default then.
+if(is.na(min_p) || min_p >= 1 || min_p <= 0) {
+    min_p = 0.001
+}
+if(min_percent > 1) { #Presumably given as a percentage rather than a fraction
+    min_percent = min_percent/100
+    cat(sprintf("-m reset to %f since the original value was > 1!\n", min_percent))
+} else if(min_percent < 0) {
+    cat(sprintf("-m reset to 0.01 since you specified negative methylation, which makes no sense\n"))
+    min_percent = 0.01
+}
+
+if(is.null(f)) { #No input file was named on the command line
+    usage()
+    stop()
+} else {
+    d <- read.delim(f, header=T)
+    #Read and Strand must be factors; R >= 4.0 reads strings as plain character
+    d$Read <- factor(d$Read)
+    d$Strand <- relevel(factor(d$Strand), "OT")
+    #Fraction of calls that were methylated at each position
+    d$Methylation <- d$nMethylated/(d$nMethylated+d$nUnmethylated)
+    #Upper/Lower Confidence Interval
+    d$UpperCI <- CI(d, 1)
+    d$LowerCI <- CI(d, 0)
+
+    #Determine the output prefix
+    prefix = sub("_mbias.txt", "", f)
+
+    if(do_png == 0) { #A single PDF receives one page per strand
+        cat(sprintf("Output will be written to %s_mbias.pdf\n",prefix))
+        pdf(file=sprintf("%s_mbias.pdf", prefix))
+    }
+    for(lev in levels(d$Strand)) {
+        if(do_png == 1) { #One PNG file per strand
+            cat(sprintf("Output will be written to %s_%s_mbias.png\n", prefix, lev))
+            png(filename=sprintf("%s_%s_mbias.png", prefix, lev))
+        }
+        #Calculate the cutoffs
+        cutoff_inters = c() #x positions of the vertical cutoff lines
+        cutoff_types = c() #Line type label (L1-L4) of each cutoff
+        cutoff_cols = factor(c(), levels=c("1","2")) #Read that each cutoff belongs to
+        #read 1
+        USE <- intersect(which(d$Strand==lev), which(d$Read == 1))
+        lower <- floor(left * max(d$Position[USE]))
+        upper <- ceiling(right * max(d$Position[USE]))
+        USE2 <- intersect(USE, intersect(which(d$Position >= lower), which(d$Position <= upper))) #Baseline rows
+        av <- c(mean(d$Methylation[USE2]), min(d$LowerCI[USE2]), max(d$UpperCI[USE2])) #Baseline mean, lowest lower CI, highest upper CI
+        read1_5 <- 0
+        read1_3 <- 0
+        #Significantly below (to_remove holds indices into USE, not into d)
+        to_remove <- intersect(which(d$UpperCI[USE] < av[1]), which(d$Methylation[USE] < av[2]))
+        #Significantly above
+        to_remove <- append(to_remove, intersect(which(d$LowerCI[USE] > av[1]), which(d$Methylation[USE] > av[3])))
+        to_remove <- unique(to_remove)
+        #Difference threshold
+        to_remove <- intersect(to_remove, which(abs(d$Methylation[USE] - av[1]) > min_percent))
+        midway = floor(0.5*max(d$Position[USE]))
+        #5': start just after the last flagged position before the midpoint
+        if(any(d$Position[USE][to_remove] < midway)) {
+            read1_5 = d$Position[USE][max(to_remove[which(d$Position[USE][to_remove] < midway)])]+1
+            cutoff_inters <- append(cutoff_inters, read1_5)
+            cutoff_types <- append(cutoff_types, "L1")
+            cutoff_cols <- append(cutoff_cols, "1")
+        }
+        #3': end just before the first flagged position at or after the midpoint
+        if(any(d$Position[USE][to_remove] >= midway)) {
+            read1_3 = d$Position[USE][min(to_remove[which(d$Position[USE][to_remove] >= midway)])]-1
+            cutoff_inters <- append(cutoff_inters, read1_3)
+            cutoff_types <- append(cutoff_types, "L2")
+            cutoff_cols <- append(cutoff_cols, "1")
+        }
+
+        #read 2 (same procedure as read 1, skipped when there are no read 2 rows)
+        USE <- intersect(which(d$Strand==lev), which(d$Read == 2))
+        read2_5 <- 0
+        read2_3 <- 0
+        if(length(USE) > 0) {
+            lower <- floor(left * max(d$Position[USE]))
+            upper <- ceiling(right * max(d$Position[USE]))
+            USE2 <- intersect(USE, intersect(which(d$Position >= lower), which(d$Position <= upper)))
+            av <- c(mean(d$Methylation[USE2]), min(d$LowerCI[USE2]), max(d$UpperCI[USE2]))
+            #Significantly below
+            to_remove <- intersect(which(d$UpperCI[USE] < av[1]), which(d$Methylation[USE] < av[2]))
+            #Significantly above
+            to_remove <- append(to_remove, intersect(which(d$LowerCI[USE] > av[1]), which(d$Methylation[USE] > av[3])))
+            to_remove <- unique(to_remove)
+            #Difference threshold
+            to_remove <- intersect(to_remove, which(abs(d$Methylation[USE] - av[1]) > min_percent))
+            midway = floor(0.5*max(d$Position[USE]))
+            #5'
+            if(any(d$Position[USE][to_remove] < midway)) {
+                read2_5 = d$Position[USE][max(to_remove[which(d$Position[USE][to_remove] < midway)])]+1
+                cutoff_inters <- append(cutoff_inters, read2_5)
+                cutoff_types <- append(cutoff_types, "L3")
+                cutoff_cols <- append(cutoff_cols, "2")
+            }
+            #3'
+            if(any(d$Position[USE][to_remove] >= midway)) {
+                read2_3 = d$Position[USE][min(to_remove[which(d$Position[USE][to_remove] >= midway)])]-1
+                cutoff_inters <- append(cutoff_inters, read2_3)
+                cutoff_types <- append(cutoff_types, "L4")
+                cutoff_cols <- append(cutoff_cols, "2")
+            }
+        }
+
+        #Make the plot
+        USE <- which(d$Strand==lev)
+        cutoffs <- data.frame(x=cutoff_inters, types=cutoff_types, cols= cutoff_cols)
+        g <- ggplot(d[USE,], aes(x=Position, y=Methylation, ymin=max(min(LowerCI), 0), ymax=min(max(UpperCI), 1), group=Read))
+        g <- g + geom_ribbon(aes(ymin=LowerCI, ymax=UpperCI, alpha=0.9, fill=Read))
+        g <- g + geom_line(aes(colour=Read))
+        g <- g + scale_alpha(guide='none')
+        g <- g + scale_colour_discrete(guide='none')
+        if(length(unique(d[USE, 2])) == 2) { #Column 2 is presumably Read, i.e. paired-end data - verify
+            g <- g + scale_fill_discrete(name="Read", labels=c("#1","#2"))
+        } else {
+            g <- g + scale_fill_discrete(guide='none')
+        }
+        g <- g + xlab("Position along mapped read (5'->3' of + strand)")
+        g <- g + scale_x_continuous(breaks=seq(0, max(d$Position[USE]), 10))
+        g <- g + ggtitle(sprintf("%s strand", lev))
+
+        #Add the cutoffs
+        if(length(cutoff_inters) > 0) {
+            g <- g + geom_vline(data=cutoffs, aes(xintercept=x, colour=cols, linetype=types), show_guide=T)
+            g <- g + scale_linetype_discrete(name="Cutoffs", labels=sprintf("%i", cutoffs$x))
+            g <- g + guides(fill=guide_legend(override.aes=list(linetype=0)))
+
+            cmd <- sprintf("%s-%s %i,%i,%i,%i ", cmd, lev, read1_5, read1_3, read2_5, read2_3) #A bound of 0 means no cutoff was found
+        }
+        print(g)
+        if(do_png == 1) {
+            suppressMessages(dev.off())
+        }
+    }
+    if(do_png == 0) {
+        suppressMessages(dev.off())
+    }
+    if(cmd != "") {
+        cat(sprintf("Suggested methylation extractor parameters: %s\n", cmd))
+    }
+}
diff --git a/aux.c b/aux.c
new file mode 100644
index 0000000..3c5f9f5
--- /dev/null
+++ b/aux.c
@@ -0,0 +1,201 @@
+#include "bison.h"
+
+KSTREAM_INIT(gzFile, gzread, 16384)
+KHASH_MAP_INIT_STR(ref, uint64_t)
+FILE *popen_fd;
+
+/* State for reading SAM text from a pipe. sam_popen() allocates and fills it
+ * in; sam_pclose() releases it. */
+struct __tamFile_t {
+        gzFile fp;          //zlib handle that sam_popen() obtains from gzdopen() on a dup()ed pipe descriptor
+        kstream_t *ks;      //kstream created over fp with ks_init()
+        kstring_t *str;     //heap-allocated kstring_t; sam_pclose() frees str->s and then str
+        uint64_t n_lines;   //set to 0 by sam_popen()
+        int is_first;       //set to 1 by sam_popen()
+};
+
+/******************************************************************************
+*
+*   Report how many worker nodes will actually be used, which can be fewer
+*   than the number allocated: the master node is excluded and the rest is
+*   rounded down to a multiple of 2 (config.directional set) or 4 (otherwise).
+*   A DEBUG build always reports 8.
+*
+*******************************************************************************/
+#ifdef DEBUG
+int effective_nodes() {
+    return 8;
+}
+#else
+int effective_nodes() {
+    int nworkers;
+    int group_size = (config.directional) ? 2 : 4;
+
+    MPI_Comm_size(MPI_COMM_WORLD, &nworkers);
+    nworkers -= 1; //The master node does not count
+
+    //Drop whatever does not fill a complete group
+    return nworkers - (nworkers % group_size);
+}
+#endif
+
+/******************************************************************************
+*
+*   Release resources, finalize MPI and exit the process.
+*
+*   int FLAG: What to free/close/etc.
+*             0x1 the FASTQ/unmapped file names, basename and outname held in
+*                 config are freed; outside of DEBUG builds the master node
+*                 also remove()s the FASTQ files named in config.
+*             Any non-zero value makes the master node close OUTPUT_BAM (if
+*             set) and, after MPI_Finalize(), the DEBUG BAM files or the
+*             unmapped-read pipes. 0x2 is not tested on its own here.
+*             Regardless of FLAG, config.bowtie2_options is freed and the
+*             master node frees chromosomes.genome and everything in the
+*             chromosomes struct.
+*
+*   int rv: return value handed to exit()
+*
+*******************************************************************************/
+void quit(int FLAG, int rv) {
+    int taskid, i;
+    int is_master;
+
+    free(config.bowtie2_options);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
+    is_master = (taskid == MASTER);
+
+    if(FLAG & 1) { //FASTQ filenames set
+#ifndef DEBUG
+        if(is_master) {
+            //Delete the FASTQ files named in config
+            if(config.FASTQ1CT != NULL) remove(config.FASTQ1CT);
+            if(config.paired && (config.FASTQ2GA != NULL)) remove(config.FASTQ2GA);
+            if(!config.directional) {
+                if(config.FASTQ1GA != NULL) remove(config.FASTQ1GA);
+                if(config.paired && (config.FASTQ2CT != NULL)) remove(config.FASTQ2CT);
+            }
+        }
+#endif
+        //free(NULL) is a no-op, so the names need no NULL guards
+        free(config.FASTQ1CT);
+        free(config.FASTQ1GA);
+        free(config.unmapped1);
+        if(config.paired) {
+            free(config.FASTQ2CT);
+            free(config.FASTQ2GA);
+            free(config.unmapped2);
+        }
+        free(config.basename);
+        free(config.outname);
+    }
+
+    if(is_master) {
+        free(chromosomes.genome);
+        for(i=0; i<chromosomes.nchromosomes; i++) {
+            free(chromosomes.chromosome[i]->chrom);
+            free(chromosomes.chromosome[i]);
+        }
+        free(chromosomes.chromosome);
+        if(FLAG && OUTPUT_BAM) bam_close(OUTPUT_BAM);
+    }
+
+    MPI_Finalize();
+
+    if(is_master && FLAG > 0) {
+#ifdef DEBUG
+        if(fp1) bam_close(fp1);
+        if(fp2) bam_close(fp2);
+        if(!config.directional) {
+            if(fp3) bam_close(fp3);
+            if(fp4) bam_close(fp4);
+        }
+#else
+        if(config.unmapped) {
+            pclose(unmapped1);
+            if(config.paired) pclose(unmapped2);
+        }
+#endif
+    }
+    exit(rv);
+}
+
+//Percentage that part makes up of total (not finite when total is 0)
+static float metrics_percent(unsigned long long part, unsigned long long total) {
+    return ((float) (100*part))/((float) total);
+}
+
+//Write the alignment and methylation summary to a single stream
+static void write_metrics(FILE *out, unsigned long long m_reads) {
+    const char *kind = (config.paired) ? "paired-end reads" : "reads";
+
+    fprintf(out, "Alignment:\n");
+    fprintf(out, "\t%llu total %s analysed\n", t_reads, kind);
+    fprintf(out, "\t%llu %s mapped (%6.2f%%).\n", m_reads, kind, metrics_percent(m_reads, t_reads));
+    fprintf(out, "\n");
+
+    fprintf(out, "Number of hits aligning to each of the orientations:\n");
+    fprintf(out, "\t%llu\t%6.2f%%\tOT (original top strand)\n", m_reads_OT, metrics_percent(m_reads_OT, t_reads));
+    fprintf(out, "\t%llu\t%6.2f%%\tOB (original bottom strand)\n", m_reads_OB, metrics_percent(m_reads_OB, t_reads));
+    if(!config.directional) {
+        fprintf(out, "\t%llu\t%6.2f%%\tCTOT (complementary to the original top strand)\n", m_reads_CTOT, metrics_percent(m_reads_CTOT, t_reads));
+        fprintf(out, "\t%llu\t%6.2f%%\tCTOB (complementary to the original bottom strand)\n", m_reads_CTOB, metrics_percent(m_reads_CTOB, t_reads));
+    }
+    fprintf(out, "\n");
+
+    fprintf(out, "Cytosine Methylation (N.B., statistics from overlapping mates are added together!):\n");
+    fprintf(out, "\tNumber of C's in a CpG context: %llu\n", t_CpG);
+    fprintf(out, "\tPercentage of methylated C's in a CpG context: %6.2f%%\n", metrics_percent(m_CpG, t_CpG));
+    fprintf(out, "\tNumber of C's in a CHG context: %llu\n", t_CHG);
+    fprintf(out, "\tPercentage of methylated C's in a CHG context: %6.2f%%\n", metrics_percent(m_CHG, t_CHG));
+    fprintf(out, "\tNumber of C's in a CHH context: %llu\n", t_CHH);
+    fprintf(out, "\tPercentage of methylated C's in a CHH context: %6.2f%%\n", metrics_percent(m_CHH, t_CHH));
+}
+
+/******************************************************************************
+*
+*   Write the alignment/methylation summary to <config.odir><config.basename>.txt
+*   and, unless config.quiet is set, to stdout as well.
+*
+*   If the file name cannot be allocated or the file cannot be opened, a
+*   message is printed to stderr and only the stdout copy is produced.
+*
+*******************************************************************************/
+void print_metrics() {
+    unsigned long long m_reads = m_reads_OT + m_reads_OB + m_reads_CTOT + m_reads_CTOB;
+    char *of = malloc(sizeof(char) * (strlen(config.odir)+strlen(config.basename)+strlen(".txt")+1));
+    FILE *fp = NULL;
+
+    if(!config.quiet) write_metrics(stdout, m_reads);
+
+    if(of == NULL) {
+        fprintf(stderr, "Could not allocate memory for the metrics file name!\n");
+        return;
+    }
+    sprintf(of, "%s%s.txt", config.odir, config.basename);
+    fp = fopen(of, "w");
+    if(fp == NULL) {
+        fprintf(stderr, "Could not open %s for writing!\n", of);
+    } else {
+        write_metrics(fp, m_reads);
+        fclose(fp);
+    }
+    free(of);
+}
+
+/******************************************************************************
+*
+*   Run cmd through popen() and wrap its output for reading through zlib and
+*   kstream. The FILE* from popen() is kept in the global popen_fd.
+*
+*   Returns NULL if the command could not be started or any of the wrapping
+*   steps failed; in that case nothing is left allocated or open.
+*
+*   THE RESULT MUST BE RELEASED WITH sam_pclose()
+*
+*   char *cmd: The command to run
+*
+*******************************************************************************/
+tamFile sam_popen(char *cmd) {
+    tamFile fp = NULL;
+    int fid2;
+
+    popen_fd = popen(cmd, "r"); //Global
+    if(popen_fd == NULL) return NULL;
+
+    fp = calloc(1, sizeof(struct __tamFile_t));
+    if(fp == NULL) goto fail;
+    fp->str = (kstring_t*) calloc(1, sizeof(kstring_t));
+    if(fp->str == NULL) goto fail;
+
+    fid2 = dup(fileno(popen_fd)); //otherwise, the file descriptor is closed by zlib and pclose() won't work!!
+    if(fid2 < 0) goto fail;
+    fp->fp = gzdopen(fid2, "r");
+    if(fp->fp == NULL) {
+        close(fid2); //gzdopen() did not take ownership
+        goto fail;
+    }
+    fp->ks = ks_init(fp->fp);
+    fp->n_lines = 0;
+    fp->is_first = 1;
+    return fp;
+
+fail:
+    if(fp != NULL) {
+        free(fp->str);
+        free(fp);
+    }
+    pclose(popen_fd);
+    popen_fd = NULL;
+    return NULL;
+}
+
+//Tear down a handle from sam_popen(): kstream, zlib stream, the pipe in the
+//global popen_fd and finally the line buffer and the handle itself.
+//A NULL handle is ignored.
+void sam_pclose(tamFile fp) {
+    if(fp == NULL) return;
+
+    ks_destroy(fp->ks);
+    gzclose(fp->fp);
+    pclose(popen_fd); //global
+    free(fp->str->s);
+    free(fp->str);
+    free(fp);
+}
diff --git a/auxiliary/CpG_coverage.c b/auxiliary/CpG_coverage.c
new file mode 100644
index 0000000..c071751
--- /dev/null
+++ b/auxiliary/CpG_coverage.c
@@ -0,0 +1,119 @@
+#include "../bison.h"
+#include "sam.h"
+
+//This will hold the coverage. The last bin is actually for everything >250
+//coverage[c] counts the CpGs seen with a combined coverage of c (0..250);
+//coverage[251] counts those with more than 250 and is printed as "251+".
+unsigned long long coverage[252];
+//The most recently parsed bedGraph line, filled in by next_line()
+struct {
+    char *chrom;                 //column 1; points into the buffer given to next_line()
+    unsigned long long position; //column 2
+    unsigned long long end;      //column 3
+    unsigned long coverage;      //column 5 + column 6
+} cur_line;
+
+/* Read the next usable bedGraph line from fp into cur_line.
+ *
+ * buffer must hold at least 1024 chars; cur_line.chrom points into it, so it
+ * must stay alive while cur_line is in use. Lines with fewer than six columns
+ * are skipped. At end of file cur_line is set to an empty chromosome name
+ * with zero position/end/coverage, so callers comparing chromosome names stop
+ * matching instead of re-using the last record forever. */
+void next_line(FILE *fp, char *buffer) {
+    char *start, *end, *n_methylated, *n_unmethylated;
+
+    while(fgets(buffer, 1024, fp) != NULL) {
+        cur_line.chrom = strtok(buffer, "\t");
+        start = strtok(NULL, "\t");
+        end = strtok(NULL, "\t");
+        strtok(NULL, "\t"); //4th column is not used
+        n_methylated = strtok(NULL, "\t");
+        n_unmethylated = strtok(NULL, "\n");
+        //Once strtok() runs out of columns every later call is NULL too
+        if(n_unmethylated == NULL) continue;
+
+        cur_line.position = strtoull(start, NULL, 10);
+        cur_line.end = strtoull(end, NULL, 10);
+        cur_line.coverage = strtoul(n_methylated, NULL, 10);
+        cur_line.coverage += strtoul(n_unmethylated, NULL, 10);
+        return;
+    }
+
+    //Nothing left to read
+    buffer[0] = '\0';
+    cur_line.chrom = buffer;
+    cur_line.position = 0;
+    cur_line.end = 0;
+    cur_line.coverage = 0;
+}
+
+//Print the command line help to stdout
+void usage(char *prog) {
+    printf("Usage: %s genome_directory input.bedGraph output.txt\n", prog);
+    printf("\n"
+           "    Calculate a histogram of per-CpG coverage. N.B., the genome and bedGraph\n"
+           "    file need to be in the same order (they will be if the bedGraph file was\n"
+           "    produced with bison and the same genome is used).\n"
+           "\n"
+           "    -h            Print this message.\n"
+           "\n");
+}
+
+/* Build a histogram of per-CpG coverage.
+ *
+ * argv[1] is the genome directory, argv[2] the input bedGraph and argv[3] the
+ * output file. Every CG in the genome is looked up in the bedGraph (both have
+ * to be in the same order) and counted in the coverage[] bin of its combined
+ * coverage. Returns 0 on success, 1 on usage or file errors and -1 if the
+ * genome buffer cannot be allocated. */
+int main(int argc, char *argv[]) {
+    FILE *fp = NULL;
+    FILE *ofile = NULL;
+    int32_t i = 0;
+    uint32_t j = 0;
+    char *GenomeChrom = NULL;
+    unsigned long temp_coverage = 0;
+    char *line = NULL;
+    unsigned long long k;
+    unsigned long long nCpGs = 0;
+
+    config.genome_dir = NULL;
+    chromosomes.nchromosomes = 0;
+
+    /* read in the file names */
+    if(argc < 4 || strcmp(argv[1], "-h") == 0) {
+        usage(argv[0]);
+        return 1;
+    }
+    config.genome_dir = argv[1];
+    fp = fopen(argv[2], "r");
+    if(fp == NULL) {
+        printf("Could not open %s for reading!\n", argv[2]);
+        return 1;
+    }
+    ofile = fopen(argv[3], "w");
+    if(ofile == NULL) {
+        printf("Could not open %s for writing!\n", argv[3]);
+        fclose(fp);
+        return 1;
+    }
+    line = malloc(sizeof(char) * 1024);
+    if(line == NULL) {
+        printf("Could not allocate a line buffer!\n");
+        fclose(fp);
+        fclose(ofile);
+        return 1;
+    }
+
+    for(i=0; i<252; i++) coverage[i] = 0;
+
+    //Read in the genome
+    chromosomes.max_genome = 3000000000;
+    printf("Allocating space for %llu characters\n", chromosomes.max_genome); fflush(stdout);
+    chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome);
+    if(chromosomes.genome == NULL) {
+        printf("Could not allocate enough room to hold the genome!\n");
+        free(line);
+        fclose(fp);
+        fclose(ofile);
+        return -1;
+    }
+    *chromosomes.genome = '\0'; //only touch the buffer once we know it exists
+    read_genome();
+
+    //Start reading in the file
+    next_line(fp, line);
+
+    //Iterate through the genome
+    for(i=0; i<chromosomes.nchromosomes; i++) {
+        GenomeChrom = chromosomes.chromosome[i]->chrom;
+        j = chromosomes.chromosome[i]->offset;
+        k = 0; //0-based chromosome position
+        //NOTE(review): j starts at the chromosome's offset but is bounded by ->length alone;
+        //confirm against read_genome() whether this should be offset+length-1.
+        while(j < chromosomes.chromosome[i]->length - 1) {
+            if(*(chromosomes.genome+j) == 'C' && *(chromosomes.genome+j+1) == 'G') {
+                nCpGs++;
+                while(strcmp(cur_line.chrom, GenomeChrom) == 0 && k > cur_line.position) next_line(fp, line); //We should never go beyond 1 line...
+                if(strcmp(cur_line.chrom, GenomeChrom) == 0 && (k == cur_line.position || k == cur_line.position-1)) {
+                    temp_coverage = cur_line.coverage;
+                    if(cur_line.end-cur_line.position == 1) { //Single-C resolution rather than merged as CpGs
+                        next_line(fp, line);
+                        if(strcmp(cur_line.chrom, GenomeChrom) == 0 && k == cur_line.position-1) {
+                            temp_coverage += cur_line.coverage;
+                        }
+                    }
+                    if(temp_coverage > 250) temp_coverage = 251;
+                    coverage[temp_coverage]++;
+                } else {
+                    coverage[0]++;
+                }
+            }
+            j++;
+            k++;
+        }
+    }
+
+    //Print some output
+    for(i=0; i<251; i++) fprintf(ofile, "%i\t%llu\n", i, coverage[i]);
+    fprintf(ofile, "251+\t%llu\n", coverage[251]);
+    printf("There were %llu CpGs\n", nCpGs);
+
+    //Close things up
+    free(line);
+    free(chromosomes.genome);
+    for(i=0; i<chromosomes.nchromosomes; i++) {
+        free((chromosomes.chromosome[i])->chrom);
+        free(*(chromosomes.chromosome+i));
+    }
+    free(chromosomes.chromosome);
+    fclose(fp);
+    fclose(ofile);
+
+    return 0;
+}
diff --git a/auxiliary/bedGraph2BSseq.py b/auxiliary/bedGraph2BSseq.py
new file mode 100755
index 0000000..3edc50d
--- /dev/null
+++ b/auxiliary/bedGraph2BSseq.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+import argparse
+import csv
+import sys
+
+parser = argparse.ArgumentParser(description='Convert a series of bedGraph files into input files appropriate for BSseq.')
+parser.add_argument('-chr', metavar='chromosome', help="Only output this chromosome (e.g. chr17) instead of all of them.")
+parser.add_argument('prefix', metavar='prefix', help="Output prefix")
+parser.add_argument('files', metavar='files', nargs='*', help="Input bedGraph files. There must be at least 2.")
+args = parser.parse_args()
+
+if((args.prefix == None) or (args.files == None) or (len(args.files) < 2)) :
+    parser.print_help()
+    sys.exit()
+
+files = []
+for f in args.files :
+    files.append(csv.reader(open(f, "r"), dialect="excel-tab"))
+ofM = open("%s.M" % (args.prefix), "w")
+ofCov = open("%s.Cov" % (args.prefix), "w")
+ofbed = open("%s.bed" % (args.prefix), "w")
+
+lines = []
+for f in files :
+    line = f.next()
+    lines.append([line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])])
+
+#Add a header
+first = 1
+for f in args.files :
+    if(first == 1) :
+        ofM.write("%s" % f)
+        ofCov.write("%s" % f)
+        first = 0
+    else :
+        ofM.write("\t%s" % f)
+        ofCov.write("\t%s" % f)
+ofM.write("\n")
+ofCov.write("\n")
+
+n_finished = 0
+n_total = len(files)
+while(n_finished < n_total) :
+    i = 0
+    lowest = 0
+    #Determine the appropriate starting point
+    while(i<n_total) :
+        if(lines[i][0] != None) :
+            if(lines[i][0] < lines[lowest][0]) :
+                lowest = i
+            elif(lines[i][0] == lines[lowest][0] and lines[i][1] < lines[lowest][1]) :
+                lowest = i
+        elif(lowest == i) :
+                lowest += 1
+        i += 1
+
+    current = lines[lowest]
+    if(lines[lowest][0] == None) :
+        print("Oh shit, this shouldn't happen!")
+        print(lowest, lines)
+        break
+
+    output = 0
+    if(args.chr != None) :
+        if(lines[lowest][0] == args.chr) :
+            ofbed.write("%s\t%i\t%i\n" % (lines[lowest][0], lines[lowest][1], lines[lowest][2])) #Now 1-based
+            output = 1
+    else :
+        ofbed.write("%s\t%i\t%i\n" % (lines[lowest][0], lines[lowest][1], lines[lowest][2])) #Now 1-based
+        output = 1
+
+    if(output == 1) :
+        i = 0
+        while(i < n_total) :
+            if(i != 0 and output == 1) :
+                ofM.write("\t")
+                ofCov.write("\t")
+    
+            if(lines[i][0] != None) :
+                if(lines[i][0] == current[0] and lines[i][1] == current[1]) :
+                    ofM.write("%i" % (lines[i][3]))
+                    ofCov.write("%i" % (lines[i][3] + lines[i][4]))
+                    try :
+                        line = files[i].next()
+                        lines[i] = [line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])]
+                    except :
+                        lines[i][0] = None
+                        n_finished += 1
+                else :
+                    ofM.write("0")
+                    ofCov.write("0")
+            else :
+                ofM.write("0")
+                ofCov.write("0")
+
+            i += 1
+        ofM.write("\n")
+        ofCov.write("\n")
+    else :
+        #We're on the wrong chromosome
+        i = 0
+        while(i < n_total) :
+            if(lines[i][0] != None) :
+                if(lines[i][0] == current[0] and lines[i][1] == current[1]) :
+                    try :
+                        line = files[i].next()
+                        lines[i] = [line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])]
+                    except :
+                        lines[i][0] = None
+                        n_finished += 1
+            i += 1
+
+ofM.close()
+ofCov.close()
+ofbed.close()
diff --git a/auxiliary/bedGraph2methylKit.c b/auxiliary/bedGraph2methylKit.c
new file mode 100644
index 0000000..5d03b6a
--- /dev/null
+++ b/auxiliary/bedGraph2methylKit.c
@@ -0,0 +1,161 @@
+#include "../bison.h"
+#include "sam.h"
+
+//One parsed bedGraph line
+struct CpG {
+    int tid;                     //index into chromosomes.chromosome, as returned by char2tid()
+    int start;                   //column 2, filled in by process_line()
+    int end;                     //column 3, filled in by process_line()
+    unsigned int n_methylated;   //column 5, filled in by process_line()
+    unsigned int n_unmethylated; //column 6, filled in by process_line()
+};
+
+//Print the command line help to stdout
+void usage(char *prog) {
+    printf("Usage: %s genome_directory file.bedGraph\n", prog);
+    printf("\n"
+           "    Convert a CpG bedGraph file to the format required for methylKit.\n"
+           "    The CpGs in the file should not be merged (i.e., they should represent\n"
+           "    individual strand)! See the methylKit documentation for the file format.\n"
+           "\n"
+           "    -h            Print this message.\n"
+           "\n");
+}
+
+/* Derive the output file name from iname and open it for writing.
+ *
+ * A trailing ".bedGraph" or ".bedgraph" is replaced by ".methylKit"; any other
+ * name (including one without a '.') gets ".methylKit" appended. The chosen
+ * name is printed to stdout.
+ *
+ * Returns the opened stream, or NULL if the name could not be allocated or
+ * the file could not be opened. The caller fclose()s the result. */
+FILE * generate_output_name(char *iname) {
+    static const char suffix[] = ".methylKit";
+    FILE *of = NULL;
+    char *oname = NULL;
+    char *ext = strrchr(iname, '.');
+    size_t stem_len = strlen(iname);
+
+    if(ext != NULL && (strcmp(ext, ".bedGraph") == 0 || strcmp(ext, ".bedgraph") == 0)) {
+        stem_len = (size_t) (ext - iname); //drop the extension
+    }
+
+    oname = malloc(stem_len + sizeof(suffix)); //sizeof() counts the trailing NUL
+    if(oname == NULL) return NULL;
+    memcpy(oname, iname, stem_len);
+    memcpy(oname + stem_len, suffix, sizeof(suffix));
+
+    printf("Output will be written to %s\n", oname);
+    of = fopen(oname, "w");
+    free(oname);
+    return of;
+}
+
+//Given a chromosome name, return its index in the chromosomes struct, or
+//chromosomes.nchromosomes if no chromosome has that name.
+//It would make sense to memoize the result to prevent continuous lookup
+inline int char2tid(char *chrom) {
+    int tid = 0;
+
+    while(tid < chromosomes.nchromosomes && strcmp(chromosomes.chromosome[tid]->chrom, chrom) != 0) {
+        tid++;
+    }
+    return tid;
+}
+
+/* Parse columns 2-6 of a bedGraph line into current_line.
+ *
+ * The caller must already have called strtok(line, "\t") to take the first
+ * column; this function continues that tokenization with strtok(NULL, ...),
+ * which is why line itself is not read here. A column that is missing is
+ * stored as 0 rather than being passed to atoi() as NULL. */
+inline void process_line(char *line, struct CpG *current_line) {
+    char *col;
+
+    (void) line;
+
+    //start
+    col = strtok(NULL, "\t");
+    current_line->start = (col != NULL) ? atoi(col) : 0;
+
+    //end
+    col = strtok(NULL, "\t");
+    current_line->end = (col != NULL) ? atoi(col) : 0;
+
+    //1000*methylation percentage, not used
+    strtok(NULL, "\t");
+
+    //n_methylated
+    col = strtok(NULL, "\t");
+    current_line->n_methylated = (col != NULL) ? atoi(col) : 0;
+
+    //n_unmethylated
+    col = strtok(NULL, "\t");
+    current_line->n_unmethylated = (col != NULL) ? atoi(col) : 0;
+}
+
+/* Convert a per-C bedGraph file into a methylKit input file.
+ *
+ * Arguments are the genome directory and the bedGraph file; the output name
+ * is derived by generate_output_name(). For each line the genome base at the
+ * start position decides the strand ('F' for C, 'R' otherwise). Lines on a
+ * chromosome that is not in the genome are skipped. Returns 0 on success (and
+ * for -h), 1 on usage or file errors and -1 if the genome buffer cannot be
+ * allocated. */
+int main(int argc, char *argv[]) {
+    int i, last_tid = 0;
+    char *fname = NULL, *line = NULL;
+    char *chrom, *last_chrom = NULL;
+    char base, strand;
+    unsigned long long offset;
+    unsigned int coverage;
+    FILE *of = NULL, *ifile = NULL;
+    struct CpG current_line;
+
+    config.genome_dir = NULL;
+    chromosomes.nchromosomes = 0;
+
+    /* read in the file names */
+    if(argc < 3) {
+        usage(argv[0]);
+        return 0;
+    }
+    for(i=1; i<argc; i++) {
+        if(strcmp(argv[i], "-h") == 0) {
+            usage(argv[0]);
+            return 0;
+        } else if(config.genome_dir == NULL) {
+            config.genome_dir = argv[i];
+        } else if(fname == NULL) {
+            fname = argv[i];
+        } else {
+            printf("Got an unknown option: %s\n", argv[i]);
+            usage(argv[0]);
+            return 1;
+        }
+    }
+
+    if(config.genome_dir == NULL || fname == NULL) {
+        printf("Genome directory or bedGraph input file not specified!\n");
+        usage(argv[0]);
+        return 1;
+    }
+
+    //Open the input first so a bad name fails before anything is created
+    ifile = fopen(fname, "r");
+    if(ifile == NULL) {
+        printf("Could not open %s for reading!\n", fname);
+        return 1;
+    }
+
+    //Generate the output names and open the output files
+    of = generate_output_name(fname);
+    line = malloc(sizeof(char) * MAXREAD);
+    if(of == NULL || line == NULL) {
+        printf("Could not open the output file or allocate a line buffer!\n");
+        if(of != NULL) fclose(of);
+        free(line);
+        fclose(ifile);
+        return 1;
+    }
+
+    //Read in the genome
+    chromosomes.max_genome = 3000000000;
+    printf("Allocating space for %llu characters\n", chromosomes.max_genome); fflush(stdout);
+    chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome);
+    if(chromosomes.genome == NULL) {
+        printf("Could not allocate enough room to hold the genome!\n");
+        free(line);
+        fclose(of);
+        fclose(ifile);
+        return -1;
+    }
+    *chromosomes.genome = '\0'; //only touch the buffer once we know it exists
+    read_genome();
+
+    fprintf(of, "chrBase\tchr\tbase\tstrand\tcoverage\tfreqC\tfreqT\n");
+    while(fgets(line, MAXREAD, ifile) != NULL) {
+        chrom = strtok(line, "\t");
+        if(chrom == NULL) continue;
+        if(last_chrom == NULL || strcmp(chrom, last_chrom) != 0) {
+            last_tid = char2tid(chrom);
+            free(last_chrom);
+            last_chrom = strdup(chrom);
+            if(last_tid >= chromosomes.nchromosomes) {
+                printf("Skipping lines for %s, which is not in the genome\n", chrom);
+            }
+        }
+        //char2tid() answers nchromosomes for unknown names, which is not a valid index
+        if(last_tid >= chromosomes.nchromosomes) continue;
+        current_line.tid = last_tid;
+        process_line(line, &current_line);
+
+        //Determine the strand
+        offset = chromosomes.chromosome[last_tid]->offset;
+        base = toupper((unsigned char) *(chromosomes.genome+offset+current_line.start));
+        strand = (base=='C') ? 'F' : 'R';
+        coverage = current_line.n_methylated + current_line.n_unmethylated;
+        fprintf(of, "%s.%i\t%s\t%i\t%c\t%u\t%5.2f\t%5.2f\n", chrom, current_line.start+1, chrom, current_line.start+1, strand, \
+            coverage, \
+            100*((float) current_line.n_methylated)/(float)(coverage), \
+            100*((float) current_line.n_unmethylated)/(float)(coverage));
+    }
+
+    //Close things up
+    free(line);
+    free(last_chrom);
+    fclose(of);
+    fclose(ifile);
+    free(chromosomes.genome);
+    for(i=0; i<chromosomes.nchromosomes; i++) {
+        free((chromosomes.chromosome[i])->chrom);
+        free(*(chromosomes.chromosome+i));
+    }
+    free(chromosomes.chromosome);
+
+    return 0;
+}
diff --git a/auxiliary/make_reduced_genome.c b/auxiliary/make_reduced_genome.c
new file mode 100644
index 0000000..4eea5e1
--- /dev/null
+++ b/auxiliary/make_reduced_genome.c
@@ -0,0 +1,347 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#define MAXLINE 512
+#define MAXChromosome 400000000
+
+//Print the command line help to stdout
+void usage(char *prog_name) {
+    printf("Usage: %s (options) GENOME.FA OUTPUT.FA\n",prog_name);
+    printf("\t-n X	Maximum number of bases in each read (prior to CG/TG/etc. trimming)\n\t\tDefault is 36. N.B. 10%% more is used to the output.\n");
+    printf("\t-TaqI	Create a reduced representation genome that was cut by TaqI as well as MspI.\n");
+    printf("\t-h	Print this message\n");
+}
+
+//Append a fragment, exactly as stored, to the output file
+void output_fragment(FILE *of, char *fragment) {
+    fputs(fragment, of);
+}
+
+/* Walk from the start of f until read_size non-newline characters have been
+ * passed and return the index reached. The result never exceeds the index of
+ * the last character; an empty string gives 0. */
+unsigned long get_left_mask(char *f, int read_size) {
+    unsigned long pos = 0, n_bases = 0;
+    unsigned long len = strlen(f), last;
+
+    if(len == 0) return 0; //len-1 below would wrap around
+    last = len - 1;
+
+    while(n_bases < (unsigned long) read_size) {
+        if(f[pos] != '\n') n_bases++; //newlines do not count as bases
+        pos++;
+        if(pos >= last) return last;
+    }
+
+    return pos;
+}
+
+/* Walk backwards from the last character of f until read_size non-newline
+ * characters have been passed and return the index reached. The walk stops at
+ * index 0; strings shorter than two characters give 0. */
+unsigned long get_right_mask(char *f, int read_size) {
+    unsigned long len = strlen(f);
+    unsigned long pos, n_bases = 0;
+
+    if(len < 2) return 0; //stepping back from index 0 would wrap around
+    pos = len - 1;
+
+    while(n_bases < (unsigned long) read_size) {
+        if(f[pos] != '\n') n_bases++; //newlines do not count as bases
+        pos--;
+        if(pos == 0) break;
+    }
+
+    return pos;
+}
+
+/* Write one fragment to of, replacing its middle with N's.
+ *
+ * Fragments of at most read_size+1 characters (and empty ones) are written
+ * unchanged. Otherwise everything from get_left_mask() to get_right_mask(),
+ * inclusive, is overwritten with 'N' except for newlines, provided the left
+ * index is below the right one. The fragment is modified in place. */
+void process_fragment(FILE *of, char *fragment, int read_size) {
+    unsigned long i, left_mask, right_mask;
+    unsigned long len = strlen(fragment);
+
+    //Nothing to mask; the len == 0 test keeps len-1 from wrapping around
+    if(len == 0 || len-1 <= (unsigned long) read_size) {
+        output_fragment(of, fragment);
+        return;
+    }
+
+    //Determine the masking coordinates
+    left_mask = get_left_mask(fragment, read_size);
+    right_mask = get_right_mask(fragment, read_size);
+
+    if(left_mask < right_mask) {
+        for(i=left_mask; i<=right_mask; i++) {
+            if(fragment[i] != '\n') fragment[i] = 'N';
+        }
+    }
+    output_fragment(of, fragment);
+}
+
+/* If cp points at a recognition site, return the pattern that matched
+ * (including any embedded line break), otherwise NULL. TaqI sites (TCGA) are
+ * only considered when Taq is set; MspI sites (CCGG) always are. */
+static const char *match_cut_site(const char *cp, int Taq) {
+    static const char *const taq_sites[] = {"TCGA", "T\nCGA", "TC\nGA", "TCG\nA"};
+    static const char *const msp_sites[] = {"CCGG", "C\nCGG", "CC\nGG", "CCG\nG"};
+    const char *const *sites;
+    int i;
+
+    if(*cp == 'T' && Taq) {
+        sites = taq_sites;
+    } else if(*cp == 'C') {
+        sites = msp_sites;
+    } else {
+        return NULL;
+    }
+
+    for(i=0; i<4; i++) {
+        if(strncmp(cp, sites[i], strlen(sites[i])) == 0) return sites[i];
+    }
+    return NULL;
+}
+
+/* Cut one chromosome at every recognition site and hand each fragment to
+ * process_fragment().
+ *
+ * The cut falls after the first base of a site: that base ends the current
+ * fragment and the rest of the site (with its line break, if it spans one)
+ * starts the next. chrom must be NUL terminated. Exits the program if the
+ * fragment buffer cannot be allocated. */
+void process_chromosome(FILE *of, char *chrom, int read_size, int Taq) {
+    char *fragment = malloc(MAXChromosome * sizeof(char));
+    char *cp = chrom, *fp = fragment;
+    const char *site;
+    size_t rest;
+
+    if(fragment == NULL) {
+        printf("Could not allocate enough room to hold a fragment!\n");
+        exit(1);
+    }
+
+    while(*cp != '\0') {
+        site = match_cut_site(cp, Taq);
+        if(site != NULL) {
+            //Finish the current fragment with the first base of the site
+            *fp = site[0];
+            *(++fp) = '\0';
+            process_fragment(of,fragment,read_size);
+
+            //Start the next fragment with the remainder of the site
+            rest = strlen(site) - 1;
+            memcpy(fragment, site+1, rest);
+            fp = fragment + rest;
+
+            //Move the chromosome pointer past the cut site and loop
+            cp += rest + 1;
+            continue;
+        }
+
+        //We are not at a cut site
+        *fp = *cp;
+        fp++;
+        cp++;
+    }
+
+    //Don't forget the last fragment!
+    *fp = '\0';
+    process_fragment(of,fragment,read_size);
+
+    free(fragment);
+}
+
+/* Create a reduced representation genome.
+ *
+ * Reads the FASTA file given as first non-option argument one chromosome at
+ * a time, upper-cases it, and passes it to process_chromosome(); header lines
+ * are copied to the output unchanged. Characters other than ACGTN (either
+ * case) and newline are reported and stored as 'N'. Returns 0 on success and
+ * 1 on usage, file, allocation or chromosome-size errors. */
+int main(int argc, char *argv[]) {
+    int Taq = 0;
+    int i = 1;
+    int rv = 1;
+    int read_size = 36; //Set by -n, bp maximum in each read and, therefore number of bases on each end of a fragment to print.
+    char *infile = NULL, *outfile = NULL;
+    FILE *f = NULL, *of = NULL;
+    char *chrom_sequence = NULL, *line = NULL;
+    char *p = NULL, *c;
+
+    if(argc < 3) {
+        usage(argv[0]);
+        return 1;
+    }
+
+    //Parse the input
+    while(i<argc) {
+        if(strcmp(argv[i],"-h") == 0) {
+            usage(argv[0]);
+            return 1;
+        } else if(strcmp(argv[i],"-n") == 0) {
+            if(i+1 >= argc) { //-n was the last argument
+                usage(argv[0]);
+                return 1;
+            }
+            read_size = atoi(argv[i+1]);
+            i++;
+            printf("Changing from default read size to %ibp.\n", read_size);
+        } else if(strcmp(argv[i], "-TaqI") == 0) {
+            Taq = 1;
+            printf("TaqI sites will also be cut.\n");
+        } else {
+            if(infile == NULL) {
+                infile = argv[i];
+            } else if(outfile == NULL) {
+                outfile = argv[i];
+            } else {
+                usage(argv[0]);
+                return 1;
+            }
+        }
+        i++;
+    }
+    if(infile == NULL || outfile == NULL) {
+        usage(argv[0]);
+        return 1;
+    }
+    read_size *= 1.1;
+    printf("The first and last %ibp of each fragment will not be masked\n",read_size);
+
+    //Allocate the buffers and open files for I/O
+    chrom_sequence = malloc(MAXChromosome * sizeof(char));
+    line = malloc(MAXLINE * sizeof(char));
+    if(chrom_sequence == NULL || line == NULL) {
+        printf("Could not allocate enough room to hold a chromosome!\n");
+        goto cleanup;
+    }
+    f = fopen(infile,"r");
+    if(f == NULL) {
+        printf("Could not open %s for reading!\n", infile);
+        goto cleanup;
+    }
+    of = fopen(outfile,"w");
+    if(of == NULL) {
+        printf("Could not open %s for writing!\n", outfile);
+        goto cleanup;
+    }
+    p = chrom_sequence;
+
+    //Read in the chromosomes
+    while(fgets(line, MAXLINE, f) != NULL) {
+        //Have we switched chromosomes?
+        if(*line == '>') {
+            if(chrom_sequence != p) {
+                *p = '\0'; //Ensure that we end in a null
+                process_chromosome(of, chrom_sequence, read_size, Taq);
+            }
+            fprintf(of,"%s",line);
+            p = chrom_sequence;
+        } else {
+            for(c = line; *c != '\0'; c++) {
+                //Always keep room for the terminating null
+                if(p - chrom_sequence >= MAXChromosome - 1) {
+                    printf("A chromosome is longer than %i characters!\n", MAXChromosome - 1);
+                    goto cleanup;
+                }
+                switch(*c) {
+                    case 'T': case 't': *p = 'T'; break;
+                    case 'G': case 'g': *p = 'G'; break;
+                    case 'C': case 'c': *p = 'C'; break;
+                    case 'A': case 'a': *p = 'A'; break;
+                    case 'N': case 'n': *p = 'N'; break;
+                    case '\n': *p = '\n'; break;
+                    default:
+                        printf("Uhoh, found %c\n", *c);
+                        *p = 'N'; //keep the position, but with a defined value
+                        break;
+                }
+                p++;
+            }
+        }
+    }
+
+    //Deal with the last chromosome
+    if(chrom_sequence != p) {
+        *p = '\0'; //Ensure that we end in a null
+        process_chromosome(of, chrom_sequence, read_size, Taq);
+    }
+    rv = 0;
+
+cleanup:
+    free(line);
+    free(chrom_sequence);
+    if(f != NULL) fclose(f);
+    if(of != NULL) fclose(of);
+    return rv;
+}
diff --git a/auxiliary/merge_CpGs.c b/auxiliary/merge_CpGs.c
new file mode 100644
index 0000000..d6c4131
--- /dev/null
+++ b/auxiliary/merge_CpGs.c
@@ -0,0 +1,212 @@
+#include "../bison.h"
+#include "sam.h"
+
+struct CpG { //One methylation call parsed from a bedGraph line
+    int tid; //Index into chromosomes.chromosome[]; main() stores -1 in last_line when nothing is pending
+    int start; //Second bedGraph column
+    int end; //Third bedGraph column
+    unsigned int n_methylated; //Fifth bedGraph column
+    unsigned int n_unmethylated; //Sixth bedGraph column
+};
+
+void usage(char *prog) { //Print the synopsis for prog, then a short description
+    printf("Usage: %s genome_directory file.bedGraph\n", prog);
+    printf("\n"
+           "    Merge strand metrics for individual CpG calls (i.e. if there are separate\n"
+           "    methylation metrics for the C's on the + and - strand of a CpG site, combine\n"
+           "    them).\n"
+           "\n"
+           "    -h            Print this message.\n"
+           "\n");
+}
+
+/* Open "<stem>.merged.bedGraph" for writing, where <stem> is iname without a
+ * trailing .bedGraph/.bedgraph extension (if any). Returns NULL on failure. */
+FILE * generate_output_name(char *iname) {
+    FILE *of = NULL;
+    char *p;
+    //Room for the whole input name, the new suffix and the terminating NUL
+    char *oname = malloc(sizeof(char) * (strlen(iname) + strlen(".merged.bedGraph") + 1));
+    if(oname == NULL) return NULL;
+    strcpy(oname, iname);
+    p = strrchr(oname, '.'); //NULL when the name contains no '.' at all
+    if(p != NULL && (strcmp(p, ".bedGraph") == 0 || strcmp(p, ".bedgraph") == 0)) *p = '\0';
+    strcat(oname, ".merged.bedGraph"); //Not sprintf(oname, "%s...", oname): overlapping copies are undefined
+
+    printf("Output will be written to %s\n", oname);
+    of = fopen(oname, "w");
+    free(oname);
+    return of;
+}
+
+//Given a chromosome name, return the numeric index of its placement in the chromosomes struct, or
+//chromosomes.nchromosomes if it is absent. It would make sense to memoize the result to prevent continuous lookup
+static inline int char2tid(char *chrom) { //static: a plain C99 "inline" provides no external definition to link against
+    int i;
+    for(i=0; i<chromosomes.nchromosomes; i++) {
+        if(strcmp(chromosomes.chromosome[i]->chrom, chrom) == 0) return i;
+    }
+    return chromosomes.nchromosomes; //Not a valid index: one past the last chromosome
+}
+
+/* Parse the columns after the chromosome of a bedGraph line into *current_line.
+ * main() has already taken the chromosome off the line with strtok(), so the
+ * remaining fields are fetched with strtok(NULL, ...) and `line` is not read.
+ * Fields: start, end, 1000*methylation percentage, n_methylated, n_unmethylated.
+ * A missing field is stored as 0 instead of passing NULL to atoi(). "static"
+ * is needed because a plain C99 "inline" provides no external definition. */
+static inline void process_line(char *line, struct CpG *current_line) {
+    char *col;
+    int field, value[5];
+
+    (void) line;
+    for(field=0; field<5; field++) {
+        col = strtok(NULL, "\t");
+        value[field] = (col != NULL) ? atoi(col) : 0;
+    }
+
+    current_line->start = (int32_t) value[0];
+    current_line->end = (int32_t) value[1];
+    //value[2], 1000*methylation percentage, is not stored
+    current_line->n_methylated = (int32_t) value[3];
+    current_line->n_unmethylated = (int32_t) value[4];
+}
+
+//Return nonzero if the reference base under site->start is a C
+static int starts_on_C(struct CpG *site) {
+    unsigned long long offset = chromosomes.chromosome[site->tid]->offset;
+    return toupper((unsigned char) *(chromosomes.genome+offset+site->start)) == 'C';
+}
+
+//Write one bedGraph record. Column 4 is 1000 * the methylated fraction; a site
+//without any counts reports 0 rather than evaluating 0/0.
+static void write_CpG(FILE *of, struct CpG *site) {
+    unsigned int total = site->n_methylated + site->n_unmethylated;
+    int mpercent = 0;
+    if(total > 0) mpercent = (int) (1000 * ((float) site->n_methylated)/(float) total);
+    fprintf(of, "%s\t%i\t%i\t%i\t%i\t%i\n", chromosomes.chromosome[site->tid]->chrom, site->start, \
+        site->end, mpercent, site->n_methylated, site->n_unmethylated);
+}
+
+/* Merge the per-strand calls of each CpG in a bedGraph file (see usage()).
+ * One call is held back in last_line. If the next call starts where it ends
+ * on the same chromosome, and the held call sits on a C, the counts are summed
+ * and one record is written; otherwise the held call is written on its own.
+ * Either way it is first widened by one base: to the right when it sits on a
+ * C, to the left when it does not.
+ * Returns 0 on success or after printing help, 1 for bad arguments or a file
+ * that cannot be opened, and -1 if the genome buffer cannot be allocated. */
+int main(int argc, char *argv[]) {
+    int i, last_tid = 0, adjacent, on_C;
+    char *fname = NULL, line[MAXREAD];
+    char *chrom, *last_chrom = NULL;
+    FILE *of, *ifile;
+    struct CpG current_line, last_line;
+
+    last_line.tid = -1; //This will mean that the last line has been written
+    last_line.start = 0;
+    last_line.end = 0;
+    last_line.n_methylated = 0;
+    last_line.n_unmethylated = 0;
+    config.genome_dir = NULL;
+    chromosomes.nchromosomes = 0;
+
+    /* read in the file names */
+    if(argc < 3) {
+        usage(argv[0]);
+        return 0;
+    }
+    for(i=1; i<argc; i++) {
+        if(strcmp(argv[i], "-h") == 0) {
+            usage(argv[0]);
+            return 0;
+        } else if(config.genome_dir == NULL) {
+            config.genome_dir = argv[i];
+        } else if(fname == NULL) {
+            fname = argv[i];
+        } else {
+            printf("Got an unknown option: %s\n", argv[i]);
+            usage(argv[0]);
+            return 1;
+        }
+    }
+
+    if(config.genome_dir == NULL || fname == NULL) {
+        printf("Genome directory or bedGraph input file not specified!\n");
+        usage(argv[0]);
+        return 1; //Do not carry on with a NULL name
+    }
+
+    //Open both files up front so a bad path fails before the genome is loaded
+    ifile = fopen(fname, "r");
+    if(ifile == NULL) {
+        printf("Could not open %s for reading!\n", fname);
+        return 1;
+    }
+    of = generate_output_name(fname);
+    if(of == NULL) {
+        printf("Could not open the output file for writing!\n");
+        return 1;
+    }
+
+    //Read in the genome
+    chromosomes.max_genome = 3000000000;
+    printf("Allocating space for %llu characters\n", chromosomes.max_genome); fflush(stdout);
+    chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome);
+    if(chromosomes.genome == NULL) {
+        printf("Could not allocate enough room to hold the genome!\n");
+        return -1;
+    }
+    *chromosomes.genome = '\0'; //Only touched once the allocation is known to be good
+    read_genome();
+
+    while(fgets(line, MAXREAD, ifile) != NULL) {
+        chrom = strtok(line, "\t");
+        if(chrom == NULL) continue; //Nothing but delimiters on this line
+        if(last_chrom == NULL || strcmp(chrom, last_chrom) != 0) {
+            last_tid = char2tid(chrom);
+            free(last_chrom);
+            last_chrom = strdup(chrom);
+        }
+        if(last_tid >= chromosomes.nchromosomes) { //char2tid() found no match
+            printf("Skipping a line on %s, which is not in the genome\n", chrom);
+            continue;
+        }
+        current_line.tid = last_tid;
+        process_line(line, &current_line);
+        if(last_line.tid != -1) {
+            //Adjacency must be tested before last_line is widened
+            adjacent = (current_line.tid == last_line.tid && current_line.start == last_line.end);
+            on_C = starts_on_C(&last_line);
+            if(on_C) last_line.end++;
+            else last_line.start--;
+            if(adjacent && on_C) { //Two strands of a single CpG
+                last_line.n_methylated += current_line.n_methylated;
+                last_line.n_unmethylated += current_line.n_unmethylated;
+                write_CpG(of, &last_line);
+                last_line.tid = -1; //Both calls are consumed, nothing is held
+                continue;
+            }
+            write_CpG(of, &last_line);
+        }
+        last_line = current_line;
+    }
+    //Attend to a possible remnant line
+    if(last_line.tid != -1) {
+        if(starts_on_C(&last_line)) last_line.end++;
+        else last_line.start--;
+        write_CpG(of, &last_line);
+    }
+
+    //Close things up
+    free(last_chrom);
+    fclose(of);
+    fclose(ifile);
+    free(chromosomes.genome);
+    for(i=0; i<chromosomes.nchromosomes; i++) {
+        free((chromosomes.chromosome[i])->chrom);
+        free(*(chromosomes.chromosome+i));
+    }
+    free(chromosomes.chromosome);
+    return 0;
+}
diff --git a/auxiliary/merge_bedGraphs.py b/auxiliary/merge_bedGraphs.py
new file mode 100755
index 0000000..356732d
--- /dev/null
+++ b/auxiliary/merge_bedGraphs.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+import argparse
+import csv
+import sys
+
+parser = argparse.ArgumentParser(description='Merge a number of bedGraph files from the bison methylation extractor')
+parser.add_argument('outfile', metavar='outfile', help="Output bedGraph files")
+parser.add_argument('files', metavar='files', nargs='*', help="Input bedGraph files. There must be at least 2.")
+args = parser.parse_args()
+
+if((args.outfile == None) or (args.files == None) or (len(args.files) < 2)) :
+    parser.print_help()
+    sys.exit()
+
+files = []
+for f in args.files :
+    if(f != args.outfile) :
+        files.append(csv.reader(open(f, "r"), dialect="excel-tab"))
+of = open(args.outfile, "w")
+
+lines = []
+for f in files :
+    line = f.next()
+    lines.append([line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])])
+
+n_finished = 0
+n_total = len(files)
+while(n_finished < n_total) :
+    i = 0
+    lowest = 0
+    #Determine the appropriate starting point
+    while(i<n_total) :
+        if(lines[i][0] != None) :
+            if(lines[i][0] < lines[lowest][0]) :
+                lowest = i
+            elif(lines[i][0] == lines[lowest][0] and lines[i][1] < lines[lowest][1]) :
+                lowest = i
+        elif(lowest == i) :
+                lowest += 1
+        i += 1
+
+    current = lines[lowest]
+    if(lines[lowest][0] == None) :
+        print("Oh shit, this shouldn't happen!")
+        print(lowest, lines)
+        break
+
+    i = 0
+    while(i < n_total) :
+        if(i != lowest and lines[i][0] != None) :
+            if(lines[i][0] == current[0] and lines[i][1] == current[1]) :
+                current[3] += lines[i][3]
+                current[4] += lines[i][4]
+                try :
+                    line = files[i].next()
+                    lines[i] = [line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])]
+                except :
+                    lines[i][0] = None
+                    n_finished += 1
+        i += 1
+    frac = round(1000*float(current[3])/float(current[3]+current[4]))
+    of.write("%s\t%i\t%i\t%i\t%i\t%i\n" % (current[0], current[1], current[2], frac, current[3], current[4]))
+
+    #Don't forget to increment the "lowest" one too
+    try :
+        line = files[lowest].next()
+        lines[lowest] = [line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])]
+    except :
+        lines[lowest][0] = None
+        n_finished += 1
+
+of.close()
diff --git a/bison.h b/bison.h
new file mode 100644
index 0000000..3eaebc6
--- /dev/null
+++ b/bison.h
@@ -0,0 +1,665 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <pthread.h>
+#include <ctype.h>
+#include <kstring.h>
+#include <bam.h>
+#include <sam_header.h>
+#include <kseq.h>
+#include <khash.h>
+#include <inttypes.h>
+#include <time.h>
+#include <assert.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define MAXREAD 1024 //Size of the line buffers; get_seq() breaks if a read is longer than this
+#define MASTER 0 //Presumably the MPI rank of the master node -- TODO confirm
+#define VERSION "0.2.4" //Printed by version()
+#define BT2BUF_SZ (256 * 1024) //Parenthesized so that the macro expands safely inside larger expressions
+#define THROTTLE_CHECK_INTERVAL 100000 //When bison_herd auto-throttles, this specifies how frequently it should check whether it should do so (units are "reads")
+#define version() printf("Bison, version %s\n", VERSION)
+
+/******************************************
+*
+* MPI Send/Recv tags:
+*
+* 0: Workers should start
+* 1: Header size (this could be removed)
+* 2: packed header struct
+* 3: Packed fastq struct
+* 4: Unused (used to be packed read size)
+* 5: Packed read
+*
+******************************************/
+
+//NOTE(review): this comment used to describe "mutexes for thread i/o designation", but what follows are file handles and counters -- confirm. These are definitions in a header, so linking relies on common symbols being merged (breaks with -fno-common).
+FILE *zip1;
+FILE *zip2;
+FILE *unmapped1;
+FILE *unmapped2;
+bamFile OUTPUT_BAM;
+unsigned long long t_reads; //total number of reads
+unsigned long long m_reads_OT; //total number mapped to the OT strand
+unsigned long long m_reads_OB; //Presumably as m_reads_OT, for the OB strand
+unsigned long long m_reads_CTOT; //Presumably as m_reads_OT, for the CTOT strand
+unsigned long long m_reads_CTOB; //Presumably as m_reads_OT, for the CTOB strand
+unsigned long long t_CpG; //Total CpGs
+unsigned long long m_CpG; //Methylated CpGs
+unsigned long long t_CHG; //By analogy with t_CpG: total CHGs
+unsigned long long m_CHG; //By analogy with m_CpG: methylated CHGs
+unsigned long long t_CHH; //By analogy with t_CpG: total CHHs
+unsigned long long m_CHH; //By analogy with m_CpG: methylated CHHs
+
+//This is useful for single-node debugging
+#ifdef DEBUG
+int global_debug_taskid;
+bamFile fp1;
+bamFile fp2;
+bamFile fp3;
+bamFile fp4;
+#endif
+
+//Some people may find it useful to have the system throttle itself so as not to overwhelm the MPI buffer
+unsigned long long *nwritten;
+
+//Mutex for controlling access to the global metrics struct and the output files
+pthread_mutex_t metrics_mutex;
+
+typedef struct { //One chromosome of the genome held in chromosomes.genome
+    char *chrom; //Chromosome name
+    unsigned long long offset; //Index of this chromosome's first base within chromosomes.genome
+    unsigned long long length; //Presumably the chromosome length in bases (cf. genome_chrom_length) -- TODO confirm
+} chromosome_struct;
+
+typedef struct { //The genome sequence plus a table describing each chromosome in it
+    int nchromosomes; //Number of entries in chromosome[]
+    unsigned long long max_genome; //The offset value in the read_genome() function will be used to keep track of how close we are
+    chromosome_struct **chromosome; //nchromosomes pointers; each entry and its chrom name are free()d individually
+    char *genome; //This will hold the genomic sequence in memory as a continuous string.
+} chromosomes_struct;
+
+typedef struct { //Settings shared through the global `config`
+    char *FASTQ1; //Input fastq name from which create_fastq_names() derives FASTQ1CT/FASTQ1GA
+    char *FASTQ2; //As FASTQ1, for FASTQ2CT/FASTQ2GA
+    char *FASTQ1CT; //Generated by create_fastq_names(), free()d by quit()
+    char *FASTQ1GA; //Generated by create_fastq_names(), free()d by quit()
+    char *FASTQ2CT; //Generated by create_fastq_names(), free()d by quit()
+    char *FASTQ2GA; //Generated by create_fastq_names(), free()d by quit()
+    char *unmapped1;
+    char *unmapped2;
+    char *genome_dir; //Directory whose .fa and .fasta files read_genome() loads
+    char *basename;
+    char *odir; //Output directory name, constructed by update_odir()
+    char *tmpdir;
+    char *bowtie2_options; //free()d by quit() on every node
+    char *outname;
+    char scoremin_type;
+    int paired;
+    int directional;
+    int nthreads;
+    int nmthreads;
+    int buffer_size;
+    int send_receive_buffer_size;
+    int unmapped;
+    int mode; //0 is --end-to-end (default), 1 is local
+    int quiet; //0 or 1, the latter suppresses all output to the screen
+    int reorder; //0 or 1, latter reorders writing to match input, only meaningful in herd
+    int n_compression_threads; //Default is 0
+#ifndef NOTHROTTLE
+    int reads_in_queue;
+#endif
+    float scoremin_intercept;
+    float scoremin_coef;
+} t_config;
+
+typedef struct { //A BAM header flattened by pack_header()
+    int size; //Size of packed in bytes. There's an effective size limit imposed by MPI of whatever int is
+    void *packed; //Layout: n_targets, target_name[s], target_len[s], l_text, text
+} MPI_Header;
+
+typedef struct { //A BAM read flattened into one block (see pack_read/unpack_read)
+    int size; //Presumably the size of packed in bytes -- TODO confirm
+    void *packed; //the format is sizeof(bam1_t) followed by data, which is of size data_len
+} MPI_read;
+
+typedef struct { //One or two fastq reads flattened into one block (see pack_fastq/unpack_fastq)
+    int size; //Presumably the size of packed in bytes -- TODO confirm
+    void *packed; //format is: char *name1\0seq1\0qual1\0 followed by optional char *name2\0seq2\0qual2\0
+} MPI_Fastq;
+
+typedef struct { //Name, sequence and quality strings for read #1 and, optionally, read #2
+    int max_name1; //current maximum length of memory for name1
+    int max_seq1; //Presumably as max_name1, for seq1
+    int max_qual1; //Presumably as max_name1, for qual1
+    int max_name2; //Presumably as max_name1, for name2
+    int max_seq2; //Presumably as max_name1, for seq2
+    int max_qual2; //Presumably as max_name1, for qual2
+    char *name1;
+    char *seq1;
+    char *qual1;
+    char *name2;
+    char *seq2;
+    char *qual2;
+} fastq;
+
+//This is used as the input struct for slurp_fastq
+typedef struct {
+    int thread_id; //the thread_id
+    char *fastq1; //FIFO from which bowtie2 can get read1
+    char *fastq2; //FIFO from which bowtie2 can get read2 (if it exists)
+} slurp_fastq_struct;
+
+struct packed_struct { //One node of the linked lists of packed reads
+    void *packed; //If NULL, then finished
+    struct packed_struct *next; //Following node in the list
+    struct packed_struct *previous; //Only used on last sentinel struct
+    int state; //0    no next node (not ready)
+               //1    has next node (ready)
+};
+
+//Global values
+t_config config; //Program-wide settings
+bam_header_t *global_header; //NOTE(review): set and used outside this header -- confirm which header this is
+//This will be the global structure for pointers to chromosome_struct's holding the information for *genome
+chromosomes_struct chromosomes;
+char **fnames1, **fnames2; //This will hold the file names so that the writer thread knows what to rename things
+unsigned long long *flengths; //This will hold the size of each file
+
+//Linked-list of reads (presumably each pair is a list's first and last sentinel struct -- TODO confirm)
+struct packed_struct *node1, *node1_last_sentinel;
+struct packed_struct *node2, *node2_last_sentinel;
+struct packed_struct *node3, *node3_last_sentinel;
+struct packed_struct *node4, *node4_last_sentinel;
+//bison-herd
+struct packed_struct **nodes, **last_sentinel_node;
+struct packed_struct **fastq_nodes, **last_fastq_sentinel_node;
+struct packed_struct **to_write_node, **to_write_sentinel_node;
+
+/******************************************************************************
+*
+*   Take a fastq struct and convert it G->A, the conversion is in place
+*
+*   fastq *read, input struct
+*   int which, which of the reads to convert
+*
+*******************************************************************************/
+void convertGA(fastq *, int); //fastq.c
+
+/******************************************************************************
+*
+*   Take a fastq struct and convert it C->T, the conversion is in place
+*
+*   fastq *read, input struct
+*   int which, which of the reads to convert
+*
+*******************************************************************************/
+void convertCT(fastq *, int ); //fastq.c
+
+/******************************************************************************
+*
+*   Write an unmapped read to a gzipped fastq file.
+*
+*   FILE *fp: gzipped fastq file
+*   bam1_t *read: read to write in fastq format
+*
+*******************************************************************************/
+void write_unmapped(FILE *, bam1_t *); //fastq.c
+
+/******************************************************************************
+*
+*   Read in the fastq file(s) sending the reads to the appropriate nodes and
+*   also storing the unconverted reads in a linked list on the master node.
+*
+*   This will act as its own thread on the master node.
+*
+*   void *a is unused but required by pthreads
+*
+*******************************************************************************/
+void * send_store_fastq(void *);
+
+/******************************************************************************
+*
+*   Add an element to the end of a linked-list
+*
+*   struct packed_struct *last: last sentinel struct
+*   void *packed: a packed read
+*
+*******************************************************************************/
+void add_element(struct packed_struct *, void *);
+
+/******************************************************************************
+*
+*   Remove an element from the start of a linked-list
+*   is_ready(first, 0) must return 1!
+*
+*   struct packed_struct *: first sentinel struct
+*
+*******************************************************************************/
+void remove_element(struct packed_struct *); //slurp.c
+//As above, but the packed component can't have been updated to a bam1_t
+void remove_raw_element(struct packed_struct *); //slurp.c
+
+/******************************************************************************
+*
+*   Move an element from one linked-list to another.
+*
+*   struct packed_struct *source: source linked list
+*   struct packed_struct *dest: destination sentinel node
+*
+*******************************************************************************/
+void move_element(struct packed_struct *, struct packed_struct *);
+
+/******************************************************************************
+*
+*   Is the first or second element ready?
+*
+*   struct packed_struct *first: first sentinel struct
+*   int offset: 0 (first element) or 1 (second element)
+*
+*    returns 1 for element ready, or 0 otherwise
+*
+*******************************************************************************/
+int is_ready(struct packed_struct *, int); //slurp.c
+
+/******************************************************************************
+*
+*   Is the linked list finished?
+*
+*   struct packed_struct *: first sentinel struct
+*
+*   returns 1 for finished, 0 otherwise
+*
+*******************************************************************************/
+int is_finished(struct packed_struct *); //slurp.c
+
+/******************************************************************************
+*
+*   Add an element to a node designating that the list is finished
+*
+*   struct packed_struct: last sentinel node
+*
+*******************************************************************************/
+void add_finished(struct packed_struct *); //slurp.c
+
+/******************************************************************************
+*
+*   Initialize a linked list, returning the last sentinel struct
+*
+*   struct packed_struct *: first sentinel struct
+*
+*   returns first sentinel struct
+*
+*******************************************************************************/
+struct packed_struct *initialize_list(struct packed_struct *); //slurp.c
+
+/******************************************************************************
+*
+*   Destroy a linked list of packed_structs
+*
+*   struct packed_struct *first: linked list to destroy
+*
+*******************************************************************************/
+void destroy_list(struct packed_struct *); //slurp.c
+//As above, but for lists where ->packed hasn't been converted to a bam1_t
+void destroy_raw_list(struct packed_struct *); //slurp.c
+
+/******************************************************************************
+*
+*   The MPI receiver thread on the main node
+*
+*   void *: NULL input
+*
+*   returns NULL
+*
+*******************************************************************************/
+void *slurp(void *); //slurp.c
+void *herd_slurp(void *); // herd/slurp.c
+
+/******************************************************************************
+*
+*   Construct the output directory name, putting it in config.odir
+*
+*******************************************************************************/
+void update_odir(); //fastq.c
+
+/******************************************************************************
+*
+*   Given the name of a (possibly gzipped) fastq file, return the file name
+*   with the .fastq.gz, .fq.gz, .fastq, or .fq extension removed.
+*
+*   char *file: filename
+*
+*   CAUTION, THE OUTPUT MUST BE free()d!
+*
+*******************************************************************************/
+char * get_basename(char *); //fastq.c
+
+/******************************************************************************
+*
+*   Invoke the C->T and G->A conversion threads of the fastq files (located in
+*   the global config structure).
+*
+*   FLAGS: integer bit field denoting the conversions to make
+*       0x8 fastq #1 C->T
+*       0x4 fastq #1 G->A
+*       0x2 fastq #2 C->T
+*       0x1 fastq #2 G->A
+*
+*******************************************************************************/
+void convert_fastq(int, unsigned int); //fastq.c
+
+/******************************************************************************
+*
+*   Take the config.FASTQ1 and config.FASTQ2 filenames and use them to generate
+*   the config.FASTQ1CT... filenames. These must subsequently be free()d, which
+*   is done in the quit() function.
+*
+*   char *f1, config.FASTQ1
+*   char *f2, config.FASTQ2
+*   these are only really needed if there's more than one input file
+*
+*******************************************************************************/
+void create_fastq_names(char *, char*); //fastq.c
+
+/******************************************************************************
+*
+*   Read in all .fa and .fasta files within config.genome_dir. The sequences 
+*   are concatenated onto chromosomes.genome. The global chromosomes structure
+*   is modified with each new chromosome.
+*
+*   Note, chromosomes.genome (in fact, all of chromosomes) need to be free()d
+*   This is performed by the quit() function.
+*
+*******************************************************************************/
+void read_genome(); //common.c
+
+/******************************************************************************
+*
+*   Print metrics to STDOUT and a file.
+*
+*******************************************************************************/
+void print_metrics(); //aux.c
+
+/******************************************************************************
+*
+*   Return the number of worker nodes that will actually run.
+*
+*******************************************************************************/
+int effective_nodes(); //aux.c
+
+/******************************************************************************
+*
+*   quit, while performing some cleanup
+*
+*   int FLAG: What to free/close/etc.
+*             0x1 things created by create_fastq_names()
+*             0x2 things pthreads are closed and bam headers destroyed
+*             In addition, the master node will free chromosomes.genome, close
+*             the BAM file, and free everything in the chromosomes struct.
+*             Also, every node will free config.bowtie2_options
+*
+*   int rv: return value
+*
+*******************************************************************************/
+void quit(int, int); //aux.c
+
+/******************************************************************************
+*
+*   Take a BAM header and pack it into a single contiguous memory block. Store
+*   the resulting block and its size in an MPI_Header structure.
+*
+*   THE RESULT MUST BE free()d
+*
+*   bam_header_t *header: The header to store
+*
+*******************************************************************************/
+MPI_Header * pack_header(bam_header_t *); //MPI_packing.c
+
+/******************************************************************************
+*
+*   Unpack a header packed into an initialized bam_header_t
+*
+*   bam_header_t *header: The header to unpack into
+*   void *packed: The packed header
+*
+*******************************************************************************/
+void unpack_header(bam_header_t *, void *); //MPI_packing.c
+
+/******************************************************************************
+*
+*   Take a fastq struct and pack it for shipping
+*
+*   THE RESULT MUST BE free()d eventually
+*
+*   fastq *read: The read(s) to store
+*   MPI_Fastq *output: the struct into which to pack things
+*
+*******************************************************************************/
+MPI_Fastq * pack_fastq(fastq *); //MPI_packing.c
+
+/******************************************************************************
+*
+*   Unpack a packed fastq struct
+*
+*   THE RESULT MUST BE free()d
+*
+*   fastq *read: The fastq struct to unpack into
+*   void *packed: The packed structure
+*
+*******************************************************************************/
+fastq * unpack_fastq(fastq *, void *); //MPI_packing.c
+
+/******************************************************************************
+*
+*   Unpack a packed read into an initialized bam1_t read.
+*
+*   bam1_t *read: The read to unpack into
+*   void *packed: The packed read
+*
+*******************************************************************************/
+bam1_t *unpack_read(bam1_t *, void *); //MPI_packing.c
+
+/******************************************************************************
+*
+*   Take a BAM read and pack it into a single contiguous memory block. Store
+*   the resulting block and its size in an MPI_read structure.
+*
+*   THE RESULT MUST BE free()d
+*
+*   bam1_t *read: The read to store
+*
+*******************************************************************************/
+MPI_read * pack_read(bam1_t *, MPI_read *); //MPI_packing.c
+
+/******************************************************************************
+*
+*   Extract the next sequence line from a file stream.
+*
+*   char *seq: destination
+*   FILE *fp: source
+*
+*   THE OUTPUT MUST BE free()d
+*   This function is affected by the MAXREAD definition, above. If this value is
+*   less than the longest read, things will break. It would be better to realloc
+*   as needed.
+*
+*******************************************************************************/
+void get_seq(char *, FILE *); //genome.c
+
+/******************************************************************************
+*
+*   Reverse complement a sequence (in place)
+*
+*   char *seq: the sequence
+*
+*******************************************************************************/
+void reverse_complement(char *); //common.c
+
+/******************************************************************************
+*
+*   Determine the appropriate offset in chromosomes.genome
+*
+*   char *chrom: Chromosome name
+*   int32_t pos: 0-based position on Chromosome. This is read->core.pos
+*
+*******************************************************************************/
+unsigned long long genome_offset(char*, int32_t); //common.c
+
+/******************************************************************************
+*
+*   Return the length of a given chromosome.
+*
+*   char *chrom: the chromosome of interest
+*
+*******************************************************************************/
+unsigned long long genome_chrom_length(char *); //genome.c
+
+/******************************************************************************
+*
+*   Return a pointer to the chromosome name onto which a read maps.
+*
+*   bam1_t *read: The read in question
+*
+*******************************************************************************/
+char *lookup_chrom(bam1_t *); //common.c
+
+/******************************************************************************
+*
+*   Return a base and another 2 bases on one of its sides. This is needed for
+*   making methylation calls. If this span goes off the edge of a chromosome,
+*   N's will be used.
+*
+*   unsigned long long offset: from genome_offset
+*   unsigned long long position: converted read->core.pos
+*   int change: Direction of the context (- is backwards)
+*   unsigned long long chrom_length: from genome_chrom_length
+*
+*   The output needs to be free()d
+*
+*******************************************************************************/
+char* get_genomic_context(unsigned long long, unsigned long long, int, unsigned long long); //genome.c
+
+/*******************************************************************************
+*
+*  Create a position array to account for any InDels
+*  This function assumes that the first base is not marked as an InDel or
+*  clipped in any way. If that occurs then things will break.
+*
+*  The output needs to be free()d
+*
+*******************************************************************************/
+unsigned long long *calculate_positions(bam1_t *); //common.c
+
+/*******************************************************************************
+*
+*   The master node function.
+*
+*   void *a: Actually an int*, the thread_id
+*
+*******************************************************************************/
+void * master_processer_thread(void*); //master.c
+void * herd_master_processer_thread(void*); //master.c under herd/
+
+/******************************************************************************
+*
+*   Given a set of single-end reads, determine which one, if any, aligns best.
+*   Then, add the various XM/XX/etc. tags and prepare the read for writing. The
+*   final read will always be stored in read1. Return the worker node number
+*   producing the best alignment (or 0).
+*
+*   bam1_t *readN: Unpacked reads from the worker nodes
+*   char *seq: The unconverted fastq read
+*
+*******************************************************************************/
+int process_single(bam1_t *, bam1_t *, bam1_t *, bam1_t *, char *); //master.c
+
+/******************************************************************************
+*
+*   Like process_single, but for paired_end reads. The bam1_t**s hold the
+*   buffered reads. i denotes the read#1 of interest (read #2 is the next read)
+*
+*******************************************************************************/
+int process_paired(bam1_t **, bam1_t **, bam1_t **, bam1_t **, char **); //master.c
+
+/*******************************************************************************
+*
+*   Update a packed read so that it's a proper bam1_t and return a pointer
+*
+*   struct packed_struct *first: first sentinel node
+*   int offset: Return the read from the first (0) or second (1) element
+*
+*   returns a pointer to a bam1_t read
+*
+*******************************************************************************/
+bam1_t *update_read(struct packed_struct *, int); //master.c
+
+/******************************************************************************
+*
+*   This function will run as its own thread and process the linked lists
+*   output from the master processor threads, writing them in order to a BAM
+*   file. This will also write all of the other output (aside from metrics).
+*   Furthermore, this provides a readout of the current number of reads
+*   processed.
+*
+*   Output is NULL, as is the input (needed by pthreads).
+*
+*******************************************************************************/
+void * bam_writer(void *); //writer.c
+
+/******************************************************************************
+*
+*   This receives the reads, converts them, and writes them to the FIFO(s)
+*   
+*   void *a: a pointer to a struct with the following components:
+*
+*   int thread_id: the thread_id
+*   char *fastq1: FIFO from which bowtie2 can get read1
+*   char *fastq2: FIFO from which bowtie2 can get read2 (if it exists)
+*
+*******************************************************************************/
+void * slurp_fastq(void *); //worker.c
+
+/******************************************************************************
+*
+*   The main worker node function.
+*
+*   int thread_id: the thread_id
+*
+*******************************************************************************/
+void worker_node(int); //worker.c
+
+/******************************************************************************
+*
+*   The main worker node function.
+*
+*   int thread_id: the thread_id
+*   char *fastq1: FIFO from which bowtie2 can get read1
+*   char *fastq2: FIFO from which bowtie2 can get read2 (if it exists)
+*
+*******************************************************************************/
+void herd_worker_node(int, char *, char *); //worker.c under herd/
+
+/******************************************************************************
+*
+*   Open a sam file for reading via popen
+*
+*   char *cmd: The command given to popen, the mode is always "r".
+*
+*******************************************************************************/
+tamFile sam_popen(char *); //aux.c
+
+/******************************************************************************
+*
+*   Close a SAM file that was opened with sam_popen
+*
+*   tamFile fp: The file pointer struct returned from sam_popen
+*
+*******************************************************************************/
+void sam_pclose(tamFile fp); //aux.c
diff --git a/common.c b/common.c
new file mode 100644
index 0000000..eea72fd
--- /dev/null
+++ b/common.c
@@ -0,0 +1,174 @@
+#include "bison.h"
+
+/*******************************************************************************
+*
+*  Create a position array to account for any InDels: one entry per base of the
+*  read, holding its reference position or ULLONG_MAX for I/S bases. H and P add
+*  no entries, since hard-clipped bases are absent from the stored sequence.
+*  This function assumes that the first base is not marked as an InDel or
+*  clipped in any way. If that occurs then things will break.
+*
+*  The output needs to be free()d. NULL is returned if malloc() fails.
+*
+*******************************************************************************/
+unsigned long long *calculate_positions(bam1_t *read) {
+    unsigned long long *positions = malloc(sizeof(unsigned long long) * (size_t)read->core.l_qseq);
+    int i, j, offset = 0, op, op_len;
+    uint32_t *CIGAR = bam1_cigar(read);
+    unsigned int previous_position = (unsigned int) read->core.pos;
+
+    if(positions == NULL) return NULL;
+    for(i=0; i<read->core.n_cigar; i++) {
+        op = *(CIGAR+i) & 15;
+        op_len = (*(CIGAR+i)) >> 4;
+        if(op == 2 || op == 3) { //D, N: only the reference position advances
+            previous_position += op_len;
+        } else if(op == 0 || op == 1 || op == 4 || op == 7 || op == 8) { //M, I, S, =, X
+            //I and S get ULLONG_MAX. Never write beyond the l_qseq entries that were allocated
+            for(j=0; j<op_len && offset<read->core.l_qseq; j++, offset++) {
+                *(positions+offset) = (op == 1 || op == 4) ? ULLONG_MAX : previous_position++;
+            }
+        } else if(op != 5) { //H is simply skipped, P (or anything unknown) is reported once
+            printf("We encountered a CIGAR operation that we're not ready to deal with in %s\n", bam1_qname(read));
+        }
+    }
+    return positions;
+}
+
+/******************************************************************************
+*
+*   Read in all .fa and .fasta files within config.genome_dir. The sequences
+*   are concatenated onto chromosomes.genome. The global chromosomes structure
+*   is modified with each new chromosome. config.genome_dir must end in a "/".
+*   A directory or fasta file that can't be opened is fatal (exit(-1)).
+*
+*   Note, chromosomes.genome (in fact, all of chromosomes) need to be free()d
+*   This is performed by the quit() function.
+*
+*******************************************************************************/
+void read_genome() {
+    DIR *dir = opendir(config.genome_dir);
+    FILE *fp;
+    char *p, *line = malloc(sizeof(char)*MAXREAD), *fullpath = NULL;
+    struct dirent *file;
+    unsigned long long offset = 0, length = 0;
+    int end, nchromosomes, i;
+    chromosome_struct *chromosome = NULL;
+
+    if(dir == NULL || line == NULL) {
+        fprintf(stderr, "Unable to read %s or to allocate a line buffer!\n", config.genome_dir);
+        exit(-1);
+    }
+    while((file = readdir(dir)) != NULL) {
+        p = strrchr(file->d_name, '.');
+        if(p == NULL) continue;
+        if(strcmp(p, ".fa") == 0 || strcmp(p, ".fasta") == 0) {
+            //This is a fasta file that we need to read into the genome array and append a chromosome_struct onto chromosomes_struct
+            fullpath = realloc(fullpath, sizeof(char)*(strlen(config.genome_dir)+strlen(file->d_name)+1));
+            sprintf(fullpath, "%s%s",config.genome_dir,file->d_name);
+            fp = fopen(fullpath, "r");
+            if(fp == NULL) {
+                fprintf(stderr, "Unable to open %s for reading!\n", fullpath);
+                exit(-1);
+            }
+            if(!config.quiet) printf("Reading in %s\n", fullpath);
+            fflush(stdout);
+            while(fgets(line, MAXREAD, fp) != NULL) {
+                end=strlen(line);
+                //Only strip a newline that is really there, so that an unterminated last line keeps its final base
+                if(end > 0 && line[end-1] == '\n') line[--end] = '\0';
+                if(line[0] == '>') {
+                    if(chromosome != NULL) chromosome->length = length; //Store the length of the previous contig
+                    //Initialize a new chromosome_struct and lengthen the global chromosomes struct
+                    nchromosomes = ++chromosomes.nchromosomes;
+                    chromosomes.chromosome = realloc(chromosomes.chromosome, sizeof(chromosome_struct*) * nchromosomes);
+                    chromosomes.chromosome[nchromosomes-1] = malloc(sizeof(chromosome_struct));
+                    chromosome = chromosomes.chromosome[nchromosomes-1];
+                    p = strchr(line, ' ');
+                    if(p != NULL) *p = '\0'; //If there's anything after the name, ignore it
+                    chromosome->chrom = malloc(sizeof(char)*strlen(line)); //strlen(line) covers the name without ">" plus its \0
+                    strcpy(chromosome->chrom, (line+1)); //ignore the ">"
+                    length = 0;
+                    chromosome->offset = offset;
+                } else {
+                    //Ensure that chromosomes.genome has space for this whole line and its \0
+                    if(offset + end + 10000 >= chromosomes.max_genome) {
+                        chromosomes.max_genome = offset + end + 100000;
+                        chromosomes.genome = realloc(chromosomes.genome, sizeof(char) * chromosomes.max_genome);
+                    }
+                    for(i=0; i<end; i++) *(line+i) = toupper((unsigned char) *(line+i)); //Make everything upper case
+                    memcpy(chromosomes.genome+offset, line, end+1); //The \0 is overwritten by the next line
+                    offset += end;
+                    length += end;
+                }
+            }
+            if(chromosome != NULL) chromosome->length = length; //Store the last contig's length
+            if(!config.quiet) printf("Finished %s\n", fullpath);
+            fflush(stdout);
+            fclose(fp);
+        }
+    }
+    free(line);
+    free(fullpath);
+    closedir(dir);
+}
+
+//Complement a single base: A/C/G/T of either case become T/G/C/A, anything else N
+static char complement_base(char base) {
+    if(base == 'A' || base == 'a') return 'T';
+    if(base == 'T' || base == 't') return 'A';
+    if(base == 'C' || base == 'c') return 'G';
+    if(base == 'G' || base == 'g') return 'C';
+    return 'N';
+}
+/******************************************************************************
+*
+*   Reverse complement a sequence (in place, without a temporary copy). The
+*   result is always upper case and holds an N for everything but A/C/G/T.
+*
+*   char *seq: the sequence
+*******************************************************************************/
+void reverse_complement(char *seq) {
+    size_t i = 0, j = strlen(seq);
+    while(i < j) { //i == j-1 is the middle base, which is only complemented
+        char first = complement_base(seq[i]);
+        seq[i++] = complement_base(seq[--j]);
+        seq[j] = first;
+    }
+}
+
+/******************************************************************************
+*
+*   Determine the appropriate offset in chromosomes.genome
+*
+*   char *chrom: Chromosome name
+*   int32_t pos: 0-based position on Chromosome. This is read->core.pos
+*
+*   Returns the chromosome's offset plus pos. If chrom is unknown, a message is
+*   printed and 0 is returned (note that 0 is a valid offset as well).
+*
+*******************************************************************************/
+unsigned long long genome_offset(char *chrom, int32_t pos) {
+    int i;
+
+    for(i=0; i<chromosomes.nchromosomes; i++) {
+        if(strcmp(chromosomes.chromosome[i]->chrom, chrom) == 0) {
+            return (unsigned long long) chromosomes.chromosome[i]->offset + pos;
+        }
+    }
+    //No match. Inferring this from a result of 0 used to miss failed lookups with pos == 0
+    printf("Unable to calculate the genomic offset for %s:%i!\n", chrom, (int) pos);
+    return 0;
+}
+
+/******************************************************************************
+*
+*   Return a pointer to the chromosome name onto which a read maps, or NULL if
+*   its tid is not a target in global_header (e.g. -1 for an unmapped read).
+*
+*   bam1_t *read: The read in question
+*******************************************************************************/
+char *lookup_chrom(bam1_t *read) { //not "inline": under C99 rules that alone emits no external definition
+    int32_t tid = read->core.tid;
+    return (tid >= 0 && tid < global_header->n_targets) ? global_header->target_name[tid] : NULL;
+}
diff --git a/fastq.c b/fastq.c
new file mode 100644
index 0000000..e76c96d
--- /dev/null
+++ b/fastq.c
@@ -0,0 +1,385 @@
+#include "bison.h"
+
+//Reverse a malloc()ed quality string in place. The returned pointer is qual
+//itself, which the caller must still free().
+char * reverse_qual(char *qual) {
+    size_t i, j = strlen(qual);
+    char tmp;
+    for(i=0; i+1<j; i++) { tmp = *(qual+i); *(qual+i) = *(qual+(--j)); *(qual+j) = tmp; }
+    return qual;
+}
+
+/******************************************************************************
+*
+*   Write an unmapped read to a gzipped fastq file.
+*
+*   Reads flagged as BAM_FREVERSE are reverse complemented (and their qualities
+*   reversed) first. Base codes other than A/C/G/T are written as N, so that
+*   the sequence always has as many characters as the quality string.
+*
+*   FILE *fp: gzipped fastq file
+*   bam1_t *read: read to write in fastq format
+*
+*******************************************************************************/
+void write_unmapped(FILE *fp, bam1_t *read) {
+    char *seq = calloc(1+read->core.l_qseq, sizeof(char));
+    char *qual = calloc(1+read->core.l_qseq, sizeof(char));
+    uint8_t b, *seqp = bam1_seq(read), *qualp = bam1_qual(read);
+    int i;
+
+    for(i=0; i<read->core.l_qseq; i++) {
+        b = bam1_seqi(seqp, i);
+        if(b == 1) *(seq+i) = 'A';
+        else if(b == 2) *(seq+i) = 'C';
+        else if(b == 4) *(seq+i) = 'G';
+        else if(b == 8) *(seq+i) = 'T';
+        else *(seq+i) = 'N'; //15 (N) and every other code, the calloc()ed \0 would truncate seq
+        *(qual+i) = qualp[i] + 33;
+    }
+    if(read->core.flag & BAM_FREVERSE) {
+        reverse_complement(seq);
+        qual = reverse_qual(qual);
+    }
+
+    fprintf(fp, "@%s\n%s\n+\n%s\n", bam1_qname(read), seq, qual);
+    free(seq);
+    free(qual);
+}
+
+/******************************************************************************
+*
+*   Construct the output directory name, putting it in config.odir. An empty
+*   config.odir is left alone, as it already denotes the current directory.
+*
+*******************************************************************************/
+void update_odir() {
+    char *p, *tmp;
+
+    if(config.odir == NULL) {
+        tmp = strdup(config.FASTQ1);
+        p = strrchr(tmp, '/');
+        if(p != NULL) {
+            *(p+1) = '\0';
+            config.odir = tmp;
+        } else {
+            free(tmp); //There's no directory part and this copy used to be leaked
+            config.odir = strdup("./");
+        }
+    } else if(config.odir[0] != '\0' && config.odir[strlen(config.odir)-1] != '/') {
+        //An empty name would otherwise be indexed at [-1] (and likely become "/")
+        config.odir = realloc(config.odir, (strlen(config.odir)+2) * sizeof(char));
+        strcat(config.odir, "/");
+    }
+}
+
+/******************************************************************************
+*
+*   Strip the directory and the fastq extension from a file name.
+*
+*   A trailing ".gz" is dropped first, then a trailing ".fastq" or ".fq", so
+*   .fastq.gz, .fq.gz, .fastq and .fq all disappear. Finally, everything up to
+*   and including the last "/" is removed.
+*
+*   char *file: the name of a (possibly gzipped) fastq file, it isn't modified
+*
+*   CAUTION, THE OUTPUT MUST BE free()d!
+*
+*******************************************************************************/
+char * get_basename(char *file) {
+    size_t total = strlen(file) + 1; //including the \0
+    char *name = malloc(sizeof(char) * total);
+    char *dot, *slash;
+
+    memcpy(name, file, total);
+
+    //A compression suffix hides the extension that we're really after
+    dot = strrchr(name, '.');
+    if(dot != NULL && strcmp(dot, ".gz") == 0) {
+        *dot = '\0';
+        dot = strrchr(name, '.');
+    }
+    if(dot != NULL && (strcmp(dot, ".fastq") == 0 || strcmp(dot, ".fq") == 0)) {
+        *dot = '\0';
+    }
+
+    //Shift whatever follows the last "/" to the front
+    slash = strrchr(name, '/');
+    if(slash != NULL) memmove(name, slash+1, strlen(slash+1)+1);
+    return name;
+}
+
+/******************************************************************************
+*
+*   These functions are executed via pthreads to convert the fastq sequences.
+*   convert1 C->T converts config.FASTQ1 into config.FASTQ1CT and, unless
+*   config.directional is set, also G->A converts it into config.FASTQ1GA.
+*
+*   void *a: pointer to the maximum number of reads (unsigned int, 0 for all)
+*
+*******************************************************************************/
+void * convert1(void *a) {
+    char *cmd = malloc(sizeof(char) * (strlen(config.FASTQ1) + 7)); //"bzcat " is the longest prefix, plus the \0
+    char *line1 = malloc(MAXREAD*sizeof(char)), *line2 = malloc(MAXREAD*sizeof(char)), *p;
+    FILE *f, *of1, *of2 = NULL;
+    unsigned long long total = 0;
+    unsigned int limit = *((unsigned int *) a);
+    int i, truncated = 0;
+
+    //Determine how we should read in the file (which may lack an extension)
+    p = strrchr(config.FASTQ1, '.');
+    if(p != NULL && (strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0)) sprintf(cmd, "zcat %s", config.FASTQ1);
+    else if(p != NULL && (strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0)) sprintf(cmd, "bzcat %s", config.FASTQ1);
+    else sprintf(cmd, "cat %s", config.FASTQ1);
+    f = popen(cmd, "r"); //NOTE: the file names reach the shell unquoted
+
+    //CT
+    cmd = realloc(cmd,sizeof(char) * (strlen(config.FASTQ1CT) + 8));
+    sprintf(cmd, "gzip > %s", config.FASTQ1CT);
+    of1 = popen(cmd, "w");
+
+    //GA
+    if(!config.directional) {
+        sprintf(cmd, "gzip > %s", config.FASTQ1GA);
+        of2 = popen(cmd, "w");
+    }
+
+    //Iterate through. Lines are read outside of assert(), since NDEBUG would remove the reads too
+    while(1) {
+        //Read name
+        if(fgets(line1, MAXREAD, f) == NULL) break;
+        total++;
+        fputs(line1, of1);
+        if(!config.directional) fputs(line1, of2);
+        //Sequence
+        if((truncated = (fgets(line1, MAXREAD, f) == NULL))) break;
+        if(!config.directional) strcpy(line2, line1);
+        for(i=0; *(line1+i) != '\0'; i++) {
+            if(*(line1+i) == 'C' || *(line1+i) == 'c') *(line1+i) = 'T';
+            if(!config.directional) if(*(line2+i) == 'G' || *(line2+i) == 'g') *(line2+i) = 'A';
+        }
+        fputs(line1, of1);
+        if(!config.directional) fputs(line2, of2);
+        //QUAL header
+        if((truncated = (fgets(line1, MAXREAD, f) == NULL))) break;
+        fputs(line1, of1);
+        if(!config.directional) fputs(line1, of2);
+        //QUAL
+        if((truncated = (fgets(line1, MAXREAD, f) == NULL))) break;
+        fputs(line1, of1);
+        if(!config.directional) fputs(line1, of2);
+        if(limit) if(total >= limit) break;
+    }
+    if(truncated) {
+        fprintf(stderr, "%s ends in the middle of a read!\n", config.FASTQ1);
+        exit(-1);
+    }
+    if(!config.quiet) printf("%s contained %llu reads\n", config.FASTQ1, total);
+    pclose(f);
+    pclose(of1);
+    if(!config.directional) pclose(of2);
+    free(cmd);
+    free(line1);
+    free(line2);
+
+    return NULL;
+}
+//Like convert1, but for config.FASTQ2: it is G->A converted into config.FASTQ2GA
+//and, unless config.directional is set, also C->T converted into config.FASTQ2CT.
+//void *a: pointer to the maximum number of reads (unsigned int, 0 for all)
+void * convert2(void *a) {
+    char *cmd = malloc(sizeof(char) * (strlen(config.FASTQ2) + 7)); //"bzcat " is the longest prefix, plus the \0
+    char *line1 = malloc(MAXREAD*sizeof(char)), *line2 = malloc(MAXREAD*sizeof(char)), *p;
+    FILE *f, *of1, *of2 = NULL;
+    unsigned long long total = 0;
+    unsigned int limit = *((unsigned int *) a);
+    int i, truncated = 0;
+
+    //Determine how we should read in the file (which may lack an extension)
+    p = strrchr(config.FASTQ2, '.');
+    if(p != NULL && (strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0)) sprintf(cmd, "zcat %s", config.FASTQ2);
+    else if(p != NULL && (strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0)) sprintf(cmd, "bzcat %s", config.FASTQ2);
+    else sprintf(cmd, "cat %s", config.FASTQ2);
+    f = popen(cmd, "r"); //NOTE: the file names reach the shell unquoted
+
+    //GA
+    cmd = realloc(cmd, sizeof(char) * (strlen(config.FASTQ2GA) + 8));
+    sprintf(cmd, "gzip > %s", config.FASTQ2GA);
+    of1 = popen(cmd, "w");
+
+    //CT
+    if(!config.directional) {
+        sprintf(cmd, "gzip > %s", config.FASTQ2CT);
+        of2 = popen(cmd, "w");
+    }
+
+    //Iterate through. Lines are read outside of assert(), since NDEBUG would remove the reads too
+    while(1) {
+        //Read name
+        if(fgets(line1, MAXREAD, f) == NULL) break;
+        total++;
+        fputs(line1, of1);
+        if(!config.directional) fputs(line1, of2);
+        //Sequence
+        if((truncated = (fgets(line1, MAXREAD, f) == NULL))) break;
+        if(!config.directional) strcpy(line2, line1);
+        for(i=0; *(line1+i) != '\0'; i++) {
+            if(*(line1+i) == 'G' || *(line1+i) == 'g') *(line1+i) = 'A';
+            if(!config.directional) if(*(line2+i) == 'C' || *(line2+i) == 'c') *(line2+i) = 'T';
+        }
+        fputs(line1, of1);
+        if(!config.directional) fputs(line2, of2);
+        //QUAL header
+        if((truncated = (fgets(line1, MAXREAD, f) == NULL))) break;
+        fputs(line1, of1);
+        if(!config.directional) fputs(line1, of2);
+        //QUAL
+        if((truncated = (fgets(line1, MAXREAD, f) == NULL))) break;
+        fputs(line1, of1);
+        if(!config.directional) fputs(line1, of2);
+        if(limit) if(total >= limit) break;
+    }
+    if(truncated) {
+        fprintf(stderr, "%s ends in the middle of a read!\n", config.FASTQ2);
+        exit(-1);
+    }
+    if(!config.quiet) printf("%s contained %llu reads\n", config.FASTQ2, total);
+    pclose(f);
+    pclose(of1);
+    if(!config.directional) pclose(of2);
+    free(cmd);
+    free(line1);
+    free(line2);
+
+    return NULL;
+}
+
+/******************************************************************************
+*
+*   Run the fastq conversion threads (convert1 and, for paired-end data, also
+*   convert2) on the files named in the global config structure and wait for
+*   them to finish. exit(-1) is called if a thread can't be created.
+*
+*   The threads use config.FASTQ1CT and the like, so those names need to be
+*   set beforehand (see create_fastq_names).
+*
+*   int FLAGS: bit field, it only determines which conversions are announced
+*       0x8 fastq #1 C->T
+*       0x4 fastq #1 G->A
+*       0x2 fastq #2 C->T
+*       0x1 fastq #2 G->A
+*   unsigned int limit: maximum number of reads to convert, its address is
+*                       the argument given to each thread
+*
+*******************************************************************************/
+void convert_fastq(int FLAGS, unsigned int limit) {
+    void *(*workers[2])(void *) = {convert1, convert2};
+    int nthreads = (config.paired) ? 2 : 1;
+    pthread_t *threads = calloc(nthreads, sizeof(pthread_t));
+    int n, rc;
+
+    //Announce the conversions
+    if(!config.quiet) {
+        if(FLAGS & 8) printf("Will C->T convert %s and store the results in %s.\n", config.FASTQ1, config.FASTQ1CT);
+        if(FLAGS & 4) printf("Will G->A convert %s and store the results in %s.\n", config.FASTQ1, config.FASTQ1GA);
+        if(FLAGS & 2) printf("Will C->T convert %s and store the results in %s.\n", config.FASTQ2, config.FASTQ2CT);
+        if(FLAGS & 1) printf("Will G->A convert %s and store the results in %s.\n", config.FASTQ2, config.FASTQ2GA);
+    }
+
+    //Start all of the threads...
+    for(n=0; n<nthreads; n++) {
+        rc = pthread_create(threads+n, NULL, workers[n], (void *) &limit);
+        if(rc) {
+            printf("An error occured with invoking pthread_create; %d\n", rc);
+            exit(-1);
+        }
+    }
+
+    //...and only then wait for them, first to last
+    for(n=0; n<nthreads; n++) {
+        pthread_join(threads[n], NULL);
+    }
+
+    free(threads);
+}
+
+/******************************************************************************
+*
+*   Cut the extension off of a fastq file name (in place): a final .gz, .bz,
+*   .bz2, .fastq or .fq is removed and then, if it is now last, .fastq or .fq.
+*
+*   char *name: the file name to shorten
+*
+*******************************************************************************/
+static void trim_fastq_extension(char *name) {
+    char *dot = strrchr(name, '.');
+
+    if(dot == NULL) return;
+    if(strcmp(dot, ".gz") != 0 &&
+       strcmp(dot, ".bz") != 0 &&
+       strcmp(dot, ".bz2") != 0 &&
+       strcmp(dot, ".fastq") != 0 &&
+       strcmp(dot, ".fq") != 0) return;
+    *dot = '\0';
+
+    //A second extension is only removed if it's .fastq or .fq
+    dot = strrchr(name, '.');
+    if(dot == NULL) return;
+    if(strcmp(dot, ".fastq") == 0 || strcmp(dot, ".fq") == 0) *dot = '\0';
+}
+
+/******************************************************************************
+*
+*   Create the converted and unmapped file names belonging to one fastq file.
+*   The converted names keep the path of the input, the unmapped name is put
+*   in config.odir if that's set.
+*
+*   char *input: the fastq file name
+*   char **CT: receives the malloc()ed name of the C->T converted file
+*   char **GA: receives the malloc()ed name of the G->A converted file
+*   char **unmapped: receives the malloc()ed name of the unmapped reads file
+*
+*******************************************************************************/
+static void name_outputs(char *input, char **CT, char **GA, char **unmapped) {
+    char *base = malloc(sizeof(char) * (strlen(input) + 20));
+    char *leaf;
+
+    strcpy(base, input);
+    trim_fastq_extension(base);
+
+    *CT = malloc(sizeof(char) * (strlen(base) + 10));
+    *GA = malloc(sizeof(char) * (strlen(base) + 10));
+    if(config.odir != NULL) {
+        //Only the file's own name is appended to the output directory
+        leaf = strrchr(base, '/');
+        leaf = (leaf != NULL) ? leaf+1 : base;
+        *unmapped = malloc(sizeof(char) * (strlen(config.odir) + strlen(leaf) + strlen(".unmapped.fq.gz") + 1));
+        sprintf(*unmapped, "%s%s.unmapped.fq.gz", config.odir, leaf);
+    } else {
+        *unmapped = malloc(sizeof(char) * (strlen(base) + strlen(".unmapped.fq.gz") + 1));
+        sprintf(*unmapped, "%s.unmapped.fq.gz", base);
+    }
+    sprintf(*CT, "%s.CT.fq.gz", base);
+    sprintf(*GA, "%s.GA.fq.gz", base);
+
+    free(base);
+}
+
+/******************************************************************************
+*
+*   Take the config.FASTQ1 and config.FASTQ2 filenames and use them to generate
+*   the config.FASTQ1CT... filenames. These must subsequently be free()d, which
+*   is done in the quit() function.
+*
+*   char *f1: name of the first fastq file
+*   char *f2: name of the second fastq file, only used if config.paired is set
+*
+*******************************************************************************/
+void create_fastq_names(char *f1, char *f2) {
+    //Read #1
+    name_outputs(f1, &config.FASTQ1CT, &config.FASTQ1GA, &config.unmapped1);
+
+    //Read #2, if there is one
+    if(config.paired) {
+        name_outputs(f2, &config.FASTQ2CT, &config.FASTQ2GA, &config.unmapped2);
+    }
+}
diff --git a/genome.c b/genome.c
new file mode 100644
index 0000000..3a0e37b
--- /dev/null
+++ b/genome.c
@@ -0,0 +1,81 @@
+#include "bison.h"
+
+/******************************************************************************
+*
+*   Extract the next sequence line from a file stream: 4 lines are read and
+*   the second one is copied to seq without its \n. The program exits if they
+*   can't all be read (the reads are outside of assert(), which NDEBUG removes).
+*
+*   char *seq: destination (caller allocated). Lines over MAXREAD break this.
+*   FILE *fp: source
+*******************************************************************************/
+void get_seq(char *seq, FILE *fp) {
+    char *line = malloc(MAXREAD*sizeof(char));
+    int i;
+    for(i=0; i<4; i++) { //the second of these 4 lines is the sequence
+        if(fgets(line, MAXREAD, fp) == NULL) {
+            fprintf(stderr, "get_seq() couldn't read a complete record!\n");
+            exit(-1);
+        }
+        if(i != 1) continue;
+        line[strcspn(line, "\n")] = '\0'; //remove the \n, if there is one
+        strcpy(seq, line);
+    }
+    free(line);
+}
+
+/******************************************************************************
+*
+*   Look up how long a chromosome is.
+*
+*   char *chrom: name of the chromosome of interest
+*
+*   Returns the length stored in the global chromosomes structure for the
+*   first chromosome with this exact name, or 0 if there's no such name.
+*
+*******************************************************************************/
+unsigned long long genome_chrom_length(char *chrom) {
+    int idx = 0;
+
+    while(idx < chromosomes.nchromosomes) {
+        if(strcmp(chromosomes.chromosome[idx]->chrom, chrom) == 0) return chromosomes.chromosome[idx]->length;
+        idx++;
+    }
+    return 0;
+}
+
+/******************************************************************************
+*
+*   Return a base and another 2 bases on one of its sides. This is needed for
+*   making methylation calls. If this span goes off either edge of a
+*   chromosome, N's will be used. THE OUTPUT MUST BE free()d
+*
+*   unsigned long long offset: from genome_offset
+*   unsigned long long position: converted read->core.pos
+*   int change: Direction of the context (- is backwards)
+*   unsigned long long chrom_length: from genome_chrom_length
+*
+*******************************************************************************/
+char * get_genomic_context(unsigned long long offset, unsigned long long position, int change, unsigned long long chrom_length) {
+    int i;
+    char *output = calloc(4, sizeof(char));
+
+    if(change > 0) {
+        for(i=0; i<3; i++) {
+            if(position+i < chrom_length) {
+                *(output+i) = toupper(*(chromosomes.genome+offset+position+i));
+            } else {
+                *(output+i) = 'N';
+            }
+        }
+    } else {
+        for(i=0; i<3; i++) {
+            if(position+i >= 2) { //position-2+i is unsigned, so testing it against 0 was always true
+                *(output+i) = toupper(*(chromosomes.genome+offset+position-2+i));
+            } else {
+                *(output+i) = 'N';
+            }
+        }
+    }
+    return output;
+}
diff --git a/herd/MPI_packing.c b/herd/MPI_packing.c
new file mode 100644
index 0000000..b40181c
--- /dev/null
+++ b/herd/MPI_packing.c
@@ -0,0 +1,147 @@
+#include "../bison.h"
+
+/******************************************************************************
+*
+*   Copy a string, together with its terminating \0, into a buffer.
+*   Nothing checks that the buffer is big enough, that's up to the caller.
+*
+*   char *dest: where the copy should start
+*   char *src: the string to copy
+*
+*   Returns the position in the buffer directly after the copied \0.
+*
+*******************************************************************************/
+static char * pack_string(char *dest, char *src) {
+    size_t len = strlen(src);
+
+    memcpy((void *) dest, (void *) src, sizeof(char) * len); //the characters
+    *(dest+len) = '\0'; //the terminator
+    return dest + len + 1;
+}
+
+/******************************************************************************
+*
+*   Take a fastq struct and pack it for shipping
+*
+*   Layout of the packed block, in which every string keeps its terminating \0
+*   and the part in brackets is only present if config.paired is set:
+*
+*       name1 seq1 qual1 [name2 seq2 qual2]
+*
+*   unpack_fastq does the reverse, given the packed member (and the same
+*   config.paired).
+*
+*   THE RESULT MUST BE free()d eventually. Both the returned struct and its
+*   packed member are malloc()ed here.
+*
+*   fastq *read: The read(s) to store, name2, seq2 and qual2 are only read if
+*                config.paired is set
+*
+*   Returns a new MPI_Fastq, of which the members are set as follows
+*       size: the number of bytes in packed
+*       packed: the block described above
+*
+*******************************************************************************/
+MPI_Fastq * pack_fastq(fastq *read) {
+    MPI_Fastq *output = malloc(sizeof(MPI_Fastq));
+    size_t bytes1;
+    size_t bytes2 = 0; //stays 0 for single-end reads
+    char *cursor;
+
+    //Calculate the size needed per read, the 3 is for the \0 of each string
+    bytes1 = sizeof(char) * (strlen(read->name1) + strlen(read->seq1) + strlen(read->qual1) + 3);
+    if(config.paired) {
+        bytes2 = sizeof(char) * (strlen(read->name2) + strlen(read->seq2) + strlen(read->qual2) + 3);
+    }
+    output->size = bytes1 + bytes2;
+    output->packed = malloc(bytes1 + bytes2);
+
+    //Set everything, starting at the beginning of the block
+    cursor = (char *) output->packed;
+
+    //read1
+    cursor = pack_string(cursor, read->name1);
+    cursor = pack_string(cursor, read->seq1);
+    cursor = pack_string(cursor, read->qual1);
+
+    //read2
+    if(config.paired) {
+        cursor = pack_string(cursor, read->name2);
+        cursor = pack_string(cursor, read->seq2);
+        cursor = pack_string(cursor, read->qual2);
+    }
+
+    return output;
+}
+
+/******************************************************************************
+*
+*   Unpack a packed fastq struct, i.e., the name, seq and qual of read1,
+*   followed by those of read2 if config.paired is set. Every one of these
+*   strings is expected to end in a \0.
+*
+*   The strings in read are enlarged with realloc() where needed, so they (and
+*   thus THE RESULT) MUST BE free()d
+*
+*   fastq *read: The fastq struct to unpack into, this is also what's returned
+*   void *packed: The packed structure
+*
+*******************************************************************************/
+fastq * unpack_fastq(fastq *read, void *packed) {
+    char *cursor = (char *) packed;
+    size_t need;
+
+    //Read1
+    need = strlen(cursor) + 1; //name
+    if(read->max_name1 < need) {
+        read->max_name1 = need;
+        read->name1 = realloc((void *) read->name1, sizeof(char) * need);
+    }
+    memcpy(read->name1, cursor, need);
+    cursor += need;
+
+    need = strlen(cursor) + 1; //seq
+    if(read->max_seq1 < need) {
+        read->max_seq1 = need;
+        read->seq1 = realloc((void *) read->seq1, sizeof(char) * need);
+    }
+    memcpy(read->seq1, cursor, need);
+    cursor += need;
+
+    need = strlen(cursor) + 1; //qual
+    if(read->max_qual1 < need) {
+        read->max_qual1 = need;
+        read->qual1 = realloc((void *) read->qual1, sizeof(char) * need);
+    }
+    memcpy(read->qual1, cursor, need);
+    cursor += need;
+
+    //Read2
+    if(config.paired) {
+        need = strlen(cursor) + 1; //name
+        if(read->max_name2 < need) {
+            read->max_name2 = need;
+            read->name2 = realloc((void *) read->name2, sizeof(char) * need);
+        }
+        memcpy(read->name2, cursor, need);
+        cursor += need;
+
+        need = strlen(cursor) + 1; //seq
+        if(read->max_seq2 < need) {
+            read->max_seq2 = need;
+            read->seq2 = realloc((void *) read->seq2, sizeof(char) * need);
+        }
+        memcpy(read->seq2, cursor, need);
+        cursor += need;
+
+        need = strlen(cursor) + 1; //qual
+        if(read->max_qual2 < need) {
+            read->max_qual2 = need;
+            read->qual2 = realloc((void *) read->qual2, sizeof(char) * need);
+        }
+        memcpy(read->qual2, cursor, need);
+        cursor += need;
+    }
+
+    return read;
+}
diff --git a/herd/fastq.c b/herd/fastq.c
new file mode 100644
index 0000000..5191a0d
--- /dev/null
+++ b/herd/fastq.c
@@ -0,0 +1,477 @@
+#include "../bison.h"
+#include <bzlib.h>
+#include <zlib.h>
+#include <wordexp.h>
+
+//Per-file input state: the open handle plus a scratch buffer used while reading lines
+struct local_buffer {
+    char *buf; //Scratch space of BT2BUF_SZ chars, allocated in send_store_fastq()
+    unsigned long pos; //Reset to 0 whenever a new file is opened
+    int finished; //0 no, 1 yes (set by read_line() once a read hits EOF/error)
+    int type; //0: txt, 1: gz, 2: bz2 (streamed through fptxt from a "bzcat" pipe)
+    union {
+        FILE *fptxt; //type 0 (fopen) and type 2 (popen)
+        gzFile fpgz; //type 1 (gzopen)
+        BZFILE *fpbz2; //Never referenced by the code in this file
+    } x;
+};
+
+/******************************************************************************
+*
+*   Take a fastq struct and convert it C->T, the conversion is in place. The
+*   scan stops at the first newline or, failing that, at the end of the string.
+*
+*   fastq *read, input struct
+*   int which, which of the reads to convert (0: read #1, otherwise read #2)
+*
+*******************************************************************************/
+void convertCT(fastq *read, int which) {
+    char *p = (which == 0) ? read->seq1 : read->seq2;
+
+    //Sequences read by read_line() normally end in a newline, but the last
+    //line of a truncated file won't. Checking for the NUL as well keeps us
+    //from running past the end of the buffer in that case.
+    while(*p != '\0' && *p != '\n') {
+        if(*p == 'C' || *p == 'c') *p = 'T';
+        p++;
+    }
+}
+
+/******************************************************************************
+*
+*   Take a fastq struct and convert it G->A, the conversion is in place. The
+*   scan stops at the first newline or, failing that, at the end of the string.
+*
+*   fastq *read, input struct
+*   int which, which of the reads to convert (0: read #1, otherwise read #2)
+*
+*******************************************************************************/
+void convertGA(fastq *read, int which) {
+    char *p = (which == 0) ? read->seq1 : read->seq2;
+
+    //Sequences read by read_line() normally end in a newline, but the last
+    //line of a truncated file won't. Checking for the NUL as well keeps us
+    //from running past the end of the buffer in that case.
+    while(*p != '\0' && *p != '\n') {
+        if(*p == 'G' || *p == 'g') *p = 'A';
+        p++;
+    }
+}
+
+//Fetch the next piece (at most n-1 characters) of the current line from
+//whichever handle is open. gzipped input goes through zlib, while plain text
+//and bzip2 input (which arrives through a "bzcat" pipe) both use stdio.
+static char * read_line_chunk(struct local_buffer *fp, char *dst, int n) {
+    if(fp->type == 1) return gzgets(fp->x.fpgz, dst, n);
+    return fgets(dst, n, fp->x.fptxt);
+}
+
+//Returns 1 if s ends in a newline and 0 otherwise. An empty string counts as
+//not ending in one, which also keeps us from ever looking at s[-1].
+static int ends_in_newline(const char *s) {
+    size_t len = strlen(s);
+    return (len > 0 && s[len-1] == '\n');
+}
+
+/******************************************************************************
+*
+*   Read a full line into a buffer, increasing its size as needed and returning
+*   the (possibly moved) buffer.
+*
+*   struct local_buffer *fp, input stream (plain text, gzip or a bzcat pipe)
+*   char *cur_buf, the buffer to expand and insert into (unused if ignore==1)
+*   int *size, current maximum malloc()ed size of cur_buf. May be NULL when
+*              ignore==1, since there is then no buffer whose size to track.
+*   int ignore, if 1, read in the line until the end but don't store it
+*
+*   *size is updated on success and set to -1 on error or EOF. A final line
+*   lacking a newline is returned as is and the following call reports EOF.
+*
+*******************************************************************************/
+char * read_line(struct local_buffer *fp, char *cur_buf, int *size, int ignore) {
+    char *grown = NULL;
+
+    //Only types 0-2 (the ones set in send_store_fastq()) can be read from,
+    //anything else leaves the buffer untouched
+    if(fp->type < 0 || fp->type > 2) return cur_buf;
+
+    if(fp->finished == 1) {
+        //We hit the end of the file in the last go around
+        if(size != NULL) *size = -1;
+        return cur_buf;
+    }
+
+    if(ignore) {
+        //Consume the line piece by piece, discarding each of them. There's no
+        //buffer to track here, so callers are allowed to pass a NULL size.
+        if(read_line_chunk(fp, fp->buf, BT2BUF_SZ) == NULL) {
+            fp->finished = 1;
+            if(size != NULL) *size = -1;
+            return cur_buf;
+        }
+        while(!ends_in_newline(fp->buf)) {
+            if(read_line_chunk(fp, fp->buf, BT2BUF_SZ) == NULL) {
+                fp->finished = 1; //Broken input
+                break;
+            }
+        }
+        return cur_buf;
+    }
+
+    //Start with whatever fits into the buffer as it currently is. A size of
+    //-1 tells the caller that nothing at all could be read.
+    if(read_line_chunk(fp, cur_buf, *size) == NULL) {
+        fp->finished = 1;
+        *size = -1;
+        return cur_buf;
+    }
+    while(!ends_in_newline(cur_buf)) {
+        //The line continues, so make room for one more piece and append it
+        grown = realloc(cur_buf, sizeof(char) * (*size + BT2BUF_SZ));
+        if(grown == NULL) {
+            //cur_buf is still valid, return what we have and stop reading
+            printf("Out of memory while reading a line!\n");
+            fp->finished = 1;
+            break;
+        }
+        cur_buf = grown;
+        *size += BT2BUF_SZ;
+        if(read_line_chunk(fp, fp->buf, BT2BUF_SZ) == NULL) {
+            fp->finished = 1; //Broken input
+            break;
+        }
+        //cur_buf held fewer than the old *size characters, so the at most
+        //BT2BUF_SZ-1 new ones (plus the NUL) are guaranteed to fit
+        strcat(cur_buf, fp->buf);
+    }
+
+    return cur_buf;
+}
+
+/******************************************************************************
+*
+*   Read in an actual fastq read into a fastq struct, resizing as needed
+*
+*   struct local_buffer *fp, input stream
+*   fastq *read, input struct
+*   int which, 0 for read1 and 1 for read2
+*
+*   returns 0 on success and -1 on EOF, error or a record that's cut short
+*
+*******************************************************************************/
+int read_fastq(struct local_buffer *fp, fastq *read, int which) {
+    int *max_name = NULL, *max_seq = NULL, *max_qual = NULL;
+    int orig_maxname, orig_maxseq, orig_maxqual;
+    int plus_size = 0; //read_line() stores -1 here if the "+" line is missing
+    int rv = 0;
+    char **name = NULL, **seq = NULL, **qual = NULL;
+
+    //Point everything to the correct read
+    if(which == 0) { //read1
+        name = &(read->name1);
+        seq = &(read->seq1);
+        qual = &(read->qual1);
+        max_name = &(read->max_name1);
+        max_seq = &(read->max_seq1);
+        max_qual = &(read->max_qual1);
+    } else {
+        name = &(read->name2);
+        seq = &(read->seq2);
+        qual = &(read->qual2);
+        max_name = &(read->max_name2);
+        max_seq = &(read->max_seq2);
+        max_qual = &(read->max_qual2);
+    }
+    orig_maxname = *max_name;
+    orig_maxseq = *max_seq;
+    orig_maxqual = *max_qual;
+
+    //name
+    *name = read_line(fp, *name, max_name, 0);
+    if(*max_name == -1) {
+        *max_name = orig_maxname;
+        return -1;
+    }
+
+    //Seq
+    *seq = read_line(fp, *seq, max_seq, 0);
+    //+ (read_line() needs somewhere to report EOF, even for an ignored line)
+    read_line(fp, NULL, &plus_size, 1);
+    //Qual
+    *qual = read_line(fp, *qual, max_qual, 0);
+
+    //A size of -1 means that nothing was read, so the buffer still has its
+    //original size. Restore that and flag the record as incomplete.
+    if(*max_seq == -1) { *max_seq = orig_maxseq; rv = -1; }
+    if(*max_qual == -1) { *max_qual = orig_maxqual; rv = -1; }
+    if(plus_size == -1) rv = -1;
+
+    return rv;
+}
+
+/******************************************************************************
+*
+*   Read in the fastq file(s) sending the reads to the appropriate nodes and
+*   also storing the unconverted reads in a linked list on the master node.
+*
+*   This will act as its own thread on the master node.
+*
+*   void *a points to an unsigned long: max reads to use per file (0 = all)
+*
+*******************************************************************************/
+void * send_store_fastq(void *a) {
+    struct local_buffer *f1 = NULL, *f2 = NULL;
+    int i=0, nnodes = effective_nodes(), status;
+    int nnode_groups = nnodes/((config.directional) ? 2 : 4);
+    int j, max_j = 4, multiplier = 4;
+    int current_file = 0;
+    unsigned long upto = *((unsigned long *) a);
+    unsigned long total = 0;
+    char *cmd = NULL;
+    char *p, *fname1 = NULL, *fname2 = NULL, *save_ptr1=NULL, *save_ptr2=NULL;
+    char *finished_signal = NULL;
+    fastq *read = malloc(sizeof(fastq));
+    MPI_Fastq *packed = NULL;
+    int rv1 = 0, rv2 = 0, wordexp_offset=0;
+    wordexp_t fnames1_wordexp, fnames2_wordexp;
+    void *A = malloc(1); //Payload of the 1-byte "finished" message sent at the end
+#ifdef DEBUG
+    int taskid = global_debug_taskid;
+#endif
+    f1 = calloc(1, sizeof(struct local_buffer));
+    f1->buf = malloc(BT2BUF_SZ*sizeof(char));
+    f1->buf[0] = '\0'; //Just so that we know that we're at the start of a buffer
+    if(config.paired) {
+        f2 = calloc(1, sizeof(struct local_buffer));
+        f2->buf = malloc(BT2BUF_SZ*sizeof(char));
+        f2->buf[0] = '\0'; //Just so that we know that we're at the start of a buffer
+    }
+
+    //Initialize the read struct
+    read->max_name1 = 10;
+    read->max_seq1 = 10;
+    read->max_qual1 = 10;
+    read->max_name2 = 10;
+    read->max_seq2 = 10;
+    read->max_qual2 = 10;
+    read->name1 = malloc(sizeof(char)*10);
+    read->seq1 = malloc(sizeof(char)*10);
+    read->qual1 = malloc(sizeof(char)*10);
+    read->name2 = malloc(sizeof(char)*10);
+    read->seq2 = malloc(sizeof(char)*10);
+    read->qual2 = malloc(sizeof(char)*10);
+
+    //These will be used later
+    if(config.directional) {
+        max_j = 2;
+        multiplier = 2;
+    }
+
+    fname1 = strtok_r(config.FASTQ1,",", &save_ptr1);
+    rv1 = wordexp(fname1, &fnames1_wordexp, WRDE_SHOWERR | WRDE_UNDEF);
+    if(rv1 == 0) fnames1[current_file] = strdup(fnames1_wordexp.we_wordv[wordexp_offset]); //This will need to be free()d
+    if(config.paired) {
+        fname2 = strtok_r(config.FASTQ2,",", &save_ptr2);
+        rv2 = wordexp(fname2, &fnames2_wordexp, WRDE_SHOWERR | WRDE_UNDEF);
+        if(rv2 == 0) fnames2[current_file] = strdup(fnames2_wordexp.we_wordv[wordexp_offset]);
+    }
+    while(fname1 != NULL) {
+        if(rv1 != 0 || rv2 != 0) { //The word lists are only touched once wordexp() has succeeded
+            printf("An error ocurred when trying to expand the first filename.\n");
+            if(rv1 == WRDE_BADCHAR) {
+                printf("%s contains an illegal character\n", fname1);
+            } else if(rv1 == WRDE_BADVAL) {
+                printf("%s contains an undefined shell variable\n", fname1);
+            } else if(rv1 == WRDE_NOSPACE) {
+                printf("Out of memory when processing %s\n", fname1);
+            } else if(rv1 == WRDE_SYNTAX) {
+                printf("%s had a syntax error\n", fname1);
+            }
+            if(config.paired) {
+                if(rv2 == WRDE_BADCHAR) {
+                    printf("%s contains an illegal character\n", fname2);
+                } else if(rv2 == WRDE_BADVAL) {
+                    printf("%s contains an undefined shell variable\n", fname2);
+                } else if(rv2 == WRDE_NOSPACE) {
+                    printf("Out of memory when processing %s\n", fname2);
+                } else if(rv2 == WRDE_SYNTAX) {
+                    printf("%s had a syntax error\n", fname2);
+                }
+            }
+            goto finish; //Yeah yeah, an evil "goto"
+        }
+        //Determine how we should read in the file(s), a name without any '.' is treated as plain text
+        p = strrchr(fnames1_wordexp.we_wordv[wordexp_offset], '.');
+        if(p != NULL && (strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0)) {
+            f1->type = 1;
+            f1->x.fpgz = gzopen(fnames1_wordexp.we_wordv[wordexp_offset], "rb");
+        } else if(p != NULL && (strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0)) {
+            f1->type = 2;
+            cmd = realloc(cmd, sizeof(char) * (strlen(fnames1_wordexp.we_wordv[wordexp_offset]) + strlen("bzcat  ")));
+            sprintf(cmd, "bzcat %s", fnames1_wordexp.we_wordv[wordexp_offset]);
+            f1->x.fptxt = popen(cmd, "r");
+        } else {
+            f1->type = 0;
+            f1->x.fptxt = fopen(fnames1_wordexp.we_wordv[wordexp_offset], "r");
+        }
+        f1->finished = 0;
+        f1->pos = 0;
+        f1->buf[0] = '\0'; //Just so that we know that we're at the start of a buffer
+        if(config.paired) {
+            p = strrchr(fnames2_wordexp.we_wordv[wordexp_offset], '.');
+            if(p != NULL && (strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0)) {
+                f2->type = 1;
+                f2->x.fpgz = gzopen(fnames2_wordexp.we_wordv[wordexp_offset], "rb");
+            } else if(p != NULL && (strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0)) {
+                f2->type = 2;
+                cmd = realloc(cmd, sizeof(char) * (strlen(fnames2_wordexp.we_wordv[wordexp_offset]) + strlen("bzcat  ")));
+                sprintf(cmd, "bzcat %s", fnames2_wordexp.we_wordv[wordexp_offset]);
+                f2->x.fptxt = popen(cmd, "r");
+            } else {
+                f2->type = 0;
+                f2->x.fptxt = fopen(fnames2_wordexp.we_wordv[wordexp_offset], "r");
+            }
+            f2->finished = 0;
+            f2->pos = 0;
+            f2->buf[0] = '\0'; //Just so that we know that we're at the start of a buffer
+        }
+
+        //read everything in
+        total = 0;
+        while(1) {
+            if(upto) {
+                if(total >= upto) break;
+            }
+
+            if(read_fastq(f1, read, 0) == -1) break;
+            if(config.paired) read_fastq(f2, read, 1);
+
+            //Pack the struct
+            packed = pack_fastq(read);
+
+            //Store this in the linked-list
+#ifdef DEBUG
+            if(global_debug_taskid == MASTER) {
+#endif
+            add_element(last_fastq_sentinel_node[i], packed->packed);
+#ifdef DEBUG
+            }
+#endif
+
+            //Send it to the appropriate nodes
+            for(j=1; j<=max_j; j++) {
+#ifdef DEBUG
+                if(global_debug_taskid != MASTER) {
+                    if(j+multiplier*i == taskid) {
+                        status = MPI_Send((void *) packed->packed, packed->size, MPI_BYTE, 0, 3, MPI_COMM_WORLD);
+                        if(status != MPI_SUCCESS) {
+                            printf("MPI_Send returned %i\n", status);
+                            fflush(stdout);
+                        }
+                    }
+                }
+#else
+                //Send to j+multiplier*i
+                status = MPI_Send((void *) packed->packed, packed->size, MPI_BYTE, j+multiplier*i, 3, MPI_COMM_WORLD);
+                if(status != MPI_SUCCESS) {
+                    printf("MPI_Send returned %i\n", status);
+                    fflush(stdout);
+                }
+#endif
+            }
+            i++;
+            if(i >= nnode_groups) i=0; //Node groups are served round-robin
+
+            //Free packed (packed->packed is in the linked list!)
+#ifdef DEBUG
+            if(global_debug_taskid != MASTER) free(packed->packed);
+#endif
+            free(packed);
+            total++;
+
+#ifndef NOTHROTTLE
+            if(config.reads_in_queue > 0) {
+                if(total % THROTTLE_CHECK_INTERVAL == 0) {
+                    while(total - nwritten[current_file] > config.reads_in_queue) sleep(1);
+                }
+            }
+#endif
+        }
+        flengths[current_file] = total; //Otherwise, the writer thread will keep waiting
+        //Notify the master_processor_threads that they need to update the methylation metrics
+        for(j=0; j<nnode_groups; j++) { //This is actually excessive, but we otherwise need to
+            finished_signal = malloc(2*sizeof(char)); //We need to malloc() this or it won't be properly free()d after being added to the linked-list.
+            sprintf(finished_signal, "\2");
+            add_element(last_fastq_sentinel_node[j], (void *) finished_signal);
+        }
+        if(!config.quiet) printf("finished sending reads from %s (%lu reads)\n", fnames1[current_file], total); fflush(stdout);
+        //Close the input files
+        if(f1->type == 0) fclose(f1->x.fptxt);
+        else if(f1->type == 1) { gzclearerr(f1->x.fpgz); gzclose(f1->x.fpgz); }
+        else if(f1->type == 2) pclose(f1->x.fptxt);
+        if(config.paired) {
+            if(f2->type == 0) fclose(f2->x.fptxt);
+            else if(f2->type == 1) { gzclearerr(f2->x.fpgz); gzclose(f2->x.fpgz); }
+            else if(f2->type == 2) pclose(f2->x.fptxt);
+        }
+
+        current_file++;
+        if(++wordexp_offset >= fnames1_wordexp.we_wordc) {
+            //Ensure we move to the next file
+            wordexp_offset = 0;
+            fname1 = strtok_r(NULL,",", &save_ptr1);
+            if(fname1 == NULL) break;
+            rv1 = wordexp(fname1, &fnames1_wordexp, WRDE_SHOWERR | WRDE_UNDEF | WRDE_REUSE);
+            if(config.paired) {
+                fname2 = strtok_r(NULL,",", &save_ptr2);
+                rv2 = wordexp(fname2, &fnames2_wordexp, WRDE_SHOWERR | WRDE_UNDEF | WRDE_REUSE);
+                //fnames2[current_file] is set below, doing it here as well would leak a copy
+            }
+        } //Else we've incremented to the next file
+        if(rv1 == 0) fnames1[current_file] = strdup(fnames1_wordexp.we_wordv[wordexp_offset]); //This will need to be free()d
+        if(config.paired && rv2 == 0) {
+            fnames2[current_file] = strdup(fnames2_wordexp.we_wordv[wordexp_offset]); //This will need to be free()d
+        }
+    }
+
+finish: //We'll only ever "goto" here on an error, otherwise we'll get here normally
+    //Send a 1-byte package to signal completion
+#ifdef DEBUG
+    if(global_debug_taskid != MASTER) {
+        status = MPI_Send(A, 1, MPI_BYTE, 0, 3, MPI_COMM_WORLD);
+    }
+#else
+    for(j=1; j<=effective_nodes(); j++) {
+        status = MPI_Send(A, 1, MPI_BYTE, j, 3, MPI_COMM_WORLD);
+        if(status != MPI_SUCCESS) printf("Couldn't send 'finished' message to worker %i!\n", j);
+    }
+#endif
+
+    //Add the "finished" element
+#ifdef DEBUG
+    if(global_debug_taskid == MASTER) {
+#endif
+        for(i=0; i<nnode_groups; i++) {
+            add_finished(last_fastq_sentinel_node[i]);
+        }
+#ifdef DEBUG
+    }
+#endif
+
+    //Clean things up
+    if(cmd != NULL) free(cmd);
+    free(A);
+    free(f1->buf); free(f1); //The input state is only ever used inside this function
+    if(f2 != NULL) { free(f2->buf); free(f2); }
+    free(read->name1);
+    free(read->seq1);
+    free(read->qual1);
+    free(read->name2);
+    free(read->seq2);
+    free(read->qual2);
+    free(read);
+    wordfree(&fnames1_wordexp);
+    if(config.paired) wordfree(&fnames2_wordexp);
+    if(!config.quiet) printf("Finished reading in fastq files!\n"); fflush(stdout);
+    return NULL;
+}
diff --git a/herd/main.c b/herd/main.c
new file mode 100644
index 0000000..e30074b
--- /dev/null
+++ b/herd/main.c
@@ -0,0 +1,509 @@
+#include "../bison.h"
+#include <wordexp.h>
+
+void usage(char *prog) { //Print the command line help, prog is the program name to show
+    printf("Usage: %s [OPTIONS] -g genome_dir {-1 fastq_A1.gz,fastq_B1.gz -2 fastq_A2.gz,fastq_B2.gz | -U fastq.gz}\n", prog);
+    printf("\n \
+    N.B., Bison has a number of defaults that are different from that of bowtie2.\n \
+    All of these can be changed with the normal bowtie2 options, which change\n \
+    bison's behavior as well. MAPQ scores are recalculated by bison in the same\n \
+    way as they are in bowtie2 (or at least they should be). Any option not\n \
+    listed below will be passed directly to bowtie2, so you can specify, e.g.,\n \
+    --very-fast if you want. If you specify --local, --score-min is changed back\n \
+    to the bowtie2 default of 'G,20,8', unless you specify otherwise.\n \
+\n \
+    Note also that both -1/-2 and -U can accept a comma-separated list of input\n \
+    files. Unlike other aligners, the alignments from each of these files will\n \
+    be output to different files. This is meant to speed alignments of multiple\n \
+    samples, since the bowtie2 index and the genome sequence only need to be\n \
+    loaded a single time. Inputting more than one file (or pair, when using -1\n \
+    -2) implies --reorder.\n \
+\n \
+-g          Directory containing the genome fasta files and the\n \
+            Bisulfite_Sequences directory.\n \
+\n \
+-1          Fastq file containing read #1 (normally named something like \n \
+            foo_1.fastq.gz). Reads needn't be gzipped, but that'll be more\n \
+            convenient. You may also input a comma-separated list of files to be\n \
+            aligned (but see note above). Doing this implies --reorder.\n \
+\n \
+-2          As with -1, but with read #2.\n \
+\n \
+-U          For convenience, this denotes a fastq file from single-ended reads.\n \
+            Alternatively, -1 can be used without using -2. As with -1, you may\n \
+            also specify more than one file, in which case alignments from each\n \
+            will be printed to different files.\n \
+\n \
+-p          How many threads bowtie2 should use on each node. Default is 11.\n \
+\n \
+-mp         How many processing threads should run on the master node. Default\n \
+            is 1. Increasing this will be required to prevent the MPI buffer\n \
+            from becoming depleted and the master node then crashing. However,\n \
+            too many of these will cause resource underutilization. Keep in\n \
+            mind also that there are an additional 2 threads already running to\n \
+            do other things.\n \
+\n \
+-o          Output directory. By default, everything will be written to the\n \
+            directory holding the fastq files (or the file containing read #1,\n \
+            as appropriate). If you would prefer for the output BAM file and\n \
+            metrics txt file to be placed elsewhere, specify that here.\n \
+\n \
+            N.B., the directory must exist! \n \
+\n \
+-tmpdir     Temporary directory where named pipes will be created on the worker\n \
+            nodes. This just need to be a directory that is bison_herd can read\n \
+            and write to. The default is \"/tmp\".\n \
+\n \
+--directional Denotes that the library was created in a directional, rather\n \
+            than non-directional manner. This will result in 3, rather than 5\n \
+            nodes being used as only alignments to 2 (rather than 4) strands are\n \
+            possible.\n \
+\n \
+-upto       The maximum number of reads to process. This is mostly useful for\n \
+            debuging and more quickly determining if a library is directional or\n \
+            not. 0 is the default, meaning all reads are used. N.B., the\n \
+            maximum value for this parameter is whatever an unsigned long is on\n \
+            your system.\n \
+\n \
+--reorder   Reorder output to match the same order as the input. This will make\n \
+            things slower, but enable easier comparisons. This is passed to\n \
+            bowtie2 regardless of whether you specify it or not.\n \
+\n \
+-@          Number of BAM compression threads to use. This is equivalent to -@\n \
+            in samtools. The default is 1, but this may need to be increased as\n \
+            you increase the number of alotted nodes.\n \
+\n \
+--unmapped  Save unaligned reads to a file or files (as appropriate). This files\n \
+            will be placed in the same directory as the source fastq files,\n \
+            regardless of whether \"-o\" is used.\n \
+\n");
+#ifndef NOTHROTTLE
+    printf(" \
+-queue_size The maximum difference between the number of reads that have been\n \
+            read and the number that have been written. The default is 1000000\n \
+            and a value of 0 (or just compiling with -DNOTHROTTLE) will\n \
+            disable this. Since bison_herd can have a quiet large number of\n \
+            worker nodes performing alignments, it can happen that they\n \
+            overwhelm the master node that must then process their results. This\n \
+            option can help to prevent that (though increasing -mp is a better\n \
+            solution) by pausing the sending of reads out for alignment.\n \
+\n");
+#endif //NOTHROTTLE, -queue_size is only parsed when throttling is compiled in
+    printf(" \
+--quiet     Don't print anything but errors to the console (this is also passed\n \
+            to bowtie2).\n \
+\n \
+-h          Print this help message.\n \
+\n \
+-v          Print version information.\n \
+\n");
+#ifdef DEBUG
+    printf("\
+-taskid     Which node number to act as. The default is 0, the master node.\n \
+            Other possibilities are 1-4, which are the worker nodes that\n \
+            process OT, OB, CTOT, and CTOB alignments, respectively.\n \
+\n \
+            Note that if you plan to run with taskid=0 (i.e., as the master\n \
+            node), files named OT.bam, OB.bam, etc. should exist in your\n \
+            working directory. These will be created automatically if you run\n \
+            each pseudo-worker node first, which is recommended.\n\n");
+#endif //DEBUG
+}
+
+int main(int argc, char *argv[]) {
+    int i, taskid=0, provided;
+    pthread_t *threads;
+    int bowtie2_options_max = MAXREAD;
+    char *p = NULL, *tmp = NULL;
+    wordexp_t p_wordexp;
+    unsigned long upto = 0;
+    int ngroups;
+    int multi_file=0;
+#ifndef DEBUG
+    int name_len;
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+#endif
+
+    //Deal with MPI initialization, this seems like an odd way to do things.
+    MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided);
+    if(provided != MPI_THREAD_MULTIPLE) {
+        printf("You're MPI implementation doesn't support MPI_THREAD_MULTIPLE, which is required for bison_herd to work.\n");
+        return -1;
+    }
+#ifndef DEBUG
+    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
+    MPI_Get_processor_name(processor_name, &name_len);
+#endif
+
+    config.odir = NULL;
+    config.paired = 0; //Default is single-ended
+    config.directional = 0; //Default is non-directional
+    config.nthreads = 11; //Default is 11 threads/node
+    config.bowtie2_options = calloc(MAXREAD, sizeof(char));
+    config.unmapped = 0; //By default, unmapped reads are NOT written to a fastq file
+    config.scoremin_type = 'L'; //--score-min 'L,-0.6,-0.6'
+    config.scoremin_intercept = -0.6;
+    config.scoremin_coef = -0.6;
+    config.mode = 0; //--end-to-end
+    config.tmpdir = NULL; //-tmpdir
+    config.nmthreads = 1; //-mp
+    config.reorder = 0; //--reorder
+    config.outname = NULL; //Otherwise, we'll have problems when we realloc!
+    config.basename = NULL; //To handle multiple inputs
+    config.n_compression_threads = 0;
+    config.unmapped1 = NULL;
+    config.unmapped2 = NULL;
+    global_header = NULL;
+    unmapped1 = NULL;
+    unmapped2 = NULL;
+#ifndef NOTHROTTLE
+    config.reads_in_queue = 1000000;
+    nwritten = 0;
+#endif
+    chromosomes.nchromosomes = 0; //We need to initialize the struct
+
+    //These are only used during cleanup and will otherwise cause an error
+    config.FASTQ1CT = NULL;
+    config.FASTQ1GA = NULL;
+    config.FASTQ2CT = NULL;
+    config.FASTQ2GA = NULL;
+
+    //Initialize the global counts
+    t_reads = 0;
+    m_reads_OT = 0;
+    m_reads_OB = 0;
+    m_reads_CTOT = 0;
+    m_reads_CTOB = 0;
+    t_CpG = 0;
+    m_CpG = 0;
+    t_CHG = 0;
+    m_CHG = 0;
+    t_CHH = 0;
+    m_CHH = 0;
+
+    if(argc == 1) {
+        usage(argv[0]);
+        quit(0, 0);
+    }
+
+    for(i=1; i<argc; i++) {
+        if(strcmp(argv[i], "-h") == 0) {
+            usage(argv[0]);
+            quit(0, 0);
+        } else if(strcmp(argv[i], "-v") == 0) {
+            version();
+            quit(0,0);
+        } else if(strcmp(argv[i], "-1") == 0) {
+            i++;
+            config.FASTQ1 = argv[i];
+        } else if(strcmp(argv[i], "-2") == 0) {
+            i++;
+            config.FASTQ2 = argv[i];
+            config.paired = 1;
+        } else if(strcmp(argv[i], "-U") == 0) {
+            i++;
+            config.FASTQ1 = argv[i];
+        } else if(strcmp(argv[i], "-g") == 0) {
+            i++;
+            config.genome_dir = argv[i];
+        } else if(strcmp(argv[i], "-p") == 0) {
+            i++;
+            config.nthreads = atoi(argv[i]);
+        } else if(strcmp(argv[i], "-mp") == 0) {
+            i++;
+            config.nmthreads = atoi(argv[i]);
+        } else if(strcmp(argv[i], "-o") == 0) {
+            i++;
+            config.odir = strdup(argv[i]);
+        } else if(strcmp(argv[i], "-tmpdir") == 0) {
+            i++;
+            config.tmpdir= argv[i];
+        } else if(strcmp(argv[i], "-upto") == 0) {
+            i++;
+            //upto = atoi(argv[i]);
+            upto = strtoul(argv[i], NULL, 10);
+        } else if(strcmp(argv[i], "--directional") == 0) {
+            config.directional = 1;
+        } else if(strcmp(argv[i], "--unmapped") == 0) {
+            config.unmapped = 1;
+        } else if(strcmp(argv[i], "--reorder") == 0) {
+            config.reorder = 1;
+        } else if(strcmp(argv[i], "-@") == 0) {
+            config.n_compression_threads = atoi(argv[++i]);
+#ifndef NOTHROTTLE
+        } else if(strcmp(argv[i], "-queue_size") == 0) {
+            i++;
+            config.reads_in_queue = atoi(argv[i]);
+#endif
+#ifdef DEBUG
+        } else if(strcmp(argv[i], "-taskid") == 0) {
+            i++;
+            global_debug_taskid = atoi(argv[i]);
+            taskid = global_debug_taskid;
+#endif
+        } else if(strcmp(argv[i], "--score-min") == 0) {
+            i++;
+            printf("Changing --score-min from 'L,-0.6,-0.6' to %s!\n", argv[i]);
+            config.scoremin_type = strtok(argv[i], ",")[0];
+            config.scoremin_intercept = (float) atof(strtok(NULL, ","));
+            config.scoremin_coef = (float) atof(strtok(NULL, ","));
+        } else {
+            if(strcmp(argv[i], "--local") == 0 || strcmp(argv[i], "--very-fast-local") == 0 || strcmp(argv[i], "--fast-local") == 0 || strcmp(argv[i], "--sensitive-local") == 0 || strcmp(argv[i], "--very-sensitive-local") == 0) {
+                config.mode = 1;
+                if(config.scoremin_type == 'L' && config.scoremin_intercept == -0.6f && config.scoremin_coef == -0.6f) {
+                    config.scoremin_type = 'G';
+                    config.scoremin_intercept = 20.0;
+                    config.scoremin_coef = 8.0;
+                    if(!config.quiet) printf("Since --local was specified and --score-min was not already changed, changing --score-min to the bowtie2 default of 'G,20,8' (specify --score-min to change this)\n");
+                }
+            }
+            if(strcmp(argv[i], "--quiet") == 0) config.quiet = 1; //This also needs to be passed
+            //bowtie2 option
+            if(strlen(config.bowtie2_options) + 1 + strlen(argv[i]) >= bowtie2_options_max) {
+                bowtie2_options_max = strlen(config.bowtie2_options) + 1 + strlen(argv[i]) + 100;
+                config.bowtie2_options = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max);
+            }
+            strcat(config.bowtie2_options, " ");
+            strcat(config.bowtie2_options, argv[i]);
+        }
+    }
+
+    if(config.FASTQ1 == NULL || config.genome_dir == NULL || (config.FASTQ2 == NULL && config.paired == 1)) {
+        if(taskid == MASTER) {
+            printf("No FASTQ files!\n");
+            usage(argv[0]);
+        }
+        quit(0, -1);
+    }
+
+    //If more than one input file was specified, enable reorder
+    tmp = strdup(config.FASTQ1);
+    p = strtok(tmp, ",");
+    if(wordexp(p, &p_wordexp, WRDE_SHOWERR | WRDE_UNDEF) != 0) {
+        printf("There was an error while parsing %s.\n", p);
+        free(tmp);
+        wordfree(&p_wordexp);
+        quit(0, -1);
+    }
+    multi_file += p_wordexp.we_wordc;
+    p = strtok(NULL, ",");
+    while(p != NULL) {
+        if(wordexp(p, &p_wordexp, WRDE_SHOWERR | WRDE_UNDEF | WRDE_REUSE) != 0) {
+            printf("There was an error while parsing %s.\n", p);
+            free(tmp);
+            wordfree(&p_wordexp);
+            quit(0, -1);
+        }
+        multi_file += p_wordexp.we_wordc;
+        p = strtok(NULL, ",");
+    }
+    free(tmp);
+    wordfree(&p_wordexp);
+#ifdef DEBUG
+    if(multi_file>1) {
+        printf("In DEBUG mode, you can't input multiple file-sets!\n");
+        quit(0,-1);
+    }
+#else
+    if(!config.quiet) printf("%s has rank %i\n", processor_name, taskid); fflush(stdout);
+    if(taskid > effective_nodes()) {
+        printf("From node %i: So long and thanks for all the bits.\n", taskid); fflush(stdout);
+        return(-2); //We're an extraneous node
+    }
+#endif
+    ngroups = effective_nodes();
+
+    if(config.tmpdir == NULL) config.tmpdir = "/tmp";
+
+    //Allocate room for the genome, if needed
+    if(taskid == MASTER) {
+        chromosomes.max_genome = 3000000000;
+        if(!config.quiet) printf("Allocating space for %llu characters\n", chromosomes.max_genome);
+        fflush(stdout);
+        chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome);
+        *chromosomes.genome = '\0';
+        if(chromosomes.genome == NULL) {
+            printf("Could not allocate enough room to hold the genome!\n");
+            return -1;
+        }
+    } else {
+        chromosomes.max_genome = 0;
+    }
+
+    //Setup the global variables (these will need to be free()d!)
+#ifndef DEBUG
+    if(taskid == MASTER) {
+#endif
+        nwritten = calloc(multi_file, sizeof(char *));
+        fnames1 = calloc(multi_file, sizeof(char *));
+        fnames2 = calloc(multi_file, sizeof(char *));
+        flengths = calloc(multi_file, sizeof(unsigned long long));
+#ifndef DEBUG
+    }
+#endif
+
+    //Append score_min, and p
+    if(strlen(config.bowtie2_options) + 1000 >= bowtie2_options_max) {
+        bowtie2_options_max = strlen(config.bowtie2_options) + 1000; //This should suffice
+        config.bowtie2_options = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max);
+    }
+    if(strlen(config.bowtie2_options) > 0) {
+        sprintf(config.bowtie2_options, "%s -p %i --score-min '%c,%g,%g'", config.bowtie2_options, config.nthreads, config.scoremin_type, config.scoremin_intercept, config.scoremin_coef);
+    } else {
+        sprintf(config.bowtie2_options, "-p %i --score-min '%c,%g,%g'", config.nthreads, config.scoremin_type, config.scoremin_intercept, config.scoremin_coef);
+    }
+
+    //There should be as many tasks according to MPI as dictated by the library type.
+    if(config.directional) {
+        ngroups /= 2;
+    } else {
+        ngroups /= 4;
+    }
+    if(ngroups < 1) {
+        if(taskid == MASTER) printf("There are only %i groups of nodes available!! You need to allocate more nodes (at least 3 for direcional and 5 for non-directional libraries)!\n", ngroups);
+        quit(0, -1);
+    }
+    //Yes, these silently change user input
+    if(config.nmthreads < 1) config.nmthreads = 1;
+    if(config.nmthreads > ngroups) config.nmthreads = ngroups;
+
+#ifdef DEBUG
+    //DEBUG can't handle multiple files
+    update_odir();
+    config.basename = get_basename(config.FASTQ1);
+    config.outname = malloc(sizeof(char)*(strlen(config.odir)+ strlen(config.basename)+5));
+    sprintf(config.outname, "%s%s.bam", config.odir, config.basename);
+    if(taskid == MASTER) {
+#else
+    if(taskid == MASTER) {
+        //Deal with the output directory
+        update_odir();
+#endif
+
+        //Store the genome into memory
+        read_genome();
+
+        //Setup the mutexes
+        pthread_mutex_init(&metrics_mutex, NULL);
+
+        //Setup the linked-lists
+        nodes = malloc(sizeof(struct packed_struct *)*effective_nodes());
+        last_sentinel_node = malloc(sizeof(struct packed_struct *)*effective_nodes());
+        fastq_nodes = malloc(sizeof(struct packed_struct *)*ngroups);
+        last_fastq_sentinel_node = malloc(sizeof(struct packed_struct *)*ngroups);
+        to_write_node = malloc(sizeof(struct packed_struct *)*config.nmthreads);
+        to_write_sentinel_node = malloc(sizeof(struct packed_struct *)*config.nmthreads);
+        for(i=0; i<effective_nodes(); i++) {
+            nodes[i] = initialize_list(nodes[i]);
+            last_sentinel_node[i] = nodes[i]->next;
+        }
+        for(i=0; i<ngroups; i++) {
+            fastq_nodes[i] = initialize_list(fastq_nodes[i]);
+            last_fastq_sentinel_node[i] = fastq_nodes[i]->next;
+        }
+        for(i=0; i<config.nmthreads; i++) {
+            to_write_node[i] = initialize_list(to_write_node[i]);
+            to_write_sentinel_node[i] = to_write_node[i]->next;
+        }
+
+        //Start the master node processer threads
+        threads = calloc(2+config.nmthreads, sizeof(pthread_t));
+        int *threadids = malloc(sizeof(int)*config.nmthreads);
+        pthread_create(&(threads[0]), NULL, &send_store_fastq, (void *) &upto);
+        for(i=0; i<config.nmthreads; i++) {
+            *(threadids+i) = i;
+            pthread_create(&(threads[i+1]), NULL, &herd_master_processer_thread, threadids+i);
+        }
+        pthread_create(&(threads[1+config.nmthreads]), NULL, &bam_writer, NULL);
+        herd_slurp(NULL);
+        pthread_join(threads[0], NULL);
+        for(i=0; i<config.nmthreads; i++) pthread_join(threads[i+1], NULL);
+        pthread_join(threads[1+config.nmthreads], NULL);
+
+        //Start freeing things up
+        free(threadids);
+        free(threads);
+        for(i=0; i<effective_nodes(); i++) destroy_list(nodes[i]);
+        free(nodes);
+        free(last_sentinel_node);
+        for(i=0; i<ngroups; i++) destroy_raw_list(fastq_nodes[i]);
+        free(fastq_nodes);
+        free(last_fastq_sentinel_node);
+        free(to_write_node);
+        free(to_write_sentinel_node);
+        pthread_mutex_destroy(&metrics_mutex);
+
+        //Print some metrics
+        bam_header_destroy(global_header);
+    } else {
+        //Create a temporary directory
+        char *tmpdir = malloc(sizeof(char) * (strlen(config.tmpdir) + strlen("/herd_XXXXXX") + 1));
+        sprintf(tmpdir, "%s/herd_XXXXXX", config.tmpdir);
+        tmpdir = mkdtemp(tmpdir);
+
+        //Name the FIFOs
+        slurp_fastq_struct *silly_struct = malloc(sizeof(slurp_fastq_struct));
+        silly_struct->thread_id = taskid;
+        silly_struct->fastq1 = malloc(sizeof(char) * (strlen(tmpdir) + strlen("/read1") + 1));
+        silly_struct->fastq2 = malloc(sizeof(char) * (strlen(tmpdir) + strlen("/read2") + 1));
+        sprintf(silly_struct->fastq1, "%s/read1", tmpdir);
+        sprintf(silly_struct->fastq2, "%s/read2", tmpdir);
+
+        mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
+        int rv = mkfifo(silly_struct->fastq1, mode);
+        if(rv != 0) {
+            printf("mkfifo returned with status %i!\n", rv);
+            fflush(stdout);
+        }
+        if(config.paired) {
+            rv = mkfifo(silly_struct->fastq2, mode);
+            if(rv != 0) {
+                printf("mkfifo returned with status %i!\n", rv);
+                fflush(stdout);
+            }
+        }
+
+        //Start slurping in the fastq reads and converting them so they can be aligned
+#ifndef DEBUG
+        threads = calloc(1, sizeof(pthread_t));
+        pthread_create(&(threads[0]), NULL, &slurp_fastq, (void *) silly_struct);
+#else
+        threads = calloc(2, sizeof(pthread_t));
+        pthread_create(&(threads[1]), NULL, &send_store_fastq, (void *) &upto);
+        pthread_create(&(threads[0]), NULL, &slurp_fastq, (void *) silly_struct);
+#endif
+        //worker node stuff
+        herd_worker_node(taskid, silly_struct->fastq1, silly_struct->fastq2);
+        pthread_join(threads[0], NULL);
+#ifdef DEBUG
+        pthread_join(threads[1], NULL);
+#endif
+        if(!config.quiet) printf("Returning from worker node %i\n", taskid);
+        fflush(stdout);
+        free(silly_struct->fastq1); //The worker node unlinks this
+        free(silly_struct->fastq2); //The worker node unlinks this
+        free(silly_struct);
+        if(rmdir(tmpdir) != 0) {
+            printf("Couldn't remove %s directory!\n", tmpdir);
+            fflush(stdout);
+        }
+        free(tmpdir);
+        free(threads);
+    }
+
+#ifndef DEBUG
+    if(taskid == MASTER) {
+#endif
+        free(nwritten);
+        free(fnames1);
+        free(fnames2);
+        free(flengths);
+#ifndef DEBUG
+    }
+#endif
+
+    //Clean up
+    if(config.odir != NULL) free(config.odir);
+    quit(3, 0);
+    return 0;
+}
diff --git a/herd/master.c b/herd/master.c
new file mode 100644
index 0000000..05af3f7
--- /dev/null
+++ b/herd/master.c
@@ -0,0 +1,221 @@
+#include "../bison.h"
+#include <math.h>
+#include <sys/time.h>
+
+/*******************************************************************************
+*
+*   Master-node worker thread: for each node group assigned to it, pick the
+*   best alignment among the group's nodes and queue it for the writer thread.
+*   void *a: Actually an int*, the thread_id
+*   Returns NULL.
+*******************************************************************************/
+void * herd_master_processer_thread(void *a) {
+    int thread_id = *((int *) a), best_node, j, quit = 0, multiplier;
+    int ngroups = effective_nodes();
+    int node_base, node_final;
+    int tmp_j = 0; //Saves j while the inner flush loop below reuses it
+    char **seq = malloc(sizeof(char *) * 2); //[0] = read #1 sequence, [1] = read #2 sequence (paired only)
+    bam1_t **node1_read = malloc(sizeof(bam1_t*) * 2); //As above: [0] = read #1, [1] = read #2
+    bam1_t **node2_read = malloc(sizeof(bam1_t*) * 2);
+    bam1_t **node3_read = malloc(sizeof(bam1_t*) * 2); //Only filled in for non-directional libraries
+    bam1_t **node4_read = malloc(sizeof(bam1_t*) * 2); //Only filled in for non-directional libraries
+    bam1_t **best_read = NULL;
+    fastq *read = malloc(sizeof(fastq));
+    time_t now;
+    char ctime_buffer[26];
+    unsigned long long local_m_reads_OT = 0, local_m_reads_OB = 0;
+    unsigned long long local_m_reads_CTOT = 0, local_m_reads_CTOB = 0;
+    unsigned long long local_total = 0;
+
+    //A node group is 2 nodes (directional) or 4 nodes (non-directional); multiplier is the group size
+    if(config.directional) {
+        ngroups /= 2;
+        multiplier = 2;
+    } else {
+        ngroups /= 4;
+        multiplier = 4;
+    }
+    read->max_name1 = 0;
+    read->max_seq1 = 0;
+    read->max_qual1 = 0;
+    read->max_name2 = 0;
+    read->max_seq2 = 0;
+    read->max_qual2 = 0;
+    read->name1 = NULL;
+    read->seq1 = NULL;
+    read->qual1 = NULL;
+    read->name2 = NULL;
+    read->seq2 = NULL;
+    read->qual2 = NULL;
+
+    //This thread handles node groups [node_base, node_final); the last thread also takes the remainder
+    node_base = thread_id*(ngroups/config.nmthreads);
+    node_final = node_base+(ngroups/config.nmthreads)-1;
+    if(thread_id+1-config.nmthreads == 0) node_final += ngroups % config.nmthreads;
+    node_final++;
+
+    //Process read i/o
+    while(quit < node_final-node_base) {
+        //Currently, we output everything in the same order as the original input
+        //We could also invoke one thread per node-group (or multiple groups) and
+        //then either output in order or randomly (easier to implement).
+        //I'll have to benchmark things to see if this can keep up
+        for(j=node_base; j<node_final; j++) {
+            while(!is_ready(nodes[multiplier*j], 0)); //Busy-wait (no sleep) for the group's first node
+            if(is_finished(nodes[multiplier*j])) {
+                quit += 1; //NOTE(review): never reset, so an already-finished group appears to be counted again on every pass - confirm groups always finish together
+                if(!config.quiet) printf("Thread %i received a finished signal from node group %i (%i of %i groups are finished)\n", thread_id, j, quit, node_final-node_base); fflush(stdout);
+                continue;
+            }
+            *(node1_read) = update_read(nodes[multiplier*j], 0);
+            if(config.paired) {
+                while(!is_ready(nodes[multiplier*j], 1));
+                *(node1_read+1) = update_read(nodes[multiplier*j], 1);
+            }
+            while(!is_ready(nodes[multiplier*j+1], 0));
+            *(node2_read) = update_read(nodes[multiplier*j+1], 0);
+            if(config.paired) {
+                while(!is_ready(nodes[multiplier*j+1], 1));
+                *(node2_read+1) = update_read(nodes[multiplier*j+1], 1);
+            }
+            if(!config.directional) {
+                while(!is_ready(nodes[multiplier*j+2], 0));
+                *node3_read = update_read(nodes[multiplier*j+2], 0);
+                if(config.paired) {
+                    while(!is_ready(nodes[multiplier*j+2], 1));
+                    *(node3_read+1) = update_read(nodes[multiplier*j+2], 1);
+                }
+                while(!is_ready(nodes[multiplier*j+3], 0));
+                *node4_read = update_read(nodes[multiplier*j+3], 0);
+                if(config.paired) {
+                    while(!is_ready(nodes[multiplier*j+3], 1));
+                    *(node4_read+1) = update_read(nodes[multiplier*j+3], 1);
+                }
+            }
+
+            //Progress output every 100000 reads; the count is per-thread, not global
+            if(!config.quiet) {
+                if(++local_total % 100000 == 0) {
+                    now = time(NULL);
+                    printf("%llu reads in thread %i @ %s", local_total, thread_id, ctime_r(&now, ctime_buffer)); fflush(stdout);
+                }
+            }
+
+            //Get the appropriate read from the linked list
+            while(!is_ready(fastq_nodes[j], 0)) sleep(1);
+            //Do we need to flush our statistics? (signalled by a packed entry whose first byte is '\2')
+            if(*((char *)(fastq_nodes[j]->next->packed)) == '\2') {
+                //Update!
+//Fold this thread's counters into the global metrics while holding metrics_mutex
+                pthread_mutex_lock(&metrics_mutex);
+                m_reads_OT += local_m_reads_OT;
+                m_reads_OB += local_m_reads_OB;
+                m_reads_CTOT += local_m_reads_CTOT;
+                m_reads_CTOB += local_m_reads_CTOB;
+                pthread_mutex_unlock(&metrics_mutex);
+//The local counters restart from zero after each flush
+                local_m_reads_OT = 0;
+                local_m_reads_OB = 0;
+                local_m_reads_CTOT = 0;
+                local_m_reads_CTOB = 0;
+                local_total = 0;
+                tmp_j = j;
+                for(j=node_base; j<node_final; j++) { //Drop the head entry of every fastq list this thread handles
+                    while(!is_ready(fastq_nodes[j], 0)) sleep(1);
+                    remove_raw_element(fastq_nodes[j]);
+                }
+                j = tmp_j;
+                while(!is_ready(fastq_nodes[j], 0)) sleep(1);
+            }
+            read = unpack_fastq(read, (fastq_nodes[j])->next->packed);
+            *seq = read->seq1;
+            *(*seq + strlen(*seq) - 1) = '\0'; //Overwrite the last character (the trailing \n) with a terminator
+            if(config.paired) {
+                *(seq+1) = read->seq2;
+                *(*(seq+1) + strlen(*(seq+1)) - 1) = '\0';
+            }
+
+            //Process the reads; the return value is the 1-based index of the winning node within the group
+            if(!config.paired) {
+                best_node = process_single(*node1_read, *node2_read, *node3_read, *node4_read, *seq); //Output is stored in read1
+            } else {
+                best_node = process_paired(node1_read, node2_read, node3_read, node4_read, seq); //Output is stored in read
+            }
+
+            if(best_node == 1) {
+                best_read = node1_read;
+                if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_OT++;
+            } else if(best_node == 2) {
+                best_read = node2_read;
+                if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_OB++;
+            } else if(best_node == 3) {
+                best_read = node3_read;
+                if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_CTOT++;
+            } else if(best_node == 4) {
+                best_read = node4_read;
+                if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_CTOB++;
+            }
+
+            //Store the reads and free up space (N.B., the writer thread will free up the space used by the best read)
+            if(best_node != 1) {
+                remove_element(nodes[multiplier*j]);
+                if(config.paired) remove_element(nodes[multiplier*j]);
+            } else {
+                move_element(nodes[multiplier*j], to_write_sentinel_node[thread_id]);
+            }
+            if(best_node != 2) {
+                remove_element(nodes[multiplier*j+1]);
+                if(config.paired) remove_element(nodes[multiplier*j+1]);
+            } else {
+                move_element(nodes[multiplier*j+1], to_write_sentinel_node[thread_id]);
+            }
+            if(!config.directional) {
+                if(best_node != 3) {
+                    remove_element(nodes[multiplier*j+2]);
+                    if(config.paired) remove_element(nodes[multiplier*j+2]);
+                } else {
+                    move_element(nodes[multiplier*j+2], to_write_sentinel_node[thread_id]);
+                }
+                if(best_node != 4) {
+                    remove_element(nodes[multiplier*j+3]);
+                    if(config.paired) remove_element(nodes[multiplier*j+3]);
+                } else {
+                    move_element(nodes[multiplier*j+3], to_write_sentinel_node[thread_id]);
+                }
+            }
+            remove_raw_element(fastq_nodes[j]);
+        }
+    }
+
+    //Tell the writer thread that we're finished
+    add_finished(to_write_sentinel_node[thread_id]);
+
+    //Update the global metrics
+//Final flush of whatever was counted since the last '\2' marker, again under metrics_mutex
+    pthread_mutex_lock(&metrics_mutex);
+    m_reads_OT += local_m_reads_OT;
+    m_reads_OB += local_m_reads_OB;
+    m_reads_CTOT += local_m_reads_CTOT;
+    m_reads_CTOB += local_m_reads_CTOB;
+    pthread_mutex_unlock(&metrics_mutex);
+//metrics_mutex released
+
+    //Clean up
+    free(seq);
+    free(node1_read);
+    free(node2_read);
+    free(node3_read);
+    free(node4_read);
+    free(read->name1);
+    free(read->seq1);
+    free(read->qual1);
+    if(config.paired) {
+        free(read->name2);
+        free(read->seq2);
+        free(read->qual2);
+    }
+    free(read);
+    if(!config.quiet) printf("Thread %i finishing!\n", thread_id); fflush(stdout);
+
+    return NULL;
+}
diff --git a/herd/slurp.c b/herd/slurp.c
new file mode 100644
index 0000000..8a8cecd
--- /dev/null
+++ b/herd/slurp.c
@@ -0,0 +1,286 @@
+#include "../bison.h"
+
+/******************************************************************************
+*
+*   Unlink the element directly after "first" and free it along with its
+*   ->packed buffer. The caller must ensure is_ready(first, 0) returns 1!
+*
+*   struct packed_struct *first: first sentinel struct
+*
+*******************************************************************************/
+void remove_raw_element(struct packed_struct *first) {
+    struct packed_struct *victim = first->next;
+
+    //Bypass the element before releasing its memory
+    first->next = victim->next;
+    free(victim->packed);
+    free(victim);
+}
+
+/******************************************************************************
+*
+*   Move an element from one linked-list to another.
+*
+*   struct packed_struct *source: source linked list
+*   struct packed_struct *dest: destination sentinel node
+*
+*******************************************************************************/
+void move_element(struct packed_struct *source, struct packed_struct *dest) {
+    struct packed_struct *next_to_last = dest->previous; //Current tail of the destination, saved before dest->previous is overwritten
+    struct packed_struct *element = source->next; //The element being moved (read #1 when paired)
+    struct packed_struct *new_next = NULL; //What source->next becomes once the element is gone
+
+    //Detach from source: one element when single-end, two consecutive elements (read #1, read #2) when paired
+    if(config.paired) {
+        new_next = source->next->next->next; //the next read #1
+        source->next->next->previous = source->next; //Ensure that read #2 has the address for read #1
+        source->next->previous = dest->previous; //Ensure that read #1 points to the previous read #2
+        //Terminate the moved pair at the destination sentinel
+        element->next->next = dest; //point read #2 to the sentinel node
+        element->state = 0; //read #1 set not ready
+        element->next->state = 0; //read #2 set not ready
+        dest->previous = element->next; //Update the destination sentinel node
+    } else {
+        new_next = source->next->next; //the next read
+        source->next->previous = dest->previous; //Ensure that the read knows who came before it
+        //Terminate the moved read at the destination sentinel
+        element->next = dest; //Next is the sentinel node
+        element->state = 0; //read is set not ready
+        dest->previous = element; //Update destination sentinel node
+    }
+
+    //Update the source
+    source->next = new_next;
+
+    //Add to destination; the previously-last entry is only flagged ready now that something follows it
+    next_to_last->next = element; //Update previous read to point to the new one
+    if(!config.paired) {
+        //Don't do anything if the previous node is a sentinel node
+        if(next_to_last->previous != next_to_last) next_to_last->state = 1; //set previous read to ready
+    } else {
+        //Don't do anything if the previous node is a sentinel node
+        if(next_to_last->previous != next_to_last) next_to_last->previous->state = 1; //Set previous read #1 to ready (we never check read #2)
+    }
+}
+
+/******************************************************************************
+*
+*   Free a whole raw linked list: every element that still has an unmodified
+*   ->packed buffer, then the element that links to itself, then "first".
+*   struct packed_struct *first: linked list to destroy
+*
+*******************************************************************************/
+void destroy_raw_list(struct packed_struct *first) {
+    for(; first->next != first->next->next; ) remove_raw_element(first); //Stop at the self-linked element
+    free(first->next);
+    free(first);
+}
+
+/******************************************************************************
+*
+*   The MPI receiver thread on a bison_herd main node
+*
+*   void *a: NULL input
+*
+*   returns NULL
+*
+*******************************************************************************/
+#ifndef DEBUG
+void *herd_slurp(void *a) {
+    //Receive the packed BAM header from rank 1, publish it, then queue every tag-5 message until all nodes finish ("a" is unused)
+    time_t t0, t1;
+    void *p = NULL;
+    int nnodes = effective_nodes(), nfinished = 0, source = 0, size = 0;
+    struct packed_struct *target_node = NULL;
+    bam_header_t *tmp_header;
+    MPI_Status status;
+    if(MPI_Recv((void *) &size, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &status) != MPI_SUCCESS) {
+        printf("Received an error when trying to receive header size.\n");
+        fflush(stdout);
+        quit(3, -2);
+    }
+    p = malloc((size_t) size);
+    if(p == NULL || MPI_Recv(p, size, MPI_BYTE, 1, 2, MPI_COMM_WORLD, &status) != MPI_SUCCESS) { //Never receive into a failed allocation
+        printf("Received an error when trying to receive header.\n");
+        fflush(stdout);
+        quit(3, -2);
+    }
+    tmp_header= bam_header_init();
+    unpack_header(tmp_header, p);
+    free(p);
+    global_header = tmp_header; //Now the writer thread is unblocked!
+    //Main loop: one message per iteration, appended to the list indexed by the sender's rank minus one
+    t0 = time(NULL);
+    if(!config.quiet) printf("Started slurping @%s", ctime(&t0)); fflush(stdout);
+    while(nfinished < nnodes) {
+        MPI_Probe(MPI_ANY_SOURCE, 5, MPI_COMM_WORLD, &status);
+        source = status.MPI_SOURCE;
+        MPI_Get_count(&status, MPI_BYTE, &size);
+        target_node = last_sentinel_node[source-1];
+        //Every probed message has to be received, so allocate and receive once before deciding what it means
+        p = malloc((size_t) size);
+        if(p == NULL && size > 0) { //malloc(0) may legitimately return NULL, so only a non-empty request is an error
+            printf("Could not allocate %i bytes for a message from node %i!\n", size, source); fflush(stdout);
+            quit(3, -2);
+        }
+        MPI_Recv(p, size, MPI_BYTE, source, 5, MPI_COMM_WORLD, &status);
+        if(size > 1) {
+            add_element(target_node, p); //The buffer is handed to the list, so it is not freed here
+        } else {
+            free(p); //A message of at most one byte is the sender's "finished" signal
+            add_finished(target_node);
+            nfinished++;
+        }
+    }
+    t1 = time(NULL);
+    if(!config.quiet) printf("Finished slurping @%s\t(%f seconds elapsed)\n", ctime(&t1), difftime(t1, t0)); fflush(stdout);
+    return NULL;
+}
+#else
+void *herd_slurp(void *a) { //DEBUG build: feed the per-node lists from 8 BAM files on disk instead of from MPI messages
+    time_t t0, t1;
+    bamFile fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8;
+    char *iname = malloc(sizeof(char) * (1+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); //Reused for all 8 file names
+    bam1_t *read = bam_init1();
+    bam_header_t *tmp;
+    MPI_read *packed = calloc(1, sizeof(MPI_read));
+    struct packed_struct *target_node = NULL;
+
+    //Open the input files and get the header (only the header of file 1 is kept; the others are read and discarded)
+    sprintf(iname, "%s%s_1.bam", config.odir, config.basename);
+    fp1 = bam_open(iname, "r");
+    global_header = bam_header_read(fp1);
+    sprintf(iname, "%s%s_2.bam", config.odir, config.basename);
+    fp2 = bam_open(iname, "r");
+    tmp = bam_header_read(fp2);
+    bam_header_destroy(tmp);
+    sprintf(iname, "%s%s_3.bam", config.odir, config.basename);
+    fp3 = bam_open(iname, "r");
+    tmp = bam_header_read(fp3);
+    bam_header_destroy(tmp);
+    sprintf(iname, "%s%s_4.bam", config.odir, config.basename);
+    fp4 = bam_open(iname, "r");
+    tmp = bam_header_read(fp4);
+    bam_header_destroy(tmp);
+    sprintf(iname, "%s%s_5.bam", config.odir, config.basename);
+    fp5 = bam_open(iname, "r");
+    tmp = bam_header_read(fp5);
+    bam_header_destroy(tmp);
+    sprintf(iname, "%s%s_6.bam", config.odir, config.basename);
+    fp6 = bam_open(iname, "r");
+    tmp = bam_header_read(fp6);
+    bam_header_destroy(tmp);
+    sprintf(iname, "%s%s_7.bam", config.odir, config.basename);
+    fp7 = bam_open(iname, "r");
+    tmp = bam_header_read(fp7);
+    bam_header_destroy(tmp);
+    sprintf(iname, "%s%s_8.bam", config.odir, config.basename);
+    fp8 = bam_open(iname, "r");
+    tmp = bam_header_read(fp8);
+    bam_header_destroy(tmp);
+    free(iname);
+
+
+    //Write a header
+    bam_header_write(OUTPUT_BAM, global_header);
+    packed->size = 0;
+
+    t0 = time(NULL);
+    if(!config.quiet) printf("Started slurping @%s", ctime(&t0)); fflush(stdout);
+
+    while(bam_read1(fp1, read) > 1) { //File 1 drives the loop; files 2-8 are read in lockstep without checking their return values
+        packed->packed = NULL; //Reset before each pack_read() call; presumably this makes it allocate a fresh buffer - TODO confirm
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[0];
+        add_element(target_node, packed->packed);
+
+        //Node2: same steps, file 2 into list 1
+        bam_read1(fp2, read);
+        packed->packed = NULL;
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[1];
+        add_element(target_node, packed->packed);
+
+        //Node3: same steps, file 3 into list 2
+        bam_read1(fp3, read);
+        packed->packed = NULL;
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[2];
+        add_element(target_node, packed->packed);
+
+        //Node4: same steps, file 4 into list 3
+        bam_read1(fp4, read);
+        packed->packed = NULL;
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[3];
+        add_element(target_node, packed->packed);
+
+        //Node5: same steps, file 5 into list 4
+        bam_read1(fp5, read);
+        packed->packed = NULL;
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[4];
+        add_element(target_node, packed->packed);
+
+        //Node6: same steps, file 6 into list 5
+        bam_read1(fp6, read);
+        packed->packed = NULL;
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[5];
+        add_element(target_node, packed->packed);
+
+        //Node7: same steps, file 7 into list 6
+        bam_read1(fp7, read);
+        packed->packed = NULL;
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[6];
+        add_element(target_node, packed->packed);
+
+        //Node8: same steps, file 8 into list 7
+        bam_read1(fp8, read);
+        packed->packed = NULL;
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        target_node = last_sentinel_node[7];
+        add_element(target_node, packed->packed);
+    }
+    free(packed); //Only the wrapper struct is freed here; the packed buffers were handed to add_element() above
+    bam_destroy1(read);
+    bam_close(fp1);
+    bam_close(fp2);
+    bam_close(fp3);
+    bam_close(fp4);
+    bam_close(fp5);
+    bam_close(fp6);
+    bam_close(fp7);
+    bam_close(fp8);
+
+    target_node = last_sentinel_node[0]; //Mark each of the 8 lists as finished, one after another
+    add_finished(target_node);
+    target_node = last_sentinel_node[1];
+    add_finished(target_node);
+    target_node = last_sentinel_node[2];
+    add_finished(target_node);
+    target_node = last_sentinel_node[3];
+    add_finished(target_node);
+    target_node = last_sentinel_node[4];
+    add_finished(target_node);
+    target_node = last_sentinel_node[5];
+    add_finished(target_node);
+    target_node = last_sentinel_node[6];
+    add_finished(target_node);
+    target_node = last_sentinel_node[7];
+    add_finished(target_node);
+
+    t1 = time(NULL);
+    if(!config.quiet) printf("Finished slurping @%s\t(%f seconds elapsed)\n", ctime(&t1), difftime(t1, t0)); fflush(stdout);
+    return NULL;
+}
+#endif
diff --git a/herd/worker.c b/herd/worker.c
new file mode 100644
index 0000000..c914c63
--- /dev/null
+++ b/herd/worker.c
@@ -0,0 +1,427 @@
+#include "../bison.h"
+#include <sys/time.h>
+
+//Linked lists that hand packed reads from slurp_fastq() to the writer threads.
+//first_writer/second_writer are the list heads polled by the writer threads;
+//the *_sentinel pointers are the nodes slurp_fastq() appends new elements to.
+struct packed_struct *first_writer, *first_writer_sentinel;
+struct packed_struct *second_writer, *second_writer_sentinel;
+
+/* Writer thread for read #1.
+ *
+ * Drains the first_writer list: each packed read is unpacked, mate 1 is
+ * converted as required by this node's strand and the record is written to
+ * the file named by fastq1. The thread stops once the list reports finished.
+ *
+ * void *a: a slurp_fastq_struct pointer (thread_id and fastq1 are used)
+ *
+ * Always returns NULL. */
+void * first_writer_func(void *a) {
+    slurp_fastq_struct *args = (slurp_fastq_struct *) a;
+    int thread_id = args->thread_id;
+    FILE *fifo = fopen(args->fastq1, "w");
+    fastq *rec = malloc(sizeof(fastq));
+    //Directional libraries cycle through 2 strands, non-directional through 4
+    int strand = (thread_id-1) % (config.directional ? 2 : 4);
+
+    //Every buffer of the fastq struct starts out with room for 10 chars
+    rec->max_name1 = 10;
+    rec->max_name2 = 10;
+    rec->max_seq1 = 10;
+    rec->max_seq2 = 10;
+    rec->max_qual1 = 10;
+    rec->max_qual2 = 10;
+    rec->name1 = malloc(sizeof(char) * 10);
+    rec->seq1 = malloc(sizeof(char) * 10);
+    rec->qual1 = malloc(sizeof(char) * 10);
+    rec->name2 = malloc(sizeof(char) * 10);
+    rec->seq2 = malloc(sizeof(char) * 10);
+    rec->qual2 = malloc(sizeof(char) * 10);
+
+    for(;;) {
+        //Busy-wait for the next element, sleeping slows things down too much
+        while(!is_ready(first_writer, 0)) {
+        }
+        if(is_finished(first_writer)) break;
+
+        //Unpack the element at the head of the list, then drop it from the list
+        rec = unpack_fastq(rec, first_writer->next->packed);
+        remove_raw_element(first_writer);
+
+        //Strands 0 and 1 get read #1 C->T converted, strands 2 and 3 G->A
+        if(strand == 0 || strand == 1) {
+            convertCT(rec, 0);
+        } else if(strand == 2 || strand == 3) {
+            convertGA(rec, 0);
+        }
+        fprintf(fifo, "%s%s+\n%s", rec->name1, rec->seq1, rec->qual1);
+    }
+
+    //Release the output file, the read buffers and finally the list itself
+    fclose(fifo);
+    free(rec->name1);
+    free(rec->seq1);
+    free(rec->qual1);
+    free(rec->name2);
+    free(rec->seq2);
+    free(rec->qual2);
+    free(rec);
+    destroy_list(first_writer);
+    return NULL;
+}
+
+/* Writer thread for read #2.
+ *
+ * Drains the second_writer list: each packed read is unpacked, mate 2 is
+ * converted as required by this node's strand and the record is written to
+ * the file named by fastq2. The thread stops once the list reports finished.
+ *
+ * void *a: a slurp_fastq_struct pointer (thread_id and fastq2 are used)
+ *
+ * Always returns NULL. */
+void * second_writer_func(void *a) {
+    slurp_fastq_struct *args = (slurp_fastq_struct *) a;
+    int thread_id = args->thread_id;
+    FILE *fifo = fopen(args->fastq2, "w");
+    fastq *rec = malloc(sizeof(fastq));
+    //Directional libraries cycle through 2 strands, non-directional through 4
+    int strand = (thread_id-1) % (config.directional ? 2 : 4);
+
+    //Every buffer of the fastq struct starts out with room for 10 chars
+    rec->max_name1 = 10;
+    rec->max_name2 = 10;
+    rec->max_seq1 = 10;
+    rec->max_seq2 = 10;
+    rec->max_qual1 = 10;
+    rec->max_qual2 = 10;
+    rec->name1 = malloc(sizeof(char) * 10);
+    rec->seq1 = malloc(sizeof(char) * 10);
+    rec->qual1 = malloc(sizeof(char) * 10);
+    rec->name2 = malloc(sizeof(char) * 10);
+    rec->seq2 = malloc(sizeof(char) * 10);
+    rec->qual2 = malloc(sizeof(char) * 10);
+
+    for(;;) {
+        //Busy-wait for the next element, sleeping slows things down too much
+        while(!is_ready(second_writer, 0)) {
+        }
+        if(is_finished(second_writer)) break;
+
+        //Unpack the element at the head of the list, then drop it from the list
+        rec = unpack_fastq(rec, second_writer->next->packed);
+        remove_raw_element(second_writer);
+
+        //Strands 0 and 1 get read #2 G->A converted, strands 2 and 3 C->T
+        if(strand == 0 || strand == 1) {
+            convertGA(rec, 1);
+        } else if(strand == 2 || strand == 3) {
+            convertCT(rec, 1);
+        }
+        fprintf(fifo, "%s%s+\n%s", rec->name2, rec->seq2, rec->qual2);
+    }
+
+    //Release the output file, the read buffers and finally the list itself
+    fclose(fifo);
+    free(rec->name1);
+    free(rec->seq1);
+    free(rec->qual1);
+    free(rec->name2);
+    free(rec->seq2);
+    free(rec->qual2);
+    free(rec);
+    destroy_list(second_writer);
+    return NULL;
+}
+
+/******************************************************************************
+*
+*   This receives the packed reads sent by rank 0 (tag 3) and queues a private
+*   copy of each one for the writer thread(s), which convert the reads and
+*   write them to the FIFO(s). A message of at most 1 byte ends the stream;
+*   the function then waits for the writer thread(s) before returning.
+*
+*   void *a: a pointer to a struct with the following components:
+*
+*   int thread_id: the thread_id
+*   char *fastq1: FIFO from which bowtie2 can get read1
+*   char *fastq2: FIFO from which bowtie2 can get read2 (if it exists)
+*
+*   The return value is always NULL.
+*
+*******************************************************************************/
+void * slurp_fastq(void *a) {
+    pthread_t threads[2];
+    void *p = NULL, *p2 = NULL, *p3 = NULL;
+    int size = 0, current_p_size = 0;
+    MPI_Status status;
+
+    //The sentinel is the node following the list head, so it needs no
+    //allocation of its own
+    first_writer = malloc(sizeof(struct packed_struct));
+    first_writer = initialize_list(first_writer);
+    first_writer_sentinel = first_writer->next;
+    pthread_create(&(threads[0]), NULL, &first_writer_func, a);
+    if(config.paired) {
+        //If we have pairs, then writing simultaneously to two fifos (that will be read sequentially by bowtie2) won't work, since bowtie2 will read from a single fifo multiple times!!!
+        second_writer = malloc(sizeof(struct packed_struct));
+        second_writer = initialize_list(second_writer);
+        second_writer_sentinel = second_writer->next;
+        pthread_create(&(threads[1]), NULL, &second_writer_func, a);
+    }
+
+    //Receive and process the raw reads
+    while(1) {
+        MPI_Probe(0, 3, MPI_COMM_WORLD, &status);
+        MPI_Get_count(&status, MPI_BYTE, &size);
+        //Only ever grow the receive buffer
+        if(size > current_p_size) {
+            p = realloc(p, (size_t) size);
+            current_p_size = size;
+        }
+        MPI_Recv(p, size, MPI_BYTE, 0, 3, MPI_COMM_WORLD, &status);
+        //Are we finished receiving?
+        if(size <= 1) break;
+
+        //Each list element gets its own copy, since p is reused for the next message
+        if(config.paired) {
+            p2 = malloc(size);
+            memcpy(p2,p,size);
+            add_element(second_writer_sentinel, p2);
+        }
+        p3 = malloc(size);
+        memcpy(p3,p,size);
+        add_element(first_writer_sentinel, p3);
+    }
+    add_finished(first_writer_sentinel);
+    if(config.paired) add_finished(second_writer_sentinel);
+
+    //Wait for the writer thread(s)
+    pthread_join(threads[0], NULL);
+    if(config.paired) {
+        pthread_join(threads[1], NULL);
+    }
+
+    //Free things up
+    free(p);
+
+    return NULL;
+}
+
+/******************************************************************************
+*
+*   The main worker node function.
+*
+*   Builds the bowtie2 command for the strand assigned to this node, starts it
+*   with sam_popen(), packs the SAM header (only thread_id 1 sends it to rank
+*   0: its size with tag 1, the packed header with tag 2) and then packs every
+*   alignment and sends it to rank 0 with tag 5. Alignments whose name equals
+*   that of the previously sent read are skipped (multimappers). A final
+*   1-byte message with tag 5 tells rank 0 that this node is finished.
+*
+*   When compiled with DEBUG every message is sent with MPI_Isend to rank 0
+*   and received again by this function, and the alignments are additionally
+*   written to <odir><basename>_<thread_id>.bam.
+*
+*   int thread_id: the thread_id
+*   char *fastq1: FIFO from which bowtie2 can get read1
+*   char *fastq2: FIFO from which bowtie2 can get read2 (if it exists)
+*
+*   Both FIFOs are unlink()ed before returning normally.
+*
+*******************************************************************************/
+void herd_worker_node(int thread_id, char *fastq1, char *fastq2) {
+    int cmd_length = 1, max_qname = 0, status, strand;
+    char *cmd, *last_qname = calloc(1, sizeof(char));
+    const char *orientation = NULL, *index_path = NULL;
+    MPI_Header *packed_header;
+    MPI_read *packed_read = calloc(1, sizeof(MPI_read));
+    bam_header_t *header;
+    bam1_t *read1 = bam_init1();
+    bam1_t *read2 = bam_init1();
+    tamFile fp;
+#ifdef DEBUG
+    MPI_Status stat;
+    int current_p_size = 100;
+    int oname_len = 0;
+    bamFile of;
+    bam_header_t *debug_header = bam_header_init();
+    bam1_t *debug_read = bam_init1();
+    global_header = bam_header_init();
+    void *p = calloc(100,1);
+    char *oname = NULL;
+#else
+    char done = 0; //Payload of the final "finished" message
+#endif
+    time_t t0, t1;
+
+    //Which strand should we be aligning to?
+    if(config.directional) {
+        strand = (thread_id-1) % 2;
+    } else {
+        strand = (thread_id-1) % 4;
+    }
+
+    packed_read->size = 0;
+    packed_read->packed = NULL;
+
+    //construct the bowtie2 command
+    cmd_length += (int) strlen("bowtie2 -q --reorder --no-mixed --no-discordant") + 1;
+    cmd_length += (int) strlen(config.bowtie2_options) + 1;
+    cmd_length += (int) strlen("--norc -x") + 1;
+    cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1;
+    cmd_length += (int) 2*(strlen("-1 ") + strlen(fastq1)) + 3;
+    if(config.paired) cmd_length += (int) strlen(fastq2); //This is likely unneeded.
+
+#ifdef DEBUG
+    //Size the name from the formatted length so that any thread_id fits
+    oname_len = snprintf(NULL, 0, "%s%s_%i.bam", config.odir, config.basename, thread_id);
+    oname = malloc(sizeof(char) * (oname_len + 1));
+    sprintf(oname, "%s%s_%i.bam", config.odir, config.basename, thread_id);
+    if(!config.quiet) printf("Writing output to %s\n", oname);
+    of = bam_open(oname, "w");
+    free(oname);
+#endif
+
+    cmd = (char *) malloc(sizeof(char) * cmd_length);
+    switch(strand) {
+        case 0 : //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand
+            orientation = "--norc";
+            index_path = "CT_conversion/BS_CT";
+            break;
+        case 1 : //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand
+            orientation = "--nofw";
+            index_path = "GA_conversion/BS_GA";
+            break;
+        case 2 : //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand
+            orientation = "--nofw";
+            index_path = "CT_conversion/BS_CT";
+            break;
+        case 3 : //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand
+            orientation = "--norc";
+            index_path = "GA_conversion/BS_GA";
+            break;
+        default :
+            printf("Oh shit, got strand %i!\n", strand);
+            //Nothing has been started yet, so just release what we allocated
+            free(cmd);
+            free(last_qname);
+            free(packed_read);
+            bam_destroy1(read1);
+            bam_destroy1(read2);
+#ifdef DEBUG
+            bam_close(of);
+            bam_header_destroy(debug_header);
+            bam_destroy1(debug_read);
+            free(p);
+#endif
+            return;
+    }
+    if(config.paired) {
+        snprintf(cmd, (size_t) cmd_length, "bowtie2 -q --reorder --no-mixed --no-discordant %s %s -x %sbisulfite_genome/%s -1 %s -2 %s", config.bowtie2_options, orientation, config.genome_dir, index_path, fastq1, fastq2);
+    } else {
+        snprintf(cmd, (size_t) cmd_length, "bowtie2 -q --reorder --no-mixed --no-discordant %s %s -x %sbisulfite_genome/%s -U %s", config.bowtie2_options, orientation, config.genome_dir, index_path, fastq1);
+    }
+
+    //Start the process
+    if(!config.quiet) printf("Node %i executing: %s\n", thread_id, cmd);
+    fflush(stdout);
+    fp = sam_popen(cmd);
+    header = sam_header_read(fp);
+#ifdef DEBUG
+    bam_header_write(of, header);
+#endif
+
+    packed_header = pack_header(header);
+#ifndef DEBUG
+    if(thread_id == 1) {
+        //Send the header
+        MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
+        status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD);
+        if(status != MPI_SUCCESS) {
+            printf("MPI_Send returned %i\n", status);
+            fflush(stdout);
+        }
+    }
+#else
+    //Send the packed header to ourselves and unpack what arrives
+    void *tmp_pointer = malloc(packed_header->size);
+    MPI_Request request;
+    MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &request);
+    status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &stat);
+    if(status != MPI_SUCCESS) printf("We seem to have not been able to send the message to ourselves!\n");
+    MPI_Wait(&request, &stat);
+    unpack_header(debug_header, tmp_pointer);
+    global_header = debug_header;
+    free(tmp_pointer);
+#endif
+
+    t0 = time(NULL);
+    if(!config.quiet) printf("Node %i began sending reads @%s", thread_id, ctime(&t0));
+    fflush(stdout);
+    while(sam_read1(fp, header, read1) > 1) {
+#ifdef DEBUG
+        bam_write1(of, read1);
+#endif
+        if(strcmp(bam1_qname(read1), last_qname) == 0) { //Multimapper
+            if(config.paired) {
+                //Consume the mate as well so that we stay in sync
+                sam_read1(fp, header, read2);
+#ifdef DEBUG
+                bam_write1(of, read2);
+#endif
+            }
+            continue;
+        } else {
+            //Remember this name, growing the buffer with some slack if needed
+            if(read1->core.l_qname > max_qname) {
+                max_qname = read1->core.l_qname + 10;
+                last_qname = realloc(last_qname, sizeof(char) * max_qname);
+            }
+            strcpy(last_qname, bam1_qname(read1));
+        }
+
+        //Send the read
+        packed_read = pack_read(read1, packed_read);
+#ifndef DEBUG
+        MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
+#else
+        if(packed_read->size > current_p_size) {
+            p = realloc(p, packed_read->size);
+            current_p_size = packed_read->size;
+        }
+        MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request);
+        //The incoming message is exactly as large as the read we just packed
+        status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat);
+        MPI_Wait(&request, &stat);
+#endif
+        //Deal with paired-end reads
+        if(config.paired) {
+            sam_read1(fp, header, read2);
+            packed_read = pack_read(read2, packed_read);
+#ifndef DEBUG
+            MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
+#else
+            bam_write1(of, read2);
+            if(packed_read->size > current_p_size) {
+                p = realloc(p, packed_read->size);
+                current_p_size = packed_read->size;
+            }
+            MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request);
+            status = MPI_Recv(p, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat);
+            MPI_Wait(&request, &stat);
+            debug_read = unpack_read(debug_read, p);
+#endif
+        }
+    }
+    t1 = time(NULL);
+    if(!config.quiet) printf("Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0));
+    fflush(stdout);
+
+    //Notify the master node
+    packed_read->size = 0;
+#ifndef DEBUG
+    MPI_Send((void *) &done, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
+#endif
+
+    //Close things up
+    bam_header_destroy(header);
+    bam_destroy1(read1);
+    bam_destroy1(read2);
+    free(cmd);
+    free(packed_read->packed);
+    free(packed_read);
+    free(packed_header->packed);
+    free(packed_header);
+    free(last_qname);
+    sam_pclose(fp);
+    //Remove the FIFO(s)
+    unlink(fastq1);
+    if(config.paired) unlink(fastq2);
+#ifdef DEBUG
+    bam_close(of);
+    bam_header_destroy(debug_header);
+    bam_destroy1(debug_read);
+    free(p);
+#endif
+    if(!config.quiet) printf("Exiting worker node %i\n", thread_id);
+    fflush(stdout);
+}
diff --git a/herd/writer.c b/herd/writer.c
new file mode 100644
index 0000000..0122ee0
--- /dev/null
+++ b/herd/writer.c
@@ -0,0 +1,204 @@
+#include "../bison.h"
+#include <sys/time.h>
+
+/******************************************************************************
+*
+*   Update the CpG/CHG/CHH metrics according to the methylation calls in a read
+*
+*   bam1_t *read: the alignment whose XM aux tag is evaluated. For each of
+*                 the first l_qseq characters of the tag: Z/z count towards
+*                 t_CpG, X/x towards t_CHG and H/h towards t_CHH; the upper
+*                 case letters additionally increment the matching m_ counter.
+*
+*   A read without a string-valued XM tag leaves the counters unchanged, and
+*   counting stops early if the tag is shorter than the read.
+*
+*******************************************************************************/
+void herd_update_counts(bam1_t *read) {
+    uint8_t *aux = bam_aux_get(read, "XM");
+    char *XM;
+    int i;
+
+    //Guard both lookups: a missing or non-string tag must not be dereferenced
+    if(aux == NULL) return;
+    XM = bam_aux2Z(aux);
+    if(XM == NULL) return;
+
+    for(i=0; i<read->core.l_qseq && *(XM+i) != '\0'; i++) {
+        switch(*(XM+i)) {
+            case 'Z' :
+                t_CpG++;
+                m_CpG++;
+                break;
+            case 'z' :
+                t_CpG++;
+                break;
+            case 'X' :
+                t_CHG++;
+                m_CHG++;
+                break;
+            case 'x' :
+                t_CHG++;
+                break;
+            case 'H' :
+                t_CHH++;
+                m_CHH++;
+                break;
+            case 'h' :
+                t_CHH++;
+                break;
+            default : //'.' and anything else is not a call
+                break;
+        }
+    }
+}
+
+/******************************************************************************
+*
+*   Set up the output for one input file (pair): derive config.basename and
+*   config.outname from fname1, open the gzip pipe(s) for unmapped reads if
+*   config.unmapped is set, open the output BAM file and write global_header
+*   to it.
+*
+*   char *fname1: name of the file holding read #1
+*   char *fname2: name of the file holding read #2, passed on to
+*                 create_fastq_names()
+*
+*   If a pipe or the BAM file cannot be opened, quit(2,-1) is called.
+*
+*******************************************************************************/
+void herd_setup(char *fname1, char *fname2) {
+    char *cmd = NULL;
+    free(config.basename);
+    config.basename = get_basename(fname1);
+    //+5 leaves room for ".bam" and the terminating NUL
+    config.outname = realloc(config.outname, sizeof(char)*(strlen(config.odir)+ strlen(config.basename)+5));
+    sprintf(config.outname, "%s%s.bam", config.odir, config.basename);
+    //Open the output file handles
+    if(config.unmapped) {
+        create_fastq_names(fname1, fname2);
+        //+8 leaves room for "gzip > " and the terminating NUL
+        cmd = malloc(sizeof(char) * (strlen(config.unmapped1) + 8));
+        if(!config.quiet) printf("Unmapped reads will be written to %s\n", config.unmapped1);
+        sprintf(cmd, "gzip > %s", config.unmapped1);
+        unmapped1 = popen(cmd, "w");
+        if(unmapped1 == NULL) {
+            printf("Could not open a pipe to '%s'!\n", cmd);
+            quit(2,-1);
+        }
+        if(config.paired) {
+            cmd = realloc(cmd, sizeof(char) * (strlen(config.unmapped2) + 8));
+            if(!config.quiet) printf("Unmapped reads will be written to %s\n", config.unmapped2);
+            sprintf(cmd, "gzip > %s", config.unmapped2);
+            unmapped2 = popen(cmd, "w");
+            if(unmapped2 == NULL) {
+                printf("Could not open a pipe to '%s'!\n", cmd);
+                quit(2,-1);
+            }
+        }
+        free(cmd);
+    }
+
+    //Open a file for output
+    OUTPUT_BAM = bam_open(config.outname, "w");
+    if(OUTPUT_BAM == NULL) {
+        printf("Could not open %s for writing!\n", config.outname);
+        quit(2,-1);
+    }
+    if(!config.quiet) printf("Alignments will be written to %s\n",config.outname);
+    if(config.n_compression_threads > 1) bgzf_mt(OUTPUT_BAM, config.n_compression_threads, 256);
+    bam_header_write(OUTPUT_BAM, global_header);
+    if(!config.quiet) printf("Alignment metrics will be printed to %s%s.txt\n",config.odir,config.basename);
+    fflush(stdout);
+}
+
+/******************************************************************************
+*
+*   This function will run as its own thread and process the linked lists
+*   output from the master processor threads, writing them in order to a BAM
+*   file. This will also write all of the other output: unmapped reads (if
+*   requested) and, whenever an input file has been completely written, its
+*   metrics via print_metrics(), after which the counters are reset and the
+*   output for the next input file is set up with herd_setup().
+*   Furthermore, this provides a readout of the current number of reads
+*   processed.
+*
+*   Output is NULL, as is the input (needed by pthreads).
+*
+*******************************************************************************/
+void * bam_writer(void *a) {
+    //times[i] is the number of consecutive reads (or pairs) taken from the
+    //list of master processor thread i before moving on to the next list
+    int i, j, *times = malloc(sizeof(int)*config.nmthreads);
+    int times_per_thread = effective_nodes();
+    int nfinished = 0;
+    int nlooped = 0, current_file = 0;
+    bam1_t *best_read1 = NULL;
+    bam1_t *best_read2 = NULL;
+    time_t now;
+    char ctime_buffer[26];
+
+    //If we write output in the exact same order as the input, we need to know
+    //how many times to write from each master_processor_thread before going to the next
+    if(config.directional){
+        times_per_thread /= 2;
+    } else {
+        times_per_thread /= 4;
+    }
+    for(i=0; i<config.nmthreads; i++) *(times+i) = times_per_thread/config.nmthreads;
+    //The last thread additionally takes the remainder of the division
+    *(times+config.nmthreads-1) += times_per_thread % config.nmthreads;
+
+    //Sleep until we've received the global header
+    while(global_header == NULL) sleep(1);
+    herd_setup(fnames1[current_file], fnames2[current_file]); //Setup the various names
+
+    //Keep cycling over the lists until all of them report finished in one round
+    while(nfinished < config.nmthreads) {
+        for(i=0; i<config.nmthreads; i++) {
+            for(j=0; j<*(times+i); j++) {
+                //Do we need to go to a new file?
+                if(flengths[current_file] > 0 && flengths[current_file] == t_reads) {
+                    //Everything from the current file has been written:
+                    //print its metrics and reset all of the counters
+                    print_metrics();
+                    t_reads = 0;
+                    m_reads_OT = 0;
+                    m_reads_OB = 0;
+                    m_reads_CTOT = 0;
+                    m_reads_CTOB = 0;
+                    t_CpG = 0;
+                    m_CpG = 0;
+                    t_CHG = 0;
+                    m_CHG = 0;
+                    t_CHH = 0;
+                    m_CHH = 0;
+                    //Are we finished?
+                    if(is_finished(to_write_node[i])) goto finished;
+                    if(unmapped1 != NULL) pclose(unmapped1);
+                    if(unmapped2 != NULL) pclose(unmapped2);
+                    bam_close(OUTPUT_BAM);
+                    current_file++;
+                    herd_setup(fnames1[current_file], fnames2[current_file]);
+                    //Restart the round with the first list for the new file
+                    i=0;
+                    j=0;
+                }
+                //A new round starts here, so recount the finished lists
+                if(i == 0 && j == 0) nfinished = 0;
+                //Just poll every second if we haven't yet written anything or if we've already looped a few times
+                if(!config.reorder) {
+                    if(!is_ready(to_write_node[i], 0)) {
+                        if(config.nmthreads == 1) {
+                            sleep(1); //This is the same as --reorder
+                            break;
+                        }
+                        if(t_reads == 0) sleep(1);
+                        if(++nlooped > 100) {
+                            if(nlooped > 1000) nlooped = 1000;
+                            sleep(1);
+                        }
+                        //Nothing available on this list, try the next one
+                        break;
+                    }
+                } else {
+                    //With config.reorder set we wait on this list until its next element is ready
+                    while(!is_ready(to_write_node[i], 0)) sleep(1);
+                }
+                if(is_finished(to_write_node[i])) {
+                    nfinished += 1;
+                    break;
+                }
+                //For paired-end data the two mates are consecutive list elements
+                best_read1 = to_write_node[i]->next->packed;
+                if(config.paired) best_read2 = to_write_node[i]->next->next->packed;
+                if(!config.paired) { //single-end
+                    if(!(best_read1->core.flag & BAM_FUNMAP)) {
+                        bam_write1(OUTPUT_BAM, best_read1);
+                        herd_update_counts(best_read1);
+                    } else {
+                        if(config.unmapped) write_unmapped(unmapped1, best_read1);
+                    }
+                } else {
+                    //A pair only goes to the BAM file if both mates are mapped
+                    if(!(best_read1->core.flag & BAM_FUNMAP) && !(best_read2->core.flag & BAM_FUNMAP)) {
+                        bam_write1(OUTPUT_BAM, best_read1);
+                        herd_update_counts(best_read1);
+                        bam_write1(OUTPUT_BAM, best_read2);
+                        herd_update_counts(best_read2);
+                    } else {
+                        if(config.unmapped) {
+                            write_unmapped(unmapped1, best_read1);
+                            write_unmapped(unmapped2, best_read2);
+                        }
+                    }
+                }
+
+                //Drop the element(s) that were just handled from the list
+                remove_element(to_write_node[i]);
+                if(config.paired) remove_element(to_write_node[i]);
+                nlooped = 0;
+                t_reads++;
+                nwritten[current_file]++; //Only keep track of this if we're throttling
+
+                //Give some status
+                if((t_reads % 100000) == 0) {
+                    now = time(NULL);
+                    if(!config.quiet) printf("%llu reads written @ %s", t_reads, ctime_r(&now, ctime_buffer)); fflush(stdout);
+                }
+            }
+        }
+    }
+
+//This isn't elegant, but...
+finished:
+    if(t_reads != 0) print_metrics(); //There seems to be a race condition for the last sample in a list. This gets around that.
+    now = time(NULL);
+    if(!config.quiet) printf("Finished writing output @%s", ctime_r(&now, ctime_buffer)); fflush(stdout);
+
+    free(times);
+    for(i=0; i<config.nmthreads; i++) destroy_list(to_write_node[i]);
+
+    return NULL;
+}
diff --git a/index.c b/index.c
new file mode 100644
index 0000000..b49e729
--- /dev/null
+++ b/index.c
@@ -0,0 +1,159 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#define MAXLINE 1024
+
+//Arguments handed to one bt2_build() thread
+typedef struct {
+    char *options; //Options passed through to bowtie2-build
+    char *odir; //Directory holding genome.fa; the index is written there too
+    char conversion; //'C' builds the BS_CT index, anything else BS_GA
+} btoptions_struct;
+
+/* Print the help text of the indexer to stdout.
+ *
+ * char *prog: the program name shown in the usage line */
+void usage(char *prog) {
+    printf("Usage: %s [options] reference(s)\n", prog);
+    //Every line of the description after the first, empty one starts with a single space
+    fputs("\n "
+          "Note, references is a comma separated list of FASTA files. A \"bisulfite_genome\"\n "
+          "directory with CT_conversion and GA_conversion subdirectories will be created.\n "
+          "While the directory structure and indexing method are identical to bismark, the\n "
+          "resulting indexes are not compatible, owing to bismark's changing of\n "
+          "chromosome/contig names.\n "
+          "\n "
+          "Options are currently identical to those for bowtie2-build\n "
+          "(http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer),\n "
+          "as this program is effectively just a wrapper.\n", stdout);
+}
+
+/* Thread function that runs bowtie2-build on one converted genome.
+ *
+ * void *a: a btoptions_struct pointer. <odir>/genome.fa is indexed into
+ *          <odir>/BS_CT when conversion is 'C' and into <odir>/BS_GA
+ *          otherwise; options is inserted verbatim into the command line.
+ *
+ * A non-zero status from system() is reported on stdout. Always returns NULL. */
+void * bt2_build(void *a) {
+    btoptions_struct *options = (btoptions_struct *) a;
+    const char *index_name = (options->conversion == 'C') ? "BS_CT" : "BS_GA";
+    //200 extra bytes are ample for the fixed parts of the command
+    size_t cmd_size = strlen(options->options) + 2*strlen(options->odir) + 200;
+    char *cmd = malloc(sizeof(char) * cmd_size);
+    int rv;
+
+    if(cmd == NULL) {
+        printf("Could not allocate memory for the bowtie2-build command!\n");
+        return NULL;
+    }
+
+    //Create the command
+    snprintf(cmd, cmd_size, "bowtie2-build %s %s/genome.fa %s/%s", options->options, options->odir, options->odir, index_name);
+    printf("Now executing: %s\n", cmd);
+    rv = system(cmd);
+    if(rv) printf("%s returned with status %i!\n", cmd, rv);
+    free(cmd);
+    return NULL;
+}
+
+/* Entry point of the indexer.
+ *
+ * The last argument is a comma separated list of FASTA files, everything
+ * before it is passed through to bowtie2-build. A bisulfite_genome directory
+ * with CT_conversion and GA_conversion subdirectories is created next to the
+ * first FASTA file (or in the current directory if its name has no '/').
+ * Each subdirectory receives a genome.fa holding the upper-cased sequences
+ * with C->T or G->A substituted; header lines are copied unchanged. Both are
+ * then indexed by bowtie2-build in two parallel threads, after which the two
+ * genome.fa files are removed again.
+ *
+ * Returns 0 on success (and after printing the usage), 1 if a file could not
+ * be opened. */
+int main(int argc, char *argv[]) {
+    char *odir, *p, *CT_dir, *GA_dir;
+    char *CT_line, *GA_line;
+    char *options;
+    FILE *fp, *CT, *GA;
+    btoptions_struct CT_data, GA_data;
+    pthread_t threads[2];
+    int i, at_line_start, in_header;
+    size_t len, j;
+
+    if(argc == 1) {
+        usage(argv[0]);
+        return 0;
+    } else if(strcmp(argv[1], "-h") == 0) {
+        usage(argv[0]);
+        return 0;
+    }
+
+    //Store bowtie2-build options, each one preceded by a space
+    options = (char *) calloc(1, sizeof(char));
+    for(i=1; i<argc-1; i++) {
+        options = realloc(options, sizeof(char) * (strlen(options) + strlen(argv[i]) + 2));
+        //Append instead of sprintf()ing options into itself, which is undefined
+        strcat(options, " ");
+        strcat(options, argv[i]);
+    }
+
+    //Create the basename for dir: the path of the first reference up to and
+    //including its last '/', or an empty string if there is none
+    odir = strdup(argv[argc-1]);
+    p = strchr(odir, ',');
+    if(p != NULL) *p = '\0';
+    p = strrchr(odir, '/');
+    if(p != NULL) {
+        p++;
+        *p = '\0';
+    } else {
+        *odir = '\0';
+    }
+
+    //Make the output directories
+    odir = realloc(odir, sizeof(char) * (strlen(odir) + 1 + strlen("bisulfite_genome")));
+    odir = strcat(odir, "bisulfite_genome");
+    mkdir(odir, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+    printf("Output will be placed under %s\n", odir);
+    //Both buffers are sized so that "/genome.fa" can be appended later
+    CT_dir = malloc(sizeof(char) * (strlen(odir) + strlen("/CT_conversion/genome.fa") +1));
+    sprintf(CT_dir, "%s/CT_conversion", odir);
+    mkdir(CT_dir, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+    GA_dir = malloc(sizeof(char) * (strlen(odir) + strlen("/CT_conversion/genome.fa") +1));
+    sprintf(GA_dir, "%s/GA_conversion", odir);
+    mkdir(GA_dir, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+
+    //Iterate through the files, converting them
+    CT_dir = strcat(CT_dir, "/genome.fa");
+    GA_dir = strcat(GA_dir, "/genome.fa");
+    CT = fopen(CT_dir, "w");
+    GA = fopen(GA_dir, "w");
+    if(CT == NULL || GA == NULL) {
+        printf("Could not open %s and/or %s for writing!\n", CT_dir, GA_dir);
+        return 1;
+    }
+    CT_line = malloc(sizeof(char)*MAXLINE);
+    GA_line = malloc(sizeof(char)*MAXLINE);
+    p = strtok(argv[argc-1], ",");
+    while(p != NULL) {
+        printf("Reading in and converting %s...\n", p);
+        fp = fopen(p, "r");
+        if(fp == NULL) {
+            printf("Could not open %s for reading!\n", p);
+            return 1;
+        }
+        at_line_start = 1;
+        in_header = 0;
+        while(fgets(CT_line, MAXLINE, fp)) {
+            len = strlen(CT_line);
+            //fgets() may return a long line in several pieces; only the first
+            //piece decides whether the whole line is a header
+            if(at_line_start) in_header = (*CT_line == '>');
+            if(!in_header) {
+                for(j=0; j<len; j++) *(CT_line+j) = toupper((unsigned char) *(CT_line+j));
+                strcpy(GA_line, CT_line);
+                for(j=0; j<len; j++) {
+                    if(*(CT_line+j) == 'C') *(CT_line+j) = 'T';
+                    if(*(GA_line+j) == 'G') *(GA_line+j) = 'A';
+                }
+            } else {
+                strcpy(GA_line, CT_line);
+            }
+            fputs(CT_line, CT);
+            fputs(GA_line, GA);
+            at_line_start = (len > 0 && *(CT_line+len-1) == '\n');
+        }
+        fclose(fp);
+        p = strtok(NULL, ",");
+    }
+    fclose(CT);
+    fclose(GA);
+
+    //Invoke bowtie2-build in 2 threads; cut "/genome.fa" off again so that
+    //each path names the conversion directory
+    p = strrchr(CT_dir, '/');
+    *p = '\0';
+    p = strrchr(GA_dir, '/');
+    *p = '\0';
+    CT_data.odir = CT_dir;
+    CT_data.options = options;
+    CT_data.conversion = 'C';
+    GA_data.odir = GA_dir;
+    GA_data.options = options;
+    GA_data.conversion = 'G';
+    pthread_create(&threads[0], NULL, &bt2_build, (void *) &CT_data);
+    pthread_create(&threads[1], NULL, &bt2_build, (void *) &GA_data);
+    pthread_join(threads[0], NULL);
+    pthread_join(threads[1], NULL);
+
+    //We don't actually need these anymore. Overwriting the terminator with
+    //'/' restores the ".../genome.fa" paths, whose old NUL is still in place
+    *(CT_dir+strlen(CT_dir)) = '/';
+    *(GA_dir+strlen(GA_dir)) = '/';
+    printf("Removing %s\n", CT_dir);
+    printf("Removing %s\n", GA_dir);
+    unlink(CT_dir);
+    unlink(GA_dir);
+
+    //Cleaning up
+    free(CT_line);
+    free(GA_line);
+    free(options);
+    free(odir);
+    free(CT_dir);
+    free(GA_dir);
+    return 0;
+}
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..1a877d0
--- /dev/null
+++ b/main.c
@@ -0,0 +1,380 @@
+#include "bison.h"
+
+/******************************************************************************
+*
+*   Print the command-line help for bison to stdout.
+*
+*   char *prog: The program name to show in the "Usage:" line
+*
+*   The description of -taskid is only printed when compiled with DEBUG.
+*
+*******************************************************************************/
+void usage(char *prog) {
+    printf("Usage: %s [OPTIONS] -g genome_dir {-1 fastq.gz -2 fastq.gz | -U fastq.gz}\n", prog);
+    printf("\n \
+    N.B., Bison has a number of defaults that are different from that of bowtie2.\n \
+    Namely, --no-mixed and --no-discordant are always used. All other options\n \
+    can be changed with the normal bowtie2 options, which change bison's\n \
+    behavior as well. MAPQ scores are recalculated by bison in the same way as\n \
+    they are in bowtie2 (or at least they should be). Any option not listed\n \
+    below will be passed directly to bowtie2, so you can specify, e.g., --very-fast\n \
+    if you want. If you specify --local, --score-min is changed back to the\n \
+    bowtie2 default of 'G,20,8', unless you specify otherwise.\n \
+\n \
+-g          Directory containing the genome fasta files and the\n \
+            Bisulfite_Sequences directory.\n \
+\n \
+-1          Fastq file containing read #1 (normally named something like \n \
+            foo_1.fastq.gz). Reads needn't be gzipped, but that'll be more\n \
+            convenient.\n \
+\n \
+-2          As with -1, but with read #2.\n \
+\n \
+-U          For convenience, this denotes a fastq file from single-ended reads.\n \
+            Alternatively, -1 can be used without using -2.\n \
+\n \
+-p          How many threads bowtie2 should use on each node. Default is 12.\n \
+\n \
+-o          Output directory. By default, everything will be written to the\n \
+            directory holding the fastq files (or the file containing read #1,\n \
+            as appropriate). If you would prefer for the output BAM file and\n \
+            metrics txt file to be placed elsewhere, specify that here.\n \
+\n \
+            N.B., the directory must exist! \n \
+\n \
+--directional Denotes that the library was created in a directional, rather\n \
+            than non-directional manner. This will result in 3, rather than 5\n \
+            nodes being used as only alignments to 2 (rather than 4) strands are\n \
+            possible.\n \
+\n \
+-upto       The maximum number of reads to process. This is mostly useful for\n \
+            debugging and more quickly determining if a library is directional or\n \
+            not. 0 is the default, meaning all reads are used. N.B., the\n \
+            maximum value for this parameter is whatever an unsigned long is on\n \
+            your system.\n \
+\n \
+--unmapped  Save unaligned reads to a file or files (as appropriate).\n \
+\n \
+--quiet     Suppress printing of anything other than errors to the console.\n \
+\n \
+-h          Print this help message.\n \
+\n \
+-v          Print version information.\n \
+\n");
+#ifdef DEBUG
+    printf("\n \
+-taskid     Which node number to act as. The default is 0, the master node.\n \
+            Other possibilities are 1-4, which are the worker nodes that\n \
+            process OT, OB, CTOT, and CTOB alignments, respectively. A value of\n \
+            -1 will only convert the reads.\n \
+\n \
+            Note that if you plan to run with taskid=0 (i.e., as the master\n \
+            node), files named OT.bam, OB.bam, etc. should exist in your\n \
+            working directory. These will be created automatically if you run\n \
+            each pseudo-worker node first, which is recommended.\n\n");
+#endif
+}
+
+/******************************************************************************
+*
+*   Entry point. Initializes MPI, parses the command line into the global
+*   config structure and then either runs the master pipeline (read
+*   conversion, opening the inputs, read_genome(), BAM output and metrics) or
+*   hands control to worker_node(), depending on the task ID.
+*
+*   Any option that is not recognized here is appended to
+*   config.bowtie2_options, followed by "-p" and "--score-min".
+*
+*   Returns 0 on success and -1 on a setup failure. Most error paths leave
+*   through quit(), which is defined elsewhere.
+*
+*******************************************************************************/
+int main(int argc, char *argv[]) {
+    int i, j, taskid=0, provided;
+    pthread_t *threads;
+    int bowtie2_options_max = MAXREAD;
+    //These are bowtie2 options, but they'll just be pushed into the config.bowtie2_options character array
+    char *cmd, *p;
+    unsigned long upto = 0;
+    //Options that consume the argument following them
+    static const char *value_options[] = {"-1", "-2", "-U", "-g", "-p", "-o", "-upto", "--score-min",
+#ifdef DEBUG
+                                          "-taskid",
+#endif
+                                          NULL};
+#ifndef DEBUG
+    int mpi_ntasks, ntasks;
+    int name_len;
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+#endif
+
+    //Deal with MPI initialization, this seems like an odd way to do things.
+#ifndef DEBUG
+    MPI_Init_thread(NULL, NULL, MPI_THREAD_FUNNELED, &provided);
+    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
+    MPI_Get_processor_name(processor_name, &name_len);
+#else
+    MPI_Init_thread(NULL, NULL, MPI_THREAD_FUNNELED, &provided);
+#endif
+    //An implementation may grant a higher level than requested, which is also fine
+    if(provided < MPI_THREAD_FUNNELED) {
+        printf("Your implementation does not support MPI_THREAD_FUNNELED, which is required for bison to run. This is actually quite unusual!\n");
+        return -1;
+    }
+
+    config.odir = NULL;
+    config.paired = 0; //Default is single-ended
+    config.directional = 0; //Default is non-directional
+    config.nthreads = 12; //Default is 12 threads/node
+    config.bowtie2_options = calloc(MAXREAD, sizeof(char));
+    if(config.bowtie2_options == NULL) {
+        printf("Could not allocate space for the bowtie2 options!\n");
+        return -1;
+    }
+    config.unmapped = 0; //By default, unmapped reads are NOT written to a fastq file
+    config.scoremin_type = 'L'; //--score-min 'L,-0.6,-0.6'
+    config.scoremin_intercept = -0.6;
+    config.scoremin_coef = -0.6;
+    config.mode = 0; //--end-to-end
+    config.quiet = 0;
+    config.FASTQ1 = NULL;
+    config.FASTQ2 = NULL;
+    chromosomes.nchromosomes = 0; //We need to initialize the struct
+
+    //Initialize the global counts
+    t_reads = 0;
+    m_reads_OT = 0;
+    m_reads_OB = 0;
+    m_reads_CTOT = 0;
+    m_reads_CTOB = 0;
+    t_CpG = 0;
+    m_CpG = 0;
+    t_CHG = 0;
+    m_CHG = 0;
+    t_CHH = 0;
+    m_CHH = 0;
+
+    if(argc == 1) {
+        usage(argv[0]);
+        quit(0, 0);
+    }
+
+    for(i=1; i<argc; i++) {
+        //Refuse an option that needs a value but is the last thing on the command line
+        for(j=0; value_options[j] != NULL; j++) {
+            if(i+1 >= argc && strcmp(argv[i], value_options[j]) == 0) {
+                if(taskid == MASTER) {
+                    printf("%s requires an argument!\n", argv[i]);
+                    usage(argv[0]);
+                }
+                quit(0, -1);
+                return -1;
+            }
+        }
+
+        if(strcmp(argv[i], "-h") == 0) {
+            usage(argv[0]);
+            quit(0, 0);
+        } else if(strcmp(argv[i], "-v") == 0) {
+            version();
+            quit(0,0);
+        } else if(strcmp(argv[i], "-1") == 0) {
+            i++;
+            config.FASTQ1 = argv[i];
+        } else if(strcmp(argv[i], "-2") == 0) {
+            i++;
+            config.FASTQ2 = argv[i];
+            config.paired = 1;
+        } else if(strcmp(argv[i], "-U") == 0) {
+            i++;
+            config.FASTQ1 = argv[i];
+        } else if(strcmp(argv[i], "-g") == 0) {
+            i++;
+            config.genome_dir = argv[i];
+        } else if(strcmp(argv[i], "-p") == 0) {
+            i++;
+            config.nthreads = atoi(argv[i]);
+        } else if(strcmp(argv[i], "-o") == 0) {
+            i++;
+            config.odir = strdup(argv[i]);
+        } else if(strcmp(argv[i], "-upto") == 0) {
+            i++;
+            upto = strtoul(argv[i], NULL, 10);
+        } else if(strcmp(argv[i], "--directional") == 0) {
+            config.directional = 1;
+        } else if(strcmp(argv[i], "--unmapped") == 0) {
+            config.unmapped = 1;
+#ifdef DEBUG
+        } else if(strcmp(argv[i], "-taskid") == 0) {
+            i++;
+            global_debug_taskid = atoi(argv[i]);
+            taskid = global_debug_taskid;
+#endif
+        } else if(strcmp(argv[i], "--score-min") == 0) {
+            i++;
+            if(!config.quiet) printf("Changing --score-min from 'L,-0.6,-0.6' to %s!\n", argv[i]);
+            //All three comma-separated fields must be present, strtok() returns NULL otherwise
+            p = strtok(argv[i], ",");
+            if(p != NULL) {
+                config.scoremin_type = p[0];
+                p = strtok(NULL, ",");
+            }
+            if(p != NULL) {
+                config.scoremin_intercept = (float) atof(p);
+                p = strtok(NULL, ",");
+            }
+            if(p != NULL) {
+                config.scoremin_coef = (float) atof(p);
+            } else {
+                if(taskid == MASTER) printf("--score-min must have the form 'type,intercept,coefficient'!\n");
+                quit(0, -1);
+                return -1;
+            }
+        } else {
+            if(strcmp(argv[i], "--local") == 0 || strcmp(argv[i], "--very-fast-local") == 0 || strcmp(argv[i], "--fast-local") == 0 || strcmp(argv[i], "--sensitive-local") == 0 || strcmp(argv[i], "--very-sensitive-local") == 0) {
+                config.mode = 1;
+                if(config.scoremin_type == 'L' && config.scoremin_intercept == -0.6f && config.scoremin_coef == -0.6f) {
+                    config.scoremin_type = 'G';
+                    config.scoremin_intercept = 20.0;
+                    config.scoremin_coef = 8.0;
+                    if(!config.quiet) printf("Since --local was specified and --score-min was not already changed, changing --score-min to the bowtie2 default of 'G,20,8' (specify --score-min to change this)\n");
+                }
+            }
+            if(strcmp(argv[i], "--quiet") == 0) config.quiet = 1;
+            //bowtie2 option
+            if(strlen(config.bowtie2_options) + 1 + strlen(argv[i]) >= bowtie2_options_max) {
+                bowtie2_options_max = strlen(config.bowtie2_options) + 1 + strlen(argv[i]) + 100;
+                p = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max);
+                if(p == NULL) {
+                    printf("Could not allocate space for the bowtie2 options!\n");
+                    quit(0, -1);
+                    return -1;
+                }
+                config.bowtie2_options = p;
+            }
+            strcat(config.bowtie2_options, " ");
+            strcat(config.bowtie2_options, argv[i]);
+        }
+    }
+
+#ifndef DEBUG
+    if(!config.quiet) {printf("%s has rank %i\n", processor_name, taskid); fflush(stdout);}
+#endif
+
+    if(config.FASTQ1 == NULL || config.genome_dir == NULL || (config.FASTQ2 == NULL && config.paired == 1)) {
+        if(taskid == MASTER) {
+            printf("No FASTQ files!\n");
+            usage(argv[0]);
+        }
+        quit(0, -1);
+    }
+
+    //Allocate room for the genome, if needed
+    if(taskid == MASTER) {
+        chromosomes.max_genome = 3000000000;
+        if(!config.quiet) printf("Allocating space for %llu characters\n", chromosomes.max_genome);
+        fflush(stdout);
+        chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome);
+        //Check the allocation before touching it
+        if(chromosomes.genome == NULL) {
+            printf("Could not allocate enough room to hold the genome!\n");
+            return -1;
+        }
+        *chromosomes.genome = '\0';
+    } else {
+        chromosomes.max_genome = 0;
+    }
+
+    //Append score_min, and p
+    if(strlen(config.bowtie2_options) + 1000 >= bowtie2_options_max) {
+        bowtie2_options_max = strlen(config.bowtie2_options) + 1000; //This should suffice
+        p = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max);
+        if(p == NULL) {
+            printf("Could not allocate space for the bowtie2 options!\n");
+            quit(0, -1);
+            return -1;
+        }
+        config.bowtie2_options = p;
+    }
+    //Write at the end of the existing string; using the buffer as both the destination and a "%s" source is undefined
+    sprintf(config.bowtie2_options + strlen(config.bowtie2_options), " -p %i --score-min '%c,%g,%g'", config.nthreads, config.scoremin_type, config.scoremin_intercept, config.scoremin_coef);
+
+    //There should be as many tasks according to MPI as dictated by the library type.
+#ifndef DEBUG
+    ntasks = 5;
+    if(config.directional) ntasks = 3;
+    MPI_Comm_size(MPI_COMM_WORLD, &mpi_ntasks);
+    if(mpi_ntasks < ntasks) {
+        if(taskid == MASTER) printf("There are only %i nodes available but we need %i to work. You need to allocate more nodes!\n", mpi_ntasks, ntasks);
+        quit(0, -1);
+    }
+#endif
+    /***********************************************************************************************
+    *
+    * Convert the input reads C->T and G->A as needed. There are 4 use cases:
+    *   Directional:
+    *       Paired-end: FASTQ1 will be C->T converted and FASTQ2 will be G->A converted.
+    *       Single-end: FASTQ1 will be C->T converted
+    *   Non-directional:
+    *       Paired-end: Both FASTQ1 and FASTQ2 will be C->T and G->A converted.
+    *       Single-end: FASTQ1 will be both C->T and G->A converted.
+    *
+    *   convert_fastq() takes a single integer parameter:
+    *       8 = convert FASTQ1 C->T
+    *       4 = convert FASTQ1 G->A
+    *       2 = convert FASTQ2 C->T
+    *       1 = convert FASTQ2 G->A
+    *
+    ***********************************************************************************************/
+    update_odir();
+    create_fastq_names(config.FASTQ1, config.FASTQ2);
+
+#ifndef DEBUG
+    if(taskid == MASTER) {
+#endif
+        //MASTER specific procedures
+        config.basename = get_basename(config.FASTQ1);
+        config.outname = malloc(sizeof(char)*(strlen(config.odir)+ strlen(config.basename)+5));
+        sprintf(config.outname, "%s%s.bam", config.odir, config.basename);
+#ifdef DEBUG
+        //When debugging, don't convert the files if it's already been done
+        if(access(config.FASTQ1CT, F_OK) == -1) {
+#endif
+        if(config.directional) {
+            if(config.paired) {
+                convert_fastq(9, upto);
+            } else {
+                convert_fastq(8, upto);
+            }
+        } else {
+            if(config.paired) {
+                convert_fastq(15, upto);
+            } else {
+                convert_fastq(12, upto);
+            }
+        }
+#ifdef DEBUG
+    }
+    //Just convert the reads
+    if(taskid == -1)  {
+        quit(3, 0);
+        return 0;
+    }
+#endif
+
+#ifdef DEBUG
+    if(taskid == MASTER) {
+#endif
+        //Open the input reads
+        //NOTE(review): the file names are pasted into a shell command for popen(), so names containing shell metacharacters are not safe
+        cmd = malloc(sizeof(char) * (strlen(config.FASTQ1) + 7));
+        //A name without any '.' has no extension and is simply read with cat
+        p = strrchr(config.FASTQ1, '.');
+        if(p != NULL && (strcmp(p,".gz") == 0 || strcmp(p,".GZ") == 0)) {
+            sprintf(cmd, "zcat %s", config.FASTQ1);
+        } else if(p != NULL && (strcmp(p,".bz") == 0 || strcmp(p,".bz2") == 0)) {
+            sprintf(cmd, "bzcat %s", config.FASTQ1);
+        } else {
+            sprintf(cmd, "cat %s", config.FASTQ1);
+        }
+        zip1 = popen(cmd, "r");
+        if(config.paired) {
+            cmd = realloc(cmd, sizeof(char) * (strlen(config.FASTQ2) + 7));
+            p = strrchr(config.FASTQ2, '.');
+            if(p != NULL && (strcmp(p,".gz") == 0 || strcmp(p,".GZ") == 0)) {
+                sprintf(cmd, "zcat %s", config.FASTQ2);
+            } else if(p != NULL && (strcmp(p,".bz") == 0 || strcmp(p,".bz2") == 0)) {
+                sprintf(cmd, "bzcat %s", config.FASTQ2);
+            } else {
+                sprintf(cmd, "cat %s", config.FASTQ2);
+            }
+            zip2 = popen(cmd, "r");
+        }
+
+        //Open the output file handles
+        if(config.unmapped) {
+            cmd = realloc(cmd, sizeof(char) * (strlen(config.unmapped1) + 8));
+            if(!config.quiet) printf("Writing unmapped reads to %s\n", config.unmapped1);
+            sprintf(cmd, "gzip > %s", config.unmapped1);
+            unmapped1 = popen(cmd, "w");
+            if(config.paired) {
+                cmd = realloc(cmd, sizeof(char) * (strlen(config.unmapped2) + 8));
+                if(!config.quiet) printf("Writing unmapped reads to %s\n", config.unmapped2);
+                sprintf(cmd, "gzip > %s", config.unmapped2);
+                unmapped2 = popen(cmd, "w");
+            }
+        }
+        free(cmd);
+
+        //Store the genome into memory
+        read_genome();
+
+        //Open a file for output
+        OUTPUT_BAM = bam_open(config.outname, "w");
+        if(OUTPUT_BAM == NULL) {
+            printf("Could not open %s for writing!\n", config.outname);
+            quit(2,-1);
+        }
+        if(!config.quiet) printf("Alignment metrics will be printed to %s%s.txt\n",config.odir,config.basename);
+        fflush(stdout);
+
+        //Setup the linked-lists
+        node1 = initialize_list(node1);
+        node1_last_sentinel = node1->next;
+        node2 = initialize_list(node2);
+        node2_last_sentinel = node2->next;
+        node3 = initialize_list(node3);
+        node3_last_sentinel = node3->next;
+        node4 = initialize_list(node4);
+        node4_last_sentinel = node4->next;
+
+        //Start the master node processer threads
+        threads = calloc(1, sizeof(pthread_t));
+        pthread_create(&(threads[0]), NULL, &master_processer_thread, NULL);
+        slurp(NULL);
+        pthread_join(threads[0], NULL);
+
+        //Start freeing things up
+        if(!config.quiet) printf("Closing input files\n");
+        free(threads);
+        pclose(zip1);
+        if(config.paired) pclose(zip2);
+
+        //Print some metrics
+        print_metrics();
+    } else {
+        //worker node stuff, wait for the master
+        worker_node(taskid);
+        if(!config.quiet) printf("Returning from worker node %i\n", taskid);
+        fflush(stdout);
+    }
+
+    //Clean up
+    if(config.odir != NULL) free(config.odir);
+    quit(3, 0);
+    return 0;
+}
diff --git a/markduplicates.c b/markduplicates.c
new file mode 100644
index 0000000..fc2c20c
--- /dev/null
+++ b/markduplicates.c
@@ -0,0 +1,286 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <bam.h>
+
+//Index of the 32-bit word holding bit b, and the position of bit b inside that word.
+//Both the argument and the expansion are parenthesized so expressions such as WORD_OFFSET(i+1) work.
+#define WORD_OFFSET(b) ((b)/32)
+#define BIT_OFFSET(b) ((b)%32)
+
+//One read or read pair, reduced to the fields that are compared when looking for duplicates
+typedef struct {
+    //tid: chromosome index ID; start1/stop1: 5' and 3' position of read #1;
+    //start2/stop2: the same for read #2 (main() stores 0 in both for unpaired reads)
+    int32_t tid, start1, start2, stop1, stop2;
+    //strand: 0 OT, 1 OB, 2 CTOT, 3 CTOB (see get_strand()); MAPQ: taken from read #1
+    int strand, MAPQ;
+    //Position of the read/pair in input order, used as the bit index into the duplicate bitmap
+    unsigned read_number;
+} alignment;
+
+//Argument bundle that qsort_func() receives through its void pointer
+typedef struct {
+    uint64_t nelements; //Total number of entries in alignments
+    int threadid; //Index of the thread, used to pick the part of the array it sorts
+    alignment *alignments; //Start of the complete alignment array
+} qsort_func_struct;
+
+/*
+    qsort() comparison function for alignment entries. Keys are compared in
+    this order and the first one that differs decides the result:
+    (1) tid (chromosome index ID), low to high
+    (2) start1 (5' position of read #1), low to high
+    (3) stop1 (3' position of read #1), low to high
+    (4) start2 (5' position of read #2), low to high
+    (5) stop2 (3' position of read #2), low to high
+    (6) strand (0 OT, 1 OB, 2 CTOT, 3 CTOB), low to high
+    (7) MAPQ, high to low
+
+    Returns -1 if a sorts before b, 1 if it sorts after and 0 if all keys match.
+*/
+int comp_func(const void *a, const void *b) {
+    alignment *lhs = (alignment*) a;
+    alignment *rhs = (alignment*) b;
+
+    if(lhs->tid != rhs->tid) return (lhs->tid < rhs->tid) ? -1 : 1;
+    if(lhs->start1 != rhs->start1) return (lhs->start1 < rhs->start1) ? -1 : 1;
+    if(lhs->stop1 != rhs->stop1) return (lhs->stop1 < rhs->stop1) ? -1 : 1;
+    if(lhs->start2 != rhs->start2) return (lhs->start2 < rhs->start2) ? -1 : 1;
+    if(lhs->stop2 != rhs->stop2) return (lhs->stop2 < rhs->stop2) ? -1 : 1;
+    if(lhs->strand != rhs->strand) return (lhs->strand < rhs->strand) ? -1 : 1;
+
+    //MAPQ is the other way around, so the best alignment comes first
+    if(lhs->MAPQ != rhs->MAPQ) return (lhs->MAPQ > rhs->MAPQ) ? -1 : 1;
+    return 0;
+}
+
+//Compare two alignment entries on tid, start1, stop1, start2, stop2 and strand
+//(in that order, low to high). This matches comp_func() with MAPQ left out, so
+//0 means the two entries share coordinates and strand.
+int comp_func2(const void *a, const void *b) {
+    alignment *lhs = (alignment*) a;
+    alignment *rhs = (alignment*) b;
+
+    if(lhs->tid != rhs->tid) return (lhs->tid < rhs->tid) ? -1 : 1;
+    if(lhs->start1 != rhs->start1) return (lhs->start1 < rhs->start1) ? -1 : 1;
+    if(lhs->stop1 != rhs->stop1) return (lhs->stop1 < rhs->stop1) ? -1 : 1;
+    if(lhs->start2 != rhs->start2) return (lhs->start2 < rhs->start2) ? -1 : 1;
+    if(lhs->stop2 != rhs->stop2) return (lhs->stop2 < rhs->stop2) ? -1 : 1;
+    if(lhs->strand != rhs->strand) return (lhs->strand < rhs->strand) ? -1 : 1;
+    return 0;
+}
+
+/*
+    Thread entry point that sorts one contiguous slice of the alignment array
+    with comp_func().
+
+    void *a: pointer to a qsort_func_struct holding the array, its total
+             length and the index of this thread
+
+    The array is split into equally sized, non-overlapping slices, one per
+    thread, and the last thread also takes the remainder. Always returns NULL.
+
+    NOTE(review): the number of threads (6) is inferred from the original
+    special case for thread 5 being the last one - confirm against the caller.
+*/
+void *qsort_func(void *a) {
+    const int nthreads = 6;
+    qsort_func_struct *args = (qsort_func_struct *) a;
+    uint64_t total_pairs = args->nelements;
+    int thread_id = args->threadid;
+    uint64_t nelements = total_pairs/nthreads; //Same slice size for every thread
+    uint64_t offset = thread_id*nelements;
+    void *p = (void *) (args->alignments+offset);
+
+    //The last slice absorbs whatever does not divide evenly
+    if(thread_id == nthreads-1) nelements += total_pairs % nthreads;
+    qsort(p, (size_t) nelements, sizeof(alignment), &comp_func);
+
+    return NULL;
+}
+
+//Set bit number n in a bitmap stored as an array of 32-bit words.
+//The shifted 1 is an unsigned 32-bit value, so setting bit 31 of a word is well defined.
+void set_bit(uint32_t *map, uint64_t n) {
+    map[n / 32] |= (UINT32_C(1) << (n % 32));
+}
+
+//Return 1 if bit number n is set in a bitmap stored as an array of 32-bit words, otherwise 0.
+//The mask is built from an unsigned 32-bit 1 so that testing bit 31 of a word is well defined.
+int get_bit(uint32_t *map, uint64_t n) {
+    uint32_t bit = map[n / 32] & (UINT32_C(1) << (n % 32));
+    return bit != 0;
+}
+
+/*
+    Walk an array of alignments and flag every entry that compares equal
+    (according to comp_func2()) to the most recent entry that was not flagged.
+    The first entry of each run of equal entries is left alone and the others
+    get the bit for their read_number set in the bitmap.
+
+    alignment *alignments: the entries to scan, in the order they should be compared
+    uint32_t *bitmap:      bitmap receiving one set bit per duplicate
+    uint64_t total_pairs:  number of entries in alignments
+
+    Returns the number of entries that were flagged.
+*/
+uint64_t mark_dups(alignment *alignments, uint32_t *bitmap, uint64_t total_pairs) {
+    uint64_t idx = 1, duplicates = 0;
+    alignment *kept = alignments;
+
+    while(idx < total_pairs) {
+        alignment *candidate = alignments+idx;
+
+        if(comp_func2((void *) kept, (void *) candidate) != 0) {
+            //Different coordinates or strand, this starts a new group
+            kept = candidate;
+        } else {
+            set_bit(bitmap, candidate->read_number);
+            duplicates++;
+        }
+        idx++;
+    }
+    return duplicates;
+}
+        
+
+//Print the command-line help to stdout. prog is the program name shown in the "Usage:" line.
+void usage(char *prog) {
+    printf("Usage: %s [OPTIONS] input.bam output.bam\n", prog);
+    printf("\n\
+This program will parse a BAM file produced by bison and mark likely PCR\n\
+duplicates in a new file. A PCR duplicate is defined as two reads/read pairs\n\
+having the same start and stop coordinates on the same strand of the same\n\
+chromosome. The read or pair with the best MAPQ score will be kept.\n\
+\n\
+There are better ways to go about this (both in terms of algorithms and in the\n\
+information used to determine a PCR duplicate), but this will normally suffice.\n\
+\n\
+N.B., this program does not currently support coordinate-sorted BAM files or\n\
+paired-end files containing discordant or other mixed alignments where the reads\n\
+are on different chromosomes or different strand.\n");
+    printf("\nOptions:\n\
+\n\
+    -s INT Initial array size used to hold mapping coordinates (measured in\n\
+           reads/read pairs). The default is 10 million. In an ideal world, this\n\
+           would be at least as large as the number of reads you have, but the\n\
+           array will grow as needed.\n\
+\n\
+    -g INT How much the array should grow (measured in reads/read pairs) when\n\
+           it's full. The default is 1000000.\n");
+}
+
+/*
+    Work out which strand a read was aligned to from its XG and XR tags.
+
+    Returns 0 (OT), 1 (OB), 2 (CTOT) or 3 (CTOB):
+        XG starts with 'C' and XR starts with 'C' -> 0
+        XG starts with 'C', XR with anything else -> 2
+        XG starts with anything else, XR with 'C' -> 1
+        neither starts with 'C'                   -> 3
+
+    The function is static because a plain "inline" definition provides no
+    external definition in C99/C11, which can leave calls unresolved at link
+    time when the compiler decides not to inline them.
+
+    NOTE(review): both tags are assumed to be present on every read that is
+    passed in; a missing tag is not handled here.
+*/
+static inline int get_strand(bam1_t *read) {
+    char *XG = bam_aux2Z(bam_aux_get(read, "XG"));
+    char *XR = bam_aux2Z(bam_aux_get(read, "XR"));
+
+    if(*XG == 'C') { //OT or CTOT
+        if(*XR == 'C') return 0; //OT
+        else return 2; //CTOT
+    } else {
+        if(*XR == 'C') return 1; //OB
+        else return 3; //CTOB
+    }
+}
+
+/*
+    Mark duplicates in a BAM file in two passes over the input:
+    (1) read every alignment (one entry per read or per pair), sort the
+        entries with comp_func() and record all but the first entry of each
+        group of identical coordinates/strand in a bitmap;
+    (2) re-read the input and write every record to the output, adding
+        BAM_FDUP to the flag of those marked in the bitmap.
+
+    Exit codes: 0 on success or when only help was printed, 1 for a bad
+    option, 2 for missing file names or invalid sizes, 3 for I/O, format or
+    memory errors.
+*/
+int main(int argc, char *argv[]) {
+    bam1_t *read;
+    bam_header_t *header;
+    bamFile fp = NULL, of = NULL;
+    uint64_t total_pairs = 0, max_length = 10000000;
+    uint64_t bitmap_length = 0, cur_read = 0, ndups = 0;
+    uint64_t grow_size = 1000000;
+    alignment *alignments, *grown;
+    uint32_t *bitmap;
+    int i;
+    char *iname = NULL, *oname = NULL;
+
+    if(argc < 3) {
+        usage(argv[0]);
+        return 0;
+    }
+
+    for(i=1; i<argc; i++) {
+        if(strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[i]) == 0) {
+            usage(argv[0]);
+            return 0;
+        } else if(strcmp("-s", argv[i]) == 0 || strcmp("-g", argv[i]) == 0) {
+            //Both options need a value after them
+            if(i+1 >= argc) {
+                printf("%s requires an argument!\n", argv[i]);
+                usage(argv[0]);
+                return 1;
+            }
+            if(strcmp("-s", argv[i]) == 0) {
+                max_length = (uint64_t) strtoull(argv[++i], NULL, 10);
+            } else {
+                grow_size = (uint64_t) strtoull(argv[++i], NULL, 10);
+            }
+        } else if(iname == NULL) {
+            iname = argv[i];
+        } else if(oname == NULL) {
+            oname = argv[i];
+        } else {
+            printf("Unrecognized option: %s\n", argv[i]);
+            usage(argv[0]);
+            return 1;
+        }
+    }
+    if(iname == NULL || oname == NULL) {
+        printf("Either the input or output file name were not specified!\n");
+        usage(argv[0]);
+        return 2;
+    }
+    //A size of 0 would leave no room for the first entry, a growth of 0 would never make room
+    if(max_length == 0 || grow_size == 0) {
+        printf("The values given to -s and -g must be greater than 0!\n");
+        return 2;
+    }
+
+    //Set everything up
+    fp = bam_open(iname, "r");
+    if(fp == NULL) {
+        printf("Could not open %s for reading!\n", iname);
+        return 3;
+    }
+    header = bam_header_read(fp);
+    if(header == NULL) {
+        printf("Could not read the header of %s!\n", iname);
+        return 3;
+    }
+    read = bam_init1();
+    alignments = malloc(sizeof(alignment)*max_length);
+    if(alignments == NULL) {
+        printf("Could not allocate space for %"PRIu64" alignments!\n", max_length);
+        return 3;
+    }
+
+    //Read in the alignments
+    while(bam_read1(fp, read) > 1) {
+        //Lengthen the array as needed, before the next slot is written
+        if(total_pairs == max_length) {
+            grown = realloc(alignments, (max_length + grow_size) * sizeof(alignment));
+            if(grown == NULL) {
+                printf("Could not allocate space for %"PRIu64" alignments!\n", max_length + grow_size);
+                return 3;
+            }
+            alignments = grown;
+            max_length += grow_size;
+        }
+
+        alignments[total_pairs].tid = read->core.tid;
+        alignments[total_pairs].start1 = read->core.pos;
+        alignments[total_pairs].stop1 = bam_calend(&(read->core), bam1_cigar(read));
+        alignments[total_pairs].MAPQ = read->core.qual;
+        alignments[total_pairs].strand = get_strand(read);
+        if(read->core.flag & BAM_FPAIRED) {
+            //The mate must be read unconditionally, so this cannot live inside an assert()
+            if(bam_read1(fp, read) <= 1) {
+                printf("Read #2 of a pair is missing from %s!\n", iname);
+                return 3;
+            }
+            alignments[total_pairs].start2 = read->core.pos;
+            alignments[total_pairs].stop2 = bam_calend(&(read->core), bam1_cigar(read));
+        } else {
+            alignments[total_pairs].start2 = 0;
+            alignments[total_pairs].stop2 = 0;
+        }
+        alignments[total_pairs].read_number = total_pairs;
+        total_pairs++;
+    }
+    bam_close(fp);
+
+    //create bitmap, max_length is never smaller than total_pairs
+    bitmap_length = max_length/32;
+    bitmap_length += (max_length % 32 > 0) ? 1 : 0;
+    bitmap = calloc(bitmap_length, sizeof(uint32_t));
+    if(bitmap == NULL) {
+        printf("Could not allocate space for the duplicate bitmap!\n");
+        return 3;
+    }
+
+    //Sort
+    qsort((void *) alignments, (size_t) total_pairs, sizeof(alignment), &comp_func);
+
+    //Mark duplicates in bitmap
+    ndups = mark_dups(alignments, bitmap, total_pairs);
+    free(alignments);
+    printf("There were %"PRIu64" duplicates from %"PRIu64" total reads or pairs\n", ndups, total_pairs);
+
+    //reopen file, iterate through and change flags as appropriate
+    fp = bam_open(iname, "r");
+    if(fp == NULL) {
+        printf("Could not open %s for reading!\n", iname);
+        return 3;
+    }
+    //The header has to be read again to get past it, so release the first copy
+    bam_header_destroy(header);
+    header = bam_header_read(fp);
+    if(header == NULL) {
+        printf("Could not read the header of %s!\n", iname);
+        return 3;
+    }
+    //The output is opened only here, so there is a single handle to close
+    of = bam_open(oname, "w");
+    if(of == NULL) {
+        printf("Could not open %s for writing!\n", oname);
+        return 3;
+    }
+    bgzf_mt(of, 4, 256); //This should be user configurable
+    bam_header_write(of, header);
+
+    while(bam_read1(fp, read) > 1) {
+        if(get_bit(bitmap, cur_read)) read->core.flag = read->core.flag | BAM_FDUP;
+        bam_write1(of, read);
+        if(read->core.flag & BAM_FPAIRED) {
+            if(bam_read1(fp, read) <= 1) {
+                printf("Read #2 of a pair is missing from %s!\n", iname);
+                return 3;
+            }
+            if(get_bit(bitmap, cur_read)) read->core.flag = read->core.flag | BAM_FDUP;
+            bam_write1(of, read);
+        }
+        cur_read++;
+    }
+
+    //Clean up
+    bam_close(fp);
+    bam_close(of);
+    bam_header_destroy(header);
+    bam_destroy1(read);
+    free(bitmap);
+
+    return 0;
+}
diff --git a/master.c b/master.c
new file mode 100644
index 0000000..fd3ef2e
--- /dev/null
+++ b/master.c
@@ -0,0 +1,1091 @@
+#include "bison.h"
+#include <math.h>
+#include <sys/time.h>
+
+//Per-run counters; the CpG/CHG/CHH members are incremented by update_counts()
+typedef struct {
+    unsigned long long t_reads; //total reads
+    unsigned long long m_reads_OT; //reads mapped to the OT
+    unsigned long long m_reads_OB; //presumably reads mapped to the OB, as for m_reads_OT
+    unsigned long long m_reads_CTOT; //presumably reads mapped to the CTOT
+    unsigned long long m_reads_CTOB; //presumably reads mapped to the CTOB
+    unsigned long long t_CpG; //Total CpGs
+    unsigned long long m_CpG; //Methylated CpGs
+    unsigned long long t_CHG; //Total CHGs ('X' or 'x' calls in update_counts())
+    unsigned long long m_CHG; //Methylated CHGs ('X' calls)
+    unsigned long long t_CHH; //Total CHHs ('H' or 'h' calls)
+    unsigned long long m_CHH; //Methylated CHHs ('H' calls)
+} metrics_struct;
+
+/******************************************************************************
+*
+*   Add the methylation calls of one read to the running CpG/CHG/CHH totals.
+*
+*   The first l_qseq characters of the read's XM tag are scanned:
+*       'Z'/'z' count towards t_CpG, 'Z' also towards m_CpG
+*       'X'/'x' count towards t_CHG, 'X' also towards m_CHG
+*       'H'/'h' count towards t_CHH, 'H' also towards m_CHH
+*   Every other character (such as '.') is skipped.
+*
+*   bam1_t *read: the read whose XM tag is examined
+*   metrics_struct *metrics: the counters to update
+*
+*******************************************************************************/
+void update_counts(bam1_t *read, metrics_struct *metrics) {
+    char *calls = bam_aux2Z(bam_aux_get(read, "XM"));
+    int pos;
+
+    for(pos=0; pos<read->core.l_qseq; pos++) {
+        switch(calls[pos]) {
+            case 'Z':
+                metrics->t_CpG++;
+                metrics->m_CpG++;
+                break;
+            case 'z':
+                metrics->t_CpG++;
+                break;
+            case 'X':
+                metrics->t_CHG++;
+                metrics->m_CHG++;
+                break;
+            case 'x':
+                metrics->t_CHG++;
+                break;
+            case 'H':
+                metrics->t_CHH++;
+                metrics->m_CHH++;
+                break;
+            case 'h':
+                metrics->t_CHH++;
+                break;
+            default: //'.' and anything else carries no call
+                break;
+        }
+    }
+}
+
+/******************************************************************************
+*
+*   Return the value of a read's AS tag (alignment score). If the read is
+*   flagged as unmapped or carries no AS tag, INT_MIN>>2 is returned instead.
+*
+*   bam1_t *read: the read in question
+*
+*******************************************************************************/
+int get_AS(bam1_t *read) {
+    const int no_score = INT_MIN>>2;
+    uint8_t *tag;
+
+    if(read->core.flag & BAM_FUNMAP) return no_score;
+
+    tag = bam_aux_get(read, "AS");
+    if(tag == NULL) return no_score;
+    return bam_aux2i(tag);
+}
+
+/******************************************************************************
+*
+*   Calculate the minimum score for a given readlength, according to the
+*   --score-min function stored in config:
+*       'L': intercept + coef * rlen
+*       'S': intercept + coef * sqrt(rlen)
+*       'G': intercept + coef * log(rlen)
+*       anything else ('C'): intercept + coef
+*   The result is converted to int, discarding any fractional part.
+*
+*   This is an ordinary external function rather than a plain "inline" one,
+*   since an inline definition without "static" or "extern" provides no
+*   external definition in C99/C11 and calls to it can then fail to link.
+*
+*   int32_t rlen: a read length
+*
+*******************************************************************************/
+int scoreMin(int32_t rlen) {
+    //Return different values, depending on --score-min
+    switch(config.scoremin_type) {
+        case 'L':
+            return (int) (config.scoremin_intercept + config.scoremin_coef * rlen);
+        case 'S':
+            return (int) (config.scoremin_intercept + config.scoremin_coef * sqrt((float) rlen));
+        case 'G':
+            return (int) (config.scoremin_intercept + config.scoremin_coef * log((float) rlen));
+        default: //'C'
+            return (int) (config.scoremin_intercept + config.scoremin_coef);
+    }
+}
+
+/******************************************************************************
+*
+*   Return the value of a read's XS tag (secondary alignment score). If the
+*   read is flagged as unmapped or carries no XS tag, INT_MIN>>2 is returned
+*   instead.
+*
+*   bam1_t *read: the read in question
+*
+*******************************************************************************/
+int get_XS(bam1_t *read) {
+    const int no_score = INT_MIN>>2;
+    uint8_t *tag;
+
+    if(read->core.flag & BAM_FUNMAP) return no_score;
+
+    tag = bam_aux_get(read, "XS");
+    if(tag == NULL) return no_score;
+    return bam_aux2i(tag);
+}
+
+/******************************************************************************
+*
+*   Calculate a MAPQ, given AS, XS, and the minimum score (ala bowtie2)
+*
+*******************************************************************************/
+int calc_MAPQ_BT2(int AS, int XS, int scMin) {
+    int diff, bestOver, bestdiff;
+    diff = abs(scMin); //Range of possible alignment scores
+    bestOver = AS-scMin; //Shift alignment score range, so worst score is 0
+    
+    //This seems like an odd way to calculate this!
+
+    //The method depends on config.mode
+    bestdiff = (int) abs(abs((float) AS)-abs((float) XS)); //Absolute distance between alignment scores
+    if(config.mode == 0) { //--end-to-end (default)
+        if(XS < scMin) {
+            if(bestOver >= diff * (double) 0.8f) return 42;
+            else if(bestOver >= diff * (double) 0.7f) return 40;
+            else if(bestOver >= diff * (double) 0.6f) return 24;
+            else if(bestOver >= diff * (double) 0.5f) return 23;
+            else if(bestOver >= diff * (double) 0.4f) return 8;
+            else if(bestOver >= diff * (double) 0.3f) return 3;
+            else return 0;
+        } else {
+            if(bestdiff >= diff * (double) 0.9f) {
+                if(bestOver == diff) {
+                    return 39;
+                } else {
+                    return 33;
+                }
+            } else if(bestdiff >= diff * (double) 0.8f) {
+                if(bestOver == diff) {
+                    return 38;
+                } else {
+                    return 27;
+                }
+            } else if(bestdiff >= diff * (double) 0.7f) {
+                if(bestOver == diff) {
+                    return 37;
+                } else {
+                    return 26;
+                }
+            } else if(bestdiff >= diff * (double) 0.6f) {
+                if(bestOver == diff) {
+                    return 36;
+                } else {
+                    return 22;
+                }
+            } else if(bestdiff >= diff * (double) 0.5f) {
+                if(bestOver == diff) {
+                    return 35;
+                } else if(bestOver >= diff * (double) 0.84f) {
+                    return 25;
+                } else if(bestOver >= diff * (double) 0.68f) {
+                    return 16;
+                } else {
+                    return 5;
+                }
+            } else if(bestdiff >= diff * (double) 0.4f) {
+                if(bestOver == diff) {
+                    return 34;
+                } else if(bestOver >= diff * (double) 0.84f) {
+                    return 21;
+                } else if(bestOver >= diff * (double) 0.68f) {
+                    return 14;
+                } else {
+                    return 4;
+                }
+            } else if(bestdiff >= diff * (double) 0.3f) {
+                if(bestOver == diff) {
+                    return 32;
+                } else if(bestOver >= diff * (double) 0.88f) {
+                    return 18;
+                } else if(bestOver >= diff * (double) 0.67f) {
+                    return 15;
+                } else {
+                    return 3;
+                }
+            } else if(bestdiff >= diff * (double) 0.2f) {
+                if(bestOver == diff) {
+                    return 31;
+                } else if(bestOver >= diff * (double) 0.88f) {
+                    return 17;
+                } else if(bestOver >= diff * (double) 0.67f) {
+                    return 11;
+                } else {
+                    return 0;
+                }
+            } else if(bestdiff >= diff * (double) 0.1f) {
+                if(bestOver == diff) {
+                    return 30;
+                } else if(bestOver >= diff * (double) 0.88f) {
+                    return 12;
+                } else if(bestOver >= diff * (double) 0.67f) {
+                    return 7;
+                } else {
+                    return 0;
+                }
+            } else if(bestdiff > 0) {
+                if(bestOver >= diff * (double)0.67f) {
+                    return 6;
+                } else {
+                    return 2;
+                }
+            } else {
+                if(bestOver >= diff * (double)0.67f) {
+                    return 1;
+                } else {
+                    return 0;
+                }
+            }
+        }
+    } else { //--local
+        if(XS < scMin) {
+            if(bestOver >= diff * (double) 0.8f) return 44;
+            else if(bestOver >= diff * (double) 0.7f) return 42;
+            else if(bestOver >= diff * (double) 0.6f) return 41;
+            else if(bestOver >= diff * (double) 0.5f) return 36;
+            else if(bestOver >= diff * (double) 0.4f) return 28;
+            else if(bestOver >= diff * (double) 0.3f) return 24;
+            else return 22;
+        } else {
+            if(bestdiff >= diff * (double) 0.9f) return 40;
+            else if(bestdiff >= diff * (double) 0.8f) return 39;
+            else if(bestdiff >= diff * (double) 0.7f) return 38;
+            else if(bestdiff >= diff * (double) 0.6f) return 37;
+            else if(bestdiff >= diff * (double) 0.5f) {
+                if     (bestOver == diff)       return 35;
+                else if(bestOver >= diff * (double) 0.5f) return 25;
+                else                            return 20;
+            } else if(bestdiff >= diff * (double) 0.4f) {
+                if     (bestOver == diff)       return 34;
+                else if(bestOver >= diff * (double) 0.5f) return 21;
+                else                            return 19;
+            } else if(bestdiff >= diff * (double) 0.3f) {
+                if     (bestOver == diff)       return 33;
+                else if(bestOver >= diff * (double) 0.5f) return 18;
+                else                            return 16;
+            } else if(bestdiff >= diff * (double) 0.2f) {
+                if     (bestOver == diff)       return 32;
+                else if(bestOver >= diff * (double) 0.5f) return 17;
+                else                            return 12;
+            } else if(bestdiff >= diff * (double) 0.1f) {
+                if     (bestOver == diff)       return 31;
+                else if(bestOver >= diff * (double) 0.5f) return 14;
+                else                            return 9;
+            } else if(bestdiff > 0) {
+                if(bestOver >= diff * (double) 0.5f)      return 11;
+                else                            return 2;
+            } else {
+                if(bestOver >= diff * (double) 0.5f)      return 1;
+                else                            return 0;
+            }
+        }
+    }
+}
+
+/******************************************************************************
+*
+*   Decide whether an alignment is unique by comparing the values stored in
+*   the AS and XS auxiliary tags of a read.
+*
+*   bam1_t *read: The read to look at
+*
+*   Returns 1 if the read has no XS tag or if AS > XS, otherwise 0.
+*
+*******************************************************************************/
+int unique_alignment(bam1_t *read) {
+    uint8_t *xs_tag = bam_aux_get(read, "XS");
+    int primary = bam_aux2i(bam_aux_get(read, "AS"));
+
+    //Without an XS tag there is no competing score to compare against
+    if(xs_tag == 0) return 1;
+
+    return (primary > bam_aux2i(xs_tag)) ? 1 : 0;
+}
+
+/******************************************************************************
+*
+*   Replace the stored sequence in a read.
+*
+*   bam1_t *read: The read whose sequence will be replaced
+*   char *seq: Sequence to copy into read (left unmodified, a private copy is
+*              made).
+*
+*   If read is reverse complemented, the same will be done to the copy of seq
+*   before it is stored.
+*
+*   Two bases are packed per byte: the first base of each pair goes in the
+*   high nibble, the second in the low nibble, using A=1, C=2, G=4, T=8.
+*   Any other character in the first slot is stored as 15. In the second slot
+*   'N' is stored as 15 and anything else (including the terminating NUL of an
+*   odd-length sequence) as 0.
+*
+*   NOTE(review): nothing here checks that seq fits in the sequence buffer of
+*   read; this assumes seq has the same length as the sequence already stored
+*   in read -- confirm against the callers.
+*
+*******************************************************************************/
+void swap_sequence(bam1_t *read, char *seq) {
+    uint8_t *sequence = bam1_seq(read), hi, lo;
+    char *seq2 = strdup(seq);
+    size_t i, j, len;
+
+    //Do we need to reverse complement?
+    if(read->core.flag & BAM_FREVERSE) reverse_complement(seq2);
+
+    //Compute the length once rather than on every loop iteration
+    len = strlen(seq2);
+    for(i=0, j=0; i<len; i+=2, j++) {
+        //First base of the pair -> high nibble
+        switch(*(seq2+i)) {
+            case 'A': hi = 1; break;
+            case 'C': hi = 2; break;
+            case 'G': hi = 4; break;
+            case 'T': hi = 8; break;
+            default:  hi = 15; break;
+        }
+        //Second base of the pair -> low nibble. For an odd-length sequence
+        //this reads the terminating NUL, which maps to 0.
+        switch(*(seq2+i+1)) {
+            case 'A': lo = 1; break;
+            case 'C': lo = 2; break;
+            case 'G': lo = 4; break;
+            case 'T': lo = 8; break;
+            case 'N': lo = 15; break;
+            default:  lo = 0; break;
+        }
+        *(sequence+j) = (uint8_t) ((hi << 4) | lo);
+    }
+    free(seq2);
+}
+
+/******************************************************************************
+*
+*   Build the XM string (one character per base of the read) that will be
+*   appended to a read.
+*
+*   bam1_t *read: the read in question
+*   char *XG: The XG tag, indicating which conversion to pay attention to.
+*             A first character of 'C' selects C->T calls (OT or CTOT),
+*             anything else selects G->A calls (OB or CTOB).
+*
+*   Per-base output:
+*     '.'        no call (no genomic position, not a C/G site, or a plain
+*                mismatch)
+*     'Z' / 'z'  methylated / unmethylated CpG
+*     'X' / 'x'  methylated / unmethylated CHG
+*     'H' / 'h'  methylated / unmethylated CHH
+*
+*   THE OUTPUT MUST BE free()d
+*******************************************************************************/
+char *callXM(bam1_t *read, char *XG) {
+    char *chrom = lookup_chrom(read);
+    unsigned long long offset = genome_offset(chrom, 0);
+    unsigned long long chrom_end = genome_chrom_length(chrom);
+    unsigned long long *positions = calculate_positions(read);
+    char *observed = calloc(1+read->core.l_qseq, sizeof(char));
+    char *XM = calloc(1+read->core.l_qseq, sizeof(char));
+    //Strand-dependent settings, chosen once from the XG tag
+    int top = (*XG == 'C');             //OT or CTOT when set, otherwise OB or CTOB
+    char target = top ? 'C' : 'G';      //Genomic base that can carry a call
+    char converted = top ? 'T' : 'A';   //What an unmethylated target reads as
+    char partner = top ? 'G' : 'C';     //Base that defines the CpG/CHG context
+    int span = top ? 2 : -2;            //Direction/extent of the context lookup
+    int chg_index = top ? 2 : 0;        //Index in the context holding the CHG partner
+    const char *calls;
+    char ref, obs, *context;
+    unsigned long long pos;
+    int i;
+
+    //Extract the read sequence; unrecognised codes leave the calloc()ed '\0'
+    for(i=0; i<read->core.l_qseq; i++) {
+        switch(bam1_seqi(bam1_seq(read), i)) {
+            case 1:  observed[i] = 'A'; break;
+            case 2:  observed[i] = 'C'; break;
+            case 4:  observed[i] = 'G'; break;
+            case 8:  observed[i] = 'T'; break;
+            case 15: observed[i] = 'N'; break;
+            default: break;
+        }
+    }
+
+    for(i=0; i<read->core.l_qseq; i++) {
+        pos = positions[i];
+        XM[i] = '.'; //Overwritten below only when a methylation call is made
+        if(pos == ULLONG_MAX) continue;
+
+        ref = toupper(*(chromosomes.genome+offset+pos));
+        obs = toupper(observed[i]);
+
+        //Only the strand-appropriate genomic base can yield a call
+        if(ref != target) continue;
+        if(obs == ref) {
+            calls = "ZXH"; //Base retained: methylated
+        } else if(obs == converted) {
+            calls = "zxh"; //Base converted: unmethylated
+        } else {
+            continue;      //Just a mismatch
+        }
+
+        context = get_genomic_context(offset, pos, span, chrom_end);
+        if(context[1] == partner) {
+            XM[i] = calls[0]; //CpG
+        } else if(context[chg_index] == partner) {
+            XM[i] = calls[1]; //CHG
+        } else {
+            XM[i] = calls[2]; //CHH
+        }
+        free(context);
+    }
+
+    free(observed);
+    free(positions);
+    return XM;
+}
+
+/******************************************************************************
+*
+*   As with callXM, but return the mismatches with the reference.
+*
+*   bam1_t *read: the read in question
+*   char *XM: output from callXM
+*   char *XG: The XG tag, indicating which conversion to pay attention to
+*             (currently not used by this function).
+*
+*   The result alternates run lengths of non-mismatching bases with the read
+*   base at each mismatch. A mismatch at the first base is prefixed with "0".
+*   Positions that carry a methylation call in XM are never counted as
+*   mismatches. As a side effect the byte following the type character of the
+*   read's NM tag is overwritten with the mismatch count (if the tag exists).
+*
+*   THE OUTPUT MUST BE free()d
+*   The length of XX is currently limited to MAXREAD (including the
+*   terminating NUL); longer output is truncated.
+*
+*******************************************************************************/
+char *callXX(bam1_t *read, char *XM, char *XG) {
+    char *chrom = lookup_chrom(read);
+    unsigned long long offset = genome_offset(chrom, 0), current_position;
+    unsigned long long *genomic_position = calculate_positions(read);
+    uint8_t base, NM = 0, *NM_tag;
+
+    char *read_seq = calloc(1+read->core.l_qseq, sizeof(char));
+    char *XX = calloc(MAXREAD, sizeof(char));
+    size_t used = 0; //Number of characters already written to XX
+    int i, good = 0, written, XM_len = (int) strlen(XM);
+
+    //Extract the read sequence
+    for(i=0; i<read->core.l_qseq; i++) {
+        base = bam1_seqi(bam1_seq(read), i);
+        if(base == 1) *(read_seq+i) = 'A';
+        else if(base == 2) *(read_seq+i) = 'C';
+        else if(base == 4) *(read_seq+i) = 'G';
+        else if(base == 8) *(read_seq+i) = 'T';
+        else *(read_seq+i) = 'N';
+    }
+
+    //Create the XX string
+    for(i=0; i<XM_len; i++) {
+        if(*(XM+i) != '.') {
+            //unlike bismark, we don't count methylation changes as mismatches
+            good++;
+        } else {
+            //NOTE(review): callXM emits '.' for positions equal to ULLONG_MAX,
+            //and such a position is still used to index the genome here --
+            //confirm whether those bases need special handling.
+            current_position = *(genomic_position+i);
+            base = toupper(*(chromosomes.genome+offset+current_position));
+            if(base != *(read_seq+i)) {
+                NM++;
+                //Append at XX+used instead of passing XX as its own "%s"
+                //argument, which is undefined behaviour for sprintf.
+                if(i == 0) {
+                    //If the read starts with a mismatch, the XX string should start with a 0
+                    written = snprintf(XX+used, MAXREAD-used, "0%c", *(read_seq+i));
+                } else if(good) {
+                    written = snprintf(XX+used, MAXREAD-used, "%i%c", good, *(read_seq+i));
+                    good = 0;
+                } else {
+                    written = snprintf(XX+used, MAXREAD-used, "%c", *(read_seq+i));
+                }
+                if(written > 0) {
+                    used += (size_t) written;
+                    //snprintf reports the untruncated length; clamp to what fits
+                    if(used > (size_t) (MAXREAD-1)) used = (size_t) (MAXREAD-1);
+                }
+            } else {
+                good++;
+            }
+        }
+    }
+    if(good) snprintf(XX+used, MAXREAD-used, "%i", good);
+
+    //Update the NM tag, if the read has one
+    NM_tag = bam_aux_get(read, "NM");
+    if(NM_tag) *(NM_tag+1) = NM;
+
+    free(read_seq);
+    free(genomic_position);
+
+    return XX;
+}
+
+/******************************************************************************
+*
+*   Given a set of single-end reads, determine which one, if any, aligns best.
+*   Then, add the various XM/XX/etc. tags and prepare the read for writing.
+*   Return the worker node number producing the best alignment. If no node
+*   has a strictly highest alignment score, or the winning read is flagged as
+*   unmapped, read1 is marked as unmapped and 1 is returned.
+*
+*   bam1_t *readN: Unpacked reads from the worker nodes. read3 and read4 are
+*                  only looked at when config.directional is not set.
+*   char *seq: The unconverted fastq read
+*
+*   The read belonging to the returned node is modified in place: its
+*   sequence is replaced by seq and, when mapped, its XX/XM/XR/XG tags, MAPQ
+*   and possibly its XS tag are rewritten.
+*
+*******************************************************************************/
+int process_single(bam1_t *read1, bam1_t *read2, bam1_t *read3, bam1_t *read4, char *seq) {
+    int AS1=0, AS2=0, AS3=0, AS4=0;
+    bam1_t *tmp_read = NULL;
+    char *XM, *XX, XG[] = "CT", XR[] = "CT";
+    int best_node = 0;
+    kstring_t *kXM = (kstring_t *) calloc(1, sizeof(kstring_t));
+    kstring_t *kXX = (kstring_t *) calloc(1, sizeof(kstring_t));
+    //For recalculating the MAPQ ala bowtie2 v2 MAPQ calculator
+    int XS, scMin, MAPQ = 0, AS=0;
+
+    //Determine the read with the highest alignment score
+    AS1 = get_AS(read1);
+    AS2 = get_AS(read2);
+    if(!config.directional) {
+        AS3 = get_AS(read3);
+        AS4 = get_AS(read4);
+    }
+    //A node only wins if its score is strictly higher than all others; ties
+    //leave tmp_read as NULL.
+    if(config.directional) {
+        if(AS1 > AS2) {
+            sprintf(XR, "CT");
+            sprintf(XG, "CT");
+            if(!(read1->core.flag & BAM_FUNMAP)) {
+                tmp_read = read1;
+                best_node = 1;
+            }
+        } else if(AS2 > AS1) {
+            sprintf(XR, "CT");
+            sprintf(XG, "GA");
+            if(!(read2->core.flag & BAM_FUNMAP)) {
+                tmp_read = read2;
+                best_node = 2;
+            }
+        }
+    } else {
+        if(AS1 > AS2 && AS1 > AS3 && AS1 > AS4) { //OT
+            sprintf(XR, "CT");
+            sprintf(XG, "CT");
+            if(!(read1->core.flag & BAM_FUNMAP)) {
+                tmp_read = read1;
+                best_node = 1;
+            }
+        } else if(AS2 > AS1 && AS2 > AS3 && AS2 > AS4) { //OB
+            sprintf(XR, "CT");
+            sprintf(XG, "GA");
+            if(!(read2->core.flag & BAM_FUNMAP)) {
+                tmp_read = read2;
+                best_node = 2;
+            }
+        } else if(AS3 > AS1 && AS3 > AS2 && AS3 > AS4) { //CTOT
+            sprintf(XR, "GA");
+            sprintf(XG, "CT");
+            if(!(read3->core.flag & BAM_FUNMAP)) {
+                tmp_read = read3;
+                best_node = 3;
+            }
+        } else if(AS4 > AS1 && AS4 > AS2 && AS4 > AS3) { //CTOB
+            sprintf(XR, "GA");
+            sprintf(XG, "GA");
+            if(!(read4->core.flag & BAM_FUNMAP)) {
+                tmp_read = read4;
+                best_node = 4;
+            }
+        }
+    }
+
+    //If there is no best score (tmp_read == NULL), mark read1 as unmapped
+    if(tmp_read == NULL) {
+        swap_sequence(read1, seq);
+        read1->core.flag = read1->core.flag | 0x4;
+        best_node = 1;
+    } else {
+        swap_sequence(tmp_read, seq);
+        XM = callXM(tmp_read, XG);
+        XX = callXX(tmp_read, XM, XG);
+        //append the tags
+        kputs(XX, kXX);
+        kputs(XM, kXM);
+
+        bam_aux_del(tmp_read, bam_aux_get(tmp_read, "XM"));
+        bam_aux_del(tmp_read, bam_aux_get(tmp_read, "XG"));
+        bam_aux_append(tmp_read, "XX", 'Z', kXX->l + 1, (uint8_t*) kXX->s);
+        bam_aux_append(tmp_read, "XM", 'Z', kXM->l + 1, (uint8_t*) kXM->s);
+        bam_aux_append(tmp_read, "XR", 'Z', 3, (uint8_t*) XR);
+        bam_aux_append(tmp_read, "XG", 'Z', 3, (uint8_t*) XG);
+        free(kXX->s);
+        free(kXM->s);
+        free(XM);
+        free(XX);
+
+        //Recalculate MAPQ and replace the XS score. XS becomes the highest of
+        //the read's own XS and the alignment scores from the losing nodes.
+        scMin = scoreMin(tmp_read->core.l_qseq);
+        XS = get_XS(tmp_read);
+        if(best_node == 1) {
+            AS = AS1;
+            if(AS2 > XS) XS = AS2;
+            if(!config.directional) {
+                if(AS3 > XS) XS = AS3;
+                if(AS4 > XS) XS = AS4;
+            }
+        }
+        if(best_node == 2) {
+            AS = AS2;
+            //The competing score is that of node 1, not the winner's own
+            if(AS1 > XS) XS = AS1;
+            if(!config.directional) {
+                if(AS3 > XS) XS = AS3;
+                if(AS4 > XS) XS = AS4;
+            }
+        }
+        if(best_node == 3) {
+            AS = AS3;
+            if(AS1 > XS) XS = AS1;
+            if(AS2 > XS) XS = AS2;
+            if(AS4 > XS) XS = AS4;
+        }
+        if(best_node == 4) {
+            AS = AS4;
+            if(AS1 > XS) XS = AS1;
+            if(AS2 > XS) XS = AS2;
+            if(AS3 > XS) XS = AS3;
+        }
+        MAPQ = calc_MAPQ_BT2(AS, XS, scMin);
+        //Never raise the MAPQ above what the read already had
+        MAPQ = (MAPQ < tmp_read->core.qual) ? MAPQ : tmp_read->core.qual;
+        tmp_read->core.qual = MAPQ;
+        if(XS >= scMin) {
+            //replace/add the XS tag
+            if(bam_aux_get(tmp_read, "XS")) bam_aux_del(tmp_read, bam_aux_get(tmp_read, "XS"));
+            bam_aux_append(tmp_read, "XS", 'i', 4, (uint8_t*) &XS);
+        }
+    }
+    free(kXX);
+    free(kXM);
+    return best_node;
+}
+
+/******************************************************************************
+*
+*   Like process_single, but for paired-end reads. Each bam1_t** holds the
+*   two mates produced by one worker node: element 0 is read #1 and element 1
+*   is read #2.
+*
+*   bam1_t **readN: Mates from worker node N. read3 and read4 are only looked
+*                   at when config.directional is not set.
+*   char **seq: The unconverted fastq sequences of read #1 and read #2
+*
+*   Returns the number of the node whose pair was selected. If no node has a
+*   strictly highest summed alignment score, or read #1 of the winning node is
+*   flagged as unmapped, both mates in read1 are marked unmapped and 1 is
+*   returned.
+*
+*******************************************************************************/
+int process_paired(bam1_t **read1, bam1_t **read2, bam1_t **read3, bam1_t **read4, char **seq) {
+    //Index 0..3 corresponds to node 1..4 (OT, OB, CTOT, CTOB)
+    bam1_t **node_reads[4];
+    int pair_AS[4] = {0, 0, 0, 0};
+    const char *XR1_by_node[4] = {"CT", "CT", "GA", "GA"};
+    const char *XR2_by_node[4] = {"GA", "GA", "CT", "CT"};
+    const char *XG_by_node[4] = {"CT", "GA", "CT", "GA"};
+    int n_nodes = (config.directional) ? 2 : 4;
+    int winner = -1, strictly_best, k, j, m;
+    bam1_t *mate[2] = {NULL, NULL};
+    char *XM[2], *XX[2], *XR[2], XG[] = "CT", XR1[] = "CT", XR2[] = "CT";
+    kstring_t *kXM[2], *kXX[2];
+    int best_node = 0;
+    //For MAPQ/XS replacement
+    int MAPQ, XS[2], scMin[2];
+
+    kXM[0] = (kstring_t *) calloc(1, sizeof(kstring_t));
+    kXM[1] = (kstring_t *) calloc(1, sizeof(kstring_t));
+    kXX[0] = (kstring_t *) calloc(1, sizeof(kstring_t));
+    kXX[1] = (kstring_t *) calloc(1, sizeof(kstring_t));
+    node_reads[0] = read1;
+    node_reads[1] = read2;
+    node_reads[2] = read3;
+    node_reads[3] = read4;
+    XR[0] = XR1;
+    XR[1] = XR2;
+
+    //Sum the alignment scores of both mates for every node in use
+    for(k=0; k<n_nodes; k++) {
+        pair_AS[k] = get_AS(node_reads[k][0]) + get_AS(node_reads[k][1]);
+    }
+
+    //A node wins only if its score is strictly higher than all the others
+    for(k=0; k<n_nodes && winner < 0; k++) {
+        strictly_best = 1;
+        for(j=0; j<n_nodes; j++) {
+            if(j != k && pair_AS[k] <= pair_AS[j]) strictly_best = 0;
+        }
+        if(strictly_best) winner = k;
+    }
+    if(winner >= 0 && !(node_reads[winner][0]->core.flag & BAM_FUNMAP)) {
+        mate[0] = node_reads[winner][0];
+        mate[1] = node_reads[winner][1];
+        best_node = winner + 1;
+    }
+
+    if(mate[0] == NULL) {
+        //No usable best pair: mark both mates from node 1 as unmapped
+        swap_sequence(read1[0], seq[0]);
+        swap_sequence(read1[1], seq[1]);
+        read1[0]->core.flag |= 0x4;
+        read1[1]->core.flag |= 0x4;
+        best_node = 1;
+    } else {
+        strcpy(XR1, XR1_by_node[winner]);
+        strcpy(XR2, XR2_by_node[winner]);
+        strcpy(XG, XG_by_node[winner]);
+
+        for(m=0; m<2; m++) swap_sequence(mate[m], seq[m]);
+        for(m=0; m<2; m++) {
+            XM[m] = callXM(mate[m], XG);
+            XX[m] = callXX(mate[m], XM[m], XG);
+        }
+
+        //Swap the old XM/XG tags of each mate for the new XX/XM/XR/XG tags
+        for(m=0; m<2; m++) {
+            kputs(XX[m], kXX[m]);
+            kputs(XM[m], kXM[m]);
+
+            bam_aux_del(mate[m], bam_aux_get(mate[m], "XM"));
+            bam_aux_del(mate[m], bam_aux_get(mate[m], "XG"));
+
+            bam_aux_append(mate[m], "XX", 'Z', kXX[m]->l + 1, (uint8_t*) kXX[m]->s);
+            bam_aux_append(mate[m], "XM", 'Z', kXM[m]->l + 1, (uint8_t*) kXM[m]->s);
+            bam_aux_append(mate[m], "XR", 'Z', 3, (uint8_t*) XR[m]);
+            bam_aux_append(mate[m], "XG", 'Z', 3, (uint8_t*) XG);
+
+            free(kXX[m]->s);
+            free(kXM[m]->s);
+            free(XM[m]);
+            free(XX[m]);
+        }
+
+        //Recalculate MAPQ and replace the XS score
+        for(m=0; m<2; m++) scMin[m] = scoreMin(mate[m]->core.l_qseq);
+        for(m=0; m<2; m++) XS[m] = get_XS(mate[m]);
+        //If only one mate has an XS of at least its minimum score, the other
+        //mate's own AS stands in for its XS
+        if(XS[1] < scMin[1]) {
+            if(XS[0] >= scMin[0]) XS[1] = get_AS(mate[1]);
+        } else if(XS[0] < scMin[0]) {
+            //XS[1] >= scMin[1] is already known to hold in this branch
+            XS[0] = get_AS(mate[0]);
+        }
+
+        //A losing node whose summed score beats the current XS sum replaces it
+        for(k=0; k<n_nodes; k++) {
+            if(k == winner) continue;
+            if(pair_AS[k] > XS[0]+XS[1]) {
+                XS[0] = get_AS(node_reads[k][0]);
+                XS[1] = get_AS(node_reads[k][1]);
+            }
+        }
+
+        MAPQ = calc_MAPQ_BT2(get_AS(mate[0])+get_AS(mate[1]), XS[0]+XS[1], scMin[0]+scMin[1]);
+        //Otherwise, a mapping can get worse but a score better!
+        if(MAPQ >= mate[0]->core.qual) MAPQ = mate[0]->core.qual;
+        mate[0]->core.qual = MAPQ;
+        mate[1]->core.qual = MAPQ;
+
+        //replace/add the XS tag
+        for(m=0; m<2; m++) {
+            if(XS[m] < scMin[m]) continue;
+            if(bam_aux_get(mate[m], "XS")) bam_aux_del(mate[m], bam_aux_get(mate[m], "XS"));
+            bam_aux_append(mate[m], "XS", 'i', 4, (uint8_t*) &XS[m]);
+        }
+    }
+    free(kXX[0]);
+    free(kXX[1]);
+    free(kXM[0]);
+    free(kXM[1]);
+    return best_node;
+}
+
+/*******************************************************************************
+*
+*   Turn a packed read held in a list element into a proper bam1_t and return
+*   a pointer to it.
+*
+*   struct packed_struct *first: first sentinel node
+*   int offset: Use the first (0) or second (any other value) element after
+*               the sentinel
+*
+*   The packed buffer is treated as a bam1_t immediately followed by its
+*   data. A fresh copy is made with bam_copy1, the packed buffer is free()d
+*   and the element is pointed at the copy.
+*
+*   returns a pointer to the new bam1_t read
+*
+*******************************************************************************/
+bam1_t * update_read(struct packed_struct *first, int offset) {
+    bam1_t *copy = bam_init1();
+    struct packed_struct *element = (offset == 0) ? first->next : first->next->next;
+    bam1_t *packed = (bam1_t *) element->packed;
+
+    //The data block sits directly behind the struct in the packed buffer
+    packed->data = (uint8_t *) (packed + 1);
+    bam_copy1(copy, packed);
+    free(packed);
+
+    element->packed = (void *) copy;
+    return copy;
+}
+
+/*******************************************************************************
+*
+*   The master node function. Repeatedly takes the next read (or read pair)
+*   from each worker node's list, selects the best alignment, writes it out
+*   and removes the processed elements, until node1's list is finished.
+*
+*   void *a: Documented as an int* holding the thread_id, but it is not read
+*            in this function; thread_id is fixed to 0.
+*
+*   Always returns NULL. On exit the per-thread metrics are added to the
+*   global counters and the global header and the node lists are destroyed.
+*
+*******************************************************************************/
+void * master_processer_thread(void *a) {
+    int thread_id = 0, best_node, i;
+    //Number of list elements consumed per node and iteration
+    int times = (config.paired) ? 2 : 1;
+    //Buffers for the original sequences of read #1 and read #2
+    char **seq = malloc(sizeof(char *) * 2);
+    *(seq) = malloc(sizeof(char)*MAXREAD);
+    *(seq+1) = malloc(sizeof(char)*MAXREAD);
+    //Per node: [0] is read #1 and [1] is read #2 (only set when paired)
+    bam1_t **node1_read = malloc(sizeof(bam1_t*) * 2);
+    bam1_t **node2_read = malloc(sizeof(bam1_t*) * 2);
+    bam1_t **node3_read = malloc(sizeof(bam1_t*) * 2);
+    bam1_t **node4_read = malloc(sizeof(bam1_t*) * 2);
+    bam1_t **best_read = NULL;
+    time_t now;
+
+    //Metrics
+    metrics_struct *metrics = malloc(sizeof(metrics_struct));
+    metrics->t_reads = 0;
+    metrics->m_reads_OT = 0;
+    metrics->m_reads_OB = 0;
+    metrics->m_reads_CTOT = 0;
+    metrics->m_reads_CTOB = 0;
+    metrics->t_CpG = 0;
+    metrics->m_CpG = 0;
+    metrics->t_CHG = 0;
+    metrics->m_CHG = 0;
+    metrics->t_CHH = 0;
+    metrics->m_CHH = 0;
+
+    //Process read i/o
+    while(1) {
+        //Spin until node1 has an element ready; only node1 is checked for
+        //the finished condition that ends the loop
+        while(!is_ready(node1, 0));
+        if(is_finished(node1)) break;
+        *(node1_read) = update_read(node1, 0);
+        if(config.paired) {
+            while(!is_ready(node1, 1));
+            *(node1_read+1) = update_read(node1, 1);
+        }
+        while(!is_ready(node2, 0));
+        *(node2_read) = update_read(node2, 0);
+        if(config.paired) {
+            while(!is_ready(node2, 1));
+            *(node2_read+1) = update_read(node2, 1);
+        }
+        //Nodes 3 and 4 are only consulted for non-directional libraries.
+        //NOTE(review): in directional mode node3_read/node4_read are never
+        //assigned, yet their contents are still passed to process_single below.
+        if(!config.directional) {
+            while(!is_ready(node3, 0));
+            *node3_read = update_read(node3, 0);
+            if(config.paired) {
+                while(!is_ready(node3, 1));
+                *(node3_read+1) = update_read(node3, 1);
+            }
+            while(!is_ready(node4, 0));
+            *node4_read = update_read(node4, 0);
+            if(config.paired) {
+                while(!is_ready(node4, 1));
+                *(node4_read+1) = update_read(node4, 1);
+            }
+        }
+        //Counts one per iteration, i.e. a pair counts once
+        metrics->t_reads++;
+
+        //Give some output, it's a bit misleading as the count is actually only for this thread and it'll only display for thread 0.
+        if(!config.quiet) {
+            if(thread_id == 0) {
+                if((metrics->t_reads) % 100000 == 0) {
+                    now = time(NULL);
+                    printf("%llu reads %s", metrics->t_reads, ctime(&now)); fflush(stdout);
+                }
+            }
+        }
+
+        //Fetch the sequences handed to process_single/process_paired
+        //(presumably the original fastq reads from zip1/zip2 -- confirm in get_seq)
+        get_seq(*seq, zip1);
+        if(config.paired) get_seq(*(seq+1), zip2);
+
+        //Process the reads
+        if(!config.paired) {
+            best_node = process_single(*node1_read, *node2_read, *node3_read, *node4_read, *seq); //Output is stored in the read of the returned node
+        } else {
+            best_node = process_paired(node1_read, node2_read, node3_read, node4_read, seq); //Output is stored in the reads of the returned node
+        }
+        //Pick the winning node's reads and count it if it is mapped
+        if(best_node == 1) {
+            best_read = node1_read;
+            if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_OT++;
+        } else if(best_node == 2) {
+            best_read = node2_read;
+            if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_OB++;
+        } else if(best_node == 3) {
+            best_read = node3_read;
+            if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_CTOT++;
+        } else if(best_node == 4) {
+            best_read = node4_read;
+            if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_CTOB++;
+        }
+
+        //Store the reads: mapped reads go to OUTPUT_BAM, anything else goes to
+        //the unmapped output when config.unmapped is set
+        if(!config.paired) {
+            if(!((*(best_read))->core.flag & BAM_FUNMAP)) {
+                bam_write1(OUTPUT_BAM, *(best_read));
+                update_counts(*(best_read), metrics);
+            } else {
+                if(config.unmapped) write_unmapped(unmapped1, *(best_read));
+            }
+        } else {
+            //A pair is only written to the BAM file if both mates are mapped
+            if(!((*(best_read))->core.flag & BAM_FUNMAP) && !((*(best_read+1))->core.flag & BAM_FUNMAP)) {
+                bam_write1(OUTPUT_BAM, *(best_read));
+                update_counts(*(best_read), metrics);
+                bam_write1(OUTPUT_BAM, *(best_read+1));
+                update_counts(*(best_read+1), metrics);
+            } else {
+                if(config.unmapped) {
+                    write_unmapped(unmapped1, *(best_read));
+                    write_unmapped(unmapped2, *(best_read+1));
+                }
+            }
+        }
+
+        //Remove the processed reads (one element per mate from every node in use)
+        for(i=0; i<times; i++){
+            remove_element(node1);
+            remove_element(node2);
+            if(!config.directional) {
+                remove_element(node3);
+                remove_element(node4);
+            }
+        }
+    }
+
+    
+
+    //Update the global metrics
+    t_reads += metrics->t_reads;
+    m_reads_OT += metrics->m_reads_OT;
+    m_reads_OB += metrics->m_reads_OB;
+    m_reads_CTOT += metrics->m_reads_CTOT;
+    m_reads_CTOB += metrics->m_reads_CTOB;
+    t_CpG += metrics->t_CpG;
+    m_CpG += metrics->m_CpG;
+    t_CHG += metrics->t_CHG;
+    m_CHG += metrics->m_CHG;
+    t_CHH += metrics->t_CHH;
+    m_CHH += metrics->m_CHH;
+
+    //Clean up. Only the pointer arrays are freed here, not the reads they
+    //pointed at.
+    free(*(seq)); free(*(seq+1)); free(seq);
+    free(metrics);
+    bam_header_destroy(global_header);
+    free(node1_read);
+    free(node2_read);
+    free(node3_read);
+    free(node4_read);
+    destroy_list(node1);
+    destroy_list(node2);
+    destroy_list(node3);
+    destroy_list(node4);
+    return NULL;
+}
diff --git a/mbias.c b/mbias.c
new file mode 100644
index 0000000..e08e9a0
--- /dev/null
+++ b/mbias.c
@@ -0,0 +1,224 @@
+#include "bison.h"
+#include <math.h>
+
+unsigned long long *r1_m[4]; //Read #1 methylated ('Z') counts per read position; index 0=OT, 1=CTOT, 2=OB, 3=CTOB
+unsigned long long *r1_um[4]; //Read #1 unmethylated ('z') counts, same layout as r1_m
+unsigned long long *r2_m[4]; //Read #2 methylated ('Z') counts, same layout as r1_m
+unsigned long long *r2_um[4]; //Read #2 unmethylated ('z') counts, same layout as r1_m
+int min_phred = 10; //Bases with a quality below this are skipped by store_calls(); set with -phred
+
+void store_calls(unsigned long long *m, unsigned long long *um, bam1_t *read, int reversed) {
+    char *meth = bam_aux2Z(bam_aux_get(read, "XM")); //Per-base methylation call string
+    uint8_t *qual = bam1_qual(read);
+    int i, len = (int) strlen(meth); //Hoisted so the loop conditions do not rescan the string
+    //Add the CpG calls of read ('Z' methylated, 'z' unmethylated) to m/um, indexed by sequencing cycle
+    if(!reversed) {
+        for(i=0; i<len; i++) {
+            if(*(qual+i) < min_phred) continue; //Ignore low quality bases
+            if(*(meth+i) == 'Z') *(m+i) += 1;
+            if(*(meth+i) == 'z') *(um+i) += 1;
+        }
+    } else {
+        for(i=len-1; i>=0; i--) { //BAM stores reverse-strand reads reversed: stored base i is cycle len-1-i
+            if(*(qual+i) < min_phred) continue;
+            if(*(meth+i) == 'Z') *(m+len-1-i) += 1;
+            if(*(meth+i) == 'z') *(um+len-1-i) += 1;
+        }
+    }
+}
+
+void usage(char *prog) { //Print the command-line help to stdout; prog is the name shown on the "Usage:" line
+    printf("Usage: %s [OPTIONS] file.bam\n", prog);
+    printf("\n\
+    Compute the methylation percentage as a function of read position for a BAM\n\
+    file. The output can be conveniently plotted with the accompanying\n\
+    plot_mbias.R script\n\
+\n\
+    -phred Minimum Phred score that a base must have for inclusion in the\n\
+           metrics (default 10).\n\
+\n\
+    -q     Read MAPQ value must be at least this for inclusion (default 20).\n\
+           Specify 0 to include everything.\n\
+\n\
+    -pdf   Run the R script to convert the output to pdf format, including\n\
+           recommended inclusion bounds. R must be installed and in your PATH.\n");
+}
+
+int main(int argc, char *argv[]) { //Tally per-position CpG calls of a BAM file into <input stem>_mbias.txt
+    bamFile ifile = NULL;
+    FILE *ofile = NULL;
+    char *prefix = NULL; //Input name at first, later rewritten into the output file name
+    char *p, *XR, *XG;
+    bam1_t *read = bam_init1();
+    bam_header_t *header = NULL;
+    int max_length = 50; //Current number of positions held by each metrics array
+    int paired = 0, reversed = 0, hasComp = 0;
+    int i, j, min_mapq = 20, pdf = 0;
+    unsigned long long treads = 0;
+
+    if(argc < 2) {
+        usage(argv[0]);
+        return 1;
+    }
+    for(i=1; i<argc; i++) {
+        if(strcmp(argv[i], "-phred") == 0) {
+            if(i+1 < argc) min_phred = atoi(argv[++i]); //Never read past the end of argv
+            else printf("-phred requires a value, keeping %i\n", min_phred);
+        } else if(strcmp(argv[i], "-q") == 0) {
+            if(i+1 < argc) min_mapq = atoi(argv[++i]);
+            else printf("-q requires a value, keeping %i\n", min_mapq);
+        } else if(strcmp(argv[i], "-pdf") == 0) {
+            pdf = 1;
+        } else if(strcmp(argv[i], "-h") == 0) {
+            usage(argv[0]);
+            return 0;
+        } else if(prefix == NULL) {
+            ifile = bam_open(argv[i], "r");
+            if(ifile == NULL) { printf("Could not open %s!\n", argv[i]); bam_destroy1(read); return -1; }
+            header = bam_header_read(ifile);
+            prefix = strdup(argv[i]);
+        } else {
+            printf("Unknown option: %s\n", argv[i]);
+            free(prefix);
+            bam_header_destroy(header);
+            bam_close(ifile);
+            usage(argv[0]);
+            return -1;
+        }
+    }
+
+    if(prefix == NULL) {
+        printf("No BAM file specified!\n");
+        usage(argv[0]);
+        return -1;
+    }
+
+    //Create enough space to hold the metrics
+    for(i=0; i<4; i++) {
+        r1_m[i] = calloc(max_length, sizeof(unsigned long long));
+        r1_um[i] = calloc(max_length, sizeof(unsigned long long));
+        r2_m[i] = calloc(max_length, sizeof(unsigned long long));
+        r2_um[i] = calloc(max_length, sizeof(unsigned long long));
+    }
+
+    //Create the output name: replace the extension (if there is one) with _mbias.txt
+    p = strrchr(prefix, '.');
+    if(p != NULL) *p = '\0';
+    prefix = realloc(prefix, sizeof(char) * (strlen(prefix) + strlen("_mbias.txt "))); //The extra space leaves room for the NUL
+    strcat(prefix, "_mbias.txt"); //Not sprintf(prefix, "%s...", prefix): overlapping source and destination is undefined
+    ofile = fopen(prefix, "w");
+    if(ofile == NULL) { printf("Could not open %s for writing!\n", prefix); return -1; }
+
+    while(bam_read1(ifile, read) > 1) {
+        if(++treads % 10000000 == 0) printf("Processed %llu reads\n", treads);
+        if(read->core.qual < min_mapq) continue;
+        if(read->core.flag & BAM_FUNMAP) continue;
+        //Lengthen the output arrays if needed
+        if(read->core.l_qseq > max_length) {
+            for(i=0; i<4; i++) {
+                r1_m[i] = realloc(r1_m[i], read->core.l_qseq * sizeof(unsigned long long));
+                r1_um[i] = realloc(r1_um[i], read->core.l_qseq * sizeof(unsigned long long));
+                r2_m[i] = realloc(r2_m[i], read->core.l_qseq * sizeof(unsigned long long));
+                r2_um[i] = realloc(r2_um[i], read->core.l_qseq * sizeof(unsigned long long));
+                for(j=max_length; j<read->core.l_qseq; j++) { //realloc does not zero the new tail
+                    *(r1_m[i]+j) = 0;
+                    *(r1_um[i]+j) = 0;
+                    *(r2_m[i]+j) = 0;
+                    *(r2_um[i]+j) = 0;
+                }
+            }
+            max_length = read->core.l_qseq;
+        }
+        reversed = (read->core.flag & BAM_FREVERSE) ? 1 : 0;
+        if(bam_aux_get(read, "XR") == NULL || bam_aux_get(read, "XG") == NULL || bam_aux_get(read, "XM") == NULL) { p = bam_format1(header, read); printf("%s\n", p); free(p); continue; } //Report and skip reads lacking the tags
+        XR = bam_aux2Z(bam_aux_get(read, "XR"));
+        XG = bam_aux2Z(bam_aux_get(read, "XG"));
+        if(!(read->core.flag & BAM_FREAD2)) {
+            if(strcmp(XG, "CT") == 0) { //OT or CTOT
+                if(strcmp(XR, "CT") == 0) { //OT
+                    store_calls(r1_m[0], r1_um[0], read, reversed);
+                } else { //CTOT
+                    hasComp = 1;
+                    store_calls(r1_m[1], r1_um[1], read, reversed);
+                }
+            } else {
+                if(strcmp(XR, "CT") == 0) { //OB
+                    store_calls(r1_m[2], r1_um[2], read, reversed);
+                } else { //CTOB
+                    hasComp = 1;
+                    store_calls(r1_m[3], r1_um[3], read, reversed);
+                }
+            }
+        } else {
+            paired = 1;
+            if(strcmp(XG, "CT") == 0) { //OT or CTOT
+                if(strcmp(XR, "GA") == 0) { //OT
+                    store_calls(r2_m[0], r2_um[0], read, reversed);
+                } else { //CTOT
+                    hasComp = 1;
+                    store_calls(r2_m[1], r2_um[1], read, reversed);
+                }
+            } else {
+                if(strcmp(XR, "GA") == 0) { //OB
+                    store_calls(r2_m[2], r2_um[2], read, reversed);
+                } else { //CTOB
+                    hasComp = 1;
+                    store_calls(r2_m[3], r2_um[3], read, reversed);
+                }
+            }
+        }
+    }
+
+    //Output the calls
+    fprintf(ofile, "Strand\tRead\tPosition\tnMethylated\tnUnmethylated\n");
+    for(i=0; i<max_length; i++) { //OT
+        if(r1_m[0][i] > 0 || r1_um[0][i] > 0) fprintf(ofile, "OT\t1\t%i\t%llu\t%llu\n", i+1, r1_m[0][i], r1_um[0][i]);
+        if(paired) {
+            if(r2_m[0][i] > 0 || r2_um[0][i] > 0) fprintf(ofile, "OT\t2\t%i\t%llu\t%llu\n", i+1, r2_m[0][i], r2_um[0][i]);
+        }
+    }
+    for(i=0; i<max_length; i++) { //OB
+        if(r1_m[2][i] > 0 || r1_um[2][i] > 0) fprintf(ofile, "OB\t1\t%i\t%llu\t%llu\n", i+1, r1_m[2][i], r1_um[2][i]);
+        if(paired) {
+            if(r2_m[2][i] > 0 || r2_um[2][i] > 0) fprintf(ofile, "OB\t2\t%i\t%llu\t%llu\n", i+1, r2_m[2][i], r2_um[2][i]);
+        }
+    }
+    if(hasComp) {
+        for(i=0; i<max_length; i++) { //CTOT
+            if(r1_m[1][i] > 0 || r1_um[1][i] > 0) fprintf(ofile, "CTOT\t1\t%i\t%llu\t%llu\n", i+1, r1_m[1][i], r1_um[1][i]);
+            if(paired) {
+                if(r2_m[1][i] > 0 || r2_um[1][i] > 0) fprintf(ofile, "CTOT\t2\t%i\t%llu\t%llu\n", i+1, r2_m[1][i], r2_um[1][i]);
+            }
+        }
+        for(i=0; i<max_length; i++) { //CTOB
+            if(r1_m[3][i] > 0 || r1_um[3][i] > 0) fprintf(ofile, "CTOB\t1\t%i\t%llu\t%llu\n", i+1, r1_m[3][i], r1_um[3][i]);
+            if(paired) {
+                if(r2_m[3][i] > 0 || r2_um[3][i] > 0) fprintf(ofile, "CTOB\t2\t%i\t%llu\t%llu\n", i+1, r2_m[3][i], r2_um[3][i]);
+            }
+        }
+    }
+    printf("Processed %llu reads\n", treads);
+    fclose(ofile);
+    if(pdf) {
+        char *cmd = malloc(sizeof(char) * (strlen("bison_mbias2pdf ") + strlen(prefix) + 1));
+        sprintf(cmd, "bison_mbias2pdf %s", prefix);
+        printf("Executing %s\n", cmd);
+        if(system(cmd) == -1) printf("N.B. an error occured while running bison_mbias2pdf!\n");
+        free(cmd);
+    } else {
+        printf("The output may be converted to PDF with recommended inclusion bounds by running bison_mbias2pdf %s\n", prefix);
+    }
+
+    //Cleanup
+    free(prefix);
+    for(i=0; i<4; i++) {
+        free(r1_m[i]);
+        free(r1_um[i]);
+        free(r2_m[i]);
+        free(r2_um[i]);
+    }
+    bam_header_destroy(header);
+    bam_destroy1(read);
+    bam_close(ifile);
+    return(0);
+}
diff --git a/methylation_extractor.c b/methylation_extractor.c
new file mode 100644
index 0000000..f2b953e
--- /dev/null
+++ b/methylation_extractor.c
@@ -0,0 +1,1001 @@
+#include "bison.h"
+#include "sam.h"
+
+//Global settings: a non-zero storeX enables collecting calls of context X; extractor_process_single() skips bases with quality below min_Phred
+int storeCpG, storeCHG, storeCHH, min_Phred;
+
+//Inclusion bounds per strand: [0]/[1] = start/end index within read #1, [2]/[3] = start/end index within read #2 (0 means "not set")
+int OT[4], OB[4], CTOT[4], CTOB[4];
+
+//This will hold the output file handles, one per methylation context (some of which can be NULL)
+struct of_struct {
+    FILE *CpG; //Handle for the CpG context
+    FILE *CHG; //Handle for the CHG context
+    FILE *CHH; //Handle for the CHH context
+};
+
+//This stores an individual methylation call (filled in by process_call())
+typedef struct {
+    int32_t tid; //Chromosome/contig ID, used as an index into global_header->target_name
+    int32_t start; //Position of the call
+    _Bool strand; //+/- == 1/0
+    unsigned int type; //This would normally be 0 (unmethylated) or 1 (methylated)
+} Site;
+
+//This struct holds one growable array of methylation calls per context; the arrays are sorted and merged later
+typedef struct {
+    Site *CpG; //Calls in CpG context
+    Site *CHG; //Calls in CHG context
+    Site *CHH; //Calls in CHH context
+    int num_CpG; //Number of entries currently used in CpG
+    int max_CpG; //Number of entries allocated for CpG
+    int num_CHG; //Number of entries currently used in CHG
+    int max_CHG; //Number of entries allocated for CHG
+    int num_CHH; //Number of entries currently used in CHH
+    int max_CHH; //Number of entries allocated for CHH
+    int only_CpG; //If set, process_call() discards CHG and CHH calls
+    int only_CHG; //If set, process_call() discards CpG and CHH calls
+    int only_CHH; //If set, process_call() discards CpG and CHG calls
+} Sites;
+
+struct list_struct { //One node per position, accumulating the calls merged into it
+    int32_t tid; //Chromosome/contig ID (-1 and INT_MAX mark the head and tail nodes made by init_list())
+    int32_t pos; //negative positions are - strand, otherwise, + strand
+    unsigned int n_methylated; //Number of methylated calls seen at this position
+    unsigned int n_unmethylated; //Number of unmethylated calls seen at this position
+    struct list_struct *next; //Next node, NULL after the tail
+};
+
+//Linked lists holding the final methylation calls, one per context
+struct list_struct *CpGlist, *CHGlist, *CHHlist;
+
+//Create an empty list: a head sentinel (tid/pos of -1) followed by a tail sentinel (tid/pos of INT_MAX)
+struct list_struct* init_list() {
+    struct list_struct *head = calloc(1, sizeof(struct list_struct));
+    struct list_struct *tail = calloc(1, sizeof(struct list_struct));
+
+    //The tail terminates the list
+    tail->tid = INT_MAX;
+    tail->pos = INT_MAX;
+    tail->next = NULL;
+    //The head is what the caller gets back
+    head->tid = head->pos = -1;
+    head->next = tail;
+    return head;
+}
+
+//Destroy the linked list, freeing every node from list onwards (a NULL list is a no-op)
+void destroy_methyl_list(struct list_struct *list) {
+    struct list_struct *current = list;
+    struct list_struct *next = NULL;
+
+    while(current != NULL) { //Test the node itself so that the last (or only) node is also freed
+        next = current->next; //Grab the successor before the node is released
+        free(current);
+        current = next;
+    }
+}
+
+//Create a node holding a single methylation call, link it in right after "current" and return it
+struct list_struct* insert_call(struct list_struct *current, Site *site) {
+    struct list_struct *node = malloc(sizeof(struct list_struct));
+    int methylated = (site->type) ? 1 : 0;
+
+    //Where the call is: - strand calls are stored with a negated position
+    node->tid = site->tid;
+    node->pos = site->start;
+    if(!site->strand) node->pos = -1 * (site->start);
+
+    //A fresh node carries exactly one observation
+    node->n_methylated = methylated;
+    node->n_unmethylated = 1 - methylated;
+    //Splice the node in between "current" and its old successor
+    node->next = current->next;
+    current->next = node;
+    return node;
+}
+
+/*******************************************************************************
+*
+*  Allocate a Sites structure with room for 1000000 calls per context.
+*  All counters and only_* flags start at 0. Release it with destroy_sites().
+*******************************************************************************/
+Sites* init_sites() {
+    const int capacity = 1000000; //Initial number of slots in each array
+    Sites *sites = malloc(sizeof(Sites));
+
+    //One array per methylation context
+    sites->CpG = malloc(sizeof(Site)*capacity);
+    sites->CHG = malloc(sizeof(Site)*capacity);
+    sites->CHH = malloc(sizeof(Site)*capacity);
+    sites->max_CpG = sites->max_CHG = sites->max_CHH = capacity;
+
+    //Nothing is stored yet and no context restriction is active
+    sites->num_CpG = sites->num_CHG = sites->num_CHH = 0;
+    sites->only_CpG = sites->only_CHG = sites->only_CHH = 0;
+
+    return sites;
+}
+
+/*******************************************************************************
+*
+*  Release a Sites structure: its three call arrays, then the structure.
+*
+*******************************************************************************/
+void destroy_sites(Sites *p) {
+    free(p->CHH);
+    free(p->CHG);
+    free(p->CpG);
+    free(p);
+}
+
+/*******************************************************************************
+*
+*  qsort comparison for two Site entries: chromosome name, then start.
+*
+*******************************************************************************/
+int site_comparison(const void *p1, const void *p2) {
+    Site *lhs = (Site *) p1;
+    Site *rhs = (Site *) p2;
+    char *lhs_name, *rhs_name;
+
+    //Same chromosome: order by position (the difference is 0 for equal positions)
+    if(lhs->tid == rhs->tid) {
+        return lhs->start - rhs->start;
+    }
+
+    //Different chromosomes: order by the names recorded in the BAM header
+    lhs_name = global_header->target_name[lhs->tid];
+    rhs_name = global_header->target_name[rhs->tid];
+
+    return strcmp(lhs_name, rhs_name);
+}
+
+/*******************************************************************************
+*
+*  Sort one call array in place; which: 1 = CpG, 2 = CHG, 3 = CHH, else no-op
+*
+*******************************************************************************/
+void sort_sites(Sites *sites, int which) {
+    Site *calls = (which == 1) ? sites->CpG : (which == 2) ? sites->CHG : sites->CHH;
+    int n = (which == 1) ? sites->num_CpG : (which == 2) ? sites->num_CHG : sites->num_CHH;
+    if(which >= 1 && which <= 3) qsort((void *) calls, (size_t) n, sizeof(Site), site_comparison);
+}
+
+void merge_calls(Sites *sites, int which) { //Fold one call array of sites into its global list; which: 1 = CpG, 2 = CHG, 3 = CHH
+    Site *type; //The array being merged. NOTE(review): left uninitialized if which is not 1, 2 or 3
+    int nsites=0, i=0;
+    struct list_struct *olist=NULL, *current=NULL;
+
+    if(which == 1) {
+        type = sites->CpG;
+        olist = CpGlist;
+        nsites = sites->num_CpG;
+    } else if(which == 2) {
+        type = sites->CHG;
+        olist = CHGlist;
+        nsites = sites->num_CHG;
+    } else if(which == 3) {
+        type = sites->CHH;
+        olist = CHHlist;
+        nsites = sites->num_CHH;
+    }
+
+    //Walk the list from its head; "current" only moves forward (presumably the array was sorted with sort_sites() first - TODO confirm)
+    current = olist;
+    while(i<nsites) {
+        if(current->tid == type[i].tid) { //The node is on the same chromosome as the call
+            if(abs(current->pos) == abs(type[i].start)) { //Same position: add to the existing node
+                if(type[i].type == 1) current->n_methylated++;
+                else current->n_unmethylated++;
+                i++;
+            } else if(abs(current->next->pos) > type[i].start) { //The next node is past the call: insert here
+                current = insert_call(current, type+i);
+                i++;
+            } else {
+                if(current->next->tid == type[i].tid) { //Keep scanning along this chromosome
+                    current = current->next;
+                } else { //The next node is on another chromosome: the call goes last on this one
+                    current = insert_call(current, type+i);
+                    i++;
+                }
+            }
+        } else { //The node is on a different chromosome than the call
+            if(current->next->tid == INT_MAX) { //Only the tail node made by init_list() follows: append
+                current = insert_call(current, type+i);
+                i++;
+            } else if(current->next->tid == type[i].tid) { //Changing chromosomes
+                if(abs(current->next->pos) > type[i].start) {
+                    current = insert_call(current, type+i);
+                    i++;
+                } else {
+                    current = current->next;
+                }
+            } else if(strcmp(global_header->target_name[type[i].tid], global_header->target_name[current->next->tid]) < 0) { //The call's chromosome name sorts before the next node's
+                current = insert_call(current, type+i);
+                i++;
+            } else {
+                current = current->next;
+            }
+        }
+    }
+
+    //Reset the appropriate counter so the array can be refilled
+    if(which == 1) sites->num_CpG = 0;
+    else if(which == 2) sites->num_CHG = 0;
+    else if(which == 3) sites->num_CHH = 0;
+}
+
+
+/*******************************************************************************
+*
+*  Append one methylation call to the matching array in sites (nothing is
+*  stored when an only_* flag excludes it). Return 1 on success, 0 on error.
+*******************************************************************************/
+int process_call(int32_t tid, unsigned int position, char call, Sites *sites, char strand) {
+    Site *slot = NULL;
+    char methylated_call = '\0'; //The letter that marks a methylated call in this context
+
+    switch(call) {
+    case 'Z': case 'z': //CpG (methylated == Z, unmethylated == z)
+        if(sites->only_CHG || sites->only_CHH) return 1;
+        slot = sites->CpG + sites->num_CpG;
+        sites->num_CpG += 1;
+        methylated_call = 'Z';
+        break;
+    case 'H': case 'h': //CHH (methylated == H, unmethylated == h)
+        if(sites->only_CpG || sites->only_CHG) return 1;
+        slot = sites->CHH + sites->num_CHH;
+        sites->num_CHH += 1;
+        methylated_call = 'H';
+        break;
+    case 'X': case 'x': //CHG (methylated == X, unmethylated == x)
+        if(sites->only_CpG || sites->only_CHH) return 1;
+        slot = sites->CHG + sites->num_CHG;
+        sites->num_CHG += 1;
+        methylated_call = 'X';
+        break;
+    default:
+        printf("(1) Got an unknown character in the XM string of a read: %c\n",call);
+        return 0;
+    }
+
+    slot->tid = tid;
+    slot->strand = (strand == '+') ? 1 : 0;
+    slot->start = position;
+    slot->type = (call == methylated_call) ? 1 : 0;
+    return 1;
+}
+
+/*******************************************************************************
+*
+*  Process either a single-end read or a non-overlapping paired-end read,
+*  storing its calls that lie within the strand's inclusion bounds and have a
+*  base quality of at least min_Phred in sites.
+*  Return 1 on success and 0 on error.
+*******************************************************************************/
+int extractor_process_single(bam1_t *read, Sites *sites) {
+    unsigned long long *positions = NULL;
+    char *XM = bam_aux2Z(bam_aux_get(read,"XM")); //Per-base methylation calls
+    char *XR = bam_aux2Z(bam_aux_get(read, "XR")); //Read conversion ("CT" or "GA")
+    char *XG = bam_aux2Z(bam_aux_get(read, "XG")); //Genome conversion ("CT" or "GA")
+    uint8_t *QUAL = bam1_qual(read);
+    char strand = (strcmp(XG, "CT") == 0) ? '+' : '-';
+    char call;
+    int i, start = 0, end = strlen(XM); //These may be overridden
+
+    /***************************************************************************
+    *
+    *  Do we need to increase the size of anything pointed to by sites?
+    *
+    ***************************************************************************/
+    if(storeCpG) {
+        if(sites->num_CpG + 100000 > sites->max_CpG) {
+            sites->CpG = realloc(sites->CpG, (sites->max_CpG+100000)*sizeof(Site));
+            sites->max_CpG += 100000;
+        }
+    }
+    if(storeCHG) {
+        if(sites->num_CHG + 100000 > sites->max_CHG) {
+            sites->CHG = realloc(sites->CHG, (sites->max_CHG+100000)*sizeof(Site));
+            sites->max_CHG += 100000;
+        }
+    }
+    if(storeCHH) {
+        if(sites->num_CHH + 100000 > sites->max_CHH) {
+            sites->CHH = realloc(sites->CHH, (sites->max_CHH+100000)*sizeof(Site));
+            sites->max_CHH += 100000;
+        }
+    }
+
+    positions = calculate_positions(read); //One entry per base; ULLONG_MAX marks bases that are skipped below
+
+    //Should we override "start" and "end"?
+    if(read->core.flag & BAM_FREAD2) { //#2
+        if(strcmp(XR, "GA") == 0 && strcmp(XG, "CT") == 0) { //OT
+            if(OT[2] != 0) start = OT[2];
+            if(OT[3] != 0) {
+                if(end > OT[3]) end = OT[3];
+            }
+        } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "GA") == 0) { //OB
+            if(OB[2] != 0) start = OB[2];
+            if(OB[3] != 0) {
+                if(end > OB[3]) end = OB[3];
+            }
+        } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "CT") == 0) { //CTOT
+            if(CTOT[2] != 0) start = CTOT[2];
+            if(CTOT[3] != 0) {
+                if(end > CTOT[3]) end = CTOT[3];
+            }
+        } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "GA") == 0) { //CTOB (XG must be GA here, otherwise this repeats the CTOT test)
+            if(CTOB[2] != 0) start = CTOB[2];
+            if(CTOB[3] != 0) {
+                if(end > CTOB[3]) end = CTOB[3];
+            }
+        }
+    } else { //#1
+        if(strcmp(XR, "CT") == 0 && strcmp(XG, "CT") == 0) { //OT
+            if(OT[0] != 0) start = OT[0];
+            if(OT[1] != 0) {
+                if(end > OT[1]) end = OT[1];
+            }
+        } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "GA") == 0) { //OB
+            if(OB[0] != 0) start = OB[0];
+            if(OB[1] != 0) {
+                if(end > OB[1]) end = OB[1];
+            }
+        } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "CT") == 0) { //CTOT
+            if(CTOT[0] != 0) start = CTOT[0];
+            if(CTOT[1] != 0) {
+                if(end > CTOT[1]) end = CTOT[1];
+            }
+        } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "GA") == 0) { //CTOB
+            if(CTOB[0] != 0) start = CTOB[0];
+            if(CTOB[1] != 0) {
+                if(end > CTOB[1]) end = CTOB[1];
+            }
+        }
+    }
+
+    for(i=start; i<end; i++) {
+        if(*(positions+i) != ULLONG_MAX) {
+            if(*(XM+i) != '.') { //'.' means there is no call at this base
+                if(*(QUAL+i) < min_Phred) continue;
+                call = *(XM+i);
+                if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                    if(!process_call(read->core.tid, *(positions+i), *(XM+i), sites, strand)) {
+                        printf("(2) Got an unknown character (%i) in the XM string of a single-ended read: %s\n", i, XM);
+                        free(positions);
+                        return 0;
+                    }
+                }
+            }
+        }
+    }
+    free(positions);
+    return 1;
+}
+
+/*******************************************************************************
+*
+*  Process overlapping paired-end reads: calls outside the overlap are stored
+*  as they are, while a position covered by both mates yields at most one call
+*  (none if the two mates disagree).
+*  Return 1 on success and 0 on error.
+*******************************************************************************/
+int extractor_process_overlapping(bam1_t *read1, bam1_t *read2, Sites *sites) {
+    unsigned long long *positions1 = calculate_positions(read1), *positions2 = calculate_positions(read2);
+    char strand = (strcmp(bam_aux2Z(bam_aux_get(read1, "XG")), "CT") == 0) ? '+' : '-';
+    char call;
+    char *XR = bam_aux2Z(bam_aux_get(read1,"XR"));
+    char *XG = bam_aux2Z(bam_aux_get(read1,"XG"));
+    char *XM1 = bam_aux2Z(bam_aux_get(read1,"XM"));
+    char *XM2 = bam_aux2Z(bam_aux_get(read2,"XM"));
+    int i, j, end1 = (int) read1->core.l_qseq, end2 = (int) read2->core.l_qseq;
+    int start1 = 0, start2 = 0;
+
+    /***************************************************************************
+    *
+    *  Do we need to increase the size of anything pointed to by sites?
+    *
+    ***************************************************************************/
+    if(sites->num_CpG + 100000 > sites->max_CpG) {
+        sites->CpG = realloc(sites->CpG, (sites->max_CpG+100000)*sizeof(Site));
+        sites->max_CpG += 100000;
+    }
+    if(sites->num_CHG + 100000 > sites->max_CHG) {
+        sites->CHG = realloc(sites->CHG, (sites->max_CHG+100000)*sizeof(Site));
+        sites->max_CHG += 100000;
+    }
+    if(sites->num_CHH + 100000 > sites->max_CHH) {
+        sites->CHH = realloc(sites->CHH, (sites->max_CHH+100000)*sizeof(Site));
+        sites->max_CHH += 100000;
+    }
+
+    //Should we override start1,start2, end1 and end2?
+    if(strcmp(XR, "CT") == 0 && strcmp(XG, "CT") == 0) { //OT
+        if(OT[0] != 0) start1 = OT[0];
+        if(OT[1] != 0) {
+            if(end1 > OT[1]) end1 = OT[1];
+        }
+        if(OT[2] != 0) start2 = OT[2];
+        if(OT[3] != 0) {
+            if(end2 > OT[3]) end2 = OT[3];
+        }
+    } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "GA") == 0) { //OB
+        if(OB[0] != 0) start1 = OB[0];
+        if(OB[1] != 0) {
+            if(end1 > OB[1]) end1 = OB[1];
+        }
+        if(OB[2] != 0) start2 = OB[2];
+        if(OB[3] != 0) {
+            if(end2 > OB[3]) end2 = OB[3];
+        }
+    } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "CT") == 0) { //CTOT
+        if(CTOT[0] != 0) start1 = CTOT[0];
+        if(CTOT[1] != 0) {
+            if(end1 > CTOT[1]) end1 = CTOT[1];
+        }
+        if(CTOT[2] != 0) start2 = CTOT[2];
+        if(CTOT[3] != 0) {
+            if(end2 > CTOT[3]) end2 = CTOT[3];
+        }
+    } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "GA") == 0) { //CTOB
+        if(CTOB[0] != 0) start1 = CTOB[0];
+        if(CTOB[1] != 0) {
+            if(end1 > CTOB[1]) end1 = CTOB[1];
+        }
+        if(CTOB[2] != 0) start2 = CTOB[2];
+        if(CTOB[3] != 0) {
+            if(end2 > CTOB[3]) end2 = CTOB[3];
+        }
+    }
+    i = start1;
+    XM1 += start1;
+    j = start2;
+    XM2 += start2;
+    while(*(positions1+i) == ULLONG_MAX) { //Skip leading bases without a position
+        i++;
+        start1++;
+        XM1++;
+    }
+    while(*(positions2+j) == ULLONG_MAX) {
+        j++;
+        start2++;
+        XM2++;
+    }
+    while(*(positions1+end1-1) == ULLONG_MAX) end1--; //Likewise for trailing bases
+    while(*(positions2+end2-1) == ULLONG_MAX) end2--;
+
+    /***************************************************************************
+    *
+    *  If there is a 5' overhang when comparing the two sequences, then we
+    *  should process that first before dealing with the overlap.
+    *
+    ***************************************************************************/
+    if(*positions1 < *positions2) {
+        while(*(positions1+i) < *positions2) {
+            if(*(positions1+i) != ULLONG_MAX) {
+                if(*XM1 != '.') {
+                    call = *XM1;
+                    if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                        if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, strand)) {
+                            printf("(3) Got an unknown character in the XM string: %s\n", XM1);
+                            free(positions1); free(positions2); return 0; //Do not leak the position arrays on error
+                        }
+                    }
+                }
+            }
+            i++;
+            XM1++;
+            if(i == end1) break;
+        }
+    } else if(*positions2 < *positions1) {
+        while(*(positions2+j) < *positions1) {
+            if(*(positions2+j) != ULLONG_MAX) {
+                if(*XM2 != '.') {
+                    call = *XM2;
+                    if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                        if(!process_call(read2->core.tid, *(positions2+j), *XM2, sites, strand)) {
+                            printf("(4) Got an unknown character in the XM string: %s\n", XM2);
+                            free(positions1); free(positions2); return 0;
+                        }
+                    }
+                }
+            }
+            j++;
+            XM2++;
+            if(j == end2) break;
+        }
+    }
+
+    //We are now up to the overlapping section
+    while((i<end1) && (j<end2)) {
+        //Ensure we're on the same position
+        if(*(positions1+i) != ULLONG_MAX && *(positions2+j) != ULLONG_MAX) {
+            //Deal with InDels at the beginning of homopolymer repeats which may be different in the reads
+            if(*(positions1+i) != *(positions2+j)) {
+                if(*(positions1+i) < *(positions2+j)) {
+                    if(*XM1 != '.') {
+                        call = *XM1;
+                        if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                            if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, strand)) {
+                                printf("(5a) Got an unknown character in the XM string: %s\n", XM1);
+                                free(positions1); free(positions2); return 0;
+                            }
+                        }
+                    }
+                    XM1++;
+                    i++;
+                } else {
+                    if(*XM2 != '.') {
+                        call = *XM2;
+                        if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                            if(!process_call(read2->core.tid, *(positions2+j), *XM2, sites, strand)) {
+                                printf("(5b) Got an unknown character in the XM string: %s\n", XM2);
+                                free(positions1); free(positions2); return 0;
+                            }
+                        }
+                    }
+                    XM2++;
+                    j++;
+                }
+                continue;
+            }
+
+            if(*XM1 == *XM2) { //Both mates agree, so store the call once
+                if(*XM1 != '.') {
+                    call = *XM1;
+                    if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                        if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, strand)) {
+                            printf("(6) Got an unknown character in the XM string: %s\n", XM1);
+                            free(positions1); free(positions2); return 0;
+                        }
+                    }
+                }
+            } else { //bison will call '.' if there is an N in a read or an impossible conversion, so whichever read has a call is correct (the call becomes '.' if the reads have different calls)
+                if(*XM2 != '.' && *XM1 == '.') {
+                    call = *XM2;
+                    if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                        if(!process_call(read2->core.tid, *(positions2+j), *XM2, sites, strand)) {
+                            printf("(7) Got an unknown character in the XM string: %s\n", XM2);
+                            free(positions1); free(positions2); return 0;
+                        }
+                    }
+                } else if(*XM1 != '.' && *XM2 == '.') {
+                    call = *XM1;
+                    if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                        if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, strand)) {
+                            printf("(8) Got an unknown character in the XM string: %s\n", XM1);
+                            free(positions1); free(positions2); return 0;
+                        }
+                    }
+                }
+            }
+            XM1++;
+            XM2++;
+            i++;
+            j++;
+        } else { //At least one mate has no position here, step over it
+            if(*(positions1+i) == ULLONG_MAX) {
+                XM1++;
+                i++;
+            }
+            if(*(positions2+j) == ULLONG_MAX) {
+                XM2++;
+                j++;
+            }
+        }
+    }
+
+    if(i >= end1 && j >= end2) { //Both mates are finished
+        free(positions1);
+        free(positions2);
+        return 1;
+    }
+    if(i >= end1) { //Only read2 has bases left past the overlap
+        while(j<end2) {
+            if(*(positions2+j) != ULLONG_MAX) {
+                if(*XM2 != '.') {
+                    call = *XM2;
+                    if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                        if(!process_call(read2->core.tid, *(positions2+j), *XM2, sites, strand)) {
+                            printf("(12) Got an unknown character in the XM string: %s\n", XM2);
+                            free(positions1);
+                            free(positions2);
+                            return 0;
+                        }
+                    }
+                }
+            }
+            XM2++;
+            j++;
+        }
+    } else { //Only read1 has bases left past the overlap
+        while(i<end1) {
+            if(*(positions1+i) != ULLONG_MAX) {
+                if(*XM1 != '.') {
+                    call = *XM1;
+                    if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) {
+                        if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, strand)) {
+                            printf("(13) Got an unknown character in the XM string: %s\n", XM1);
+                            free(positions1);
+                            free(positions2);
+                            return 0;
+                        }
+                    }
+                }
+            }
+            XM1++;
+            i++;
+        }
+    }
+    free(positions1);
+    free(positions2);
+    return 1;
+}
+
+//Remove a methylation call on the 5' end of a read when the reference there
+//is an MspI (CCGG) or TaqI (TCGA) recognition site.
+//
+//  bam1_t *read: the read whose XM auxiliary string is edited in place
+//  int digest_types: bit 0 (value 1) enables MspI, bit 1 (value 2) enables TaqI
+//
+//Two candidate site positions are examined (i = 0 and 1). On the first match
+//the call at XM[2-i] is replaced by '.' and the search stops.
+void trim5(bam1_t *read, int digest_types) {
+    int MspI = digest_types & 1;
+    int TaqI = digest_types & 2;
+    unsigned long long offset = genome_offset(lookup_chrom(read), read->core.pos);
+    char *sequence;
+    char *XM = bam_aux2Z(bam_aux_get(read, "XM"));
+    int i;
+
+    for(i=0; i<2; i++) {
+        //NOTE(review): core.pos is passed to genome_offset() and added again here - confirm this is intended
+        sequence = chromosomes.genome+offset+read->core.pos-i;
+        //Only the 4 bases of the site are compared. The genome is one long
+        //string, so comparing it in full against a 4-mer could never match.
+        if((MspI && strncmp(sequence, "CCGG", 4) == 0) || (TaqI && strncmp(sequence, "TCGA", 4) == 0)) {
+            *(XM+2-i) = '.';
+            break;
+        }
+    }
+}
+
+//Remove a methylation call on the 3' end of a read when the reference there
+//is an MspI (CCGG) or TaqI (TCGA) recognition site.
+//
+//  bam1_t *read: the read whose XM auxiliary string is edited in place
+//  int digest_types: bit 0 (value 1) enables MspI, bit 1 (value 2) enables TaqI
+//
+//Two candidate site positions are examined (i = 0 and 1), measured back from
+//the alignment end returned by bam_calend(). On the first match the call at
+//XM[len-2-i] is replaced by '.' and the search stops.
+void trim3(bam1_t *read, int digest_types) {
+    int MspI = digest_types & 1;
+    int TaqI = digest_types & 2;
+    unsigned long long offset = genome_offset(lookup_chrom(read), read->core.pos);
+    char *sequence;
+    char *XM = bam_aux2Z(bam_aux_get(read, "XM"));
+    uint32_t end = bam_calend(&(read->core), bam1_cigar(read));
+    int i, len = strlen(XM);
+
+    for(i=0; i<2; i++) {
+        //NOTE(review): offset already derives from core.pos and "end" is a reference coordinate - confirm the sum is intended
+        sequence = chromosomes.genome+offset+end-2-i;
+        //Only the 4 bases of the site are compared. The genome is one long
+        //string, so comparing it in full against a 4-mer could never match.
+        if((MspI && strncmp(sequence, "CCGG", 4) == 0) || (TaqI && strncmp(sequence, "TCGA", 4) == 0)) {
+            *(XM+len-2-i) = '.';
+            break;
+        }
+    }
+}
+
+//Apply restriction-site trimming to a read (and to its mate when read1 is
+//flagged as paired). The XG tag of read1 selects the end that is trimmed: a
+//value starting with "CT" uses trim3(), anything else uses trim5().
+//
+//  bam1_t *read1: first (or only) read
+//  bam1_t *read2: mate, only touched when read1 has BAM_FPAIRED set
+//  int digest_types: enzyme bit mask handed through to trim3()/trim5()
+void process_RRBS_read(bam1_t *read1, bam1_t *read2, int digest_types) {
+    char *conversion = bam_aux2Z(bam_aux_get(read1, "XG"));
+    void (*trim)(bam1_t *, int) = trim5; //OB or CTOB
+
+    if(strncmp(conversion, "CT", 2) == 0) trim = trim3; //OT or CTOT
+
+    trim(read1, digest_types);
+    if(read1->core.flag & BAM_FPAIRED) {
+        trim(read2, digest_types);
+    }
+}
+
+//Write the merged methylation metrics of one context as bedGraph lines.
+//
+//  struct of_struct *of: holds the output FILE pointers
+//  int which: 1 = CpG, 2 = CHG, 3 = CHH
+//
+//Each line holds: chromosome name, position, position+1, the methylated
+//fraction scaled by 1000 and truncated to an int, the methylated count and
+//the unmethylated count. Iteration starts after the first list element and
+//stops at the element whose next pointer is NULL, which is not written.
+void write_sites(struct of_struct *of, int which) {
+    struct list_struct *list = NULL;
+    int mpercent;
+    FILE *f = NULL;
+
+    if(which == 1) {
+        list = CpGlist->next;
+        f = of->CpG;
+    } else if(which == 2) {
+        list = CHGlist->next;
+        f = of->CHG;
+    } else if(which == 3) {
+        list = CHHlist->next;
+        f = of->CHH;
+    }
+
+    //An unknown context or an output file that could not be opened leaves
+    //nothing to iterate over or to write to
+    if(list == NULL || f == NULL) {
+        printf("Unable to write the sites for context %i\n", which);
+        return;
+    }
+
+    while(list->next != NULL) {
+        mpercent = (int) (1000 * ((float) list->n_methylated)/(float)(list->n_methylated + list->n_unmethylated));
+        //abs() is applied because stored positions may be negative (presumably a strand marker - confirm)
+        fprintf(f, "%s\t%u\t%u\t%i\t%u\t%u\n", global_header->target_name[list->tid], \
+            abs(list->pos), abs(list->pos)+1, mpercent, list->n_methylated, list->n_unmethylated);
+        list = list->next;
+    }
+}
+
+//Generate output file names and open them for writing
+//
+//  char *ifile: input file name; a trailing ".sam" or ".bam" is stripped to
+//               form the base name (the string itself is not modified)
+//  struct of_struct *of: receives the opened FILE pointers
+//
+//One file is opened per enabled context (storeCpG/storeCHG/storeCHH), named
+//<base>_CpG.bedGraph, <base>_CHG.bedGraph and <base>_CHH.bedGraph. A file
+//that cannot be opened is reported and its pointer is left NULL.
+void generate_output_names(char *ifile, struct of_struct *of) {
+    char *p, *tmp = strdup(ifile);
+    char *oname = NULL;
+
+    if(tmp == NULL) {
+        printf("Could not allocate memory for the output file names!\n");
+        return;
+    }
+
+    //Generate the basename by stripping off .sam or .bam
+    //strrchr() returns NULL when the name contains no '.' at all
+    p = strrchr(tmp, '.');
+    if(p != NULL && (strcmp(p, ".sam") == 0 || strcmp(p, ".bam") == 0)) *p = '\0';
+
+    //All three suffixes have the same length; +1 for the terminating NUL
+    oname = malloc(sizeof(char) * (strlen(tmp) + strlen("_CpG.bedGraph") + 1));
+    if(oname == NULL) {
+        printf("Could not allocate memory for the output file names!\n");
+        free(tmp);
+        return;
+    }
+
+    if(storeCpG) {
+        sprintf(oname, "%s_CpG.bedGraph", tmp);
+        printf("CpG counts will be written to %s\n", oname);
+        of->CpG = fopen(oname, "w");
+        if(of->CpG == NULL) printf("Could not open %s for writing!\n", oname);
+    }
+    if(storeCHG) {
+        sprintf(oname, "%s_CHG.bedGraph", tmp);
+        printf("CHG counts will be written to %s\n", oname);
+        of->CHG = fopen(oname, "w");
+        if(of->CHG == NULL) printf("Could not open %s for writing!\n", oname);
+    }
+    if(storeCHH) {
+        sprintf(oname, "%s_CHH.bedGraph", tmp);
+        printf("CHH counts will be written to %s\n", oname);
+        of->CHH = fopen(oname, "w");
+        if(of->CHH == NULL) printf("Could not open %s for writing!\n", oname);
+    }
+
+    free(tmp);
+    free(oname);
+}
+
+//Parse a comma-separated list of up to four integers into bounds[]
+//
+//  char *str: text such as "0,90,20,0"; it is split in place by strtok()
+//  int bounds[4]: receives the parsed values in order
+//
+//Entries for which no field is present are left untouched, and parsing stops
+//once four values have been stored.
+void fill_bounds(char *str, int bounds[4]) {
+    int filled = 0;
+    char *field;
+
+    for(field = strtok(str, ","); field != NULL; field = strtok(NULL, ",")) {
+        bounds[filled] = atoi(field);
+        filled++;
+        if(filled == 4) break; //leave any remaining text unparsed
+    }
+}
+
+//Print the command line help to stdout
+//
+//  char *prog: program name shown in the "Usage:" line
+//
+//The defaults quoted in the text mirror the values main() assigns before
+//parsing the command line (MAPQ 10, Phred 10, 50,000 stored sites).
+void usage(char *prog) {
+    printf("Usage: %s OPTIONS genome_directory input.(sam|bam)\n", prog);
+    printf("\n\
+    Extract methylation information into a bedGraph file or files. By default,\n\
+    only CpG metrics are output\n\
+\n\
+    -h            Print this message.\n\
+\n\
+    -q            Read MAPQ value must be at least this for inclusion (default 10).\n\
+                  Specify 0 to include everything.\n\
+\n\
+    -phred        Minimum Phred score that a base must have for its methylation\n\
+                  state to be included in the output. The default is 10.\n\
+\n\
+    --MspI        Library was MspI digested.\n\
+\n\
+    --TaqI        Library was TaqI digested (this can be in addition to\n\
+                  MspI digestion).\n\
+\n\
+    -no_CpG       Don't output CpG sites (they're output by default).\n\
+\n\
+    -CHH          Output CHH statistics.\n\
+\n\
+    -CHG          Output CHG statistics.\n\
+\n\
+    -OT           Bounds for the region of reads mapped to the original top\n\
+                  strand to include. It is highly recommended that bison_mbias\n\
+                  and/or bison_mbias2pdf be run so that approximate bounds can\n\
+                  be generated for this. The format is \"-OT A,B,C,D\", where \"A\"\n\
+                  is the 5'-most and \"B\" the 3'-most bound of the included\n\
+                  region for read #1. \"C\" and \"D\" are the equivalent bounds for\n\
+                  read #2. A value of 0 means to leave that portion of the read\n\
+                  unbound (e.g., \"-OT 0,90,20,0\" will not include methylation\n\
+                  calls after the 90th base on read #1 or before the 20th base\n\
+                  on read #2). The default is \"-OT 0,0,0,0\", meaning that all\n\
+                  methylation calls are included.\n\
+\n\
+    -OB           Like -OT, but for reads mapping to the original bottom strand.\n\
+\n\
+    -CTOT         Like -OT, but for reads mapping to the complementary to\n\
+                  original top strand.\n\
+\n\
+    -CTOB         Like -OT, but for reads mapping to the complementary to\n\
+                  original bottom strand.\n\
+\n\
+    -max-sites-size N    This option can increase or decrease memory\n\
+                  requirements by changing the number of methylation calls\n\
+                  stored in memory prior to sorting and merging. The default is\n\
+                  50,000.\n");
+}
+
+//Return the value that follows the option at argv[*i] and advance *i onto it.
+//If the option is the last thing on the command line, print a message and
+//return NULL instead of reading past the end of argv.
+static char *option_value(int argc, char *argv[], int *i) {
+    if(*i + 1 >= argc) {
+        printf("Option %s requires a value\n", argv[*i]);
+        return NULL;
+    }
+    *i += 1;
+    return argv[*i];
+}
+
+//Entry point of the methylation extractor.
+//
+//Parses the command line, opens the SAM/BAM input and the bedGraph outputs,
+//reads the genome, then walks through the alignments: duplicates and reads
+//below the MAPQ threshold are skipped, RRBS trimming is applied when a digest
+//was specified, and calls are collected per read or per overlapping pair. The
+//collected calls are sorted and merged whenever a context reaches
+//max_sites_size entries and once more at the end, before being written out.
+//
+//Returns 0 on success (or after -h), 1 for command line or input errors and
+//-1 if the genome buffer cannot be allocated.
+int main(int argc, char *argv[]) {
+    int i, max_sites_size = 50000, MspI = 0, TaqI = 0;
+    int min_MAPQ = 10;
+    size_t len;
+    char *value = NULL, *ifile = NULL;
+    samfile_t *fp = NULL;
+    bam1_t *read1 = bam_init1(), *read2 = bam_init1();
+    struct of_struct *of = calloc(1, sizeof(struct of_struct));
+    unsigned int r1_pos = 0, r2_pos = 0, total_reads = 0;
+    Sites *sites = init_sites();
+
+    CpGlist = init_list();
+    CHGlist = init_list();
+    CHHlist = init_list();
+    config.genome_dir = NULL;
+    chromosomes.nchromosomes = 0;
+    storeCpG = storeCHG = storeCHH = 0;
+    storeCpG = 1;
+    min_Phred = 10;
+    for(i=0; i<4; i++) OT[i] = OB[i] = CTOT[i] = CTOB[i] = 0;
+
+    /* read in the file names */
+    if(argc < 3) {
+        usage(argv[0]);
+        return 1;
+    }
+    for(i=1; i<argc; i++) {
+        if(strcmp(argv[i], "-no_CpG") == 0) {
+            storeCpG = 0;
+        } else if(strcmp(argv[i], "-CHG") == 0) {
+            storeCHG = 1;
+        } else if(strcmp(argv[i], "-CHH") == 0) {
+            storeCHH = 1;
+        } else if(strcmp(argv[i], "--MspI") == 0) {
+            MspI = 1;
+        } else if(strcmp(argv[i], "--TaqI") == 0) {
+            TaqI = 1;
+        } else if(strcmp(argv[i], "-h") == 0) {
+            usage(argv[0]);
+            return 0;
+        } else if(strcmp(argv[i], "-q") == 0) {
+            if((value = option_value(argc, argv, &i)) == NULL) return 1;
+            min_MAPQ = atoi(value);
+        } else if(strcmp(argv[i], "-phred") == 0) {
+            if((value = option_value(argc, argv, &i)) == NULL) return 1;
+            min_Phred = atoi(value);
+        } else if(strcmp(argv[i], "-max-sites-size") == 0) {
+            if((value = option_value(argc, argv, &i)) == NULL) return 1;
+            max_sites_size = atoi(value);
+        } else if (strcmp(argv[i], "-OT") == 0) {
+            if((value = option_value(argc, argv, &i)) == NULL) return 1;
+            fill_bounds(value, OT);
+        } else if (strcmp(argv[i], "-OB") == 0) {
+            if((value = option_value(argc, argv, &i)) == NULL) return 1;
+            fill_bounds(value, OB); //previously stored in OT by mistake
+        } else if (strcmp(argv[i], "-CTOT") == 0) {
+            if((value = option_value(argc, argv, &i)) == NULL) return 1;
+            fill_bounds(value, CTOT);
+        } else if (strcmp(argv[i], "-CTOB") == 0) {
+            if((value = option_value(argc, argv, &i)) == NULL) return 1;
+            fill_bounds(value, CTOB);
+        } else if(config.genome_dir == NULL) {
+            config.genome_dir = argv[i];
+        } else if(fp == NULL) {
+            ifile = argv[i];
+            len = strlen(ifile);
+            //A 'b' three characters from the end (e.g., ".bam") selects BAM input
+            if(len >= 3 && ifile[len-3] == 'b') {
+                fp = samopen(ifile, "rb", NULL);
+            } else {
+                fp = samopen(ifile, "r", NULL);
+            }
+            if(fp == NULL) {
+                printf("Could not open %s for reading!\n", ifile);
+                return 1;
+            }
+            global_header = fp->header;
+        } else {
+            printf("Unknown parameter %s\n", argv[i]);
+            usage(argv[0]);
+            return 1;
+        }
+    }
+
+    if(config.genome_dir == NULL || fp == NULL) {
+        printf("Genome directory or SAM/BAM input file not specified!\n");
+        usage(argv[0]);
+        return 1;
+    }
+
+    //Generate the output names and open the output files
+    //The recorded input name is used, since options may follow it on the command line
+    generate_output_names(ifile, of);
+
+    //Read in the genome
+    chromosomes.max_genome = 3000000000;
+    printf("Allocating space for %llu characters\n", chromosomes.max_genome); fflush(stdout);
+    chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome);
+    if(chromosomes.genome == NULL) {
+        printf("Could not allocate enough room to hold the genome!\n");
+        return -1;
+    }
+    *chromosomes.genome = '\0';
+    read_genome();
+
+    //Process the reads
+    while(samread(fp, read1) > 1) {
+        if(read1->core.flag & BAM_FPAIRED) {
+            samread(fp, read2);
+            r1_pos = read1->core.pos+1;
+            r2_pos = read2->core.pos+1;
+        } else {
+            r1_pos = read1->core.pos+1;
+            r2_pos = INT_MAX; //marks "no mate" for the overlap tests below
+        }
+        if(read1->core.flag & BAM_FDUP) continue;
+        if(read1->core.qual < min_MAPQ) continue;
+        if(TaqI+MspI) process_RRBS_read(read1, read2, MspI+2*TaqI);
+
+        //Are the reads even overlapping? If not, this is easy.
+        if(r2_pos == INT_MAX) { //Unpaired read
+            if(!extractor_process_single(read1, sites)) { printf("Error!\n"); break; }
+        } else if(r1_pos < r2_pos && r1_pos + read1->core.l_qseq - 1 < r2_pos) { //No Overlap
+            if(!extractor_process_single(read1, sites)) { printf("Error!\n"); break; }
+            if(!extractor_process_single(read2, sites)) { printf("Error!\n"); break; }
+        } else if(r2_pos < r1_pos && r2_pos + read2->core.l_qseq - 1 < r1_pos) { //No Overlap
+            if(!extractor_process_single(read1, sites)) { printf("Error!\n"); break; }
+            if(!extractor_process_single(read2, sites)) { printf("Error!\n"); break; }
+        } else { //Overlap
+            if(!extractor_process_overlapping(read1, read2, sites)) { printf("Error!\n"); break; }
+        }
+
+        //Sort and merge a context once it has accumulated max_sites_size calls
+        if(sites->num_CpG >= max_sites_size) {
+            sort_sites(sites, 1);
+            merge_calls(sites, 1);
+        }
+        if(sites->num_CHG >= max_sites_size) {
+            sort_sites(sites, 2);
+            merge_calls(sites, 2);
+        }
+        if(sites->num_CHH >= max_sites_size) {
+            sort_sites(sites, 3);
+            merge_calls(sites, 3);
+        }
+
+        total_reads++;
+        if(total_reads % 1000000 == 0) {
+            printf("Processed %u reads\n", total_reads);
+            fflush(stdout);
+        }
+    }
+    if(samread(fp, read1) > 1) {
+        printf("We must have exited on an error as there are still reads left\n");
+        fflush(stdout);
+    }
+
+    //Do the final sort and merge
+    if(sites->num_CpG) {
+        sort_sites(sites, 1);
+        merge_calls(sites, 1);
+    }
+    if(sites->num_CHG) {
+        sort_sites(sites, 2);
+        merge_calls(sites, 2);
+    }
+    if(sites->num_CHH) {
+        sort_sites(sites, 3);
+        merge_calls(sites, 3);
+    }
+
+    //Write output
+    if(storeCpG) write_sites(of, 1);
+    if(storeCHG) write_sites(of, 2);
+    if(storeCHH) write_sites(of, 3);
+
+    //Close things up
+    if(of->CpG != NULL) fclose(of->CpG);
+    if(of->CHG != NULL) fclose(of->CHG);
+    if(of->CHH != NULL) fclose(of->CHH);
+    free(of);
+    free(chromosomes.genome);
+    for(i=0; i<chromosomes.nchromosomes; i++) {
+        free((chromosomes.chromosome[i])->chrom);
+        free(*(chromosomes.chromosome+i));
+    }
+    free(chromosomes.chromosome);
+    destroy_methyl_list(CpGlist);
+    destroy_methyl_list(CHGlist);
+    destroy_methyl_list(CHHlist);
+    destroy_sites(sites);
+    bam_destroy1(read1);
+    bam_destroy1(read2);
+    samclose(fp);
+
+    return 0;
+}
diff --git a/slurp.c b/slurp.c
new file mode 100644
index 0000000..e2c0222
--- /dev/null
+++ b/slurp.c
@@ -0,0 +1,318 @@
+#include "bison.h"
+
+/******************************************************************************
+*
+*   Append a packed read to a linked-list, directly in front of the last
+*   sentinel. The element that used to be at the end has its state set to 1;
+*   the new element starts with state 0.
+*
+*   struct packed_struct *last: last sentinel struct of the list
+*   void *packed: a packed read; the pointer is stored, not copied
+*
+*******************************************************************************/
+void add_element(struct packed_struct *last, void *packed) {
+    struct packed_struct *node = malloc(sizeof(struct packed_struct));
+    struct packed_struct *old_tail = last->previous;
+
+    //Fill in the new node completely before linking it
+    node->packed = packed;
+    node->next = last;
+    node->previous = old_tail;
+    node->state = 0;
+
+    //Link it in: sentinel first, then the former tail
+    last->previous = node;
+    old_tail->next = node;
+
+    //Flag the former tail only once its successor is in place
+    old_tail->state = 1;
+}
+
+/******************************************************************************
+*
+*   Free a (typically already unlinked) list element together with the read
+*   it holds. The packed pointer is treated as a bam1_t: its data buffer and
+*   the struct itself are both released.
+*
+*   struct packed_struct *remove: element to destroy
+*
+*******************************************************************************/
+inline void destroy_element(struct packed_struct *remove) {
+    bam1_t *read = remove->packed;
+
+    if(read != NULL) {
+        free(read->data); //free(NULL) is a no-op, so no separate check is needed
+        free(read);
+    }
+    free(remove);
+}
+
+/******************************************************************************
+*
+*   Unlink and destroy the element at the start of a linked-list
+*   is_ready(first, 0) must return 1!
+*
+*   Only first->next is updated; the previous pointer of the element that
+*   moves to the front is left as it was.
+*
+*   struct packed_struct *first: first sentinel struct
+*
+*******************************************************************************/
+void remove_element(struct packed_struct *first) {
+    struct packed_struct *head = first->next;
+
+    first->next = head->next;
+    destroy_element(head);
+}
+
+
+/******************************************************************************
+*
+*   Report whether the first or the second element of a list has state 1
+*
+*   struct packed_struct *first: first sentinel struct
+*   int offset: 0 selects the first element, any other value the second
+*
+*   returns 1 if the selected element is ready, 0 otherwise
+*
+*******************************************************************************/
+inline int is_ready(struct packed_struct *first, int offset) {
+    struct packed_struct *candidate = first->next;
+
+    if(offset != 0) candidate = candidate->next;
+    return (candidate->state == 1) ? 1 : 0;
+}
+
+/******************************************************************************
+*
+*   Has the linked list been finished? This is the case when the element
+*   directly after the first sentinel carries no packed read.
+*
+*   struct packed_struct *first: first sentinel struct
+*
+*   returns 1 for finished, 0 otherwise
+*
+*******************************************************************************/
+inline int is_finished(struct packed_struct *first) {
+    return (first->next->packed == NULL) ? 1 : 0;
+}
+
+/******************************************************************************
+*
+*   Append a "finished" marker to a linked list: an element without a packed
+*   read whose state is already 1. The element previously at the end is set
+*   to state 1 as well and, when config.paired is set, so is the one before it.
+*
+*   struct packed_struct *last: last sentinel struct of the targeted list
+*
+*******************************************************************************/
+void add_finished(struct packed_struct *last) {
+    struct packed_struct *marker = malloc(sizeof(struct packed_struct));
+    struct packed_struct *old_tail = last->previous;
+
+    //The marker holds no read and, unlike a regular element, no back pointer
+    marker->packed = NULL;
+    marker->next = last;
+    marker->previous = NULL;
+    marker->state = 1;
+
+    //Link it in: sentinel first, then the former tail
+    last->previous = marker;
+    old_tail->next = marker;
+
+    //Nothing more will arrive, so release whatever was still waiting
+    old_tail->state = 1;
+    if(config.paired) {
+        old_tail->previous->state = 1;
+    }
+}
+
+/******************************************************************************
+*
+*   Initialize an empty linked list made of a first and a last sentinel
+*
+*   struct packed_struct *first: ignored on input; it is overwritten with the
+*                                newly allocated first sentinel
+*
+*   returns the first sentinel struct. The last sentinel is reachable through
+*   its next pointer.
+*
+*******************************************************************************/
+struct packed_struct *initialize_list(struct packed_struct *first) {
+    struct packed_struct *tail = NULL;
+
+    first = malloc(sizeof(struct packed_struct));
+    tail = malloc(sizeof(struct packed_struct));
+
+    //First sentinel: its previous pointer refers to itself
+    first->next = tail;
+    first->previous = first;
+    first->packed = NULL;
+
+    //Last sentinel: its next pointer refers to itself
+    tail->next = tail;
+    tail->previous = first;
+    tail->packed = NULL;
+
+    //Neither sentinel should ever be reported as ready
+    tail->state = 0;
+    first->state = 0;
+    return first;
+}
+
+/******************************************************************************
+*
+*   Destroy a linked list of packed_structs, including both sentinels
+*
+*   struct packed_struct *first: first sentinel of the list to destroy
+*
+*******************************************************************************/
+void destroy_list(struct packed_struct *first) {
+    //The last sentinel is the only element whose next pointer refers to itself
+    for(;;) {
+        struct packed_struct *head = first->next;
+        if(head->next == head) break;
+        remove_element(first);
+    }
+    free(first->next);
+    free(first);
+}
+
+/******************************************************************************
+*
+*   The MPI receiver thread on the main node
+*
+*   Without DEBUG: tells every worker rank to start, receives the packed BAM
+*   header from rank 1, writes it to OUTPUT_BAM and then queues every message
+*   with tag 5 on the list belonging to the sending rank until each worker
+*   has sent its end-of-data message.
+*
+*   With DEBUG: no MPI is used. The per-strand BAM files written earlier are
+*   read back and their reads are queued on the same lists.
+*
+*   void *a: NULL input (unused)
+*
+*   returns NULL
+*
+*******************************************************************************/
+void *slurp(void *a) {
+    time_t t0, t1;
+#ifndef DEBUG
+    void *p = NULL;
+    int nnodes = (config.directional) ? 2 : 4; //number of workers that must report completion
+    int nfinished = 0;
+    int source = 0;
+    int size = 0;
+    struct packed_struct *target_node = NULL;
+    MPI_Status status;
+    int start = 1;
+    int ntasks = (config.directional) ? 3: 5; //workers plus this rank
+    int i;
+    //Tell ranks 1..ntasks-1 to start (tag 0)
+    for(i=1; i<ntasks; i++) {
+        //Note: only the printf is conditional, the fflush always runs
+        if(!config.quiet) printf("Sending start to node %i\n", i); fflush(stdout);
+        MPI_Ssend(&start, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
+    }
+    //Get the header: rank 1 sends its size (tag 1) followed by the packed bytes (tag 2)
+    if(MPI_Recv((void *) &size, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &status) != MPI_SUCCESS) {
+        printf("Received an error when trying to receive header size.\n");
+        fflush(stdout);
+        quit(3, -2);
+    }
+    p = malloc((size_t) size);
+    if(MPI_Recv(p, size, MPI_BYTE, 1, 2, MPI_COMM_WORLD, &status) != MPI_SUCCESS) {
+        printf("Received an error when trying to receive header.\n");
+        fflush(stdout);
+        quit(3, -2);
+    }
+    global_header = bam_header_init();
+    unpack_header(global_header, p);
+    free(p);
+#else
+    //Buffer sized for the longest suffix ("_CTOT.bam") and reused for all four names
+    char *iname = malloc(sizeof(char) * (1+strlen(config.odir)+strlen(config.basename)+strlen("_CTOT.bam")));
+    sprintf(iname, "%s%s_OT.bam", config.odir, config.basename);
+    fp1 = bam_open(iname, "r");
+    sprintf(iname, "%s%s_OB.bam", config.odir, config.basename);
+    fp2 = bam_open(iname, "r");
+    if(!config.directional) {
+        sprintf(iname, "%s%s_CTOT.bam", config.odir, config.basename);
+        fp3 = bam_open(iname, "r");
+        sprintf(iname, "%s%s_CTOB.bam", config.odir, config.basename);
+        fp4 = bam_open(iname, "r");
+    }
+    free(iname);
+    //The header of the OT file becomes the global header
+    global_header = bam_header_read(fp1);
+    if(!config.quiet) printf("header written\n"); fflush(stdout);
+    bam1_t *read = bam_init1();
+    MPI_read *packed = calloc(1, sizeof(MPI_read));
+    packed->packed = NULL;
+    packed->size = 0;
+    //The headers of the remaining files are read and discarded, which moves
+    //those files on to their first read
+    bam_header_t *tmp;
+    tmp = bam_header_read(fp2);
+    bam_header_destroy(tmp);
+    if(!config.directional) {
+        tmp = bam_header_read(fp3);
+        bam_header_destroy(tmp);
+        tmp = bam_header_read(fp4);
+        bam_header_destroy(tmp);
+    }
+#endif
+
+    //Write a header
+    bam_header_write(OUTPUT_BAM, global_header);
+
+    t0 = time(NULL);
+    if(!config.quiet) printf("Started slurping @%s", ctime(&t0)); fflush(stdout);
+#ifndef DEBUG
+    //Receive until every worker has sent its end-of-data message
+    while(nfinished < nnodes) {
+        //Probe first so that the buffer can be sized to the incoming message
+        MPI_Probe(MPI_ANY_SOURCE, 5, MPI_COMM_WORLD, &status);
+        source = status.MPI_SOURCE;
+        MPI_Get_count(&status, MPI_BYTE, &size);
+        //Pick the list of the sending rank
+        //NOTE(review): for any other source, target_node keeps its previous value (initially NULL)
+        if(source == 1) target_node = node1_last_sentinel;
+        else if(source == 2) target_node = node2_last_sentinel;
+        else if(source == 3) target_node = node3_last_sentinel;
+        else if(source == 4) target_node = node4_last_sentinel;
+
+        if(size > 1) {
+            //A packed read: the list takes over the received buffer
+            p = malloc((size_t) size);
+            MPI_Recv(p, size, MPI_BYTE, source, 5, MPI_COMM_WORLD, &status);
+            add_element(target_node, p);
+        } else {
+            //A message of at most one byte marks the end of this worker's data;
+            //it is received to clear it from the queue and then discarded
+            p = malloc((size_t) size);
+            MPI_Recv(p, size, MPI_BYTE, source, 5, MPI_COMM_WORLD, &status);
+            free(p);
+            add_finished(target_node);
+            nfinished++;
+        }
+    }
+#else
+    //The loop is driven by the OT file only; the return values of the reads
+    //from the other files are not checked
+    //NOTE(review): size is reset before each pack_read(), presumably so that a
+    //fresh buffer is allocated for the list to own - confirm in pack_read()
+    //OT
+    while(bam_read1(fp1, read) > 1) {
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        add_element(node1_last_sentinel, packed->packed);
+        if(config.paired) {
+            bam_read1(fp1, read);
+            packed->size = 0;
+            packed = pack_read(read, packed);
+            add_element(node1_last_sentinel, packed->packed);
+        }
+        //OB
+        bam_read1(fp2, read);
+        packed->size = 0;
+        packed = pack_read(read, packed);
+        add_element(node2_last_sentinel, packed->packed);
+        if(config.paired) {
+            bam_read1(fp2, read);
+            packed->size = 0;
+            packed = pack_read(read, packed);
+            add_element(node2_last_sentinel, packed->packed);
+        }
+        if(!config.directional) {
+            //CTOT
+            bam_read1(fp3, read);
+            packed->size = 0;
+            packed = pack_read(read, packed);
+            add_element(node3_last_sentinel, packed->packed);
+            if(config.paired) {
+                bam_read1(fp3, read);
+                packed->size = 0;
+                packed = pack_read(read, packed);
+                add_element(node3_last_sentinel, packed->packed);
+            }
+            //CTOB
+            bam_read1(fp4, read);
+            packed->size = 0;
+            packed = pack_read(read, packed);
+            add_element(node4_last_sentinel, packed->packed);
+            if(config.paired) {
+                bam_read1(fp4, read);
+                packed->size = 0;
+                packed = pack_read(read, packed);
+                add_element(node4_last_sentinel, packed->packed);
+            }
+        }
+    }
+    bam_destroy1(read);
+    free(packed);
+    
+    //Mark every list as complete
+    add_finished(node1_last_sentinel);
+    add_finished(node2_last_sentinel);
+    if(!config.directional) {
+        add_finished(node3_last_sentinel);
+        add_finished(node4_last_sentinel);
+    }
+#endif
+    t1 = time(NULL);
+    if(!config.quiet) printf("Finished slurping @%s\t(%f seconds elapsed)\n", ctime(&t1), difftime(t1, t0)); fflush(stdout);
+    return NULL;
+}
diff --git a/worker.c b/worker.c
new file mode 100644
index 0000000..0137bf3
--- /dev/null
+++ b/worker.c
@@ -0,0 +1,218 @@
+#include "bison.h"
+#include <sys/time.h>
+
+/******************************************************************************
+*
+*   The main worker node function.
+*
+*   int thread_id: the thread_id
+*
+*******************************************************************************/
+void worker_node(int thread_id) {
+    int cmd_length = 1, max_qname = 0, status;
+    char *cmd, *last_qname = calloc(1, sizeof(char));;
+    MPI_Header *packed_header;
+    MPI_read *packed_read = calloc(1, sizeof(MPI_read));
+    bam_header_t *header;
+    bam1_t *read1 = bam_init1();
+    bam1_t *read2 = bam_init1();
+    tamFile fp;
+    MPI_Status stat;
+#ifdef DEBUG
+    int current_p_size = 100;
+    bamFile of;
+    bam_header_t *debug_header = bam_header_init();
+    bam1_t *debug_read = bam_init1();
+    global_header = bam_header_init();
+    void *p = calloc(100,1);
+    int NODE_ID = -1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &NODE_ID);
+    if(!config.quiet) printf("NODE_ID: %i\n",NODE_ID); fflush(stdout);
+    char *oname;
+#else
+    int start = 0, i = 0;
+#endif
+    time_t t0, t1;
+
+    packed_read->size = 0;
+    packed_read->packed = NULL;
+
+    //construct the bowtie2 command
+    cmd_length += (int) strlen("bowtie2 -q --reorder --no-mixed --no-discordant") + 1;
+    cmd_length += (int) strlen(config.bowtie2_options) + 1;
+    cmd_length += (int) strlen("--norc -x") + 1;
+    cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1;
+    cmd_length += (int) 2*(strlen("-1 ") + strlen(config.FASTQ1CT)) + 3;
+
+    cmd = (char *) malloc(sizeof(char) * cmd_length);
+    if(thread_id == 1) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand
+        if(config.paired) {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT, config.FASTQ2GA);
+        } else {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT);
+        }
+#ifdef DEBUG
+        oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_OT.bam")));
+        sprintf(oname, "%s%s_OT.bam", config.odir, config.basename);
+        of = bam_open(oname, "w");
+        free(oname);
+#endif
+    } else if(thread_id == 2) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand
+        if(config.paired) {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT, config.FASTQ2GA);
+        } else {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT);
+        }
+#ifdef DEBUG
+        oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_OB.bam")));
+        sprintf(oname, "%s%s_OB.bam", config.odir, config.basename);
+        of = bam_open(oname, "w");
+        free(oname);
+#endif
+    } else if(thread_id == 3) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand
+        if(config.paired) {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA, config.FASTQ2CT);
+        } else {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA);
+        }
+#ifdef DEBUG
+        oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_CTOT.bam")));
+        sprintf(oname, "%s%s_CTOT.bam", config.odir, config.basename);
+        of = bam_open(oname, "w");
+        free(oname);
+#endif
+    } else if(thread_id == 4) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand
+        if(config.paired) {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA, config.FASTQ2CT);
+        } else {
+            sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA);
+        }
+#ifdef DEBUG
+        oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_CTOB.bam")));
+        sprintf(oname, "%s%s_CTOB.bam", config.odir, config.basename);
+        of = bam_open(oname, "w");
+        free(oname);
+#endif
+    } else {
+        printf("Oh shit, got thread_id %i!\n", thread_id);
+        return;
+    }
+
+    //Wait for the signal to start
+#ifndef DEBUG
+    while(start == 0) MPI_Recv(&start, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &stat);
+#endif
+
+    //Start the process
+    if(!config.quiet) printf("Node %i executing: %s\n", thread_id, cmd); fflush(stdout);
+    fp = sam_popen(cmd);
+    header = sam_header_read(fp);
+#ifdef DEBUG
+    bam_header_write(of, header);
+#endif
+
+#ifndef DEBUG
+    packed_header = pack_header(header);
+    if(thread_id == 1) {
+        //Send the header
+        MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
+        status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD);
+        if(status != MPI_SUCCESS) {
+            printf("MPI_Send returned %i\n", status);
+            fflush(stdout);
+        }
+    }
+#else
+    packed_header = pack_header(header);
+    void *tmp_pointer = malloc(packed_header->size);
+    MPI_Request request;
+    MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, NODE_ID, 2, MPI_COMM_WORLD, &request);
+    status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, NODE_ID, 2, MPI_COMM_WORLD, &stat);
+    if(status != MPI_SUCCESS) printf("We seem to have received an error when sending to ourselves!\n");
+    MPI_Wait(&request, &stat);
+    unpack_header(debug_header, tmp_pointer);
+    global_header = debug_header;
+    free(tmp_pointer);
+#endif
+
+    t0 = time(NULL);
+    if(!config.quiet) printf("Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stdout);
+    while(sam_read1(fp, header, read1) > 1) {
+#ifdef DEBUG
+        bam_write1(of, read1);
+#endif
+        if(strcmp(bam1_qname(read1), last_qname) == 0) { //Multimapper
+            if(config.paired) {
+                sam_read1(fp, header, read2);
+#ifdef DEBUG
+                bam_write1(of, read2);
+#endif
+            }
+            continue;
+        } else {
+            if(read1->core.l_qname > max_qname) {
+                max_qname = read1->core.l_qname + 10;
+                last_qname = realloc(last_qname, sizeof(char) * max_qname);
+            }
+            strcpy(last_qname, bam1_qname(read1));
+        }
+
+        //Send the read
+        packed_read = pack_read(read1, packed_read);
+#ifndef DEBUG
+        MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
+#else
+        if(packed_read->size > current_p_size) p = realloc(p, packed_read->size);
+        MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &request);
+        status = MPI_Recv(p, packed_header->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &stat);
+        MPI_Wait(&request, &stat);
+#endif
+        //Deal with paired-end reads
+        if(config.paired) {
+            sam_read1(fp, header, read2);
+            packed_read = pack_read(read2, packed_read);
+#ifndef DEBUG
+            MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
+#else
+            bam_write1(of, read2);
+            if(packed_read->size > current_p_size) p = realloc(p, packed_read->size);
+            MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &request);
+            status = MPI_Recv(p, packed_header->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &stat);
+            MPI_Wait(&request, &stat);
+            debug_read = unpack_read(debug_read, p);
+#endif
+        }
+#ifndef DEBUG
+        i++;
+#endif
+    }
+    t1 = time(NULL);
+    if(!config.quiet) printf("Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stdout);
+
+    //Notify the master node
+    packed_read->size = 0;
+#ifndef DEBUG
+    void *A = malloc(1);
+    MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD);
+    free(A);
+#endif
+
+    //Close things up
+    bam_header_destroy(header);
+    bam_destroy1(read1);
+    bam_destroy1(read2);
+    free(cmd);
+    if(packed_read->packed != NULL) free(packed_read->packed);
+    free(packed_read);
+    if(packed_header->packed != NULL) free(packed_header->packed);
+    free(packed_header);
+    free(last_qname);
+    sam_pclose(fp);
+#ifdef DEBUG
+    bam_close(of);
+    bam_header_destroy(debug_header);
+    bam_destroy1(debug_read);
+    free(p);
+#endif
+    if(!config.quiet) printf("Exiting worker node %i\n", thread_id); fflush(stdout);
+};