diff --git a/MPI_packing.c b/MPI_packing.c new file mode 100644 index 0000000..f6008f8 --- /dev/null +++ b/MPI_packing.c @@ -0,0 +1,173 @@ +#include "bison.h" + +/****************************************************************************** +* +* Take a BAM header and pack it into a single contiguous memory block. Store +* the resulting block and its size in an MPI_Header structure. +* +* THE RESULT MUST BE free()d +* +* bam_header_t *header: The header to store +* +*******************************************************************************/ +MPI_Header * pack_header(bam_header_t *header) { + size_t size = sizeof(int32_t); //n_targets + int32_t *pint32_t; + uint32_t *puint32_t; + char *pchar; + int *pint; + int i; + void *p; + MPI_Header *output = malloc(sizeof(MPI_Header)); + + //target_name + for(i=0; in_targets; i++) { + size += (sizeof(char) * (1+strlen(header->target_name[i]))); + } + + //target_len + size += sizeof(uint32_t) * header->n_targets; + + //l_text + size += sizeof(int); + + //text + size += sizeof(char) * (1 + header->l_text); + + //Start copying, layout is n_targets,target_name[s],target_len[s],l_text,text + output->size = (int) size; + output->packed = malloc(size); + p = output->packed; + + //n_targets + memcpy(p, (void *) &(header->n_targets), sizeof(int32_t)); + pint32_t = (int32_t *) p; + p = (void *) (++pint32_t); + + //target_name + for(i=0; in_targets; i++) { + memcpy(p, (void *) header->target_name[i], sizeof(char) * (1 + strlen(header->target_name[i]))); + pchar = (char *) p; + p = (void *) (pchar+1+strlen(header->target_name[i])); + } + //target_len + memcpy(p, (void *) header->target_len, sizeof(uint32_t)*(header->n_targets)); + puint32_t = (uint32_t *) p; + p = (void *) (puint32_t + header->n_targets); + + //l_text + memcpy(p, (void *) &(header->l_text), sizeof(int)); + pint = (int *) p; + p = (void *) ++pint; + + //text + memcpy(p, (void *) (header->text), sizeof(char) * (1 + header->l_text)); + + return output; +} + 
+/****************************************************************************** +* +* Unpack a header packed into an initialized bam_header_t +* +* bam_header_t *header: The header to unpack into +* void *packed: The packed header +* +*******************************************************************************/ +void unpack_header(bam_header_t *header, void *packed) { + void *p = packed; + int i; + int *pint; + int32_t *pint32_t; + uint32_t *puint32_t; + char *pchar; + size_t strlength; + + //n_targets + header->n_targets = *((int32_t *) packed); + pint32_t = (int32_t *) p; + p = (void *) (++pint32_t); + + //**target_name + header->target_name = (char **) malloc(sizeof(char *) * (header->n_targets)); + for(i=0; in_targets; i++) { + strlength = strlen((char *) p)+1; + header->target_name[i] = malloc(sizeof(char) * strlength); + memcpy((void *) (header->target_name[i]), p, sizeof(char)*strlength); + pchar = (char *) p; + p = (void *) (pchar+strlength); + } + + //target_len + header->target_len = malloc(sizeof(uint32_t) * (header->n_targets)); + for(i=0; in_targets; i++) { + header->target_len[i] = *((uint32_t *) p); + puint32_t = (uint32_t *) p; + p = (void *) ++puint32_t; + } + + //l_text + header->l_text = *((int *) p); + pint = (int *) p; + p = (void *) ++pint; + + //text + header->text = (char *) malloc(sizeof(char) * (header->l_text+1)); + memcpy((void *) (header->text), p, sizeof(char) * (header->l_text + 1)); +} + +/****************************************************************************** +* +* Take a BAM read and pack it into a single contiguous memory block. Store +* the resulting block and its size in an MPI_Read structure. 
+* +* THE RESULT MUST BE free()d +* +* bam1_t *read: The read to store +* +*******************************************************************************/ +MPI_read * pack_read(bam1_t *read, MPI_read *output) { + bam1_t *pbam1_t; + int needed_size, m_data = read->m_data; + + needed_size = (int) (sizeof(bam1_t) + m_data); + if(output->size == 0) { + output->packed = malloc((size_t) needed_size); + output->size = needed_size; + } else if(needed_size > output->size) { + output->packed = realloc(output->packed, (size_t) needed_size); + output->size = needed_size; + } + memcpy((void *) output->packed, (void *) read, sizeof(bam1_t)); + pbam1_t = output->packed; + pbam1_t++; + memcpy((void *) pbam1_t, (void *) read->data, m_data); + return output; +} + +/****************************************************************************** +* +* Unpack a packed read into an initialized bam1_t read. +* +* bam1_t *read: The read to unpack into +* void *packed: The packed read +* +*******************************************************************************/ +bam1_t *unpack_read(bam1_t *read, void *packed) { + bam1_t *pbam1_t = packed; + uint8_t *pdata = (uint8_t *) (pbam1_t+1); + uint8_t *newdata; + + pbam1_t->data = pdata; + if(read != NULL) bam_destroy1(read); + read = bam_init1(); + read->core = pbam1_t->core; + read->l_aux = pbam1_t->l_aux; + read->m_data = pbam1_t->m_data; + read->data_len= pbam1_t->data_len; + newdata = (uint8_t *) malloc(read->m_data); + memcpy((void *) newdata, (void *) pdata, read->m_data); + read->data = newdata; + + return read; +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2f0363b --- /dev/null +++ b/Makefile @@ -0,0 +1,82 @@ +WORK=/home/ryand#This should be changed to match your needs +PREFIX = $(WORK)/bin +CC = mpicc +INCLUDE_DIRS = -I$(WORK)/include #This should be were samtools was compiled -I/path/to/samtools/compilation +LIB_DIRS = -L$(WORK)/lib #As above, but -L/path/to/samtools/compilation +OPTS = -Wall -O3 #-DDEBUG 
#-DNOTHROTTLE -g +MPI = -lmpich -lmpl #This is usually appropriate for mpich2 +#MPI = #This is appropriate for mvapich2 +#MPI = -lmpi #This is usually appropriate for openmpi + +#Don't edit below here unless you know what you're doing! + +OBJS = aux.o fastq.o genome.o slurp.o master.o common.o MPI_packing.o worker.o +HERD_OBJS = herd/fastq.o herd/master.o herd/MPI_packing.o herd/slurp.o herd/worker.o herd/writer.o + +.SUFFIXES:.c .o + +all: align index extractor mbias markduplicates + +.c.o: + $(CC) -c $(OPTS) $(INCLUDE_DIRS) $< -o $@ + +markduplicates: + $(CC) $(OPTS) $(INCLUDE_DIRS) $(LIB_DIRS) -o bison_markduplicates markduplicates.c -lpthread -lbam -lz + +mbias: + $(CC) $(OPTS) $(INCLUDE_DIRS) $(LIB_DIRS) -o bison_mbias mbias.c -lpthread -lbam -lz + +index: + $(CC) $(OPTS) -o bison_index index.c -lpthread + +align: $(OBJS) + $(CC) -c $(OPTS) $(INCLUDE_DIRS) main.c -o main.o + $(CC) $(OPTS) $(OBJS) main.o -o bison $(LIB_DIRS) -lm -lpthread $(MPI) -lbam -lz + +extractor: + $(CC) -c $(OPTS) $(INCLUDE_DIRS) common.c -o common.o + $(CC) -c $(OPTS) $(INCLUDE_DIRS) methylation_extractor.c -o methylation_extractor.o + $(CC) $(OPTS) $(LIB_DIRS) common.o methylation_extractor.o -o bison_methylation_extractor -lpthread -lbam -lz + +#Don't compile herd by default +herd: $(OBJS) $(HERD_OBJS) + $(CC) -c $(OPTS) $(INCLUDE_DIRS) herd/main.c -o herd/main.o + $(CC) $(OPTS) $(OBJS) $(HERD_OBJS) herd/main.o -o bison_herd $(LIB_DIRS) -lm -lpthread $(MPI) -lbam -lz + +#Auxiliary programs, don't compile by default +auxiliary: merge_CpGs bedGraph2methylKit make_reduced_genome aux_python_scripts CpG_coverage + +aux_python_scripts: + cp -f auxiliary/bedGraph2BSseq.py ./ + cp -f auxiliary/merge_bedGraphs.py ./ + +CpG_coverage: common.o + $(CC) -c $(OPTS) $(INCLUDE_DIRS) auxiliary/CpG_coverage.c -o auxiliary/CpG_coverage.o + $(CC) $(OPTS) $(LIB_DIRS) common.o auxiliary/CpG_coverage.o -o bison_CpG_coverage + +merge_CpGs: common.o + $(CC) -c $(OPTS) $(INCLUDE_DIRS) auxiliary/merge_CpGs.c -o 
auxiliary/merge_CpGs.o + $(CC) $(OPTS) $(LIB_DIRS) common.o auxiliary/merge_CpGs.o -o bison_merge_CpGs + +bedGraph2methylKit:common.o + $(CC) -c $(OPTS) $(INCLUDE_DIRS) auxiliary/bedGraph2methylKit.c -o auxiliary/bedGraph2methylKit.o + $(CC) $(OPTS) $(LIB_DIRS) common.o auxiliary/bedGraph2methylKit.o -o bedGraph2methylKit + +make_reduced_genome: + $(CC) $(OPTS) $(LIB_DIRS) auxiliary/make_reduced_genome.c -o make_reduced_genome + +install : + mv bison_* $(PREFIX)/ + chmod a+x Rscripts/* + cp Rscripts/* $(PREFIX)/ + if [ -f bison ]; then mv bison $(PREFIX)/ ; fi; + if [ -f bedGraph2methylKit ]; then mv bedGraph2methylKit $(PREFIX)/ ; fi; + if [ -f bedGraph2BSseq.py ]; then chmod a+x bedGraph2BSseq.py ; mv bedGraph2BSseq.py $(PREFIX)/ ; fi; + if [ -f merge_bedGraphs.py ]; then chmod a+x merge_bedGraphs.py ; mv merge_bedGraphs.py $(PREFIX)/ ; fi; + if [ -f check_accuracy ]; then mv check_accuracy $(PREFIX)/ ; fi; + if [ -f make_reduced_genome ]; then mv make_reduced_genome $(PREFIX)/ ; fi; + +clean: + rm -f *.o bison bison_* bedGraph2methylKit check_accuracy make_reduced_genome bedGraph2BSseq.py + rm -f herd/*.o + rm -f auxiliary/*.o diff --git a/README b/README new file mode 100644 index 0000000..e79b788 --- /dev/null +++ b/README @@ -0,0 +1,468 @@ +This is Bison, bisulfite alignment on nodes of a cluster. + +___________________________________________________________________________ +Prerequisites + +This program depends upon the following: + +1. A functional MPI implementation, such as mpich + +2. The SAMtools library or similar. SAMtools is available here: http://samtools.sourceforge.net/ + +3. Bowtie2, available here: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + The bowtie2 executable MUST be in your PATH. + +4. zcat, gzip, and bzcat must also be in your PATH, though this will almost + always be the case. + +5. To use bison_mbias2pdf (or the -pdf option of bison_mbias), R must be + installed and in your PATH. 
Additionally, the ggplot2 library must be + installed. + +N.B., the actual SAMtools library and header files are required for the + compilation step and can then be removed. The actual samtools executable + isn't required. + +___________________________________________________________________________ +General setup should go as follows: + +0. Download and extract the source code for samtools. Change into the directory + containing said code and type "make". + +1. Download the source distribution. + +2. Unpack, for example: tar zxf bison-0.1.0.tgz + +3. Possibly edit the Makefile, to include MPI and SAMtools library and header + locations. If these are installed in standard locations, the defaults + should suffice. For samtools see example in the Makefile. The default + Makefile is suitable for mpich2. If you're using openmpi you'll need to + comment out the first MPI line and uncomment the second MPI line. + +4. type "make" + +4a. If you would like to use bison_herd, type "make herd". + +4b. If you would like the auxiliary tools installed, type "make auxiliary". + +5. type "make install" + +The install path can be changed easily in the Makefile. + +___________________________________________________________________________ +Detailed installation instructions: + +1. Download samtools (at least version 0.1.19!). + +2. Extract the compressed bzipped tar-ball: +tar jxf samtools-0.1.19.tar.bz2 + +3. Change to that directory and type: +make + +4. Similarly download and extract the source code for bison + +5. Change the installation target. For example, if you would like bison to be + installed under "bin" in your home directory, then the PREFIX line should be: +PREFIX = ~/bin + +6. The default compiler is mpicc, but this can be changed by altering the line + beginning with "CC". + +7. 
If you extracted and built samtools in your home directory, then you will + likely need to change the INCLUDE_DIRS and LIB_DIRS to something like: +INCLUDE_DIRS = -I/home/username/samtools-0.1.19 +LIB_DIRS = -L/home/username/samtools-0.1.19 + If you already have the headers and libbam.a file elsewhere, then change + these lines appropriately. + + Likewise, add the location of your MPI headers and libraries, if they're not + in the normal search path. + +8. You can disable throttling in bison_herd by adding "-DNOTHROTTLE" in the + "OPTS" line, though read the "Throttling" section , below. Similarly, both + bison and bison_herd can be compiled in a special debug mode by adding + "-DDEBUG" to the "OPTS" line. See the "Debug mode" section, below. + +9. Continue with step #4 in the preceding section. + +___________________________________________________________________________ +Usage + +Indexing of a directory of fasta (extension .fa or .fasta) can be performed +as follows: + +bison_index [OPTIONS] directory/ + +Options that are not specific to bison are simply passed to bowtie2, which must +be in your PATH. The output is placed under "directory/bisulfite_genome". + +Alignment can be performed as follows (bison_herd is the same): + +mpiexec bison [OPTIONS] -g directory/ {-1 fastq_1.gz -2 fastq_2.gz | -U fastq.fq} + +"directory" is identical to that used for indexing. For further details type +"bison -h". For non-directional libraries, "mpiexec -N 5" should be used, +otherwise "mpiexec -N 3". Resource managers, such as slurm, should work in +an equivalent manner. All options not explicitly mentioned by typing +"bison -h" are passed to bowtie2. Consequently, using the --very-sensitive or +--dovetail options will work as expected. Bison already passes the following +flags to bowtie2: +-q --reorder --no-mixed --no-discordant + +bison_herd is equivalent, except that you can specify more nodes. 
You may also +input multiple files (comma-separated, no spaces) to align, in which case +alignments will be printed to multiple files. Furthermore, you may use +wild-cards in your file list. For example: + +mpiexec -N 17 bison_herd -o Alignments -g directory/ -1 exp1/sample*_1.fq.gz,/some/other/path/foo*_1.fq.gz -2 exp1/sample*_2.fq.gz,/some/other/path/foo*_2.fq.gz + +Make sure to not have multiple input files with the same name +(e.g., sample*/read1.fastq), as they will all be written to the same file +(overwriting any earlier alignments)! + +There is also a methylation extractor that produces a bedGraph file, called +bison_methylation_extractor. Note, coordinate-sorted BAM files should not +be used! The methylation extractor can be told to ignore certain parts of each +read. This is particularly useful in cases where there is methylation bias +across the length of reads (i.e., if one plots the average methylation +percentage summed per position over all reads, the value goes up/down toward the +5' or 3' end). It is recommended to always run bison_mbias (with the -pdf option +if you have R and ggplot2 installed) to generate the required information for +constructing an M-bias plot. The bison_mbias2pdf script can convert this to a +PDF file (or a series of PNG files) and will also suggest what, if any, regions +should be ignored. These regions are strand and read number (in the case of +paired-end reads) dependent. While the suggested regions are often good, they +should not be blindly accepted (just look at the graph and use your best +judgement). + +See the "Auxiliary files" section, below, for additional files. + +___________________________________________________________________________ +Auxiliary files + +The following programs and scripts will be available if you type "make auxiliary": + +bedGraph2BSseq.py +This python script can accept a filename prefix and the names of at least 2 +bedGraph files and output 3 files for input into BSseq. 
A single chromosome can +be processed at a time, if desired, by using the -chr option. The output files +will be named $prefix.M, $prefix.Cov, and $prefix.gr. $prefix.M is a matrix with +a header line that lists the number of reads supporting methylation at each site +in the bedGraph files. If there is no coverage in a given sample, the value is +set to 0. $prefix.Cov is the analogous file listing coverage in each sample +(again, 0 denotes no coverage). $prefix.gr lists the coordinates for each line +in the .Cov and .M files. Loading these files into R would be performed as +follows (in this example "Chr17" was the prefix): + +M <- as.matrix(read.delim("Chr17.M", header=T)) +Cov <- as.matrix(read.delim("Chr17.Cov", header=T)) +bed <- read.delim("Chr17.bed", header=F) +#Remember that BED and bedGraph files are 0-based! +gr <- GRanges(seqnames=Rle(bed$V1),ranges=IRanges(start=bed$V2+1, end=bed$V3), strand=Rle("*", nrow(bed))) +groups <- data.frame(row.names=colnames(M), + var1 <- c(1,1,1,1,2,2,2,2)) #A very simple experiment with 2 groups of 4 samples +BS1 <- BSseq(M=M, Cov=Cov, gr=gr, pData=groups, sampleNames=colnames(M)) #You'll want to set some of the additional options! + + +bedGraph2methylKit +As above, but each bedGraph file is converted to a .methylKit file. The +bedGraphs should be of CpGs and not have had the strands merged (i.e., don't run +the merge_CpGs command below). + +make_reduced_genome.c +Create a reduced representation genome appropriate for reads of a given size +($size, default is 36bp). MspI and TaqI libraries are supported. Nucleotides +greater than $size+10% are converted to N. + +merge_bedGraphs.py +This will merge bedGraphs from technical replicates of a single sample into a +single bedGraph file, summing the methylation metrics as it goes. The output, +like the input is coordinate sorted. + +bison_merge_CpGs +Methylation is usually symmetric at CpG sites. 
While the output bedGraph files +have a single-C resolution, this will convert that to single-CpG resolution by +summing Cs in the same CpG from opposite strands. This saves space and will +often speed up downstream statistics. + +___________________________________________________________________________ +Advanced bison_herd usage + +bison_herd has the ability to use a semi-arbitrary number of nodes. In practice, +if bison_herd is given N nodes, it will effectively use 2*((N-1)/2)+1 or +4*((N-1)/4)+1 nodes (using integer division), for directional and non-directional libraries, +respectively. As an example, if you allot 20 nodes for a directional library, +bison_herd will only use 19 of them (17 for non-directional reads). The excess +nodes will exit properly and, unless you specify --quiet, produce an error +message. + +The options -mp, -queue-size, and -@ are bison_herd-specific and deserve further +description. + +-mp sets the number of threads that the master node will use to process +alignments produced by the worker nodes. Worker nodes are grouped into twos or +fours, where each group has a number of nodes equal to the number of +possible bisulfite converted strands. As the number of allocated nodes +increases, a point is eventually reached where a single thread on the master +node is unable to keep up with the workers. In my experience, for directional +libraries, one thread can handle approximately 130 bowtie2 threads (i.e., if +using -p 11, -mp should be increased once ~12 worker nodes are allocated, since +that would equate to 132 threads in use by bowtie2). One should keep in mind +that there are already at least 3 other threads concurrently running on the +master node (sending and storing fastq reads, receiving alignments, and writing +alignments). Consequently, there is a practical limit to the number of nodes that is +determined by how many cores are available on each node. + +-queue-size determines the maximum difference between reads sent for alignment +and reads processed. 
This option is unavailable if bison_herd was compiled with +-DNOTHROTTLE. By default, the thread that sends reads for alignment will pause +if it has sent ~1 million more reads than have been processed. The purpose +of this is to prevent overwhelming of the MPI unexpected message buffer, since +the thread on the master node that sends reads can generally process reads +faster than all of the worker nodes combined can align them. Setting this value +too high may result in bison_herd crashing with otherwise cryptic messages +involving MPI_Send. In such cases, decreasing the value used by -queue-size +should resolve the problem. On the other hand, setting this value too low can +result in deadlocks, due to buffering at various levels. The default value +hasn't resulted in deadlocking or crashes on our cluster, but yours may be +different! This difference is checked every 100000 reads, which can be changed by +editing the THROTTLE_CHECK_INTERVAL value in bison.h prior to compilation. + +-@ specifies the number of compression threads used for writing the output BAM +file. In practice, a single compression thread can write ~80 million paired-end +reads per hour (depending on CPU speed). I routinely use -@ 4 when using more +than ~9 nodes as this allows writing to occur as quickly as reads are processed. +To determine if the number of compression threads should be increased, note the +time difference (especially early on) between when each master processor thread +has processed 100000 reads and when those reads have been written to a file. +Even when --reorder is used, if there is >1 second between these, then you may +benefit from increasing the number of compression threads. For those curious, +this option is identical to that used in samtools. + +___________________________________________________________________________ +Throttling + +bison_herd generally uses blocking, but not synchronous sends. 
What this means +in practice is that many reads will be queued by the master node for sending to +the worker nodes. Likewise, many alignments can be queued by the worker nodes +for sending back to the master node. The queue that many MPI implementations use +for this is relatively small and immutable. While a full queue should cause +MPI_Send to block until there is sufficient space, occasionally a constellation +of events can occur that causes this queue to overflow and the master node to +then crash. This can be alleviated by limiting the possible number of reads that +could ever possibly be in the queue at any single time. As the queue is not +directly pollable, the difference between the number of reads sent and written +is used as a surrogate. The maximum number of reads in the wild is then either +2x or 4x this difference (since a read is queued per worker node). In reality, +the queue should be emptier than this as there are normally reads buffered on +the worker nodes (being fed to bowtie2, being aligned or being sent) and +elsewhere on the master node (being received, waiting to be processed, being +processed, waiting to be written, or being written). + +Throttling is not always required, particularly as an increasing number of nodes +are used. Throttling can be disabled altogether by compiling with -DNOTHROTTLE, +which will remove all related components. + +___________________________________________________________________________ +Debug mode + +For debugging, a special debug mode is available for both bison and bison_herd +by compiling with -DDEBUG. Instead of needing multiple nodes, both +programs will then run as if they were just a single node. Compiling with this +option adds the -taskid option to both programs. The taskid is equivalent to the +node number in the bison (or bison_herd) hierarchy. Node 0 is the master node +and performs the final file writing. 
For bison, nodes 1-4 are equivalent to the +worker nodes that align reads to the original top, original bottom, +complementary to original top and complementary to original bottom strands, +respectively. For directional libraries, only the first 2 are used. These will +write alignments to a file for final processing when run as taskid 0. This is +useful when odd alignments are being output and the source of the error needs to +be tracked down. The mode for bison_herd is similar, except there are always 8 +theoretical worker nodes (i.e., taskid 1-8 need to be run prior to taskid 0). +This allows testing multiple master processor threads with both directional and +non-directional reads. + +In general, this mode should not be used unless you are running into extremely +odd bugs. + +___________________________________________________________________________ +Compatibility with Bismark + +Bison is generally similar to bismark, however the indexes are incompatible, +due to bismark renaming contigs. Also, the two will not produce identical +output, due to algorithmic differences. Running bison_methylation_extractor +on the output of bismark will also produce different results, again due to +algorithmic differences. In addition, bison always outputs BAM files directly. + +___________________________________________________________________________ +Other details + +Bison needn't be run on multiple computers. You can also use a single +computer for all compute nodes (e.g. mpiexec -n 5 bison ...). The same holds +true for bison_herd. Both bison and bison_herd seem to be faster than bismark, +even when limited to the same resources. + +___________________________________________________________________________ +Changes: + +0.2.4 + * Fixed an off-by-one error in bison_mbias. Also, at some point 1-methylation + percentage started getting calculated. That's been fixed. + + * Added bison_markduplicates, which, as the name implies, marks apparent PCR + duplicates. 
The methylation extractor and m-bias calculator have also been + updated to ignore marked duplicates. + + * Fixed a bug in the CpG coverage program, which wasn't properly handling + single-C bedGraph files before (if they were merged, then they were being + handled correctly). + +0.2.3 + * Fix how hard and soft-clipped bases are dealt with (previously, soft- + clipped bases resulted in an error and hard-clipped bases in incorrect + position assignments!). + + * Multiple bug fixes related to local alignment, which previously didn't + work correctly. These issues seem to generally now be resolved. Many thanks + to user mvijayen on seqanswers for providing a perfect usage example for + testing (see thread http://seqanswers.com/forums/showthread.php?t=39914). + + * The maximum length of a single contig is now (2^64)-1 (instead of the + previous 2^64). I don't think bowtie2 would even support something that + long, but if it did then bison wouldn't (internally, a position of 2^64 + means a base is inserted, soft, or hard-clipped). + + * A previously missing "*" caused Bison to use the entirety of the + description line in the fasta file as the chromosome name. This caused + errors since bowtie2 only uses everything before the first space (the proper + method). Bison now does the same. + + * A note about creating methylation-bias metrics with locally aligned reads + is in order. If a read is soft-clipped, that portion is still included in + the M-bias metrics. Likewise, if you pass -OT X,X,X,X or similar + parameters to the methylation extractor, the soft-clipped area is also + included in there. + + * Another note regarding local alignments is that the XX auxiliary tag + (effectively the more verbose version of the MD tag) contains soft-clipped + sequences. I could probably have these removed if someone would like. + +0.2.2 + * Properly fixed some wording on the textual output (i.e., removed the word + "unique"). 
+ + * Lowered the default MAPQ and Phred thresholds used by the methylation + extractor to 10 each. That the MAPQ threshold was originally + 20 was an error on my part. + +0.2.1 + * Added support for file globbing in bison_herd. You may now input multiple + files using a combination of wild-cards (*, ?, etc.) and commas. Remember + to put these in quotes (e.g., "foo/*1.fq.gz","bar/*1.fq.gz") so the shell + doesn't perform the expansion! As before, specifying multiple inputs with + the same file name (e.g., sample1/reads.fq,sample2/reads.fq) will cause the + output from the first reads.fq alignment to be over-written by the second. + + * Fixed the text output, since "unique alignments" isn't really correct, + given that alignments with scores of 0 or 1 can be output but aren't + unique. + + * Added information in the Makefile and above about compiling with openmpi. + + * Fixed a bug in bison_herd wherein the -upto option wasn't being handled + properly. -upto now accepts an unsigned long in bison_herd. + + * Fixed a bug in bison_herd when paired-end reads were used. This was due to + how bowtie2 reads from FIFOs. Changing how things were written to the FIFOs + on the worker nodes resolved the problem. + + * The bison_mbias program has been heavily revamped. It still outputs the + number of methylated or unmethylated CpG calls per position, but now keeps + the metrics for each strand (and read, when paired-end reads are used) + separate. If R and the ggplot2 library are installed, the program can also + run the bison_mbias2pdf program (see below). + + * Created a bison_mbias2pdf Rscript that will read in the output of + bison_mbias and plot the results, indicating the region of each read that + should be included in methylation extraction. This script also prints these + suggestions in the format used by bison_methylation_extractor, for + convenience. 
+ + * The methylation extractor can now be told to only include certain regions + of each read in the output methylation metrics. This is needed when there + is apparent bias in the methylation at one or both ends of a read. + + * Previously, the recalculated MAPQ was incorrect when only 1 read in a pair + had a valid secondary alignment. This has been fixed. + + * Fixed another MAPQ recalculation bug, affecting reads with MAPQ 2 that + have MAPQ=6. + + * Fixed a bug in writing unmapped reads. + + * Fixed a bug in bison_herd that allowed early termination without warning. + +0.2.0 + * Added a note to the methylation summary statistics output at the end of a + run that the numbers will include double counting of any site covered by + both mates in a pair. These metrics are only meant for general information + and not further analysis, so I don't consider that a bug (it's actually a + design decision for the sake of performance). + + * --ignore-quals is no longer passed to bowtie2 by default. Specifying this + will marginally decrease both correct and incorrect alignments. It will + also generally decrease the alignment rate. + + * Fixed --unmapped, which are now written to the directory specified by -o + + * --maxins was already 500 by default, so it is no longer set by default. + + * Added bison_herd, see above for usage + + * The methylation extractor now has a -phred option, to exclude methylation + calls from low confidence base-calls. The default threshold is 20. + + * Added a script to convert bedGraph files to a format suitable for BSseq. + + * Fixed a bug in bison_merge_CpGs + + * Both bison and bison_herd now check to ensure that the MPI implementation + actually supports the level of thread support requested (previously, this + was just assumed). + +0.1.1 + * Fixed a number of minor bugs. + + * Added support for uncompressed fastq files, as well as bzipped files + (previously, only gzipped fastq files worked properly). 
+ + * --score-min is now parsed by bison prior to being sent to bowtie2, + read MAPQ scores are recalculated accordingly by the same algorithm + used by bowtie2 (N.B., this only bears a vague correspondence to + -10*log10(probability the mapping position is wrong)!). + + * Added a bison_mbias function, to process the aligned BAM file and + create a text file containing the percentage of methylated C's as a + function of read position. For the utility of this, see: Hansen KD, + Langmead B and Irizarry RA, BSmooth: from whole genome bisulfite + sequencing to differentially methylated regions. Genome Biol 2012; + 13(10):R83. + + * The methylation extractor now accepts the -q options, which sets the + MAPQ threshold for a read to be included in the methylation results. + The default is a minimum MAPQ of 20, which seems to be a reasonable + threshold from a few simulations. + + * In DEBUG mode, the output BAM files used to have fixed names. This was + a problem in cases where debugging was being performed on multiple + input files. Now, the OT/OB/CTOT/CTOB.bam filename is prepended with + an appropriate prefix (extracted from the input file name). In + addition, the output directory is now respected in DEBUG mode. + + * Included an "auxiliary" directory, that includes functions for making + an RRBS genome and other possibly useful functions. + + +0.1.0 + Initial release diff --git a/README.md b/README.md new file mode 100644 index 0000000..b8b62b3 --- /dev/null +++ b/README.md @@ -0,0 +1,462 @@ +#Bison: bisulfite alignment on nodes of a cluster. + +##Prerequisites + +This program depends upon the following: + +1. A functional MPI implementation, such as mpich + +2. The SAMtools library or similar. SAMtools is available here: http://samtools.sourceforge.net/ + +3. Bowtie2, available here: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + The bowtie2 executable MUST be in your PATH. + +4. 
zcat, gzip, and bzcat must also be in your PATH, though this will almost + always be the case. + +5. To use bison_mbias2pdf (or the -pdf option of bison_mbias), R must be + installed and in your PATH. Additionally, the ggplot2 library must be + installed. + +N.B., the actual SAMtools library and header files are required for the + compilation step and can then be removed. The actual samtools executable + isn't required. + +##General setup should go as follows: + +0. Download and extract the source code for samtools. Change into the directory + containing said code and type "make". + +1. Download the source distribution. + +2. Unpack, for example: tar zxf bison-0.1.0.tgz + +3. Possibly edit the Makefile, to include MPI and SAMtools library and header + locations. If these are installed in standard locations, the defaults + should suffice. For samtools see example in the Makefile. The default + Makefile is suitable for mpich2. If you're using openmpi you'll need to + comment out the first MPI line and uncomment the second MPI line. + +4. type "make" + + * If you would like to use `bison_herd`, type "make herd". + + * If you would like the auxiliary tools installed, type "make auxiliary". + +5. type "make install" + +The install path can be changed easily in the Makefile. + +##Detailed installation instructions + +1. Download samtools (at least version 0.1.19!). + +2. Extract the compressed bzipped tar-ball: +tar jxf samtools-0.1.19.tar.bz2 + +3. Change to that directory and type: +make + +4. Similarly download and extract the source code for bison + +5. Change the installation target. For example, if you would like bison to be + installed under "bin" in your home directory, then the PREFIX line should be: +PREFIX = ~/bin + +6. The default compiler is mpicc, but this can be changed by altering the line + beginning with "CC". + +7. 
If you extracted and built samtools in your home directory, then you will + likely need to change the `INCLUDE_DIRS` and `LIB_DIRS` to something like: + + INCLUDE_DIRS = -I/home/username/samtools-0.1.19 + LIB_DIRS = -L/home/username/samtools-0.1.19 + + If you already have the headers and libbam.a file elsewhere, then change + these lines appropriately. + + Likewise, add the location of your MPI headers and libraries, if they're not + in the normal search path. + +8. You can disable throttling in `bison_herd` by adding "-DNOTHROTTLE" in the + "OPTS" line, though read the "Throttling" section , below. Similarly, both + bison and `bison_herd` can be compiled in a special debug mode by adding + "-DDEBUG" to the "OPTS" line. See the "Debug mode" section, below. + +9. Continue with step #4 in the preceding section. + +##Usage + +Indexing of a directory of fasta (extension .fa or .fasta) can be performed +as follows: + + bison_index [OPTIONS] directory/ + +Options that are not specific to bison are simply passed to bowtie2, which must +be in your PATH. The output is placed under `directory/bisulfite_genome`. + +Alignment can be performed as follows (`bison_herd` is the same): + + mpiexec bison [OPTIONS] -g directory/ {-1 fastq_1.gz -2 fastq_2.gz | -U fastq.fq} + +"directory" is identical to that used for indexing. For further details type +"bison -h". For non-directional libraries, "mpiexec -N 5" should be used, +otherwise "mpiexec -N 3". Resource managers, such as slurm, should work in +an equivalent manner. All options not explicitly mentioned by typing +"bison -h" are passed to bowtie2. Consequently, using the --very-sensitive or +--dovetail options will work as expected. Bison already passes the following +flags to bowtie2: + + -q --reorder --no-mixed --no-discordant + +`bison_herd` is equivalent, except that you can specify more nodes. 
You may also +input multiple files (comma-separated, no spaces) to align, in which case +alignments will be printed to multiple files. Furthermore, you may use +wild-cards in your file list. For example: + + mpiexec -N 17 bison_herd -o Alignments -g directory/ -1 exp1/sample*_1.fq.gz,/some/other/path/foo*_1.fq.gz -2 exp1/sample*_2.fq.gz,/some/other/path/foo*_2.fq.gz + +Make sure to not have multiple input files with the same name +(e.g., `sample*/read1.fastq`), as they will all be written to the same file +(each overwriting the previous alignments)! + +There is also a methylation extractor that produces a bedGraph file, called +`bison_methylation_extractor`. Note, coordinate-sorted BAM files should not +be used! The methylation extractor can be told to ignore certain parts of each +read. This is particularly useful in cases where there is methylation bias +across the length of reads (i.e., if one plots the average methylation +percentage summed per position over all reads, the value goes up/down toward the +5' or 3' end). It is recommended to always run `bison_mbias` (with the -pdf option +if you have R and ggplot2 installed) to generate the required information for +constructing an M-bias plot. The `bison_mbias2pdf` script can convert this to a +PDF file (or a series of PNG files) and will also suggest what, if any, regions +should be ignored. These regions are strand and read number (in the case of +paired-end reads) dependent. While the suggested regions are often good, they +should not be blindly accepted (just look at the graph and use your best +judgement). + +See the "Auxiliary files" section, below, for additional files. + +##Auxiliary files + +The following programs and scripts will be available if you type "make auxiliary": + +###bedGraph2BSseq.py +This python script can accept a filename prefix and the names of at least 2 +bedGraph files and output 3 files for input into BSseq. 
A single chromosome can +be processed at a time, if desired, by using the -chr option. The output files +will be named $prefix.M, $prefix.Cov, and $prefix.gr. $prefix.M is a matrix with +a header line that lists the number of reads supporting methylation at each site +in the bedGraph files. If there is no coverage in a given sample, the value is +set to 0. $prefix.Cov is the analogous file listing coverage in each sample +(again, 0 denotes no coverage). $prefix.gr lists the coordinates for each line +in the .Cov and .M files. Loading these files into R would be performed as +follows (in this example "Chr17" was the prefix): + +```R +M <- as.matrix(read.delim("Chr17.M", header=T)) +Cov <- as.matrix(read.delim("Chr17.Cov", header=T)) +bed <- read.delim("Chr17.bed", header=F) +#Remember that BED and bedGraph files are 0-based! +gr <- GRanges(seqnames=Rle(bed$V1),ranges=IRanges(start=bed$V2+1, end=bed$V3), strand=Rle("*", nrow(bed))) +groups <- data.frame(row.names=colnames(M), + var1 <- c(1,1,1,1,2,2,2,2)) #A very simple experiment with 2 groups of 4 samples +BS1 <- BSseq(M=M, Cov=Cov, gr=gr, pData=groups, sampleNames=colnames(M)) #You'll want to set some of the additional options! +``` + + +###`bedGraph2methylKit` +As above, but each bedGraph file is converted to a .methylKit file. The +bedGraphs should be of CpGs and not have had the strands merged (i.e., don't run +the merge_CpGs command below). + +###`make_reduced_genome` +Create a reduced representation genome appropriate for reads of a given size +($size, default is 36bp). MspI and TaqI libraries are supported. Nucleotides +greater than $size+10% are converted to N. + +###`merge_bedGraphs.py` +This will merge bedGraphs from technical replicates of a single sample into a +single bedGraph file, summing the methylation metrics as it goes. The output, +like the input is coordinate sorted. + +###`bison_merge_CpGs` +Methylation is usually symmetric at CpG sites. 
While the output bedGraph files +have a single-C resolution, this will convert that to single-CpG resolution by +summing Cs in the same CpG from opposite strands. This saves space and will +often speed up downstream statistics. + +##Advanced bison_herd usage + +`bison_herd` has the ability to use a semi-arbitrary number of nodes. In practice, +if bison is given N nodes, it will effectively use `2*((N-1)/2)+1` or +`4*((N-1)/4)+1` nodes, for directional and non-directional libraries, +respectively. As an example, if you allot 20 nodes for a directional library, +`bison_herd` will only use 19 of them (17 for non-directional reads). The excess +nodes will exit properly and, unless you specify --quiet, produce an error +message. + +The options -mp, -queue-size, and -@ are `bison_herd`-specific and deserve further +description. + +-mp sets the number of threads that the master node will use to process +alignments produced by the worker nodes. Worker nodes are grouped into twos or +fours, where each group has a number of nodes equal to the number of +possible bisulfite converted strands. As the number of allocated nodes +increases, a point is eventually reached where a single thread on the master +node is unable to keep up with the workers. In my experience, for directional +libraries, one thread can handle approximately 130 bowtie2 threads (i.e., if +using -p 11, -mp should be increased once ~12 worker nodes are allocated, since +that would equate to 132 threads in use by bowtie2). One should keep in mind +that there are already at least 3 other threads concurrently running on the +master node (sending and storing fastq reads, receiving alignments, and writing +alignments). Consequently, the practical limit to the number of nodes is +determined by how many cores are available on each node. + +-queue-size determines the maximum difference between reads sent for alignment +and reads processed. 
This option is unavailable if `bison_herd` was compiled with +-DNOTHROTTLE. By default, the thread that sends reads for alignment will pause +if it has sent ~1 million more reads than have been processed. The purpose +of this is to prevent overwhelming of the MPI unexpected message buffer, since +the thread on the master node that sends reads can generally process reads +faster than all of the worker nodes combined can align them. Setting this value +too high may result in `bison_herd` crashing with otherwise cryptic messages +involving `MPI_Send`. In such cases, decreasing the value used by -queue-size +should resolve the problem. On the other hand, setting this value too low can +result in deadlocks, due to buffering at various levels. The default value +hasn't resulted in deadlocking or crashes on our cluster, but yours may be +different! This difference is checked every 100000 reads, which can be changed by +editing the `THROTTLE_CHECK_INTERVAL` value in bison.h prior to compilation. + +-@ specifies the number of compression threads used for writing the output BAM +file. In practice, a single compression thread can write ~80 million paired-end +reads per hour (depending on CPU speed). I routinely use -@ 4 when using more +than ~9 nodes as this allows writing to occur as quickly as reads are processed. +To determine if the number of compression threads should be increased, note the +time difference (especially early on) between when each master processor thread +has processed 100000 reads and when those reads have been written to a file. +Even when --reorder is used, if there is >1 second between these, then you may +benefit from increasing the number of compression threads. For those curious, +this option is identical to that used in samtools. + +##Throttling + +`bison_herd` generally uses blocking, but not synchronous sends. What this means +in practice is that many reads will be queued by the master node for sending to +the worker nodes. 
Likewise, many alignments can be queued by the worker nodes +for sending back to the master node. The queue that many MPI implementations use +for this is relatively small and immutable. While a full queue should cause +`MPI_Send` to block until there is sufficient space, occasionally a constellation +of events can occur that causes this queue to overflow and the master node to +then crash. This can be alleviated by limiting the number of reads that +could possibly be in the queue at any single time. As the queue is not +directly pollable, the difference between the number of reads sent and written +is used as a surrogate. The maximum number of reads in the wild is then either +2x or 4x this difference (since a read is queued per worker node). In reality, +the queue should be emptier than this as there are normally reads buffered on +the worker nodes (being fed to bowtie2, being aligned or being sent) and +elsewhere on the master node (being received, waiting to be processed, being +processed, waiting to be written, or being written). + +Throttling is not always required, particularly as an increasing number of nodes +are used. Throttling can be disabled altogether by compiling with -DNOTHROTTLE, +which will remove all related components. + +##Debug mode + +For debugging, a special debug mode is available for both bison and `bison_herd` +by compiling with -DDEBUG. Instead of needing multiple nodes, both +programs will then run as if they were just a single node. Compiling with this +option adds the -taskid option to both programs. The taskid is equivalent to the +node number in the bison (or `bison_herd`) hierarchy. Node 0 is the master node +and performs the final file writing. For bison, nodes 1-4 are equivalent to the +worker nodes that align reads to the original top, original bottom, +complementary to original top and complementary to original bottom strands, +respectively. For directional libraries, only the first 2 are used. 
These will +write alignments to a file for final processing when run as taskid 0. This is +useful when odd alignments are being output and the source of the error needs to +be tracked down. The mode for `bison_herd` is similar, except there are always 8 +theoretical worker nodes (i.e., taskid 1-8 need to be run prior to taskid 0). +This allows testing multiple master processor threads with both directional and +non-directional reads. + +In general, this mode should not be used unless you are running into extremely +odd bugs. + +##Compatibility with Bismark + +Bison is generally similar to bismark, however the indexes are incompatible, +due to bismark renaming contigs. Also, the two will not produce identical +output, due to algorithmic differences. Running `bison_methylation_extractor` +on the output of bismark will also produce different results, again due to +algorithmic differences. In addition, bison always outputs BAM files directly. + +##Other details + +Bison needn't be run on multiple computers. You can also use a single +computer for all compute nodes (e.g. mpiexec -n 5 bison ...). The same holds +true for `bison_herd`. Both bison and `bison_herd` seem to be faster than bismark, +even when limited to the same resources. + +##Changes + +###0.2.4 + * Fixed an off-by-one error in bison_mbias. Also, at some point 1-methylation + percentage started getting calculated. That's been fixed. + + * Added bison_markduplicates, which, as the name implies, marks apparent PCR + duplicates. The methylation extractor and m-bias calculator have also been + updated to ignore marked duplicates. + + * Fixed a bug in the CpG coverage program, which wasn't properly handling + single-C bedGraph files before (if they were merged, then they were being + handled correctly). + +###0.2.3 + * Fix how hard and soft-clipped bases are dealt with (previously, soft- + clipped bases resulted in an error and hard-clipped bases in incorrect + position assignments!). 
+ + * Multiple bug fixes related to local alignment, which previously didn't + work correctly. These issues seem to generally now be resolved. Many thanks + to user mvijayen on seqanswers for providing a perfect usage example for + testing (see thread http://seqanswers.com/forums/showthread.php?t=39914). + + * The maximum length of a single contig is now (2^64)-1 (instead of the + previous 2^64). I don't think bowtie2 would even support something that + long, but if it did then bison wouldn't (internally, a position of 2^64 + means a base is inserted, soft, or hard-clipped). + + * A previously missing "*" caused Bison to use the entirety of the + description line in the fasta file as the chromosome name. This caused + errors since bowtie2 only uses everything before the first space (the proper + method). Bison now does the same. + + * A note about creating methylation-bias metrics with locally aligned reads + is in order. If a read is soft-clipped, that portion is still included in + the M-bias metrics. Likewise, if you pass -OT X,X,X,X or similar + parameters to the methylation extractor, the soft-clipped area is also + included in there. + + * Another note regarding local alignments is that the XX auxiliary tag + (effectively the more verbose version of the MD tag) contains soft-clipped + sequences. I could probably have these removed if someone would like. + +###0.2.2 + * Properly fixed some wording on the textual output (i.e., removed the word + "unique"). + + * Lowered the default MAPQ and Phred thresholds used by the methylation + extractor to 10 each. That the MAPQ threshold was originally + 20 was an error on my part. + +###0.2.1 + * Added support for file globbing in bison_herd. You may now input multiple + files using a combination of wild-cards (*, ?, etc.) and commas. Remember + to put these in quotes (e.g., "foo/*1.fq.gz","bar/*1.fq.gz") so the shell + doesn't perform the expansion! 
As before, specifying multiple inputs with + the same file name (e.g., sample1/reads.fq,sample2/reads.fq) will cause the + output from the first reads.fq alignment to be over-written by the second. + + * Fixed the text output, since "unique alignments" isn't really correct, + given that alignments with scores of 0 or 1 can be output but aren't + unique. + + * Added information in the Makefile and above about compiling with openmpi. + + * Fixed a bug in bison_herd wherein the -upto option wasn't being handled + properly. -upto now accepts an unsigned long in bison_herd. + + * Fixed a bug in bison_herd when paired-end reads were used. This was due to + how bowtie2 reads from FIFOs. Changing how things were written to the FIFOs + on the worker nodes resolved the problem. + + * The bison_mbias program has been heavily revamped. It still outputs the + number of methylated or unmethylated CpG calls per position, but now keeps + the metrics for each strand (and read, when paired-end reads are used) + separate. If R and the ggplot2 library are installed, the program can also + run the bison_mbias2pdf program (see below). + + * Created a bison_mbias2pdf Rscript that will read in the output of + bison_mbias and plot the results, indicating the region of each read that + should be included in methylation extraction. This script also prints these + suggestions in the format used by bison_methylation_extractor, for + convenience. + + * The methylation extractor can now be told to only include certain regions + of each read in the output methylation metrics. This is needed when there + is apparent bias in the methylation at one or both ends of a read. + + * Previously, the recalculated MAPQ was incorrect when only 1 read in a pair + had a valid secondary alignment. This has been fixed. + + * Fixed another MAPQ recalculation bug, affecting reads with MAPQ 2 that + have MAPQ=6. + + * Fixed a bug in writing unmapped reads. 
+ + * Fixed a bug in bison_herd that allowed early termination without warning. + +###0.2.0 + * Added a note to the methylation summary statistics output at the end of a + run that the numbers will include double counting of any site covered by + both mates in a pair. These metrics are only meant for general information + and not further analysis, so I don't consider that a bug (it's actually a + design decision for the sake of performance). + + * --ignore-quals is no longer passed to bowtie2 by default. Specifying this + will marginally decrease both correct and incorrect alignments. It will + also generally decrease the alignment rate. + + * Fixed --unmapped, which are now written to the directory specified by -o + + * --maxins was already 500 by default, so it is no longer set by default. + + * Added bison_herd, see above for usage + + * The methylation extractor now has a -phred option, to exclude methylation + calls from low confidence base-calls. The default threshold is 20. + + * Added a script to convert bedGraph files to a format suitable for BSseq. + + * Fixed a bug in bison_merge_CpGs + + * Both bison and bison_herd now check to ensure that the MPI implementation + actually supports the level of thread support requested (previously, this + was just assumed). + +###0.1.1 + * Fixed a number of minor bugs. + + * Added support for uncompressed fastq files, as well as bzipped files + (previously, only gzipped fastq files worked properly). + + * --score-min is now parsed by bison prior to being sent to bowtie2, + read MAPQ scores are recalculated accordingly by the same algorithm + used by bowtie2 (N.B., this only bears a vague correspondence to + -10*log10(probability the mapping position is wrong)!). + + * Added a bison_mbias function, to process the aligned BAM file and + create a text file containing the percentage of methylated C's as a + function of read position. 
#Agresti-Coull confidence interval
#
#  df:    data frame with (at least) the columns nMethylated and nUnmethylated
#  which: 0 returns the lower bound of the interval, anything else the upper
#
#The critical value is derived from the script-level min_p, which is looked up
#when the function is called. One bound is returned per row of df.
CI <- function(df, which = 0) {
    z <- qnorm(1-0.5*min_p)
    z_sq <- z*z
    #Adjusted number of trials and adjusted proportion
    n_adj <- (df$nMethylated + df$nUnmethylated) + z_sq
    p_adj <- (1/n_adj)*(df$nMethylated+0.5*z_sq)
    half_width <- z*sqrt((p_adj/n_adj)*(1-p_adj))
    if(which == 0) {
        return(p_adj - half_width)
    }
    p_adj + half_width
}
#Print the command-line help text
usage <- function() {
    cat("Usage: bison_mbias2pdf [OPTIONS] output_from_bison_mbias.txt

    Given the output of bison_mbias, graph it in R and estimate what regions to
    ignore when extracting methylation. The graph includes the average
    methylation level at each position as well as the (by default) 99.9%
    confidence intervals, which are semi-transparent. Changing the -p value
    changes the confidence intervals accordingly. Vertical lines may be drawn at
    the bounds of the region suggested for inclusion in the methylation metrics.
    The appropriate options for the methylation extractor are then printed to
    the screen, for convenience.

    -5      The 5' most bound on the + strand of the region for calculating the
            baseline methylation level. The default is 0.2.

    -3      The 3' most bound on the + strand of the region for calculating the
            baseline methylation level. The default is 0.8.

    -m      Minimum difference from expected methylation level to suggest trimming a
            base. The default is 0.01 (i.e., 1%). Without a minimum, the script
            would output spurious results when minimally biased data is processed.

    -p      Minimum p-value for the test of whether a position's methylation is
            different from expected. The default is 0.001.

    -png    Write output to multiple PNG files instead of to PDF.

    -h      Print this message.
")
}

#defaults
left = 0.2          #-5: start of the baseline window, as a fraction of the read length
right = 0.8         #-3: end of the baseline window, as a fraction of the read length
do_png = 0          #-png: 1 writes one PNG per strand, 0 writes a single PDF
f = NULL            #input file (first non-option argument)
min_percent = 0.01  #-m
min_p = 0.001       #-p, also read by CI()
cmd = "" #This will hold the options for the methylation extractor

#Parse the command line. Options taking a value consume the following argument.
args = commandArgs(trailingOnly=T)
i=1
while(i<=length(args)) {
    if(args[i] == "-5") {
        i = i+1
        left = as.numeric(args[i])
    } else if(args[i] == "-3") {
        i = i+1
        right = as.numeric(args[i])
    } else if(args[i] == "-m") {
        i = i+1
        min_percent = as.numeric(args[i])
    } else if(args[i] == "-p") {
        i = i+1
        min_p = as.numeric(args[i])
    } else if(args[i] == "-png") {
        do_png=1
    } else if(args[i] == "-h") {
        usage()
        stop()
    } else if(is.null(f)) {
        #Only the first non-option argument is used as the input file
        f = args[i]
    }
    i = i+1
}

#Sanitise the numeric options; out-of-range p-values silently fall back to the default
if(min_p >= 1) {
    min_p = 0.001
} else if(min_p < 0) {
    min_p = 0.001
}
if(min_percent > 1) {
    #A value above 1 is taken to be a percentage
    min_percent = min_percent/100
    cat(sprintf("-m reset to %f since the original value was > 1!\n", min_percent))
} else if(min_percent < 0) {
    cat(sprintf("-m reset to 0.01 since you specified negative methylation, which makes no sense\n"))
    min_percent = 0.01
}

if(is.null(f)) {
    usage()
    stop()
} else {
    d <- read.delim(f, header=T)
    #Fix some of the columns
    d$Read <- factor(d$Read)
    #read.delim() only yields a factor here when strings are auto-converted
    #(no longer the default as of R 4.0.0). relevel() rejects character vectors
    #and levels() of one is NULL, so convert explicitly; a no-op for a factor.
    d$Strand <- relevel(factor(d$Strand), "OT")
    #Calculate methylation
    d$Methylation <- d$nMethylated/(d$nMethylated+d$nUnmethylated)
    #Upper/Lower Confidence Interval
    d$UpperCI <- CI(d, 1)
    d$LowerCI <- CI(d, 0)

    #Determine the output prefix
    prefix = sub("_mbias.txt", "", f)

    if(do_png == 0) {
        cat(sprintf("Output will be written to %s_mbias.pdf\n",prefix))
        pdf(file=sprintf("%s_mbias.pdf", prefix))
    }
    #One plot (and one set of suggested bounds) per strand
    for(lev in levels(d$Strand)) {
        if(do_png == 1) {
            cat(sprintf("Output will be written to %s_%s_mbias.png\n", prefix, lev))
            png(filename=sprintf("%s_%s_mbias.png", prefix, lev))
        }
        #Calculate the cutoffs
        cutoff_inters = c()
        cutoff_types = c()
        cutoff_cols = factor(c(), levels=c("1","2"))
        #read 1
        USE <- intersect(which(d$Strand==lev), which(d$Read == 1))
        #Baseline window: positions between left and right fractions of the maximum position
        lower <- floor(left * max(d$Position[USE]))
        upper <- ceiling(right * max(d$Position[USE]))
        USE2 <- intersect(USE, intersect(which(d$Position >= lower), which(d$Position <= upper)))
        #av = (mean baseline methylation, lowest lower CI, highest upper CI) within the window
        av <- c(mean(d$Methylation[USE2]), min(d$LowerCI[USE2]), max(d$UpperCI[USE2]))
        read1_5 <- 0
        read1_3 <- 0
        #Significantly below
        to_remove <- intersect(which(d$UpperCI[USE] < av[1]), which(d$Methylation[USE] < av[2]))
        #Significantly above
        to_remove <- append(to_remove, intersect(which(d$LowerCI[USE] > av[1]), which(d$Methylation[USE] > av[3])))
        to_remove <- unique(to_remove)
        #Difference threshold
        to_remove <- intersect(to_remove, which(abs(d$Methylation[USE] - av[1]) > min_percent))
        midway = floor(0.5*max(d$Position[USE]))
        #5'
        if(any(d$Position[USE][to_remove] < midway)) {
            #First position after the last flagged one in the 5' half
            read1_5 = d$Position[USE][max(to_remove[which(d$Position[USE][to_remove] < midway)])]+1
            cutoff_inters <- append(cutoff_inters, read1_5)
            cutoff_types <- append(cutoff_types, "L1")
            cutoff_cols <- append(cutoff_cols, "1")
        }
        #3'
        if(any(d$Position[USE][to_remove] >= midway)) {
            #Last position before the first flagged one in the 3' half
            read1_3 = d$Position[USE][min(to_remove[which(d$Position[USE][to_remove] >= midway)])]-1
            cutoff_inters <- append(cutoff_inters, read1_3)
            cutoff_types <- append(cutoff_types, "L2")
            cutoff_cols <- append(cutoff_cols, "1")
        }

        #read 2
        USE <- intersect(which(d$Strand==lev), which(d$Read == 2))
        read2_5 <- 0
        read2_3 <- 0
        if(length(USE) > 0) {
            lower <- floor(left * max(d$Position[USE]))
            upper <- ceiling(right * max(d$Position[USE]))
            USE2 <- intersect(USE, intersect(which(d$Position >= lower), which(d$Position <= upper)))
            av <- c(mean(d$Methylation[USE2]), min(d$LowerCI[USE2]), max(d$UpperCI[USE2]))
            #Significantly below
            to_remove <- intersect(which(d$UpperCI[USE] < av[1]), which(d$Methylation[USE] < av[2]))
            #Significantly above
            to_remove <- append(to_remove, intersect(which(d$LowerCI[USE] > av[1]), which(d$Methylation[USE] > av[3])))
            to_remove <- unique(to_remove)
            #Difference threshold
            to_remove <- intersect(to_remove, which(abs(d$Methylation[USE] - av[1]) > min_percent))
            midway = floor(0.5*max(d$Position[USE]))
            #5'
            if(any(d$Position[USE][to_remove] < midway)) {
                read2_5 = d$Position[USE][max(to_remove[which(d$Position[USE][to_remove] < midway)])]+1
                cutoff_inters <- append(cutoff_inters, read2_5)
                cutoff_types <- append(cutoff_types, "L3")
                cutoff_cols <- append(cutoff_cols, "2")
            }
            #3'
            if(any(d$Position[USE][to_remove] >= midway)) {
                read2_3 = d$Position[USE][min(to_remove[which(d$Position[USE][to_remove] >= midway)])]-1
                cutoff_inters <- append(cutoff_inters, read2_3)
                cutoff_types <- append(cutoff_types, "L4")
                cutoff_cols <- append(cutoff_cols, "2")
            }
        }

        #Make the plot
        USE <- which(d$Strand==lev)
        cutoffs <- data.frame(x=cutoff_inters, types=cutoff_types, cols= cutoff_cols)
        g <- ggplot(d[USE,], aes(x=Position, y=Methylation, ymin=max(min(LowerCI), 0), ymax=min(max(UpperCI), 1), group=Read))
        g <- g + geom_ribbon(aes(ymin=LowerCI, ymax=UpperCI, alpha=0.9, fill=Read))
        g <- g + geom_line(aes(colour=Read))
        g <- g + scale_alpha(guide='none')
        g <- g + scale_colour_discrete(guide='none')
        #Only show the read legend when two distinct values occur in column 2
        #NOTE(review): column 2 is presumably Read - confirm against the bison_mbias output format
        if(length(unique(d[USE, 2])) == 2) {
            g <- g + scale_fill_discrete(name="Read", labels=c("#1","#2"))
        } else {
            g <- g + scale_fill_discrete(guide='none')
        }
        g <- g + xlab("Position along mapped read (5'->3' of + strand)")
        g <- g + scale_x_continuous(breaks=seq(0, max(d$Position[USE]), 10))
        g <- g + ggtitle(sprintf("%s strand", lev))

        #Add the cutoffs
        if(length(cutoff_inters) > 0) {
            #NOTE(review): newer ggplot2 releases appear to spell this argument
            #show.legend - confirm against the installed version before changing it
            g <- g + geom_vline(data=cutoffs, aes(xintercept=x, colour=cols, linetype=types), show_guide=T)
            g <- g + scale_linetype_discrete(name="Cutoffs", labels=sprintf("%i", cutoffs$x))
            g <- g + guides(fill=guide_legend(override.aes=list(linetype=0)))

            #Accumulate "-<strand> r1_5',r1_3',r2_5',r2_3'"; 0 means no bound was suggested
            cmd <- sprintf("%s-%s %i,%i,%i,%i ", cmd, lev, read1_5, read1_3, read2_5, read2_3)
        }
        print(g)
        if(do_png == 1) {
            suppressMessages(dev.off())
        }
    }
    if(do_png == 0) {
        suppressMessages(dev.off())
    }
    if(cmd != "") {
        cat(sprintf("Suggested methylation extractor parameters: %s\n", cmd))
    }
}
g <- g + scale_alpha(guide='none') + g <- g + scale_colour_discrete(guide='none') + if(length(unique(d[USE, 2])) == 2) { + g <- g + scale_fill_discrete(name="Read", labels=c("#1","#2")) + } else { + g <- g + scale_fill_discrete(guide='none') + } + g <- g + xlab("Position along mapped read (5'->3' of + strand)") + g <- g + scale_x_continuous(breaks=seq(0, max(d$Position[USE]), 10)) + g <- g + ggtitle(sprintf("%s strand", lev)) + + #Add the cutoffs + if(length(cutoff_inters) > 0) { + g <- g + geom_vline(data=cutoffs, aes(xintercept=x, colour=cols, linetype=types), show_guide=T) + g <- g + scale_linetype_discrete(name="Cutoffs", labels=sprintf("%i", cutoffs$x)) + g <- g + guides(fill=guide_legend(override.aes=list(linetype=0))) + + cmd <- sprintf("%s-%s %i,%i,%i,%i ", cmd, lev, read1_5, read1_3, read2_5, read2_3) + } + print(g) + if(do_png == 1) { + suppressMessages(dev.off()) + } + } + if(do_png == 0) { + suppressMessages(dev.off()) + } + if(cmd != "") { + cat(sprintf("Suggested methylation extractor parameters: %s\n", cmd)) + } +} diff --git a/aux.c b/aux.c new file mode 100644 index 0000000..3c5f9f5 --- /dev/null +++ b/aux.c @@ -0,0 +1,201 @@ +#include "bison.h" + +KSTREAM_INIT(gzFile, gzread, 16384) +KHASH_MAP_INIT_STR(ref, uint64_t) +FILE *popen_fd; + +struct __tamFile_t { + gzFile fp; + kstream_t *ks; + kstring_t *str; + uint64_t n_lines; + int is_first; +}; + +/****************************************************************************** +* +* Return the number of nodes what will actually be run (as opposed to the +* number allocated) +* +*******************************************************************************/ +#ifdef DEBUG +int effective_nodes() { + return(8); +} +#else +int effective_nodes() { + int output, remainder; + + MPI_Comm_size(MPI_COMM_WORLD, &output); + --output; //Ignore the master node + + if(config.directional) { + remainder = output % 2; + } else { + remainder = output % 4; + } + output -= remainder; + return(output); +} +#endif + 
+/****************************************************************************** +* +* quit, while performing some cleanup +* +* int FLAG: What to free/close/etc. +* 0x1 things created by create_fastq_names() +* 0x2 things pthreads are closed and bam headers destroyed +* In addition, the master node will free chromosomes.genome, close +* the BAM file, and free everything in the chromosomes struct. +* +* int rv: return value +* +*******************************************************************************/ +void quit(int FLAG, int rv) { + int taskid, i; + + free(config.bowtie2_options); + + MPI_Comm_rank(MPI_COMM_WORLD, &taskid); + + if(FLAG & 1) { //FASTQ filenames set +#ifndef DEBUG + if(taskid == MASTER) { + if(config.FASTQ1CT != NULL) remove(config.FASTQ1CT); + if(config.paired && (config.FASTQ2GA != NULL)) remove(config.FASTQ2GA); + if(!config.directional) { + if(config.FASTQ1GA != NULL) remove(config.FASTQ1GA); + if(config.paired && (config.FASTQ2CT != NULL)) remove(config.FASTQ2CT); + } + } +#endif + if(config.FASTQ1CT != NULL) free(config.FASTQ1CT); + if(config.FASTQ1GA != NULL) free(config.FASTQ1GA); + if(config.unmapped1 != NULL) free(config.unmapped1); + if(config.paired) { + if(config.FASTQ2CT != NULL) free(config.FASTQ2CT); + if(config.FASTQ2GA != NULL) free(config.FASTQ2GA); + if(config.unmapped2 != NULL) free(config.unmapped2); + } + free(config.basename); + free(config.outname); + } + + if(taskid == MASTER) { + free(chromosomes.genome); + for(i=0; ichrom); + free(*(chromosomes.chromosome+i)); + } + free(chromosomes.chromosome); + if(FLAG && OUTPUT_BAM) bam_close(OUTPUT_BAM); + } + MPI_Finalize(); + if(taskid == MASTER && FLAG > 0) { +#ifdef DEBUG + if(fp1) bam_close(fp1); + if(fp2) bam_close(fp2); + if(!config.directional) { + if(fp3) bam_close(fp3); + if(fp4) bam_close(fp4); + } +#else + if(config.unmapped) { + pclose(unmapped1); + if(config.paired) pclose(unmapped2); + } +#endif + } + exit(rv); +} + +void print_metrics() { + char *of = 
malloc(sizeof(char) * (strlen(config.odir)+5+strlen(config.basename))); + FILE *fp; + unsigned long long m_reads = m_reads_OT + m_reads_OB + m_reads_CTOT + m_reads_CTOB; + sprintf(of, "%s%s.txt", config.odir, config.basename); + fp = fopen(of, "w"); + + if(!config.quiet) printf("Alignment:\n"); + fprintf(fp,"Alignment:\n"); + if(config.paired) { + if(!config.quiet) { + printf("\t%llu total paired-end reads analysed\n", t_reads); + printf("\t%llu paired-end reads mapped (%6.2f%%).\n", m_reads, ((float) (100*m_reads))/((float) t_reads)); + printf("\n"); + } + fprintf(fp, "\t%llu total paired-end reads analysed\n", t_reads); + fprintf(fp, "\t%llu paired-end reads mapped (%6.2f%%).\n", m_reads, ((float) (100*m_reads))/((float) t_reads)); + fprintf(fp, "\n"); + } else { + if(!config.quiet) { + printf("\t%llu total reads analysed\n", t_reads); + printf("\t%llu reads mapped (%6.2f%%).\n", m_reads, ((float) (100*m_reads))/((float) t_reads)); + printf("\n"); + } + fprintf(fp,"\t%llu total reads analysed\n", t_reads); + fprintf(fp,"\t%llu reads mapped (%6.2f%%).\n", m_reads, ((float) (100*m_reads))/((float) t_reads)); + fprintf(fp,"\n"); + } + if(!config.quiet) { + printf("Number of hits aligning to each of the orientations:\n"); + printf("\t%llu\t%6.2f%%\tOT (original top strand)\n", m_reads_OT, ((float) (100*m_reads_OT))/((float) t_reads)); + printf("\t%llu\t%6.2f%%\tOB (original bottom strand)\n", m_reads_OB, ((float) (100*m_reads_OB))/((float) t_reads)); + if(!config.directional) printf("\t%llu\t%6.2f%%\tCTOT (complementary to the original top strand)\n", m_reads_CTOT, ((float) (100*m_reads_CTOT))/((float) t_reads)); + if(!config.directional) printf("\t%llu\t%6.2f%%\tCTOB (complementary to the original bottom strand)\n", m_reads_CTOB, ((float) (100*m_reads_CTOB))/((float) t_reads)); + printf("\n"); + printf("Cytosine Methylation (N.B., statistics from overlapping mates are added together!):\n"); + printf("\tNumber of C's in a CpG context: %llu\n", t_CpG); + 
printf("\tPercentage of methylated C's in a CpG context: %6.2f%%\n", ((float) (100*m_CpG))/((float) t_CpG)); + printf("\tNumber of C's in a CHG context: %llu\n", t_CHG); + printf("\tPercentage of methylated C's in a CHG context: %6.2f%%\n", ((float) (100*m_CHG))/((float) t_CHG)); + printf("\tNumber of C's in a CHH context: %llu\n", t_CHH); + printf("\tPercentage of methylated C's in a CHH context: %6.2f%%\n", ((float) (100*m_CHH))/((float) t_CHH)); + } + fprintf(fp,"Number of hits aligning to each of the orientations:\n"); + fprintf(fp,"\t%llu\t%6.2f%%\tOT (original top strand)\n", m_reads_OT, ((float) (100*m_reads_OT))/((float) t_reads)); + fprintf(fp,"\t%llu\t%6.2f%%\tOB (original bottom strand)\n", m_reads_OB, ((float) (100*m_reads_OB))/((float) t_reads)); + if(!config.directional) fprintf(fp,"\t%llu\t%6.2f%%\tCTOT (complementary to the original top strand)\n", m_reads_CTOT, ((float) (100*m_reads_CTOT))/((float) t_reads)); + if(!config.directional) fprintf(fp,"\t%llu\t%6.2f%%\tCTOB (complementary to the original bottom strand)\n", m_reads_CTOB, ((float) (100*m_reads_CTOB))/((float) t_reads)); + fprintf(fp,"\n"); + fprintf(fp,"Cytosine Methylation (N.B., statistics from overlapping mates are added together!):\n"); + fprintf(fp,"\tNumber of C's in a CpG context: %llu\n", t_CpG); + fprintf(fp,"\tPercentage of methylated C's in a CpG context: %6.2f%%\n", ((float) (100*m_CpG))/((float) t_CpG)); + fprintf(fp,"\tNumber of C's in a CHG context: %llu\n", t_CHG); + fprintf(fp,"\tPercentage of methylated C's in a CHG context: %6.2f%%\n", ((float) (100*m_CHG))/((float) t_CHG)); + fprintf(fp,"\tNumber of C's in a CHH context: %llu\n", t_CHH); + fprintf(fp,"\tPercentage of methylated C's in a CHH context: %6.2f%%\n", ((float) (100*m_CHH))/((float) t_CHH)); + + fclose(fp); + free(of); +} + +tamFile sam_popen(char *cmd) { + tamFile fp = calloc(1, sizeof(struct __tamFile_t)); + gzFile gzfp; + int fid, fid2; + popen_fd = popen(cmd, "r"); //Global + + if(popen_fd == NULL) return 
0; + fid = fileno(popen_fd); + fid2 = dup(fid); //otherwise, the file descriptor is closed by zlib and pclose() won't work!! + gzfp = gzdopen(fid2, "r"); + fp->str = (kstring_t*) calloc(1, sizeof(kstring_t)); + fp->fp = gzfp; + fp->ks = ks_init(fp->fp); + fp->n_lines = 0; + fp->is_first = 1; + return fp; +} + +void sam_pclose(tamFile fp) { + if(fp) { + ks_destroy(fp->ks); + gzclose(fp->fp); + pclose(popen_fd); //global + free(fp->str->s); + free(fp->str); + free(fp); + } +} diff --git a/auxiliary/CpG_coverage.c b/auxiliary/CpG_coverage.c new file mode 100644 index 0000000..c071751 --- /dev/null +++ b/auxiliary/CpG_coverage.c @@ -0,0 +1,119 @@ +#include "../bison.h" +#include "sam.h" + +//This will hold the coverage. The last bin is actually for everything >250 +unsigned long long coverage[252]; +struct { + char *chrom; + unsigned long long position; + unsigned long long end; + unsigned long coverage; +} cur_line; + +void next_line(FILE *fp, char *buffer) { + if(fgets(buffer, 1024, fp) != NULL) { + cur_line.chrom = strtok(buffer, "\t"); + cur_line.position = strtoull(strtok(NULL, "\t"), NULL, 10); + cur_line.end = strtoull(strtok(NULL, "\t"), NULL, 10); + strtok(NULL, "\t"); + cur_line.coverage = strtoul(strtok(NULL, "\t"), NULL, 10); + cur_line.coverage += strtoul(strtok(NULL, "\n"), NULL, 10); + } +} + +void usage(char *prog) { + printf("Usage: %s genome_directory input.bedGraph output.txt\n", prog); + printf("\n\ + Calculate a histogram of per-CpG coverage. 
N.B., the genome and bedGraph\n\ + file need to be in the same order (they will be if the bedGraph file was\n\ + produced with bison and the same genome is used).\n\ +\n\ + -h Print this message.\n\ +\n"); +} + +int main(int argc, char *argv[]) { + FILE *fp = NULL; + FILE *ofile = NULL; + int32_t i = 0; + uint32_t j = 0; + char *GenomeChrom = NULL; + unsigned long temp_coverage = 0; + char *line = malloc(sizeof(char) * 1024); + unsigned long long k; + unsigned long long nCpGs = 0; + + config.genome_dir = NULL; + chromosomes.nchromosomes = 0; + + /* read in the file names */ + if(argc < 4 || strcmp(argv[1], "-h") == 0) { + usage(argv[0]); + return 1; + }; + config.genome_dir = argv[1]; + fp = fopen(argv[2], "r"); + ofile = fopen(argv[3], "w"); + + for(i=0; i<252; i++) coverage[i] = 0; + + //Read in the genome + chromosomes.max_genome = 3000000000; + printf("Allocating space for %llu characters\n", chromosomes.max_genome); fflush(stdout); + chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome); + *chromosomes.genome = '\0'; + if(chromosomes.genome == NULL) { + printf("Could not allocate enough room to hold the genome!\n"); + return -1; + } + read_genome(); + + //Start reading in the file + next_line(fp, line); + + //Iterate through the genome + for(i=0; ichrom; + j = chromosomes.chromosome[i]->offset; + k = 0; //0-based chromosome position + while(j < chromosomes.chromosome[i]->length - 1) { + if(*(chromosomes.genome+j) == 'C' && *(chromosomes.genome+j+1) == 'G') { + nCpGs++; + while(strcmp(cur_line.chrom, GenomeChrom) == 0 && k > cur_line.position) next_line(fp, line); //We should never go beyond 1 line... 
+ if(strcmp(cur_line.chrom, GenomeChrom) == 0 && (k == cur_line.position || k == cur_line.position-1)) { + temp_coverage = cur_line.coverage; + if(cur_line.end-cur_line.position == 1) { //Single-C resolution rather than merged as CpGs + next_line(fp, line); + if(strcmp(cur_line.chrom, GenomeChrom) == 0 && k == cur_line.position-1) { + temp_coverage += cur_line.coverage; + } + } + if(temp_coverage > 250) temp_coverage = 251; + coverage[temp_coverage]++; + } else { + coverage[0]++; + } + } + j++; + k++; + } + } + + //Print some output + for(i=0; i<251; i++) fprintf(ofile, "%i\t%llu\n", i, coverage[i]); + fprintf(ofile, "251+\t%llu\n", coverage[251]); + printf("There were %llu CpGs\n", nCpGs); + + //Close things up + free(line); + free(chromosomes.genome); + for(i=0; ichrom); + free(*(chromosomes.chromosome+i)); + } + free(chromosomes.chromosome); + fclose(fp); + fclose(ofile); + + return 0; +}; diff --git a/auxiliary/bedGraph2BSseq.py b/auxiliary/bedGraph2BSseq.py new file mode 100755 index 0000000..3edc50d --- /dev/null +++ b/auxiliary/bedGraph2BSseq.py @@ -0,0 +1,115 @@ +#!/usr/bin/python +import argparse +import csv +import sys + +parser = argparse.ArgumentParser(description='Convert a series of bedGraph files into input files appropriate for BSseq.') +parser.add_argument('-chr', metavar='chromosome', help="Only output this chromosome (e.g. chr17) instead of all of them.") +parser.add_argument('prefix', metavar='prefix', help="Output prefix") +parser.add_argument('files', metavar='files', nargs='*', help="Input bedGraph files. 
There must be at least 2.") +args = parser.parse_args() + +if((args.prefix == None) or (args.files == None) or (len(args.files) < 2)) : + parser.print_help() + sys.exit() + +files = [] +for f in args.files : + files.append(csv.reader(open(f, "r"), dialect="excel-tab")) +ofM = open("%s.M" % (args.prefix), "w") +ofCov = open("%s.Cov" % (args.prefix), "w") +ofbed = open("%s.bed" % (args.prefix), "w") + +lines = [] +for f in files : + line = f.next() + lines.append([line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])]) + +#Add a header +first = 1 +for f in args.files : + if(first == 1) : + ofM.write("%s" % f) + ofCov.write("%s" % f) + first = 0 + else : + ofM.write("\t%s" % f) + ofCov.write("\t%s" % f) +ofM.write("\n") +ofCov.write("\n") + +n_finished = 0 +n_total = len(files) +while(n_finished < n_total) : + i = 0 + lowest = 0 + #Determine the appropriate starting point + while(ichrom, chrom) == 0) return i; + } + return chromosomes.nchromosomes; +} + +inline void process_line(char *line, struct CpG *current_line) { + char *col; + + //start + col = strtok(NULL, "\t"); + current_line->start = (int32_t) atoi(col); + + //end + col = strtok(NULL, "\t"); + current_line->end = (int32_t) atoi(col); + + //1000*methylation percentage + col = strtok(NULL, "\t"); + + //n_methylated + col = strtok(NULL, "\t"); + current_line->n_methylated = (int32_t) atoi(col); + + //n_unmethylated + col = strtok(NULL, "\t"); + current_line->n_unmethylated = (int32_t) atoi(col); +} + +int main(int argc, char *argv[]) { + int i, last_tid = 0; + char *fname = NULL, *line = malloc(sizeof(char) * MAXREAD); + char *chrom, *last_chrom = NULL; + char base, strand; + unsigned long long offset; + FILE *of, *ifile; + struct CpG current_line; + + config.genome_dir = NULL; + chromosomes.nchromosomes = 0; + + /* read in the file names */ + if(argc < 3) { + usage(argv[0]); + return 0; + }; + for(i=1; ioffset; + base = toupper(*(chromosomes.genome+offset+current_line.start)); + strand='R'; + 
if(base=='C') strand='F'; + fprintf(of, "%s.%i\t%s\t%i\t%c\t%i\t%5.2f\t%5.2f\n", chrom, current_line.start+1, chrom, current_line.start+1, strand, \ + current_line.n_methylated+current_line.n_unmethylated, \ + 100*((float) current_line.n_methylated)/(float)(current_line.n_methylated + current_line.n_unmethylated), \ + 100*((float) current_line.n_unmethylated)/(float)(current_line.n_methylated + current_line.n_unmethylated)); + } + + //Close things up + free(line); + fclose(of); + fclose(ifile); + free(chromosomes.genome); + for(i=0; ichrom); + free(*(chromosomes.chromosome+i)); + } + free(chromosomes.chromosome); + + return 0; +}; diff --git a/auxiliary/make_reduced_genome.c b/auxiliary/make_reduced_genome.c new file mode 100644 index 0000000..4eea5e1 --- /dev/null +++ b/auxiliary/make_reduced_genome.c @@ -0,0 +1,347 @@ +#include +#include +#include +#define MAXLINE 512 +#define MAXChromosome 400000000 + +void usage(char *prog_name) { + printf("Usage: %s (options) GENOME.FA OUTPUT.FA\n",prog_name); + printf("\t-n X Maximum number of bases in each read (prior to CG/TG/etc. trimming)\n\t\tDefault is 36. N.B. 
10%% more is used to the output.\n"); + printf("\t-TaqI Create a reduced representation genome that was cut by TaqI as well as MspI.\n"); + printf("\t-h Print this message\n"); + return; +} + +void output_fragment(FILE *of, char *fragment) { + fprintf(of,"%s",fragment); + return; +} + +unsigned long get_left_mask(char *f, int read_size) { + unsigned long output = 0, i = 0, max_len = strlen(f)-1; + + while(i < read_size) { + if(*(f+output) != '\n') { + i++; + } + output++; + if(output >= max_len) { + output = max_len; + break; + } + } + + return output; +} + +unsigned long get_right_mask(char *f, int read_size) { + unsigned long output = strlen(f)-1, i = 0; + + while(i < read_size) { + if(*(f+output) != '\n') { + i++; + } + output--; + if(output <= 0) { + output = 0; + break; + } + } + + return output; +} + +void process_fragment(FILE *of, char *fragment, int read_size) { + char *fp = fragment; + unsigned long i = 0, left_mask, right_mask; + + //Determine the masking coordinates + if(strlen(fragment)-1 <= read_size) { + output_fragment(of, fragment); + return; + } + left_mask = get_left_mask(fragment, read_size); + right_mask = get_right_mask(fragment, read_size); + + if(left_mask < right_mask) { + for(i=0; i <= right_mask; i++) { + if(i>=left_mask) { + if(*fp != '\n') { + *fp = 'N'; + } + } + fp++; + } + } + output_fragment(of, fragment); + + return; +} + +void process_chromosome(FILE *of, char *chrom, int read_size, int Taq) { + char *fragment = malloc(MAXChromosome * sizeof(char)); + char *cp = chrom, *fp = fragment; + + while(*cp != '\0') { + //Are we at a TaqI site? Do we even care? 
+ if(*cp == 'T' && Taq) { + if(strncmp(cp,"TCGA", 4) == 0) { + //Add on the last T and a null + *fp = 'T'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = 'C'; + fp++; + *fp = 'G'; + fp++; + *fp = 'A'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 4; + continue; + } else if(strncmp(cp,"T\nCGA", 5) == 0) { + //Add on the last T and a null + *fp = 'T'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = '\n'; + fp++; + *fp = 'C'; + fp++; + *fp = 'G'; + fp++; + *fp = 'A'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 5; + continue; + } else if(strncmp(cp,"TC\nGA", 5) == 0) { + //Add on the last T and a null + *fp = 'T'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = 'C'; + fp++; + *fp = '\n'; + fp++; + *fp = 'G'; + fp++; + *fp = 'A'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 5; + continue; + } else if(strncmp(cp,"TCG\nA", 5) == 0) { + //Add on the last T and a null + *fp = 'T'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = 'C'; + fp++; + *fp = 'G'; + fp++; + *fp = '\n'; + fp++; + *fp = 'A'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 5; + continue; + } + } else if(*cp == 'C') { //MspI site + if(strncmp(cp,"CCGG", 4) == 0) { + //Add on the last T and a null + *fp = 'C'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = 'C'; + fp++; + *fp = 'G'; + 
fp++; + *fp = 'G'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 4; + continue; + } else if(strncmp(cp,"C\nCGG", 5) == 0) { + //Add on the last T and a null + *fp = 'C'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = '\n'; + fp++; + *fp = 'C'; + fp++; + *fp = 'G'; + fp++; + *fp = 'G'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 5; + continue; + } else if(strncmp(cp,"CC\nGG", 5) == 0) { + //Add on the last T and a null + *fp = 'C'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = 'C'; + fp++; + *fp = '\n'; + fp++; + *fp = 'G'; + fp++; + *fp = 'G'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 5; + continue; + } else if(strncmp(cp,"CCG\nG", 5) == 0) { + //Add on the last T and a null + *fp = 'C'; + *(++fp) = '\0'; + process_fragment(of,fragment,read_size); + + fp = fragment; //Move the pointer back to the front of the fragment + //Start the next fragment + *fp = 'C'; + fp++; + *fp = 'G'; + fp++; + *fp = '\n'; + fp++; + *fp = 'G'; + fp++; + + //Move the chromosome pointer past the cut site and loop + cp += 5; + continue; + } + } + + //We are not at a cut site + *fp = *cp; + fp++; + cp++; + } + + //Don't forget the last fragment! + *fp = '\0'; + process_fragment(of,fragment,read_size); + + return; +} + +int main(int argc, char *argv[]) { + int Taq = 0; + int i = 1; + int read_size = 36; //Set by -n, bp maximum in each read and, therefore number of bases on each end of a fragment to print. 
+ char *infile = NULL, *outfile = NULL; + FILE *f = NULL, *of = NULL; + char *chrom_sequence = malloc(MAXChromosome * sizeof(char)), *line = malloc(MAXLINE * sizeof(char)); + char *p = chrom_sequence; + + if(argc < 3) { + usage(argv[0]); + return 1; + } + + //Parse the input + while(i') { + if(chrom_sequence != p) { + *p = '\0'; //Ensure that we end in a null + process_chromosome(of, chrom_sequence, read_size, Taq); + } + fprintf(of,"%s",line); + p = chrom_sequence; + } else { + for(i = 0; ichrom, chrom) == 0) return i; + } + return chromosomes.nchromosomes; +} + +inline void process_line(char *line, struct CpG *current_line) { + char *col; + + //start + col = strtok(NULL, "\t"); + current_line->start = (int32_t) atoi(col); + + //end + col = strtok(NULL, "\t"); + current_line->end = (int32_t) atoi(col); + + //1000*methylation percentage + col = strtok(NULL, "\t"); + + //n_methylated + col = strtok(NULL, "\t"); + current_line->n_methylated = (int32_t) atoi(col); + + //n_unmethylated + col = strtok(NULL, "\t"); + current_line->n_unmethylated = (int32_t) atoi(col); +} + +int main(int argc, char *argv[]) { + int i, last_tid = 0, mpercent; + char *fname = NULL, *line = malloc(sizeof(char) * MAXREAD); + char *chrom, *last_chrom = NULL; + char base; + unsigned long long offset; + FILE *of, *ifile; + struct CpG current_line, last_line; + + last_line.tid = -1; //This will mean that the last line has been written + last_line.start = 0; + last_line.end = 0; + last_line.n_methylated = 0; + last_line.n_unmethylated = 0; + config.genome_dir = NULL; + chromosomes.nchromosomes = 0; + + /* read in the file names */ + if(argc < 3) { + usage(argv[0]); + return 0; + }; + for(i=1; ioffset; + base = toupper(*(chromosomes.genome+offset+last_line.start)); + if(base == 'C') { //Yes + last_line.end++; + last_line.n_methylated += current_line.n_methylated; + last_line.n_unmethylated += current_line.n_unmethylated; + mpercent = (int) (1000 * ((float) 
last_line.n_methylated)/(float)(last_line.n_methylated + last_line.n_unmethylated)); + fprintf(of, "%s\t%i\t%i\t%i\t%i\t%i\n", chromosomes.chromosome[last_tid]->chrom, last_line.start, \ + last_line.end, mpercent, last_line.n_methylated, last_line.n_unmethylated); + last_line.tid = -1; + } else { //No + last_line.start--; + mpercent = (int) (1000 * ((float) last_line.n_methylated)/(float)(last_line.n_methylated + last_line.n_unmethylated)); + fprintf(of, "%s\t%i\t%i\t%i\t%i\t%i\n", chromosomes.chromosome[last_tid]->chrom, last_line.start, \ + last_line.end, mpercent, last_line.n_methylated, last_line.n_unmethylated); + last_line.tid = current_line.tid; + last_line.start = current_line.start; + last_line.end = current_line.end; + last_line.n_methylated = current_line.n_methylated; + last_line.n_unmethylated = current_line.n_unmethylated; + } + } else { + if(last_line.tid != -1) { + offset = chromosomes.chromosome[last_line.tid]->offset; + base = toupper(*(chromosomes.genome+offset+last_line.start)); + if(base == 'C') { //Yes + last_line.end++; + } else { + last_line.start--; + } + mpercent = (int) (1000 * ((float) last_line.n_methylated)/(float)(last_line.n_methylated + last_line.n_unmethylated)); + fprintf(of, "%s\t%i\t%i\t%i\t%i\t%i\n", chromosomes.chromosome[last_line.tid]->chrom, last_line.start, \ + last_line.end, mpercent, last_line.n_methylated, last_line.n_unmethylated); + } + last_line.tid = current_line.tid; + last_line.start = current_line.start; + last_line.end = current_line.end; + last_line.n_methylated = current_line.n_methylated; + last_line.n_unmethylated = current_line.n_unmethylated; + } + } + //Attend to a possible remnant line + if(last_line.tid != -1) { + offset = chromosomes.chromosome[last_tid]->offset; + base = toupper(*(chromosomes.genome+offset+last_line.start)); + if(base == 'C') { //Yes + last_line.end++; + } else { + last_line.start--; + } + mpercent = (int) (1000 * ((float) last_line.n_methylated)/(float)(last_line.n_methylated + 
last_line.n_unmethylated)); + fprintf(of, "%s\t%i\t%i\t%i\t%i\t%i\n", chromosomes.chromosome[last_tid]->chrom, last_line.start, \ + last_line.end, mpercent, last_line.n_methylated, last_line.n_unmethylated); + } + + //Close things up + free(line); + fclose(of); + fclose(ifile); + free(chromosomes.genome); + for(i=0; ichrom); + free(*(chromosomes.chromosome+i)); + } + free(chromosomes.chromosome); + + return 0; +}; diff --git a/auxiliary/merge_bedGraphs.py b/auxiliary/merge_bedGraphs.py new file mode 100755 index 0000000..356732d --- /dev/null +++ b/auxiliary/merge_bedGraphs.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +import argparse +import csv +import sys + +parser = argparse.ArgumentParser(description='Merge a number of bedGraph files from the bison methylation extractor') +parser.add_argument('outfile', metavar='outfile', help="Output bedGraph files") +parser.add_argument('files', metavar='files', nargs='*', help="Input bedGraph files. There must be at least 2.") +args = parser.parse_args() + +if((args.outfile == None) or (args.files == None) or (len(args.files) < 2)) : + parser.print_help() + sys.exit() + +files = [] +for f in args.files : + if(f != args.outfile) : + files.append(csv.reader(open(f, "r"), dialect="excel-tab")) +of = open(args.outfile, "w") + +lines = [] +for f in files : + line = f.next() + lines.append([line[0],int(line[1]), int(line[2]), int(line[4]), int(line[5])]) + +n_finished = 0 +n_total = len(files) +while(n_finished < n_total) : + i = 0 + lowest = 0 + #Determine the appropriate starting point + while(i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXREAD 1024 +#define MASTER 0 +#define VERSION "0.2.4" +#define BT2BUF_SZ 256 * 1024 +#define THROTTLE_CHECK_INTERVAL 100000 //When bison_herd auto-throttles, this specifies how frequently it should check whether it should do so (units are "reads") 
+#define version() printf("Bison, version %s\n", VERSION) + +/****************************************** +* +* MPI Send/Recv tags: +* +* 0: Workers should start +* 1: Header size (this could be removed) +* 2: packed header struct +* 3: Packed fastq struct +* 4: Unused (used to be packed read size) +* 5: Packed read +* +******************************************/ + +//Mutexes for thread i/o designation. A thread should not read/write until it's ID number is equal to these +FILE *zip1; +FILE *zip2; +FILE *unmapped1; +FILE *unmapped2; +bamFile OUTPUT_BAM; +unsigned long long t_reads; //total number of reads +unsigned long long m_reads_OT; //total number mapped to the OT strand +unsigned long long m_reads_OB; +unsigned long long m_reads_CTOT; +unsigned long long m_reads_CTOB; +unsigned long long t_CpG; //Total CpGs +unsigned long long m_CpG; //Methylated CpGs +unsigned long long t_CHG; +unsigned long long m_CHG; +unsigned long long t_CHH; +unsigned long long m_CHH; + +//This is useful for single-node debugging +#ifdef DEBUG +int global_debug_taskid; +bamFile fp1; +bamFile fp2; +bamFile fp3; +bamFile fp4; +#endif + +//Some people may find it useful to have the system throttle itself so as not to overwhelm the MPI buffer +unsigned long long *nwritten; + +//Mutex for controlling access to the global metrics struct and the output files +pthread_mutex_t metrics_mutex; + +typedef struct { + char *chrom; + unsigned long long offset; + unsigned long long length; +} chromosome_struct; + +typedef struct { + int nchromosomes; + unsigned long long max_genome; //The offset value in the read_genome() function will be used to keep track of how close we are + chromosome_struct **chromosome; + char *genome; //This will hold the genomic sequence in memory as a continuous string. 
+} chromosomes_struct; + +typedef struct { + char *FASTQ1; + char *FASTQ2; + char *FASTQ1CT; + char *FASTQ1GA; + char *FASTQ2CT; + char *FASTQ2GA; + char *unmapped1; + char *unmapped2; + char *genome_dir; + char *basename; + char *odir; + char *tmpdir; + char *bowtie2_options; + char *outname; + char scoremin_type; + int paired; + int directional; + int nthreads; + int nmthreads; + int buffer_size; + int send_receive_buffer_size; + int unmapped; + int mode; //0 is --end-to-end (default), 1 is local + int quiet; //0 or 1, the latter supresses all output to the screen + int reorder; //0 or 1, latter reorders writing to match input, only meaningful in herd + int n_compression_threads; //Default is 0 +#ifndef NOTHROTTLE + int reads_in_queue; +#endif + float scoremin_intercept; + float scoremin_coef; +} t_config; + +typedef struct { + int size; //Theres an effective size limit imposed by MPI of whatever int is + void *packed; +} MPI_Header; + +typedef struct { + int size; + void *packed; //the format is sizeof(bam1_t) followed by data, which is of size data_len +} MPI_read; + +typedef struct { + int size; + void *packed; //format is: char *name1\0seq1\0qual1\0 followed by optional char *name2\0seq2\0qual2\0 +} MPI_Fastq; + +typedef struct { + int max_name1; //current maximum length of memory for name1 + int max_seq1; + int max_qual1; + int max_name2; + int max_seq2; + int max_qual2; + char *name1; + char *seq1; + char *qual1; + char *name2; + char *seq2; + char *qual2; +} fastq; + +//This is used as the input struct for slurp_fastq +typedef struct { + int thread_id; + char *fastq1; + char *fastq2; +} slurp_fastq_struct; + +struct packed_struct { + void *packed; //If NULL, then finished + struct packed_struct *next; + struct packed_struct *previous; //Only used on last sentinel struct + int state; //0 no next node (not ready) + //1 has next node (ready) +}; + +//Global values +t_config config; +bam_header_t *global_header; +//This will be the global structure for 
pointers to chromosome_struct's holding the information for *genome +chromosomes_struct chromosomes; +char **fnames1, **fnames2; //This will hold the file names so that the writer thread knows what to rename things +unsigned long long *flengths; //This will hold the size of each file + +//Linked-list of reads +struct packed_struct *node1, *node1_last_sentinel; +struct packed_struct *node2, *node2_last_sentinel; +struct packed_struct *node3, *node3_last_sentinel; +struct packed_struct *node4, *node4_last_sentinel; +//bison-herd +struct packed_struct **nodes, **last_sentinel_node; +struct packed_struct **fastq_nodes, **last_fastq_sentinel_node; +struct packed_struct **to_write_node, **to_write_sentinel_node; + +/****************************************************************************** +* +* Take a fastq struct and convert it G->A, the conversion is in place +* +* fastq *read, input struct +* int which, which of the reads to convert +* +*******************************************************************************/ +void convertGA(fastq *, int); //fastq.c + +/****************************************************************************** +* +* Take a fastq struct and convert it C->T, the conversion is in place +* +* fastq *read, input struct +* int which, which of the reads to convert +* +*******************************************************************************/ +void convertCT(fastq *, int ); //fastq.c + +/****************************************************************************** +* +* Write an unmapped read to a gzipped fastq file. 
+* +* FILE *fp: gzipped fastq file +* bam1_t *read: read to write in fastq format +* +*******************************************************************************/ +void write_unmapped(FILE *, bam1_t *); //fastq.c + +/****************************************************************************** +* +* Read in the fastq file(s) sending the reads to the appropriate nodes and +* also storing the unconverted reads in a linked list on the master node. +* +* This will act as its own thread on the master node. +* +* void *a is unused but required by pthreads +* +*******************************************************************************/ +void * send_store_fastq(void *); + +/****************************************************************************** +* +* Add an element to the end of a linked-list +* +* struct packed_struct *last: last sentinel struct +* void *packed: a packed read +* +*******************************************************************************/ +void add_element(struct packed_struct *, void *); + +/****************************************************************************** +* +* Remove an element from the start of a linked-list +* is_ready(first, 0) must return 1! +* +* struct packed_struct *: first sentinel struct +* +*******************************************************************************/ +void remove_element(struct packed_struct *); //slurp.c +//As above, but the packed component can't have been updated to a bam1_t +void remove_raw_element(struct packed_struct *); //slurp.c + +/****************************************************************************** +* +* Move an element from one linked-list to another. 
+* +* struct packed_struct *source: source linked list +* struct packed-struct *dest: destination sentinel node +* +*******************************************************************************/ +void move_element(struct packed_struct *, struct packed_struct *); + +/****************************************************************************** +* +* Is the first or second element ready? +* +* struct packed_struct *first: first sentinel struct +* int offset: 0 (first element) or 1 (second element) +* +* returns 1 for element ready, or 0 otherwise +* +*******************************************************************************/ +int is_ready(struct packed_struct *, int); //slurp.c + +/****************************************************************************** +* +* Is the linked list finished? +* +* struct packed_struct *: first sentinel struct +* +* returns 1 for finished, 0 otherwise +* +*******************************************************************************/ +int is_finished(struct packed_struct *); //slurp.c + +/****************************************************************************** +* +* Add an elemnt to a node designating that the list is finished +* +* struct packed_struct: last sentinel node +* +*******************************************************************************/ +void add_finished(struct packed_struct *); //slurp.c + +/****************************************************************************** +* +* Initialize a linked list, returning the last sentinel struct +* +* struct packed_struct *: first sentinel struct +* +* returns first sentinel struct +* +*******************************************************************************/ +struct packed_struct *initialize_list(struct packed_struct *); //slurp.c + +/****************************************************************************** +* +* Destroy a linked list of packed_structs +* +* struct packed_struct *first: linked list to destroy +* 
+*******************************************************************************/ +void destroy_list(struct packed_struct *); //slurp.c +//As above, but for lists where ->packed hasn't been converted to a bam1_t +void destroy_raw_list(struct packed_struct *); //slurp.c + +/****************************************************************************** +* +* The MPI receiver thread on the main node +* +* void *: NULL input +* +* returns NULL +* +*******************************************************************************/ +void *slurp(void *); //slurp.c +void *herd_slurp(void *); // herd/slurp.c + +/****************************************************************************** +* +* Construct the output directory name, putting it in config.odir +* +*******************************************************************************/ +void update_odir(); //fastq.c + +/****************************************************************************** +* +* Given the name of a (possibly gzipped) fastq file, return the file name +* with the .fastq.gz, .fq.gz, .fastq, or .fq extension removed. +* +* char *file: filename +* +* CAUTION, THE OUTPUT MUST BE free()d! +* +*******************************************************************************/ +char * get_basename(char *); //fastq.c + +/****************************************************************************** +* +* Invoke the C->T and G->A conversion threads of the fastq files (located in +* the global config structure). +* +* FLAGS: integer bit field denoting the conversions to make +* 0x8 fastq #1 C->T +* 0x4 fastq #1 G->A +* 0x2 fastq #2 C->T +* 0x1 fastq #2 G->A +* +*******************************************************************************/ +void convert_fastq(int, unsigned int); //fastq.c + +/****************************************************************************** +* +* Take the config.FASTQ1 and config.FASTQ2 filenames and use them to generate +* the config.FASTQ1CT... filenames. 
These must subsequently be free()d, which +* is done in the quit() function. +* +* char *f1, config.FASTQ1 +* char *f2, config.FASTQ2 +* these are only really needed if there's more than one input file +* +*******************************************************************************/ +void create_fastq_names(char *, char*); //fastq.c + +/****************************************************************************** +* +* Read in all .fa and .fasta files within config.genome_dir. The sequences +* are concatenated onto chromosomes.genome. The global chromosomes structure +* is modified with each new chromosome. +* +* Note, chromosomes.genome (in fact, all of chromosomes) needs to be free()d +* This is performed by the quit() function. +* +*******************************************************************************/ +void read_genome(); //common.c + +/****************************************************************************** +* +* Print metrics to STDOUT and a file. +* +*******************************************************************************/ +void print_metrics(); //aux.c + +/****************************************************************************** +* +* Return the number of worker nodes that will actually run. +* +*******************************************************************************/ +int effective_nodes(); //aux.c + +/****************************************************************************** +* +* quit, while performing some cleanup +* +* int FLAG: What to free/close/etc. +* 0x1 things created by create_fastq_names() +* 0x2 pthreads are closed and bam headers destroyed +* In addition, the master node will free chromosomes.genome, close +* the BAM file, and free everything in the chromosomes struct. 
+* Also, every node will free config.bowtie2_options +* +* int rv: return value +* +*******************************************************************************/ +void quit(int, int); //aux.c + +/****************************************************************************** +* +* Take a BAM header and pack it into a single contiguous memory block. Store +* the resulting block and its size in an MPI_Header structure. +* +* THE RESULT MUST BE free()d +* +* bam_header_t *header: The header to store +* +*******************************************************************************/ +MPI_Header * pack_header(bam_header_t *); //MPI_packing.c + +/****************************************************************************** +* +* Unpack a header packed into an initialized bam_header_t +* +* bam_header_t *header: The header to unpack into +* void *packed: The packed header +* +*******************************************************************************/ +void unpack_header(bam_header_t *, void *); //MPI_packing.c + +/****************************************************************************** +* +* Take a fastq struct and pack it for shipping +* +* THE RESULT MUST BE free()d eventually +* +* fastq *read: The read(s) to store +* MPI_Fastq *output: the struct into which to pack things +* +*******************************************************************************/ +MPI_Fastq * pack_fastq(fastq *); //MPI_packing.c + +/****************************************************************************** +* +* Unpack a packed fastq struct +* +* THE RESULT MUST BE free()d +* +* fastq *read: The fastq struct to unpack into +* void *packed: The packed structure +* +*******************************************************************************/ +fastq * unpack_fastq(fastq *, void *); //MPI_packing.c + +/****************************************************************************** +* +* Unpack a packed read into an initialized bam1_t read. 
+* +* bam1_t *read: The read to unpack into +* void *packed: The packed read +* +*******************************************************************************/ +bam1_t *unpack_read(bam1_t *, void *); //MPI_packing.c + +/****************************************************************************** +* +* Take a BAM read and pack it into a single contiguous memory block. Store +* the resulting block and its size in an MPI_Read structure. +* +* THE RESULT MUST BE free()d +* +* bam1_t *read: The read to store +* +*******************************************************************************/ +MPI_read * pack_read(bam1_t *, MPI_read *); //MPI_packing.c + +/****************************************************************************** +* +* Extract the next sequence line from a file stream. +* +* char *seq: destination +* FILE *fp: source +* +* THE OUTPUT MUST BE free()d +* This function is affected by the MAXREAD definition, above. If this value is +* less than the longest read, things will break. It would be better to realloc +* as needed. +* +*******************************************************************************/ +void get_seq(char *, FILE *); //genome.c + +/****************************************************************************** +* +* Reverse complement a sequence (in place) +* +* char *seq: the sequence +* +*******************************************************************************/ +void reverse_complement(char *); //common.c + +/****************************************************************************** +* +* Determine the appropriate offset in chromosomes.genome +* +* char *chrom: Chromosome name +* int32_t pos: 0-based position on Chromosome. This is read->core.pos +* +*******************************************************************************/ +unsigned long long genome_offset(char*, int32_t); //common.c + +/****************************************************************************** +* +* Return the length of a given chromosome. 
+* +* char *chrom: the chromosome of interest +* +*******************************************************************************/ +unsigned long long genome_chrom_length(char *); //genome.c + +/****************************************************************************** +* +* Return a pointer to the chromosome name onto which a read maps. +* +* bam1_t *read: The read in question +* +*******************************************************************************/ +char *lookup_chrom(bam1_t *); //common.c + +/****************************************************************************** +* +* Return a base and another 2 bases on one of its sides. This is needed for +* making methylation calls. If this span goes off the edge of a chromosome, +* N's will be used. +* +* unsigned long long offset: from genome_offset +* unsigned long long position: converted read->core.pos +* int change: Direction of the context (- is backwards) +* unsigned long long chrom_length: from genome_chrom_length +* +* The output needs to be free()d +* +*******************************************************************************/ +char* get_genomic_context(unsigned long long, unsigned long long, int, unsigned long long); //genome.c + +/******************************************************************************* +* +* Create a position array to account for any InDels +* This function assumes that the first base is not marked as an InDel or +* clipped in any way. If that occurs then things will break. +* +* The output needs to be free()d +* +*******************************************************************************/ +unsigned long long *calculate_positions(bam1_t *); //common.c + +/******************************************************************************* +* +* The master node function. 
+* +* void *a: Actually an int*, the thread_id +* +*******************************************************************************/ +void * master_processer_thread(void*); //master.c +void * herd_master_processer_thread(void*); //master.c under herd/ + +/****************************************************************************** +* +* Given a set of single-end reads, determine which one, if any, aligns best. +* Then, add the various XM/XX/etc. tags and prepare the read for writing. The +* final read will always be stored in read1. Return the worker node number +* producing the best alignment (or 0). +* +* bam1_t *readN: Unpacked reads from the worker nodes +* char *seq: The unconverted fastq read +* +*******************************************************************************/ +int process_single(bam1_t *, bam1_t *, bam1_t *, bam1_t *, char *); //master.c + +/****************************************************************************** +* +* Like process_single, but for paired_end reads. The bam1_t**s hold the +* buffered reads. 
i denotes the read#1 of interest (read #2 is the next read) +* +*******************************************************************************/ +int process_paired(bam1_t **, bam1_t **, bam1_t **, bam1_t **, char **); //master.c + +/******************************************************************************* +* +* Update a packed read so that it's a proper bam1_t and return a pointer +* +* struct packed_struct *first: first sentinel node +* int offset: Return the read from the first (0) or second (1) element +* +* returns a pointer to a bam1_t read +* +*******************************************************************************/ +bam1_t *update_read(struct packed_struct *, int); //master.c + +/****************************************************************************** +* +* This function will run as its own thread and process the linked lists +* output from the master processor threads, writing them in order to a BAM +* file. This will also write all of the other output (aside from metrics). +* Furthermore, this provides a readout of the current number of reads +* processed. +* +* Output is NULL, as is the input (needed by pthreads). +* +*******************************************************************************/ +void * bam_writer(void *); //writer.c + +/****************************************************************************** +* +* This receives the reads, converts them, and writes them to the FIFO(s) +* +* void *a: a pointer to a struct with the following components: +* +* int thread_id: the thread_id +* char *fastq1: FIFO from which bowtie2 can get read1 +* char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) +* +*******************************************************************************/ +void * slurp_fastq(void *); //worker.c + +/****************************************************************************** +* +* The main worker node function. 
+* +* int thread_id: the thread_id +* +*******************************************************************************/ +void worker_node(int); //worker.c + +/****************************************************************************** +* +* The main worker node function. +* +* int thread_id: the thread_id +* char *fastq1: FIFO from which bowtie2 can get read1 +* char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) +* +*******************************************************************************/ +void herd_worker_node(int, char *, char *); //worker.c under herd/ + +/****************************************************************************** +* +* Open a sam file for reading via popen +* +* char *cmd: The command given to popen, the mode is always "r". +* +*******************************************************************************/ +tamFile sam_popen(char *); //aux.c + +/****************************************************************************** +* +* Close a SAM file that was opened with sam_popen +* +* tamFile fp: The file pointer struct returned from sam_popen +* +*******************************************************************************/ +void sam_pclose(tamFile fp); //aux.c diff --git a/common.c b/common.c new file mode 100644 index 0000000..eea72fd --- /dev/null +++ b/common.c @@ -0,0 +1,174 @@ +#include "bison.h" + +/******************************************************************************* +* +* Create a position array to account for any InDels +* This function assumes that the first base is not marked as an InDel or +* clipped in any way. If that occurs then things will break. 
+* +* The output needs to be free()d +* +*******************************************************************************/ +unsigned long long *calculate_positions(bam1_t *read) { + unsigned long long *positions = malloc(sizeof(unsigned long long) * (size_t)read->core.l_qseq); /* one slot per base of the read (l_qseq entries) */ + int i, j, offset = 0, op, op_len; + uint32_t *CIGAR = bam1_cigar(read); + unsigned int previous_position = (unsigned int) read->core.pos; + + for(i=0; icore.n_cigar; i++) { + op = *(CIGAR+i) & 15; /* op: low 4 bits of the CIGAR entry */ + op_len = (*(CIGAR+i)) >> 4; /* op_len: the bits above them */ + /* NOTE(review): the text between the inner loop header below and the strrchr() call of read_genome() appears to have been lost (the rest of calculate_positions() and the start of read_genome() are missing) - restore it from the repository before applying this hunk */ + for(j=0; jd_name, '.'); + if(p == NULL) continue; + if(strcmp(p, ".fa") == 0 || strcmp(p, ".fasta") == 0) { + //This is a fasta file that we need to read into the genome array and append a chromosome_struct onto chromosomes_struct + fullpath = realloc(fullpath, sizeof(char)*(strlen(config.genome_dir)+strlen(file->d_name)+1)); /* the two names are joined without a separator, so genome_dir presumably ends in '/' - TODO confirm */ + sprintf(fullpath, "%s%s",config.genome_dir,file->d_name); + fp = fopen(fullpath, "r"); /* NOTE(review): fp is handed to fgets() below without a NULL check */ + if(!config.quiet) printf("Reading in %s\n", fullpath); + fflush(stdout); + while(fgets(line, MAXREAD, fp) != NULL) { + end=strlen(line); + if(line[end-1] == '\n') line[end-1] = '\0'; + if(line[0] == '>') { + //Store the length of the previous contig, if there was one + if(chromosome != NULL) { + chromosome->length = length; + } + + //Initialize a new chromosome_struct and lengthen the global chromosomes struct + nchromosomes = ++chromosomes.nchromosomes; + chromosomes.chromosome = realloc(chromosomes.chromosome, sizeof(chromosome_struct*) * nchromosomes); + chromosomes.chromosome[nchromosomes-1] = malloc(sizeof(chromosome_struct)); + chromosome = chromosomes.chromosome[nchromosomes-1]; + chromosome->offset = offset; + p = strchr(line, ' '); + if(p != NULL) *p = '\0'; //If there's anything after the name, ignore it + chromosome->chrom = malloc(sizeof(char)*strlen(line)); /* strlen(line) bytes are enough: the copy below skips the leading '>', which leaves room for the NUL */ + strcpy(chromosome->chrom, (line+1)); //ignore the ">" + length = 0; + chromosome->offset = offset; + } else { + //Ensure that we have enough space in chromosomes.genome + if(offset + 
10000 >= chromosomes.max_genome) { + chromosomes.max_genome += 100000; + chromosomes.genome = realloc(chromosomes.genome, sizeof(char) * chromosomes.max_genome); + g = chromosomes.genome + offset; /* the block may have moved, so the write cursor is recomputed */ + } + offset += end-1; /* NOTE(review): end was measured before the newline was stripped, so end-1 undercounts by one on a final line without '\n' - confirm */ + length += end-1; + /* NOTE(review): the loop below is truncated as well (its condition and body are missing up to "->length") */ + for(i=0; ilength = length; + if(!config.quiet) printf("Finished %s\n", fullpath); + fflush(stdout); + fclose(fp); + } + } + free(line); + free(fullpath); + closedir(dir); +} + +/****************************************************************************** +* +* Reverse complement a sequence (in place) +* +* char *seq: the sequence +* +* A/C/G/T in either case are complemented to upper case T/G/C/A, every other +* character becomes N. A temporary copy of the sequence is made and freed. +* +*******************************************************************************/ +void reverse_complement(char *seq) { + char *tmp = strdup(seq); /* read from the copy while seq is overwritten */ + char current, new; + int i, j; + + for(i=0, j=strlen(tmp)-1; j>=0; i++, j--) { /* i walks seq forwards, j walks the copy backwards */ + current = *(tmp+j); + new = 'N'; + if(current == 'A' || current == 'a') new = 'T'; + if(current == 'T' || current == 't') new = 'A'; + if(current == 'C' || current == 'c') new = 'G'; + if(current == 'G' || current == 'g') new = 'C'; + *(seq+i) = new; + } + free(tmp); +} + +/****************************************************************************** +* +* Determine the appropriate offset in chromosomes.genome +* +* char *chrom: Chromosome name +* int32_t pos: 0-based position on Chromosome. This is read->core.pos +* +* Returns 0 when chrom_offset is never set by the loop; a message is printed +* in that case unless pos is 0. +* +*******************************************************************************/ +unsigned long long genome_offset(char *chrom, int32_t pos) { + int i; + unsigned long long chrom_offset = 0; + /* NOTE(review): the loop header and the start of the comparison below are truncated */ + for(i=0; ichrom, chrom) == 0) { + chrom_offset = chromosomes.chromosome[i]->offset; + chrom_offset += pos; + break; + } + } + + if(chrom_offset == 0 && pos != 0) printf("Unable to calculate the genomic offset for %s:%i!\n", chrom, (int) pos); + return chrom_offset; +} + +/****************************************************************************** +* +* Return a pointer to the chromosome name onto which a read maps. 
+* +* bam1_t *read: The read in question +* +* The pointer refers to global_header->target_name[], it must not be free()d +* +*******************************************************************************/ +inline char *lookup_chrom(bam1_t *read) { + int32_t tid = read->core.tid; + return global_header->target_name[tid]; +} diff --git a/fastq.c b/fastq.c new file mode 100644 index 0000000..e76c96d --- /dev/null +++ b/fastq.c @@ -0,0 +1,385 @@ +#include "bison.h" + +char * reverse_qual(char *qual) { + char *output = malloc(sizeof(char)*(1+strlen(qual))); + int i, j; + /* NOTE(review): the text from the loop condition below to the first calloc() of write_unmapped() appears to have been lost (end of reverse_qual() and the head of write_unmapped() are missing) - restore it from the repository */ + for(i=0, j=strlen(qual)-1; icore.l_qseq, sizeof(char)); + char *qual = calloc(1+read->core.l_qseq, sizeof(char)); + uint8_t b, *seqp = bam1_seq(read), *qualp = bam1_qual(read); + int i; + + for(i=0; icore.l_qseq; i++) { + b = bam1_seqi(seqp, i); + if(b == 1) *(seq+i) = 'A'; + else if(b == 2) *(seq+i) = 'C'; + else if(b == 4) *(seq+i) = 'G'; + else if(b == 8) *(seq+i) = 'T'; + else if(b == 15) *(seq+i) = 'N'; + *(qual+i) = qualp[i] + 33; /* +33: printable quality character */ + } + if(read->core.flag & BAM_FREVERSE) { /* reverse-strand reads are flipped back before writing */ + reverse_complement(seq); + qual = reverse_qual(qual); /* NOTE(review): who frees the old qual depends on the part of reverse_qual() that is not visible here - confirm */ + } + + fprintf(fp, "@%s\n", bam1_qname(read)); + fprintf(fp, "%s\n", seq); + fprintf(fp, "+\n"); + fprintf(fp, "%s\n", qual); + + free(seq); + free(qual); +} + +/****************************************************************************** +* +* Construct the output directory name, putting it in config.odir +* +* If config.odir is unset it becomes the directory part of config.FASTQ1 +* (including the trailing '/'), or "./" when FASTQ1 has no directory part. +* Otherwise a '/' is appended to config.odir if it does not end in one. +* +*******************************************************************************/ +void update_odir() { + char *p, *tmp; + + if(config.odir == NULL) { + tmp = strdup(config.FASTQ1); + p = strrchr(tmp, '/'); + if(p != NULL) { + *(p+1) = '\0'; /* cut right after the last '/' */ + config.odir = tmp; /* ownership of the copy moves to config.odir */ + } else { + free(tmp); /* no directory part: the copy is not used, so release it */ + config.odir = malloc(sizeof(char) * 3); + sprintf(config.odir, "./"); + } + } else { + if(config.odir[strlen(config.odir)-1] != '/') { + config.odir = realloc(config.odir, (strlen(config.odir)+2) * sizeof(char)); /* +2: the '/' and the NUL */ + strcat(config.odir, "/"); + } + } +} + +/****************************************************************************** +* +* Given the name of a (possibly 
gzipped) fastq file, return the file name
+* with the .fastq.gz, .fq.gz, .fastq, or .fq extension removed.
+*
+* char *file: filename; any leading path is removed as well
+*
+* CAUTION, THE OUTPUT MUST BE free()d!
+*
+*******************************************************************************/
+char * get_basename(char *file) {
+    size_t nbytes = strlen(file) + 1;
+    char *name = malloc(sizeof(char) * nbytes);
+    char *dot, *slash;
+
+    //Work on a private copy of the input
+    memcpy(name, file, nbytes);
+
+    //A trailing ".gz" goes first, then whatever extension is left is examined
+    dot = strrchr(name, '.');
+    if(dot != NULL && strcmp(dot, ".gz") == 0) {
+        *dot = '\0';
+        dot = strrchr(name, '.');
+    }
+    if(dot != NULL && (strcmp(dot, ".fastq") == 0 || strcmp(dot, ".fq") == 0)) *dot = '\0';
+
+    //Shift everything after the last '/' (NUL included) to the front
+    slash = strrchr(name, '/');
+    if(slash != NULL) memmove(name, slash + 1, strlen(slash + 1) + 1);
+
+    return name;
+}
+
+/******************************************************************************
+*
+* These functions are executed via pthreads to convert the fastq sequences.
+* +* void *a: pointer to an unsigned int limit (copied on entry) +* +* Each function opens its input through popen() with zcat, bzcat or cat, chosen +* by the file extension, and writes through "gzip > file" pipes. Both return +* NULL. +* +*******************************************************************************/ +void * convert1(void *a) { + char *cmd = malloc(sizeof(char) * (strlen(config.FASTQ1) + 7)); /* +7: the longest prefix, "bzcat ", plus the NUL */ + char *line1 = malloc(MAXREAD*sizeof(char)); + char *line2 = malloc(MAXREAD*sizeof(char)); + FILE *f, *of1, *of2 = NULL; + unsigned long long total = 0; + unsigned int limit = *((unsigned int *) a); + int i; + char *p; + + //Determine how we should read in the file + p = strrchr(config.FASTQ1, '.'); /* NULL when the name has no extension */ + if(p != NULL && (strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0)) { + sprintf(cmd, "zcat %s", config.FASTQ1); + } else if(p != NULL && (strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0)) { + sprintf(cmd, "bzcat %s", config.FASTQ1); + } else { + sprintf(cmd, "cat %s", config.FASTQ1); + } + f = popen(cmd, "r"); + + //CT + cmd = realloc(cmd,sizeof(char) * (strlen(config.FASTQ1CT) + 8)); /* +8: "gzip > " plus the NUL */ + sprintf(cmd, "gzip > %s", config.FASTQ1CT); + of1 = popen(cmd, "w"); + + //GA + if(!config.directional) { + sprintf(cmd, "gzip > %s", config.FASTQ1GA); /* NOTE(review): reuses the buffer sized for FASTQ1CT; assumes the GA name is not longer - confirm */ + of2 = popen(cmd, "w"); + } + + //Iterate through + while(1) { + //Read name + if(fgets(line1, MAXREAD, f) == NULL) break; + total++; + fputs(line1, of1); + if(!config.directional) fputs(line1, of2); + //Sequence + assert(fgets(line1, MAXREAD, f) != NULL); /* NOTE(review): this read disappears when built with NDEBUG */ + if(!config.directional) strcpy(line2, line1); + /* NOTE(review): the rest of the loop body (from the loop condition below up to the comparison against limit) appears to have been lost - restore it from the repository */ + for(i=0; i= limit) break; + } + + if(!config.quiet) printf("%s contained %llu reads\n", config.FASTQ1, total); + pclose(f); + pclose(of1); + if(!config.directional) pclose(of2); + free(cmd); + free(line1); + free(line2); + + return NULL; +} +void * convert2(void *a) { + char *cmd = malloc(sizeof(char) * (strlen(config.FASTQ2) + 7)); /* +7: the longest prefix, "bzcat ", plus the NUL */ + char *line1 = malloc(MAXREAD*sizeof(char)); + char *line2 = malloc(MAXREAD*sizeof(char)); + FILE *f, *of1, *of2 = NULL; + unsigned long long total = 0; + unsigned int limit = *((unsigned int *) a); + int i; + char *p; + + //Determine how we should read in the file + p = strrchr(config.FASTQ2, '.'); /* NULL when the name has no extension */ + if(p != NULL && (strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0)) { + 
sprintf(cmd, "zcat %s", config.FASTQ2); + } else if(p != NULL && (strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0)) { + sprintf(cmd, "bzcat %s", config.FASTQ2); + } else { + sprintf(cmd, "cat %s", config.FASTQ2); + } + f = popen(cmd, "r"); + + //GA + cmd = realloc(cmd, sizeof(char) * (strlen(config.FASTQ2GA) + 8)); /* +8: "gzip > " plus the NUL */ + sprintf(cmd, "gzip > %s", config.FASTQ2GA); + of1 = popen(cmd, "w"); + + //CT + if(!config.directional) { + sprintf(cmd, "gzip > %s", config.FASTQ2CT); /* NOTE(review): reuses the buffer sized for FASTQ2GA; assumes the CT name is not longer - confirm */ + of2 = popen(cmd, "w"); + } + + //Iterate through + while(1) { + //Read name + if(fgets(line1, MAXREAD, f) == NULL) break; + total++; + fputs(line1, of1); + if(!config.directional) fputs(line1, of2); + //Sequence + assert(fgets(line1, MAXREAD, f) != NULL); /* NOTE(review): this read disappears when built with NDEBUG */ + if(!config.directional) strcpy(line2, line1); + /* NOTE(review): the rest of the loop body (from the loop condition below up to the comparison against limit) appears to have been lost - restore it from the repository */ + for(i=0; i= limit) break; + } + + if(!config.quiet) printf("%s contained %llu reads\n", config.FASTQ2, total); + pclose(f); + pclose(of1); + if(!config.directional) pclose(of2); + free(cmd); + free(line1); + free(line2); + + return NULL; +} + +/****************************************************************************** +* +* Invoke the C->T and G->A conversion threads of the fastq files (located in +* the global config structure). 
+*
+* FLAGS: integer bit field denoting the conversions to make
+* 0x8 fastq #1 C->T
+* 0x4 fastq #1 G->A
+* 0x2 fastq #2 C->T
+* 0x1 fastq #2 G->A
+* limit: handed by address to every conversion thread
+*
+* One thread runs convert1; with config.paired a second one runs convert2.
+* The call returns once all of them have been joined.
+*
+*******************************************************************************/
+void convert_fastq(int FLAGS, unsigned int limit) {
+    void *(*workers[2])(void *) = {convert1, convert2};
+    int nthreads = config.paired ? 2 : 1;
+    pthread_t *tids;
+    int k, status;
+
+    //FLAGS only selects which announcements are printed
+    if(!config.quiet) {
+        if(FLAGS & 8) printf("Will C->T convert %s and store the results in %s.\n", config.FASTQ1, config.FASTQ1CT);
+        if(FLAGS & 4) printf("Will G->A convert %s and store the results in %s.\n", config.FASTQ1, config.FASTQ1GA);
+        if(FLAGS & 2) printf("Will C->T convert %s and store the results in %s.\n", config.FASTQ2, config.FASTQ2CT);
+        if(FLAGS & 1) printf("Will G->A convert %s and store the results in %s.\n", config.FASTQ2, config.FASTQ2GA);
+    }
+
+    //Launch the workers in order, giving up on the first failure
+    tids = calloc(nthreads, sizeof(pthread_t));
+    for(k = 0; k < nthreads; k++) {
+        status = pthread_create(tids + k, NULL, workers[k], (void *) &limit);
+        if(status) {
+            printf("An error occured with invoking pthread_create; %d\n", status);
+            exit(-1);
+        }
+    }
+
+    //limit lives on this stack frame, so wait for every worker before returning
+    for(k = 0; k < nthreads; k++) pthread_join(tids[k], NULL);
+
+    free(tids);
+}
+
+/******************************************************************************
+*
+* Take the config.FASTQ1 and config.FASTQ2 filenames and use them to generate
+* the config.FASTQ1CT... filenames. These must subsequently be free()d, which
+* is done in the quit() function.
+*
+* char *f1: first fastq file name (config.FASTQ1)
+* char *f2: second fastq file name (config.FASTQ2), only read if config.paired
+*
+* config.unmappedN is set as well; it is placed in config.odir when that is set.
+*
+*******************************************************************************/
+void create_fastq_names(char *f1, char *f2) {
+    char *basename1 = malloc(sizeof(char) * (strlen(f1) + 20));
+    char *basename2 = NULL;
+    char *p;
+
+    basename1 = strcpy(basename1, f1);
+    if(config.paired) {
+        basename2 = malloc(sizeof(char) * (strlen(f2) + 20));
+        basename2 = strcpy(basename2, f2);
+    }
+
+    //Create the basename of FASTQ1, trim off [.fastq/.fq].(bz/gz/bz2/fastq/fq)
+    p = strrchr(basename1, '.');
+    if(p != NULL) {
+        if(strcmp(p, ".gz") == 0 || strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0 || strcmp(p, ".fastq") == 0 || strcmp(p, ".fq") == 0) {
+            *p = '\0';
+            p = strrchr(basename1, '.');
+            if(p != NULL) {
+                if(strcmp(p, ".fastq") == 0 || strcmp(p, ".fq") == 0) *p = '\0';
+            }
+        }
+    }
+    //+10: ".CT.fq.gz"/".GA.fq.gz" plus the NUL
+    config.FASTQ1CT = malloc(sizeof(char) * (strlen(basename1) + 10));
+    config.FASTQ1GA = malloc(sizeof(char) * (strlen(basename1) + 10));
+    if(config.odir != NULL) {
+        //Only the file part of the basename is appended to odir
+        p = strrchr(basename1, '/');
+        if(p!=NULL) {
+            p++;
+        } else {
+            p = basename1;
+        }
+        config.unmapped1 = malloc(sizeof(char) * (strlen(config.odir) + strlen(p) + strlen(".unmapped.fq.gz") + 1));
+        sprintf(config.unmapped1, "%s%s.unmapped.fq.gz", config.odir, p);
+    } else {
+        config.unmapped1 = malloc(sizeof(char) * (strlen(basename1) + strlen(".unmapped.fq.gz") + 1));
+        sprintf(config.unmapped1, "%s.unmapped.fq.gz", basename1);
+    }
+    sprintf(config.FASTQ1CT, "%s.CT.fq.gz", basename1);
+    sprintf(config.FASTQ1GA, "%s.GA.fq.gz", basename1);
+
+    //Create the basename of FASTQ2
+    if(config.paired) {
+        p = strrchr(basename2, '.');
+        if(p != NULL) {
+            if(strcmp(p, ".gz") == 0 || strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0 || strcmp(p, ".fastq") == 0 || strcmp(p, ".fq") == 0) {
+                *p = '\0';
+                p = strrchr(basename2, '.');
+                if(p != NULL) {
+                    if(strcmp(p, ".fastq") == 0 || strcmp(p, ".fq") == 0) *p = '\0';
+                }
+            }
+        }
+        config.FASTQ2CT = malloc(sizeof(char) * (strlen(basename2) + 10));
+        config.FASTQ2GA = malloc(sizeof(char) * (strlen(basename2) + 10));
+        if(config.odir != NULL) {
+            p = strrchr(basename2, '/');
+            if(p!=NULL) {
+                p++;
+            } else {
+                p = basename2;
+            }
+            config.unmapped2 = malloc(sizeof(char) * (strlen(config.odir) + strlen(p) + strlen(".unmapped.fq.gz") + 1));
+            sprintf(config.unmapped2, "%s%s.unmapped.fq.gz", config.odir, p);
+        } else {
+            config.unmapped2 = malloc(sizeof(char) * (strlen(basename2) + strlen(".unmapped.fq.gz") + 1));
+            sprintf(config.unmapped2, "%s.unmapped.fq.gz", basename2);
+        }
+        sprintf(config.FASTQ2CT, "%s.CT.fq.gz", basename2);
+        sprintf(config.FASTQ2GA, "%s.GA.fq.gz", basename2);
+        free(basename2);
+    }
+
+    free(basename1);
+}
diff --git a/genome.c b/genome.c
new file mode 100644
index 0000000..3a0e37b
--- /dev/null
+++ b/genome.c
@@ -0,0 +1,81 @@
+#include "bison.h"
+
+/******************************************************************************
+*
+* Extract the next sequence line from a file stream.
+*
+* char *seq: destination, must be able to hold a full line
+* FILE *fp: source
+*
+* Four lines are consumed per call and the second one is copied into seq
+* without its newline.
+*
+* This function is affected by the MAXREAD definition, above. If this value is
+* less than the longest read, things will break. It would be better to realloc
+* as needed.
+*
+*******************************************************************************/
+void get_seq(char *seq, FILE *fp) {
+    char *line = malloc(MAXREAD*sizeof(char));
+    char *rv;
+
+    //The reads are kept out of assert() so that they still happen with NDEBUG
+    rv = fgets(line, MAXREAD, fp); //first line, discarded
+    assert(rv != NULL);
+    rv = fgets(line, MAXREAD, fp); //second line, the sequence
+    assert(rv != NULL);
+    if(rv == NULL) line[0] = '\0'; //only reachable with NDEBUG: hand back an empty sequence
+    line[strcspn(line, "\n")] = '\0'; //remove the \n, if there is one
+    strcpy(seq, line);
+    rv = fgets(line, MAXREAD, fp); //third line, discarded
+    assert(rv != NULL);
+    rv = fgets(line, MAXREAD, fp); //fourth line, discarded
+    assert(rv != NULL);
+    (void) rv; //rv is otherwise unused with NDEBUG
+    free(line);
+}
+
+/******************************************************************************
+*
+* Return the length of a given chromosome.
+*
+* char *chrom: the chromosome of interest
+*
+* Returns 0 if no chromosome has that name.
+*
+*******************************************************************************/
+unsigned long long genome_chrom_length(char *chrom) {
+    int i;
+    unsigned long long output = 0;
+
+    //NOTE(review): this loop header had lost its middle part; it is restored from the
+    //pattern used elsewhere in this patch (chromosomes.nchromosomes / chromosomes.chromosome[i]) - confirm
+    for(i = 0; i < chromosomes.nchromosomes; i++) {
+        if(strcmp(chromosomes.chromosome[i]->chrom, chrom) == 0) {
+            output = chromosomes.chromosome[i]->length;
+            break;
+        }
+    }
+    return output;
+}
+
+/******************************************************************************
+*
+* Return a base and another 2 bases on one of its sides. This is needed for
+* making methylation calls. If this span goes off the edge of a chromosome,
+* N's will be used.
+*
+* unsigned long long offset: from genome_offset
+* unsigned long long position: converted read->core.pos
+* int change: Direction of the context (- is backwards)
+* unsigned long long chrom_length: from genome_chrom_length
+*
+* With change > 0 the result covers position..position+2, otherwise
+* position-2..position. The bases are upper-cased.
+*
+* The output needs to be free()d
+*
+*******************************************************************************/
+char * get_genomic_context(unsigned long long offset, unsigned long long position, int change, unsigned long long chrom_length) {
+    int i;
+    char *output = calloc(4, sizeof(char)); //3 bases and the NUL
+
+    if(change > 0) {
+        for(i = 0; i < 3; i++) {
+            if(position+i < chrom_length) {
+                *(output+i) = toupper((unsigned char) *(chromosomes.genome+offset+position+i));
+            } else {
+                *(output+i) = 'N';
+            }
+        }
+    } else {
+        for(i = 0; i < 3; i++) {
+            //position is unsigned, so "position-2+i >= 0" can never be false and would
+            //wrap around near the start of a chromosome; compare before subtracting
+            if(position+i >= 2) {
+                *(output+i) = toupper((unsigned char) *(chromosomes.genome+offset+position-2+i));
+            } else {
+                *(output+i) = 'N';
+            }
+        }
+    }
+    return output;
+}
diff --git a/herd/MPI_packing.c b/herd/MPI_packing.c
new file mode 100644
index 0000000..b40181c
--- /dev/null
+++ b/herd/MPI_packing.c
@@ -0,0 +1,147 @@
+#include "../bison.h"
+
+/******************************************************************************
+*
+* Take a fastq struct and pack it for shipping
+*
+* THE RESULT MUST BE free()d eventually
+*
+* fastq *read: The read(s) to store
+* MPI_Fastq *output: the struct into which to pack things
+*
+*******************************************************************************/
+MPI_Fastq * pack_fastq(fastq *read) {
+    char *fields[6];
+    int nfields = 3, i;
+    size_t size = 0, len;
+    char *cursor;
+    MPI_Fastq *output = malloc(sizeof(MPI_Fastq));
+
+    //Layout is name1,seq1,qual1[,name2,seq2,qual2], each with its terminating NUL
+    fields[0] = read->name1;
+    fields[1] = read->seq1;
+    fields[2] = read->qual1;
+    if(config.paired) {
+        fields[3] = read->name2;
+        fields[4] = read->seq2;
+        fields[5] = read->qual2;
+        nfields = 6;
+    }
+
+    //Calculate the size needed
+    for(i = 0; i < nfields; i++) size += sizeof(char) * (strlen(fields[i]) + 1);
+    output->size = size;
+    output->packed = malloc(size);
+
+    //Copy each string together with its NUL
+    cursor = (char *) output->packed;
+    for(i = 0; i < nfields; i++) {
+        len = strlen(fields[i]) + 1;
+        memcpy(cursor, fields[i], sizeof(char) * len);
+        cursor += len;
+    }
+    return output;
+}
+
+/******************************************************************************
+*
+* Unpack a packed fastq struct
+*
+* THE RESULT MUST BE free()d
+*
+* fastq *read: The fastq struct to unpack into
+* void *packed: The packed structure
+*
+* Each buffer of read is grown (and its max_* field updated) only when the
+* incoming string does not fit. Returns read.
+*
+*******************************************************************************/
+fastq * unpack_fastq(fastq *read, void *packed) {
+    char *pchar;
+    void *p = packed;
+    size_t len;
+
+    //Read1
+    len = strlen((char *) p) + 1; //name
+    if(len > read->max_name1) {
+        read->name1 = realloc((void *) read->name1, sizeof(char) * len);
+        read->max_name1 = len;
+    }
+    strcpy(read->name1, (char *) p);
+    pchar = (char *) p;
+    p = (void *) (pchar + len);
+    len = strlen((char *) p) + 1; //seq
+    if(len > read->max_seq1) {
+        read->seq1 = realloc((void *) read->seq1, sizeof(char) * len);
+        read->max_seq1 = len;
+    }
+    strcpy(read->seq1, (char *) p);
+    pchar = (char *) p;
+    p = (void *) (pchar + len);
+    len = strlen((char *) p) + 1; //qual
+    if(len > read->max_qual1) {
+        read->qual1 = realloc((void *) read->qual1, sizeof(char) * len);
+        read->max_qual1 = len;
+    }
+    strcpy(read->qual1, (char *) p);
+    pchar = (char *) p;
+    p = (void *) (pchar + len);
+
+    //Read2
+    if(config.paired) {
+        len = strlen((char *) p) + 1; //name
+        if(len > read->max_name2) {
+            read->name2 = realloc((void *) read->name2, sizeof(char) * len);
+            read->max_name2 = len;
+        }
+        strcpy(read->name2, (char *) p);
+        pchar = (char *) p;
+        p = (void *) (pchar + len);
+        len = strlen((char *) p) + 1; //seq
+        if(len > read->max_seq2) {
+            read->seq2 = realloc((void *) read->seq2, sizeof(char) * len);
+            read->max_seq2 = len;
+        }
+        strcpy(read->seq2, (char *) p);
+        pchar = (char *) p;
+        p = (void *) (pchar + len);
+        len = strlen((char *) p) + 1; //qual
+        if(len > read->max_qual2) {
+            read->qual2 = realloc((void *) read->qual2, sizeof(char) * len);
+            read->max_qual2 = len;
+        }
+        strcpy(read->qual2, (char *) p);
+        pchar = (char *) p;
+        p = (void *) (pchar + len);
+    }
+
+    return read;
+}
diff --git a/herd/fastq.c b/herd/fastq.c
new file mode 100644
index 0000000..5191a0d
--- /dev/null
+++ b/herd/fastq.c
@@ -0,0 +1,477 @@
+#include "../bison.h"
+#include
+#include
+#include
+
+//This serves as the buffer for reading from compressed files
+struct local_buffer {
+    char *buf;
+    unsigned long pos;
+    int finished; //0 no, 1 yes
+    int type; //0: txt, 1: gz, 2:bz2
+    union {
+        FILE *fptxt;
+        gzFile fpgz;
+        BZFILE *fpbz2;
+    } x; //the handle matching type
+};
+
+/******************************************************************************
+*
+* Take a fastq struct and convert it C->T, the conversion is in place
+*
+* fastq *read, input struct
+* int which, which of the reads to convert (0: seq1, anything else: seq2)
+*
+*******************************************************************************/
+void convertCT(fastq *read, int which) {
+    char *p;
+    if(which == 0) {
+        p = read->seq1;
+    } else {
+        p = read->seq2;
+    }
+    //Stop at the NUL as well, a sequence without a newline must not be overrun
+    while(*p != '\n' && *p != '\0') {
+        if(*p == 'C' || *p == 'c') *p = 'T';
+        p++;
+    }
+}
+
+/******************************************************************************
+*
+* Take a fastq struct and convert it G->A, the conversion is in place
+*
+* fastq *read, input struct
+* int which, which of the reads to convert (0: seq1, anything else: seq2)
+*
+*******************************************************************************/
+void convertGA(fastq *read, int which) {
+    char *p;
+    if(which == 0) {
+        p = read->seq1;
+    } else {
+        p = read->seq2;
+    }
+    //Stop at the NUL as well, a sequence without a newline must not be overrun
+    while(*p != '\n' && *p != '\0') {
+        if(*p == 'G' || *p == 'g') *p = 'A';
+        p++;
+    }
+}
+
+/******************************************************************************
+*
+* Read a full line into a buffer, increasing its size as needed and returning
+* its max size.
+* +* FILE *fp, input file stream +* char *cur_buf, the buffer to expand and insert into +* int size, current maximum malloc()ed size of cur_buf +* char *buf, a buffer of length sizeof(char)*MAXREAD to use, this simply saves +* us from constantly malloc()ing one. +* int ignore, if 1, read in the line until the end but don't store it +* +* size is updated on success and set to -1 on error or EOF +* +*******************************************************************************/ +char * read_line(struct local_buffer *fp, char *cur_buf, int *size, int ignore) { + + if(fp->type == 0 || fp->type == 2) { //plain text input + while(1) { + if(fp->finished == 1) { + //We hit the end of the file in the last go around + *size = -1; + break; + } + if(ignore) { + if(fgets(fp->buf, BT2BUF_SZ, fp->x.fptxt) == NULL) { + fp->finished = 1; + *size = -1; + break; + } + while(fp->buf[strlen(fp->buf)-1] != '\n') { + if(fgets(fp->buf, BT2BUF_SZ, fp->x.fptxt) == NULL) fp->finished = 1; + if(fp->finished == 1) break; //Broken input + } + break; + } else { + if(fgets(cur_buf, *size, fp->x.fptxt) == NULL) { + fp->finished = 1; + *size = -1; + break; + } + while(cur_buf[strlen(cur_buf)-1] != '\n') { + cur_buf = realloc(cur_buf, sizeof(char) * (*size + BT2BUF_SZ)); + *size += BT2BUF_SZ; + if(fgets(fp->buf, BT2BUF_SZ, fp->x.fptxt) == NULL) fp->finished = 1; + if(fp->finished == 1) break; //Broken input + cur_buf = strcat(cur_buf, fp->buf); + } + break; + } + } + } else if(fp->type == 1) { //gzipped input + while(1) { + if(fp->finished == 1) { + //We hit the end of the file in the last go around + *size = -1; + break; + } + if(ignore) { + if(gzgets(fp->x.fpgz, fp->buf, BT2BUF_SZ) == NULL) { + fp->finished = 1; + *size = -1; + break; + } + while(fp->buf[strlen(fp->buf)-1] != '\n') { + if(gzgets(fp->x.fpgz, fp->buf, BT2BUF_SZ) == NULL) fp->finished = 1; + if(fp->finished == 1) break; //Broken input + } + break; + } else { + if(gzgets(fp->x.fpgz, cur_buf, *size) == NULL) { + fp->finished = 1; + 
*size = -1; + break; + } + while(cur_buf[strlen(cur_buf)-1] != '\n') { + cur_buf = realloc(cur_buf, sizeof(char) * (*size + BT2BUF_SZ)); + *size += BT2BUF_SZ; + if(gzgets(fp->x.fpgz, fp->buf, BT2BUF_SZ) == NULL) fp->finished = 1; + if(fp->finished == 1) break; //Broken input + cur_buf = strcat(cur_buf, fp->buf); + } + break; + } + } + } + + return cur_buf; +} + +/****************************************************************************** +* +* Read in an actual fastq read into a fastq struct, resizing as needed +* +* FILE *fp, input file pointer +* fastq *read, input struct +* int which, 0 for read1 and 1 for read2 +* +* returns an int, which is -1 on EOF or error +* +*******************************************************************************/ +int read_fastq(struct local_buffer *fp, fastq *read, int which) { + int *max_name = NULL, *max_seq = NULL, *max_qual = NULL; + int orig_maxname; + char *name = NULL, *seq = NULL, *qual = NULL; + + //Point everything to the correct read + if(which == 0) { //read1 + name = read->name1; + seq = read->seq1; + qual = read->qual1; + max_name = &(read->max_name1); + max_seq = &(read->max_seq1); + max_qual = &(read->max_qual1); + } else { + name = read->name2; + seq = read->seq2; + qual = read->qual2; + max_name = &(read->max_name2); + max_seq = &(read->max_seq2); + max_qual = &(read->max_qual2); + } + + //name + orig_maxname = *max_name; + name = read_line(fp, name, max_name, 0); + if(*max_name == -1) { + *max_name = orig_maxname; + return -1; + } + //Seq + seq = read_line(fp, seq, max_seq, 0); + //+ + read_line(fp, NULL, 0, 1); + //Qual + qual = read_line(fp, qual, max_qual, 0); + + //Reset the pointers if they've moved + if(which == 0) { //Read1 + read->name1 = name; + read->seq1 = seq; + read->qual1 = qual; + } else { + read->name2 = name; + read->seq2 = seq; + read->qual2 = qual; + } + + return 0; +} + +/****************************************************************************** +* +* Read in the fastq file(s) sending 
the reads to the appropriate nodes and +* also storing the unconverted reads in a linked list on the master node. +* +* This will act as its own thread on the master node. +* +* void *a is an unsigned long +* +*******************************************************************************/ +void * send_store_fastq(void *a) { + char *line = malloc(MAXREAD*sizeof(char)); + struct local_buffer *f1 = NULL, *f2 = NULL; + int i=0, nnodes = effective_nodes(), status; + int nnode_groups = nnodes/((config.directional) ? 2 : 4); + int j, max_j = 4, multiplier = 4; + int current_file = 0; + unsigned long upto = *((unsigned long *) a); + unsigned long total = 0; + char *cmd = NULL; + char *p, *fname1 = NULL, *fname2 = NULL, *save_ptr1=NULL, *save_ptr2=NULL; + char *finished_signal = NULL; + fastq *read = malloc(sizeof(fastq)); + MPI_Fastq *packed = NULL; + int rv1 = 0, rv2 = 0, wordexp_offset=0; + wordexp_t fnames1_wordexp, fnames2_wordexp; + void *A = malloc(1); +#ifdef DEBUG + int taskid = global_debug_taskid; +#endif + f1 = calloc(1, sizeof(struct local_buffer)); + f1->buf = malloc(BT2BUF_SZ*sizeof(char)); + f1->buf[0] = '\0'; //Just so that we know that we're at the start of a buffer + if(config.paired) { + f2 = calloc(1, sizeof(struct local_buffer)); + f2->buf = malloc(BT2BUF_SZ*sizeof(char)); + f2->buf[0] = '\0'; //Just so that we know that we're at the start of a buffer + } + + //Initialize the read struct + read->max_name1 = 10; + read->max_seq1 = 10; + read->max_qual1 = 10; + read->max_name2 = 10; + read->max_seq2 = 10; + read->max_qual2 = 10; + read->name1 = malloc(sizeof(char)*10); + read->seq1 = malloc(sizeof(char)*10); + read->qual1 = malloc(sizeof(char)*10); + read->name2 = malloc(sizeof(char)*10); + read->seq2 = malloc(sizeof(char)*10); + read->qual2 = malloc(sizeof(char)*10); + + //These will be used later + if(config.directional) { + max_j = 2; + multiplier = 2; + } + + fname1 = strtok_r(config.FASTQ1,",", &save_ptr1); + rv1 = wordexp(fname1, &fnames1_wordexp, 
WRDE_SHOWERR | WRDE_UNDEF); + fnames1[current_file] = strdup(fnames1_wordexp.we_wordv[wordexp_offset]); //This will need to be free()d + if(config.paired) { + fname2 = strtok_r(config.FASTQ2,",", &save_ptr2); + rv2 = wordexp(fname2, &fnames2_wordexp, WRDE_SHOWERR | WRDE_UNDEF); + fnames2[current_file] = strdup(fnames2_wordexp.we_wordv[wordexp_offset]); + } + while(fname1 != NULL) { + if(rv1 != 0 || rv2 != 0) { + printf("An error ocurred when trying to expand the first filename.\n"); + if(rv1 == WRDE_BADCHAR) { + printf("%s contains an illegal character\n", fname1); + } else if(rv1 == WRDE_BADVAL) { + printf("%s contains an undefined shell variable\n", fname1); + } else if(rv1 == WRDE_NOSPACE) { + printf("Out of memory when processing %s\n", fname1); + } else if(rv1 == WRDE_SYNTAX) { + printf("%s had a syntax error\n", fname1); + } + if(config.paired) { + if(rv2 == WRDE_BADCHAR) { + printf("%s contains an illegal character\n", fname2); + } else if(rv2 == WRDE_BADVAL) { + printf("%s contains an undefined shell variable\n", fname2); + } else if(rv2 == WRDE_NOSPACE) { + printf("Out of memory when processing %s\n", fname2); + } else if(rv2 == WRDE_SYNTAX) { + printf("%s had a syntax error\n", fname2); + } + } + goto finish; //Yeah yeah, an evil "goto" + } + //Determine how we should read in the file(s) + p = strrchr(fnames1_wordexp.we_wordv[wordexp_offset], '.'); + if(strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0) { + f1->type = 1; + f1->x.fpgz = gzopen(fnames1_wordexp.we_wordv[wordexp_offset], "rb"); + } else if(strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0) { + f1->type = 2; + cmd = realloc(cmd, sizeof(char) * (strlen(fnames1_wordexp.we_wordv[wordexp_offset]) + strlen("bzcat "))); + sprintf(cmd, "bzcat %s", fnames1_wordexp.we_wordv[wordexp_offset]); + f1->x.fptxt = popen(cmd, "r"); + } else { + f1->type = 0; + f1->x.fptxt = fopen(fnames1_wordexp.we_wordv[wordexp_offset], "r"); + } + f1->finished = 0; + f1->pos = 0; + f1->buf[0] = '\0'; //Just so that we know that 
we're at the start of a buffer + if(config.paired) { + p = strrchr(fnames2_wordexp.we_wordv[wordexp_offset], '.'); + if(strcmp(p, ".gz") == 0 || strcmp(p, ".GZ") == 0) { + f2->type = 1; + f2->x.fpgz = gzopen(fnames2_wordexp.we_wordv[wordexp_offset], "rb"); + } else if(strcmp(p, ".bz") == 0 || strcmp(p, ".bz2") == 0) { + f2->type = 2; + cmd = realloc(cmd, sizeof(char) * (strlen(fnames2_wordexp.we_wordv[wordexp_offset]) + strlen("bzcat "))); + sprintf(cmd, "bzcat %s", fnames2_wordexp.we_wordv[wordexp_offset]); + f2->x.fptxt = popen(cmd, "r"); + } else { + f2->type = 0; + f2->x.fptxt = fopen(fnames2_wordexp.we_wordv[wordexp_offset], "r"); + } + f2->finished = 0; + f2->pos = 0; + f2->buf[0] = '\0'; //Just so that we know that we're at the start of a buffer + } + + //read everything in + total = 0; + while(1) { + if(upto) { + if(total >= upto) break; + } + + if(read_fastq(f1, read, 0) == -1) break; + if(config.paired) read_fastq(f2, read, 1); + + //Pack the struct + packed = pack_fastq(read); + + //Store this in the linked-list +#ifdef DEBUG + if(global_debug_taskid == MASTER) { +#endif + add_element(last_fastq_sentinel_node[i], packed->packed); +#ifdef DEBUG + } +#endif + + //Send it to the appropriate nodes + for(j=1; j<=max_j; j++) { +#ifdef DEBUG + if(global_debug_taskid != MASTER) { + if(j+multiplier*i == taskid) { + status = MPI_Send((void *) packed->packed, packed->size, MPI_BYTE, 0, 3, MPI_COMM_WORLD); + if(status != MPI_SUCCESS) { + printf("MPI_Send returned %i\n", status); + fflush(stdout); + } + } + } +#else + //Send to j+multiplier*i + status = MPI_Send((void *) packed->packed, packed->size, MPI_BYTE, j+multiplier*i, 3, MPI_COMM_WORLD); + if(status != MPI_SUCCESS) { + printf("MPI_Send returned %i\n", status); + fflush(stdout); + } +#endif + } + i++; + if(i >= nnode_groups) i=0; + + //Free packed (packed->packed is in the linked list!) 
+#ifdef DEBUG + if(global_debug_taskid != MASTER) free(packed->packed); +#endif + free(packed); + total++; + +#ifndef NOTHROTTLE + if(config.reads_in_queue > 0) { + if(total % THROTTLE_CHECK_INTERVAL == 0) { + while(total - nwritten[current_file] > config.reads_in_queue) sleep(1); + } + } +#endif + } + flengths[current_file] = total; //Otherwise, the writer thread will keep waiting + //Notify the master_processor_threads that they need to update the methylation metrics + for(j=0; jtype == 0) fclose(f1->x.fptxt); + else if(f1->type == 1) { gzclearerr(f1->x.fpgz); gzclose(f1->x.fpgz); } + else if(f1->type == 2) pclose(f1->x.fptxt); + if(config.paired) { + if(f2->type == 0) fclose(f2->x.fptxt); + else if(f2->type == 1) { gzclearerr(f2->x.fpgz); gzclose(f2->x.fpgz); } + else if(f2->type == 2) pclose(f2->x.fptxt); + } + + current_file++; + if(++wordexp_offset >= fnames1_wordexp.we_wordc) { + //Ensure we move to the next file + wordexp_offset = 0; + fname1 = strtok_r(NULL,",", &save_ptr1); + if(fname1 == NULL) break; + rv1 = wordexp(fname1, &fnames1_wordexp, WRDE_SHOWERR | WRDE_UNDEF | WRDE_REUSE); + if(config.paired) { + fname2 = strtok_r(NULL,",", &save_ptr2); + rv2 = wordexp(fname2, &fnames2_wordexp, WRDE_SHOWERR | WRDE_UNDEF | WRDE_REUSE); + fnames2[current_file] = strdup(fnames2_wordexp.we_wordv[wordexp_offset]); //This will need to be free()d + } + } //Else we've incremented to the next file + fnames1[current_file] = strdup(fnames1_wordexp.we_wordv[wordexp_offset]); //This will need to be free()d + if(config.paired) { + fnames2[current_file] = strdup(fnames2_wordexp.we_wordv[wordexp_offset]); //This will need to be free()d + } + } + +finish: //We'll only ever "goto" here on an error, otherwise we'll get here normally + //Send a 1-byte package to signal completion +#ifdef DEBUG + if(global_debug_taskid != MASTER) { + status = MPI_Send(A, 1, MPI_BYTE, 0, 3, MPI_COMM_WORLD); + } +#else + for(j=1; j<=effective_nodes(); j++) { + status = MPI_Send(A, 1, MPI_BYTE, j, 3, 
MPI_COMM_WORLD); + if(status != MPI_SUCCESS) printf("Couldn't send 'finished' message to worker %i!\n", j); + } +#endif + + //Add the "finished" element +#ifdef DEBUG + if(global_debug_taskid == MASTER) { +#endif + for(i=0; iname1); + free(read->seq1); + free(read->qual1); + free(read->name2); + free(read->seq2); + free(read->qual2); + free(read); + wordfree(&fnames1_wordexp); + if(config.paired) wordfree(&fnames2_wordexp); + if(!config.quiet) printf("Finished reading in fastq files!\n"); fflush(stdout); + return NULL; +} diff --git a/herd/main.c b/herd/main.c new file mode 100644 index 0000000..e30074b --- /dev/null +++ b/herd/main.c @@ -0,0 +1,509 @@ +#include "../bison.h" +#include + +void usage(char *prog) { + printf("Usage: %s [OPTIONS] -g genome_dir {-1 fastq_A1.gz,fastq_B1.gz -2 fastq_A2.gz,fastq_B2.gz | -U fastq.gz}\n", prog); + printf("\n \ + N.B., Bison has a number of defaults that are different from that of bowtie2.\n \ + All of these can be changed with the normal bowtie2 options, which change\n \ + bison's behavior as well. MAPQ scores are recalculated by bison in the same\n \ + way as they are in bowtie2 (or at least they should be). Any option not\n \ + listed below will be passed directly to bowtie2, so you can specify, e.g.,\n \ + --very-fast if you want. If you specify --local, --score-min is changed back\n \ + to the bowtie2 default of 'G,20,6', unless you specify otherwise.\n \ +\n \ + Note also that both -1/-2 and -U can accept a comma-separated list of input\n \ + files. Unlike other aligners, the alignments from each of these files will\n \ + be output to different files. This is meant to speed alignments of multiple\n \ + samples, since the bowtie2 index and the genome sequence only need to be\n \ + loaded a single time. 
Inputting more than one file (or pair, when using -1\n \ + -2) implies --reorder.\n \ +\n \ +-g Directory containing the genome fasta files and the\n \ + Bisulfite_Sequences directory.\n \ +\n \ +-1 Fastq file containing read #1 (normally named something like \n \ + foo_1.fastq.gz). Reads needn't be gzipped, but that'll be more\n \ + convenient. You may also input a comma-separated list of files to be\n \ + aligned (but see note above). Doing this implies --reorder.\n \ +\n \ +-2 As with -1, but with read #2.\n \ +\n \ +-U For convenience, this denotes a fastq file from single-ended reads.\n \ + Alternatively, -1 can be used without using -2. As with -1, you may\n \ + also specify more than one file, in which case alignments from each\n \ + will be printed to different files.\n \ +\n \ +-p How many threads bowtie2 should use on each node. Default is 11.\n \ +\n \ +-mp How many processing threads should run on the master node. Default\n \ + is 1. Increasing this will be required to prevent the MPI buffer\n \ + from becoming depleted and the master node then crashing. However,\n \ + too many of these will cause resource underutilization. Keep in\n \ + mind also that there are an additional 2 threads already running to\n \ + do other things.\n \ +\n \ +-o Output directory. By default, everything will be written to the\n \ + directory holding the fastq files (or the file containing read #1,\n \ + as appropriate). If you would prefer for the output BAM file and\n \ + metrics txt file to be placed elsewhere, specify that here.\n \ +\n \ + N.B., the directory must exist! \n \ +\n \ +-tmp Temporary directory where named pipes will be created on the worker\n \ + nodes. This just need to be a directory that is bison_herd can read\n \ + and write to. The default is \"/tmp\".\n \ +\n \ +--directional Denotes that the library was created in a directional, rather\n \ + than non-directional manner. 
This will result in 3, rather than 5\n \ + nodes being used as only alignments to 2 (rather than 4) strands are\n \ + possible.\n \ +\n \ +-upto The maximum number of reads to process. This is mostly useful for\n \ + debuging and more quickly determining if a library is directional or\n \ + not. 0 is the default, meaning all reads are used. N.B., the\n \ + maximum value for this parameter is whatever an unsigned long is on\n \ + your system.\n \ +\n \ +--reorder Reorder output to match the same order as the input. This will make\n \ + things slower, but enable easier comparisons. This is passed to\n \ + bowtie2 regardless of whether you specify it or not.\n \ +\n \ +-@ Number of BAM compression threads to use. This is equivalent to -@\n \ + in samtools. The default is 1, but this may need to be increased as\n \ + you increase the number of alotted nodes.\n \ +\n \ +--unmapped Save unaligned reads to a file or files (as appropriate). This files\n \ + will be placed in the same directory as the source fastq files,\n \ + regardless of whether \"-o\" is used.\n \ +\n"); +#ifndef NOTHROTTLE + printf(" \ +-queue_size The maximum difference between the number of reads that have been\n \ + read and the number that have been written. The default is 1000000\n \ + and a value of 0 (or just not compiling with -DTHROTTLE) will\n \ + disable this. Since bison_herd can have a quiet large number of\n \ + worker nodes performing alignments, it can happen that they\n \ + overwhelm the master node that must then process their results. This\n \ + option can help to prevent that (though increasing -mp is a better\n \ + solution) by pausing the sending of reads out for alignment.\n \ +\n"); +#endif + printf(" \ +--quiet Don't print anything but errors to the console (this is also passed\n \ + to bowtie2).\n \ +\n \ +-h Print this help message.\n \ +\n \ +-v Print version information.\n \ +\n"); +#ifdef DEBUG + printf("\ +-taskid Which node number to act as. 
The default is 0, the master node.\n \ + Other possibilities are 1-4, which are the worker nodes that\n \ + process OT, OB, CTOT, and CTOB alignments, respectively.\n \ +\n \ + Note that if you plan to run with taskid=0 (i.e., as the master\n \ + node), files named OT.bam, OB.bam, etc. should exist in your\n \ + working directory. These will be created automatically if you run\n \ + each pseudo-worker node first, which is recommended.\n\n"); +#endif +} + +int main(int argc, char *argv[]) { + int i, taskid=0, provided; + pthread_t *threads; + int bowtie2_options_max = MAXREAD; + char *p = NULL, *tmp = NULL; + wordexp_t p_wordexp; + unsigned long upto = 0; + int ngroups; + int multi_file=0; +#ifndef DEBUG + int name_len; + char processor_name[MPI_MAX_PROCESSOR_NAME]; +#endif + + //Deal with MPI initialization, this seems like an odd way to do things. + MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); + if(provided != MPI_THREAD_MULTIPLE) { + printf("You're MPI implementation doesn't support MPI_THREAD_MULTIPLE, which is required for bison_herd to work.\n"); + return -1; + } +#ifndef DEBUG + MPI_Comm_rank(MPI_COMM_WORLD, &taskid); + MPI_Get_processor_name(processor_name, &name_len); +#endif + + config.odir = NULL; + config.paired = 0; //Default is single-ended + config.directional = 0; //Default is non-directional + config.nthreads = 11; //Default is 11 threads/node + config.bowtie2_options = calloc(MAXREAD, sizeof(char)); + config.unmapped = 0; //By default, unmapped reads are NOT written to a fastq file + config.scoremin_type = 'L'; //--score-min 'L,-0.6,-0.6' + config.scoremin_intercept = -0.6; + config.scoremin_coef = -0.6; + config.mode = 0; //--end-to-end + config.tmpdir = NULL; //-tmpdir + config.nmthreads = 1; //-mp + config.reorder = 0; //--reorder + config.outname = NULL; //Otherwise, we'll have problems when we realloc! 
+ config.basename = NULL; //To handle multiple inputs + config.n_compression_threads = 0; + config.unmapped1 = NULL; + config.unmapped2 = NULL; + global_header = NULL; + unmapped1 = NULL; + unmapped2 = NULL; +#ifndef NOTHROTTLE + config.reads_in_queue = 1000000; + nwritten = 0; +#endif + chromosomes.nchromosomes = 0; //We need to initialize the struct + + //These are only used during cleanup and will otherwise cause an error + config.FASTQ1CT = NULL; + config.FASTQ1GA = NULL; + config.FASTQ2CT = NULL; + config.FASTQ2GA = NULL; + + //Initialize the global counts + t_reads = 0; + m_reads_OT = 0; + m_reads_OB = 0; + m_reads_CTOT = 0; + m_reads_CTOB = 0; + t_CpG = 0; + m_CpG = 0; + t_CHG = 0; + m_CHG = 0; + t_CHH = 0; + m_CHH = 0; + + if(argc == 1) { + usage(argv[0]); + quit(0, 0); + } + + for(i=1; i= bowtie2_options_max) { + bowtie2_options_max = strlen(config.bowtie2_options) + 1 + strlen(argv[i]) + 100; + config.bowtie2_options = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max); + } + strcat(config.bowtie2_options, " "); + strcat(config.bowtie2_options, argv[i]); + } + } + + if(config.FASTQ1 == NULL || config.genome_dir == NULL || (config.FASTQ2 == NULL && config.paired == 1)) { + if(taskid == MASTER) { + printf("No FASTQ files!\n"); + usage(argv[0]); + } + quit(0, -1); + } + + //If more than one input file was specified, enable reorder + tmp = strdup(config.FASTQ1); + p = strtok(tmp, ","); + if(wordexp(p, &p_wordexp, WRDE_SHOWERR | WRDE_UNDEF) != 0) { + printf("There was an error while parsing %s.\n", p); + free(tmp); + wordfree(&p_wordexp); + quit(0, -1); + } + multi_file += p_wordexp.we_wordc; + p = strtok(NULL, ","); + while(p != NULL) { + if(wordexp(p, &p_wordexp, WRDE_SHOWERR | WRDE_UNDEF | WRDE_REUSE) != 0) { + printf("There was an error while parsing %s.\n", p); + free(tmp); + wordfree(&p_wordexp); + quit(0, -1); + } + multi_file += p_wordexp.we_wordc; + p = strtok(NULL, ","); + } + free(tmp); + wordfree(&p_wordexp); +#ifdef DEBUG + 
if(multi_file>1) { + printf("In DEBUG mode, you can't input multiple file-sets!\n"); + quit(0,-1); + } +#else + if(!config.quiet) printf("%s has rank %i\n", processor_name, taskid); fflush(stdout); + if(taskid > effective_nodes()) { + printf("From node %i: So long and thanks for all the bits.\n", taskid); fflush(stdout); + return(-2); //We're an extraneous node + } +#endif + ngroups = effective_nodes(); + + if(config.tmpdir == NULL) config.tmpdir = "/tmp"; + + //Allocate room for the genome, if needed + if(taskid == MASTER) { + chromosomes.max_genome = 3000000000; + if(!config.quiet) printf("Allocating space for %llu characters\n", chromosomes.max_genome); + fflush(stdout); + chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome); + *chromosomes.genome = '\0'; + if(chromosomes.genome == NULL) { + printf("Could not allocate enough room to hold the genome!\n"); + return -1; + } + } else { + chromosomes.max_genome = 0; + } + + //Setup the global variables (these will need to be free()d!) 
+#ifndef DEBUG + if(taskid == MASTER) { +#endif + nwritten = calloc(multi_file, sizeof(char *)); + fnames1 = calloc(multi_file, sizeof(char *)); + fnames2 = calloc(multi_file, sizeof(char *)); + flengths = calloc(multi_file, sizeof(unsigned long long)); +#ifndef DEBUG + } +#endif + + //Append score_min, and p + if(strlen(config.bowtie2_options) + 1000 >= bowtie2_options_max) { + bowtie2_options_max = strlen(config.bowtie2_options) + 1000; //This should suffice + config.bowtie2_options = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max); + } + if(strlen(config.bowtie2_options) > 0) { + sprintf(config.bowtie2_options, "%s -p %i --score-min '%c,%g,%g'", config.bowtie2_options, config.nthreads, config.scoremin_type, config.scoremin_intercept, config.scoremin_coef); + } else { + sprintf(config.bowtie2_options, "-p %i --score-min '%c,%g,%g'", config.nthreads, config.scoremin_type, config.scoremin_intercept, config.scoremin_coef); + } + + //There should be as many tasks according to MPI as dictated by the library type. + if(config.directional) { + ngroups /= 2; + } else { + ngroups /= 4; + } + if(ngroups < 1) { + if(taskid == MASTER) printf("There are only %i groups of nodes available!! 
You need to allocate more nodes (at least 3 for direcional and 5 for non-directional libraries)!\n", ngroups); + quit(0, -1); + } + //Yes, these silently change user input + if(config.nmthreads < 1) config.nmthreads = 1; + if(config.nmthreads > ngroups) config.nmthreads = ngroups; + +#ifdef DEBUG + //DEBUG can't handle multiple files + update_odir(); + config.basename = get_basename(config.FASTQ1); + config.outname = malloc(sizeof(char)*(strlen(config.odir)+ strlen(config.basename)+5)); + sprintf(config.outname, "%s%s.bam", config.odir, config.basename); + if(taskid == MASTER) { +#else + if(taskid == MASTER) { + //Deal with the output directory + update_odir(); +#endif + + //Store the genome into memory + read_genome(); + + //Setup the mutexes + pthread_mutex_init(&metrics_mutex, NULL); + + //Setup the linked-lists + nodes = malloc(sizeof(struct packed_struct *)*effective_nodes()); + last_sentinel_node = malloc(sizeof(struct packed_struct *)*effective_nodes()); + fastq_nodes = malloc(sizeof(struct packed_struct *)*ngroups); + last_fastq_sentinel_node = malloc(sizeof(struct packed_struct *)*ngroups); + to_write_node = malloc(sizeof(struct packed_struct *)*config.nmthreads); + to_write_sentinel_node = malloc(sizeof(struct packed_struct *)*config.nmthreads); + for(i=0; inext; + } + for(i=0; inext; + } + for(i=0; inext; + } + + //Start the master node processer threads + threads = calloc(2+config.nmthreads, sizeof(pthread_t)); + int *threadids = malloc(sizeof(int)*config.nmthreads); + pthread_create(&(threads[0]), NULL, &send_store_fastq, (void *) &upto); + for(i=0; ithread_id = taskid; + silly_struct->fastq1 = malloc(sizeof(char) * (strlen(tmpdir) + strlen("/read1") + 1)); + silly_struct->fastq2 = malloc(sizeof(char) * (strlen(tmpdir) + strlen("/read2") + 1)); + sprintf(silly_struct->fastq1, "%s/read1", tmpdir); + sprintf(silly_struct->fastq2, "%s/read2", tmpdir); + + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; + int rv = 
mkfifo(silly_struct->fastq1, mode); + if(rv != 0) { + printf("mkfifo returned with status %i!\n", rv); + fflush(stdout); + } + if(config.paired) { + rv = mkfifo(silly_struct->fastq2, mode); + if(rv != 0) { + printf("mkfifo returned with status %i!\n", rv); + fflush(stdout); + } + } + + //Start slurping in the fastq reads and converting them so they can be aligned +#ifndef DEBUG + threads = calloc(1, sizeof(pthread_t)); + pthread_create(&(threads[0]), NULL, &slurp_fastq, (void *) silly_struct); +#else + threads = calloc(2, sizeof(pthread_t)); + pthread_create(&(threads[1]), NULL, &send_store_fastq, (void *) &upto); + pthread_create(&(threads[0]), NULL, &slurp_fastq, (void *) silly_struct); +#endif + //worker node stuff + herd_worker_node(taskid, silly_struct->fastq1, silly_struct->fastq2); + pthread_join(threads[0], NULL); +#ifdef DEBUG + pthread_join(threads[1], NULL); +#endif + if(!config.quiet) printf("Returning from worker node %i\n", taskid); + fflush(stdout); + free(silly_struct->fastq1); //The worker node unlinks this + free(silly_struct->fastq2); //The worker node unlinks this + free(silly_struct); + if(rmdir(tmpdir) != 0) { + printf("Couldn't remove %s directory!\n", tmpdir); + fflush(stdout); + } + free(tmpdir); + free(threads); + } + +#ifndef DEBUG + if(taskid == MASTER) { +#endif + free(nwritten); + free(fnames1); + free(fnames2); + free(flengths); +#ifndef DEBUG + } +#endif + + //Clean up + if(config.odir != NULL) free(config.odir); + quit(3, 0); + return 0; +} diff --git a/herd/master.c b/herd/master.c new file mode 100644 index 0000000..05af3f7 --- /dev/null +++ b/herd/master.c @@ -0,0 +1,221 @@ +#include "../bison.h" +#include +#include + +/******************************************************************************* +* +* The master node function. 
+* +* void *a: Actually an int*, the thread_id +* +*******************************************************************************/ +void * herd_master_processer_thread(void *a) { + int thread_id = *((int *) a), best_node, j, quit = 0, multiplier; + int ngroups = effective_nodes(); + int node_base, node_final; + int tmp_j = 0; + char **seq = malloc(sizeof(char *) * 2); + bam1_t **node1_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **node2_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **node3_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **node4_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **best_read = NULL; + fastq *read = malloc(sizeof(fastq)); + time_t now; + char ctime_buffer[26]; + unsigned long long local_m_reads_OT = 0, local_m_reads_OB = 0; + unsigned long long local_m_reads_CTOT = 0, local_m_reads_CTOB = 0; + unsigned long long local_total = 0; + + //Properly set the number of node groups and other small things + if(config.directional) { + ngroups /= 2; + multiplier = 2; + } else { + ngroups /= 4; + multiplier = 4; + } + read->max_name1 = 0; + read->max_seq1 = 0; + read->max_qual1 = 0; + read->max_name2 = 0; + read->max_seq2 = 0; + read->max_qual2 = 0; + read->name1 = NULL; + read->seq1 = NULL; + read->qual1 = NULL; + read->name2 = NULL; + read->seq2 = NULL; + read->qual2 = NULL; + + //Get the minimum and maximum node group to work on + node_base = thread_id*(ngroups/config.nmthreads); + node_final = node_base+(ngroups/config.nmthreads)-1; + if(thread_id+1-config.nmthreads == 0) node_final += ngroups % config.nmthreads; + node_final++; + + //Process read i/o + while(quit < node_final-node_base) { + //Currently, we output everything in the same order as the original input + //We could also invoke one thread per node-group (or multiple groups) and + //then either output in order or randomly (easier to implement). + //I'll have to benchmark things to see if this can keep up + for(j=node_base; jnext->packed)) == '\2') { + //Update! 
+//lock + pthread_mutex_lock(&metrics_mutex); + m_reads_OT += local_m_reads_OT; + m_reads_OB += local_m_reads_OB; + m_reads_CTOT += local_m_reads_CTOT; + m_reads_CTOB += local_m_reads_CTOB; + pthread_mutex_unlock(&metrics_mutex); +//unlock + local_m_reads_OT = 0; + local_m_reads_OB = 0; + local_m_reads_CTOT = 0; + local_m_reads_CTOB = 0; + local_total = 0; + tmp_j = j; + for(j=node_base; jnext->packed); + *seq = read->seq1; + *(*seq + strlen(*seq) - 1) = '\0'; //remove the \n, + if(config.paired) { + *(seq+1) = read->seq2; + *(*(seq+1) + strlen(*(seq+1)) - 1) = '\0'; + } + + //Process the reads + if(!config.paired) { + best_node = process_single(*node1_read, *node2_read, *node3_read, *node4_read, *seq); //Output is stored in read1 + } else { + best_node = process_paired(node1_read, node2_read, node3_read, node4_read, seq); //Output is stored in read + } + + if(best_node == 1) { + best_read = node1_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_OT++; + } else if(best_node == 2) { + best_read = node2_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_OB++; + } else if(best_node == 3) { + best_read = node3_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_CTOT++; + } else if(best_node == 4) { + best_read = node4_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) local_m_reads_CTOB++; + } + + //Store the reads and free up space (N.B., the writer thread will free up the space used by the best read) + if(best_node != 1) { + remove_element(nodes[multiplier*j]); + if(config.paired) remove_element(nodes[multiplier*j]); + } else { + move_element(nodes[multiplier*j], to_write_sentinel_node[thread_id]); + } + if(best_node != 2) { + remove_element(nodes[multiplier*j+1]); + if(config.paired) remove_element(nodes[multiplier*j+1]); + } else { + move_element(nodes[multiplier*j+1], to_write_sentinel_node[thread_id]); + } + if(!config.directional) { + if(best_node != 3) { + remove_element(nodes[multiplier*j+2]); + if(config.paired) 
remove_element(nodes[multiplier*j+2]); + } else { + move_element(nodes[multiplier*j+2], to_write_sentinel_node[thread_id]); + } + if(best_node != 4) { + remove_element(nodes[multiplier*j+3]); + if(config.paired) remove_element(nodes[multiplier*j+3]); + } else { + move_element(nodes[multiplier*j+3], to_write_sentinel_node[thread_id]); + } + } + remove_raw_element(fastq_nodes[j]); + } + } + + //Tell the writer thread that we're finished + add_finished(to_write_sentinel_node[thread_id]); + + //Update the global metrics +//lock + pthread_mutex_lock(&metrics_mutex); + m_reads_OT += local_m_reads_OT; + m_reads_OB += local_m_reads_OB; + m_reads_CTOT += local_m_reads_CTOT; + m_reads_CTOB += local_m_reads_CTOB; + pthread_mutex_unlock(&metrics_mutex); +//unlock + + //Clean up + free(seq); + free(node1_read); + free(node2_read); + free(node3_read); + free(node4_read); + free(read->name1); + free(read->seq1); + free(read->qual1); + if(config.paired) { + free(read->name2); + free(read->seq2); + free(read->qual2); + } + free(read); + if(!config.quiet) printf("Thread %i finishing!\n", thread_id); fflush(stdout); + + return NULL; +} diff --git a/herd/slurp.c b/herd/slurp.c new file mode 100644 index 0000000..8a8cecd --- /dev/null +++ b/herd/slurp.c @@ -0,0 +1,286 @@ +#include "../bison.h" + +/****************************************************************************** +* +* Remove a raw element from the start of a linked-list +* is_ready(first, 0) must return 1! +* +* struct packed_struct *first: first sentinel struct +* +*******************************************************************************/ +void remove_raw_element(struct packed_struct *first) { + struct packed_struct *remove = first->next; + struct packed_struct *new_next = remove->next; + + first->next = new_next; + free(remove->packed); + free(remove); +} + +/****************************************************************************** +* +* Move an element from one linked-list to another. 
+* +* struct packed_struct *source: source linked list +* struct packed-struct *dest: destination sentinel node +* +*******************************************************************************/ +void move_element(struct packed_struct *source, struct packed_struct *dest) { + struct packed_struct *next_to_last = dest->previous; + struct packed_struct *element = source->next; + struct packed_struct *new_next = NULL; + + //Remove from source + if(config.paired) { + new_next = source->next->next->next; //the next read #1 + source->next->next->previous = source->next; //Ensure that read #2 has the address for read #1 + source->next->previous = dest->previous; //Ensure that read #1 points to the previous read #2 + //Remove from source + element->next->next = dest; //point read #2 to the sentinel node + element->state = 0; //read #1 set not ready + element->next->state = 0; //read #2 set not ready + dest->previous = element->next; //Update the destination sentinel node + } else { + new_next = source->next->next; //the next read + source->next->previous = dest->previous; //Ensure that the read knows who came before it + //Remove from source + element->next = dest; //Next is the sentinel node + element->state = 0; //read is set not ready + dest->previous = element; //Update destination sentinel node + } + + //Update the source + source->next = new_next; + + //Add to destination + next_to_last->next = element; //Update previous read to point to the new one + if(!config.paired) { + //Don't do anything if the previous node is a sentinel node + if(next_to_last->previous != next_to_last) next_to_last->state = 1; //set previous read to ready + } else { + //Don't do anything if the previous node is a sentinel node + if(next_to_last->previous != next_to_last) next_to_last->previous->state = 1; //Set previous read #1 to ready (we never check read #2 + } +} + +/****************************************************************************** +* +* Destroy a linked list of 
packed_structs with unmodified ->packed +* +* struct packed_struct *first: linked list to destroy +* +*******************************************************************************/ +void destroy_raw_list(struct packed_struct *first) { + while(first->next->next != first->next) remove_raw_element(first); + free(first->next); + free(first); +} + +/****************************************************************************** +* +* The MPI receiver thread on a bison_herd main node +* +* void *a: NULL input +* +* returns NULL +* +*******************************************************************************/ +#ifndef DEBUG +void *herd_slurp(void *a) { + time_t t0, t1; + void *p = NULL; + int nnodes = effective_nodes(); + int nfinished = 0; + int source = 0; + int size = 0; + struct packed_struct *target_node = NULL; + bam_header_t *tmp_header; + MPI_Status status; + if(MPI_Recv((void *) &size, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &status) != MPI_SUCCESS) { + printf("Received an error when trying to receive header size.\n"); + fflush(stdout); + quit(3, -2); + } + p = malloc((size_t) size); + if(MPI_Recv(p, size, MPI_BYTE, 1, 2, MPI_COMM_WORLD, &status) != MPI_SUCCESS) { + printf("Received an error when trying to receive header.\n"); + fflush(stdout); + quit(3, -2); + } + tmp_header= bam_header_init(); + unpack_header(tmp_header, p); + free(p); + global_header = tmp_header; //Now the writer thread is unblocked! 
+ + t0 = time(NULL); + if(!config.quiet) printf("Started slurping @%s", ctime(&t0)); fflush(stdout); + while(nfinished < nnodes) { + MPI_Probe(MPI_ANY_SOURCE, 5, MPI_COMM_WORLD, &status); + source = status.MPI_SOURCE; + MPI_Get_count(&status, MPI_BYTE, &size); + target_node = last_sentinel_node[source-1]; + + if(size > 1) { + p = malloc((size_t) size); + MPI_Recv(p, size, MPI_BYTE, source, 5, MPI_COMM_WORLD, &status); + add_element(target_node, p); + } else { + p = malloc((size_t) size); + MPI_Recv(p, size, MPI_BYTE, source, 5, MPI_COMM_WORLD, &status); + free(p); + add_finished(target_node); + nfinished++; + } + } + t1 = time(NULL); + if(!config.quiet) printf("Finished slurping @%s\t(%f seconds elapsed)\n", ctime(&t1), difftime(t1, t0)); fflush(stdout); + return NULL; +} +#else +void *herd_slurp(void *a) { + time_t t0, t1; + bamFile fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8; + char *iname = malloc(sizeof(char) * (1+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); + bam1_t *read = bam_init1(); + bam_header_t *tmp; + MPI_read *packed = calloc(1, sizeof(MPI_read)); + struct packed_struct *target_node = NULL; + + //Open the input files and get the header + sprintf(iname, "%s%s_1.bam", config.odir, config.basename); + fp1 = bam_open(iname, "r"); + global_header = bam_header_read(fp1); + sprintf(iname, "%s%s_2.bam", config.odir, config.basename); + fp2 = bam_open(iname, "r"); + tmp = bam_header_read(fp2); + bam_header_destroy(tmp); + sprintf(iname, "%s%s_3.bam", config.odir, config.basename); + fp3 = bam_open(iname, "r"); + tmp = bam_header_read(fp3); + bam_header_destroy(tmp); + sprintf(iname, "%s%s_4.bam", config.odir, config.basename); + fp4 = bam_open(iname, "r"); + tmp = bam_header_read(fp4); + bam_header_destroy(tmp); + sprintf(iname, "%s%s_5.bam", config.odir, config.basename); + fp5 = bam_open(iname, "r"); + tmp = bam_header_read(fp5); + bam_header_destroy(tmp); + sprintf(iname, "%s%s_6.bam", config.odir, config.basename); + fp6 = bam_open(iname, 
"r"); + tmp = bam_header_read(fp6); + bam_header_destroy(tmp); + sprintf(iname, "%s%s_7.bam", config.odir, config.basename); + fp7 = bam_open(iname, "r"); + tmp = bam_header_read(fp7); + bam_header_destroy(tmp); + sprintf(iname, "%s%s_8.bam", config.odir, config.basename); + fp8 = bam_open(iname, "r"); + tmp = bam_header_read(fp8); + bam_header_destroy(tmp); + free(iname); + + + //Write a header + bam_header_write(OUTPUT_BAM, global_header); + packed->size = 0; + + t0 = time(NULL); + if(!config.quiet) printf("Started slurping @%s", ctime(&t0)); fflush(stdout); + + while(bam_read1(fp1, read) > 1) { + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[0]; + add_element(target_node, packed->packed); + + //Node2 + bam_read1(fp2, read); + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[1]; + add_element(target_node, packed->packed); + + //Node3 + bam_read1(fp3, read); + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[2]; + add_element(target_node, packed->packed); + + //Node4 + bam_read1(fp4, read); + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[3]; + add_element(target_node, packed->packed); + + //Node5 + bam_read1(fp5, read); + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[4]; + add_element(target_node, packed->packed); + + //Node6 + bam_read1(fp6, read); + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[5]; + add_element(target_node, packed->packed); + + //Node7 + bam_read1(fp7, read); + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[6]; + add_element(target_node, packed->packed); + + //Node8 + bam_read1(fp8, 
read); + packed->packed = NULL; + packed->size = 0; + packed = pack_read(read, packed); + target_node = last_sentinel_node[7]; + add_element(target_node, packed->packed); + } + free(packed); + bam_destroy1(read); + bam_close(fp1); + bam_close(fp2); + bam_close(fp3); + bam_close(fp4); + bam_close(fp5); + bam_close(fp6); + bam_close(fp7); + bam_close(fp8); + + target_node = last_sentinel_node[0]; + add_finished(target_node); + target_node = last_sentinel_node[1]; + add_finished(target_node); + target_node = last_sentinel_node[2]; + add_finished(target_node); + target_node = last_sentinel_node[3]; + add_finished(target_node); + target_node = last_sentinel_node[4]; + add_finished(target_node); + target_node = last_sentinel_node[5]; + add_finished(target_node); + target_node = last_sentinel_node[6]; + add_finished(target_node); + target_node = last_sentinel_node[7]; + add_finished(target_node); + + t1 = time(NULL); + if(!config.quiet) printf("Finished slurping @%s\t(%f seconds elapsed)\n", ctime(&t1), difftime(t1, t0)); fflush(stdout); + return NULL; +} +#endif diff --git a/herd/worker.c b/herd/worker.c new file mode 100644 index 0000000..c914c63 --- /dev/null +++ b/herd/worker.c @@ -0,0 +1,427 @@ +#include "../bison.h" +#include + +struct packed_struct *first_writer, *first_writer_sentinel; +struct packed_struct *second_writer, *second_writer_sentinel; + +//write read #1 +void * first_writer_func(void *a) { + int thread_id = ((slurp_fastq_struct *) a)->thread_id; + char *fastq1 = ((slurp_fastq_struct *) a)->fastq1; + FILE *f1 = fopen(fastq1, "w"); + fastq *read = malloc(sizeof(fastq)); + int strand; + + //Determine the conversions to make + if(config.directional) { + strand = (thread_id-1) % 2; + } else { + strand = (thread_id-1) % 4; + } + + //Initialize the fastq struct + read->max_name1 = 10; + read->max_name2 = 10; + read->max_seq1 = 10; + read->max_seq2 = 10; + read->max_qual1 = 10; + read->max_qual2 = 10; + read->name1 = malloc(sizeof(char) * 10); + read->seq1 = 
malloc(sizeof(char) * 10); + read->qual1 = malloc(sizeof(char) * 10); + read->name2 = malloc(sizeof(char) * 10); + read->seq2 = malloc(sizeof(char) * 10); + read->qual2 = malloc(sizeof(char) * 10); + + while(1) { + while(!is_ready(first_writer, 0)); //Sleeping slows things down too much + if(is_finished(first_writer)) break; + + //Unpack + read = unpack_fastq(read, first_writer->next->packed); + //Remove from the linked list + remove_raw_element(first_writer); + //Convert + switch(strand) { + case 0 : + case 1 : + convertCT(read, 0); + break; + case 2 : + case 3 : + convertGA(read, 0); + break; + } + fprintf(f1, "%s%s+\n%s", read->name1, read->seq1, read->qual1); + } + + //Free things up + fclose(f1); + free(read->name1); + free(read->seq1); + free(read->qual1); + free(read->name2); + free(read->seq2); + free(read->qual2); + free(read); + destroy_list(first_writer); + return NULL; +} + +//write read #2 +void * second_writer_func(void *a) { + int thread_id = ((slurp_fastq_struct *) a)->thread_id; + char *fastq2 = ((slurp_fastq_struct *) a)->fastq2; + FILE *f2 = fopen(fastq2, "w"); + fastq *read = malloc(sizeof(fastq)); + int strand; + + //Determine the conversions to make + if(config.directional) { + strand = (thread_id-1) % 2; + } else { + strand = (thread_id-1) % 4; + } + + //Initialize the fastq struct + read->max_name1 = 10; + read->max_name2 = 10; + read->max_seq1 = 10; + read->max_seq2 = 10; + read->max_qual1 = 10; + read->max_qual2 = 10; + read->name1 = malloc(sizeof(char) * 10); + read->seq1 = malloc(sizeof(char) * 10); + read->qual1 = malloc(sizeof(char) * 10); + read->name2 = malloc(sizeof(char) * 10); + read->seq2 = malloc(sizeof(char) * 10); + read->qual2 = malloc(sizeof(char) * 10); + + while(1) { + while(!is_ready(second_writer, 0)); //Sleeping slows things down too much + if(is_finished(second_writer)) break; + + //Unpack + read = unpack_fastq(read, second_writer->next->packed); + //Remove from the linked list + remove_raw_element(second_writer); + 
//Convert + switch(strand) { + case 0 : + case 1 : + convertGA(read, 1); + break; + case 2 : + case 3 : + convertCT(read, 1); + break; + } + fprintf(f2, "%s%s+\n%s", read->name2, read->seq2, read->qual2); + } + + //Free things up + fclose(f2); + free(read->name1); + free(read->seq1); + free(read->qual1); + free(read->name2); + free(read->seq2); + free(read->qual2); + free(read); + destroy_list(second_writer); + return NULL; +} + +/****************************************************************************** +* +* This receives the reads, converts them, and writes them to the FIFO(s) +* +* void *a: a pointer to a struct with the following components: +* +* int thread_id: the thread_id +* char *fastq1: FIFO from which bowtie2 can get read1 +* char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) +* +*******************************************************************************/ +void * slurp_fastq(void *a) { + pthread_t threads[2]; + void *p = NULL, *p2 = NULL, *p3 = NULL; + int size = 0, current_p_size = 0; + MPI_Status status; + fastq *read = malloc(sizeof(fastq)); + + first_writer = malloc(sizeof(struct packed_struct)); + first_writer_sentinel = malloc(sizeof(struct packed_struct)); + first_writer = initialize_list(first_writer); + first_writer_sentinel = first_writer->next; + pthread_create(&(threads[0]), NULL, &first_writer_func, a); + if(config.paired) { + //If we have pairs, then writing simultaneuosly to two fifos (that will be read sequentially by bowtie2) won't work, since bowtie2 will read from a single fifo multiple times!!! 
+ second_writer = malloc(sizeof(struct packed_struct)); + second_writer_sentinel = malloc(sizeof(struct packed_struct)); + second_writer = initialize_list(second_writer); + second_writer_sentinel = second_writer->next; + pthread_create(&(threads[1]), NULL, &second_writer_func, a); + } + + //Initialize the fastq struct + read->max_name1 = 10; + read->max_name2 = 10; + read->max_seq1 = 10; + read->max_seq2 = 10; + read->max_qual1 = 10; + read->max_qual2 = 10; + read->name1 = malloc(sizeof(char) * 10); + read->seq1 = malloc(sizeof(char) * 10); + read->qual1 = malloc(sizeof(char) * 10); + read->name2 = malloc(sizeof(char) * 10); + read->seq2 = malloc(sizeof(char) * 10); + read->qual2 = malloc(sizeof(char) * 10); + + //Receive and process the raw reads + while(1) { + MPI_Probe(0, 3, MPI_COMM_WORLD, &status); + MPI_Get_count(&status, MPI_BYTE, &size); + if(size > current_p_size) { + p = realloc(p, (size_t) size); + } + MPI_Recv(p, size, MPI_BYTE, 0, 3, MPI_COMM_WORLD, &status); + //Are we finished receiving? + if(size <= 1) break; + + //Copy if needed + if(config.paired) { + p2 = malloc(size); + memcpy(p2,p,size); + add_element(second_writer_sentinel, p2); + } + p3 = malloc(size); + memcpy(p3,p,size); + add_element(first_writer_sentinel, p3); + } + add_finished(first_writer_sentinel); + if(config.paired) add_finished(second_writer_sentinel); + + //Wait for the other thread + pthread_join(threads[0], NULL); + if(config.paired) { + pthread_join(threads[1], NULL); + } + + //Free things up + free(p); + free(read->name1); + free(read->seq1); + free(read->qual1); + free(read->name2); + free(read->seq2); + free(read->qual2); + free(read); + + return NULL; +} + +/****************************************************************************** +* +* The main worker node function. 
+* +* int thread_id: the thread_id +* char *fastq1: FIFO from which bowtie2 can get read1 +* char *fastq2: FIFO from which bowtie2 can get read2 (if it exists) +* +*******************************************************************************/ +void herd_worker_node(int thread_id, char *fastq1, char *fastq2) { + int cmd_length = 1, max_qname = 0, status, strand; + char *cmd, *last_qname = calloc(1, sizeof(char)); + MPI_Header *packed_header; + MPI_read *packed_read = calloc(1, sizeof(MPI_read)); + bam_header_t *header; + bam1_t *read1 = bam_init1(); + bam1_t *read2 = bam_init1(); + tamFile fp; +#ifdef DEBUG + MPI_Status stat; + int current_p_size = 100; + bamFile of; + bam_header_t *debug_header = bam_header_init(); + bam1_t *debug_read = bam_init1(); + global_header = bam_header_init(); + void *p = calloc(100,1); + char *oname = NULL; +#else + int i = 0; +#endif + time_t t0, t1; + + //Which strand should we be aligning to? + if(config.directional) { + strand = (thread_id-1) % 2; + } else { + strand = (thread_id-1) % 4; + } + + packed_read->size = 0; + packed_read->packed = NULL; + + //construct the bowtie2 command + cmd_length += (int) strlen("bowtie2 -q --reorder --no-mixed --no-discordant") + 1; + cmd_length += (int) strlen(config.bowtie2_options) + 1; + cmd_length += (int) strlen("--norc -x") + 1; + cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1; + cmd_length += (int) 2*(strlen("-1 ") + strlen(fastq1)) + 3; + if(config.paired) cmd_length += (int) strlen(fastq2); //This is likely unneeded. 
+ +#ifdef DEBUG + oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_X.bam"))); + sprintf(oname, "%s%s_%i.bam", config.odir, config.basename, thread_id); + if(!config.quiet) printf("Writing output to %s\n", oname); + of = bam_open(oname, "w"); + free(oname); +#endif + + cmd = (char *) malloc(sizeof(char) * cmd_length); + if(strand == 0) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); + } + } else if(strand == 1) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); + } + } else if(strand == 2) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, fastq1); + } + } else if(strand == 3) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed 
--no-discordant %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, fastq1, fastq2); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, fastq1); + } + } else { + printf("Oh shit, got strand %i!\n", strand); + return; + } + + //Start the process + if(!config.quiet) printf("Node %i executing: %s\n", thread_id, cmd); fflush(stdout); + fp = sam_popen(cmd); + header = sam_header_read(fp); +#ifdef DEBUG + bam_header_write(of, header); +#endif + +#ifndef DEBUG + packed_header = pack_header(header); + if(thread_id == 1) { + //Send the header + MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD); + status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD); + if(status != MPI_SUCCESS) { + printf("MPI_Send returned %i\n", status); + fflush(stdout); + } + } +#else + packed_header = pack_header(header); + void *tmp_pointer = malloc(packed_header->size); + MPI_Request request; + MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &request); + status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD, &stat); + if(status != MPI_SUCCESS) printf("We seem to have not been able to send the message to ourselves!\n"); + MPI_Wait(&request, &stat); + unpack_header(debug_header, tmp_pointer); + global_header = debug_header; + free(tmp_pointer); +#endif + + t0 = time(NULL); + if(!config.quiet) printf("Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stdout); + while(sam_read1(fp, header, read1) > 1) { +#ifdef DEBUG + bam_write1(of, read1); +#endif + if(strcmp(bam1_qname(read1), last_qname) == 0) { //Multimapper + if(config.paired) { + sam_read1(fp, header, read2); +#ifdef DEBUG + bam_write1(of, read2); +#endif + } + continue; + } else { + 
if(read1->core.l_qname > max_qname) { + max_qname = read1->core.l_qname + 10; + last_qname = realloc(last_qname, sizeof(char) * max_qname); + } + strcpy(last_qname, bam1_qname(read1)); + } + + //Send the read + packed_read = pack_read(read1, packed_read); +#ifndef DEBUG + MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); +#else + if(packed_read->size > current_p_size) p = realloc(p, packed_read->size); + MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); + status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); + MPI_Wait(&request, &stat); +#endif + //Deal with paired-end reads + if(config.paired) { + sam_read1(fp, header, read2); + packed_read = pack_read(read2, packed_read); +#ifndef DEBUG + MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); +#else + bam_write1(of, read2); + if(packed_read->size > current_p_size) p = realloc(p, packed_read->size); + MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &request); + status = MPI_Recv(p, packed_header->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD, &stat); + MPI_Wait(&request, &stat); + debug_read = unpack_read(debug_read, p); +#endif + } +#ifndef DEBUG + i++; +#endif + } + t1 = time(NULL); + if(!config.quiet) printf("Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stdout); + + //Notify the master node + packed_read->size = 0; +#ifndef DEBUG + void *A = malloc(1); + MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD); + free(A); +#endif + + //Close things up + bam_header_destroy(header); + bam_destroy1(read1); + bam_destroy1(read2); + free(cmd); + if(packed_read->packed != NULL) free(packed_read->packed); + free(packed_read); + if(packed_header->packed != NULL) free(packed_header->packed); + free(packed_header); + free(last_qname); + sam_pclose(fp); + //Remove the FIFO(s) + unlink(fastq1); 
+ if(config.paired) unlink(fastq2); +#ifdef DEBUG + bam_close(of); + bam_header_destroy(debug_header); + bam_destroy1(debug_read); + free(p); +#endif + if(!config.quiet) printf("Exiting worker node %i\n", thread_id); fflush(stdout); +}; diff --git a/herd/writer.c b/herd/writer.c new file mode 100644 index 0000000..0122ee0 --- /dev/null +++ b/herd/writer.c @@ -0,0 +1,204 @@ +#include "../bison.h" +#include + +/****************************************************************************** +* +* Update the CpG/CHG/CHH metrics according to the methylation calls in a read +* +*******************************************************************************/ +void herd_update_counts(bam1_t *read) { + char *XM = bam_aux2Z(bam_aux_get(read, "XM")); + char base; + int i; + + for(i=0; icore.l_qseq; i++) { + base = *(XM+i); + if(base != '.') { + if(base == 'Z') { + t_CpG++; + m_CpG++; + } else if(base == 'z') { + t_CpG++; + } else if(base == 'X') { + t_CHG++; + m_CHG++; + } else if(base == 'x') { + t_CHG++; + } else if(base == 'H') { + t_CHH++; + m_CHH++; + } else if(base == 'h') { + t_CHH++; + } + } + } +} + +void herd_setup(char *fname1, char *fname2) { + char *cmd = NULL; + if(config.basename) free(config.basename); + config.basename = get_basename(fname1); + config.outname = realloc(config.outname, sizeof(char)*(strlen(config.odir)+ strlen(config.basename)+5)); + sprintf(config.outname, "%s%s.bam", config.odir, config.basename); + //Open the output file handles + if(config.unmapped) { + create_fastq_names(fname1, fname2); + cmd = malloc(sizeof(char) * (strlen(config.unmapped1) + 8)); + if(!config.quiet) printf("Unmapped reads will be written to %s\n", config.unmapped1); + sprintf(cmd, "gzip > %s", config.unmapped1); + unmapped1 = popen(cmd, "w"); + if(config.paired) { + cmd = realloc(cmd, sizeof(char) * (strlen(config.unmapped2) + 8)); + if(!config.quiet) printf("Unmapped reads will be written to %s\n", config.unmapped2); + sprintf(cmd, "gzip > %s", config.unmapped2); + 
unmapped2 = popen(cmd, "w"); + } + free(cmd); + } + + //Open a file for output + OUTPUT_BAM = bam_open(config.outname, "w"); + if(OUTPUT_BAM == NULL) { + printf("Could not open %s for writing!\n", config.outname); + quit(2,-1); + } + if(!config.quiet) printf("Alignments will be written to %s\n",config.outname); + if(config.n_compression_threads > 1) bgzf_mt(OUTPUT_BAM, config.n_compression_threads, 256); + bam_header_write(OUTPUT_BAM, global_header); + if(!config.quiet) printf("Alignment metrics will be printed to %s%s.txt\n",config.odir,config.basename); + fflush(stdout); +} + +/****************************************************************************** +* +* This function will run as its own thread and process the linked lists +* output from the master processor threads, writing them in order to a BAM +* file. This will also write all of the other output (aside from metrics). +* Furthermore, this provides a readout of the current number of reads +* processed. +* +* Output is NULL, as is the input (needed by pthreads). +* +*******************************************************************************/ +void * bam_writer(void *a) { + int i, j, *times = malloc(sizeof(int)*config.nmthreads); + int times_per_thread = effective_nodes(); + int nfinished = 0; + int nlooped = 0, current_file = 0; + bam1_t *best_read1 = NULL; + bam1_t *best_read2 = NULL; + time_t now; + char ctime_buffer[26]; + + //If we write output in the exact same order as the input, we need to know + //how many times to write from each master_processor_thread before going to the next + if(config.directional){ + times_per_thread /= 2; + } else { + times_per_thread /= 4; + } + for(i=0; i 0 && flengths[current_file] == t_reads) { + print_metrics(); + t_reads = 0; + m_reads_OT = 0; + m_reads_OB = 0; + m_reads_CTOT = 0; + m_reads_CTOB = 0; + t_CpG = 0; + m_CpG = 0; + t_CHG = 0; + m_CHG = 0; + t_CHH = 0; + m_CHH = 0; + //Are we finished? 
+ if(is_finished(to_write_node[i])) goto finished; + if(unmapped1 != NULL) pclose(unmapped1); + if(unmapped2 != NULL) pclose(unmapped2); + bam_close(OUTPUT_BAM); + current_file++; + herd_setup(fnames1[current_file], fnames2[current_file]); + i=0; + j=0; + } + //Just poll every second if we haven't yet written anything or if we've already looped a few times + if(i == 0 && j == 0) nfinished = 0; + if(!config.reorder) { + if(!is_ready(to_write_node[i], 0)) { + if(config.nmthreads == 1) { + sleep(1); //This is the same as --reorder + break; + } + if(t_reads == 0) sleep(1); + if(++nlooped > 100) { + if(nlooped > 1000) nlooped = 1000; + sleep(1); + } + break; + } + } else { + while(!is_ready(to_write_node[i], 0)) sleep(1); + } + if(is_finished(to_write_node[i])) { + nfinished += 1; + break; + } + best_read1 = to_write_node[i]->next->packed; + if(config.paired) best_read2 = to_write_node[i]->next->next->packed; + if(!config.paired) { //single-end + if(!(best_read1->core.flag & BAM_FUNMAP)) { + bam_write1(OUTPUT_BAM, best_read1); + herd_update_counts(best_read1); + } else { + if(config.unmapped) write_unmapped(unmapped1, best_read1); + } + } else { + if(!(best_read1->core.flag & BAM_FUNMAP) && !(best_read2->core.flag & BAM_FUNMAP)) { + bam_write1(OUTPUT_BAM, best_read1); + herd_update_counts(best_read1); + bam_write1(OUTPUT_BAM, best_read2); + herd_update_counts(best_read2); + } else { + if(config.unmapped) { + write_unmapped(unmapped1, best_read1); + write_unmapped(unmapped2, best_read2); + } + } + } + + remove_element(to_write_node[i]); + if(config.paired) remove_element(to_write_node[i]); + nlooped = 0; + t_reads++; + nwritten[current_file]++; //Only keep track of this if we're throttling + + //Give some status + if((t_reads % 100000) == 0) { + now = time(NULL); + if(!config.quiet) printf("%llu reads written @ %s", t_reads, ctime_r(&now, ctime_buffer)); fflush(stdout); + } + } + } + } + +//This isn't elegant, but... 
+finished: + if(t_reads != 0) print_metrics(); //There seems to be a race condition for the last sample in a list. This gets around that. + now = time(NULL); + if(!config.quiet) printf("Finished writing output @%s", ctime_r(&now, ctime_buffer)); fflush(stdout); + + free(times); + for(i=0; i +#include +#include +#include +#include +#include +#include +#include + +#define MAXLINE 1024 + +typedef struct { + char *options; + char *odir; + char conversion; +} btoptions_struct; + +void usage(char *prog) { + printf("Usage: %s [options] reference(s)\n", prog); + printf("\n \ +Note, references is a comma separated list of FASTA files. A \"bisulfite_genome\"\n \ +directory with CT_conversion and GA_conversion subdirectories will be created.\n \ +While the directory structure and indexing method are identical to bismark, the\n \ +resulting indexes are not compatible, owing to bismark's changing of\n \ +chromosome/contig names.\n \ +\n \ +Options are currently identical to those for bowtie2-build\n \ +(http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer),\n \ +as this program is effectively just a wrapper.\n"); +} + +void * bt2_build(void *a) { + btoptions_struct *options = (btoptions_struct *) a; + int rv; + char *cmd; + + //Create the command + cmd = malloc(sizeof(char) * (strlen(options->options) + 2*strlen(options->odir) + 200)); + if(options->conversion == 'C') { + sprintf(cmd, "bowtie2-build %s %s/genome.fa %s/BS_CT", options->options, options->odir, options->odir); + } else { + sprintf(cmd, "bowtie2-build %s %s/genome.fa %s/BS_GA", options->options, options->odir, options->odir); + } + printf("Now executing: %s\n", cmd); + rv = system(cmd); + if(rv) printf("%s returned with status %i!\n", cmd, rv); + return NULL; +} + +int main(int argc, char *argv[]) { + char *odir, *p, *CT_dir, *GA_dir; + char *CT_line, *GA_line; + char *options; + FILE *fp, *CT, *GA; + btoptions_struct CT_data, GA_data; + pthread_t threads[2]; + int i; + + if(argc == 1) { 
+ usage(argv[0]); + return 0; + } else if(strcmp(argv[1], "-h") == 0) { + usage(argv[0]); + return 0; + } + + //Store bowtie2-build options + options = (char *) calloc(1, sizeof(char)); + for(i=1; i= bowtie2_options_max) { + bowtie2_options_max = strlen(config.bowtie2_options) + 1 + strlen(argv[i]) + 100; + config.bowtie2_options = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max); + } + strcat(config.bowtie2_options, " "); + strcat(config.bowtie2_options, argv[i]); + } + } + +#ifndef DEBUG + if(!config.quiet) {printf("%s has rank %i\n", processor_name, taskid); fflush(stdout);} +#endif + + if(config.FASTQ1 == NULL || config.genome_dir == NULL || (config.FASTQ2 == NULL && config.paired == 1)) { + if(taskid == MASTER) { + printf("No FASTQ files!\n"); + usage(argv[0]); + } + quit(0, -1); + } + + //Allocate room for the genome, if needed + if(taskid == MASTER) { + chromosomes.max_genome = 3000000000; + if(!config.quiet) printf("Allocating space for %llu characters\n", chromosomes.max_genome); + fflush(stdout); + chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome); + *chromosomes.genome = '\0'; + if(chromosomes.genome == NULL) { + printf("Could not allocate enough room to hold the genome!\n"); + return -1; + } + } else { + chromosomes.max_genome = 0; + } + + //Append score_min, and p + if(strlen(config.bowtie2_options) + 1000 >= bowtie2_options_max) { + bowtie2_options_max = strlen(config.bowtie2_options) + 1000; //This should suffice + config.bowtie2_options = realloc(config.bowtie2_options, sizeof(char) * bowtie2_options_max); + } + sprintf(config.bowtie2_options, "%s -p %i --score-min '%c,%g,%g'", config.bowtie2_options, config.nthreads, config.scoremin_type, config.scoremin_intercept, config.scoremin_coef); + + //There should be as many tasks according to MPI as dictated by the library type. 
+#ifndef DEBUG + ntasks = 5; + if(config.directional) ntasks = 3; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_ntasks); + if(mpi_ntasks < ntasks) { + if(taskid == MASTER) printf("There are only %i nodes available but we need %i to work. You need to allocate more nodes!\n", mpi_ntasks, ntasks); + quit(0, -1); + } +#endif + /*********************************************************************************************** + * + * Convert the input reads C->T and G->A as needed. There are 4 use cases: + * Directional: + * Paired-end: FASTQ1 will be C->T converted and FASTQ2 will be G->A converted. + * Single-end: FASTQ1 will be C->T converted + * Non-directional: + * Paired-end: Both FASTQ1 and FASTQ2 will be C->T and G->A converted. + * Single-end: FASTQ1 will be both C->T and G->A converted. + * + * convert_fastq() takes a single integer parameter: + * 8 = convert FASTQ1 C->T + * 4 = convert FASTQ1 G->A + * 2 = convert FASTQ2 C->T + * 1 = convert FASTQ2 G->A + * + ***********************************************************************************************/ + update_odir(); + create_fastq_names(config.FASTQ1, config.FASTQ2); + +#ifndef DEBUG + if(taskid == MASTER) { +#endif + //MASTER specific procedures + config.basename = get_basename(config.FASTQ1); + config.outname = malloc(sizeof(char)*(strlen(config.odir)+ strlen(config.basename)+5)); + sprintf(config.outname, "%s%s.bam", config.odir, config.basename); +#ifdef DEBUG + //When debugging, don't convert the files if it's already been done + if(access(config.FASTQ1CT, F_OK) == -1) { +#endif + if(config.directional) { + if(config.paired) { + convert_fastq(9, upto); + } else { + convert_fastq(8, upto); + } + } else { + if(config.paired) { + convert_fastq(15, upto); + } else { + convert_fastq(12, upto); + } + } +#ifdef DEBUG + } + //Just convert the reads + if(taskid == -1) { + quit(3, 0); + return 0; + } +#endif + +#ifdef DEBUG + if(taskid == MASTER) { +#endif + //Open the input reads + cmd = malloc(sizeof(char) * 
(strlen(config.FASTQ1) + 7)); + p = strrchr(config.FASTQ1, '.'); + if(strcmp(p,".gz") == 0 || strcmp(p,".GZ") == 0) { + sprintf(cmd, "zcat %s", config.FASTQ1); + } else if(strcmp(p,".bz") == 0 || strcmp(p,".bz2") == 0) { + sprintf(cmd, "bzcat %s", config.FASTQ1); + } else { + sprintf(cmd, "cat %s", config.FASTQ1); + } + zip1 = popen(cmd, "r"); + if(config.paired) { + cmd = realloc(cmd, sizeof(char) * (strlen(config.FASTQ2) + 7)); + p = strrchr(config.FASTQ2, '.'); + if(strcmp(p,".gz") == 0 || strcmp(p,".GZ") == 0) { + sprintf(cmd, "zcat %s", config.FASTQ2); + } else if(strcmp(p,".bz") == 0 || strcmp(p,".bz2") == 0) { + sprintf(cmd, "bzcat %s", config.FASTQ2); + } else { + sprintf(cmd, "cat %s", config.FASTQ2); + } + zip2 = popen(cmd, "r"); + } + + //Open the output file handles + if(config.unmapped) { + cmd = realloc(cmd, sizeof(char) * (strlen(config.unmapped1) + 8)); + if(!config.quiet) printf("Writing unmapped reads to %s\n", config.unmapped1); + sprintf(cmd, "gzip > %s", config.unmapped1); + unmapped1 = popen(cmd, "w"); + if(config.paired) { + cmd = realloc(cmd, sizeof(char) * (strlen(config.unmapped2) + 8)); + if(!config.quiet) printf("Writing unmapped reads to %s\n", config.unmapped2); + sprintf(cmd, "gzip > %s", config.unmapped2); + unmapped2 = popen(cmd, "w"); + } + } + free(cmd); + + //Store the genome into memory + read_genome(); + + //Open a file for output + OUTPUT_BAM = bam_open(config.outname, "w"); + if(OUTPUT_BAM == NULL) { + printf("Could not open %s for writing!\n", config.outname); + quit(2,-1); + } + if(!config.quiet) printf("Alignment metrics will be printed to %s%s.txt\n",config.odir,config.basename); + fflush(stdout); + + //Setup the linked-lists + node1 = initialize_list(node1); + node1_last_sentinel = node1->next; + node2 = initialize_list(node2); + node2_last_sentinel = node2->next; + node3 = initialize_list(node3); + node3_last_sentinel = node3->next; + node4 = initialize_list(node4); + node4_last_sentinel = node4->next; + + //Start the 
master node processer threads + threads = calloc(1, sizeof(pthread_t)); + pthread_create(&(threads[0]), NULL, &master_processer_thread, NULL); + slurp(NULL); + pthread_join(threads[0], NULL); + + //Start freeing things up + if(!config.quiet) printf("Closing input files\n"); + free(threads); + pclose(zip1); + if(config.paired) pclose(zip2); + + //Print some metrics + print_metrics(); + } else { + //worker node stuff, wait for the master + worker_node(taskid); + if(!config.quiet) printf("Returning from worker node %i\n", taskid); + fflush(stdout); + } + + //Clean up + if(config.odir != NULL) free(config.odir); + quit(3, 0); + return 0; +} diff --git a/markduplicates.c b/markduplicates.c new file mode 100644 index 0000000..fc2c20c --- /dev/null +++ b/markduplicates.c @@ -0,0 +1,286 @@ +#include +#include +#include + +#define WORD_OFFSET(b) b/32 +#define BIT_OFFSET(b) b%32 + +typedef struct { + int32_t tid, start1, start2, stop1, stop2; + int strand, MAPQ; + unsigned read_number; +} alignment; + +typedef struct { + uint64_t nelements; + int threadid; + alignment *alignments; +} qsort_func_struct; + +/* + Sort a list of alignments, they'll be ordered as follows (always low to high): + (1) tid (chromosome index ID) + (2) start1 (5' position of read #1) + (3) stop1 (3' position of read #1) + (4) start2 (5' position of read #2) + (5) stop2 (3' position of read #2) + (6) strand (0 OT, 1 OB, 2 CTOT, 3 CTOB) + (7) MAPQ (high to low) +*/ +int comp_func(const void *a, const void *b) { + alignment *a1 = (alignment*) a; + alignment *a2 = (alignment*) b; + + if(a1->tid < a2->tid) return -1; + else if(a1->tid > a2->tid) return 1; + else { + if(a1->start1 < a2->start1) return -1; + else if(a1->start1 > a2->start1) return 1; + else { + if(a1->stop1 < a2->stop1) return -1; + else if(a1->stop1 > a2->stop1) return 1; + else { + if(a1->start2 < a2->start2) return -1; + else if(a1->start2 > a2->start2) return 1; + else { + if(a1->stop2 < a2->stop2) return -1; + else if(a1->stop2 > 
a2->stop2) return 1; + else { + if(a1->strand < a2->strand) return -1; + else if(a1->strand > a2->strand) return 1; + else { //This will be the other way around + if(a1->MAPQ > a2->MAPQ) return -1; + else if(a1->MAPQ < a2->MAPQ) return 1; + else return 0; + } + } + } + } + } + } +} + +//This is the same as comp_func(), except that MAPQ is ignored +int comp_func2(const void *a, const void *b) { + alignment *a1 = (alignment*) a; + alignment *a2 = (alignment*) b; + + if(a1->tid < a2->tid) return -1; + else if(a1->tid > a2->tid) return 1; + else { + if(a1->start1 < a2->start1) return -1; + else if(a1->start1 > a2->start1) return 1; + else { + if(a1->stop1 < a2->stop1) return -1; + else if(a1->stop1 > a2->stop1) return 1; + else { + if(a1->start2 < a2->start2) return -1; + else if(a1->start2 > a2->start2) return 1; + else { + if(a1->stop2 < a2->stop2) return -1; + else if(a1->stop2 > a2->stop2) return 1; + else { + if(a1->strand < a2->strand) return -1; + else if(a1->strand > a2->strand) return 1; + else return 0; + } + } + } + } + } +} + +void *qsort_func(void *a) { + uint64_t total_pairs = ((qsort_func_struct *) a)->nelements; + int thread_id = ((qsort_func_struct *) a)->threadid; + uint64_t nelements = total_pairs/(thread_id+1); + uint64_t offset = thread_id*nelements; + alignment *alignments = ((qsort_func_struct *) a)->alignments; + void *p = (void *) (alignments+offset); + + if(thread_id == 5) nelements += total_pairs % thread_id; + qsort(p, (size_t) nelements, sizeof(alignment), &comp_func); + + return NULL; +} + +//Set 1 at the given offset +void set_bit(uint32_t *map, uint64_t n) { + map[WORD_OFFSET(n)] |= (1 << BIT_OFFSET(n)); +} + +//Get the value at a given offset +int get_bit(uint32_t *map, uint64_t n) { + uint32_t bit = map[WORD_OFFSET(n)] & (1 << BIT_OFFSET(n)); + return bit != 0; +} + +uint64_t mark_dups(alignment *alignments, uint32_t *bitmap, uint64_t total_pairs) { + uint64_t i, ndups = 0; + void *cur_alignment = (void *) alignments; + + for(i=1;i 1) 
{ + alignments[total_pairs].tid = read->core.tid; + alignments[total_pairs].start1 = read->core.pos; + alignments[total_pairs].stop1 = bam_calend(&(read->core), bam1_cigar(read)); + alignments[total_pairs].MAPQ = read->core.qual; + alignments[total_pairs].strand = get_strand(read); + if(read->core.flag & BAM_FPAIRED) { + assert(bam_read1(fp,read)>1); + alignments[total_pairs].start2 = read->core.pos; + alignments[total_pairs].stop2 = bam_calend(&(read->core), bam1_cigar(read)); + } else { + alignments[total_pairs].start2 = 0; + alignments[total_pairs].stop2 = 0; + } + alignments[total_pairs].read_number = total_pairs; + total_pairs++; + + //Lengthen the array as needed + if(max_length-total_pairs == 0) { + max_length += grow_size; + alignments = realloc(alignments, max_length * sizeof(alignment)); + assert(alignments != NULL); + } + } + bam_close(fp); + + //create bitmap + bitmap_length = max_length/32; + bitmap_length += (max_length % 32 > 0) ? 1 : 0; + bitmap = calloc(bitmap_length, sizeof(uint32_t)); + + //Sort + qsort((void *) alignments, (size_t) total_pairs, sizeof(alignment), &comp_func); + + //Mark duplicates in bitmap + ndups = mark_dups(alignments, bitmap, total_pairs); + free(alignments); + printf("There were %"PRIu64" duplicates from %"PRIu64" total reads or pairs\n", ndups, total_pairs); + + //reopen file, iterate through and change flags as appropriate + fp = bam_open(iname, "r"); + header = bam_header_read(fp); + of = bam_open(oname, "w"); + bgzf_mt(of, 4, 256); //This should be user configurable + bam_header_write(of, header); + + while(bam_read1(fp, read) > 1) { + if(get_bit(bitmap, cur_read)) read->core.flag = read->core.flag | BAM_FDUP; + bam_write1(of, read); + if(read->core.flag & BAM_FPAIRED) { + assert(bam_read1(fp, read) > 1); + if(get_bit(bitmap, cur_read)) read->core.flag = read->core.flag | BAM_FDUP; + bam_write1(of, read); + } + cur_read++; + } + + //Clean up + bam_close(fp); + bam_close(of); + bam_destroy1(read); + free(bitmap); + + 
return 0; +} diff --git a/master.c b/master.c new file mode 100644 index 0000000..fd3ef2e --- /dev/null +++ b/master.c @@ -0,0 +1,1091 @@ +#include "bison.h" +#include +#include + +typedef struct { + unsigned long long t_reads; //total reads + unsigned long long m_reads_OT; //reads mapped to the OT + unsigned long long m_reads_OB; + unsigned long long m_reads_CTOT; + unsigned long long m_reads_CTOB; + unsigned long long t_CpG; //Total CpGs + unsigned long long m_CpG; //Methylated CpGs + unsigned long long t_CHG; + unsigned long long m_CHG; + unsigned long long t_CHH; + unsigned long long m_CHH; +} metrics_struct; + +/****************************************************************************** +* +* Update the CpG/CHG/CHH metrics according to the methylation calls in a read +* +*******************************************************************************/ +void update_counts(bam1_t *read, metrics_struct *metrics) { + char *XM = bam_aux2Z(bam_aux_get(read, "XM")); + char base; + int i; + + for(i=0; icore.l_qseq; i++) { + base = *(XM+i); + if(base != '.') { + if(base == 'Z') { + metrics->t_CpG++; + metrics->m_CpG++; + } else if(base == 'z') { + metrics->t_CpG++; + } else if(base == 'X') { + metrics->t_CHG++; + metrics->m_CHG++; + } else if(base == 'x') { + metrics->t_CHG++; + } else if(base == 'H') { + metrics->t_CHH++; + metrics->m_CHH++; + } else if(base == 'h') { + metrics->t_CHH++; + } + } + } +} + +/****************************************************************************** +* +* Return the alignment score or -MAX_INT if unaligned +* +* bam1_t *read: the read in question +* +*******************************************************************************/ +int get_AS(bam1_t *read) { + int AS = INT_MIN>>2; + uint8_t *p = bam_aux_get(read, "AS"); + + if(read->core.flag & BAM_FUNMAP) return AS; + if(p != NULL) AS = bam_aux2i(p); + return AS; +} + +/****************************************************************************** +* +* Calculate the minimum score 
for a given readlength +* +* int32_t rlen: a read length +* +*******************************************************************************/ +inline int scoreMin(int32_t rlen) { + //Return different values, depending on --score-min + if(config.scoremin_type == 'L') { + return (config.scoremin_intercept + config.scoremin_coef * rlen); + } else if(config.scoremin_type == 'S') { + return (config.scoremin_intercept + config.scoremin_coef * sqrt((float) rlen)); + } else if(config.scoremin_type == 'G') { + return (config.scoremin_intercept + config.scoremin_coef * log((float) rlen)); + } else { //'C' + return (config.scoremin_intercept + config.scoremin_coef); + } +} + +/****************************************************************************** +* +* Return the secondary alignment score or -MAX_INT if unaligned +* +* bam1_t *read: the read in question +* +*******************************************************************************/ +int get_XS(bam1_t *read) { + int XS = INT_MIN>>2; + uint8_t *p = bam_aux_get(read, "XS"); + + if(read->core.flag & BAM_FUNMAP) return XS; + if(p != NULL) XS = bam_aux2i(p); + return XS; +} + +/****************************************************************************** +* +* Calculate a MAPQ, given AS, XS, and the minimum score (ala bowtie2) +* +*******************************************************************************/ +int calc_MAPQ_BT2(int AS, int XS, int scMin) { + int diff, bestOver, bestdiff; + diff = abs(scMin); //Range of possible alignment scores + bestOver = AS-scMin; //Shift alignment score range, so worst score is 0 + + //This seems like an odd way to calculate this! 
+ + //The method depends on config.mode + bestdiff = (int) abs(abs((float) AS)-abs((float) XS)); //Absolute distance between alignment scores + if(config.mode == 0) { //--end-to-end (default) + if(XS < scMin) { + if(bestOver >= diff * (double) 0.8f) return 42; + else if(bestOver >= diff * (double) 0.7f) return 40; + else if(bestOver >= diff * (double) 0.6f) return 24; + else if(bestOver >= diff * (double) 0.5f) return 23; + else if(bestOver >= diff * (double) 0.4f) return 8; + else if(bestOver >= diff * (double) 0.3f) return 3; + else return 0; + } else { + if(bestdiff >= diff * (double) 0.9f) { + if(bestOver == diff) { + return 39; + } else { + return 33; + } + } else if(bestdiff >= diff * (double) 0.8f) { + if(bestOver == diff) { + return 38; + } else { + return 27; + } + } else if(bestdiff >= diff * (double) 0.7f) { + if(bestOver == diff) { + return 37; + } else { + return 26; + } + } else if(bestdiff >= diff * (double) 0.6f) { + if(bestOver == diff) { + return 36; + } else { + return 22; + } + } else if(bestdiff >= diff * (double) 0.5f) { + if(bestOver == diff) { + return 35; + } else if(bestOver >= diff * (double) 0.84f) { + return 25; + } else if(bestOver >= diff * (double) 0.68f) { + return 16; + } else { + return 5; + } + } else if(bestdiff >= diff * (double) 0.4f) { + if(bestOver == diff) { + return 34; + } else if(bestOver >= diff * (double) 0.84f) { + return 21; + } else if(bestOver >= diff * (double) 0.68f) { + return 14; + } else { + return 4; + } + } else if(bestdiff >= diff * (double) 0.3f) { + if(bestOver == diff) { + return 32; + } else if(bestOver >= diff * (double) 0.88f) { + return 18; + } else if(bestOver >= diff * (double) 0.67f) { + return 15; + } else { + return 3; + } + } else if(bestdiff >= diff * (double) 0.2f) { + if(bestOver == diff) { + return 31; + } else if(bestOver >= diff * (double) 0.88f) { + return 17; + } else if(bestOver >= diff * (double) 0.67f) { + return 11; + } else { + return 0; + } + } else if(bestdiff >= diff * (double) 
0.1f) { + if(bestOver == diff) { + return 30; + } else if(bestOver >= diff * (double) 0.88f) { + return 12; + } else if(bestOver >= diff * (double) 0.67f) { + return 7; + } else { + return 0; + } + } else if(bestdiff > 0) { + if(bestOver >= diff * (double)0.67f) { + return 6; + } else { + return 2; + } + } else { + if(bestOver >= diff * (double)0.67f) { + return 1; + } else { + return 0; + } + } + } + } else { //--local + if(XS < scMin) { + if(bestOver >= diff * (double) 0.8f) return 44; + else if(bestOver >= diff * (double) 0.7f) return 42; + else if(bestOver >= diff * (double) 0.6f) return 41; + else if(bestOver >= diff * (double) 0.5f) return 36; + else if(bestOver >= diff * (double) 0.4f) return 28; + else if(bestOver >= diff * (double) 0.3f) return 24; + else return 22; + } else { + if(bestdiff >= diff * (double) 0.9f) return 40; + else if(bestdiff >= diff * (double) 0.8f) return 39; + else if(bestdiff >= diff * (double) 0.7f) return 38; + else if(bestdiff >= diff * (double) 0.6f) return 37; + else if(bestdiff >= diff * (double) 0.5f) { + if (bestOver == diff) return 35; + else if(bestOver >= diff * (double) 0.5f) return 25; + else return 20; + } else if(bestdiff >= diff * (double) 0.4f) { + if (bestOver == diff) return 34; + else if(bestOver >= diff * (double) 0.5f) return 21; + else return 19; + } else if(bestdiff >= diff * (double) 0.3f) { + if (bestOver == diff) return 33; + else if(bestOver >= diff * (double) 0.5f) return 18; + else return 16; + } else if(bestdiff >= diff * (double) 0.2f) { + if (bestOver == diff) return 32; + else if(bestOver >= diff * (double) 0.5f) return 17; + else return 12; + } else if(bestdiff >= diff * (double) 0.1f) { + if (bestOver == diff) return 31; + else if(bestOver >= diff * (double) 0.5f) return 14; + else return 9; + } else if(bestdiff > 0) { + if(bestOver >= diff * (double) 0.5f) return 11; + else return 2; + } else { + if(bestOver >= diff * (double) 0.5f) return 1; + else return 0; + } + } + } +} + 
+/****************************************************************************** +* +* Determine whether the alignment is actually unique by comparing the AS and +* XS auxiliary tags. +* +* bam1_t *read: The read to look at +* +*******************************************************************************/ +int unique_alignment(bam1_t *read) { + int AS, XS; + + AS = bam_aux2i(bam_aux_get(read, "AS")); + if(bam_aux_get(read, "XS") == 0) return 1; + XS = bam_aux2i(bam_aux_get(read, "XS")); + if(AS > XS) return 1; + return 0; +} + +/****************************************************************************** +* +* Replace the stored sequence in a read. +* +* bam1_t *read: The read whose sequence will be replaced +* char *seq: Sequence to coopy into read. +* +* If read is reverse complemented, the same will be done to seq. +* +*******************************************************************************/ +void swap_sequence(bam1_t *read, char *seq) { + uint8_t *sequence = bam1_seq(read), val; + char *seq2 = strdup(seq); + int i, j; + + //Do we need to reverse complement? 
+ if(read->core.flag & BAM_FREVERSE) reverse_complement(seq2); + for(i=0, j=0; icore.l_qseq, sizeof(char)); + char *XM = calloc(1+read->core.l_qseq, sizeof(char)); + char genome_base, read_base, *bases; + int i; + uint8_t b; + + //Extract the read sequence + for(i=0; icore.l_qseq; i++) { + b = bam1_seqi(bam1_seq(read), i); + if(b == 1) { + *(read_seq+i) = 'A'; + } else if(b == 2) { + *(read_seq+i) = 'C'; + } else if(b == 4) { + *(read_seq+i) = 'G'; + } else if(b == 8) { + *(read_seq+i) = 'T'; + } else if(b == 15) { + *(read_seq+i) = 'N'; + } + current_position = *(genomic_position+i); + } + + for(i=0; icore.l_qseq; i++) { + current_position = *(genomic_position+i); + if(current_position == ULLONG_MAX) { + *(XM+i) = '.'; + continue; + } + genome_base = toupper(*(chromosomes.genome+offset+current_position)); + read_base = toupper(*(read_seq+i)); + if(read_base != genome_base) { + //Mismatches to the top and bottom strands are treated differently + if(*XG == 'C') { //OT or CTOT + if(genome_base == 'C' && read_base == 'T') { + bases = get_genomic_context(offset, current_position, 2, chrom_end); + if(*(bases+1) == 'G') { + //Unmethylated CpG + *(XM+i) = 'z'; + } else if(*(bases+2) == 'G') { + //Unmethylated CHG + *(XM+i) = 'x'; + } else { + //Unmethylated CHH + *(XM+i) = 'h'; + } + free(bases); + } else { + //Just a mismatch + *(XM+i) = '.'; + } + } else { //OB or CTOB + if(genome_base == 'G' && read_base == 'A') { + bases = get_genomic_context(offset, current_position, -2, chrom_end); + if(*(bases+1) == 'C') { + //Unmethylated CpG + *(XM+i) = 'z'; + } else if(*(bases+0) == 'C') { + //Unmethylated CHG + *(XM+i) = 'x'; + } else { + //Unmethylated CHH + *(XM+i) = 'h'; + } + free(bases); + } else { + *(XM+i) = '.'; + } + } + } else { + if(*XG == 'C') { //OT or CTOT + if(genome_base == 'C') { + bases = get_genomic_context(offset, current_position, 2, chrom_end); + if(*(bases+1) == 'G') { + //Methylated CpG + *(XM+i) = 'Z'; + } else if(*(bases+2) == 'G') { + //Methylated CHG 
+ *(XM+i) = 'X'; + } else { + //Methylated CHH + *(XM+i) = 'H'; + } + free(bases); + } else { + *(XM+i) = '.'; + } + } else { //OB or CTOB + if(genome_base == 'G') { + bases = get_genomic_context(offset, current_position, -2, chrom_end); + if(*(bases+1) == 'C') { + //Methylated CpG + *(XM+i) = 'Z'; + } else if(*(bases+0) == 'C') { + //Methylated CHG + *(XM+i) = 'X'; + } else { + //Methylated CHH + *(XM+i) = 'H'; + } + free(bases); + } else { + *(XM+i) = '.'; + } + } + } + } + + free(read_seq); + free(genomic_position); + return XM; +} + +/****************************************************************************** +* +* As with callXM, but return the mismatches with the reference. +* +* bam_t *read; the read in question +* char *XM: output from callXM +* char *XG: The XG tag, indicating which coversion to pay attention to. +* +* THE OUTPUT MUST BE fre()d +* The length of XX is currently limited to MAXREAD!!! +* +*******************************************************************************/ +char *callXX(bam1_t *read, char *XM, char *XG) { + char *chrom = lookup_chrom(read); + unsigned long long offset = genome_offset(chrom, 0), current_position; + unsigned long long *genomic_position = calculate_positions(read); + uint8_t base, NM = 0; + + char *read_seq = calloc(1+read->core.l_qseq, sizeof(char)); + char *XX = calloc(MAXREAD, sizeof(char)); + int i, good = 0; + + //Extract the read sequence + for(i=0; icore.l_qseq; i++) { + base = bam1_seqi(bam1_seq(read), i); + if(base == 1) *(read_seq+i) = 'A'; + else if(base == 2) *(read_seq+i) = 'C'; + else if(base == 4) *(read_seq+i) = 'G'; + else if(base == 8) *(read_seq+i) = 'T'; + else *(read_seq+i) = 'N'; + current_position = *(genomic_position+i); + } + + //Create the XM string + for(i=0; i AS2) { + sprintf(XR, "CT"); + sprintf(XG, "CT"); + if(!(read1->core.flag & BAM_FUNMAP)) { + tmp_read = read1; + best_node = 1; + } + } else if(AS2 > AS1) { + sprintf(XR, "CT"); + sprintf(XG, "GA"); + if(!(read2->core.flag & 
BAM_FUNMAP)) { + tmp_read = read2; + best_node = 2; + } + } + } else { + if(AS1 > AS2 && AS1 > AS3 && AS1 > AS4) { //OT + sprintf(XR, "CT"); + sprintf(XG, "CT"); + if(!(read1->core.flag & BAM_FUNMAP)) { + tmp_read = read1; + best_node = 1; + } + } else if(AS2 > AS1 && AS2 > AS3 && AS2 > AS4) { //OB + sprintf(XR, "CT"); + sprintf(XG, "GA"); + if(!(read2->core.flag & BAM_FUNMAP)) { + tmp_read = read2; + best_node = 2; + } + } else if(AS3 > AS1 && AS3 > AS2 && AS3 > AS4) { //CTOT + sprintf(XR, "GA"); + sprintf(XG, "CT"); + if(!(read3->core.flag & BAM_FUNMAP)) { + tmp_read = read3; + best_node = 3; + } + } else if(AS4 > AS1 && AS4 > AS2 && AS4 > AS3) { //CTOB + sprintf(XR, "GA"); + sprintf(XG, "GA"); + if(!(read4->core.flag & BAM_FUNMAP)) { + tmp_read = read4; + best_node = 4; + } + } + } + + //If there is no best score (tmp_read == NULL), mark read1 as unmapped + if(tmp_read == NULL) { + swap_sequence(read1, seq); + read1->core.flag = read1->core.flag | 0x4; + best_node = 1; + } else { + swap_sequence(tmp_read, seq); + XM = callXM(tmp_read, XG); + XX = callXX(tmp_read, XM, XG); + //append the tags + kputs(XX, kXX); + kputs(XM, kXM); + + bam_aux_del(tmp_read, bam_aux_get(tmp_read, "XM")); + bam_aux_del(tmp_read, bam_aux_get(tmp_read, "XG")); + bam_aux_append(tmp_read, "XX", 'Z', kXX->l + 1, (uint8_t*) kXX->s); + bam_aux_append(tmp_read, "XM", 'Z', kXM->l + 1, (uint8_t*) kXM->s); + bam_aux_append(tmp_read, "XR", 'Z', 3, (uint8_t*) XR); + bam_aux_append(tmp_read, "XG", 'Z', 3, (uint8_t*) XG); + free(kXX->s); + free(kXM->s); + free(XM); + free(XX); + + //Recalculate MAPQ and replace the XS score + scMin = scoreMin(tmp_read->core.l_qseq); + XS = get_XS(tmp_read); + if(best_node == 1) { + AS = AS1; + if(AS2 > XS) XS = AS2; + if(!config.directional) { + if(AS3 > XS) XS = AS3; + if(AS4 > XS) XS = AS4; + } + } + if(best_node == 2) { + AS = AS2; + if(AS1 > XS) XS = get_AS(read2); + if(!config.directional) { + if(AS3 > XS) XS = AS3; + if(AS4 > XS) XS = AS4; + } + } + 
if(best_node == 3) { + AS = AS3; + if(AS1 > XS) XS = AS1; + if(AS2 > XS) XS = AS2; + if(AS4 > XS) XS = AS4; + } + if(best_node == 4) { + AS = AS4; + if(AS1 > XS) XS = AS1; + if(AS2 > XS) XS = AS2; + if(AS3 > XS) XS = AS3; + } + MAPQ = calc_MAPQ_BT2(AS, XS, scMin); + MAPQ = (MAPQ < tmp_read->core.qual) ? MAPQ : tmp_read->core.qual; + tmp_read->core.qual = MAPQ; + if(XS >= scMin) { + //replace/add the XS tag + if(bam_aux_get(tmp_read, "XS")) bam_aux_del(tmp_read, bam_aux_get(tmp_read, "XS")); + bam_aux_append(tmp_read, "XS", 'i', 4, (uint8_t*) &XS); + } + } + free(kXX); + free(kXM); + return best_node; +} + +/****************************************************************************** +* +* Like process_single, but for paired_end reads. The bam1_t**s hold the +* buffered reads. i denotes the read#1 of interest (read #2 is the next read) +* +*******************************************************************************/ +int process_paired(bam1_t **read1, bam1_t **read2, bam1_t **read3, bam1_t **read4, char **seq) { + int AS1=0, AS2=0, AS3=0, AS4=0; + bam1_t *tmp_read1 = NULL, *tmp_read2 = NULL; + char *XM1, *XM2, *XX1, *XX2, XG[] = "CT", XR1[] = "CT", XR2[] = "CT"; + kstring_t *kXM1 = (kstring_t *) calloc(1, sizeof(kstring_t)); + kstring_t *kXM2 = (kstring_t *) calloc(1, sizeof(kstring_t)); + kstring_t *kXX1 = (kstring_t *) calloc(1, sizeof(kstring_t)); + kstring_t *kXX2 = (kstring_t *) calloc(1, sizeof(kstring_t)); + int best_node = 0; + //For MAPQ/XS replacement + int MAPQ, XS1, XS2, scMin1, scMin2; + + //Determine the read with the highest alignment score + AS1 = get_AS(*(read1)) + get_AS(*(read1+1)); + AS2 = get_AS(*(read2)) + get_AS(*(read2+1)); + if(!config.directional) { + AS3 = get_AS(*(read3)) + get_AS(*(read3+1)); + AS4 = get_AS(*(read4)) + get_AS(*(read4+1)); + } + if(config.directional) { + if(AS1 > AS2) { //OT + sprintf(XR1, "CT"); + sprintf(XR2, "GA"); + sprintf(XG, "CT"); + if(!((*(read1))->core.flag & BAM_FUNMAP)) { + tmp_read1 = *(read1); + 
tmp_read2 = *(read1+1); + best_node = 1; + } + } else if(AS2 > AS1) { //OB + sprintf(XR1, "CT"); + sprintf(XR2, "GA"); + sprintf(XG, "GA"); + if(!((*(read2))->core.flag & BAM_FUNMAP)) { + tmp_read1 = *(read2); + tmp_read2 = *(read2+1); + best_node = 2; + } + } + } else { + if(AS1 > AS2 && AS1 > AS3 && AS1 > AS4) { //OT + sprintf(XR1, "CT"); + sprintf(XR2, "GA"); + sprintf(XG, "CT"); + if(!((*(read1))->core.flag & BAM_FUNMAP)) { + tmp_read1 = *(read1); + tmp_read2 = *(read1+1); + best_node = 1; + } + } else if(AS2 > AS1 && AS2 > AS3 && AS2 > AS4) { //OB + sprintf(XR1, "CT"); + sprintf(XR2, "GA"); + sprintf(XG, "GA"); + if(!((*(read2))->core.flag & BAM_FUNMAP)) { + tmp_read1 = *(read2); + tmp_read2 = *(read2+1); + best_node = 2; + } + } else if(AS3 > AS1 && AS3 > AS2 && AS3 > AS4) { //CTOT + sprintf(XR1, "GA"); + sprintf(XR2, "CT"); + sprintf(XG, "CT"); + if(!((*(read3))->core.flag & BAM_FUNMAP)) { + tmp_read1 = *(read3); + tmp_read2 = *(read3+1); + best_node = 3; + } + } else if(AS4 > AS1 && AS4 > AS2 && AS4 > AS3) { //CTOB + sprintf(XR1, "GA"); + sprintf(XR2, "CT"); + sprintf(XG, "GA"); + if(!((*(read4))->core.flag & BAM_FUNMAP)) { + tmp_read1 = *(read4); + tmp_read2 = *(read4+1); + best_node = 4; + } + } + } + + //If there is no best score (tmp_read == NULL), mark read1 as unmapped + if(tmp_read1 == NULL) { + swap_sequence(*(read1), *(seq)); + swap_sequence(*(read1+1), *(seq+1)); + (*(read1))->core.flag = (*(read1))->core.flag | 0x4; + (*(read1+1))->core.flag = (*(read1+1))->core.flag | 0x4; + best_node = 1; + } else { + swap_sequence(tmp_read1, *(seq)); + swap_sequence(tmp_read2, *(seq+1)); + XM1 = callXM(tmp_read1, XG); + XX1 = callXX(tmp_read1, XM1, XG); + XM2 = callXM(tmp_read2, XG); + XX2 = callXX(tmp_read2, XM2, XG); + + kputs(XX1, kXX1); + kputs(XX2, kXX2); + kputs(XM1, kXM1); + kputs(XM2, kXM2); + + bam_aux_del(tmp_read1, bam_aux_get(tmp_read1, "XM")); + bam_aux_del(tmp_read2, bam_aux_get(tmp_read2, "XM")); + bam_aux_del(tmp_read1, bam_aux_get(tmp_read1, 
"XG")); + bam_aux_del(tmp_read2, bam_aux_get(tmp_read2, "XG")); + + bam_aux_append(tmp_read1, "XX", 'Z', kXX1->l + 1, (uint8_t*) kXX1->s); + bam_aux_append(tmp_read2, "XX", 'Z', kXX2->l + 1, (uint8_t*) kXX2->s); + + bam_aux_append(tmp_read1, "XM", 'Z', kXM1->l + 1, (uint8_t*) kXM1->s); + bam_aux_append(tmp_read2, "XM", 'Z', kXM2->l + 1, (uint8_t*) kXM2->s); + + bam_aux_append(tmp_read1, "XR", 'Z', 3, (uint8_t*) XR1); + bam_aux_append(tmp_read2, "XR", 'Z', 3, (uint8_t*) XR2); + + bam_aux_append(tmp_read1, "XG", 'Z', 3, (uint8_t*) XG); + bam_aux_append(tmp_read2, "XG", 'Z', 3, (uint8_t*) XG); + free(kXX1->s); + free(kXX2->s); + free(kXM1->s); + free(kXM2->s); + free(XM1); + free(XM2); + free(XX1); + free(XX2); + + //Recalculate MAPQ and replace the XS score + scMin1 = scoreMin(tmp_read1->core.l_qseq); + scMin2 = scoreMin(tmp_read2->core.l_qseq); + XS1 = get_XS(tmp_read1); + XS2 = get_XS(tmp_read2); + if(XS2 < scMin2) { + if(XS1 >= scMin1) XS2 = get_AS(tmp_read2); + } else if(XS1 < scMin1) { + if(XS2 >= scMin2) XS1 = get_AS(tmp_read1); + } + if(best_node == 1) { + if(AS2 > XS1+XS2) { + XS1 = get_AS(*(read2)); + XS2 = get_AS(*(read2+1)); + } + if(!config.directional) { + if(AS3 > XS1+XS2) { + XS1 = get_AS(*(read3)); + XS2 = get_AS(*(read3+1)); + } + if(AS4 > XS1+XS2) { + XS1 = get_AS(*(read4)); + XS2 = get_AS(*(read4+1)); + } + } + } + if(best_node == 2) { + if(AS1 > XS1+XS2) { + XS1 = get_AS(*(read1)); + XS2 = get_AS(*(read1+1)); + } + if(!config.directional) { + if(AS3 > XS1+XS2) { + XS1 = get_AS(*(read3)); + XS2 = get_AS(*(read3+1)); + } + if(AS4 > XS1+XS2) { + XS1 = get_AS(*(read4)); + XS2 = get_AS(*(read4+1)); + } + } + } + if(best_node == 3) { + if(AS1 > XS1+XS2) { + XS1 = get_AS(*(read1)); + XS2 = get_AS(*(read1+1)); + } + if(AS2 > XS1+XS2) { + XS1 = get_AS(*(read2)); + XS2 = get_AS(*(read2+1)); + } + if(AS4 > XS1+XS2) { + XS1 = get_AS(*(read4)); + XS2 = get_AS(*(read4+1)); + } + } + if(best_node == 4) { + if(AS1 > XS1+XS2) { + XS1 = get_AS(*(read1)); + XS2 = 
get_AS(*(read1+1)); + } + if(AS2 > XS1+XS2) { + XS1 = get_AS(*(read2)); + XS2 = get_AS(*(read2+1)); + } + if(AS3 > XS1+XS2) { + XS1 = get_AS(*(read3)); + XS2 = get_AS(*(read3+1)); + } + } + MAPQ = calc_MAPQ_BT2(get_AS(tmp_read1)+get_AS(tmp_read2), XS1+XS2, scMin1+scMin2); + MAPQ = (MAPQ < tmp_read1->core.qual) ? MAPQ : tmp_read1->core.qual; //Otherwise, a mapping can get worse but a score better! + tmp_read1->core.qual = MAPQ; + tmp_read2->core.qual = MAPQ; + //replace/add the XS tag + if(XS1 >= scMin1) { + if(bam_aux_get(tmp_read1, "XS")) bam_aux_del(tmp_read1, bam_aux_get(tmp_read1, "XS")); + bam_aux_append(tmp_read1, "XS", 'i', 4, (uint8_t*) &XS1); + } + if(XS2 >= scMin2) { + if(bam_aux_get(tmp_read2, "XS")) bam_aux_del(tmp_read2, bam_aux_get(tmp_read2, "XS")); + bam_aux_append(tmp_read2, "XS", 'i', 4, (uint8_t*) &XS2); + } + } + free(kXX1); + free(kXX2); + free(kXM1); + free(kXM2); + return best_node; +} + +/******************************************************************************* +* +* Update a packed read so that it's a proper bam1_t and return a pointer +* +* struct packed_struct *first: first sentinel node +* int offset: Return the read from the first (0) or second (1) element +* +* returns a pointer to a bam1_t read +* +*******************************************************************************/ +bam1_t * update_read(struct packed_struct *first, int offset) { + bam1_t *pbam1_t; + uint8_t *data; + bam1_t *new_copy = bam_init1(); + + if(offset == 0) { + pbam1_t = (bam1_t *) first->next->packed; + } else { + pbam1_t = (bam1_t *) first->next->next->packed; + } + data = (uint8_t *) (pbam1_t+1); + pbam1_t->data = data; + bam_copy1(new_copy, pbam1_t); + free(pbam1_t); + if(offset == 0) { + first->next->packed = (void *) new_copy; + } else { + first->next->next->packed = (void *) new_copy; + } + return new_copy; +} + +/******************************************************************************* +* +* The master node function. 
+* +* void *a: Actually a int*, the thread_id +* +*******************************************************************************/ +void * master_processer_thread(void *a) { + int thread_id = 0, best_node, i; + int times = (config.paired) ? 2 : 1; + char **seq = malloc(sizeof(char *) * 2); + *(seq) = malloc(sizeof(char)*MAXREAD); + *(seq+1) = malloc(sizeof(char)*MAXREAD); + bam1_t **node1_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **node2_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **node3_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **node4_read = malloc(sizeof(bam1_t*) * 2); + bam1_t **best_read = NULL; + time_t now; + + //Metrics + metrics_struct *metrics = malloc(sizeof(metrics_struct)); + metrics->t_reads = 0; + metrics->m_reads_OT = 0; + metrics->m_reads_OB = 0; + metrics->m_reads_CTOT = 0; + metrics->m_reads_CTOB = 0; + metrics->t_CpG = 0; + metrics->m_CpG = 0; + metrics->t_CHG = 0; + metrics->m_CHG = 0; + metrics->t_CHH = 0; + metrics->m_CHH = 0; + + //Process read i/o + while(1) { + while(!is_ready(node1, 0)); + if(is_finished(node1)) break; + *(node1_read) = update_read(node1, 0); + if(config.paired) { + while(!is_ready(node1, 1)); + *(node1_read+1) = update_read(node1, 1); + } + while(!is_ready(node2, 0)); + *(node2_read) = update_read(node2, 0); + if(config.paired) { + while(!is_ready(node2, 1)); + *(node2_read+1) = update_read(node2, 1); + } + if(!config.directional) { + while(!is_ready(node3, 0)); + *node3_read = update_read(node3, 0); + if(config.paired) { + while(!is_ready(node3, 1)); + *(node3_read+1) = update_read(node3, 1); + } + while(!is_ready(node4, 0)); + *node4_read = update_read(node4, 0); + if(config.paired) { + while(!is_ready(node4, 1)); + *(node4_read+1) = update_read(node4, 1); + } + } + metrics->t_reads++; + + //Give some output, it's a bit misleading as the count is actually only for this thread and it'll only display for thread 0. 
+ if(!config.quiet) { + if(thread_id == 0) { + if((metrics->t_reads) % 100000 == 0) { + now = time(NULL); + printf("%llu reads %s", metrics->t_reads, ctime(&now)); fflush(stdout); + } + } + } + + get_seq(*seq, zip1); + if(config.paired) get_seq(*(seq+1), zip2); + + //Process the reads + if(!config.paired) { + best_node = process_single(*node1_read, *node2_read, *node3_read, *node4_read, *seq); //Output is stored in read1 + } else { + best_node = process_paired(node1_read, node2_read, node3_read, node4_read, seq); //Output is stored in read + } + if(best_node == 1) { + best_read = node1_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_OT++; + } else if(best_node == 2) { + best_read = node2_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_OB++; + } else if(best_node == 3) { + best_read = node3_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_CTOT++; + } else if(best_node == 4) { + best_read = node4_read; + if(!((*best_read)->core.flag & BAM_FUNMAP)) metrics->m_reads_CTOB++; + } + + //Store the reads + if(!config.paired) { + if(!((*(best_read))->core.flag & BAM_FUNMAP)) { + bam_write1(OUTPUT_BAM, *(best_read)); + update_counts(*(best_read), metrics); + } else { + if(config.unmapped) write_unmapped(unmapped1, *(best_read)); + } + } else { + if(!((*(best_read))->core.flag & BAM_FUNMAP) && !((*(best_read+1))->core.flag & BAM_FUNMAP)) { + bam_write1(OUTPUT_BAM, *(best_read)); + update_counts(*(best_read), metrics); + bam_write1(OUTPUT_BAM, *(best_read+1)); + update_counts(*(best_read+1), metrics); + } else { + if(config.unmapped) { + write_unmapped(unmapped1, *(best_read)); + write_unmapped(unmapped2, *(best_read+1)); + } + } + } + + //Remove the processed reads + for(i=0; it_reads; + m_reads_OT += metrics->m_reads_OT; + m_reads_OB += metrics->m_reads_OB; + m_reads_CTOT += metrics->m_reads_CTOT; + m_reads_CTOB += metrics->m_reads_CTOB; + t_CpG += metrics->t_CpG; + m_CpG += metrics->m_CpG; + t_CHG += 
metrics->t_CHG; + m_CHG += metrics->m_CHG; + t_CHH += metrics->t_CHH; + m_CHH += metrics->m_CHH; + + //Clean up + free(*(seq)); free(*(seq+1)); free(seq); + free(metrics); + bam_header_destroy(global_header); + free(node1_read); + free(node2_read); + free(node3_read); + free(node4_read); + destroy_list(node1); + destroy_list(node2); + destroy_list(node3); + destroy_list(node4); + return NULL; +} diff --git a/mbias.c b/mbias.c new file mode 100644 index 0000000..e08e9a0 --- /dev/null +++ b/mbias.c @@ -0,0 +1,224 @@ +#include "bison.h" +#include + +unsigned long long *r1_m[4]; +unsigned long long *r1_um[4]; +unsigned long long *r2_m[4]; +unsigned long long *r2_um[4]; +int min_phred = 10; + +void store_calls(unsigned long long *m, unsigned long long *um, bam1_t *read, int reversed) { + char *meth = bam_aux2Z(bam_aux_get(read, "XM")); + uint8_t *qual = bam1_qual(read); + int i; + + if(!reversed) { + for(i=0; i=0; i--) { + if(*(qual+i) < min_phred) continue; + if(*(meth+i) == 'Z') *(m+i) += 1; + if(*(meth+i) == 'z') *(um+i) += 1; + } + } +} + +void usage(char *prog) { + printf("Usage: %s [OPTIONS] file.bam\n", prog); + printf("\n\ + Compute the methylation percentage as a function of read position for a BAM\n\ + file. The output can be conveniently plotted with the accompanying\n\ + plot_mbias.R script\n\ +\n\ + -phred Minimum Phred score that a base must have for inclusion in the\n\ + metrics (default 10).\n\ +\n\ + -q Read MAPQ value must at least this for inclusion (default 20).\n\ + Specify 0 to include everything.\n\ +\n\ + -pdf Run the R script to convert the output to pdf format, including\n\ + recommended inclusion bounds. 
R must be installed and in your PATH.\n"); +} + +int main(int argc, char *argv[]) { + bamFile ifile = NULL; + FILE *ofile = NULL; + char *prefix = NULL; + char *p, *XR, *XG; + bam1_t *read = bam_init1(); + bam_header_t *header = NULL; + int max_length = 50; + int paired = 0, reversed = 0, hasComp = 0; + int i, j, min_mapq = 20, pdf = 0; + unsigned long long treads = 0; + + if(argc < 2) { + usage(argv[0]); + return 1; + } + for(i=1; i 1) { + if(++treads % 10000000 == 0) printf("Processed %llu reads\n", treads); + if(read->core.qual < min_mapq) continue; + if(read->core.flag & BAM_FUNMAP) continue; + + //Lengthen the output arrays if needed + if(read->core.l_qseq > max_length) { + for(i=0; i<4; i++) { + r1_m[i] = realloc(r1_m[i], read->core.l_qseq * sizeof(unsigned long long)); + r1_um[i] = realloc(r1_um[i], read->core.l_qseq * sizeof(unsigned long long)); + r2_m[i] = realloc(r2_m[i], read->core.l_qseq * sizeof(unsigned long long)); + r2_um[i] = realloc(r2_um[i], read->core.l_qseq * sizeof(unsigned long long)); + for(j=max_length; jcore.l_qseq; j++) { + *(r1_m[i]+j) = 0; + *(r1_um[i]+j) = 0; + *(r2_m[i]+j) = 0; + *(r2_um[i]+j) = 0; + } + } + max_length = read->core.l_qseq; + } + reversed = (read->core.flag & BAM_FREVERSE) ? 
1 : 0; + + if(bam_aux_get(read, "XR") == NULL || bam_aux_get(read, "XG") == NULL) printf("%s\n", bam_format1(header, read)); + XR = bam_aux2Z(bam_aux_get(read, "XR")); + XG = bam_aux2Z(bam_aux_get(read, "XG")); + if(!(read->core.flag & BAM_FREAD2)) { + if(strcmp(XG, "CT") == 0) { //OT or CTOT + if(strcmp(XR, "CT") == 0) { //OT + store_calls(r1_m[0], r1_um[0], read, reversed); + } else { //CTOT + hasComp = 1; + store_calls(r1_m[1], r1_um[1], read, reversed); + } + } else { + if(strcmp(XR, "CT") == 0) { //OB + store_calls(r1_m[2], r1_um[2], read, reversed); + } else { //CTOB + hasComp = 1; + store_calls(r1_m[3], r1_um[3], read, reversed); + } + } + } else { + paired = 1; + if(strcmp(XG, "CT") == 0) { //OT or CTOT + if(strcmp(XR, "GA") == 0) { //OT + store_calls(r2_m[0], r2_um[0], read, reversed); + } else { //CTOT + hasComp = 1; + store_calls(r2_m[1], r2_um[1], read, reversed); + } + } else { + if(strcmp(XR, "GA") == 0) { //OB + store_calls(r2_m[2], r2_um[2], read, reversed); + } else { //CTOB + hasComp = 1; + store_calls(r2_m[3], r2_um[3], read, reversed); + } + } + } + } + + //Output the calls + fprintf(ofile, "Strand\tRead\tPosition\tnMethylated\tnUnmethylated\n"); + for(i=0; i 0 || r1_um[0][i] > 0) fprintf(ofile, "OT\t1\t%i\t%llu\t%llu\n", i+1, r1_m[0][i], r1_um[0][i]); + if(paired) { + if(r2_m[0][i] > 0 || r2_um[0][i] > 0) fprintf(ofile, "OT\t2\t%i\t%llu\t%llu\n", i+1, r2_m[0][i], r2_um[0][i]); + } + } + for(i=0; i 0 || r1_um[2][i] > 0) fprintf(ofile, "OB\t1\t%i\t%llu\t%llu\n", i+1, r1_m[2][i], r1_um[2][i]); + if(paired) { + if(r2_m[2][i] > 0 || r2_um[2][i] > 0) fprintf(ofile, "OB\t2\t%i\t%llu\t%llu\n", i+1, r2_m[2][i], r2_um[2][i]); + } + } + if(hasComp) { + for(i=0; i 0 || r1_um[1][i] > 0) fprintf(ofile, "CTOT\t1\t%i\t%llu\t%llu\n", i+1, r1_m[1][i], r1_um[1][i]); + if(paired) { + if(r2_m[1][i] > 0 || r2_um[1][i] > 0) fprintf(ofile, "CTOT\t2\t%i\t%llu\t%llu\n", i+1, r2_m[1][i], r2_um[1][i]); + } + } + for(i=0; i 0 || r1_um[3][i] > 0) fprintf(ofile, 
"CTOB\t1\t%i\t%llu\t%llu\n", i+1, r1_m[3][i], r1_um[3][i]); + if(paired) { + if(r2_m[3][i] > 0 || r2_um[3][i] > 0) fprintf(ofile, "CTOB\t2\t%i\t%llu\t%llu\n", i+1, r2_m[3][i], r2_um[3][i]); + } + } + } + + printf("Processed %llu reads\n", treads); + fclose(ofile); + + if(pdf) { + char *cmd = malloc(sizeof(char) * (strlen("bison_mbias2pdf ") + strlen(prefix) + 1)); + sprintf(cmd, "bison_mbias2pdf %s", prefix); + printf("Executing %s\n", cmd); + if(system(cmd) == -1) printf("N.B. an error occured while running bison_mbias2pdf!\n"); + free(cmd); + } else { + printf("The output may be converted to PDF with recommended inclusion bounds by running bison_mbias2pdf %s\n", prefix); + } + + //Cleanup + free(prefix); + for(i=0; i<4; i++) { + free(r1_m[i]); + free(r1_um[i]); + free(r2_m[i]); + free(r2_um[i]); + } + bam_header_destroy(header); + bam_destroy1(read); + bam_close(ifile); + return(0); +} diff --git a/methylation_extractor.c b/methylation_extractor.c new file mode 100644 index 0000000..f2b953e --- /dev/null +++ b/methylation_extractor.c @@ -0,0 +1,1001 @@ +#include "bison.h" +#include "sam.h" + +//Eh, this is simple enough for a small program +int storeCpG, storeCHG, storeCHH, min_Phred; + +//Inclusion bounds +int OT[4], OB[4], CTOT[4], CTOB[4]; + +//This will hold the output file handles (some of which can be NULL) +struct of_struct { + FILE *CpG; + FILE *CHG; + FILE *CHH; +}; + +//This stores an individual methylation call +typedef struct { + int32_t tid; + int32_t start; + _Bool strand; //+/- == 1/0 + unsigned int type; //This would normally be 0 (unmethylated) or 1 (methylated) +} Site; + +//This struct hold an array of methylation calls that will need to be sorted +typedef struct { + Site *CpG; + Site *CHG; + Site *CHH; + int num_CpG; + int max_CpG; + int num_CHG; + int max_CHG; + int num_CHH; + int max_CHH; + int only_CpG; + int only_CHG; + int only_CHH; +} Sites; + +struct list_struct { + int32_t tid; + int32_t pos; //negative positions are - strand, 
otherwise, + strand + unsigned int n_methylated; + unsigned int n_unmethylated; + struct list_struct *next; +}; + +//Linked lists holding the final methylation calls +struct list_struct *CpGlist, *CHGlist, *CHHlist; + +//Initialize a linked list +struct list_struct* init_list() { + struct list_struct *output = calloc(1, sizeof(struct list_struct)); + struct list_struct *next = calloc(1, sizeof(struct list_struct)); + output->next = next; + output->tid = -1; + output->pos = -1; + + next->next = NULL;; + next->tid = INT_MAX; + next->pos = INT_MAX; + + return output; +} + +//Destroy the linked list +void destroy_methyl_list(struct list_struct *list) { + struct list_struct *next = list->next; + struct list_struct *current = list; + + while(next != NULL) { + next = current->next; + free(current); + current = next; + } +} + +//Insert a new methylation call into the linked list +struct list_struct* insert_call(struct list_struct *current, Site *site) { + struct list_struct *next = current->next; + struct list_struct *new = malloc(sizeof(struct list_struct)); + + new->next = next; + new->tid = site->tid; + new->pos = (site->strand) ? 
site->start : -1 * (site->start); + if(site->type) { + new->n_methylated = 1; + new->n_unmethylated = 0; + } else { + new->n_methylated = 0; + new->n_unmethylated = 1; + } + current->next = new; + return new; +} + +/******************************************************************************* +* +* Initialize a Sites structure +* +*******************************************************************************/ +Sites* init_sites() { + Sites *output = malloc(sizeof(Sites)); + output->CpG = malloc(sizeof(Site)*1000000); + output->CHG = malloc(sizeof(Site)*1000000); + output->CHH = malloc(sizeof(Site)*1000000); + output->num_CpG = 0; + output->max_CpG = 1000000; + output->num_CHG = 0; + output->max_CHG = 1000000; + output->num_CHH = 0; + output->max_CHH = 1000000; + output->only_CpG = 0; + output->only_CHG = 0; + output->only_CHH = 0; + return output; +} + +/******************************************************************************* +* +* Free space used by a Sites structure +* +*******************************************************************************/ +void destroy_sites(Sites *p) { + free(p->CpG); + free(p->CHG); + free(p->CHH); + free(p); +} + +/******************************************************************************* +* +* Site sorting comparison function used by qsort in sort calls +* +*******************************************************************************/ +int site_comparison(const void *p1, const void *p2) { + Site *site1 = (Site *) p1; + Site *site2 = (Site *) p2; + int output = 0; + + if(site1->tid == site2->tid) { + if(site1->start == site2->start) { + output = 0; + } else { + output = site1->start - site2->start; + } + } else { + output = strcmp(global_header->target_name[site1->tid],global_header->target_name[site2->tid]); + } + return output; +} + +/******************************************************************************* +* +* Sort methylation sites according to chromosome and start position +* 
+*******************************************************************************/ +void sort_sites(Sites *sites, int which) { + if(which == 1) qsort((void *) sites->CpG, (size_t) sites->num_CpG, sizeof(Site), site_comparison); + else if(which == 2) qsort((void *) sites->CHG, (size_t) sites->num_CHG, sizeof(Site), site_comparison); + else if(which == 3) qsort((void *) sites->CHH, (size_t) sites->num_CHH, sizeof(Site), site_comparison); +} + +void merge_calls(Sites *sites, int which) { + Site *type; + int nsites=0, i=0; + struct list_struct *olist=NULL, *current=NULL; + + if(which == 1) { + type = sites->CpG; + olist = CpGlist; + nsites = sites->num_CpG; + } else if(which == 2) { + type = sites->CHG; + olist = CHGlist; + nsites = sites->num_CHG; + } else if(which == 3) { + type = sites->CHH; + olist = CHHlist; + nsites = sites->num_CHH; + } + + //Take care of the first call + current = olist; + while(itid == type[i].tid) { + if(abs(current->pos) == abs(type[i].start)) { + if(type[i].type == 1) current->n_methylated++; + else current->n_unmethylated++; + i++; + } else if(abs(current->next->pos) > type[i].start) { + current = insert_call(current, type+i); + i++; + } else { + if(current->next->tid == type[i].tid) { + current = current->next; + } else { + current = insert_call(current, type+i); + i++; + } + } + } else { + if(current->next->tid == INT_MAX) { + current = insert_call(current, type+i); + i++; + } else if(current->next->tid == type[i].tid) { //Changing chromosomes + if(abs(current->next->pos) > type[i].start) { + current = insert_call(current, type+i); + i++; + } else { + current = current->next; + } + } else if(strcmp(global_header->target_name[type[i].tid], global_header->target_name[current->next->tid]) < 0) { + current = insert_call(current, type+i); + i++; + } else { + current = current->next; + } + } + } + + //Reset the appropriate counter + if(which == 1) sites->num_CpG = 0; + else if(which == 2) sites->num_CHG = 0; + else if(which == 3) sites->num_CHH 
= 0; +} + + +/******************************************************************************* +* +* This will write the actual output, return 1 on success and 0 on error. +* +*******************************************************************************/ +int process_call(int32_t tid, unsigned int position, char call, Sites *sites, char strand) { + Site *site; + + if(call == 'Z' || call == 'z') { //CpG (methylated == Z, unmethylated = z) + if(sites->only_CHG || sites->only_CHH) return 1; + site = sites->CpG+sites->num_CpG; + site->tid = tid; + site->strand = (strand == '+') ? 1 : 0; + site->start = position; + site->type = (call == 'Z') ? 1 : 0; + (sites->num_CpG)++; + } else if(call == 'H' || call == 'h') { //CHH (methylated == H, unmethylated == h) + if(sites->only_CpG || sites->only_CHG) return 1; + site = sites->CHH+sites->num_CHH; + site->tid = tid; + site->strand = (strand == '+') ? 1 : 0; + site->start = position; + site->type = (call == 'H') ? 1 : 0; + (sites->num_CHH)++; + } else if(call == 'X' || call == 'x') { //CHG (methylated == X, unmethylated == x) + if(sites->only_CpG || sites->only_CHH) return 1; + site = sites->CHG+sites->num_CHG; + site->tid = tid; + site->strand = (strand == '+') ? 1 : 0; + site->start = position; + site->start = position; + site->type = (call == 'X') ? 1 : 0; + (sites->num_CHG)++; + } else { + printf("(1) Got an unknown character in the XM string of a read: %c\n",call); + return 0; + } + return 1; +} + +/******************************************************************************* +* +* Process either a single-end read or a non-overlapping paired-end read. +* +* Return 1 on success and 0 on error. 
+* +*******************************************************************************/ +int extractor_process_single(bam1_t *read, Sites *sites) { + unsigned long long *positions = NULL; + char *XM = bam_aux2Z(bam_aux_get(read,"XM")); + char *XR = bam_aux2Z(bam_aux_get(read, "XR")); + char *XG = bam_aux2Z(bam_aux_get(read, "XG")); + uint8_t *QUAL = bam1_qual(read); + char strand = (strcmp(bam_aux2Z(bam_aux_get(read,"XG")), "CT") == 0) ? '+' : '-'; + char call; + int i, start = 0, end = strlen(XM); //These may be overridden + + /*************************************************************************** + * + * Do we need to increase the size of anything pointed to by sites? + * + ***************************************************************************/ + if(storeCpG) { + if(sites->num_CpG + 100000 > sites->max_CpG) { + sites->CpG = realloc(sites->CpG, (sites->max_CpG+100000)*sizeof(Site)); + sites->max_CpG += 100000; + } + } + if(storeCHG) { + if(sites->num_CHG + 100000 > sites->max_CHG) { + sites->CHG = realloc(sites->CHG, (sites->max_CHG+100000)*sizeof(Site)); + sites->max_CHG += 100000; + } + } + if(storeCHH) { + if(sites->num_CHH + 100000 > sites->max_CHH) { + sites->CHH = realloc(sites->CHH, (sites->max_CHH+100000)*sizeof(Site)); + sites->max_CHH += 100000; + } + } + + positions = calculate_positions(read); + + //Should we override "start" and "end"? 
+ if(read->core.flag & BAM_FREAD2) { //#2 + if(strcmp(XR, "GA") == 0 && strcmp(XG, "CT") == 0) { //OT + if(OT[2] != 0) start = OT[2]; + if(OT[3] != 0) { + if(end > OT[3]) end = OT[3]; + } + } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "GA") == 0) { //OB + if(OB[2] != 0) start = OB[2]; + if(OB[3] != 0) { + if(end > OB[3]) end = OB[3]; + } + } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "CT") == 0) { //CTOT + if(CTOT[2] != 0) start = CTOT[2]; + if(CTOT[3] != 0) { + if(end > CTOT[3]) end = CTOT[3]; + } + } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "CT") == 0) { //CTOT + if(CTOB[2] != 0) start = CTOB[2]; + if(CTOB[3] != 0) { + if(end > CTOB[3]) end = CTOB[3]; + } + } + } else { //#1 + if(strcmp(XR, "CT") == 0 && strcmp(XG, "CT") == 0) { //OT + if(OT[0] != 0) start = OT[0]; + if(OT[1] != 0) { + if(end > OT[1]) end = OT[1]; + } + } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "GA") == 0) { //OB + if(OB[0] != 0) start = OB[0]; + if(OB[1] != 0) { + if(end > OB[1]) end = OB[1]; + } + } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "CT") == 0) { //CTOT + if(CTOT[0] != 0) start = CTOT[0]; + if(CTOT[1] != 0) { + if(end > CTOT[1]) end = CTOT[1]; + } + } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "GA") == 0) { //CTOB + if(CTOB[0] != 0) start = CTOB[0]; + if(CTOB[1] != 0) { + if(end > CTOB[1]) end = CTOB[1]; + } + } + } + + for(i=start; icore.tid, *(positions+i), *(XM+i), sites, strand)) { + printf("(2) Got an unknown character (%i) in the XM string of a single-ended read: %s\n", i, XM); + free(positions); + return 0; + } + } + } + } + } + free(positions); + return 1; +} + +/******************************************************************************* +* +* Process either overlapping paired-end reads +* +* Return 1 on success and 0 on error. 
+* +*******************************************************************************/ +int extractor_process_overlapping(bam1_t *read1, bam1_t *read2, Sites *sites) { + unsigned long long *positions1 = calculate_positions(read1), *positions2 = calculate_positions(read2); + char strand = (strcmp(bam_aux2Z(bam_aux_get(read1, "XG")), "CT") == 0) ? '+' : '-'; + char call; + char *XR = bam_aux2Z(bam_aux_get(read1,"XR")); + char *XG = bam_aux2Z(bam_aux_get(read1,"XG")); + char *XM1 = bam_aux2Z(bam_aux_get(read1,"XM")); + char *XM2 = bam_aux2Z(bam_aux_get(read2,"XM")); + int i, j, end1 = (int) read1->core.l_qseq, end2 = (int) read2->core.l_qseq; + int start1 = 0, start2 = 0; + + /*************************************************************************** + * + * Do we need to increase the size of anything pointed to by sites? + * + ***************************************************************************/ + if(sites->num_CpG + 100000 > sites->max_CpG) { + sites->CpG = realloc(sites->CpG, (sites->max_CpG+100000)*sizeof(Site)); + sites->max_CpG += 100000; + } + if(sites->num_CHG + 100000 > sites->max_CHG) { + sites->CHG = realloc(sites->CHG, (sites->max_CHG+100000)*sizeof(Site)); + sites->max_CHG += 100000; + } + if(sites->num_CHH + 100000 > sites->max_CHH) { + sites->CHH = realloc(sites->CHH, (sites->max_CHH+100000)*sizeof(Site)); + sites->max_CHH += 100000; + } + + //Should we override start1,start2, end1 and end2? 
+ if(strcmp(XR, "CT") == 0 && strcmp(XG, "CT") == 0) { //OT + if(OT[0] != 0) start1 = OT[0]; + if(OT[1] != 0) { + if(end1 > OT[1]) end1 = OT[1]; + } + if(OT[2] != 0) start2 = OT[2]; + if(OT[3] != 0) { + if(end2 > OT[3]) end2 = OT[3]; + } + } else if(strcmp(XR, "CT") == 0 && strcmp(XG, "GA") == 0) { //OB + if(OB[0] != 0) start1 = OB[0]; + if(OB[1] != 0) { + if(end1 > OB[1]) end1 = OB[1]; + } + if(OB[2] != 0) start2 = OB[2]; + if(OB[3] != 0) { + if(end2 > OB[3]) end2 = OB[3]; + } + } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "CT") == 0) { //CTOT + if(CTOT[0] != 0) start1 = CTOT[0]; + if(CTOT[1] != 0) { + if(end1 > CTOT[1]) end1 = CTOT[1]; + } + if(CTOT[2] != 0) start2 = CTOT[2]; + if(CTOT[3] != 0) { + if(end2 > CTOT[3]) end2 = CTOT[3]; + } + } else if(strcmp(XR, "GA") == 0 && strcmp(XG, "GA") == 0) { //CTOB + if(CTOB[0] != 0) start1 = CTOB[0]; + if(CTOB[1] != 0) { + if(end1 > CTOB[1]) end1 = CTOB[1]; + } + if(CTOB[2] != 0) start2 = CTOB[2]; + if(CTOB[3] != 0) { + if(end2 > CTOB[3]) end2 = CTOB[3]; + } + } + i = start1; + XM1 += start1; + j = start2; + XM2 += start2; + while(*(positions1+i) == ULLONG_MAX) { + i++; + start1++; + XM1++; + } + while(*(positions2+j) == ULLONG_MAX) { + j++; + start2++; + XM2++; + } + while(*(positions1+end1-1) == ULLONG_MAX) end1--; + while(*(positions2+end2-1) == ULLONG_MAX) end2--; + + /*************************************************************************** + * + * If there is a 5' overhang when comparing the two sequences, then we + * should process that first before dealing with the overlap. 
+ * + ***************************************************************************/ + if(*positions1 < *positions2) { + while(*(positions1+i) < *positions2) { + if(*(positions1+i) != ULLONG_MAX) { + if(*XM1 != '.') { + call = *XM1; + if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) { + if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, strand)) { + printf("(3) Got an unknown character in the XM string: %s\n", XM1); + return 0; + } + } + } + } + i++; + XM1++; + if(i == end1) break; + } + } else if(*positions2 < *positions1) { + while(*(positions2+j) < *positions1) { + if(*(positions2+j) != ULLONG_MAX) { + if(*XM2 != '.') { + call = *XM2; + if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) { + if(!process_call(read2->core.tid, *(positions2+j), *XM2, sites, strand)) { + printf("(4) Got an unknown character in the XM string: %s\n", XM2); + return 0; + } + } + } + } + j++; + XM2++; + if(j == end2) break; + } + } + + //We are now up to the overlapping section + while((icore.tid, *(positions1+i), *XM1, sites, strand)) { + printf("(5a) Got an unknown character in the XM string: %s\n", XM1); + return 0; + } + } + } + XM1++; + i++; + } else { + if(*XM2 != '.') { + call = *XM2; + if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) { + if(!process_call(read2->core.tid, *(positions2+j), *XM2, sites, strand)) { + printf("(5b) Got an unknown character in the XM string: %s\n", XM2); + return 0; + } + } + } + XM2++; + j++; + } + continue; + } + + if(*XM1 == *XM2) { + if(*XM1 != '.') { + call = *XM1; + if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) { + if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, 
strand)) { + printf("(6) Got an unknown character in the XM string: %s\n", XM1); + return 0; + } + } + } + } else { //bison will call '.' if there is an N in a read or an impossible conversion, so whichever read has a call is correct (the call becomes '.' if the reads have different calls) + if(*XM2 != '.' && *XM1 == '.') { + call = *XM2; + if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) { + if(!process_call(read2->core.tid, *(positions2+j), *XM2, sites, strand)) { + printf("(7) Got an unknown character in the XM string: %s\n", XM2); + return 0; + } + } + } else if(*XM1 != '.' && *XM2 == '.') { + call = *XM1; + if(((call == 'Z' || call == 'z') && storeCpG) || ((call == 'X' || call == 'x') && storeCHG) || ((call == 'H' || call == 'h') && storeCHH)) { + if(!process_call(read1->core.tid, *(positions1+i), *XM1, sites, strand)) { + printf("(8) Got an unknown character in the XM string: %s\n", XM1); + return 0; + } + } + } + } + XM1++; + XM2++; + i++; + j++; + } else { + if(*(positions1+i) == ULLONG_MAX) { + XM1++; + i++; + } + if(*(positions2+j) == ULLONG_MAX) { + XM2++; + j++; + } + } + } + + if(i >= end1 && j >= end2) { + free(positions1); + free(positions2); + return 1; + } + if(i >= end1) { + while(jcore.tid, *(positions2+j), *XM2, sites, strand)) { + printf("(12) Got an unknown character in the XM string: %s\n", XM2); + free(positions1); + free(positions2); + return 0; + } + } + } + } + XM2++; + j++; + } + } else { + while(icore.tid, *(positions1+i), *XM1, sites, strand)) { + printf("(13) Got an unknown character in the XM string: %s\n", XM1); + free(positions1); + free(positions2); + return 0; + } + } + } + } + XM1++; + i++; + } + } + free(positions1); + free(positions2); + return 1; +} + +//Remove methylation calls on 5' ends of reads +void trim5(bam1_t *read, int digest_types) { + int MspI = digest_types & 1; + int TaqI = digest_types & 2; + unsigned long long offset = 
genome_offset(lookup_chrom(read), read->core.pos); + char *sequence; + char *XM = bam_aux2Z(bam_aux_get(read, "XM")); + int i; + + for(i=0; i<2; i++) { + sequence = chromosomes.genome+offset+read->core.pos-i; + if(MspI) { + if(strcmp(sequence, "CCGG")) { + *(XM+2-i) = '.'; + break; + } + } + if(TaqI) { + if(strcmp(sequence, "TCGA")) { + *(XM+2-i) = '.'; + break; + } + } + } +} + +//Remove methylation calls on 3' ends of reads +void trim3(bam1_t *read, int digest_types) { + int MspI = digest_types & 1; + int TaqI = digest_types & 2; + unsigned long long offset = genome_offset(lookup_chrom(read), read->core.pos); + char *sequence; + char *XM = bam_aux2Z(bam_aux_get(read, "XM")); + uint32_t end = bam_calend(&(read->core), bam1_cigar(read)); + int i, len = strlen(XM); + + for(i=0; i<2; i++) { + sequence = chromosomes.genome+offset+end-2-i; + if(MspI) { + if(strcmp(sequence, "CCGG")) { + *(XM+len-2-i) = '.'; + break; + } + } + if(TaqI) { + if(strcmp(sequence, "TCGA")) { + *(XM+len-2-i) = '.'; + break; + } + } + } +} + +void process_RRBS_read(bam1_t *read1, bam1_t *read2, int digest_types) { + if(strncmp(bam_aux2Z(bam_aux_get(read1, "XG")), "CT", 2) == 0) { //OT or CTOT + trim3(read1, digest_types); + if(read1->core.flag & BAM_FPAIRED) trim3(read2, digest_types); + } else { //OB or CTOB + trim5(read1, digest_types); + if(read1->core.flag & BAM_FPAIRED) trim5(read2, digest_types); + } +} + +void write_sites(struct of_struct *of, int which) { + struct list_struct *list = NULL; + int mpercent; + FILE *f = NULL; + + if(which == 1) { + list = CpGlist->next; + f = of->CpG; + } else if(which == 2) { + list = CHGlist->next; + f = of->CHG; + } else if(which == 3) { + list = CHHlist->next; + f = of->CHH; + } + + while(list->next != NULL) { + mpercent = (int) (1000 * ((float) list->n_methylated)/(float)(list->n_methylated + list->n_unmethylated)); + fprintf(f, "%s\t%u\t%u\t%i\t%u\t%u\n", global_header->target_name[list->tid], \ + abs(list->pos), abs(list->pos)+1, mpercent, 
list->n_methylated, list->n_unmethylated); + list = list->next; + } +} + +//Generate output file names and open them for writing +void generate_output_names(char *ifile, struct of_struct *of) { + char *p, *tmp = strdup(ifile); + char *oname = NULL; + + //Generate the basename by stripping off .sam or .bam + p = strrchr(tmp, '.'); + if(strcmp(p, ".sam") == 0 || strcmp(p, ".bam") == 0) *p = '\0'; + oname = malloc(sizeof(char) * (strlen(tmp) + strlen("_CpG.bedGraph "))); + + if(storeCpG) { + sprintf(oname, "%s_CpG.bedGraph", tmp); + printf("CpG counts will be written to %s\n", oname); + of->CpG = fopen(oname, "w"); + } + if(storeCHG) { + sprintf(oname, "%s_CHG.bedGraph", tmp); + printf("CHG counts will be written to %s\n", oname); + of->CHG = fopen(oname, "w"); + } + if(storeCHH) { + sprintf(oname, "%s_CHH.bedGraph", tmp); + printf("CHH counts will be written to %s\n", oname); + of->CHH = fopen(oname, "w"); + } + + free(tmp); + free(oname); +} + +//Fill the inclusion bounds +void fill_bounds(char *str, int bounds[4]) { + int i; + char *p; + + for(i=0; i<4; i++) { + if(i==0) { + p = strtok(str, ","); + } else { + p = strtok(NULL, ","); + } + if(p == NULL) break; + bounds[i] = atoi(p); + } +} + +void usage(char *prog) { + printf("Usage: %s OPTIONS genome_directory input.(sam|bam)\n", prog); + printf("\n\ + Extract methylation information into a bedGraph file or files. By default,\n\ + only CpG metrics are output\n\ +\n\ + -h Print this message.\n\ +\n\ + -q Read MAPQ value must at least this for inclusion (default 10).\n\ + Specify 0 to include everything.\n\ +\n\ + -phred Minimum Phred score that a base must have for its methylation\n\ + state to be included in the output. 
The default is 5.\n\ +\n\ + --MspI Library was MspI digested.\n\ +\n\ + --TaqI Library was TaqI digested (this can be in addition to\n\ + MspI digestion).\n\ +\n\ + -no_CpG Don't output CpG sites (they're output by default).\n\ +\n\ + -CHH Output CHH statistics.\n\ +\n\ + -CHG Output CHG statistics.\n\ +\n\ + -OT Bounds for the region of reads mapped to the original top\n\ + strand to include. It is highly recommended that bison_mbias\n\ + and/or bison_mbias2pdf be run so that approximate bounds can\n\ + be generated for this. The format is \"-OT A,B,C,D\", where \"A\"\n\ + is the 5'-most and \"B\" the 3'-most bound of the included\n\ + region for read #1. \"C\" and \"D\" are the equivalent bounds for\n\ + read #2. A value of 0 means to leave that portion of the read\n\ + unbound (e.g., \"-OT 0,90,20,0\" will not include methylation\n\ + calls after the 90th base on read #1 or before the 20th base\n\ + on read #2). The default is \"-OT 0,0,0,0\", meaning that all\n\ + methylation calls are included.\n\ +\n\ + -OB Like -OT, but for reads mapping to the original bottom strand.\n\ +\n\ + -CTOT Like -OT, but for reads mapping to the complementary to\n\ + original top strand.\n\ +\n\ + -CTOB Like -OT, but for reads mapping to the complementary to\n\ + original bottom strand.\n\ +\n\ + -max-sites-size N This option can increase or decrease memory\n\ + requirements by changing the number of methylation calls\n\ + stored in memory prior to sorting and merging. 
The default is\n\ + 50,000.\n"); +} + +int main(int argc, char *argv[]) { + int i, max_sites_size = 50000, MspI = 0, TaqI = 0; + int min_MAPQ = 10; + samfile_t *fp = NULL; + bam1_t *read1 = bam_init1(), *read2 = bam_init1(); + struct of_struct *of = calloc(1, sizeof(struct of_struct)); + unsigned int r1_pos = 0, r2_pos = 0, total_reads = 0; + Sites *sites = init_sites(); + + CpGlist = init_list(); + CHGlist = init_list(); + CHHlist = init_list(); + config.genome_dir = NULL; + chromosomes.nchromosomes = 0; + storeCpG = storeCHG = storeCHH = 0; + storeCpG = 1; + min_Phred = 10; + for(i=0; i<4; i++) OT[i] = OB[i] = CTOT[i] = CTOB[i] = 0; + + /* read in the file names */ + if(argc < 3) { + usage(argv[0]); + return 1; + }; + for(i=1; iheader; + } else { + printf("Unknown parameter %s\n", argv[i]); + usage(argv[0]); + return 1; + } + } + + if(config.genome_dir == NULL || fp == NULL) { + printf("Genome directory or SAM/BAM input file not specified!\n"); + usage(argv[0]); + } + + //Generate the output names and open the output files + generate_output_names(argv[argc-1], of); + + //Read in the genome + chromosomes.max_genome = 3000000000; + printf("Allocating space for %llu characters\n", chromosomes.max_genome); fflush(stdout); + chromosomes.genome = malloc(sizeof(char)*chromosomes.max_genome); + *chromosomes.genome = '\0'; + if(chromosomes.genome == NULL) { + printf("Could not allocate enough room to hold the genome!\n"); + return -1; + } + read_genome(); + + //Process the reads + while(samread(fp, read1) > 1) { + if(read1->core.flag & BAM_FPAIRED) { + samread(fp, read2); + r1_pos = read1->core.pos+1; + r2_pos = read2->core.pos+1; + } else { + r1_pos = read1->core.pos+1; + r2_pos = INT_MAX; + } + if(read1->core.flag & BAM_FDUP) continue; + if(read1->core.qual < min_MAPQ) continue; + if(TaqI+MspI) process_RRBS_read(read1, read2, MspI+2*TaqI); + + //Are the reads even overlapping? If not, this is easy. 
+ if(r2_pos == INT_MAX) { //Unpaired read + if(!extractor_process_single(read1, sites)) { printf("Error!\n"); break; } + } else if(r1_pos < r2_pos && r1_pos + read1->core.l_qseq - 1 < r2_pos) { //No Overlap + if(!extractor_process_single(read1, sites)) { printf("Error!\n"); break; } + if(!extractor_process_single(read2, sites)) { printf("Error!\n"); break; } + } else if(r2_pos < r1_pos && r2_pos + read2->core.l_qseq - 1 < r1_pos) { //No Overlap + if(!extractor_process_single(read1, sites)) { printf("Error!\n"); break; } + if(!extractor_process_single(read2, sites)) { printf("Error!\n"); break; } + } else { //Overlap + if(!extractor_process_overlapping(read1, read2, sites)) { printf("Error!\n"); break; } + } + + if(sites->num_CpG >= max_sites_size) { + sort_sites(sites, 1); + merge_calls(sites, 1); + } + if(sites->num_CHG >= max_sites_size) { + sort_sites(sites, 2); + merge_calls(sites, 2); + } + if(sites->num_CHH >= max_sites_size) { + sort_sites(sites, 3); + merge_calls(sites, 3); + } + + total_reads++; + if(total_reads % 1000000 == 0) { + printf("Processed %u reads\n", total_reads); + fflush(stdout); + } + } + if(samread(fp, read1) > 1) { + printf("We must have exited on an error as there are still reads left\n"); + fflush(stdout); + } + + //Do the final sort and merge + if(sites->num_CpG) { + sort_sites(sites, 1); + merge_calls(sites, 1); + } + if(sites->num_CHG) { + sort_sites(sites, 2); + merge_calls(sites, 2); + } + if(sites->num_CHH) { + sort_sites(sites, 3); + merge_calls(sites, 3); + } + + //Write output + if(storeCpG) write_sites(of, 1); + if(storeCHG) write_sites(of, 2); + if(storeCHH) write_sites(of, 3); + + //Close things up + if(of->CpG != NULL) fclose(of->CpG); + if(of->CHG != NULL) fclose(of->CHG); + if(of->CHH != NULL) fclose(of->CHH); + free(of); + free(chromosomes.genome); + for(i=0; ichrom); + free(*(chromosomes.chromosome+i)); + } + free(chromosomes.chromosome); + destroy_methyl_list(CpGlist); + destroy_methyl_list(CHGlist); + 
destroy_methyl_list(CHHlist); + destroy_sites(sites); + bam_destroy1(read1); + bam_destroy1(read2); + samclose(fp); + + return 0; +}; diff --git a/slurp.c b/slurp.c new file mode 100644 index 0000000..e2c0222 --- /dev/null +++ b/slurp.c @@ -0,0 +1,318 @@ +#include "bison.h" + +/****************************************************************************** +* +* Add an element to the end of a linked-list +* +* struct packed_struct *last: last sentinel struct +* void *packed: a packed read +* +*******************************************************************************/ +void add_element(struct packed_struct *last, void *packed) { + struct packed_struct *new = malloc(sizeof(struct packed_struct)); + struct packed_struct *next_to_last = last->previous; + + //Setup the new element + new->packed = packed; + new->next = last; + new->previous = next_to_last; + new->state = 0; + + //Update the sentinel struct + last->previous = new; + + //Update the next_to_last struct + next_to_last->next = new; + next_to_last->state = 1; +} + +/****************************************************************************** +* +* Destroy a (typically already removed) element from a linked-list +* +* struct packed_struct *remove: element to destroy +* +*******************************************************************************/ +inline void destroy_element(struct packed_struct *remove) { + bam1_t *pbam1_t = remove->packed; + if(pbam1_t != NULL) { + if(pbam1_t->data != NULL) free(pbam1_t->data); + free(pbam1_t); + } + free(remove); +} + +/****************************************************************************** +* +* Remove an element from the start of a linked-list +* is_ready(first, 0) must return 1! 
+* +* struct packed_struct *first: first sentinel struct +* +*******************************************************************************/ +void remove_element(struct packed_struct *first) { + struct packed_struct *remove = first->next; + struct packed_struct *new_next = remove->next; + + first->next = new_next; + + destroy_element(remove); +} + + +/****************************************************************************** +* +* Is the first or second element ready? +* +* struct packed_struct *first: first sentinel struct +* int offset: 0 (first element) or 1 (second element) +* +* returns 1 for element ready, or 0 otherwise +* +*******************************************************************************/ +inline int is_ready(struct packed_struct *first, int offset) { + if(offset == 0) { + if(first->next->state == 1) return 1; + } else { + if(first->next->next->state == 1) return 1; + } + return 0; +} + +/****************************************************************************** +* +* Is the linked list finished? 
+* +* struct packed_struct *first: first sentinel struct +* +* returns 1 for finished, 0 otherwise +* +*******************************************************************************/ +inline int is_finished(struct packed_struct *first) { + if(first->next->packed == NULL) return 1; + return 0; +} + +/****************************************************************************** +* +* Add a finished element to a linked list +* +* struct packed_struct *last: last sentenel struct of targeted list +* +*******************************************************************************/ +void add_finished(struct packed_struct *last) { + struct packed_struct *new = malloc(sizeof(struct packed_struct)); + struct packed_struct *next_to_last = last->previous; + + new->packed = NULL; + new->next = last; + new->previous = NULL; + new->state = 1; + + //Update the sentinel struct + last->previous = new; + + //Update the next_to_last struct + next_to_last->next = new; + next_to_last->state = 1; + if(config.paired) next_to_last->previous->state = 1; +} + +/****************************************************************************** +* +* Initialize a linked list, returning the last sentinel struct +* +* struct packed_struct *first: first sentinel struct +* +* returns first sentinel struct +* +*******************************************************************************/ +struct packed_struct *initialize_list(struct packed_struct *first) { + first = malloc(sizeof(struct packed_struct)); + struct packed_struct *last= malloc(sizeof(struct packed_struct)); + + first->next = last; + first->previous = first; + first->packed = NULL; + last->next = last; + last->previous = first; + last->packed = NULL; + + last->state = 0; //is_ready(last) should always be 0; + first->state = 0; //is_ready(last) should always be 0; + return first; +} + +/****************************************************************************** +* +* Destroy a linked list of packed_structs +* +* struct packed_struct 
*first: linked list to destroy +* +*******************************************************************************/ +void destroy_list(struct packed_struct *first) { + while(first->next->next != first->next) remove_element(first); + free(first->next); + free(first); +} + +/****************************************************************************** +* +* The MPI receiver thread on the main node +* +* void *a: NULL input +* +* returns NULL +* +*******************************************************************************/ +void *slurp(void *a) { + time_t t0, t1; +#ifndef DEBUG + void *p = NULL; + int nnodes = (config.directional) ? 2 : 4; + int nfinished = 0; + int source = 0; + int size = 0; + struct packed_struct *target_node = NULL; + MPI_Status status; + int start = 1; + int ntasks = (config.directional) ? 3: 5; + int i; + for(i=1; ipacked = NULL; + packed->size = 0; + bam_header_t *tmp; + tmp = bam_header_read(fp2); + bam_header_destroy(tmp); + if(!config.directional) { + tmp = bam_header_read(fp3); + bam_header_destroy(tmp); + tmp = bam_header_read(fp4); + bam_header_destroy(tmp); + } +#endif + + //Write a header + bam_header_write(OUTPUT_BAM, global_header); + + t0 = time(NULL); + if(!config.quiet) printf("Started slurping @%s", ctime(&t0)); fflush(stdout); +#ifndef DEBUG + while(nfinished < nnodes) { + MPI_Probe(MPI_ANY_SOURCE, 5, MPI_COMM_WORLD, &status); + source = status.MPI_SOURCE; + MPI_Get_count(&status, MPI_BYTE, &size); + if(source == 1) target_node = node1_last_sentinel; + else if(source == 2) target_node = node2_last_sentinel; + else if(source == 3) target_node = node3_last_sentinel; + else if(source == 4) target_node = node4_last_sentinel; + + if(size > 1) { + p = malloc((size_t) size); + MPI_Recv(p, size, MPI_BYTE, source, 5, MPI_COMM_WORLD, &status); + add_element(target_node, p); + } else { + p = malloc((size_t) size); + MPI_Recv(p, size, MPI_BYTE, source, 5, MPI_COMM_WORLD, &status); + free(p); + add_finished(target_node); + nfinished++; + 
} + } +#else + //OT + while(bam_read1(fp1, read) > 1) { + packed->size = 0; + packed = pack_read(read, packed); + add_element(node1_last_sentinel, packed->packed); + if(config.paired) { + bam_read1(fp1, read); + packed->size = 0; + packed = pack_read(read, packed); + add_element(node1_last_sentinel, packed->packed); + } + //OB + bam_read1(fp2, read); + packed->size = 0; + packed = pack_read(read, packed); + add_element(node2_last_sentinel, packed->packed); + if(config.paired) { + bam_read1(fp2, read); + packed->size = 0; + packed = pack_read(read, packed); + add_element(node2_last_sentinel, packed->packed); + } + if(!config.directional) { + //CTOT + bam_read1(fp3, read); + packed->size = 0; + packed = pack_read(read, packed); + add_element(node3_last_sentinel, packed->packed); + if(config.paired) { + bam_read1(fp3, read); + packed->size = 0; + packed = pack_read(read, packed); + add_element(node3_last_sentinel, packed->packed); + } + //CTOB + bam_read1(fp4, read); + packed->size = 0; + packed = pack_read(read, packed); + add_element(node4_last_sentinel, packed->packed); + if(config.paired) { + bam_read1(fp4, read); + packed->size = 0; + packed = pack_read(read, packed); + add_element(node4_last_sentinel, packed->packed); + } + } + } + bam_destroy1(read); + free(packed); + + add_finished(node1_last_sentinel); + add_finished(node2_last_sentinel); + if(!config.directional) { + add_finished(node3_last_sentinel); + add_finished(node4_last_sentinel); + } +#endif + t1 = time(NULL); + if(!config.quiet) printf("Finished slurping @%s\t(%f seconds elapsed)\n", ctime(&t1), difftime(t1, t0)); fflush(stdout); + return NULL; +} diff --git a/worker.c b/worker.c new file mode 100644 index 0000000..0137bf3 --- /dev/null +++ b/worker.c @@ -0,0 +1,218 @@ +#include "bison.h" +#include + +/****************************************************************************** +* +* The main worker node function. 
+* +* int thread_id: the thread_id +* +*******************************************************************************/ +void worker_node(int thread_id) { + int cmd_length = 1, max_qname = 0, status; + char *cmd, *last_qname = calloc(1, sizeof(char));; + MPI_Header *packed_header; + MPI_read *packed_read = calloc(1, sizeof(MPI_read)); + bam_header_t *header; + bam1_t *read1 = bam_init1(); + bam1_t *read2 = bam_init1(); + tamFile fp; + MPI_Status stat; +#ifdef DEBUG + int current_p_size = 100; + bamFile of; + bam_header_t *debug_header = bam_header_init(); + bam1_t *debug_read = bam_init1(); + global_header = bam_header_init(); + void *p = calloc(100,1); + int NODE_ID = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &NODE_ID); + if(!config.quiet) printf("NODE_ID: %i\n",NODE_ID); fflush(stdout); + char *oname; +#else + int start = 0, i = 0; +#endif + time_t t0, t1; + + packed_read->size = 0; + packed_read->packed = NULL; + + //construct the bowtie2 command + cmd_length += (int) strlen("bowtie2 -q --reorder --no-mixed --no-discordant") + 1; + cmd_length += (int) strlen(config.bowtie2_options) + 1; + cmd_length += (int) strlen("--norc -x") + 1; + cmd_length += (int) strlen(config.genome_dir) + strlen("bisulfite_genome/CT_conversion/BS_CT") + 1; + cmd_length += (int) 2*(strlen("-1 ") + strlen(config.FASTQ1CT)) + 3; + + cmd = (char *) malloc(sizeof(char) * cmd_length); + if(thread_id == 1) { //OT Read#1 C->T, Read#2 G->A, Genome C->T only the + strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT, config.FASTQ2GA); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT); + } +#ifdef DEBUG + oname = malloc(sizeof(char) 
*(1+strlen(config.odir)+strlen(config.basename)+strlen("_OT.bam"))); + sprintf(oname, "%s%s_OT.bam", config.odir, config.basename); + of = bam_open(oname, "w"); + free(oname); +#endif + } else if(thread_id == 2) { //OB Read#1 C->T, Read#2 G->A, Genome G->A only the - strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT, config.FASTQ2GA); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1CT); + } +#ifdef DEBUG + oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_OB.bam"))); + sprintf(oname, "%s%s_OB.bam", config.odir, config.basename); + of = bam_open(oname, "w"); + free(oname); +#endif + } else if(thread_id == 3) { //CTOT Read#1 G->A, Read#2 C->T, Genome C->T, only the - strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA, config.FASTQ2CT); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --nofw -x %sbisulfite_genome/CT_conversion/BS_CT -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA); + } +#ifdef DEBUG + oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_CTOT.bam"))); + sprintf(oname, "%s%s_CTOT.bam", config.odir, config.basename); + of = bam_open(oname, "w"); + free(oname); +#endif + } else if(thread_id == 4) { //CTOB Read#1 G->A, Read#2 C->T, Genome G->A, only the + strand + if(config.paired) { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -1 %s -2 %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA, 
config.FASTQ2CT); + } else { + sprintf(cmd, "bowtie2 -q --reorder --no-mixed --no-discordant %s --norc -x %sbisulfite_genome/GA_conversion/BS_GA -U %s", config.bowtie2_options, config.genome_dir, config.FASTQ1GA); + } +#ifdef DEBUG + oname = malloc(sizeof(char) *(1+strlen(config.odir)+strlen(config.basename)+strlen("_CTOB.bam"))); + sprintf(oname, "%s%s_CTOB.bam", config.odir, config.basename); + of = bam_open(oname, "w"); + free(oname); +#endif + } else { + printf("Oh shit, got thread_id %i!\n", thread_id); + return; + } + + //Wait for the signal to start +#ifndef DEBUG + while(start == 0) MPI_Recv(&start, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &stat); +#endif + + //Start the process + if(!config.quiet) printf("Node %i executing: %s\n", thread_id, cmd); fflush(stdout); + fp = sam_popen(cmd); + header = sam_header_read(fp); +#ifdef DEBUG + bam_header_write(of, header); +#endif + +#ifndef DEBUG + packed_header = pack_header(header); + if(thread_id == 1) { + //Send the header + MPI_Send((void *) &(packed_header->size), 1, MPI_INT, 0, 1, MPI_COMM_WORLD); + status = MPI_Send((void *) packed_header->packed, packed_header->size, MPI_BYTE, 0, 2, MPI_COMM_WORLD); + if(status != MPI_SUCCESS) { + printf("MPI_Send returned %i\n", status); + fflush(stdout); + } + } +#else + packed_header = pack_header(header); + void *tmp_pointer = malloc(packed_header->size); + MPI_Request request; + MPI_Isend((void *) packed_header->packed, packed_header->size, MPI_BYTE, NODE_ID, 2, MPI_COMM_WORLD, &request); + status = MPI_Recv(tmp_pointer, packed_header->size, MPI_BYTE, NODE_ID, 2, MPI_COMM_WORLD, &stat); + if(status != MPI_SUCCESS) printf("We seem to have received an error when sending to ourselves!\n"); + MPI_Wait(&request, &stat); + unpack_header(debug_header, tmp_pointer); + global_header = debug_header; + free(tmp_pointer); +#endif + + t0 = time(NULL); + if(!config.quiet) printf("Node %i began sending reads @%s", thread_id, ctime(&t0)); fflush(stdout); + while(sam_read1(fp, header, read1) 
> 1) { +#ifdef DEBUG + bam_write1(of, read1); +#endif + if(strcmp(bam1_qname(read1), last_qname) == 0) { //Multimapper + if(config.paired) { + sam_read1(fp, header, read2); +#ifdef DEBUG + bam_write1(of, read2); +#endif + } + continue; + } else { + if(read1->core.l_qname > max_qname) { + max_qname = read1->core.l_qname + 10; + last_qname = realloc(last_qname, sizeof(char) * max_qname); + } + strcpy(last_qname, bam1_qname(read1)); + } + + //Send the read + packed_read = pack_read(read1, packed_read); +#ifndef DEBUG + MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); +#else + if(packed_read->size > current_p_size) p = realloc(p, packed_read->size); + MPI_Isend(packed_read->packed, packed_read->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &request); + status = MPI_Recv(p, packed_header->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &stat); + MPI_Wait(&request, &stat); +#endif + //Deal with paired-end reads + if(config.paired) { + sam_read1(fp, header, read2); + packed_read = pack_read(read2, packed_read); +#ifndef DEBUG + MPI_Send((void *) packed_read->packed, packed_read->size, MPI_BYTE, 0, 5, MPI_COMM_WORLD); +#else + bam_write1(of, read2); + if(packed_read->size > current_p_size) p = realloc(p, packed_read->size); + MPI_Isend((void *) packed_read->packed, packed_read->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &request); + status = MPI_Recv(p, packed_header->size, MPI_BYTE, NODE_ID, 5, MPI_COMM_WORLD, &stat); + MPI_Wait(&request, &stat); + debug_read = unpack_read(debug_read, p); +#endif + } +#ifndef DEBUG + i++; +#endif + } + t1 = time(NULL); + if(!config.quiet) printf("Node %i finished sending reads @%s\t(%f sec elapsed)\n", thread_id, ctime(&t1), difftime(t1, t0)); fflush(stdout); + + //Notify the master node + packed_read->size = 0; +#ifndef DEBUG + void *A = malloc(1); + MPI_Send(A, 1, MPI_BYTE, 0, 5, MPI_COMM_WORLD); + free(A); +#endif + + //Close things up + bam_header_destroy(header); + bam_destroy1(read1); + 
bam_destroy1(read2); + free(cmd); + if(packed_read->packed != NULL) free(packed_read->packed); + free(packed_read); + if(packed_header->packed != NULL) free(packed_header->packed); + free(packed_header); + free(last_qname); + sam_pclose(fp); +#ifdef DEBUG + bam_close(of); + bam_header_destroy(debug_header); + bam_destroy1(debug_read); + free(p); +#endif + if(!config.quiet) printf("Exiting worker node %i\n", thread_id); fflush(stdout); +};