diff --git a/Cargo.toml b/Cargo.toml index b50f0d37a3..b86ce2230c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,4 +17,6 @@ bigtools = "0.5.3" tokio = "*" flate2 = "*" tempfile = "*" -ndarray = "0.16.1" \ No newline at end of file +ndarray = "0.16.1" +npyz = "*" +zip = "*" \ No newline at end of file diff --git a/pydeeptools/deeptools/multiBamSummary2.py b/pydeeptools/deeptools/multiBamSummary2.py new file mode 100644 index 0000000000..a42eeff8fb --- /dev/null +++ b/pydeeptools/deeptools/multiBamSummary2.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import argparse +import numpy as np + +import deeptools.countReadsPerBin as countR +from deeptools import parserCommon +from deeptools.utilities import smartLabels +from importlib.metadata import version +old_settings = np.seterr(all='ignore') + +from deeptools.hp import r_mbams + +def parse_arguments(args=None): + parser = \ + argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" + +``multiBamSummary`` computes the read coverages for genomic regions for typically two or more BAM files. +The analysis can be performed for the entire genome by running the program in 'bins' mode. +If you want to count the read coverage for specific regions only, use the ``BED-file`` mode instead. +The standard output of ``multiBamSummary`` is a compressed numpy array (``.npz``). +It can be directly used to calculate and visualize pairwise correlation values between the read coverages using the tool 'plotCorrelation'. +Similarly, ``plotPCA`` can be used for principal component analysis of the read coverages using the .npz file. +Note that using a single bigWig file is only recommended if you want to produce a bedGraph file (i.e., with the ``--outRawCounts`` option; the default output file cannot be used by ANY deepTools program if only a single file was supplied!). + +A detailed sub-commands help is available by typing: + + multiBamSummary bins -h + + multiBamSummary BED-file -h + + +""", + epilog='example usages:\n' + 'multiBamSummary bins --bamfiles file1.bam file2.bam -o results.npz \n\n' + 'multiBamSummary BED-file --BED selection.bed --bamfiles file1.bam file2.bam \n' + '-o results.npz' + ' \n\n', + conflict_handler='resolve') + + parser.add_argument('--version', action='version', + version='%(prog)s {}'.format(version('deeptools'))) + subparsers = parser.add_subparsers( + title="commands", + dest='command', + description='subcommands', + help='subcommands', + metavar='') + + parent_parser = parserCommon.getParentArgParse(binSize=False) + read_options_parser = parserCommon.read_options() + + # bins mode options + subparsers.add_parser( + 'bins', + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + parents=[bamcorrelate_args(case='bins'), + parent_parser, read_options_parser, + parserCommon.gtf_options(suppress=True) + ], + help="The coverage calculation is done for consecutive bins of equal " + "size (10 kilobases by default). This mode is useful to assess the " + "genome-wide similarity of BAM files. The bin size and " + "distance between bins can be adjusted.", + add_help=False, + usage='%(prog)s ' + '--bamfiles file1.bam file2.bam ' + '-o results.npz \n' + 'help: multiBamSummary bins -h / multiBamSummary bins --help\n') + + # BED file arguments + subparsers.add_parser( + 'BED-file', + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + parents=[bamcorrelate_args(case='BED-file'), + parent_parser, read_options_parser, + parserCommon.gtf_options() + ], + help="The user provides a BED file that contains all regions " + "that should be considered for the coverage analysis. A " + "common use is to compare ChIP-seq coverages between two " + "different samples for a set of peak regions.", + usage='%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam -o results.npz\n' + 'help: multiBamSummary BED-file -h / multiBamSummary bins --help\n', + add_help=False) + + return parser + + +def bamcorrelate_args(case='bins'): + parser = argparse.ArgumentParser(add_help=False) + required = parser.add_argument_group('Required arguments') + + # define the arguments + required.add_argument('--bamfiles', '-b', + metavar='FILE1 FILE2', + help='List of indexed bam files separated by spaces.', + nargs='+', + required=True) + + required.add_argument('--outFileName', '-out', '-o', + help='File name to save the coverage matrix. This matrix ' + 'can be subsequently plotted using plotCorrelation or ' + 'or plotPCA.', + type=parserCommon.writableFile) + + optional = parser.add_argument_group('Optional arguments') + + optional.add_argument("--help", "-h", action="help", + help="show this help message and exit") + optional.add_argument('--labels', '-l', + metavar='sample1 sample2', + help='User defined labels instead of default labels from ' + 'file names. ' + 'Multiple labels have to be separated by a space, e.g. ' + '--labels sample1 sample2 sample3', + nargs='+') + optional.add_argument('--smartLabels', + action='store_true', + help='Instead of manually specifying labels for the input ' + 'BAM files, this causes deepTools to use the file name ' + 'after removing the path and extension.') + + optional.add_argument('--genomeChunkSize', + type=int, + default=None, + help='Manually specify the size of the genome provided to each processor. ' + 'The default value of None specifies that this is determined by read ' + 'density of the BAM file.') + + if case == 'bins': + optional.add_argument('--binSize', '-bs', + metavar='INT', + help='Length in bases of the window used ' + 'to sample the genome. (Default: %(default)s)', + default=10000, + type=int) + + optional.add_argument('--distanceBetweenBins', '-n', + metavar='INT', + help='By default, multiBamSummary considers consecutive ' + 'bins of the specified --binSize. However, to ' + 'reduce the computation time, a larger distance ' + 'between bins can by given. Larger distances ' + 'result in fewer bins considered. (Default: %(default)s)', + default=0, + type=int) + + required.add_argument('--BED', + help=argparse.SUPPRESS, + default=None) + else: + optional.add_argument('--binSize', '-bs', + help=argparse.SUPPRESS, + default=10000, + type=int) + + optional.add_argument('--distanceBetweenBins', '-n', + help=argparse.SUPPRESS, + metavar='INT', + default=0, + type=int) + + required.add_argument('--BED', + help='Limits the coverage analysis to ' + 'the regions specified in these files.', + metavar='FILE1.bed FILE2.bed', + nargs='+', + required=True) + + group = parser.add_argument_group('Output optional options') + + group.add_argument('--outRawCounts', + help='Save the counts per region to a tab-delimited file.', + type=parserCommon.writableFile, + metavar='FILE') + + group.add_argument('--scalingFactors', + help='Compute scaling factors (in the DESeq2 manner) ' + 'compatible for use with bamCoverage and write them to a ' + 'file. The file has tab-separated columns "sample" and ' + '"scalingFactor".', + type=parserCommon.writableFile, + metavar='FILE') + + return parser + + +def process_args(args=None): + args = parse_arguments().parse_args(args) + + if len(sys.argv) == 1: + parse_arguments().print_help() + sys.exit() + + if args.labels and len(args.bamfiles) != len(args.labels): + print("The number of labels does not match the number of bam files.") + exit(0) + if not args.labels: + if args.smartLabels: + args.labels = smartLabels(args.bamfiles) + else: + args.labels = [os.path.basename(x) for x in args.bamfiles] + + if not args.BED: + args.BED = "None" + if not args.region: + args.region = [] + if not args.blackListFileName: + args.blackListFileName = "None" + if not args.outRawCounts: + args.outRawCounts = "None" + if not args.scalingFactors: + args.scalingFactors = "None" + # defaults for the filtering options + if not args.samFlagInclude: + args.samFlagInclude = 0 + if not args.samFlagExclude: + args.samFlagExclude = 0 + if not args.minFragmentLength: + args.minFragmentLength = 0 + if not args.maxFragmentLength: + args.maxFragmentLength = 0 + if not args.minMappingQuality: + args.minMappingQuality = 0 + return args + + +def main(args=None): + """ + 1. get read counts at different positions either + all of same length or from genomic regions from the BED file + + 2. save data for further plotting + + """ + args = process_args(args) + print(f"args = {args}") + print("running r_mbams") + r_mbams( + args.command, + args.bamfiles, + args.outFileName, + args.outRawCounts, + args.scalingFactors, + args.labels, + args.binSize, + args.distanceBetweenBins, + args.numberOfProcessors, + args.BED, + args.region, + args.blackListFileName, + args.verbose, + args.extendReads, + args.centerReads, + args.samFlagInclude, + args.samFlagExclude, + args.minFragmentLength, + args.maxFragmentLength, + args.minMappingQuality, + args.keepExons, + args.transcriptID, + args.exonID, + args.transcript_id_designator + ) + + # if 'BED' in args: + # bed_regions = args.BED + # else: + # bed_regions = None + + # if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors): + # sys.stderr.write("You've input a single BAM file and not specified " + # "--outRawCounts or --scalingFactors. The resulting output will NOT be " + # "useful with any deepTools program!\n") + + # stepsize = args.binSize + args.distanceBetweenBins + # c = countR.CountReadsPerBin( + # args.bamfiles, + # args.binSize, + # numberOfSamples=None, + # genomeChunkSize=args.genomeChunkSize, + # numberOfProcessors=args.numberOfProcessors, + # verbose=args.verbose, + # region=args.region, + # bedFile=bed_regions, + # blackListFileName=args.blackListFileName, + # extendReads=args.extendReads, + # minMappingQuality=args.minMappingQuality, + # ignoreDuplicates=args.ignoreDuplicates, + # center_read=args.centerReads, + # samFlag_include=args.samFlagInclude, + # samFlag_exclude=args.samFlagExclude, + # minFragmentLength=args.minFragmentLength, + # maxFragmentLength=args.maxFragmentLength, + # stepSize=stepsize, + # zerosToNans=False, + # out_file_for_raw_data=args.outRawCounts) + + # num_reads_per_bin = c.run(allArgs=args) + + # sys.stderr.write("Number of bins " + # "found: {}\n".format(num_reads_per_bin.shape[0])) + + # if num_reads_per_bin.shape[0] < 2: + # exit("ERROR: too few non zero bins found.\n" + # "If using --region please check that this " + # "region is covered by reads.\n") + + # # numpy will append .npz to the file name if we don't do this... + # if args.outFileName: + # f = open(args.outFileName, "wb") + # np.savez_compressed(f, + # matrix=num_reads_per_bin, + # labels=args.labels) + # f.close() + + # if args.scalingFactors: + # f = open(args.scalingFactors, 'w') + # f.write("sample\tscalingFactor\n") + # scalingFactors = countR.estimateSizeFactors(num_reads_per_bin) + # for sample, scalingFactor in zip(args.labels, scalingFactors): + # f.write("{}\t{:6.4f}\n".format(sample, scalingFactor)) + # f.close() + + # if args.outRawCounts: + # # append to the generated file the + # # labels + # header = "#'chr'\t'start'\t'end'\t" + # header += "'" + "'\t'".join(args.labels) + "'\n" + # f = open(args.outRawCounts, 'r+') + # content = f.read() + # f.seek(0, 0) + # f.write(header + content) + # f.close() diff --git a/pyproject.toml b/pyproject.toml index a5a0a925ec..14f5902480 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,3 +88,4 @@ bamCoverage2 = "deeptools.bamCoverage2:main" bamCompare2 = "deeptools.bamCompare2:main" computeMatrix2 = "deeptools.computeMatrix2:main" alignmentSieve2 = "deeptools.alignmentSieve2:main" +multiBamSummary2 = "deeptools.multiBamSummary2:main" \ No newline at end of file diff --git a/src/alignmentsieve.rs b/src/alignmentsieve.rs index 195179d93d..3f37e54970 100644 --- a/src/alignmentsieve.rs +++ b/src/alignmentsieve.rs @@ -27,6 +27,9 @@ pub fn r_alignmentsieve( _blacklist: &str, // blacklist file name. min_fragment_length: u32, // minimum fragment length. max_fragment_length: u32, // maximum fragment length. + extend_reads: u32, + center_reads: bool, + ) -> PyResult<()> { // Input bam file let mut bam = Reader::from_path(bamifile).unwrap(); diff --git a/src/bamcompare.rs b/src/bamcompare.rs index 108dd9c9a0..f327392262 100644 --- a/src/bamcompare.rs +++ b/src/bamcompare.rs @@ -143,12 +143,12 @@ pub fn r_bamcompare( Ok(()) } -struct ParsedBamFile<'a> { - bamfile: &'a str, - ispe: bool, - bg: Vec, - mapped: u32, - unmapped: u32, - readlen: f32, - fraglen: f32 +pub struct ParsedBamFile<'a> { + pub bamfile: &'a str, + pub ispe: bool, + pub bg: Vec, + pub mapped: u32, + pub unmapped: u32, + pub readlen: f32, + pub fraglen: f32 } \ No newline at end of file diff --git a/src/calc.rs b/src/calc.rs index faaf539e20..15b4c83e43 100644 --- a/src/calc.rs +++ b/src/calc.rs @@ -155,4 +155,4 @@ pub fn calc_ratio( return (num / den).log2(); } } -} \ No newline at end of file +} diff --git a/src/lib.rs b/src/lib.rs index fd6c71ef4b..3cad8db12e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,13 @@ use pyo3::prelude::*; +mod alignmentsieve; mod bamcoverage; mod bamcompare; +mod calc; mod computematrix; mod covcalc; -mod alignmentsieve; mod filehandler; +mod multibamsummary; mod normalization; -mod calc; mod test; #[pymodule] @@ -15,5 +16,6 @@ fn hp(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(bamcompare::r_bamcompare, m)?)?; m.add_function(wrap_pyfunction!(computematrix::r_computematrix, m)?)?; m.add_function(wrap_pyfunction!(alignmentsieve::r_alignmentsieve, m)?)?; + m.add_function(wrap_pyfunction!(multibamsummary::r_mbams, m)?)?; Ok(()) } diff --git a/src/multibamsummary.rs b/src/multibamsummary.rs new file mode 100644 index 0000000000..6458656d46 --- /dev/null +++ b/src/multibamsummary.rs @@ -0,0 +1,179 @@ +use pyo3::prelude::*; +use pyo3::types::PyList; +use rayon::prelude::*; +use rayon::ThreadPoolBuilder; +use itertools::{multizip, multiunzip, izip}; +use std::io::prelude::*; +use std::io::BufReader; +use std::fs::File; +use ndarray::Array2; +use ndarray::Array1; +use std::io; +use crate::covcalc::{bam_pileup, parse_regions, Alignmentfilters}; +use crate::filehandler::{bam_ispaired}; +use crate::calc::{median, calc_ratio}; +use crate::bamcompare::ParsedBamFile; +use crate::normalization::scale_factor_bamcompare; + +#[pyfunction] +pub fn r_mbams( + // required parameters + mode: &str, // either bins or BED-file + bam_files: Py, + ofile: &str, + // additional output + out_raw_counts: &str, + scaling_factors: &str, + // optional parameters + labels: Py, + binsize: u32, + distance_between_bins: u32, + nproc: usize, + _bed_file: &str, + regions: Vec<(String, u32, u32)>, + _blacklist: &str, + _verbose: bool, + _extend_reads: u32, + _center_reads: bool, + sam_flag_incl: u16, // sam flag include + sam_flag_excl: u16, // sam flag exclude + min_fragment_length: u32, // minimum fragment length. + max_fragment_length: u32, // maximum fragment length. + min_mapping_quality: u8, // minimum mapping quality. + _keep_exons: bool, + _txnid: &str, // transcript id to use when parsing GTF file + _exonid: &str, // exon id to use when parsing GTF file + _txniddesignator: &str, // designator to use when parsing GTF file +) -> PyResult<()> { + let mut bamfiles: Vec = Vec::new(); + let mut bamlabels: Vec = Vec::new(); + let mut ignorechr: Vec = Vec::new(); + Python::with_gil(|py| { + bamfiles = bam_files.extract(py).expect("Failed to retrieve bam files."); + bamlabels = labels.extract(py).expect("Failed to retrieve labels."); + }); + + // Get paired-end information + let ispe = bamfiles.iter() + .map(|x| bam_ispaired(x)) + .collect::>(); + + // zip through ispe and bamfiles + for (_ispe, _bf) in ispe.iter().zip(bamfiles.iter()) { + println!("Sample: {} is-paired: {}", _bf, _ispe); + } + + let filters: Alignmentfilters = Alignmentfilters { + minmappingquality: min_mapping_quality, + samflaginclude: sam_flag_incl, + samflagexclude: sam_flag_excl, + minfraglen: min_fragment_length, + maxfraglen: max_fragment_length + }; + + let (regions, chromsizes) = parse_regions(®ions, bamfiles.get(0).unwrap()); + let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap(); + + + // Zip together bamfiles and ispe into a vec of tuples. + let bampfiles: Vec<_> = bamfiles.into_iter().zip(ispe.into_iter()).collect(); + + let covcalcs: Vec<_> = pool.install(|| { + bampfiles.par_iter() + .map(|(bamfile, ispe)| { + let (bg, mapped, unmapped, readlen, fraglen) = regions.par_iter() + .map(|i| bam_pileup(bamfile, &i, &binsize, &ispe, &ignorechr, &filters, false)) + .reduce( + || (vec![], 0, 0, vec![], vec![]), + |(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| { + _bg.extend(bg); + _readlen.extend(readlen); + _fraglen.extend(fraglen); + _mapped += mapped; + _unmapped += unmapped; + (_bg, _mapped, _unmapped, _readlen, _fraglen) + } + ); + bg + }) + .collect() + }); + + + // Scale factors (when needed) + // relevant python code. + // loggeomeans = np.sum(np.log(m), axis=1) / m.shape[1] + // # Mask after computing the geometric mean + // m = np.ma.masked_where(m <= 0, m) + // loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans) + // # DESeq2 ratio-based size factor + // sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0)) + // return 1. / sf + // create output file to write to + + // create output file to write lines into + let mut file = File::create(out_raw_counts).unwrap(); + + let its: Vec<_> = covcalcs.iter().map(|x| x.iter()).collect(); + let zips = TempZip { iterators: its }; + for c in zips { + // set up Readers + let readers: Vec<_> = c.iter().map(|x| BufReader::new(File::open(x).unwrap()).lines()).collect(); + + for mut _l in (TempZip { iterators: readers }) { + // unwrap all lines in _l + let lines: Vec<_> = _l + .iter_mut() + .map(|x| x.as_mut().unwrap()) + .map(|x| x.split('\t').collect()) + .map(|x: Vec<&str> | ( x[0].to_string(), x[1].parse::().unwrap(), x[2].parse::().unwrap(), x[3].parse::().unwrap() ) ) + .collect(); + // collect all fields 3 together + let mut line: Vec = Vec::new(); + line.push(Countline::Text(lines[0].0.clone())); + line.push(Countline::Int(lines[0].1)); + line.push(Countline::Int(lines[0].2)); + for _line in lines { + line.push(Countline::Float(_line.3)); + } + println!("{:?}", line); + let fline = line.iter().map(|x| x.to_string()).collect::>().join("\t"); + writeln!(file, "{}", fline).unwrap(); + } + } + + + + Ok(()) +} + +#[derive(Debug)] +enum Countline { + Int(u32), + Float(f32), + Text(String), +} + +impl Countline { + fn to_string(&self) -> String { + match self { + Countline::Int(i) => i.to_string(), + Countline::Float(f) => f.to_string(), + Countline::Text(t) => t.clone(), + } + } +} + +struct TempZip +where I: Iterator { + iterators: Vec +} + +impl Iterator for TempZip +where I: Iterator { + type Item = Vec; + fn next(&mut self) -> Option { + let o: Option> = self.iterators.iter_mut().map(|x| x.next()).collect(); + o + } +} \ No newline at end of file