diff --git a/cnvlib/bintest.py b/cnvlib/bintest.py new file mode 100644 index 00000000..60ed9f7d --- /dev/null +++ b/cnvlib/bintest.py @@ -0,0 +1,93 @@ +"""Z-test for single-bin copy number alterations.""" +import logging + +import numpy as np +import pandas as pd +from scipy.stats import norm + +from . import params, segfilters + + +def do_bintest(cnarr, segments=None, alpha=0.005, target_only=False): + """Get a probability for each bin based on its Z-score. + + Adds a column w/ p-values to the input .cnr. + + Returns either (without `segments`) bins where the probability < `alpha`, or + (with `segments`) segments with those significant bin regions spiked in. + """ + cnarr = cnarr.copy() + # Subtract segment means, if given, to report only the CNA bins that + # weren't already detected (including exon-size CNAs within a + # larger-scale, smaller-amplitude CNA) + cnarr['log2'] = cnarr.residuals(segments) + + if target_only: + antitarget_idx = cnarr['gene'].isin(params.ANTITARGET_ALIASES) + if antitarget_idx.any(): + logging.info("Ignoring %d off-target bins", antitarget_idx.sum()) + # NB: bins no longer match the original input + cnarr = cnarr[~antitarget_idx] + + cnarr['p_bintest'] = z_prob(cnarr) + is_sig = cnarr['p_bintest'] < alpha + logging.info("Significant hits in {}/{} bins ({:.3g}%)" + .format(is_sig.sum(), len(is_sig), + 100 * is_sig.sum() / len(is_sig))) + + if segments: + if is_sig.any(): + # Splice significant hits into the given segments + # NB: residuals() above ensures hits all occur within segments + cnarr['is_sig'] = is_sig + chunks = [] + for segment, seghits in cnarr.by_ranges(segments, keep_empty=True): + if seghits['is_sig'].any(): + # Merge each run of adjacent non-significant bins within this + # segment, leaving the significant hits as single-bin segments + levels = seghits['is_sig'].cumsum() * seghits['is_sig'] + chunks.append(seghits.data + .assign(_levels=levels) + .groupby('_levels', sort=False) + .apply(segfilters.squash_region) + .reset_index(drop=True)) + else: + # Keep this segment as-is + chunks.append(pd.DataFrame.from_records([segment], + columns=segments.data.columns)) + return cnarr.as_dataframe(pd.concat(chunks, sort=False)) + else: + # Nothing to do + return segments + else: + # May be empty + hits = cnarr[is_sig] + return hits + + +def z_prob(cnarr): + """Calculate z-test p-value at each bin.""" + # Bin weights ~ 1-variance; bin log2 values already centered at 0.0 + sd = np.sqrt(1 - cnarr['weight']) + # Convert to Z-scores + z = cnarr['log2'] / sd + # Two-sided survival function (1-CDF) probability + p = 2. * norm.cdf(-np.abs(z)) + # Similar to the above -- which is better? + # p2 = 2 * norm.pdf(cnarr['log2'], loc=0, scale=sd) + # if not np.allclose(p, p2): + # print("Max diff:", np.abs(p - p2).max()) + # print("Median diff:", np.median(np.abs(p - p2))) + # print("Ratio:", (p / p2).mean()) + # Correct for multiple hypothesis tests + return p_adjust_bh(p) + + +def p_adjust_bh(p): + """Benjamini-Hochberg p-value correction for multiple hypothesis testing.""" + p = np.asfarray(p) + by_descend = p.argsort()[::-1] + by_orig = by_descend.argsort() + steps = float(len(p)) / np.arange(len(p), 0, -1) + q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend])) + return q[by_orig] diff --git a/cnvlib/commands.py b/cnvlib/commands.py index edd1ba95..57384f05 100644 --- a/cnvlib/commands.py +++ b/cnvlib/commands.py @@ -26,10 +26,10 @@ from skgenome import tabio, GenomicArray as _GA from skgenome.rangelabel import to_label -from . import (access, antitarget, autobin, batch, call, core, coverage, - diagram, export, fix, heatmap, import_rna, importers, metrics, - parallel, reference, reports, scatter, segmentation, segmetrics, - target) +from . import (access, antitarget, autobin, batch, bintest, call, core, + coverage, diagram, export, fix, heatmap, import_rna, importers, + metrics, parallel, reference, reports, scatter, segmentation, + segmetrics, target) from .cmdutil import (load_het_snps, read_cna, verify_sample_sex, write_tsv, write_text, write_dataframe) @@ -1360,6 +1360,34 @@ def _cmd_segmetrics(args): P_segmetrics.set_defaults(func=_cmd_segmetrics) +# bintest ----------------------------------------------------------------------- + +do_bintest = public(bintest.do_bintest) + +def _cmd_bintest(args): + """Z-test for single-bin copy number alterations.""" + cnarr = read_cna(args.cnarray) + segments = read_cna(args.segment) if args.segment else None + sig = do_bintest(cnarr, segments, args.alpha, args.target) + if len(sig): + tabio.write(sig, args.output or sys.stdout) + + +P_bintest = AP_subparsers.add_parser('bintest', help=_cmd_bintest.__doc__) +P_bintest.add_argument('cnarray', + help="Bin-level log2 ratios (.cnr file), as produced by 'fix'.") +P_bintest.add_argument('-s', '--segment', metavar="FILENAME", + help="""Segmentation calls (.cns), the output of the + 'segment' command).""") +P_bintest.add_argument("-a", "--alpha", type=float, default=0.005, + help="Significance threhold. [Default: %(default)s]") +P_bintest.add_argument("-t", "--target", action="store_true", + help="Test target bins only; ignore off-target bins.") +P_bintest.add_argument("-o", "--output", + help="Output filename.") +P_bintest.set_defaults(func=_cmd_bintest) + + # _____________________________________________________________________________ # Other I/O and compatibility diff --git a/cnvlib/segfilters.py b/cnvlib/segfilters.py index 8a1d6610..ac32715f 100644 --- a/cnvlib/segfilters.py +++ b/cnvlib/segfilters.py @@ -106,6 +106,9 @@ def squash_region(cnarr): else: out['cn1'] = np.median(cnarr['cn1']) out['cn2'] = out['cn'] - out['cn1'] + if 'p_bintest' in cnarr: + # Only relevant for single-bin segments, but this seems safe/conservative + out['p_bintest'] = cnarr['p_bintest'].max() return pd.DataFrame(out) diff --git a/cnvlib/ztest.py b/cnvlib/ztest.py deleted file mode 100644 index 3357f141..00000000 --- a/cnvlib/ztest.py +++ /dev/null @@ -1,71 +0,0 @@ -"""Z-test for single-bin copy number alterations.""" -import logging - -import numpy as np -from scipy.stats import norm - -from . import params - - -def do_ztest(cnarr, segments=None, is_male_reference=False, - is_sample_female=None, alpha=0.005, target_only=False): - """Get a probability for each bin based on its Z-score. - - Return bins where the probability < `alpha`. - """ - if segments: - # Subtract segment means to report only the CNA bins that weren't - # already detected (including exon-size CNAs within a larger-scale, - # smaller-amplitude CNA) - cnarr['log2'] = cnarr.residuals(segments) - else: - # Account for non-diploid sex chromosomes - cnarr['log2'] -= cnarr.expect_flat_log2(is_male_reference) - if is_sample_female: - # chrX has same ploidy as autosomes; chrY is just unusable noise - cnarr = cnarr[cnarr.chromosome != cnarr._chr_y_label] - else: - # 1/2 #copies of each sex chromosome - is_chr_xy = cnarr.chromosome.isin((cnarr._chr_x_label, - cnarr._chr_y_label)) - cnarr[is_chr_xy, 'log2'] += 1.0 - - if target_only: - antitarget_idx = cnarr['gene'].isin(params.ANTITARGET_ALIASES) - if antitarget_idx.any(): - logging.info("Ignoring", antitarget_idx.sum(), "off-target bins") - cnarr = cnarr[~antitarget_idx] - - cnarr['ztest'] = z_prob(cnarr) - is_sig = cnarr['ztest'] < alpha - logging.info("Significant hits in {}/{} bins ({:.3g}%)" - .format(is_sig.sum(), len(is_sig), - 100 * is_sig.sum() / len(is_sig))) - return cnarr[is_sig] - - -def z_prob(cnarr): - # Bin weights ~ 1-variance; bin log2 values already centered at 0.0 - sd = np.sqrt(1 - cnarr['weight']) - # Convert to Z-scores - z = cnarr['log2'] / sd - # Two-sided survival function (1-CDF) probability - p = 2. * norm.cdf(-np.abs(z)) - # Similar to the above -- which is better? - # p2 = 2 * norm.pdf(cnarr['log2'], loc=0, scale=sd) - # if not np.allclose(p, p2): - # print("Max diff:", np.abs(p - p2).max()) - # print("Median diff:", np.median(np.abs(p - p2))) - # print("Ratio:", (p / p2).mean()) - # Correct for multiple hypothesis tests - return p_adjust_bh(p) - - -def p_adjust_bh(p): - """Benjamini-Hochberg p-value correction for multiple hypothesis testing.""" - p = np.asfarray(p) - by_descend = p.argsort()[::-1] - by_orig = by_descend.argsort() - steps = float(len(p)) / np.arange(len(p), 0, -1) - q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend])) - return q[by_orig] diff --git a/scripts/cnv_ztest.py b/scripts/cnv_ztest.py deleted file mode 100755 index a92bdbd4..00000000 --- a/scripts/cnv_ztest.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python -"""Z-test for single-bin copy number alterations.""" -import argparse -import logging -import sys - -import cnvlib -from cnvlib.cmdutil import verify_sample_sex -from cnvlib.ztest import do_ztest -from skgenome import tabio - -logging.basicConfig(level=logging.INFO, format="%(message)s") - - -def _cmd_ztest(args): - cnarr = cnvlib.read(args.cnarr) - if args.segment: - segments = cnvlib.read(args.segment) - is_sample_female = None - else: - segments = None - is_sample_female = verify_sample_sex(cnarr, args.sample_sex, - args.male_reference) - sig = do_ztest(cnarr, segments, args.male_reference, is_sample_female, - args.alpha, args.target) - if len(sig): - tabio.write(sig, args.output or sys.stdout) - - - -if __name__ == '__main__': - logging.basicConfig(level=logging.INFO, format="%(message)s") - - AP = argparse.ArgumentParser(description=__doc__) - AP.add_argument("cnarr", help=".cnr file") - AP.add_argument('-s', '--segment', metavar="FILENAME", - help="""Segmentation calls (.cns), the output of the - 'segment' command).""") - - AP.add_argument("-a", "--alpha", type=float, default=0.005, - help="Significance threhold. [Default: %(default)s]") - AP.add_argument("-t", "--target", action="store_true", - help="Test target bins only; ignore off-target bins.") - AP.add_argument('-y', '--male-reference', '--haploid-x-reference', - action='store_true', - help="""Assume inputs were normalized to a male reference - (i.e. female samples will have +1 log-coverage of chrX; - otherwise male samples would have -1 chrX).""") - AP.add_argument('-x', '--sample-sex', - choices=('m', 'y', 'male', 'Male', - 'f', 'x', 'female', 'Female'), - help="""Specify the sample's chromosomal sex as male or - female. (Otherwise guessed from X and Y coverage).""") - - AP.add_argument("-o", "--output", - help="Output filename.") - _cmd_ztest(AP.parse_args()) diff --git a/test/bintest.makefile b/test/bintest.makefile new file mode 100644 index 00000000..a605868e --- /dev/null +++ b/test/bintest.makefile @@ -0,0 +1,15 @@ +# bin-level testing +# Pick up from clustering .cnr results + +cnr_of_interest=TR_101.clustered.cnr TR_77.clustered.cnr TR_55.clustered.cnr TR_42.clustered.cnr +cns_of_interest=$(cnr_of_interest:.cnr=.cns) +# TR_101.clustered.cnr TR_77.clustered.cnr TR_55.clustered.cnr TR_42.clustered.cnr +out_cns=$(cnr_of_interest:.cnr=.bintest.cns) + +all: $(out_cns) + +$(out_cns): %.bintest.cns: %.cns %.cnr + cnvkit.py bintest -s $^ -t -o $@ + +$(cns_of_interest): %.cns: %.cnr + cnvkit.py segment $< -o $@ diff --git a/test/test_cnvlib.py b/test/test_cnvlib.py index 6abb3787..df95940a 100755 --- a/test/test_cnvlib.py +++ b/test/test_cnvlib.py @@ -13,10 +13,10 @@ import cnvlib # Import all modules as a smoke test -from cnvlib import (access, antitarget, autobin, batch, cnary, commands, core, - coverage, diagram, export, fix, import_rna, importers, - metrics, params, plots, reference, reports, segmentation, - segmetrics, smoothing, vary) +from cnvlib import (access, antitarget, autobin, batch, bintest, cnary, + commands, core, coverage, diagram, export, fix, import_rna, + importers, metrics, params, plots, reference, reports, + segmentation, segmetrics, smoothing, vary) class CNATests(unittest.TestCase):