New 'bintest' command superseding cnv_ztest.py script

Report p-value as a column 'p_bintest' in the output. (Formerly 'ztest') - With segments as input (-s), splice significant bins into those segments and recalculate log2, weight, etc. - Without segments, return a .cnr of just the significant bins, as before.
etal · Mar 29, 2019 · 4deb0b0 · 4deb0b0
1 parent 74b20d1
commit 4deb0b0
Show file tree

Hide file tree

Showing 7 changed files with 147 additions and 136 deletions.
diff --git a/cnvlib/bintest.py b/cnvlib/bintest.py
@@ -0,0 +1,93 @@
+"""Z-test for single-bin copy number alterations."""
+import logging
+
+import numpy as np
+import pandas as pd
+from scipy.stats import norm
+
+from . import params, segfilters
+
+
+def do_bintest(cnarr, segments=None, alpha=0.005, target_only=False):
+    """Get a probability for each bin based on its Z-score.
+
+    Adds a column w/ p-values to the input .cnr.
+
+    Returns either (without `segments`) bins where the probability < `alpha`, or
+    (with `segments`) segments with those significant bin regions spiked in.
+    """
+    cnarr = cnarr.copy()
+    # Subtract segment means, if given, to report only the CNA bins that
+    # weren't already detected (including exon-size CNAs within a 
+    # larger-scale, smaller-amplitude CNA)
+    cnarr['log2'] = cnarr.residuals(segments)
+
+    if target_only:
+        antitarget_idx = cnarr['gene'].isin(params.ANTITARGET_ALIASES)
+        if antitarget_idx.any():
+            logging.info("Ignoring %d off-target bins", antitarget_idx.sum())
+            # NB: bins no longer match the original input
+            cnarr = cnarr[~antitarget_idx]
+
+    cnarr['p_bintest'] = z_prob(cnarr)
+    is_sig = cnarr['p_bintest'] < alpha
+    logging.info("Significant hits in {}/{} bins ({:.3g}%)"
+                 .format(is_sig.sum(), len(is_sig),
+                         100 * is_sig.sum() / len(is_sig)))
+
+    if segments:
+        if is_sig.any():
+            # Splice significant hits into the given segments
+            # NB: residuals() above ensures hits all occur within segments
+            cnarr['is_sig'] = is_sig
+            chunks = []
+            for segment, seghits in cnarr.by_ranges(segments, keep_empty=True):
+                if seghits['is_sig'].any():
+                    # Merge each run of adjacent non-significant bins within this
+                    # segment, leaving the significant hits as single-bin segments
+                    levels = seghits['is_sig'].cumsum() * seghits['is_sig']
+                    chunks.append(seghits.data
+                                  .assign(_levels=levels)
+                                  .groupby('_levels', sort=False)
+                                  .apply(segfilters.squash_region)
+                                  .reset_index(drop=True))
+                else:
+                    # Keep this segment as-is
+                    chunks.append(pd.DataFrame.from_records([segment],
+                                                            columns=segments.data.columns))
+            return cnarr.as_dataframe(pd.concat(chunks, sort=False))
+        else:
+            # Nothing to do
+            return segments
+    else:
+        # May be empty
+        hits = cnarr[is_sig]
+        return hits
+
+
+def z_prob(cnarr):
+    """Calculate z-test p-value at each bin."""
+    # Bin weights ~ 1-variance; bin log2 values already centered at 0.0
+    sd = np.sqrt(1 - cnarr['weight'])
+    # Convert to Z-scores
+    z = cnarr['log2'] / sd
+    # Two-sided survival function (1-CDF) probability
+    p = 2. * norm.cdf(-np.abs(z))
+    # Similar to the above -- which is better?
+    # p2 = 2 * norm.pdf(cnarr['log2'], loc=0, scale=sd)
+    # if not np.allclose(p, p2):
+    #     print("Max diff:", np.abs(p - p2).max())
+    #     print("Median diff:", np.median(np.abs(p - p2)))
+    #     print("Ratio:", (p / p2).mean())
+    # Correct for multiple hypothesis tests
+    return p_adjust_bh(p)
+
+
+def p_adjust_bh(p):
+    """Benjamini-Hochberg p-value correction for multiple hypothesis testing."""
+    p = np.asfarray(p)
+    by_descend = p.argsort()[::-1]
+    by_orig = by_descend.argsort()
+    steps = float(len(p)) / np.arange(len(p), 0, -1)
+    q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend]))
+    return q[by_orig]
diff --git a/cnvlib/commands.py b/cnvlib/commands.py
@@ -26,10 +26,10 @@
 from skgenome import tabio, GenomicArray as _GA
 from skgenome.rangelabel import to_label
 
-from . import (access, antitarget, autobin, batch, call, core, coverage,
-               diagram, export, fix, heatmap, import_rna, importers, metrics,
-               parallel, reference, reports, scatter, segmentation, segmetrics,
-               target)
+from . import (access, antitarget, autobin, batch, bintest, call, core,
+               coverage, diagram, export, fix, heatmap, import_rna, importers,
+               metrics, parallel, reference, reports, scatter, segmentation,
+               segmetrics, target)
 from .cmdutil import (load_het_snps, read_cna, verify_sample_sex,
                       write_tsv, write_text, write_dataframe)
 
@@ -1360,6 +1360,34 @@ def _cmd_segmetrics(args):
 P_segmetrics.set_defaults(func=_cmd_segmetrics)
 
 
+# bintest -----------------------------------------------------------------------
+
+do_bintest = public(bintest.do_bintest)
+
+def _cmd_bintest(args):
+    """Z-test for single-bin copy number alterations."""
+    cnarr = read_cna(args.cnarray)
+    segments = read_cna(args.segment) if args.segment else None
+    sig = do_bintest(cnarr, segments, args.alpha, args.target)
+    if len(sig):
+        tabio.write(sig, args.output or sys.stdout)
+
+
+P_bintest = AP_subparsers.add_parser('bintest', help=_cmd_bintest.__doc__)
+P_bintest.add_argument('cnarray',
+        help="Bin-level log2 ratios (.cnr file), as produced by 'fix'.")
+P_bintest.add_argument('-s', '--segment', metavar="FILENAME",
+        help="""Segmentation calls (.cns), the output of the
+                'segment' command).""")
+P_bintest.add_argument("-a", "--alpha", type=float, default=0.005,
+        help="Significance threhold. [Default: %(default)s]")
+P_bintest.add_argument("-t", "--target", action="store_true",
+        help="Test target bins only; ignore off-target bins.")
+P_bintest.add_argument("-o", "--output",
+        help="Output filename.")
+P_bintest.set_defaults(func=_cmd_bintest)
+
+
 # _____________________________________________________________________________
 # Other I/O and compatibility
 

diff --git a/cnvlib/segfilters.py b/cnvlib/segfilters.py
@@ -106,6 +106,9 @@ def squash_region(cnarr):
             else:
                 out['cn1'] = np.median(cnarr['cn1'])
             out['cn2'] = out['cn'] - out['cn1']
+    if 'p_bintest' in cnarr:
+        # Only relevant for single-bin segments, but this seems safe/conservative
+        out['p_bintest'] = cnarr['p_bintest'].max()
     return pd.DataFrame(out)
 
 

diff --git a/cnvlib/ztest.py b/cnvlib/ztest.py
diff --git a/scripts/cnv_ztest.py b/scripts/cnv_ztest.py
diff --git a/test/bintest.makefile b/test/bintest.makefile
@@ -0,0 +1,15 @@
+# bin-level testing
+# Pick up from clustering .cnr results
+
+cnr_of_interest=TR_101.clustered.cnr TR_77.clustered.cnr TR_55.clustered.cnr TR_42.clustered.cnr
+cns_of_interest=$(cnr_of_interest:.cnr=.cns)
+# TR_101.clustered.cnr TR_77.clustered.cnr TR_55.clustered.cnr TR_42.clustered.cnr
+out_cns=$(cnr_of_interest:.cnr=.bintest.cns)
+
+all: $(out_cns)
+
+$(out_cns): %.bintest.cns: %.cns %.cnr
+	cnvkit.py bintest -s $^ -t -o $@
+
+$(cns_of_interest): %.cns: %.cnr
+	cnvkit.py segment $< -o $@
diff --git a/test/test_cnvlib.py b/test/test_cnvlib.py
@@ -13,10 +13,10 @@
 
 import cnvlib
 # Import all modules as a smoke test
-from cnvlib import (access, antitarget, autobin, batch, cnary, commands, core,
-                    coverage, diagram, export, fix, import_rna, importers,
-                    metrics, params, plots, reference, reports, segmentation,
-                    segmetrics, smoothing, vary)
+from cnvlib import (access, antitarget, autobin, batch, bintest, cnary,
+                    commands, core, coverage, diagram, export, fix, import_rna,
+                    importers, metrics, params, plots, reference, reports,
+                    segmentation, segmetrics, smoothing, vary)
 
 
 class CNATests(unittest.TestCase):