Skip to content

Commit

Permalink
New 'bintest' command superseding cnv_ztest.py script
Browse files Browse the repository at this point in the history
Report p-value as a column 'p_bintest' in the output. (Formerly 'ztest')

- With segments as input (-s), splice significant bins into those
  segments and recalculate log2, weight, etc.
- Without segments, return a .cnr of just the significant bins, as
  before.
  • Loading branch information
etal committed Mar 29, 2019
1 parent 74b20d1 commit 4deb0b0
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 136 deletions.
93 changes: 93 additions & 0 deletions cnvlib/bintest.py
@@ -0,0 +1,93 @@
"""Z-test for single-bin copy number alterations."""
import logging

import numpy as np
import pandas as pd
from scipy.stats import norm

from . import params, segfilters


def do_bintest(cnarr, segments=None, alpha=0.005, target_only=False):
    """Test each bin for a single-bin copy number alteration via its Z-score.

    Works on a copy of `cnarr`: the 'log2' column is replaced by the residuals
    from `segments` (see ``cnarr.residuals``), and a column 'p_bintest' holding
    the p-values computed by `z_prob` is added.

    Parameters
    ----------
    cnarr
        Bin-level log2 ratios (.cnr). The input object is not modified.
    segments : optional
        Segmentation calls (.cns). If given (and truthy), significant bins
        are spliced into these segments; otherwise only the significant bins
        are returned.
    alpha : float
        Significance threshold; a bin is a hit if its 'p_bintest' < `alpha`.
    target_only : bool
        If true, drop bins whose 'gene' is one of
        ``params.ANTITARGET_ALIASES`` before testing.

    Returns
    -------
    Without `segments`: the (possibly empty) subset of bins with
    'p_bintest' < `alpha`.
    With `segments`: if there are no significant bins, `segments` itself,
    unchanged; otherwise a new array in which every significant bin is its own
    single-bin row and each run of adjacent non-significant bins within a
    segment is merged by ``segfilters.squash_region``. Segments containing no
    significant bin are kept as-is.
    """
    cnarr = cnarr.copy()
    # Subtract segment means, if given, to report only the CNA bins that
    # weren't already detected (including exon-size CNAs within a
    # larger-scale, smaller-amplitude CNA)
    cnarr['log2'] = cnarr.residuals(segments)

    if target_only:
        antitarget_idx = cnarr['gene'].isin(params.ANTITARGET_ALIASES)
        if antitarget_idx.any():
            logging.info("Ignoring %d off-target bins", antitarget_idx.sum())
            # NB: bins no longer match the original input
            cnarr = cnarr[~antitarget_idx]

    cnarr['p_bintest'] = z_prob(cnarr)
    is_sig = cnarr['p_bintest'] < alpha
    n_sig = int(is_sig.sum())
    n_bins = len(is_sig)
    # Guard the percentage against an empty table (e.g. every bin was
    # off-target and `target_only` is set)
    pct_sig = 100.0 * n_sig / n_bins if n_bins else 0.0
    logging.info("Significant hits in {}/{} bins ({:.3g}%)"
                 .format(n_sig, n_bins, pct_sig))

    # `segments` is truth-tested, as before: None (or any falsy value) means
    # "just report the significant bins"
    if not segments:
        # May be empty
        return cnarr[is_sig]

    if not n_sig:
        # Nothing to splice in
        return segments

    # Splice significant hits into the given segments
    # NB: residuals() above ensures hits all occur within segments
    cnarr['is_sig'] = is_sig
    chunks = []
    for segment, seghits in cnarr.by_ranges(segments, keep_empty=True):
        sig = np.asarray(seghits['is_sig'], dtype=bool)
        if sig.any():
            # Merge each run of adjacent non-significant bins within this
            # segment, leaving the significant hits as single-bin segments.
            # A new group starts at every significant bin and at the bin
            # right after one, so non-significant runs separated by a hit get
            # distinct labels instead of being squashed into one region that
            # would span (and overlap) the hits between them.
            starts_group = sig.copy()
            starts_group[1:] |= sig[:-1]
            levels = starts_group.cumsum()
            chunks.append(seghits.data
                          .assign(_levels=levels)
                          .groupby('_levels', sort=False)
                          .apply(segfilters.squash_region)
                          .reset_index(drop=True))
        else:
            # Keep this segment as-is
            chunks.append(pd.DataFrame.from_records(
                [segment], columns=segments.data.columns))
    return cnarr.as_dataframe(pd.concat(chunks, sort=False))


def z_prob(cnarr):
    """Compute an adjusted, two-sided z-test p-value for every bin.

    Each bin's 'log2' value is scored against a zero-centered normal
    distribution whose variance is taken to be 1 - 'weight'. The raw
    p-values are then passed through `p_adjust_bh`, and that result is
    returned.
    """
    # Per-bin standard deviation, using bin weight ~ 1 - variance
    spread = np.sqrt(1 - cnarr['weight'])
    # Standardize; the log2 values are already centered at 0.0
    zscores = cnarr['log2'] / spread
    # Two-sided tail probability: twice the lower-tail mass at -|z|
    raw_p = 2. * norm.cdf(-np.abs(zscores))
    # Correct for testing many bins at once
    return p_adjust_bh(raw_p)


def p_adjust_bh(p):
    """Benjamini-Hochberg p-value correction for multiple hypothesis testing.

    Parameters
    ----------
    p : array-like
        One-dimensional sequence of raw p-values. NaN entries are allowed.

    Returns
    -------
    numpy.ndarray
        Adjusted p-values (q-values) in the same order as `p`, each capped at
        1. NaN inputs stay NaN and are not counted as tests, so one undefined
        p-value no longer turns every adjusted value into NaN.
    """
    # np.asfarray was removed in NumPy 2.0; this is the portable spelling
    p = np.asarray(p, dtype=float)
    q = np.full(p.shape, np.nan)
    valid = ~np.isnan(p)
    n_tests = int(valid.sum())
    if n_tests == 0:
        # Empty input, or nothing but NaNs: nothing to adjust
        return q
    p_valid = p[valid]
    # Walk from the largest p-value down, scaling the k-th smallest by n/k
    by_descend = p_valid.argsort()[::-1]
    # Inverse permutation, to restore the input order afterwards
    by_orig = by_descend.argsort()
    steps = float(n_tests) / np.arange(n_tests, 0, -1)
    # The running minimum enforces monotonicity of the adjusted values
    q_valid = np.minimum(1, np.minimum.accumulate(steps * p_valid[by_descend]))
    q[valid] = q_valid[by_orig]
    return q
36 changes: 32 additions & 4 deletions cnvlib/commands.py
Expand Up @@ -26,10 +26,10 @@
from skgenome import tabio, GenomicArray as _GA
from skgenome.rangelabel import to_label

from . import (access, antitarget, autobin, batch, call, core, coverage,
diagram, export, fix, heatmap, import_rna, importers, metrics,
parallel, reference, reports, scatter, segmentation, segmetrics,
target)
from . import (access, antitarget, autobin, batch, bintest, call, core,
coverage, diagram, export, fix, heatmap, import_rna, importers,
metrics, parallel, reference, reports, scatter, segmentation,
segmetrics, target)
from .cmdutil import (load_het_snps, read_cna, verify_sample_sex,
write_tsv, write_text, write_dataframe)

Expand Down Expand Up @@ -1360,6 +1360,34 @@ def _cmd_segmetrics(args):
P_segmetrics.set_defaults(func=_cmd_segmetrics)


# bintest -----------------------------------------------------------------------

# Bind the implementation from cnvlib.bintest to a module-level name here.
# NOTE(review): `public` is defined outside this excerpt -- presumably it marks
# the function as part of this module's public API; confirm at its definition.
do_bintest = public(bintest.do_bintest)

def _cmd_bintest(args):
    """Z-test for single-bin copy number alterations."""
    # NB: the docstring above doubles as the subcommand's --help summary
    bins = read_cna(args.cnarray)
    # The segment file is optional; without it only significant bins are
    # reported
    segs = None
    if args.segment:
        segs = read_cna(args.segment)
    hits = do_bintest(bins, segs, args.alpha, args.target)
    if len(hits) == 0:
        # No rows to report: write nothing at all
        return
    tabio.write(hits, args.output or sys.stdout)


# Command-line interface for 'bintest'; the summary shown in the subcommand
# list is the docstring of _cmd_bintest.
P_bintest = AP_subparsers.add_parser('bintest', help=_cmd_bintest.__doc__)
P_bintest.add_argument('cnarray',
        help="Bin-level log2 ratios (.cnr file), as produced by 'fix'.")
P_bintest.add_argument('-s', '--segment', metavar="FILENAME",
        help="""Segmentation calls (.cns), the output of the
                'segment' command.""")
P_bintest.add_argument("-a", "--alpha", type=float, default=0.005,
        help="Significance threshold. [Default: %(default)s]")
P_bintest.add_argument("-t", "--target", action="store_true",
        help="Test target bins only; ignore off-target bins.")
P_bintest.add_argument("-o", "--output",
        help="Output filename.")
P_bintest.set_defaults(func=_cmd_bintest)


# _____________________________________________________________________________
# Other I/O and compatibility

Expand Down
3 changes: 3 additions & 0 deletions cnvlib/segfilters.py
Expand Up @@ -106,6 +106,9 @@ def squash_region(cnarr):
else:
out['cn1'] = np.median(cnarr['cn1'])
out['cn2'] = out['cn'] - out['cn1']
if 'p_bintest' in cnarr:
# Only relevant for single-bin segments, but this seems safe/conservative
out['p_bintest'] = cnarr['p_bintest'].max()
return pd.DataFrame(out)


Expand Down
71 changes: 0 additions & 71 deletions cnvlib/ztest.py

This file was deleted.

57 changes: 0 additions & 57 deletions scripts/cnv_ztest.py

This file was deleted.

15 changes: 15 additions & 0 deletions test/bintest.makefile
@@ -0,0 +1,15 @@
# Bin-level testing
# Picks up from the clustering .cnr results: segments each .cnr of interest,
# then runs 'cnvkit.py bintest' on every (.cns, .cnr) pair.
# NOTE(review): recipe lines must start with a tab; the leading tabs appear to
# have been lost in this copy -- confirm against the real file.

# Bin-level inputs to test
cnr_of_interest=TR_101.clustered.cnr TR_77.clustered.cnr TR_55.clustered.cnr TR_42.clustered.cnr
# Segment files with the same basenames (.cnr -> .cns)
cns_of_interest=$(cnr_of_interest:.cnr=.cns)
# TR_101.clustered.cnr TR_77.clustered.cnr TR_55.clustered.cnr TR_42.clustered.cnr
# Final outputs, one per input (.cnr -> .bintest.cns)
out_cns=$(cnr_of_interest:.cnr=.bintest.cns)

all: $(out_cns)

# $^ expands to "<name>.cns <name>.cnr" (prerequisite order), so the .cns is
# consumed by -s and the .cnr is left as the positional argument.
$(out_cns): %.bintest.cns: %.cns %.cnr
cnvkit.py bintest -s $^ -t -o $@

# Segment each .cnr to produce the .cns needed above
$(cns_of_interest): %.cns: %.cnr
cnvkit.py segment $< -o $@
8 changes: 4 additions & 4 deletions test/test_cnvlib.py
Expand Up @@ -13,10 +13,10 @@

import cnvlib
# Import all modules as a smoke test
from cnvlib import (access, antitarget, autobin, batch, cnary, commands, core,
coverage, diagram, export, fix, import_rna, importers,
metrics, params, plots, reference, reports, segmentation,
segmetrics, smoothing, vary)
from cnvlib import (access, antitarget, autobin, batch, bintest, cnary,
commands, core, coverage, diagram, export, fix, import_rna,
importers, metrics, params, plots, reference, reports,
segmentation, segmetrics, smoothing, vary)


class CNATests(unittest.TestCase):
Expand Down

0 comments on commit 4deb0b0

Please sign in to comment.