In [92]:
import numpy as np
import ipyrad as ip
from collections import Counter

import scipy.stats
import scipy.misc
import itertools

In [154]:
## toy stack of seven reads x 13 sites: only the first read has a base ("T")
## at the fifth site, the other six reads carry "-" there
arrayed = np.array(
    list(map(list, ["AAAATnnnnGGGG"] + ["AAAA-nnnnGGGG"] * 6))
)

## matching consensus with the fifth site called "N"
consens = list("AAAANnnnnGGGG")

In [155]:
## show the toy stack and its consensus; the call form of print gives the
## same output on Python 2 and also parses on Python 3
print(arrayed)
print(consens)

[['A' 'A' 'A' 'A' 'T' 'n' 'n' 'n' 'n' 'G' 'G' 'G' 'G']
 ['A' 'A' 'A' 'A' '-' 'n' 'n' 'n' 'n' 'G' 'G' 'G' 'G']
 ['A' 'A' 'A' 'A' '-' 'n' 'n' 'n' 'n' 'G' 'G' 'G' 'G']
 ['A' 'A' 'A' 'A' '-' 'n' 'n' 'n' 'n' 'G' 'G' 'G' 'G']
 ['A' 'A' 'A' 'A' '-' 'n' 'n' 'n' 'n' 'G' 'G' 'G' 'G']
 ['A' 'A' 'A' 'A' '-' 'n' 'n' 'n' 'n' 'G' 'G' 'G' 'G']
 ['A' 'A' 'A' 'A' '-' 'n' 'n' 'n' 'n' 'G' 'G' 'G' 'G']]
['A', 'A', 'A', 'A', 'N', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G']


In [140]:
## load a saved ipyrad Assembly from its JSON file (path is relative to the
## working directory); `data` is the object handed to the callers below
data = ip.load_json("cli/cli.json")

  loading Assembly: cli
  from saved path: ~/Documents/ipyrad/tests/cli/cli.json


In [141]:
def basecall(site, data):
    """ Prepares the stack of reads at one site and makes a base call.

    Parameters
    ----------
    site : iterable of single-character calls (one column of the read stack).
    data : object whose ``paramsdict["mindepth_majrule"]`` gives the minimum
        depth (two most common bases combined) needed to attempt a call.

    Returns
    -------
    "N" when the site is empty or too shallow after removing "N" and "-",
    the single observed base when the site is deep (>10) and invariable,
    otherwise whatever ``basecaller`` returns for the two allele counts.
    """
    ## count em
    site = Counter(site)

    ## remove Ns and (-)s
    site.pop("N", None)
    site.pop("-", None)

    ## nothing left to call
    if not site:
        return "N"

    ## get the counts of the two most common alleles
    comms = site.most_common()
    base1 = comms[0][1]
    base2 = comms[1][1] if len(comms) > 1 else 0

    ## if site depth after removing Ns, (-s) and third bases is below limit
    bidepth = base1 + base2
    if bidepth < data.paramsdict["mindepth_majrule"]:
        return "N"

    ## if depth >= 500 reduce to 500 randomly sampled reads. numpy is
    ## imported as `np` in this file, so the bare `numpy` name used here
    ## before raised NameError for every high-depth site.
    if bidepth >= 500:
        randomsample = np.array(tuple("A"*base1 + "B"*base2))
        np.random.shuffle(randomsample)
        subsample = list(randomsample[:500])
        base1 = subsample.count("A")
        base2 = subsample.count("B")

    ## speedhack: make the base call using a method depending on depth
    ## if highdepth and invariable just call the only base
    if (bidepth > 10) and (not base2):
        return comms[0][0]

    ## but if variable (or shallow) then use basecaller
    return basecaller(data, site, base1, base2)

In [142]:
## display the toy stack that the callers below are run on
arrayed

array([['A', 'A', 'A', 'A', 'T', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'],
       ['A', 'A', 'A', 'A', '-', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'],
       ['A', 'A', 'A', 'A', '-', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'],
       ['A', 'A', 'A', 'A', '-', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'],
       ['A', 'A', 'A', 'A', '-', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'],
       ['A', 'A', 'A', 'A', '-', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'],
       ['A', 'A', 'A', 'A', '-', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G']], 
      dtype='|S1')

In [143]:
## take the second column (index 1) of the stack as a single example site
site = arrayed[:, 1]
site

array(['A', 'A', 'A', 'A', 'A', 'A', 'A'], 
      dtype='|S1')

In [144]:
def newcall(site, data):
    """ Prepares the stack of reads at one site and makes a base call.

    Same contract as ``basecall`` but, instead of randomly subsampling deep
    sites, it scales both allele counts down by a common integer divisor.

    Parameters
    ----------
    site : iterable of single-character calls (one column of the read stack).
    data : object whose ``paramsdict["mindepth_majrule"]`` gives the minimum
        depth (two most common bases combined) needed to attempt a call.

    Returns
    -------
    "N" when the site is empty or too shallow after removing "N" and "-",
    the single remaining base when the site is deep (>10) and the (scaled)
    second-allele count is zero, otherwise the result of ``basecaller``.
    """
    ## count em
    site = Counter(site)

    ## remove Ns and (-)s
    site.pop("N", None)
    site.pop("-", None)

    ## nothing left to call
    if not site:
        return "N"

    ## get the counts of the two most common alleles
    comms = site.most_common()
    base1 = comms[0][1]
    base2 = comms[1][1] if len(comms) > 1 else 0

    ## if site depth after removing Ns, (-s) and third bases is below limit
    bidepth = base1 + base2
    if bidepth < data.paramsdict["mindepth_majrule"]:
        return "N"

    ## if depth >= 500 divide both counts so their sum falls below 500.
    ## The divisor is derived from the combined depth: deriving it from
    ## base1 alone gave 0 (ZeroDivisionError) whenever base1 < 500 <= bidepth
    ## and left sums of up to 999 unreduced. A very rare second allele can
    ## floor to zero here, in which case the site is called as invariable.
    if bidepth >= 500:
        divisor = bidepth // 500 + 1
        base1 //= divisor
        base2 //= divisor

    ## speedhack: make the base call using a method depending on depth
    ## if highdepth and invariable just call the only base
    if (bidepth > 10) and (not base2):
        return comms[0][0]

    ## but if variable (or shallow) then use basecaller
    return basecaller(data, site, base1, base2)

In [100]:
def basecaller(data, site, base1, base2):
    """ Inputs allele counts to the appropriate caller and orients alleles.

    Parameters
    ----------
    data : object providing ``paramsdict["mindepth_statistical"]``,
        ``paramsdict["mindepth_majrule"]`` and, for statistical calls,
        ``_este`` and ``_esth`` (passed to ``binomprobr`` as its error and
        het arguments).
    site : Counter of the bases observed at the site.
    base1, base2 : counts of the most common and second most common base.

    Returns
    -------
    The most common base for a homozygous call, the result of ``hetero`` on
    the two most common bases for a heterozygous call, or "N" when the call
    probability is below 0.95 or the depth is below both mindepth settings.
    """
    depth = base1 + base2

    ## make statistical base call
    if depth >= data.paramsdict["mindepth_statistical"]:
        prob, _, who = binomprobr(base1, base2, data._este, data._esth)

    ## or a majority-rule call at lower depth
    elif depth >= data.paramsdict["mindepth_majrule"]:
        prob, _, who = simpleconsensus(base1, base2)

    ## depth is below both thresholds: report it and leave the site uncalled
    ## instead of falling through to `float(prob)` with `prob` never assigned.
    ## The stdlib logger is used directly so this branch does not depend on
    ## a module-level LOGGER being defined.
    else:
        import logging
        logging.getLogger(__name__).error("gap in mindepth settings")
        return "N"

    ## the base could not be called with 95% probability
    if float(prob) < 0.95:
        return "N"

    ## site is homozygous
    if who != "ab":
        return site.most_common(1)[0][0]

    ## site is heterozygous
    return hetero(*[i[0] for i in site.most_common(2)])

In [101]:

def simpleconsensus(base1, base2):
    """
    Majority consensus calling for sites with too low of coverage for
    statistical calling. The call is always homozygous for the most common
    base and is always reported with probability 1.0.

    Parameters
    ----------
    base1, base2 : counts of the most common and second most common base.

    Returns
    -------
    [1.0, maf, 'aa'] where maf is base1 as a fraction of base1+base2.
    """
    #qQn = ['aa','bb','ab']
    ## force true division: with integer counts on Python 2 the plain "/"
    ## floors the frequency to 0 or 1
    maf = base1 / float(base1 + base2)
    return [1.0, maf, 'aa']


In [102]:
def binomprobr(base1, base2, error, het):
    """
    Given that two bases are observed at a site with counts base1 and base2,
    and the error rate, the probability that the site is truly aa, bb or ab
    is calculated using the binomial distribution as in Li et al. 2009, 2011.
    The callers in this file reduce the counts of very deep sites (>= 500)
    before calling this function.

    Parameters
    ----------
    base1, base2 : counts of the most common and second most common base.
    error : sequencing error rate used for the homozygous likelihoods.
    het : heterozygosity, used as the prior of the 'ab' genotype; each
        homozygous genotype gets prior (1 - het) / 2.

    Returns
    -------
    [bestprob, mjaf, genotype]: the normalized probability of the best
    genotype, the major allele frequency base1/(base1+base2), and the best
    genotype as one of 'aa', 'bb', 'ab' (first one wins on ties).
    """
    depth = base1 + base2

    ## major allele freq
    mjaf = base1 / float(depth)
    prior_homo = (1. - het) / 2.
    prior_het = het

    ## get likelihoods. Note, b/c only allow two bases, base2 == sum-base1.
    ## The heterozygote term comb(n, k) / 2**n is exactly the binomial pmf
    ## with p=0.5; computing it with scipy.stats avoids scipy.misc.comb,
    ## which no longer exists in current SciPy releases.
    hetro = scipy.stats.binom.pmf(base1, depth, 0.5)
    homoa = scipy.stats.binom.pmf(base2, depth, error)
    homob = scipy.stats.binom.pmf(base1, depth, error)

    ## weight likelihoods by priors
    homoa *= prior_homo
    homob *= prior_homo
    hetro *= prior_het

    ## pick the best genotype and normalize its probability
    probabilities = [homoa, homob, hetro]
    genotypes = ['aa', 'bb', 'ab']
    best = max(probabilities)
    bestprob = best / float(sum(probabilities))

    return [bestprob, mjaf, genotypes[probabilities.index(best)]]

In [103]:
def removerepeats(consens, arrayed):
    """ Trims terminal Ns from the consensus and drops interior N-called
    sites whose column holds many N or - characters.

    "-" in the consensus is treated as "N". Leading and trailing Ns are
    stripped and the same columns are cut from `arrayed`. An interior site
    called "N" is then removed (from both outputs) when the number of "N"
    plus "-" characters in its column is greater than max(1, nreads // 3).

    Returns the filtered consensus as a 0-d numpy string array together with
    the filtered stack. Raises ValueError when no site survives.
    """

    ## treat indels as Ns, then measure how much each strip removes
    joined = "".join(consens).replace("-", "N")
    left_trimmed = joined.lstrip("N")
    start = len(joined) - len(left_trimmed)
    core = left_trimmed.rstrip("N")
    ntrail = len(left_trimmed) - len(core)

    ## cut the same edge columns from the stack (None keeps the right edge)
    stop = -ntrail if ntrail else None
    arrayed = arrayed[:, start:stop]

    ## threshold derived from the number of reads in the stack
    mindepth = max(1, arrayed.shape[0] // 3)

    ## per-column count of N and - characters
    missing = np.sum(arrayed == '-', axis=0) + np.sum(arrayed == 'N', axis=0)

    ## N-called sites whose column exceeds the threshold get dropped
    dropped = set(
        idx for (idx, base) in enumerate(core)
        if base == "N" and missing[idx] > mindepth
    )

    ## If nothing is kept the unpacking below raises a ValueError which
    ## consensus() will catch and then pass over this sample.
    kept_idx, kept_bases = zip(
        *[pair for pair in enumerate(core) if pair[0] not in dropped]
    )

    return np.array("".join(kept_bases)), arrayed[:, list(kept_idx)]

In [152]:
def remove_repeats(consens, arrayed, min_missing=0.75):
    """ Trims terminal Ns from the consensus and removes interior N-called
    sites whose column is mostly N or - characters.

    "-" in the consensus is treated as "N". Leading and trailing Ns are
    stripped and the same columns are cut from `arrayed`. A remaining site
    is removed from both outputs when it is called "N" in the consensus AND
    the proportion of reads with "N" or "-" in its column is >= min_missing.

    Parameters
    ----------
    consens : iterable of single-character consensus calls.
    arrayed : 2-d character array (reads x sites) aligned with `consens`.
    min_missing : proportion of N/- reads at or above which an N-called site
        is removed. Defaults to 0.75, the value previously hard-coded.

    Returns
    -------
    (consens, arrayed) : the filtered consensus as a 1-d character array and
    the filtered stack.
    """

    ## treat indels as Ns, then measure how much each strip removes
    consens = "".join(consens).replace("-", "N")
    ntotal = len(consens)
    consens = consens.lstrip("N")
    start = ntotal - len(consens)

    nleft = len(consens)
    consens = consens.rstrip("N")
    ntrail = nleft - len(consens)

    ## trim same from arrayed (None keeps the right edge when nothing trails)
    stop = -ntrail if ntrail else None
    arrayed = arrayed[:, start:stop]

    ## explicit string dtype so an empty consensus still compares elementwise
    consens = np.array(list(consens), dtype=str)

    ## get column counts of Ns and -s
    missing = np.sum(arrayed == 'N', axis=0) + np.sum(arrayed == '-', axis=0)

    ## boolean: is the proportion of N- reads at the site at/over the limit
    nons = (missing / float(arrayed.shape[0])) >= min_missing
    ## boolean: was the site called N
    isn = consens == "N"
    ## remove only sites where both hold
    ridx = nons & isn

    ## apply filter
    return consens[~ridx], arrayed[:, ~ridx]

In [134]:
## boolean masks multiply elementwise, which acts as a logical AND.
## The builtin `bool` is used as dtype because the `np.bool` alias is
## deprecated/removed in newer NumPy; print() works on Python 2 and 3.
a = np.array([0, 1, 1, 0], dtype=bool)
b = np.array([1, 1, 0, 0], dtype=bool)
print(a)
print(b)
print(a*b)

[False  True  True False]
[ True  True False False]
[False  True False False]


In [135]:
## per-column counts of the characters "n" and "i" in the stack
## NOTE(review): the filter functions in this notebook count "N" and "-";
## the lowercase "n" / "i" used here look like a scratch demo — confirm
ndepths = np.sum(arrayed=="n", axis=0)
idepths = np.sum(arrayed=="i", axis=0)

## proportion of reads carrying either character at each column, followed by
## a boolean mask of the columns where that proportion exceeds 0.90
nons = (ndepths+idepths) / float(arrayed.shape[0])
nons > 0.90

array([False, False, False, False, False,  True,  True,  True,  True,
       False, False, False, False], dtype=bool)

In [157]:
## run the vectorized filter on the toy data and show the filtered consensus
rconsens, rarray = remove_repeats(consens, arrayed)
rconsens

array(['A', 'A', 'A', 'A', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'], 
      dtype='|S1')

In [161]:
## number of sites still called "N" in the filtered consensus
rconsens[rconsens=="N"].size

0

In [52]:
%%timeit
## time the loop-based removerepeats on the toy stack
removerepeats(consens, arrayed)

The slowest run took 4.56 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 63.8 µs per loop


In [145]:
## store the means of data.stats.error_est and data.stats.hetero_est on the
## Assembly; basecaller passes data._este and data._esth on to binomprobr
data._este = data.stats.error_est.mean()
data._esth = data.stats.hetero_est.mean()

In [54]:
%%timeit 
## time basecall applied to every column (axis 0) of the toy stack
np.apply_along_axis(basecall, 0, arrayed, data)

100 loops, best of 3: 4.26 ms per loop


In [55]:
%%timeit 
## time newcall applied to every column (axis 0) of the toy stack
np.apply_along_axis(newcall, 0, arrayed, data)

100 loops, best of 3: 3.88 ms per loop


In [146]:
## call every column of the stack with newcall. Note this rebinds `consens`
## from the hand-written list defined with the toy data to a numpy array
consens = np.apply_along_axis(newcall, 0, arrayed, data)
consens

array(['A', 'A', 'A', 'A', 'N', 'n', 'n', 'n', 'n', 'G', 'G', 'G', 'G'], 
      dtype='|S1')

In [148]:
%%timeit 
## indices of consensus sites whose call is one of the characters in
## "RKSYWM", found with a plain-Python scan
[i for (i,j) in enumerate(consens) if j in "RKSYWM"]

The slowest run took 5.00 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 3 µs per loop


In [150]:
%%timeit 
## alternative lookup: one np.where(...) result per character of "RKSYWM"
[np.where(consens==i) for i in "RKSYWM"]

100000 loops, best of 3: 17.3 µs per loop
