### Cookbook: abba-baba introgression stats

The *ipyrad.analysis* Python module includes functions to calculate abba-baba admixture statistics, as well as several variants of these measures, and to perform signifance tests. The code in this notebook is all Python, which you can copy/paste into an IPython terminal to execute, or, preferably, run in a Jupyter notebook like this one. See the other analysis cookbooks for [instructions](http://ipyrad.readthedocs.io/analysis.html) on using Jupyter notebooks.

#### Testing muscle (please be faster)

In [14]:
import ipyrad as ip
from ipyrad.assemble.cluster_within import *
from ipyrad.assemble.write_outfiles import *

### Seems to be much faster
Test on a real clust.gz file before you spend time incorporating this.

In [15]:
def muscle_call(data, names, seqs):
    """
    Makes subprocess call to muscle. A little faster than before.
    TODO: Need to make sure this works on super large strings and does not
    overload the PIPE buffer.
    """

    ## make input string
    inputstr = "\n".join(["{}\n{}".format(i, j) for i, j in zip(names, seqs)])
    cmd = [ipyrad.bins.muscle, "-quiet"]

    ## increase gap penalty if reference region is included
    ## This could use more testing/refining!
    if "_REF;+0" in names:
        cmd += ["-gapopen", "-1200"]

    ## make a call arg
    proc1 = sps.Popen(cmd, stdin=sps.PIPE, stdout=sps.PIPE, close_fds=True)
    ## return result
    return proc1.communicate(inputstr)[0]


In [16]:
names = [">A", ">B", ">C", ">D"]

seqs = ["AAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGGG",
        "AAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGG",
        "AAAATTTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGG",
        "AAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGGG"]

inputstr = "\n".join(["{}\n{}".format(i, j) for i, j in zip(names, seqs)])

m = ipyrad.bins.muscle
sss = inputstr
sss
#! echo -c $inputstr # | m -

'>A\nAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGGG\n>B\nAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGG\n>C\nAAAATTTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGG\n>D\nAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGAAAATTTTCCCCGGGGGG'

In [17]:
%%timeit 
x = muscle_call("", names, seqs)

100 loops, best of 3: 7.43 ms per loop


In [18]:
#!/usr/bin/env python
from __future__ import print_function
import uuid
import random
from subprocess import Popen, PIPE, STDOUT
import time



start = time.time()

def newtrick():
    MARKER = str(uuid.uuid4())

    #shell_command = [ip.bins.muscle, '-quiet'
    p = Popen("sh", stdin=PIPE, stdout=PIPE, stderr=STDOUT,
              universal_newlines=True) # decode output as utf-8, newline is '\n'

    i = 0
    outs = []
    while True:
        # write next command
        shell_command = "echo '{}'".format(inputstr)# | {} - ".format(inputstr, ip.bins.muscle)
        #print(shell_command)
        print(shell_command, file=p.stdin)

        # insert MARKER into stdout to separate output from different shell_command
        print("echo '%s'" % MARKER, file=p.stdin)

        # read command output
        for line in iter(p.stdout.readline, MARKER+'\n'):
            if line.endswith(MARKER+'\n'):
                outs.append(line[:-len(MARKER)-1])
                break # command output ended without a newline
            outs.append(line)
        outs.append("\n")
            #print(line, end='')
        #print('\n')

        # exit on condition
        if i == 100:
            break
        else:
            i += 1
        
    # cleanup
    p.stdout.close()
    if p.stderr:
        p.stderr.close()
    p.stdin.close()
    p.wait()

In [19]:
start = time.time()
newtrick()
print(time.time() - start)




0.0197730064392


In [22]:
0.007 * 100

0.7000000000000001

### Back to baba

In [123]:
## start by loading several Python librarires
import ipyrad as ip
import ipyrad.analysis as ipa
import numpy as np
import pandas as pd

In [124]:
data = ip.load_json("cli/cli.json")

  loading Assembly: cli
  from saved path: ~/Documents/ipyrad/tests/cli/cli.json


In [125]:
data.outfiles

geno : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.geno
loci : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.loci
nexus : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.nex
phy : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.phy
snpsmap : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.snps.map
snpsphy : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.snps.phy
str : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.str
ugeno : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.u.geno
usnpsphy : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.u.snps.phy
ustr : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.ustr
vcf : /home/deren/Documents/ipyrad/tests/cli/cli_outfiles/cli.vcf


In [126]:
import pandas as pd
#pd.read_table(data.outfiles.vcf, comment="#")

## Make loci parsing script
This takes a .loci file and converts it to an array of the format:

(4 or 5, n-variable-loci, maxsnps)
e.g., (4, 1000, 50)


In [332]:
test = {
    'p1': ["1A_0", "1B_0", "1C_0"],
    'p2': ["1D_0"],
    'p3': ["2E_0", "2F_0"],
    'out': ["3L_0", "3J_0", "3K_0"], 
}

mindict = {
    'p1': 1,
    'p2': 1,
    'p3': 1, 
    'out': 1,
}

handle = data.outfiles.loci
ntips = 4

In [333]:

def loci_to_arr(locifile, test, mindict=None):

    ## read in the input file
    with open(locifile, 'r') as infile:
        loci = infile.read().strip().split("|\n")
        nloci = len(loci)

    ## get max loc length
    maxlen = 0
    for iloc in xrange(nloci):
        lines = loci[iloc].split("\n")[:-1]
        _maxl = len(lines[0]) 
        maxlen = max(maxlen, _maxl)

    ## make the array (4 or 5)
    arr = np.zeros((nloci, len(test), maxlen), dtype=np.float64)
    
    ## if not mindict, make one that requires 1 in each taxon
    if not mindict:
        mindict = {i:1 for i in test}

    ## grab seqs just for the good guys
    for loc in xrange(nloci):    

        ## parse the locus
        lines = loci[loc].split("\n")[:-1]
        names = [i.split()[0] for i in lines]
        seqs = np.array([list(i.split()[1]) for i in lines])

        ## check that names cover the test
        covs = [sum([j in names for j in test[tax]]) >= mindict[tax] for tax in test]
        if all(covs):
        
            ## get that refseq 
            ref = np.where([i in test['out'] for i in names])[0]
            refseq = seqs[ref].view(np.uint8)
            ancestral = np.array([reftrick(refseq, GETCONS)[:, 0]])

            ## and fill it in
            iseq = reffreq(ancestral, refseq, GETCONS)
            arr[loc, -1, :iseq.shape[1]] = iseq 

            ## fill each other tax freq in test
            keys = sorted([i for i in test.keys() if i[0] == 'p'])
            for tidx, key in enumerate(keys):

                ## get idx of names in test tax
                nidx = np.where([i in test[key] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                ## get freq of sidx
                iseq = reffreq(ancestral, sidx, GETCONS)
                ## fill it in 
                arr[loc, tidx, :iseq.shape[1]] = iseq

    ## size-down array to the number of loci that have taxa for the test
    return arr



In [334]:
arr = loci_to_arr(handle, test)
arr

array([[[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ]],

       [[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ]],

       [[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ]],

       ..., 
       [[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ]],

       [[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 0.5,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 

In [318]:
##np.sum(arr[:, 0] * (1.-arr[:, 1]) * arr[:, 2] * (1-arr[:, 3])
       
    
np.seterr(divide="ignore")
    
aa = arr[:10]

top = (aa[:, 0] * (1.-aa[:, 1]) * aa[:, 2] * (1.-aa[:, 3])) - \
((1.-aa[:, 0]) * (aa[:, 1]) * (aa[:, 2]) * (1.-aa[:, 3]))


bot = (aa[:, 0] * (1.-aa[:, 1]) * aa[:, 2] * (1.-aa[:, 3])) + \
((1.-aa[:, 0]) * (aa[:, 1]) * (aa[:, 2]) * (1.-aa[:, 3]))


In [338]:
top = (arr[:, 0]) * (1.-arr[:, 1]) * (arr[:, 2]) * (1.-arr[:, 3]) - \
(1.-arr[:, 0]) * (arr[:, 1]) * (arr[:, 2]) * (1.-arr[:, 3])

bot = (arr[:, 0]) * (1.-arr[:, 1]) * (arr[:, 2]) * (1.-arr[:, 3]) + \
(1.-arr[:, 0]) * (arr[:, 1]) * (arr[:, 2]) * (1.-arr[:, 3])


In [346]:
np.sum(bot != 0)

24

In [345]:
top.sum(), bot.sum()

(-0.13888888888888903, 3.6388888888888893)

In [341]:
np.sum(top) / np.sum(bot)

-0.038167938931297746

In [208]:
arr[[0, 2, 78, 3], :]

boot = np.random.randint(0, high=arr.shape[0], size=arr.shape[0])
arr[boot]

array([[[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ]],

       [[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ]],

       [[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  0.5, ...,  0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ]],

       ..., 
       [[ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 1. ,  1. ,  1. , ...,  0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
        [ 1.

In [441]:
me = seqs[ref].view(np.uint8)

In [442]:
#reftrick()
ancestral = reftrick(me, GETCONS)[:, 0]
ancestral

array([67, 84, 65, 71, 67, 71, 84, 84, 84, 84, 84, 84, 67, 71, 71, 67, 71,
       67, 65, 84, 65, 71, 67, 67, 84, 67, 67, 67, 65, 84, 65, 84, 67, 65,
       67, 67, 71, 65, 71, 84, 71, 67, 71, 67, 71, 65, 84, 71, 67, 71, 84,
       65, 65, 71, 84, 71, 71, 71, 65, 67, 65, 67, 67, 84, 67, 67, 67, 67,
       67, 67, 84, 65, 67, 71, 67, 71, 71, 67, 84, 67, 65], dtype=uint8)

In [436]:
refseq

array([['C', 'T', 'A', 'G', 'C', 'G', 'T', 'T', 'T', 'T', 'T', 'T', 'C',
        'G', 'G', 'C', 'G', 'C', 'A', 'T', 'A', 'G', 'C', 'C', 'T', 'C',
        'C', 'C', 'A', 'T', 'A', 'T', 'C', 'A', 'C', 'C', 'G', 'A', 'G',
        'T', 'G', 'C', 'G', 'C', 'G', 'A', 'T', 'G', 'C', 'G', 'T', 'A',
        'A', 'G', 'T', 'G', 'G', 'G', 'A', 'C', 'A', 'C', 'C', 'T', 'C',
        'C', 'C', 'C', 'C', 'C', 'T', 'A', 'C', 'G', 'C', 'G', 'G', 'C',
        'T', 'C', 'A'],
       ['C', 'T', 'A', 'G', 'C', 'G', 'T', 'T', 'T', 'T', 'T', 'T', 'C',
        'G', 'G', 'C', 'G', 'C', 'A', 'T', 'A', 'G', 'C', 'C', 'T', 'C',
        'C', 'C', 'A', 'T', 'A', 'T', 'C', 'A', 'C', 'C', 'G', 'A', 'G',
        'T', 'G', 'C', 'G', 'C', 'G', 'A', 'T', 'G', 'C', 'G', 'T', 'A',
        'A', 'G', 'T', 'G', 'G', 'G', 'A', 'C', 'A', 'C', 'C', 'T', 'C',
        'C', 'C', 'C', 'C', 'C', 'T', 'A', 'C', 'G', 'C', 'G', 'G', 'C',
        'T', 'C', 'A']], 
      dtype='|S1')

In [406]:
larr = np.array([list(i.split("|")[0]) for i in lines])
getem = np.where(np.all(larr == " ", axis=0))[0].max()
larr[:, getem:]

#np.where(larr[-1] == '-' +\
#larr[-1] == " "
         
    
## could subselect SNPs but then we still need to do it again after
## we subset the taxa, so instead let's just grab all sites.
## only drawback is speed and mem, a bit.
who = np.where(np.logical_or(larr[-1] == "-", larr[-1] == "*"))[0]
larr[:, who]

array([['T', 'G', 'G', 'T', 'C'],
       ['T', 'G', 'G', 'T', 'C'],
       ['T', 'G', 'G', 'T', 'C'],
       ['G', 'G', 'T', 'C', 'C'],
       ['T', 'G', 'G', 'T', 'M'],
       ['T', 'G', 'G', 'T', 'C'],
       ['T', 'G', 'G', 'T', 'C'],
       ['T', 'G', 'G', 'T', 'C'],
       ['T', 'G', 'G', 'T', 'C'],
       ['T', 'G', 'G', 'T', 'C'],
       ['T', 'G', 'G', 'T', 'C'],
       ['T', 'A', 'G', 'T', 'C'],
       ['-', '-', '-', '-', '-']], 
      dtype='|S1')

In [395]:
larr[:]

array([['1', 'A', '_', ..., 'T', 'C', 'A'],
       ['1', 'B', '_', ..., 'T', 'C', 'A'],
       ['1', 'C', '_', ..., 'T', 'C', 'A'],
       ..., 
       ['3', 'K', '_', ..., 'T', 'C', 'A'],
       ['3', 'L', '_', ..., 'T', 'C', 'A'],
       ['/', '/', ' ', ..., ' ', ' ', ' ']], 
      dtype='|S1')

In [287]:
for loc in xrange(10): #nloci):
    ## parse the locus
    lines = loci[loc].split("\n")#[:-1]
    names = [i.split()[0] for i in lines]
    seqs = np.array([list(i.split()[1]) for i in lines])
    snpidx = lines[-1]
    
    print(snpidx)
    print(np.array(list(lines)))

//                                                       *                                |0
[ '1A_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTATACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '1B_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTATACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '1C_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTATACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '1D_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTATACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '2E_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTACACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '2F_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTACACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '2G_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTATACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '2H_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTACACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '3I_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTATACGTGGCAGGACCTGTTGGAAAAACACGCAGA'
 '3J_0     TTAGTTCTTAGACTATTCGTTAACTCGAGGCGAGTGCCCTAAGCGCTAT

In [33]:
def _loci2loci(handle, taxonlist, maxlen=200):
    """
    Converts loci format data to a numpy array as floats of SNP props.
    """

    ## read in the input file
    with open(handle, 'r') as infile:
        loci = infile.read().strip().split("|\n")
        nloci = len(loci)

    ## count max snps 
    maxsnps = 0
    for iloc in xrange(nloci):
        lines = loci[iloc].split("//")[1].strip()
        _maxl = len([i for i in lines.split("|")[0].strip() if i != " "])
        maxsnps = max(maxsnps, _maxl)

    ## build array ({4or5}, nloci, maxsnps)
    arr = np.zeros((nloci, 4, maxsnps), dtype=np.float)
       
    ## get the outgroup allele (most common in outgroup)
    for loc in xrange(10): #nloci):
        ## parse the locus
        lines = loci[loc].split("\n")[:-1]
        names = [i.split()[0] for i in lines]
        seqs = np.array([list(i.split()[1]) for i in lines])
        snpidx = lines[-1]
        
        ## find most frequent allele among outgroups and call that the
        ## outgroup allele (A). How do we break ties? For simplicity, we'll
        ## consistently choose lowest base to break ties (e.g., A over C)
        arr[:, -1].fill(1)
        
        ## 
        #for otax in taxonlist[-1]:
        #    print seqs[names.index(otax)]
        
        #print(seqs)
        #print(names)
        #seqlen = seqs.shape[1]

        #taxi = sum([i in names for i in taxonlist])
        #taxc[iloc] += taxi        
        #print(loci[loc])

    return arr
    
    
#print maxlen
arr = _loci2loci(handle, test)
arr[0]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]])

In [None]:
            arr = np.zeros((5, maxlen), dtype=np.float64)
            ## find most frequent allele among outgroups and call that the
            ## outgroup allele (A). How do we break ties? For simplicity, we'll
            ## consistently choose lowest base to break ties (e.g., A over C)
            arr[-1].fill(1)

            ## fill fake data columns with 9s
            arr[:, seqlen-maxlen:].fill(9)

            ## get outgroup values
            outvals = seqs[names.index(taxonlist[4])]
            for itax in xrange(4):
                ## make 1s all sites that match to outg
                tmparr = np.int8(seqs[names.index(taxonlist[itax])] == outvals)
                ## make 9s all sites that have (N-RKSMW)
                #tmparr[
                arr[itax][:tmparr.shape[0]] = tmparr
            farr[iloc] = arr

    ## warn if no SNPs are found
    ## warn if no loci have sampling of all taxa

    #print(np.histogram(taxc, range(7)))
    ## return array that includes np.ones for loci w/o taxa
    return farr[taxc == len(taxonlist)], taxc

In [22]:
with open(handle, 'r') as infile:
    ## split on "//" for legacy compatibility
    loci = infile.read().strip().split("|\n")

## create emtpy array to fill
nloci = len(loci)
farr = np.ones((nloci, ntips, maxlen), dtype=np.float64)
taxc = np.zeros((nloci,))

## iterate over loci to find those which have taxon sampling
for iloc in xrange(nloci):
    lines = loci[iloc].split("//")[0].split()
    names = [i[1:] for i in lines[::2]]
    seqs = np.array([list(i) for i in lines[1::2]])

    taxi = sum([i in names for i in taxonlist])
    taxc[iloc] += taxi
    if taxi == len(taxonlist):
        arr = np.zeros((5, maxlen), dtype=np.float64)
print names
print seqs


NameError: name 'taxonlist' is not defined

In [17]:

    print seqs
    seqlen = seqs.shape[1]

    taxi = sum([i in names for i in taxonlist])
    taxc[iloc] += taxi
    if taxi == len(taxonlist):
        arr = np.zeros((5, maxlen), dtype=np.float64)
        ## find most frequent allele among outgroups and call that the
        ## outgroup allele (A). How do we break ties? For simplicity, we'll
        ## consistently choose lowest base to break ties (e.g., A over C)
        arr[-1].fill(1)

        ## fill fake data columns with 9s
        arr[:, seqlen-maxlen:].fill(9)

        ## get outgroup values
        outvals = seqs[names.index(taxonlist[4])]
        for itax in xrange(4):
            ## make 1s all sites that match to outg
            tmparr = np.int8(seqs[names.index(taxonlist[itax])] == outvals)
            ## make 9s all sites that have (N-RKSMW)
            #tmparr[
            arr[itax][:tmparr.shape[0]] = tmparr
        farr[iloc] = arr

[['T' 'T' 'A' 'G' 'T' 'T' 'C' 'T' 'T' 'A' 'G' 'A' 'C' 'T' 'A' 'T' 'T' 'C'
  'G' 'T' 'T' 'A' 'A' 'C' 'T' 'C' 'G' 'A' 'G' 'G' 'C' 'G' 'A' 'G' 'T' 'G'
  'C' 'C' 'C' 'T' 'A' 'A' 'G' 'C' 'G' 'C' 'T' 'A' 'T' 'A' 'C' 'G' 'T' 'G'
  'G' 'C' 'A' 'G' 'G' 'A' 'C' 'C' 'T' 'G' 'T' 'T' 'G' 'G' 'A' 'A' 'A' 'A'
  'A' 'C' 'A' 'C' 'G' 'C' 'A' 'G' 'A']
 ['T' 'T' 'A' 'G' 'T' 'T' 'C' 'T' 'T' 'A' 'G' 'A' 'C' 'T' 'A' 'T' 'T' 'C'
  'G' 'T' 'T' 'A' 'A' 'C' 'T' 'C' 'G' 'A' 'G' 'G' 'C' 'G' 'A' 'G' 'T' 'G'
  'C' 'C' 'C' 'T' 'A' 'A' 'G' 'C' 'G' 'C' 'T' 'A' 'T' 'A' 'C' 'G' 'T' 'G'
  'G' 'C' 'A' 'G' 'G' 'A' 'C' 'C' 'T' 'G' 'T' 'T' 'G' 'G' 'A' 'A' 'A' 'A'
  'A' 'C' 'A' 'C' 'G' 'C' 'A' 'G' 'A']
 ['T' 'T' 'A' 'G' 'T' 'T' 'C' 'T' 'T' 'A' 'G' 'A' 'C' 'T' 'A' 'T' 'T' 'C'
  'G' 'T' 'T' 'A' 'A' 'C' 'T' 'C' 'G' 'A' 'G' 'G' 'C' 'G' 'A' 'G' 'T' 'G'
  'C' 'C' 'C' 'T' 'A' 'A' 'G' 'C' 'G' 'C' 'T' 'A' 'T' 'A' 'C' 'G' 'T' 'G'
  'G' 'C' 'A' 'G' 'G' 'A' 'C' 'C' 'T' 'G' 'T' 'T' 'G' 'G' 'A' 'A' 'A' 'A'
  'A' 'C' 'A' 'C' 'G' 'C' 'A' 'G' 

NameError: name 'taxonlist' is not defined

In [6]:
farr.shape

(1000, 4, 200)

In [8]:
iloc = 0

## parse the loci file
names = lines[::2]
seqs = np.array([list(i) for i in lines[1::2]])
seqlen = seqs.shape[1]
print seqlen

## check for taxon coverage
taxi = [any([i in names for i in ptax]) for ptax in test]
print taxi

## group sequences into arr[arrs] 
sidx = [[names.index(i) for i in ptax] for ptax in test]
seqs = np.array([np.array([seqs[i] for i in isid]).view(np.int8) for isid in sidx])

NameError: name 'lines' is not defined

In [7]:
## choose test
if ntips == 4:
    p3 = [taxi[2]]
elif ntips == 5:
    p3 = [taxi[2], taxi[3]]
else:
    raise IPyradWarningExit("ntips can only be 4 or 5")   
print ntips


## filter for loci w/ sufficient coverage
suff = all([taxi[0], taxi[1], taxi[-1]]) & any(p3)
if suff:
    ## an array to fill
    arr = np.zeros((ntips, maxlen), dtype=np.float64)
    print arr.shape
    
    ## fill outgroup with most common base in outgroup
    ancestral = reftrick(seqs[-1], GETCONS)[:, 0]
    for i in xrange(ntips):
        arr[i][:seqlen] = np.sum(seqs[i] == ancestral, axis=0) / \
                                float(seqs[i].shape[0])
    print ancestral

NameError: name 'taxi' is not defined

In [29]:
import numba

#@numba.jit()
def reffreq(refseq, iseq, consdict):
    ## empty arrays
    freq = np.zeros((1, iseq.shape[1]), dtype=np.float64)
    amseq = np.zeros((iseq.shape[0]*2, iseq.shape[1]), dtype=np.uint8)
    
    ## fill in both copies
    for seq in xrange(iseq.shape[0]):
        for col in xrange(iseq.shape[1]):
            ## expand colums with ambigs and remove N-
            base = iseq[seq][col]
            who = np.where(consdict[:, 0] == base)[0]
            ## resolve heteros or enter into both copies
            if np.any(who):
                who = who[0]
                amseq[(seq*2)][col] = consdict[who][0]
                amseq[(seq*2)+1][col] = consdict[who][1]
            else:
                amseq[(seq*2)][col] = base
                amseq[(seq*2)+1][col] = base
    
    ## get as frequencies
    amseq = (refseq == amseq).astype(np.float64)
    for i in xrange(amseq.shape[0]):
        freq += amseq[i]
    return freq / np.float64(amseq.shape[0])


In [30]:
reffreq(ancestral, refseq, consdict)

NameError: name 'consdict' is not defined

In [466]:
refseq = np.array([list("AACCGGTT")]).view(np.uint8)
iseq = np.array([list("AACCGGGT"), list("AACCGCGT")]).view(np.uint8)
consdict = GETCONS

reffreq(refseq, iseq, consdict)

array([[ 1. ,  1. ,  1. ,  1. ,  1. ,  0.5,  0. ,  1. ]])

In [467]:
reffreq(refseq, iseq, consdict)

array([[ 1. ,  1. ,  1. ,  1. ,  1. ,  0.5,  0. ,  1. ]])

In [230]:
## iterate over loci to find those which have taxon sampling
for iloc in xrange(nloci):
    
    ## parse the loci file
    lines = loci[iloc].split("//")[0].split()
    names = lines[::2]
    seqs = np.array([list(i) for i in lines[1::2]])
    seqlen = seqs.shape[1]

    ## check for taxon coverage
    taxi = [any([i in names for i in ptax]) for ptax in test]
    
    ## group sequences into arr[arrs] 
    sidx = [[names.index(i) for i in ptax] for ptax in test]
    seqs = np.array([np.array([seqs[i] for i in isid]).view(np.int8) for isid in sidx])
    
    ## choose test
    if ntips == 4:
        p3 = [taxi[2]]
    elif ntips == 5:
        p3 = [taxi[2], taxi[3]]
    else:
        raise IPyradWarningExit("ntips can only be 4 or 5")
        
    ## filter for loci w/ sufficient coverage
    if all([taxi[0], taxi[1], taxi[-1]]) & any(p3):
        
        ## an array to fill
        arr = np.zeros((seqs.shape[0], maxlen), dtype=np.float64)
        
        ## fill outgroup with most common base in outgroup
        ancestral = reftrick(seqs[-1], GETCONS)[:, 0]
        for i in xrange(seqs.shape[0]):
            arr[i][:seqlen] = np.sum(seqs[i] == ancestral, axis=0) / \
                                 float(seqs[i].shape[0])

print arr

[[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
   1.  1.  1.  1.  1.  1.  

In [171]:
        
        arr[-1].fill(1)
        ## fill fake data columns with 9s
        arr[:, seqlen-maxlen:].fill(9)

        ## get outgroup values
        outvals = seqs[names.index(taxonlist[3])]
        for itax in xrange(4):
            ## make 1s all sites that match to outg
            tmparr = np.int8(seqs[names.index(taxonlist[itax])] == outvals)
            ## make 9s all sites that have (N-RKSMW)
            arr[itax][:tmparr.shape[0]] = tmparr
        farr[iloc] = arr

ValueError: could not broadcast input array from shape (2,81) into shape (2)

In [86]:
farr[taxc == len(taxonlist)]

array([[[1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9]],

       [[1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9]],

       [[1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9]],

       ..., 
       [[1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9]],

       [[0, 1, 1, ..., 9, 9, 9],
        [0, 1, 1, ..., 9, 9, 9],
        [0, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9]],

       [[1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..., 9, 9, 9],
        [1, 1, 1, ..