### Canarium bucky analysis

In [1]:
import numpy as np
import glob
import os
import subprocess
import ipyparallel
from collections import Counter


In [2]:
%%bash
## one output directory per hypothesis test (4, 5 and 6)
for hyp in 4 5 6; do
    mkdir -p "analysis_bucky/testlocs${hyp}/"
done

### Test for hypothesis 4

In [3]:
## Maps tip labels (keys) to the sample names (values) that are looked up
## in each locus. The keys become the taxon labels in the nexus matrix, and
## "out" is the label written into the MrBayes outgroup statement.
D4 = {"out": "D14269",
      "K1": "SF328",
      "K2": "D14483",
      "K3": "D14478",
      "K4": "SF224"}

### Test for hypothesis 5

In [4]:
## Tip label -> sample name map for the five-ingroup test (K1-K5 plus the
## "out" tip, which is the label used as the MrBayes outgroup).
D5 = {"out": "D14269",
      "K1": "SF328",
      "K2": "D14505",
      "K3": "D14478",
      "K4": "D14483",
      "K5": "SF224"}

### Test for hypothesis 6

In [5]:
## Tip label -> sample name map for the six-ingroup test (K1-K6 plus the
## "out" tip, which is the label used as the MrBayes outgroup).
D6 = {"out": "D14269",
      "K1": "SF328",
      "K2": "D14483",
      "K3": "D14478",
      "K4": "D14505",
      "K5": "D12950",
      "K6": "SF224"}

### Function to embed mrbayes block in nexus files

In [6]:
## Template for one locus: a nexus data block followed by a mrbayes block.
## The four {} slots are filled, in order, with ntax, nchar, the sequence
## matrix and the outgroup label.
## NOTE(review): the data block is not closed with "end;" before
## "begin mrbayes;". The runs recorded in this notebook produced BUCKy
## output with this exact template, so the text is left untouched --
## confirm before feeding these files to a stricter nexus parser.
NEXBLOCK = """\
#NEXUS
begin data;
dimensions ntax={} nchar={};
format datatype=dna interleave=yes gap=- missing=N;
matrix
{}
    ;

begin mrbayes;
set autoclose=yes nowarn=yes;
lset nst=6 rates=gamma;
outgroup {};
mcmc ngen=4000000 samplefreq=4000 printfreq=40000000;
sump burnin=1000000;
sumt burnin=1000000;
end;
"""

def nexmake(hdict, nloci, outgname, name):
    """
    Write one locus as a nexus file with an embedded mrbayes block.

    Parameters
    ----------
    hdict : dict
        Maps tip label -> sequence string. Only the first sequence is
        measured for nchar, so all sequences are assumed to be equally long.
    nloci : int
        Locus counter; it becomes the file name (<nloci>.nex).
    outgname : str
        Tip label written into the `outgroup` statement.
    name : str
        Suffix of the target directory analysis_bucky/testlocs<name>/,
        which must already exist.

    Raises
    ------
    IndexError
        If hdict is empty. This is raised before any file is created, so
        no empty .nex file is left behind.
    """
    ## measure the alignment before touching the disk; list() keeps this
    ## working where dict.values() returns a view that cannot be indexed
    seqs = list(hdict.values())
    ntax = len(seqs)
    nchar = len(seqs[0])

    ## create matrix as a string: label cut/padded to 10 chars, then sequence
    matrix = "".join("{:<10} {}\n".format(tip[:10], seq)
                     for tip, seq in hdict.items())

    ## write nexus block; the context manager closes the handle even if
    ## the write raises
    outpath = "analysis_bucky/testlocs{}/{}.nex".format(name, str(nloci))
    with open(outpath, 'w') as outloc:
        outloc.write(NEXBLOCK.format(ntax, nchar, matrix, outgname))


### Functions to count informative sites in a RAD locus

In [7]:
## parse it
## two-base ambiguity codes and the pair of bases each one stands for
AMBIGS = {"R": ("G", "A"),
          "K": ("G", "T"),
          "S": ("G", "C"),
          "Y": ("T", "C"),
          "W": ("T", "A"),
          "M": ("C", "A")}

def unstruct(amb):
    """
    Return the two bases encoded by ambiguity code `amb`.

    Any character that is not a key of AMBIGS (plain bases, "N", "-", ...)
    is returned paired with itself, so callers can always index [0] or [1].
    """
    return AMBIGS.get(amb, (amb, amb))
            

def resolveambig(subseq):
    """
    Randomly resolve ambiguity codes to one of their two bases.

    `subseq` is iterated row by row and each row character by character.
    One np.random.binomial(1, 0.5) draw is made for every character,
    ambiguous or not, so the result depends on the state of numpy's global
    random generator. Returns the resolved rows as a numpy array.
    """
    resolved = []
    for row in subseq:
        picks = []
        for base in row:
            ## fair coin flip selects the first or second base of the pair
            picks.append(unstruct(base)[np.random.binomial(1, 0.5)])
        resolved.append(picks)
    return np.array(resolved)
    
    
def newPIS(seqsamp):
    """
    Count parsimony-informative sites (PIS) in a samples-by-sites array.

    Columns containing "-" or "N" are not scored. A column counts as
    informative when its second most frequent character occurs more than
    once. Returns the count when it is at least 2, otherwise 0, so the
    result can be used directly as a keep/discard flag.
    """
    npis = 0
    for site in seqsamp.T:
        ## skip sites with any gap or missing call
        if "-" in site or "N" in site:
            continue
        top_two = Counter(site).most_common(2)
        if len(top_two) > 1 and top_two[1][1] > 1:
            npis += 1
    return npis if npis >= 2 else 0
    

### Function to parse RAD loci
Keeps only loci that contain every sample in a test and have at least two parsimony-informative sites among those samples.

In [8]:
def parseloci(loci, hdict, name):
    """
    Parse loci in the .loci file format produced by ipyrad v.0.3.* and
    write one nexus file for every locus that passes the filters.

    A locus is kept when it contains every sample named in hdict.values()
    and, after random resolution of ambiguity codes, newPIS() reports a
    non-zero count for those samples. The number of kept loci is printed.

    Parameters
    ----------
    loci : iterable of str
        One string per locus, lines separated by "\\n".
    hdict : dict
        Maps tip label -> sample name. The labels become the taxon names
        in the nexus file; the label "out" is used as the outgroup.
    name : str
        Passed through to nexmake() to select the output directory.
    """
    ## fix one ordering of labels and their samples up front so that rows
    ## and labels cannot get out of step
    tips = list(hdict.keys())
    samples = [hdict[tip] for tip in tips]

    ## keep track of how many loci pass
    nloci = 0

    for loc in loci:
        ## the last line of each locus is dropped; presumably it is the
        ## "//" SNP summary line of the .loci format rather than a sample
        dat = loc.split("\n")[:-1]
        names = [i.split()[0] for i in dat]
        seqs = np.array([list(i.split()[1]) for i in dat])

        ## skip loci that lack any of the required samples
        if not all(tax in names for tax in samples):
            continue

        ## subsample to the required samples, then resolve ambiguities
        seqsamp = seqs[[names.index(tax) for tax in samples]]
        seqsamp = resolveambig(seqsamp)
        if not newPIS(seqsamp):
            continue
        nloci += 1

        ## recode gaps as missing, then drop the columns that are missing
        ## in every selected sample
        seqsamp[seqsamp == "-"] = "N"
        rmcol = np.all(seqsamp == "N", axis=0)
        seqsamp = seqsamp[:, ~rmcol]

        ## write to a nexus file; "".join gives a text sequence for both
        ## byte-string and unicode character arrays, unlike tostring()
        matrix = dict(zip(tips, ["".join(row) for row in seqsamp]))
        nexmake(matrix, nloci, "out", name)

    ## same text as before, in a form valid on Python 2 and Python 3
    print("{} loci kept".format(nloci))


### Parse the loci for hypotheses H4, H5, and H6
Print the first and last locus, just for our edification.

In [16]:
## NOTE(review): absolute path on one user's machine -- point this at your
## own copy of the .loci file before re-running
inloci = "/Users/Sarah/Dropbox/Canarium_GBS/ipyrad/canarium_test/CanEnd_outfiles/CanEnd.loci"

## read the whole file and split it into one string per locus; the context
## manager closes the handle as soon as the text is in memory
with open(inloci) as locifile:
    loci = locifile.read().strip().split("|\n")

print(loci[0])
print(loci[-1])

5573        CGGCGAGNTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D12950      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D13052      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D13063      ------------CTGCAAATCTGGTTTGGGGNGTTGCTGAATTTCTGCCNTNTGGGTNGTTG
D13097      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D13103      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D13374      CGGCGRGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D14269      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTGG
D14477      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D14478      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D14480      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D14482      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D14483      CGGCGAGTTTTACTGCAAATCTGGTTTGGGGTGTTGCTGAATTTCTGCCATCTGGGTAGTTG
D14485      CGGCGAGTTTTAC

In [18]:
## reseed before each test: ambiguity codes are resolved with draws from
## numpy's global random generator, so the seed fixes which loci pass
np.random.seed(12345)
parseloci(loci, D4, "4")

NameError: name 'D4' is not defined

In [19]:
## same seed as the other tests so the random ambiguity resolution is repeatable
np.random.seed(12345)
parseloci(loci, D5, "5")

746 loci kept


In [12]:
## same seed as the other tests so the random ambiguity resolution is repeatable
np.random.seed(12345)
parseloci(loci, D6, "6")

558 loci kept


### Submit jobs to run in parallel across many cores
Each nexus file is submitted as a separate job to MrBayes 3.2.2.

In [5]:
## create a parallel client
## NOTE(review): presumably an ipyparallel cluster must already be running
## for Client() to connect -- confirm how the engines were started
ipyclient = ipyparallel.Client()
## view used below to hand jobs to the engines with lbview.apply
lbview = ipyclient.load_balanced_view()

In [10]:
## call function across all engines
def mrbayes(infile):
    """
    Run MrBayes (the `mb` executable) on one nexus file.

    Parameters
    ----------
    infile : str
        Path of the nexus file handed to `mb`.

    Raises
    ------
    subprocess.CalledProcessError
        If `mb` exits with a non-zero status.
    """
    ## imported inside the function because it is submitted with
    ## lbview.apply and runs on the engines, which presumably do not share
    ## the notebook's imports
    import subprocess
    ## argument list, no shell: the path is passed to mb verbatim, so
    ## directories with spaces or shell metacharacters cannot break the call
    subprocess.check_call(["mb", infile])
    

In [16]:
## submit all nexus files to run mb
## NOTE(review): this absolute path is not the relative analysis_bucky/
## directory the nexus files are written to earlier in the notebook --
## confirm both resolve to the same folder
allnex = glob.glob("/home/deren/Documents/Canarium/analysis_bucky/testlocs4/*.nex")
## keep what apply returns for every job (never inspected in this cell)
asyncs = []
for nex in allnex:
    asyncs.append(lbview.apply(mrbayes, nex))

## wait for the submitted tasks while reporting progress
ipyclient.wait_interactive()

 566/566 tasks finished after 4158 s
done


In [24]:
## submit all nexus files to run mb
## NOTE(review): the base directory differs from the one used for
## testlocs4 and testlocs6 -- confirm this is intended
allnex = glob.glob("/home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/*.nex")
## NOTE(review): the value returned by apply is discarded, so a failed mb
## run is presumably not reported here -- check that the .t files exist
for nex in allnex:
    lbview.apply(mrbayes, nex)

## wait for the submitted tasks while reporting progress
ipyclient.wait_interactive()

 746/746 tasks finished after 6359 s
done


In [20]:
## submit all nexus files to run mb
allnex = glob.glob("/home/deren/Documents/Canarium/analysis_bucky/testlocs6/*.nex")
## NOTE(review): the value returned by apply is discarded, so a failed mb
## run is presumably not reported here -- check that the .t files exist
for nex in allnex:
    lbview.apply(mrbayes, nex)

## wait for the submitted tasks while reporting progress
ipyclient.wait_interactive()

 558/558 tasks finished after 3689 s
done


### Run mbsum to summarize posteriors

In [29]:
def mbsum(nexdir, nloci):
    """
    Run `mbsum` once per locus to combine the trees of the two replicate
    MrBayes runs into a single <n>.in file.

    Parameters
    ----------
    nexdir : str
        Directory holding <n>.nex.run1.t and <n>.nex.run2.t; the <n>.in
        files are written next to them. A trailing slash is optional.
    nloci : int
        Loci are processed for n = 1 .. nloci.

    Raises
    ------
    subprocess.CalledProcessError
        If any `mbsum` call exits with a non-zero status.
    """
    import os
    import subprocess
    for n in range(1, nloci+1):
        ## os.path.join gives the same path with or without a trailing
        ## slash on nexdir; plain concatenation silently produced
        ## ".../testlocs41.in" when the slash was missing
        prefix = os.path.join(nexdir, str(n))
        ## NOTE(review): "-n 0" presumably tells mbsum to skip zero trees,
        ## i.e. no burn-in is removed at this step -- confirm
        cmd = ["mbsum", "-n", "0", "-o", prefix + ".in",
               prefix + ".nex.run1.t", prefix + ".nex.run2.t"]
        ## argument list, no shell: paths with spaces are passed verbatim
        subprocess.check_call(cmd)

In [30]:
## run mbsum
## one call per hypothesis test; the second argument is the number of
## nexus files for that test. Only the testlocs5 call is active here.
#mbsum("analysis_bucky/testlocs4/", 566)
mbsum("/home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/", 746)
#mbsum("analysis_bucky/testlocs6/", 558)

### Run bucky to infer concordance factors

In [2]:
def bucky(arg):
    """
    Run one complete bucky command line and raise if it exits non-zero.

    `arg` is handed to the shell as a single string; the commands built
    in this notebook end in a "*.in" glob that the shell has to expand.
    """
    from subprocess import check_call
    check_call(arg, shell=True)
    

In [3]:
## build one bucky command line per (directory, alpha) combination
args = []
for insdir in ["/home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/"]:
    ## resolve the directory once; it is used both as the output prefix
    ## and as the location of the *.in files
    realdir = os.path.realpath(insdir)
    ## alpha at three levels
    for alpha in [0.1, 1, 10]:
        args += ["bucky -a {} -k 4 -n 4000000 -c 4 -o {}/BUCKY.{} {}/*.in"
                 .format(alpha, realdir, alpha, realdir)]
args

['bucky -a 0.1 -k 4 -n 4000000 -c 4 -o /home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/BUCKY.0.1 /home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/*.in',
 'bucky -a 1 -k 4 -n 4000000 -c 4 -o /home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/BUCKY.1 /home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/*.in',
 'bucky -a 10 -k 4 -n 4000000 -c 4 -o /home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/BUCKY.10 /home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/*.in']

In [27]:
## build one bucky command line per (directory, alpha) combination
args = []
for insdir in ["./analysis_bucky/testlocs4",
               "./analysis_bucky/testlocs5",
               "./analysis_bucky/testlocs6"]:
    ## resolve the directory once; it is used both as the output prefix
    ## and as the location of the *.in files
    realdir = os.path.realpath(insdir)
    ## alpha at three levels
    for alpha in [0.1, 1, 10]:
        args += ["bucky -a {} -k 4 -n 4000000 -c 4 -o {}/BUCKY.{} {}/*.in"
                 .format(alpha, realdir, alpha, realdir)]
args

['bucky -a 0.1 -k 4 -n 4000000 -c 4 -o /home/deren/Documents/Canarium/analysis_bucky/testlocs4/BUCKY.0.1 /home/deren/Documents/Canarium/analysis_bucky/testlocs4/*.in',
 'bucky -a 1 -k 4 -n 4000000 -c 4 -o /home/deren/Documents/Canarium/analysis_bucky/testlocs4/BUCKY.1 /home/deren/Documents/Canarium/analysis_bucky/testlocs4/*.in',
 'bucky -a 10 -k 4 -n 4000000 -c 4 -o /home/deren/Documents/Canarium/analysis_bucky/testlocs4/BUCKY.10 /home/deren/Documents/Canarium/analysis_bucky/testlocs4/*.in',
 'bucky -a 0.1 -k 4 -n 4000000 -c 4 -o /home/deren/Documents/Canarium/analysis_bucky/testlocs5/BUCKY.0.1 /home/deren/Documents/Canarium/analysis_bucky/testlocs5/*.in',
 'bucky -a 1 -k 4 -n 4000000 -c 4 -o /home/deren/Documents/Canarium/analysis_bucky/testlocs5/BUCKY.1 /home/deren/Documents/Canarium/analysis_bucky/testlocs5/*.in',
 'bucky -a 10 -k 4 -n 4000000 -c 4 -o /home/deren/Documents/Canarium/analysis_bucky/testlocs5/BUCKY.10 /home/deren/Documents/Canarium/analysis_bucky/testlocs5/*.in',
 'bu

In [6]:
## run on parallel processors
## the result of every submission is kept in a list, as in the mrbayes
## submission cell; the previous variable name `async` is a reserved word
## from Python 3.7 on and kept only the last job's result
asyncs = []
for job in args:
    asyncs.append(lbview.apply(bucky, job))

## wait for the submitted tasks while reporting progress
ipyclient.wait_interactive()

   3/3 tasks finished after 26776 s
done


### View results

In [29]:
! head -n 35 analysis_bucky/testlocs4/BUCKY.1.concordance

translate
 1 K3,
 2 K2,
 3 K1,
 4 out,
 5 K4;

Population Tree:
((1,2),(3,4),5);

Primary Concordance Tree Topology:
((1,2),(3,4),5);

Population Tree, With Branch Lengths In Estimated Coalescent Units:
((1:10.000,2:10.000):0.543,(3:10.000,4:10.000):0.173,5:10.000);

Primary Concordance Tree with Sample Concordance Factors:
((1:1.000,2:1.000):0.553,(3:1.000,4:1.000):0.355,5:1.000);

Four-way partitions in the Population Tree: sample-wide CF, coalescent units and Ties(if present)
{1; 2|3,4; 5}	0.613, 0.543,  
{1,2; 5|3; 4}	0.439, 0.173,  

Splits in the Primary Concordance Tree: sample-wide and genome-wide mean CF (95% credibility), SD of mean sample-wide CF across runs
{1,2|3,4,5} 0.553(0.502,0.606) 0.552(0.486,0.618)	0.000
{1,2,5|3,4} 0.355(0.307,0.403) 0.355(0.294,0.417)	0.000

Splits NOT in the Primary Concordance Tree but with estimated CF > 0.050:
{1,2,4|3,5} 0.277(0.230,0.327) 0.277(0.218,0.339)	0.000
{1,5|2,3,4} 0.159(0.120,0.200) 0.159(0.112,0.211)	

In [28]:
! head -n 35 /home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/

head: error reading ‘/home/deren/Dropbox/Canarium_GBS/analysis_bucky/testlocs5/’: Is a directory


In [34]:
! head -n 35 analysis_bucky/testlocs5/BUCKY.1.concordance

translate
 1 K3,
 2 K2,
 3 K1,
 4 K5,
 5 K4,
 6 out;

Population Tree:
(((1,2),(4,5)),3,6);

Primary Concordance Tree Topology:
(((1,2),(4,5)),3,6);

Population Tree, With Branch Lengths In Estimated Coalescent Units:
(((1:10.000,2:10.000):0.581,(4:10.000,5:10.000):0.659):0.429,3:10.000,6:10.000);

Primary Concordance Tree with Sample Concordance Factors:
(((1:1.000,2:1.000):0.466,(4:1.000,5:1.000):0.569):0.416,3:1.000,6:1.000);

Four-way partitions in the Population Tree: sample-wide CF, coalescent units and Ties(if present)
{1; 2|3,6; 4,5}	0.627, 0.581,  
{1,2; 4,5|3; 6}	0.566, 0.429,  
{1,2; 3,6|4; 5}	0.655, 0.659,  

Splits in the Primary Concordance Tree: sample-wide and genome-wide mean CF (95% credibility), SD of mean sample-wide CF across runs
{1,2,3,6|4,5} 0.569(0.507,0.627) 0.568(0.492,0.642)	0.000
{1,2|3,4,5,6} 0.466(0.406,0.530) 0.466(0.391,0.543)	0.001
{1,2,4,5|3,6} 0.416(0.337,0.489) 0.415(0.327,0.500)	0.001

Splits NOT in the Primary Concord

In [33]:
! head -n 40 analysis_bucky/testlocs6/BUCKY.1.concordance

translate
 1 K3,
 2 K2,
 3 K1,
 4 K6,
 5 K5,
 6 K4,
 7 out;

Population Tree:
((((1,6),2),(4,5)),3,7);

Primary Concordance Tree Topology:
((((1,6),2),(4,5)),3,7);

Population Tree, With Branch Lengths In Estimated Coalescent Units:
((((1:10.000,6:10.000):0.072,2:10.000):0.647,(4:10.000,5:10.000):0.797):0.309,3:10.000,7:10.000);

Primary Concordance Tree with Sample Concordance Factors:
((((1:1.000,6:1.000):0.261,2:1.000):0.455,(4:1.000,5:1.000):0.634):0.366,3:1.000,7:1.000);

Four-way partitions in the Population Tree: sample-wide CF, coalescent units and Ties(if present)
{1,2,6; 4,5|3; 7}	0.510, 0.309,  
{1,2,6; 3,7|4; 5}	0.699, 0.797,  
{1,6; 2|3,7; 4,5}	0.651, 0.647,  
{1; 6|2; 3,4,5,7}	0.380, 0.072,  

Splits in the Primary Concordance Tree: sample-wide and genome-wide mean CF (95% credibility), SD of mean sample-wide CF across runs
{1,2,3,6,7|4,5} 0.634(0.566,0.699) 0.634(0.554,0.709)	0.003
{1,2,6|3,4,5,7} 0.455(0.362,0.539) 0.455(0.353,0.550)	0.009
