### Canarium bucky analysis

In [1]:
## import Python libraries
import ipyparallel as ipp
import subprocess as sps
import ipyrad as ip
import glob
import os
import ipyrad.file_conversion as ifc


### Install software

In [2]:
## conda install -c ipyrad bucky
## conda install -c ipyrad ipyrad
## conda install -c BioBuilds mrbayes

### Cluster setup

In [3]:
## look for running ipcluster instance, and create load-balancer
ipyclient = ipp.Client()
lbview = ipyclient.load_balanced_view()
print "{} engines found".format(len(ipyclient))

40 engines found


In [4]:
%%px
## push imports to parallel engines
import subprocess as sps
import glob
import os

### Input/output organization

In [5]:
## enter the data file for your analysis here
LOCIFILE = "/home/deren/Documents/Canarium/analysis-ipyrad/Canarium_min4_outfiles/Canarium_min4.loci"
WORKDIR = "analysis-bucky"

In [6]:
## This ensures the file paths are Full Paths (not relative) 
LOCIFILE = os.path.realpath(LOCIFILE)
assert os.path.exists(LOCIFILE)
WORKDIR = os.path.realpath(WORKDIR)
print "infile is:", LOCIFILE
print "outdir is:", WORKDIR

infile is: /home/deren/Documents/Canarium/analysis-ipyrad/Canarium_min4_outfiles/Canarium_min4.loci
outdir is: /home/deren/Documents/Canarium/analysis-bucky


### Setup tests

In [7]:
## hypothesis for 4 species + outgroup
D4 = ["D14269",
      "SF328",
      "D12950",
      "D14478",
      "SF224",
     ]

## hypothesis for 5 species + outgroup
D5 = ["D14269",
      "SF328",
      "D12950",
      "D14478",
      "SF224", 
      "D14483",
     ]

## hypothesis for 6 species + outgroup
D6 = ["D14269",
      "SF328",
      "D14483",
      "D14478",
      "D14505",
      "D12950",
      "SF224"
     ]

In [11]:
## create nexus files for this data set
ifc.loci2multinex(name="D4", 
                  locifile=LOCIFILE, 
                  subsamples=D4, 
                  minSNPs=2, 
                  mcmc_burnin=1000000,
                  mcmc_ngen=4000000,
                  mcmc_sample_freq=4000,
                  outdir=WORKDIR)

ifc.loci2multinex(name="D5", 
                  locifile=LOCIFILE, 
                  subsamples=D5, 
                  minSNPs=2, 
                  mcmc_burnin=1000000,
                  mcmc_ngen=4000000,
                  mcmc_sample_freq=4000,
                  outdir=WORKDIR)

ifc.loci2multinex(name="D6", 
                  locifile=LOCIFILE, 
                  subsamples=D6, 
                  minSNPs=2, 
                  mcmc_burnin=1000000,
                  mcmc_ngen=4000000,
                  mcmc_sample_freq=4000,
                  outdir=WORKDIR)

wrote 199 nexus files to /home/deren/Documents/Canarium/analysis-bucky/bucky-D4
wrote 247 nexus files to /home/deren/Documents/Canarium/analysis-bucky/bucky-D5
wrote 282 nexus files to /home/deren/Documents/Canarium/analysis-bucky/bucky-D6


### An example NEXUS file

In [15]:
## get RUNDIR relative to WORKDIR to ensure it is a Full Path
RUNDIR4 = os.path.join(WORKDIR, "bucky-{}".format("D4"))
RUNDIR5 = os.path.join(WORKDIR, "bucky-{}".format("D5"))
RUNDIR6 = os.path.join(WORKDIR, "bucky-{}".format("D6"))

## print an example nexus file
with open(os.path.join(RUNDIR6, "1.nex")) as nex:
    print nex.read()

#NEXUS
begin data;
dimensions ntax=7 nchar=73;
format datatype=dna interleave=yes gap=- missing=N;
matrix
D14269  AAGCTTCCCAAAGGAAGGCATTTGATTGGCTCTCATGTCTCTTACGATATTGATGTTAGATTGGAAGGTGAAG
SF224   AAGCTTCCCAAGGGAAGGCATTTGATTGCCTCTCATGTCTCTTATGATATTGATGTTAGATTGGAAGGTGAAG
D12950  AAGCTTCCCAAGGGAAGGCATTTGATTGCCTCTCATGTCTCTTATGATATTGATGTTAGATTGGAAGGTGAAG
SF328   AAGCTTCCCAAAGGAAGGCATTTGATTGGCTCTCATGTCTCTTATGATATTGATGTTAGATTGGAAGGTGAAG
D14483  AAGCTTCCCAAAGGAAGGCATTTGATTGGCTCTCATGTCTCTTATGATATTGATGTTAGATTGGAAGGTGAAG
D14478  AAGCTTCCCAAAGGAAGGCATTTGATTGGCTCTCATGTCTCTTATGATATTGATGTTAGATTGGAAGGTGAAG
D14505  AAGCTTCCCAAAGGAAGGCATTTGATTGGCTCTCATGTCTCTTATGATATTGATGTTAGATTGGAAGGTGAAG

    ;

begin mrbayes;
set autoclose=yes nowarn=yes;
lset nst=6 rates=gamma;
mcmc ngen=4000000 samplefreq=4000 printfreq=4000000;
sump burnin=1000000;
sumt burnin=1000000;
end;



In [16]:
## get all nexus files for each data set
nexfiles4 = glob.glob(os.path.join(RUNDIR4, "*.nex"))
nexfiles5 = glob.glob(os.path.join(RUNDIR5, "*.nex"))
nexfiles6 = glob.glob(os.path.join(RUNDIR6, "*.nex"))

### Submit jobs to run in parallel across many cores
We are submitting the nexus file to MrBayes 3.2.2. 

In [17]:
def mrbayes(infile):
    ## double check file path
    infile = os.path.realpath(infile)
    if not os.path.exists(infile):
        raise Exception("infile not found; try using a fullpath")
        
    ## call mrbayes
    cmd = ['mb', infile]
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
    stdout = proc.communicate()
    
    ## check for errors
    if proc.returncode:
        return stdout

In [18]:
def mbsum(dirs):
    trees1 = glob.glob(os.path.join(dirs, "*.run1.t"))
    trees2 = glob.glob(os.path.join(dirs, "*.run2.t"))
    tidx = 0
    for tidx in xrange(len(trees1)):
        cmd = ["mbsum", 
               "-n", "0", 
               "-o", os.path.join(dirs, str(tidx))+".in", 
               trees1[tidx], 
               trees2[tidx]]
        proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
        proc.communicate()
    print "summed {} trees in: {}".format(tidx, dirs)

In [19]:
def bucky(outname, indir, alpha, nchains, nreps, niter):
    ## check paths
    if not os.path.exists(indir):
        raise Exception("infiles not found; try using a fullpath")
    
    ## call bucky 
    infiles = os.path.join(indir, "*.in")
    cmd = ["bucky", 
           "-a", str(alpha),
           "-c", str(nchains),
           "-k", str(nreps),
           "-n", str(int(niter)), 
           "-o", outname, 
           infiles]
    
    cmd = " ".join(cmd)
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, shell=True)
    stdout = proc.communicate()
    if proc.returncode:
        return " ".join(cmd), stdout

### Run mrbayes on all nexus file in parallel

In [21]:
## send jobs to the parallel engines
asyncs = []
for nexfile in nexfiles4 + nexfiles5 + nexfiles6:
    async = lbview.apply(mrbayes, nexfile)
    asyncs.append(async)

### Track progress of mrbayes runs

In [31]:
ready =  [i for i in asyncs if i.ready()]
failed = [i for i in ready if not i.successful()]

## print progress
print "mrbayes batch runs:"
print "{} jobs submitted".format(len(asyncs))
print "{} jobs finished".format(len(ready))

## print errors, if any.
if any(failed):
    print failed[0].exception()
    print failes[0].result()

mrbayes batch runs:
728 jobs submitted
728 jobs finished


In [32]:
## block progress until all mrbayes jobs are finished
ipyclient.wait()

True

### Run mbsum to summarize posteriors

In [33]:
## run mbsum on each directory of tree files
mbsum(RUNDIR4)
mbsum(RUNDIR5)
mbsum(RUNDIR6)

summed 198 trees in: /home/deren/Documents/Canarium/analysis-bucky/bucky-D4
summed 246 trees in: /home/deren/Documents/Canarium/analysis-bucky/bucky-D5
summed 281 trees in: /home/deren/Documents/Canarium/analysis-bucky/bucky-D6


### Run bucky to infer concordance factors

In [46]:
nchains = 4
nreps = 4
niter = 1e6
alphas = [0.1, 1, 10]

## submit jobs to run at several values of alpha
bsyncs = []
for rundir in [RUNDIR4, RUNDIR5, RUNDIR6]:
    for alpha in alphas:
        outname = os.path.join(rundir, "bucky-{}".format(alpha))
        args = (outname, rundir, alpha, nchains, nreps, niter)
        async = lbview.apply(bucky, *args)
        bsyncs.append(async)

### Track progress of Bucky runs

In [50]:
ready =  [i for i in bsyncs if i.ready()]
failed = [i for i in ready if not i.successful()]
print "bucky batch runs:"
print "{} jobs submitted".format(len(bsyncs))
print "{} jobs finished".format(len(ready))
if len(ready) == len(bsyncs):
    ## print errors, if any.
    if any(failed):
        print failed[0].exception()

bucky batch runs:
9 jobs submitted
9 jobs finished


In [51]:
ipyclient.wait()

True

### View results
alpha=1 results are below, the rest are in the github repo. 

In [52]:
! head -n 40 analysis-bucky/bucky-D4/bucky-1.concordance

translate
 1 D14269,
 2 D12950,
 3 SF224,
 4 SF328,
 5 D14478;

Population Tree:
((1,4),(2,3),5);

Primary Concordance Tree Topology:
((1,4),(2,3),5);

Population Tree, With Branch Lengths In Estimated Coalescent Units:
((1:10.000,4:10.000):0.733,(2:10.000,3:10.000):1.765,5:10.000);

Primary Concordance Tree with Sample Concordance Factors:
((1:1.000,4:1.000):0.656,(2:1.000,3:1.000):0.847,5:1.000);

Four-way partitions in the Population Tree: sample-wide CF, coalescent units and Ties(if present)
{1; 4|2,3; 5}	0.680, 0.733,  
{1,4; 5|2; 3}	0.886, 1.765,  

Splits in the Primary Concordance Tree: sample-wide and genome-wide mean CF (95% credibility), SD of mean sample-wide CF across runs
{1,4,5|2,3} 0.847(0.774,0.905) 0.844(0.754,0.918)	0.000
{1,4|2,3,5} 0.656(0.568,0.744) 0.653(0.542,0.760)	0.000

Splits NOT in the Primary Concordance Tree but with estimated CF > 0.050:
{1,2,3|4,5} 0.197(0.111,0.281) 0.197(0.100,0.302)	0.000
{1,2,5|3,4} 0.119(0.065,0.176) 0.

In [53]:
! head -n 40 analysis-bucky/bucky-D5/bucky-1.concordance

translate
 1 D14269,
 2 SF224,
 3 D12950,
 4 SF328,
 5 D14483,
 6 D14478;

Population Tree:
(((1,4),(2,3)),5,6);

Primary Concordance Tree Topology:
(((1,4),(2,3)),5,6);

Population Tree, With Branch Lengths In Estimated Coalescent Units:
(((1:10.000,4:10.000):1.172,(2:10.000,3:10.000):2.642):2.025,5:10.000,6:10.000);

Primary Concordance Tree with Sample Concordance Factors:
(((1:1.000,4:1.000):0.777,(2:1.000,3:1.000):0.938):0.853,5:1.000,6:1.000);

Four-way partitions in the Population Tree: sample-wide CF, coalescent units and Ties(if present)
{1; 4|2,3; 5,6}	0.793, 1.172,  
{1,4; 5,6|2; 3}	0.953, 2.642,  
{1,4; 2,3|5; 6}	0.912, 2.025,  

Splits in the Primary Concordance Tree: sample-wide and genome-wide mean CF (95% credibility), SD of mean sample-wide CF across runs
{1,4,5,6|2,3} 0.938(0.866,0.984) 0.934(0.853,0.985)	0.002
{1,2,3,4|5,6} 0.853(0.773,0.923) 0.850(0.758,0.929)	0.002
{1,4|2,3,5,6} 0.777(0.648,0.879) 0.774(0.633,0.886)	0.005

Splits NOT i

In [54]:
! head -n 40 analysis-bucky/bucky-D6/bucky-1.concordance

translate
 1 D14269,
 2 SF224,
 3 D12950,
 4 SF328,
 5 D14483,
 6 D14478,
 7 D14505;

Population Tree:
((((1,4),(2,3)),5),6,7);

Primary Concordance Tree Topology:
((((1,4),(2,3)),5),6,7);

Population Tree, With Branch Lengths In Estimated Coalescent Units:
((((1:10.000,4:10.000):0.607,(2:10.000,3:10.000):2.113):2.091,5:10.000):0.170,6:10.000,7:10.000);

Primary Concordance Tree with Sample Concordance Factors:
((((1:1.000,4:1.000):0.612,(2:1.000,3:1.000):0.887):0.857,5:1.000):0.415,6:1.000,7:1.000);

Four-way partitions in the Population Tree: sample-wide CF, coalescent units and Ties(if present)
{1,2,3,4; 5|6; 7}	0.438, 0.170,  
{1; 4|2,3; 5,6,7}	0.637, 0.607,  
{1,4; 5,6,7|2; 3}	0.919, 2.113,  
{1,4; 2,3|5; 6,7}	0.918, 2.091,  

Splits in the Primary Concordance Tree: sample-wide and genome-wide mean CF (95% credibility), SD of mean sample-wide CF across runs
{1,4,5,6,7|2,3} 0.887(0.830,0.936) 0.884(0.811,0.943)	0.006
{1,2,3,4|5,6,7} 0.857(0.777,0.922) 0