# Species delimitation in Malagasy Canarium using iBPP

This notebook is an empirical application of ibpp for species delimitation using GBS data assembled in ipyrad. We use the ipyrad utility function `loci2bpp` to programmatically set up a range of tests and to deploy them in parallel. 

### Information about this notebook
This is a jupyter notebook. All code in this notebook is Python. You should be able to download and execute this notebook and reproduce all of our results. This notebook along with other notebooks and data files are hosted on github: https://github.com/sarahfederman/Canarium-GBS/

### Import Python libraries

In [1]:
import ipyrad as ip
import ipyparallel as ipp
import pandas as pd
import random
import socket
import ete3
import sys
import os

## report the ipyrad version this analysis was run with
print("ipyrad v.{}".format(ip.__version__))

ipyrad v.0.5.10


### Create a directory to store results files in

In [2]:
## directory passed as `wdir` to loci2bpp; create it on first use
WDIR = "./analysis_bpp"
if os.path.exists(WDIR) is False:
    os.mkdir(WDIR)

### Setup an ipyparallel cluster connection

In [7]:
## connect to the ipyparallel cluster and get a load-balanced view for submitting jobs
rc = ipp.Client()
lbview = rc.load_balanced_view()

## ask every engine for its hostname, then report how many engines share each host
info = rc[:].apply(socket.gethostname)
hostnames = info.result_dict.values()
for host in set(hostnames):
    print("compute node: [{} cores] on {}".format(hostnames.count(host), host))

compute node: [4 cores] on oud


### The input data

In [8]:
## download the .loci file (TODO: replace the dropbox link with the zenodo link) and save its path
#! curl -LkO https://dl.dropboxusercontent.com/u/2538935/CanEnd_min20.loci
LOCI = "./CanEnd_min20.loci"

In [9]:
## 6-species hypothesis: a mapping dictionary grouping the 37 samples into 
## putative 'species' A-F (keys are species labels, values are sample names)
IMAP6 = {
    "A": ['SF172', 'SF175', 'SF328', 'SF200', 'SF209', 'D14528', 'SF276', 'SF286', 'D13052'],
    "B": ['D13101', 'D13103', 'D14482', 'D14483'],
    "C": ['D14504', 'D14505', 'D14506'],
    "D": ['D14477', 'D14478', 'D14480', 'D14485', 'D14501', 'D14513'], 
    "E": ['D13090', 'D12950'],
    "F": ['D13097', 'SF155', 'D13063', 'D12963', 'SF160', 'SF327',
          'SF224', 'SF228', '5573', 'SF153', 'SF164', 'D13075', 'SF197'], 
    }


## make a dictionary with min values to filter loci to those with N samples per species.
## Keys must match the species labels in IMAP6; each value is the minimum number 
## of samples from that species (passed to loci2bpp as `minmap`).
MINMAP6 = {
    "A": 8, 
    "B": 4, 
    "C": 3,
    "D": 4, 
    "E": 2, 
    "F": 8,
}


## Species tree hypothesis ('guide tree') based on raxml & bucky results.
## Newick string whose tip names are the IMAP6 species labels; printed as ascii art.
TREE6 = "((((D,C),B),(E,F)),A);"
print ete3.Tree(TREE6)


            /-D
         /-|
      /-|   \-C
     |  |
   /-|   \-B
  |  |
  |  |   /-E
--|   \-|
  |      \-F
  |
   \-A


In [10]:
## 5-species hypothesis: same grouping as IMAP6 except that the samples of 
## species E and F in IMAP6 are merged into a single species 'E' here.
IMAP5 = {
    "A": ['SF172', 'SF175', 'SF328', 'SF200', 'SF209', 'D14528', 'SF276', 'SF286', 'D13052'],
    "B": ['D13101', 'D13103', 'D14482', 'D14483'],
    "C": ['D14504', 'D14505', 'D14506'],
    "D": ['D14477', 'D14478', 'D14480', 'D14485', 'D14501', 'D14513'], 
    "E": ['D13090', 'D12950', 'D13097', 'SF155', 'D13063', 'D12963', 'SF160', 'SF327',
          'SF224', 'SF228', '5573', 'SF153', 'SF164', 'D13075', 'SF197'], 
    }


## make a dictionary with min values to filter loci to those with N samples per species.
## Keys must match the species labels in IMAP5.
MINMAP5 = {
    "A": 8, 
    "B": 4, 
    "C": 3,
    "D": 4, 
    "E": 8, 
}


## Species tree hypothesis ('guide tree') based on raxml & bucky results.
## Same topology as TREE6 with the (E,F) clade replaced by the single tip E.
TREE5 = "((((D,C),B),E),A);"
print ete3.Tree(TREE5)


            /-D
         /-|
      /-|   \-C
     |  |
   /-|   \-B
  |  |
--|   \-E
  |
   \-A


In [11]:
## 4-species hypothesis: relative to IMAP6, the samples of C and D are merged 
## into 'C', and the samples of E and F are merged into 'D'.
IMAP4 = {
    "A": ['SF172', 'SF175', 'SF328', 'SF200', 'SF209', 'D14528', 'SF276', 'SF286', 'D13052'],
    "B": ['D13101', 'D13103', 'D14482', 'D14483'],
    "C": ['D14504', 'D14505', 'D14506', 'D14477', 'D14478', 'D14480', 'D14485', 'D14501', 'D14513'], 
    "D": ['D13090', 'D12950', 'SF155', 'D12963',  'SF327', 'SF224', 'SF228', '5573', 
          'SF164', 'SF153', 'SF160', 'D13063', 'D13075', 'D13097', 'SF197'], 
    }

## min number of samples per species required to keep a locus; keys must 
## match the species labels in IMAP4.
MINMAP4 = {
    "A": 6,
    "B": 4,
    "C": 6,
    "D": 6    
}

## guide tree for the 4-species hypothesis (tip names are the IMAP4 labels), and print it 
TREE4 = "(((B,C), D),A);"
print ete3.Tree(TREE4)


         /-B
      /-|
   /-|   \-C
  |  |
--|   \-D
  |
   \-A


In [12]:
## Trait data (csv) from (https://zenodo.../CanEnd_trait2.csv")
## The first csv column is used as the row index and empty strings are read as missing values.
## NOTE(review): the comment above names 'CanEnd_trait2.csv' (placeholder URL) but the 
## file read below is 'CanEnd_traits.csv' -- confirm which name is correct.
TRAITS = pd.read_csv("./CanEnd_traits.csv", na_values="", index_col=0)
TRAITS.head(10)

Unnamed: 0_level_0,leaf_tot,juga,leaf_juga_ratio,stip_dist,stip_scar_length,pet_length,petiole_stip_ratio,lateral_petiolules,basal_petiolule,termil_petiolule,...,lateral_lft_W,lateral_L_widest_point,ll_lw_ratio,ll_wp_ratio,termil_lft_L,termil_lft_W,termil_L_widest_point,tl_tw_ratio,tl_wp_ratio,X2o_vein_pairs
Indiv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SF175,371.77,4.67,79.49,21.38,2.3,59.43,2.91,19.45,8.98,45.31,...,36.79,55.04,2.98,2.98,85.59,40.27,45.99,2.18,1.88,10.67
SF328,268.61,4.0,67.15,39.91,1.39,67.65,1.72,9.01,5.4,27.41,...,33.04,41.29,2.35,2.35,67.84,29.39,40.11,2.32,1.71,10.5
SF200,208.42,3.67,58.15,12.6,1.78,32.21,2.6,8.58,4.63,29.89,...,27.35,37.53,2.37,2.37,55.54,30.66,34.89,1.82,1.59,10.0
SF209,218.85,4.0,56.69,13.0,2.04,42.98,3.46,8.62,4.94,17.66,...,30.8,35.68,2.05,2.05,61.2,30.43,30.8,2.03,2.01,10.67
D14528,264.57,4.0,66.14,10.85,2.99,53.21,4.9,12.23,9.12,32.79,...,33.91,53.94,2.61,2.61,79.49,35.67,49.57,2.22,1.6,10.33
SF276,283.45,3.0,94.48,21.84,2.64,74.72,3.55,12.56,8.35,31.82,...,37.79,52.69,2.31,2.31,82.05,44.05,51.32,1.87,1.6,8.0
SF286,288.35,3.0,96.12,27.57,2.75,72.73,2.65,17.31,11.73,35.23,...,52.27,42.52,1.58,1.58,83.78,46.91,44.01,1.78,1.93,8.5
D14504,323.02,6.0,54.21,12.54,2.82,65.82,6.09,8.6,5.32,18.3,...,48.74,44.06,1.7,1.7,70.46,36.71,34.97,1.94,2.05,16.67
D14505,448.82,8.0,56.25,9.51,3.0,63.97,6.7,5.77,5.67,23.1,...,54.47,45.0,2.29,2.29,129.33,64.38,65.85,2.01,2.03,17.5
D14506,534.65,7.0,77.69,11.67,3.5,91.8,8.0,15.4,13.51,35.22,...,70.34,66.34,2.85,2.85,155.46,61.82,74.68,2.52,2.06,20.33


### Make a function to call bpp/ibpp
We will submit a large range of jobs to our parallel cluster. First we will infer a species tree with bpp, and then we will add traits and test delimitation hypotheses with ibpp. To track the progress of all of the parallel processes we will store info about them (their async objects) in a dictionary called results. 

In [13]:
## a dictionary to store results: maps a job name to the async result object 
## returned when that job is submitted to the cluster
results = {}

In [14]:
## a function to call i/bpp
def bpp(ctlfile):
    """ 
    Run bpp or ibpp on a control file and wait for the program to finish.

    This assumes you installed bpp & ibpp in ~/local/bin/ following the 
    installation instructions in the ipyrad bpp tutorial. 

    Parameters
    ----------
    ctlfile : str
        Path to a control file. If its file name contains ".ibpp" the ibpp 
        binary is called, otherwise the bpp binary is called.

    Returns
    -------
    None
        The program's output is captured and discarded.

    Raises
    ------
    subprocess.CalledProcessError
        If the program exits with a non-zero status. The combined 
        stdout/stderr text is available on the exception's `output` attribute.
    OSError
        If the binary cannot be executed (e.g., it is not installed).
    """
    ## imports are local so the function is self-contained when it is
    ## sent to a remote engine with lbview.apply()
    import subprocess
    import os

    ## decide from the file name only, so that a parent directory whose 
    ## name happens to contain ".ibpp" cannot select the wrong program
    if ".ibpp" in os.path.basename(ctlfile):
        program = "ibpp"
    else:
        program = "bpp"
    cmd = [os.path.expanduser("~/local/bin/" + program), ctlfile]

    ## fold stderr into the captured output so that a failure carries
    ## the program's error text in the raised exception
    subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    

### Infer species tree 

In [12]:
## an initial test for the species tree from the starting 6-species 'guide tree'.
## The 6-species mapping, tree and min-sample filter defined above are passed
## explicitly (the bare names IMAP, TREE and MINMAP are not defined in this notebook).
ctl0 = ip.file_conversion.loci2bpp("tree-0", LOCI, IMAP6, TREE6, 
                                   wdir=WDIR,
                                   minmap=MINMAP6,
                                   infer_sptree=1,
                                   infer_delimit=0,
                                   maxloci=10000,  
                                   nsample=100000,
                                   burnin=10000,
                                   sampfreq=2,
                                   thetaprior=(2, 2000),
                                   tauprior=(2, 200, 1)
                                   )

new files created (1007 loci, 6 species, 37 samples)
  tree-0.bpp.seq.txt
  tree-0.bpp.imap.txt
  tree-0.bpp.ctl.txt


### Infer species delimitation

In [15]:
## an initial species delimitation test on the fixed 6-species 'guide tree',
## using sequence data only (the traits argument is left commented out).
## verbose=1 prints the generated ctl file.
ctl1 = ip.file_conversion.loci2bpp("delim-6sp-notraits-300L", 
                                   #traits_df=TRAITS,
                                   locifile=LOCI, 
                                   imap=IMAP6, 
                                   guidetree=TREE6, 
                                   minmap=MINMAP6,
                                   wdir=WDIR,
                                   infer_sptree=0,
                                   infer_delimit=1,
                                   delimit_alg=(1, 1, 1),
                                   maxloci=300,  
                                   nsample=10000,
                                   burnin=1000,
                                   sampfreq=2,
                                   thetaprior=(5, 500),
                                   tauprior=(20, 200, 1),
                                   finetune=(2.0, 0.01, 0.01, 0.0001, 0.01, 0.1, 0.1, 0.1),
                                   ## integer bounds: randint is documented for integers only
                                   seed=random.randint(1, 10**6),
                                   verbose=1,
                                   )

ctl file
--------
seed = 545691
seqfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-6sp-notraits-300L.bpp.seq.txt
Imapfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-6sp-notraits-300L.bpp.imap.txt
mcmcfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-6sp-notraits-300L.bpp.mcmc.txt
outfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-6sp-notraits-300L.bpp.out.txt
nloci = 300
usedata = 1
cleandata = 0
speciestree = 0
speciesdelimitation = 1 1 1 1
species&tree = 6 A B C D E F
                 9 4 3 6 2 13
                 ((((D,C),B),(E,F)),A);
thetaprior = 5 500
tauprior = 20 200 1
finetune = 1: 2.0 0.01 0.01 0.0001 0.01 0.1 0.1 0.1
print = 1 0 0 0
burnin = 1000
sampfreq = 2
nsample = 10000
--------

new files created (300 loci, 6 species, 37 samples)
  delim-6sp-notraits-300L.bpp.seq.txt
  delim-6sp-notraits-300L.bpp.imap.txt
  delim-6sp-notraits-300L.bpp.ctl.txt


In [11]:
## a species delimitation test (sequence data only) named for a 7-species hypothesis.
## NOTE(review): IMAP, TREE and MINMAP are not defined in any cell visible in this 
## notebook (only the 6-, 5- and 4-species versions are); this cell presumably needs a 
## 7-species mapping/tree from an earlier session -- TODO define them or remove this cell.
## NOTE(review): this rebinds `ctl1`, replacing the path created by the previous cell.
ctl1 = ip.file_conversion.loci2bpp("delim-7sp-100", LOCI, IMAP, TREE, 
                                   wdir=WDIR,
                                   minmap=MINMAP,
                                   #traits_df=TRAITS,
                                   infer_sptree=0,
                                   infer_delimit=1,
                                   delimit_alg=(1.5, 1.5, 1.0),
                                   maxloci=100,  
                                   nsample=10000,
                                   burnin=1000,
                                   sampfreq=2,
                                   thetaprior=(5, 500),
                                   tauprior=(2, 20, 1),
                                   verbose=1,
                                   #kappa=0,
                                   #nu=1
                                   )

ctl file
--------
seed = 12345
seqfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-100.bpp.seq.txt
Imapfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-100.bpp.imap.txt
mcmcfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-100.bpp.mcmc.txt
outfile = /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-100.bpp.out.txt
nloci = 100
usedata = 1
cleandata = 0
speciestree = 0
speciesdelimitation = 1 1.5 1.5 1.0
species&tree = 7 A C B E D G F
                 9 3 4 2 6 7 6
                 ((((D,B),C),(E(F,G))),A);
thetaprior = 5 500
tauprior = 2 20 1
finetune = 1: .01 .01 .01 .01 .01 .01 .01 .01
print = 1 0 0 0
burnin = 1000
sampfreq = 2
nsample = 10000
--------

new files created (100 loci, 7 species, 37 samples)
  delim-100.bpp.seq.txt
  delim-100.bpp.imap.txt
  delim-100.bpp.ctl.txt


In [14]:
## run bpp directly in the shell on the ctl file whose path is stored in `ctl1`
## NOTE(review): this relies on `bpp` being on the PATH, whereas the bpp() function 
## in this notebook calls ~/local/bin/bpp -- confirm both refer to the same binary
! bpp $ctl1

bp&p Version 3.3, November 2016

Reading options from /home/deren/Documents/Canarium-GBS/analysis_bpp/delim-0.bpp.ctl.txt..
6 species:  A (9) C (3) B (4) E (2) D (6) F (13)
((((D, B), C), (E, F)), A);

pop by pop table showing node numbers in species tree

                        1  2  3  4  5  6  7  8  9 10 11
species  1 A            1  0  0  0  0  0  1  0  0  0  0
species  2 C            0  1  0  0  0  0  1  1  1  0  0
species  3 B            0  0  1  0  0  0  1  1  1  1  0
species  4 E            0  0  0  1  0  0  1  1  0  0  1
species  5 D            0  0  0  0  1  0  1  1  1  1  0
species  6 F            0  0  0  0  0  1  1  1  0  0  1
species  7 DBCEFA       0  0  0  0  0  0  1  0  0  0  0
species  8 DBCEF        0  0  0  0  0  0  1  1  0  0  0
species  9 DBC          0  0  0  0  0  0  1  1  1  0  0
species 10 DB           0  0  0  0  0  0  1  1  1  1  0
species 11 EF           0  0  0  0  0  0  1  1  0  0  1

11 theta parameters (populations) in the order:
  theta_1A theta_2C th

### Send job to run in parallel

In [67]:
## submit each ctl file to the load-balanced view and keep the returned
## async handle, keyed by the ctl file path
asyncs = {}
for ctl in (ctl0, ctl1):
    handle = lbview.apply(bpp, ctl)
    asyncs[ctl] = handle
    print("submitted {}".format(ctl))

submitted /ysm-gpfs/home/de243/Canarium-GBS/analysis_bpp/tree-0.bpp.ctl.txt
submitted /ysm-gpfs/home/de243/Canarium-GBS/analysis_bpp/delim-0.ibpp.ctl.txt


### Set up many additional species delimitation tests
We are interested in how well both the sequence data and the trait data can delimit species in Canarium. We will set up a range of tests to look at different settings for the priors, for different species delimitation algorithms, and for different types of data. We will start with a six-taxon tree and allow the species delimitation algorithm to collapse nodes on the tree to test hypotheses of 1-6 species. 

In [68]:
## set up the 12 delimitation settings to test. Each tuple is passed to loci2bpp
## as `delimit_alg`, which writes its values after 'speciesdelimitation = 1' in the ctl file.
## NOTE(review): presumably the first element selects the rjMCMC algorithm (0 or 1) and 
## the remaining elements are that algorithm's parameters -- confirm against the bpp manual.
DELIMIT_TESTS = [
    (0, 2),
    (0, 5),
    (0, 10),
    (1, 1.0, 1.0),
    (1, 1.0, 1.5),
    (1, 1.0, 2.0),
    (1, 1.5, 1.0), 
    (1, 1.5, 1.5), 
    (1, 1.5, 2.0),
    (1, 2.0, 1.0), 
    (1, 2.0, 1.5), 
    (1, 2.0, 2.0)
]

In [69]:
## iterate over combinations for a total of 72 tests (2 x 12 x 3). 
## (1) 0/1 without or with traits 
## (2) 12 delimitation algorithm combinations (DELIMIT_TESTS)
## (3) 3 independent replicates from different random seeds
##
## The tests start from the six-species hypothesis, so the 6-species mapping, 
## guide tree and min-sample filter are passed explicitly (the bare names 
## IMAP, TREE and MINMAP are not defined in this notebook).

for usetraits in [0, 1]:
    for tdx, delim in enumerate(DELIMIT_TESTS):
        for rep in range(3):

            ## make a name for this test: delim-{traits 0/1}-{index into DELIMIT_TESTS}-{replicate}
            rname = "delim-{}-{}-{}".format(usetraits, tdx, rep)

            ## make input files and get ctl path
            ## NOTE(review): the trait table is passed as `traits_df`, the keyword used 
            ## (commented out) in the other loci2bpp calls of this notebook; the original 
            ## `traits=` spelling should be confirmed against the loci2bpp signature.
            ctl = ip.file_conversion.loci2bpp(rname, LOCI, IMAP6, TREE6,
                                              wdir=WDIR,
                                              minmap=MINMAP6,
                                              infer_delimit=1,
                                              infer_sptree=0,
                                              delimit_alg=delim,
                                              traits_df=TRAITS,
                                              maxloci=10000,
                                              nsample=100000,
                                              burnin=10000,
                                              sampfreq=2,
                                              thetaprior=(2, 2000),
                                              tauprior=(2, 200, 1),
                                              usetraitdata=usetraits,
                                              ## integer bounds: randint is documented for integers only
                                              seed=random.randint(0, 10**9)
                                              )

            ## send job to run on cluster and keep its async handle under the test name
            results[rname] = lbview.apply(bpp, ctl)
            ## newline-terminated so that the 72 messages do not run together on one line
            sys.stderr.write("job {} submitted\n".format(rname))

NameError: name 'imap' is not defined

### Track progress

In [64]:
## check success/failure of jobs
for job in results:
    ## get shorter name for job: keep only the last '/'-separated component,
    ## so both plain run names and full ctl paths display compactly
    jobname = job.split("/")[-1]
    async_result = results[job]

    ## print done or not
    if not async_result.ready():
        print("{:<30} -- still running".format(jobname))
    elif async_result.successful():
        print("{:<30} -- finished".format(jobname))
    else:
        ## show which job failed together with the exception it raised
        ## (previously the exception was printed in place of the job name)
        print("{:<30} -- failed: {}".format(jobname, async_result.exception()))

### Parse results (out.txt) files

In [None]:
## Let's read in the '.bpp.out.txt' results files for each test
## Results are collected per test as pandas Series of the 'median' and 'ESS*' rows.
median_dict = {}
ess_dict = {}

for test, job in enumerate(sorted(asyncs)):
    ## replace .ctl.txt with .out.txt
    outname = job.replace(".ctl.", ".out.")

    ## parse theta and tau priors from the job name. Only the file name is used, 
    ## with the '.bpp.*' / '.ibpp.*' suffix removed, so that hyphens in the 
    ## directory path or the file suffix cannot shift or corrupt the fields.
    ## NOTE(review): this expects run names of the form name-thetaA-thetaB-tauA-tauB; 
    ## the run names created in this notebook (e.g. 'tree-0') do not encode the 
    ## priors this way -- confirm the naming scheme before relying on this cell.
    runname = os.path.basename(job).split(".bpp.")[0].split(".ibpp.")[0]
    fields = runname.split("-")
    theta = fields[1:3]
    tau = fields[3:5]

    ## read the file and parse out results
    with open(outname, 'r') as infile:
        lines = infile.readlines()

    ## NOTE(review): the original comment here said the runs had sptree and delimit 
    ## set to 0 (parameter estimation on a fixed tree); the jobs submitted in this 
    ## notebook set one of them to 1 -- confirm this parser matches their output.
    ## `index` is reset for every file so that labels never leak from a previous file; 
    ## if no header line is found the Series falls back to default integer labels.
    index = None
    for line in lines:
        ## the header line names all parameters; both markers must be present
        ## (the original `"theta_1" and "theta_2" in line` only tested the second)
        if "theta_1" in line and "theta_2" in line:
            index = ["theta mean", "tau mean"] + line.split()

        ## NOTE(review): the numerators 5 and 1 are hard-coded; presumably they are 
        ## the first (alpha) values of the theta and tau priors, but the priors used 
        ## in this notebook have other alpha values -- confirm.
        if "median" in line:
            values = [5./float(theta[1]), 1./float(tau[1])] + line.split()[1:]
            median_dict[test] = pd.Series(data=values, index=index)

        if "ESS*" in line:
            values = [5./float(theta[1]), 1./float(tau[1])] + line.split()[1:]
            ess_dict[test] = pd.Series(data=values, index=index)

## make results into a dataframe and print.
## NOTE(review): the original remark that "the prior has a large effect on theta5 (AB)" 
## is not backed by any output in this notebook -- confirm or remove.
medians = pd.DataFrame(data=median_dict)
ess = pd.DataFrame(data=ess_dict)

## look at median values
medians.T

In [38]:
## check for finished jobs: wait on the outstanding tasks while printing 
## how many have finished and the elapsed time
rc.wait_interactive()

   0/2 tasks finished after 18258 s

KeyboardInterrupt: 