# Species delimitation in Malagasy Canarium using iBPP

This notebook is an empirical application of ibpp for species delimitation using GBS data assembled in ipyrad. We use the ipyrad utility function `loci2bpp` to programmatically set up a range of tests and to deploy them in parallel.

### Information about this notebook
This is a jupyter notebook. All code in this notebook is Python. You should be able to download and execute this notebook and reproduce all of our results. This notebook along with other notebooks and data files are hosted on github: https://github.com/sarahfederman/Canarium-GBS/

### Import Python libraries

In [1]:
import ipyrad as ip
import ipyparallel as ipp
import pandas as pd
import socket
import sys
import os

## print versions, so results can be tied to the ipyrad release that made them
## (a parenthesized single-argument print runs on both Python 2 and 3)
print("ipyrad v.{}".format(ip.__version__))

ipyrad v.0.5.10


### Create a directory to store results files in

In [11]:
## directory that receives the bpp/ibpp files written by this notebook
WDIR = "./analysis_bpp"
## isdir (rather than exists) so that a stray regular file with this name
## fails right here; makedirs so WDIR may later be set to a nested path
if not os.path.isdir(WDIR):
    os.makedirs(WDIR)

### Setup an ipyparallel cluster connection

In [12]:
## connect to the ipyparallel cluster and get a load-balanced view, which is
## what all jobs below are submitted through
rc = ipp.Client()
lbview = rc.load_balanced_view()

## print some information about our cluster: ask every engine for its
## hostname, then count how many engines answered from each host
info = rc[:].apply(socket.gethostname)
## materialize once as a list: dict views have no .count() on Python 3
hostnames = list(info.result_dict.values())
## sorted so the report comes out in the same order on every run
for host in sorted(set(hostnames)):
    print("compute node: [{} cores] on {}".format(hostnames.count(host), host))

compute node: [16 cores] on c13n02.farnam.hpc.yale.internal
compute node: [16 cores] on c13n04.farnam.hpc.yale.internal
compute node: [16 cores] on c13n01.farnam.hpc.yale.internal
compute node: [16 cores] on c13n03.farnam.hpc.yale.internal


### The input data

In [3]:
## download the .loci file into the current directory and save its path
## TODO: replace the dropbox link with the archived zenodo link
## NOTE(review): curl's -k flag disables TLS certificate verification --
## confirm it is really needed before keeping it
! curl -LkO https://dl.dropboxusercontent.com/u/2538935/CanEnd_min20.loci
LOCI = "./CanEnd_min20.loci"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  106M  100  106M    0     0  13.0M      0  0:00:08  0:00:08 --:--:-- 10.7M


In [15]:
## assign every sample to its putative 'species' (keys A-F); 37 samples total
IMAP = {
    ## 9 samples
    "A": [
        'SF172', 'SF175', 'SF328', 'SF200', 'SF209', 'D14528', 'SF276',
        'SF286', 'D13052',
    ],
    ## 4 samples
    "B": ['D13101', 'D13103', 'D14482', 'D14483'],
    ## 3 samples
    "C": ['D14504', 'D14505', 'D14506'],
    ## 6 samples
    "D": [
        'D14477', 'D14478', 'D14480',
        'D14485', 'D14501', 'D14513',
    ],
    ## 2 samples
    "E": ['D13090', 'D12950'],
    ## 13 samples
    "F": [
        'D13097', 'SF155', 'D13063', 'D12963', 'SF160', 'SF327', 'SF224',
        'SF228', '5573', 'SF153', 'SF164', 'D13075', 'SF197',
    ],
}

In [16]:
## per-species minimum sample counts: loci are filtered down to those that
## have at least this many samples in each species
MINMAP = {"A": 8, "B": 4, "C": 3, "D": 4, "E": 2, "F": 10}

In [17]:
## Species tree hypothesis ('guide tree') based on raxml & bucky results.
## Newick string whose tip labels are the species keys (A-F) used in IMAP
## and MINMAP.
TREE = "((((D,B),C),(E,F)),A);"

In [18]:
## Trait data (csv): empty fields are read as missing values and the first
## column becomes the row index.
## NOTE(review): the zenodo placeholder (https://zenodo.../CanEnd_trait2.csv)
## names a different file than the one read below -- confirm which is right,
## and note that nothing in this notebook downloads the csv.
TRAITS = pd.read_csv("./CanEnd_traits.csv", na_values="", index_col=0)
TRAITS.head(10)

Unnamed: 0_level_0,leaf_tot,juga,leaf_juga_ratio,stip_dist,stip_scar_length,pet_length,petiole_stip_ratio,lateral_petiolules,basal_petiolule,termil_petiolule,...,lateral_lft_W,lateral_L_widest_point,ll_lw_ratio,ll_wp_ratio,termil_lft_L,termil_lft_W,termil_L_widest_point,tl_tw_ratio,tl_wp_ratio,X2o_vein_pairs
Indiv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SF175,371.77,4.67,79.49,21.38,2.3,59.43,2.91,19.45,8.98,45.31,...,36.79,55.04,2.98,2.98,85.59,40.27,45.99,2.18,1.88,10.67
SF328,268.61,4.0,67.15,39.91,1.39,67.65,1.72,9.01,5.4,27.41,...,33.04,41.29,2.35,2.35,67.84,29.39,40.11,2.32,1.71,10.5
SF200,208.42,3.67,58.15,12.6,1.78,32.21,2.6,8.58,4.63,29.89,...,27.35,37.53,2.37,2.37,55.54,30.66,34.89,1.82,1.59,10.0
SF209,218.85,4.0,56.69,13.0,2.04,42.98,3.46,8.62,4.94,17.66,...,30.8,35.68,2.05,2.05,61.2,30.43,30.8,2.03,2.01,10.67
D14528,264.57,4.0,66.14,10.85,2.99,53.21,4.9,12.23,9.12,32.79,...,33.91,53.94,2.61,2.61,79.49,35.67,49.57,2.22,1.6,10.33
SF276,283.45,3.0,94.48,21.84,2.64,74.72,3.55,12.56,8.35,31.82,...,37.79,52.69,2.31,2.31,82.05,44.05,51.32,1.87,1.6,8.0
SF286,288.35,3.0,96.12,27.57,2.75,72.73,2.65,17.31,11.73,35.23,...,52.27,42.52,1.58,1.58,83.78,46.91,44.01,1.78,1.93,8.5
D14504,323.02,6.0,54.21,12.54,2.82,65.82,6.09,8.6,5.32,18.3,...,48.74,44.06,1.7,1.7,70.46,36.71,34.97,1.94,2.05,16.67
D14505,448.82,8.0,56.25,9.51,3.0,63.97,6.7,5.77,5.67,23.1,...,54.47,45.0,2.29,2.29,129.33,64.38,65.85,2.01,2.03,17.5
D14506,534.65,7.0,77.69,11.67,3.5,91.8,8.0,15.4,13.51,35.22,...,70.34,66.34,2.85,2.85,155.46,61.82,74.68,2.52,2.06,20.33


### Make a function to call bpp/ibpp
We will submit a large range of jobs to our parallel cluster. First we will infer a species tree with bpp, and then we will add traits and test delimitation hypotheses with ibpp. To track the progress of all of the parallel processes we will store info about them (their async objects) in a dictionary called results. 

In [19]:
## a dictionary to store results: maps a job name to the async result object
## returned when that job is submitted to the cluster
results = {}

In [20]:
## a function to call i/bpp
def bpp(ctlfile):
    """ 
    Run bpp or ibpp on a control file and block until it finishes.

    This assumes you installed bpp & ibpp in ~/local/bin/ following the 
    installation instructions in the ipyrad bpp tutorial. 

    Parameters
    ----------
    ctlfile : str
        Path to the control file. ibpp is run when the file *name* contains
        ".ibpp"; bpp is run otherwise.

    Returns
    -------
    None
        The program's captured output is discarded.

    Raises
    ------
    subprocess.CalledProcessError
        If the program exits with a non-zero status. The exception's
        `output` holds stdout and stderr combined.
    """
    ## imported inside the function so the names are available wherever the
    ## function is executed (it is submitted to the cluster engines)
    import subprocess
    import os

    ## inspect only the file name: a directory that happens to contain
    ## ".ibpp" somewhere in the path must not switch a bpp job to ibpp
    if ".ibpp" in os.path.basename(ctlfile):
        binary = "~/local/bin/ibpp"
    else:
        binary = "~/local/bin/bpp"
    cmd = [os.path.expanduser(binary), ctlfile]

    ## fold stderr into the captured output so a failure carries its message
    subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    

### Infer species tree 

In [21]:
## species-tree inference starting from the 'guide tree': the tree is
## estimated (infer_sptree=1) while delimitation is switched off
tree_opts = dict(
    wdir=WDIR,
    minmap=MINMAP,
    infer_sptree=1,
    infer_delimit=0,
    maxloci=10000,
    nsample=100000,
    burnin=10000,
    sampfreq=2,
    thetaprior=(2, 200),
    tauprior=(2, 2, 1),
)

## writes the input files and returns the path of the control file
ctl0 = ip.file_conversion.loci2bpp("tree", LOCI, IMAP, TREE, **tree_opts)

new files created (1007 loci, 6 species, 37 samples)
  tree.bpp.seq.txt
  tree.bpp.imap.txt
  tree.bpp.ctl.txt


### Infer species delimitation

In [24]:
## an initial species delimitation test on the fixed 'guide tree'
## (infer_sptree=0, infer_delimit=1) that also passes the trait table
## NOTE(review): kappa and nu are presumably the trait-model prior settings
## of ibpp -- confirm against the loci2bpp documentation
delim_opts = dict(
    wdir=WDIR,
    minmap=MINMAP,
    traits_df=TRAITS,
    infer_sptree=0,
    infer_delimit=1,
    maxloci=10000,
    nsample=100000,
    burnin=10000,
    sampfreq=2,
    thetaprior=(2, 200),
    tauprior=(2, 2, 1),
    kappa=0,
    nu=1,
)

## writes the input files and returns the path of the control file
ctl1 = ip.file_conversion.loci2bpp("delim", LOCI, IMAP, TREE, **delim_opts)

new files created (1007 loci, 6 species, 37 samples)
  delim.ibpp.seq.txt
  delim.ibpp.imap.txt
  delim.ibpp.ctl.txt
  delim.ibpp.traits.txt


### Send job to run in parallel

In [26]:
## submit both jobs and keep their async result objects in `results`, keyed
## by the run name given to loci2bpp, so that progress and failures can be
## inspected later instead of being silently lost
for rname, ctl in [("tree", ctl0), ("delim", ctl1)]:
    results[rname] = lbview.apply(bpp, ctl)
    print("submitted {}".format(ctl))

submitted /ysm-gpfs/home/de243/Canarium-GBS/analysis_bpp/tree.bpp.ctl.txt
submitted /ysm-gpfs/home/de243/Canarium-GBS/analysis_bpp/delim.ibpp.ctl.txt


### Set up many additional species delimitation tests
We are interested in how well both the sequence data and the trait data can delimit species in Canarium. We will set up a range of tests to look at different settings for the priors, for different species delimitation algorithms, and for different types of data. We will start with a six taxon tree and allow the species delimitation algorithm to collapse nodes on the tree to test hypotheses of 1-6 species.

In [51]:
## delimitation settings to test: three 2-tuples that start with 0, then a
## 3 x 3 grid of 3-tuples that start with 1
## NOTE(review): presumably (algorithm, epsilon) and (algorithm, alpha, m) as
## in bpp's species-delimitation settings -- confirm against loci2bpp
DELIMIT_TESTS = [
    (0, 2), (0, 5), (0, 10),
    (1, 1.0, 1.0), (1, 1.0, 1.5), (1, 1.0, 2.0),
    (1, 1.5, 1.0), (1, 1.5, 1.5), (1, 1.5, 2.0),
    (1, 2.0, 1.0), (1, 2.0, 1.5), (1, 2.0, 2.0),
]

## make a mapping dictionary that INCLUDES "SF172"
## This is the same assignment as IMAP, so it is copied from IMAP rather than
## retyped: the two can no longer drift apart. Each list is copied so that
## editing IMAP_1 does not change IMAP.
IMAP_1 = {species: list(samples) for species, samples in IMAP.items()}


## make a mapping dictionary that EXCLUDES "SF172"
## Derived from IMAP by dropping that one sample, instead of retyping every
## list, so the only difference between the mappings is explicit.
IMAP_2 = {
    species: [name for name in samples if name != "SF172"]
    for species, samples in IMAP.items()
}

In [None]:
## iterate over parameters: every delimitation setting x both sample
## mappings x without/with trait data (12 x 2 x 2 = 48 jobs)
for tdx, delim in enumerate(DELIMIT_TESTS):
    for idx, imap in enumerate([IMAP_1, IMAP_2]):
        for usetraits in [0, 1]:

            ## make a name for this test: delim-<setting>-<mapping>-<traits>
            rname = "delim-{}-{}-{}".format(tdx, idx, usetraits)

            ## make input files and get ctl path
            ## - wdir keeps the files in the results directory, as in the
            ##   earlier runs
            ## - the trait table goes in as `traits_df`, the keyword used by
            ##   the earlier ibpp run
            ## NOTE(review): thetaprior is (2, 2000) here but (2, 200) in the
            ## earlier runs, and kappa/nu are left at their defaults here --
            ## confirm both are intended
            ctl = ip.file_conversion.loci2bpp(rname, LOCI, imap, TREE,
                                              wdir=WDIR,
                                              minmap=MINMAP,
                                              infer_delimit=1,
                                              delimit_alg=delim,
                                              traits_df=TRAITS,
                                              maxloci=10000,
                                              nsample=100000,
                                              burnin=10000,
                                              sampfreq=2,
                                              thetaprior=(2, 2000),
                                              tauprior=(2, 2, 1),
                                              usetraitdata=usetraits,
                                              )

            ## send job to run on cluster and keep its async result object
            results[rname] = lbview.apply(bpp, ctl)
            ## newline so consecutive messages do not run together
            sys.stderr.write("job {} submitted\n".format(rname))
        