# Canarium GBS: ABBA-BABA analyses
### *Federman et al.*

This notebook provides all code necessary to reproduce the assembled GBS data sets used in Federman et al. (xxxx). Starting from demultiplexed fastq data files we assemble the data into four complete data sets that were used in downstream analyses. All code in this notebook is written in Python and uses the *ipyrad* package for assembly. 

### Required software

In [1]:
## conda install ipyrad -c ipyrad
## conda install toytree -c eaton-lab
## conda install structure -c ipyrad
## conda install clumpp -c ipyrad

### Imports

In [2]:
import toytree
import toyplot.svg
import ipyrad as ip
import ipyrad.analysis as ipa
print "ipyrad v.{}".format(ip.__version__)

ipyrad v.0.7.22


### Connect to cluster

In [3]:
import ipyparallel as ipp
ipyclient = ipp.Client()
ip.cluster_info(ipyclient)

host compute node: [40 cores] on sacra


## ABBA-BABA tests

In [4]:
## load input files
locifile = "./analysis-ipyrad/Canarium-min10_outfiles/Canarium-min10.loci"
newick = "./analysis-raxml/RAxML_bestTree.Canarium-min10"

In [5]:
## make clade lists for setting up tests easily
clade1A = ["SF328", "SF200", "SF175"]
clade1B = ["SF209", "D13052"]
clade1C = ["SF172", "D14528", "SF286", "SF276"]
clade1 = clade1A + clade1B + clade1C

clade2A = ["D14482", "D14483", "D13103", "D13101"]
clade2B = ["D14504", "D14505", "D14506"]
clade2C = ["D14501", "D14513", "D14480", "D14485", "D14477", "D14478"]
clade2 = clade2A + clade2B + clade2C

clade3A = ["D13090", "D12950"]
clade3B = ["D12963", "SF155", "5573", "SF327", "SF228", "SF224"]
clade3C = ["SF164", "SF153", "SF160", "D13097", "SF197", "D13075", "D13053", "D13063"]
clade3 = clade3A + clade3B + clade3C

outgroups = ["D13852", "SFC1988", "D14269", "D13374"]

# Question 1: Admixture between clades 2 and 3

### Test clade 2A into clade 3

In [6]:
## create a baba object linked to a data file and newick tree
aa = ipa.baba(data=locifile, newick=newick)

## generate all possible abba-baba tests meeting a set of constraints
aa.generate_tests_from_tree(
    constraint_exact=[False, False, True, True],
    constraint_dict={
        "p4": outgroups,
        "p3": clade2A,
        "p2": clade3A,
        "p1": clade3B+clade3C, #clade1C,
    })

## run all tests with clade1 against clades 2/3
aa.run(ipyclient)

81 tests generated from tree
[####################] 100%  calculating D-stats  | 0:01:58 |  


In [8]:
sres = aa.results_table.sort_values(by="Z", ascending=False)
aa.plot(
    #prune_tree_to_tests=True, 
    subset_tests=sres.head(25).index.tolist(),
    pct_tree_x=0.6,
    width=1000,
    height=600,
);

### Test clade 2B into clade 3

In [9]:
## create a baba object linked to a data file and newick tree
bb = ipa.baba(data=locifile, newick=newick)

## generate all possible abba-baba tests meeting a set of constraints
bb.generate_tests_from_tree(
    constraint_exact=[False, False, True, True],
    constraint_dict={
        "p4": outgroups,
        "p3": clade2B,
        "p2": clade3A,
        "p1": clade3B+clade3C, #clade1C,
    })

## run all tests with clade1 against clades 2/3
bb.run(ipyclient)

81 tests generated from tree
[####################] 100%  calculating D-stats  | 0:01:54 |  


In [10]:
sres = bb.results_table.sort_values(by="Z", ascending=False)
bb.plot(
    #prune_tree_to_tests=True, 
    subset_tests=sres.head(25).index.tolist(),
    pct_tree_x=0.6,
    width=1000,
    height=600,
);

### Test clade 2C into clade 3

In [11]:
## create a baba object linked to a data file and newick tree
cc = ipa.baba(data=locifile, newick=newick)

## generate all possible abba-baba tests meeting a set of constraints
cc.generate_tests_from_tree(
    constraint_exact=[False, False, True, True],
    constraint_dict={
        "p4": outgroups,
        "p3": clade2C,
        "p2": clade3A,
        "p1": clade3B+clade3C, #clade1C,
    })

## run all tests with clade1 against clades 2/3
cc.run(ipyclient)

81 tests generated from tree
[####################] 100%  calculating D-stats  | 0:02:09 |  


In [12]:
sres = cc.results_table.sort_values(by="Z", ascending=False)
cc.plot(
    #prune_tree_to_tests=True, 
    subset_tests=sres.head(25).index.tolist(),
    pct_tree_x=0.6,
    width=1000,
    height=600,
);

### Composite figure of 2ABC into 3A vs. 3BC
The ten tests with highest Z-scores for each test above.

In [13]:
idx1 = aa.results_table.sort_values(by="Z", ascending=False).head(10).index.tolist()
idx2 = bb.results_table.sort_values(by="Z", ascending=False).head(10).index.tolist()
idx3 = cc.results_table.sort_values(by="Z", ascending=False).head(10).index.tolist()

In [24]:
comp = bb.copy()
comp.tests = [aa.tests[i] for i in idx1] + \
             [bb.tests[i] for i in idx2] +  \
             [cc.tests[i] for i in idx3]
comp.run(ipyclient)
comp.plot(height=600, width=1000, pct_tree_x=0.6);

[####################] 100%  calculating D-stats  | 0:00:51 |  


In [26]:
comp5 = comp.copy()
comp5.tests = [
    {
        "p5": outgroups,
        "p4": clade3A,
        "p3": clade3B,
        "p2": clade2A,
        "p1": clade2B,    
    },
    {
        "p5": outgroups,
        "p4": clade3A,
        "p3": clade3B,
        "p2": clade2A,
        "p1": clade2C,    
    },
    {
        "p5": outgroups,
        "p4": clade3A,
        "p3": clade3B,
        "p2": clade2B,
        "p1": clade2C,    
    },
    
    {
        "p5": outgroups,
        "p4": clade2A,
        "p3": clade2B,
        "p2": clade3A,
        "p1": clade3B,    
    },
    {
        "p5": outgroups,
        "p4": clade2A,
        "p3": clade2C,
        "p2": clade3A,
        "p1": clade3B,    
    },
    {
        "p5": outgroups,
        "p4": clade2B,
        "p3": clade2C,
        "p2": clade3A,
        "p1": clade3B,    
    },
]

comp5.run(ipyclient)
comp5.results_table

[####################] 100%  calculating D-stats  | 0:01:05 |  


Unnamed: 0,Unnamed: 1,Dstat,bootmean,bootstd,Z,ABxxA,BAxxA,nloci
0,p3,-0.097,-0.097,0.053,1.832,98.366,119.559,29208
0,p4,-0.005,-0.008,0.063,0.085,94.555,95.569,29208
0,shared,-0.149,-0.15,0.03,5.031,324.695,438.754,29208
1,p3,-0.119,-0.121,0.045,2.649,104.127,132.366,30267
1,p4,0.041,0.044,0.056,0.731,96.334,88.732,30267
1,shared,-0.091,-0.09,0.027,3.419,354.332,425.31,30267
2,p3,-0.019,-0.017,0.042,0.457,112.968,117.427,30535
2,p4,0.01,0.012,0.054,0.188,91.123,89.297,30535
2,shared,0.062,0.063,0.026,2.373,405.515,358.08,30535
3,p3,0.027,0.028,0.056,0.481,122.729,116.263,29208


# Question 2: admixture between clades 1 and 3

In [40]:
bb.tests = [
    {
        "p5": outgroups,
        "p4": ["SF172"],
        "p3": clade1A,
        "p2": clade3C,
        "p1": clade2,    
    },
    {
        "p5": outgroups,
        "p4": clade3C,
        "p3": clade2,
        "p2": ["SF172"],
        "p1": clade1A,    
    },
]

bb.run(ipyclient)

[####################] 100%  calculating D-stats  | 0:01:24 |  


In [41]:
bb.results_table

Unnamed: 0,Unnamed: 1,Dstat,bootmean,bootstd,Z,ABxxA,BAxxA,nloci
0,p3,-0.276,-0.276,0.043,6.348,84.766,149.307,40803
0,p4,0.855,0.855,0.01,84.599,969.338,75.511,40803
0,shared,0.257,0.257,0.018,14.468,1070.936,633.492,40803
1,p3,-0.517,-0.517,0.032,15.981,64.424,202.258,40803
1,p4,0.79,0.791,0.014,55.491,726.482,85.113,40803
1,shared,0.372,0.371,0.019,19.165,920.538,421.728,40803


In [10]:
c, b, a = bb.plot(
    #prune_tree_to_tests=True,
    #collapse_outgroup=True, 
    #use_edge_lengths=False, 
    #height=400,
    pct_tree_x=0.7,
    #pct_tree_y=0.4,
    width=1000,
    #height=400,
    #debug=True,
);

In [34]:
## create a baba object linked to a data file and newick tree
bb = ipa.baba(data=locifile, newick=newick)

## generate all possible abba-baba tests meeting a set of constraints
bb.generate_tests_from_tree(
    #constraint_exact=[True, False, False, True, True],
    constraint_dict={
        #"p5": outgroups,
        #"p4": clade3A,
        "p3": clade3B,
        "p2": clade1A,
        "p1": clade1B,
    })

1050 tests generated from tree


In [38]:

## generate all possible abba-baba tests meeting a set of constraints
bb.generate_tests_from_tree(
    #constraint_exact=[True, False, False, True, True],
    constraint_dict={
        #"p5": outgroups,
        #"p4": clade3A,
        "p5": outgroups,
        "p4":
        "p3": clade3A,
        "p2": clade1A,
        "p1": clade1B,
    })

SyntaxError: invalid syntax (<ipython-input-38-33ed81c2cc2f>, line 10)

In [39]:
bb.tests = [
    { 
    "p5": outgroups,
    "p4": clade3A,
    "p3": clade3B,
    "p2": clade1A,
    "p1": clade1B,
    }
]

In [42]:
bb.run(ipyclient)

[####################] 100%  calculating D-stats  | 0:00:47 |  


In [44]:
bb.results_table

Unnamed: 0,Unnamed: 1,Dstat,bootmean,bootstd,Z,ABxxA,BAxxA,nloci
0,p3,0.092,0.092,0.06,1.55,71.523,59.442,29121
0,p4,0.212,0.215,0.092,2.299,47.955,31.186,29121
0,shared,0.128,0.126,0.037,3.483,263.142,203.486,29121


In [45]:
bb.plot(height=300)

(<toyplot.canvas.Canvas at 0x7fae5d321cd0>,
 <toyplot.coordinates.Cartesian at 0x7fae94199a10>,
 <ipyrad.plotting.baba_panel_plot.Panel at 0x7fae5d321650>)

In [16]:
bb.results_table.sort_values(by="Z", ascending=False).head(20)

Unnamed: 0,dstat,bootmean,bootstd,Z,ABBA,BABA,nloci
1367,0.241,0.241,0.016,14.666,1437.041,878.475,51303
744,-0.252,-0.253,0.018,14.316,776.541,1300.672,48751
444,-0.241,-0.241,0.017,14.167,882.424,1443.98,51448
624,-0.244,-0.245,0.017,14.148,865.995,1425.0,51098
1359,0.242,0.242,0.017,14.087,1459.324,891.533,51583
1369,0.244,0.245,0.018,13.747,1425.0,865.995,51098
1357,0.229,0.229,0.017,13.703,1519.046,952.275,52579
24,-0.225,-0.224,0.016,13.655,989.842,1563.329,52974
84,-0.229,-0.229,0.017,13.59,952.275,1519.046,52579
1373,0.252,0.252,0.019,13.581,1300.672,776.541,48751


In [None]:
bb.plot()

In [22]:
## save it
bb.results_table.to_csv("analysis-baba/clade3.csv")

In [20]:
## make a copy
cc = bb.results_table.copy(deep=True)
cc["p3"] = [i["p3"] for i in bb.tests]
cc["p2"] = [i["p2"] for i in bb.tests]
cc["p1"] = [i["p1"] for i in bb.tests]
cc.sort_values(by="Z", ascending=False)


Unnamed: 0,dstat,bootmean,bootstd,Z,ABBA,BABA,nloci,p3,p2,p1
1369,2.440e-01,2.439e-01,0.017,14.304,1425.000,865.995,51098,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...",[D13097],"[D14480, D14477, D14478]"
1363,2.414e-01,2.420e-01,0.017,14.276,1443.980,882.424,51448,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...",[D13097],"[D14513, D14485, D14480, D14477, D14478]"
204,-2.415e-01,-2.419e-01,0.017,14.200,891.533,1459.324,51583,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...","[D14501, D14513, D14485, D14480, D14477, D14478]",[D13097]
1357,2.293e-01,2.294e-01,0.016,14.169,1519.046,952.275,52579,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...",[D13097],"[D14504, D14505, D14506, D14501, D14513, D1448..."
1359,2.415e-01,2.414e-01,0.017,14.016,1459.324,891.533,51583,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...",[D13097],"[D14501, D14513, D14485, D14480, D14477, D14478]"
624,-2.440e-01,-2.444e-01,0.017,13.985,865.995,1425.000,51098,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...","[D14480, D14477, D14478]",[D13097]
564,-2.412e-01,-2.417e-01,0.017,13.933,878.475,1437.041,51303,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...","[D14485, D14480, D14477, D14478]",[D13097]
1367,2.412e-01,2.405e-01,0.017,13.882,1437.041,878.475,51303,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...",[D13097],"[D14485, D14480, D14477, D14478]"
744,-2.523e-01,-2.538e-01,0.018,13.844,776.541,1300.672,48751,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...",[D14478],[D13097]
1373,2.523e-01,2.518e-01,0.018,13.708,1300.672,776.541,48751,"[SF328, SF200, SF175, SF209, D13052, SF172, D1...",[D13097],[D14478]


In [8]:
bb.results_table

Unnamed: 0,dstat,bootmean,bootstd,Z,ABBA,BABA,nloci
0,-0.095,-0.096,0.025,3.886,523.244,633.707,32900
1,-0.085,-0.083,0.033,2.565,277.359,328.591,20634
2,-0.101,-0.100,0.025,3.978,490.063,599.638,32027
3,-0.151,-0.151,0.014,11.119,1352.676,1832.992,60100
4,-0.182,-0.183,0.054,3.356,91.609,132.353,7550
5,-0.152,-0.152,0.013,11.541,1350.229,1832.648,60062
6,-0.147,-0.147,0.014,10.532,1265.992,1703.334,58306
7,-0.165,-0.164,0.014,11.554,1247.181,1739.090,58262
8,-0.221,-0.221,0.020,10.877,600.912,942.015,37952
9,-0.136,-0.136,0.015,9.225,1246.400,1639.352,57665


In [49]:
## generate tests.
cc = bb.copy()
cc.generate_tests_from_tree(
    constraint_dict={
        "p4": ["D13852", "SFC1988", "D14269", "D13374"],
        "p3": ["SF209", "D13052", "SF172", "D14528", "SF286", "SF276"],
        "p2": ["D13101", "D13103", "D14482", "D14483"],
    })

46 tests generated from tree
46 tests generated from tree


In [None]:
dd = bb.copy()
dd.generate_tests_from_tree(
    constraint_dict={
        "p4": ["D13852", "SFC1988", "D14269", "D13374"],
        "p3": clade1 + clade2,
        "p2": clade3A,
    })

In [50]:
## run all tests linked to bb 
bb.run(ipyclient)
cc.run(ipyclient)

## save the results table to a csv file
bb.results_table.to_csv("analysis-baba/bb.abba-baba.csv", sep="\t")
cc.results_table.to_csv("analysis-baba/cc.abba-baba.csv", sep="\t")

[####################] 100%  calculating D-stats  | 0:02:15 |  
[####################] 100%  calculating D-stats  | 0:01:59 |  


In [66]:
bb.tests[20]

{'p1': ['SF228'],
 'p2': ['D14482', 'D14483', 'D13103', 'D13101'],
 'p3': ['SF328', 'SF200', 'SF175'],
 'p4': ['SFC1988', 'D13852', 'D13374', 'D14269']}

In [72]:
bb.plot(
    height=800,
    width=700,
    pct_tree_y=0.15,
    pct_tree_x=0.65,
    ewidth=1, 
    alpha=4.,
    style_test_labels={"font-size":"10px"},
    style_tip_labels={"font-size": "9px"},
    style_results_labels={"font-size": "10px"}
);

In [67]:
cc.plot(
    height=900,
    width=700,
    pct_tree_y=0.15,
    pct_tree_x=0.65,
    ewidth=2, 
    alpha=4.,
    style_test_labels={"font-size":"10px"},
    style_tip_labels={"font-size": "9px"},
    style_results_labels={"font-size": "10px"}
);