# Species tree reconstruction with ASTRAL

## Load modules

In [10]:
from collections import defaultdict

import pandas as pd
import toyplot.pdf
import toyplot.svg
import toytree
import ipyrad.analysis as ipa

## Matrix preparation

### Load sequences

In [5]:
# HDF5 sequence file (relative to this notebook) that is handed to ipa.treeslider below
SEQS = "../../Raw_data/full_dataset.seqs.hdf5"

### Check scaffold lengths

In [7]:
# Open the sequence file with treeslider (no run parameters set) so that its
# scaffold table can be inspected in the next cell
scaff = ipa.treeslider(SEQS)

In [8]:
# Rank all scaffolds from the longest to the shortest and show the table
scaff_table = scaff.scaffold_table.sort_values("scaffold_length", ascending=False)
display(scaff_table)

# Keep the index labels of the 100 longest scaffolds for the treeslider run
scaff_toUse = scaff_table.head(100).index.tolist()
print(scaff_toUse)

Unnamed: 0,scaffold_name,scaffold_length
3588,Scaffold_3589;HRSCAF=4012,47665733
28086,Scaffold_28087;HRSCAF=31587,44977913
17364,Scaffold_17365;HRSCAF=19499,36021024
62490,Scaffold_62491;HRSCAF=74262,28015097
61137,Scaffold_61138;HRSCAF=69458,27022671
...,...,...
56530,Scaffold_56531;HRSCAF=63672,1000
47705,Scaffold_47706;HRSCAF=53707,1000
2963,Scaffold_2964;HRSCAF=3308,1000
28804,Scaffold_28805;HRSCAF=32399,1000


[3588, 28086, 17364, 62490, 61137, 45956, 8703, 9533, 48436, 2632, 63334, 63141, 24082, 13033, 62358, 61104, 61047, 62117, 17617, 56542, 43757, 8389, 61746, 62952, 63518, 61744, 61298, 60968, 5035, 37875, 61000, 22506, 62326, 63551, 61247, 46026, 57237, 730, 62111, 12689, 19887, 62221, 62866, 31061, 62700, 62299, 60847, 40973, 59869, 60919, 63421, 25834, 17056, 62901, 54906, 15325, 62359, 8724, 62132, 647, 63199, 62114, 63268, 63026, 12280, 62608, 21867, 62227, 59617, 29043, 63514, 22028, 61433, 60934, 41200, 29135, 42178, 39260, 40340, 62802, 62290, 126, 63349, 62877, 63548, 61616, 61757, 16723, 7796, 63111, 58278, 4505, 50949, 61161, 22581, 10687, 34786, 38883, 60844, 47880]


## Multi-tree reconstruction using Treeslider with reduced consensus

In [None]:
# import database
# NOTE(review): the following cell reads the same two CSV files from the notebook
# directory instead of "../../Raw_data/"; whichever cell runs last defines
# `fulldata` and `colors`. Confirm which location is the canonical one.
fulldata = pd.read_csv("../../Raw_data/oreinotinus_samples_database.csv")

# import color codes
colors = pd.read_csv("../../Raw_data/oreinotinus_color_codes.csv")

In [20]:
# import database
# NOTE(review): the previous cell reads the same two CSV files from
# "../../Raw_data/"; this cell overrides `fulldata` and `colors` with copies
# located next to the notebook. Confirm which location is the canonical one.
fulldata = pd.read_csv("oreinotinus_samples_database.csv")

# import color codes
colors = pd.read_csv("oreinotinus_color_codes.csv")

In [23]:
# Build the lookup tables used below:
#   namedict  - assembly sample name -> "<species>_<country>" label
#   colordict - species              -> colour (as a string)
#   imap      - species              -> list of assembly sample names

# namedict: one entry per row of the samples database
sdata = fulldata[["NameInAssembly", "Country", "Lastest_SP_name"]]
namedict = {
    sample: str(species) + "_" + str(country)
    for sample, country, species in zip(
        sdata["NameInAssembly"], sdata["Country"], sdata["Lastest_SP_name"]
    )
}
namedict["reference"] = "lautum-reference"

# colordict: one entry per row of the colour-code table
colordata = colors[["Species", "Color"]]
colordict = {
    species: str(color)
    for species, color in zip(colordata["Species"], colordata["Color"])
}

# imap for this analysis: group sample names under their species name.
# pandas reads empty CSV cells as NaN, and NaN is truthy, so a plain
# `if value:` test would let missing values through (and would create a
# NaN key in imap). Missing values are therefore rejected explicitly.
imap = defaultdict(list)
for in_dataset, species, sample in zip(
    fulldata["full_dataset_withAyava"],
    fulldata["Lastest_SP_name"],
    fulldata["NameInAssembly"],
):
    # only keep specimens flagged as part of the full dataset
    if pd.isna(in_dataset) or not in_dataset:
        continue
    # only keep specimens that have a non-empty Lastest_SP_name
    if pd.isna(species) or not species:
        continue
    # append each NameInAssembly to the key Lastest_SP_name
    imap[species].append(sample)


In [None]:
# Define the treeslider run: one tree per non-overlapping window along the
# selected scaffolds.
# NOTE(review): minsnps / mincov are presumably the per-window minimum SNP count
# and minimum sample coverage — confirm against the ipyrad documentation.
ts = ipa.treeslider(
    name="w2mb_msnip10_mcov9_IMAPED_100_biggestScaff_aug2021",
    data=SEQS, # sequence file to slide over
    workdir="analysis-treeslider",
    scaffold_idxs=scaff_toUse, # restrict the run to the selected scaffolds
    window_size=2000000, # size of the windows used to split the whole matrix
    slide_size=2000000, # step equal to the window size, so windows do not overlap
    inference_method="raxml", # use RAxML to reconstruct the tree of each window
    inference_args={"N": 100, "T": 1},
    minsnps=10,
    mincov=9,
    imap=imap,
    consensus_reduce=True # build one consensus per species from the specimens listed for it in imap
)

In [14]:
# set parallelization parameters
# threads is 1, the same value given as "T" in inference_args when `ts` was defined
ts.ipcluster['cores'] = 20
ts.ipcluster['threads'] = 1

In [15]:
# Print the inference command line so its arguments can be checked before running
ts.show_inference_command()

/home/carlos/anaconda3/bin/raxmlHPC-PTHREADS-AVX2 -f a -T 1 -m GTRGAMMA -n ... -w ... -s ... -p 54321 -N 100 -x 12345


In [None]:
# run treeslider
# NOTE(review): force=True presumably overwrites results already present in the
# workdir — confirm before re-running this long step.
ts.run(auto=True, force=True)

In [12]:
# Reload the tree table saved in the treeslider workdir, so the cells below can
# be run without repeating the inference. The file name is derived from `name`
# so the run name is written in one place only.
name = "w2mb_msnip10_mcov9_IMAPED_100_biggestScaff_aug2021"
table = pd.read_csv(f"analysis-treeslider/{name}.tree_table.csv", index_col=0)

In [15]:
# Keep only the window trees built from more than 300 SNPs
trees = table.loc[table["snps"] > 300, "tree"].tolist()

# Bundle them into a multitree object and root every tree on "dentatum"
mtre = toytree.mtree(trees)
mtre.treelist = [tree.root("dentatum") for tree in mtre.treelist]

# The consensus tree provides a common tip order for the drawing
ctre = mtre.get_consensus_tree()

# Draw all trees as a cloud tree, ignoring edge lengths
canvas, axes, mark = mtre.draw_cloud_tree(
    width=600,
    height=600,
    use_edge_lengths=False,
    fixed_order=ctre.get_tip_labels(),
);

## Analysis

In [5]:
# Set up ASTRAL on the window trees that were reconstructed from more than 300 SNPs
astral = ipa.astral(
    name=f"astral_{name}_atLeast300snps",
    data=table.loc[table["snps"] > 300, "tree"].tolist(),
    workdir="analysis-astral",
    annotation=3,  # request local support values on the branches
)

# Infer the species tree
astral.run()

[astral.5.7.1.jar]
inferred tree written to (/home/deren/Documents/jhub-mount/Viburnum-Oreinotinus/notebooks/Aug2021/analysis-astral/astral_w2mb_msnip10_mcov9_IMAPED_100_biggestScaff_aug2021_atLeast300snps.tre)


## Tree

In [27]:
# Reload the species tree written by ASTRAL (plain string: the path contains no
# placeholders, so no f-string is needed)
treeFile = "analysis-astral/astral_w2mb_msnip10_mcov9_IMAPED_100_biggestScaff_aug2021_atLeast300snps.tre"
tre = toytree.tree(treeFile)

In [30]:
# Root the species tree on 'dentatum', then remove that tip so it is not drawn.
# NOTE(review): an earlier comment on this step said "drop elatum", but the tip
# that is dropped is 'dentatum' — confirm which one was intended.
rtre = tre.root('dentatum')
rtre = rtre.drop_tips('dentatum')

# Colour each tip label by the colordict key contained in it. All keys are
# checked, so when several keys match, the last one in colordict wins; labels
# without any match stay black.
# (Relabelling tips through namedict, i.e.
#  tip_labels=[namedict[i] for i in rtre.get_tip_labels()], is currently disabled.)
color_labels = []
for label in rtre.get_tip_labels():
    tip_color = "Black"
    for key, color in colordict.items():
        if key in label:
            tip_color = color
    color_labels.append(tip_color)

canvas, axes, test = rtre.draw(
    height=600, width=300,
    use_edge_lengths=False,
    tip_labels_style={"font-size": "10px"},
    tip_labels_colors=color_labels,
    # draw a marker only where get_node_values() returns a non-empty value
    node_sizes=[6 if value else 0 for value in rtre.get_node_values()],
    # black marker when support exceeds 0.85, white otherwise. The value is
    # compared as a float: int() would truncate fractional supports such as
    # 0.9 to 0, leaving only nodes with support >= 1 filled.
    node_colors=[
        'black' if (value and float(value) > 0.85) else 'white'
        for value in rtre.get_node_values('support', 1, 1)
    ],
    node_style={"stroke": "black", "stroke-width": 1},
);

In [19]:
# Write the current `canvas` to an SVG file in the notebook directory
toyplot.svg.render(canvas, "./SSPTree_astral_astral_w2mb_msnip10_mcov9_IMAPED_100_biggestScaff_aug2021_atLeast300snps.svg")

### Exploration with different SNP thresholds

In [46]:
# Threshold: windows with more than 300 SNPs (the cutoff also used for the ASTRAL input)
trees = table.loc[table["snps"] > 300, "tree"].tolist()

# Bundle the trees into a multitree object and root each one on "dentatum"
mtre = toytree.mtree(trees)
mtre.treelist = [tree.root("dentatum") for tree in mtre.treelist]

# Use the consensus tree to fix a common tip order
ctre = mtre.get_consensus_tree()

# Draw all trees as a cloud tree, ignoring edge lengths
canvas, axes, mark = mtre.draw_cloud_tree(
    width=600,
    height=600,
    use_edge_lengths=False,
    fixed_order=ctre.get_tip_labels(),
);

In [47]:
# Threshold: windows with more than 500 SNPs
trees = table.loc[table["snps"] > 500, "tree"].tolist()

# Bundle the trees into a multitree object and root each one on "dentatum"
mtre = toytree.mtree(trees)
mtre.treelist = [tree.root("dentatum") for tree in mtre.treelist]

# Use the consensus tree to fix a common tip order
ctre = mtre.get_consensus_tree()

# Draw all trees as a cloud tree, ignoring edge lengths
canvas, axes, mark = mtre.draw_cloud_tree(
    width=600,
    height=600,
    use_edge_lengths=False,
    fixed_order=ctre.get_tip_labels(),
);

In [59]:
# Threshold: windows with more than 600 SNPs
trees = table.loc[table["snps"] > 600, "tree"].tolist()

# Bundle the trees into a multitree object and root each one on "dentatum"
mtre = toytree.mtree(trees)
mtre.treelist = [tree.root("dentatum") for tree in mtre.treelist]

# Use the consensus tree to fix a common tip order
ctre = mtre.get_consensus_tree()

# Draw all trees as a cloud tree, ignoring edge lengths
canvas, axes, mark = mtre.draw_cloud_tree(
    width=600,
    height=600,
    use_edge_lengths=False,
    fixed_order=ctre.get_tip_labels(),
);

In [56]:
# Threshold: windows with more than 800 SNPs
trees = table.loc[table["snps"] > 800, "tree"].tolist()

# Bundle the trees into a multitree object and root each one on "dentatum"
mtre = toytree.mtree(trees)
mtre.treelist = [tree.root("dentatum") for tree in mtre.treelist]

# Use the consensus tree to fix a common tip order
ctre = mtre.get_consensus_tree()

# Draw all trees as a cloud tree, ignoring edge lengths
canvas, axes, mark = mtre.draw_cloud_tree(
    width=600,
    height=600,
    use_edge_lengths=False,
    fixed_order=ctre.get_tip_labels(),
);