# Species level tree reconstruction

## Load modules

In [None]:
import ipyrad.analysis as ipa
import pandas as pd
import toyplot.pdf
import toyplot.svg
import toytree

## Matrix preparation

### Load sequences

In [2]:
# Path (relative to this notebook) of the .seqs.hdf5 sequence database that is
# passed to ipa.window_extracter in the cells below.
SEQS = "../../Raw_data/full_dataset.seqs.hdf5"

### Check scaffold lengths

In [None]:
# Build a window_extracter on the full database just to inspect its scaffold table.
scaffs = ipa.window_extracter(SEQS)

# Rank scaffolds from longest to shortest and show the ranked table.
scaff_table = scaffs.scaffold_table.sort_values("scaffold_length", ascending=False)
display(scaff_table)

# Keep the index labels of the 100 longest scaffolds for the extraction step.
scaff_toUse = scaff_table.head(100).index.tolist()

# Report: the selected scaffolds, the summed length of ALL scaffolds in the
# table (not only the selected ones), and how many scaffolds were kept.
print(scaff_toUse)
print(scaff_table["scaffold_length"].sum())
print(len(scaff_toUse))

### Window extractor to generate phy file

In [None]:
# Load the sample database.
fulldata = pd.read_csv("../../Raw_data/oreinotinus_samples_database.csv")

# Split the assembly sample names by the `spp_dataset` flag:
# rows flagged "1" form the ingroup, rows flagged "out" form the outgroup.
ingroup = fulldata.loc[fulldata["spp_dataset"] == "1", "NameInAssembly"].tolist()
outgroup = fulldata.loc[fulldata["spp_dataset"] == "out", "NameInAssembly"].tolist()

In [19]:
# Compose the imap dictionary: group label -> list of sample names.
# "ayavacense_PWS_4006" is added to the ingroup by hand; it is the only real
# ayavacense sample (from the Feb 2021 plate) that is included.
IMAP = {
    "outgroup": outgroup,
    "ingroup": [*ingroup, "ayavacense_PWS_4006"],
}

In [17]:
# Define the window extracter object and its parameters.
# The object is only configured here; the alignment is written when wex.run() is called.
wex = ipa.window_extracter(
    data=SEQS,
    # restrict the extraction to the 100 longest scaffolds selected earlier
    scaffold_idxs=scaff_toUse,
    # NOTE(review): presumably the minimum proportion of samples that must have
    # data for a site to be kept — confirm in the ipyrad docs
    mincov=0.25,
    # NOTE(review): presumably the minimum proportion of sites a sample must have
    # to be kept in the alignment — confirm in the ipyrad docs
    rmincov=0.1,
    # the name is reused for the output .phy file and, later, for the RAxML run
    name="splvl_withrealAyava_100scaff_mcov025_rmcov01_mar2021",
    # ingroup/outgroup sample lists defined in IMAP
    imap=IMAP,
)

In [18]:
# Run the window extracter; it writes the concatenated .phy alignment to disk.
# NOTE(review): force=True presumably overwrites an existing output with the same name — confirm.
wex.run(force=True)

Wrote data to /home/deren/Documents/jhub-mount/Viburnum-Oreinotinus/notebooks/Mar2021/analysis-window_extracter/splvl_withrealAyava_100scaff_mcov025_rmcov01_mar2021.phy


In [19]:
# Display the summary of the extracted alignment: number of sites, SNPs,
# proportion of missing data and number of samples.
wex.stats

Unnamed: 0,scaffold,start,end,sites,snps,missing,samples
0,concatenated,0,7686800,7686800,161791,0.481,41


## Analysis

### Run RAxML

In [20]:
# Show the window extracter name; it is reused as the RAxML run name below.
wex.name

'splvl_withrealAyava_100scaff_mcov025_rmcov01_mar2021'

In [21]:
# Define the RAxML object and its parameters. The input is the .phy alignment
# written by the window extracter, and the run reuses the same name.
# T, N and m end up as the -T 34, -N 100 and -m GTRCAT options of the raxml
# command line (see the printed command in the next cell).
rax = ipa.raxml(wex.outfile, name=wex.name, T=34, N=100, m="GTRCAT")

In [22]:
# Print the full raxml command line (binary, options, seeds, input and output
# paths) so it can be checked before launching the run.
print(rax.command)

/home/carlos/anaconda3/bin/raxmlHPC-PTHREADS-AVX2 -f a -T 34 -m GTRCAT -n splvl_withrealAyava_100scaff_mcov025_rmcov01_mar2021 -w /home/deren/Documents/jhub-mount/Viburnum-Oreinotinus/notebooks/Mar2021/analysis-raxml -s /home/deren/Documents/jhub-mount/Viburnum-Oreinotinus/notebooks/Mar2021/analysis-window_extracter/splvl_withrealAyava_100scaff_mcov025_rmcov01_mar2021.phy -p 54321 -N 100 -x 12345


In [None]:
# Run RAxML with the command printed above.
# NOTE(review): with -N 100 on this alignment this is presumably a long-running cell.
rax.run()

## Tree

### Get final names and plot RAxML results

In [6]:
# Import the sample database (same CSV as loaded in the matrix-preparation section).
fulldata = pd.read_csv("../../Raw_data/oreinotinus_samples_database.csv")

# Import the color codes; the "Species" and "Color" columns are used below
# to color the tip labels.
colors = pd.read_csv("../../Raw_data/oreinotinus_color_codes.csv")

In [5]:
# Map the names used in the assembly to journal-friendly names.
sdata = fulldata[["NameInAssembly", "Lastest_SP_name"]]
namedict = {
    assembly_name: f"{final_name}"
    for assembly_name, final_name in zip(sdata.iloc[:, 0], sdata.iloc[:, 1])
}

# Map each entry of the "Species" column to its color, stored as a string.
colordata = colors[["Species", "Color"]]
colordict = dict(zip(colordata.iloc[:, 0], map(str, colordata.iloc[:, 1])))

In [6]:
# Reload the resulting tree from the RAxML output directory.
# NOTE(review): the run name is hard-coded in this path; keep it in sync with
# the name given to the window extracter / RAxML run if that is ever changed.
treeFile = "./analysis-raxml/RAxML_bipartitions.splvl_withrealAyava_100scaff_mcov025_rmcov01_mar2021"
tre = toytree.tree(treeFile)

In [11]:
# Root the tree.
# NOTE(review): wildcard="dentatum" presumably selects the tips whose names
# contain "dentatum" as the outgroup — confirm in the toytree docs.
rtre = tre.root(wildcard="dentatum")

# Do some rotations so the tip order fits with geography: the child order is
# reversed at each listed node. The node idx values are hard-coded for this tree.
# NOTE(review): `_coords.update()` is a private toytree call that runs after
# every flip; it presumably recomputes the layout (and possibly node idxs), so
# the order of this list may matter — do not reorder or hoist without checking.
for i in [66,70,73,51,49,41,42,43,44]:
    rtre.idx_dict[i].children.reverse()
    rtre._coords.update()

# Set new names: translate each tip label to its final name, keeping tip order.
labels_updated = [namedict[tip] for tip in rtre.get_tip_labels()]

# Set the color of each tip label (presumably encoding leaf form): default is
# "Black"; otherwise use the color of the last colordict key that occurs as a
# substring of the label.
color_labels = []
for label in labels_updated:
    matched = [color for species, color in colordict.items() if species in label]
    color_labels.append(matched[-1] if matched else "Black")


# Optional step, currently disabled: collapse weakly supported nodes.
# rtre = rtre.collapse_nodes(min_support=75)

# Support threshold: nodes with support strictly greater than this value are
# drawn as filled black markers without a label; the others are drawn white
# and labeled with their support value.
support_value_threshold = 84

# Draw the rooted tree with the final names and colored tip labels.
canvas, axes, marks = rtre.draw(
    height=600, width=400, 
    use_edge_lengths=True,
    tip_labels_align=True,
    tip_labels_style={"font-size": "10px"},
    tip_labels=labels_updated,
    tip_labels_colors=color_labels,
    # marker size 6 where get_node_values() gives a truthy value, 0 otherwise
    # NOTE(review): presumably this hides markers on tips and root — confirm in the toytree docs
    node_sizes=[6 if i else 0 for i in rtre.get_node_values()],
    # black marker when support > threshold, white otherwise (also for empty values)
    node_colors=['black' if (i and int(i) > support_value_threshold) else 'white' for i in rtre.get_node_values('support', 1, 1)],
#     node_colors=colors,
    node_style={"stroke": "black", "stroke-width": 1},
#     node_labels="support"
    # blank label when support > threshold, otherwise the value returned for the node
    # NOTE(review): the two positional flags differ from the node_colors call (1, 0 vs 1, 1);
    # presumably they are show_root / show_tips — confirm the lists stay aligned
    node_labels=['' if (i and int(i) > support_value_threshold) else i for i in rtre.get_node_values('support', 1, 0)],
#     node_labels="idx",
    node_labels_style= {
        "-toyplot-anchor-shift": "10px",
        "baseline-shift": "0px",
        "text-shadow": "0.5px 0.5px #fff, -0.5px 0.5px #fff, 0.5px -0.5px #fff, -0.5px -0.5px #fff",
        "fill": "#000",
        "font-size": 7,
    },
    
);

In [10]:
# Export the tree figure held in `canvas` as an SVG file next to this notebook.
# toyplot.svg is imported in the imports cell at the top of the notebook.
toyplot.svg.render(canvas, "./RAxML_bipartitions.splvl_withrealAyava_100scaff_mcov025_rmcov01_mar2021.svg")

In [10]:
# Rename the tips in the tree itself (not only on the figure): build a new tree
# whose tip names are the final names and export it next to the original file.
updateddict = {tip: namedict[tip] for tip in rtre.get_tip_labels()}

testtre = rtre.set_node_values(feature="name", values=updateddict)

testtre.write(f"{treeFile}_RENAMED", tree_format=0)