## Load modules

In [None]:
import toytree
import toyplot.pdf
import pandas as pd
import ipyrad.analysis as ipa

## Load sequences

In [None]:
# HDF5 sequence database; it is handed to ipa.window_extracter in the cells below.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
SEQS = "/pinky/camayal/viburnumThings/HDF5/full_dataset.seqs.hdf5"

## Check scaffold lengths

In [None]:
# Rank every scaffold in the database by length (longest first) and keep the
# index labels of the ten longest ones for the window extraction further down.
scaffs = ipa.window_extracter(SEQS)
scaff_table = scaffs.scaffold_table.sort_values(
    "scaffold_length",
    ascending=False,
)
display(scaff_table)

scaff_toUse = scaff_table.head(10).index.tolist()
print(scaff_toUse)
print(scaff_table["scaffold_length"].sum())  # summed length of all scaffolds in the table
print(len(scaff_toUse))

### Window extractor to generate phy file
Using in this case the 10 largest scaffolds

In [None]:
# Import database
# Load the 'sample-data' sheet of the project spreadsheet through dbgdrive.
#
# The Google API key is a credential and must not live in the notebook, so it
# is read from the GOOGLE_API_KEY environment variable instead.
# NOTE(review): the key that used to be hardcoded here is exposed in the file
# history and should be revoked/rotated.
import os

import dbgdrive

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to the Google API key "
        "before loading the sample database."
    )

fulldata = dbgdrive.get_database(
    sheet_name='sample-data',
    id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs',
    api_key=GOOGLE_API_KEY,
)

In [None]:
# Fetch the ingroup and outgroup sample names of the "full_dataset_withAyava"
# set and collect them in the IMAP that is later passed to window_extracter.
ingroup, outgroup = dbgdrive.get_names("full_dataset_withAyava", fulldata)

IMAP = dict(outgroup=outgroup, ingroup=ingroup)

IMAP

In [None]:
# Total number of samples across all IMAP groups.
count = sum(len(members) for members in IMAP.values())

In [8]:
# Configure the window extracter on the sequence database for the scaffolds
# selected above (scaff_toUse), passing the ingroup/outgroup grouping as imap.
wex = ipa.window_extracter(
    data=SEQS,
    scaffold_idxs=scaff_toUse,
    # NOTE(review): presumably mincov is the minimum sample coverage per site and
    # rmincov the minimum site coverage per sample — confirm against the ipyrad docs.
    mincov=0.25,
    rmincov=0.1,
    # The name is reused as the file name of the .phy output written by wex.run().
    name="fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021",
    imap=IMAP,
)

In [9]:
# Run the extraction; this writes the concatenated alignment as a .phy file
# under ./analysis-window_extracter.
# NOTE(review): force=True presumably overwrites an existing output — confirm.
wex.run(force=True)

Wrote data to /pinky/camayal/viburnumThings/Viburnum-Oreinotinus/notebooks/Mar2021/analysis-window_extracter/fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021.phy


In [10]:
# Summary of the extracted alignment: sites, SNPs, proportion missing, number of samples.
wex.stats

Unnamed: 0,scaffold,start,end,sites,snps,missing,samples
0,concatenated,0,1830509,1830509,109825,0.507,180


### Run RAXML (BEST FOR NOW)

In [12]:
# Show the run name; it is reused as the RAxML job name in the next cell.
wex.name

'fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021'

In [13]:
# Set up RAxML on the alignment written by the window extracter, reusing its name.
# T, N and m show up as the -T, -N and -m options of the raxmlHPC command
# printed in the following cell.
rax = ipa.raxml(wex.outfile, name=wex.name, T=34, N=100, m="GTRCAT")

In [15]:
# Print the exact command line of the RAxML job, to keep a record of the settings.
print(rax.command)

/home/camayal/miniconda3/bin/raxmlHPC-PTHREADS-AVX2 -f a -T 34 -m GTRCAT -n fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021 -w /pinky/camayal/viburnumThings/Viburnum-Oreinotinus/notebooks/Mar2021/analysis-raxml -s /pinky/camayal/viburnumThings/Viburnum-Oreinotinus/notebooks/Mar2021/analysis-window_extracter/fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021.phy -p 54321 -N 100 -x 12345


In [16]:
# Launch the RAxML job configured above.
rax.run()

job fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021 finished successfully


#### Get final names and print RAXML results

In [119]:
# Load the sample metadata ('sample-data') and the leaf-type colour table
# ('color_leaf_type') from the project spreadsheet through dbgdrive.
#
# The Google API key is a credential and must not live in the notebook, so it
# is read from the GOOGLE_API_KEY environment variable instead.
# NOTE(review): the key that used to be hardcoded here is exposed in the file
# history and should be revoked/rotated.
import os

import dbgdrive

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to the Google API key "
        "before loading the sample database."
    )

fulldata = dbgdrive.get_database(
    sheet_name='sample-data',
    id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs',
    api_key=GOOGLE_API_KEY,
)
colors = dbgdrive.get_database(
    sheet_name='color_leaf_type',
    id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs',
    api_key=GOOGLE_API_KEY,
)

In [133]:
# Build the lookup tables used to relabel and colour the tree tips with
# journal-friendly names:
#   namedict  : assembly name -> "V. <species>", followed by " (<number>)"
#               when the sample has a publication number
#   colordict : "sp" entry -> colour (stored as a string) from the colour sheet
sdata = fulldata[["NameInAssembly","Lastest_SP_name", "Num_for_Publication", "UltimateName"]]

namedict = {}
for assembly_name, species, pub_number in zip(
    sdata["NameInAssembly"],
    sdata["Lastest_SP_name"],
    sdata["Num_for_Publication"],
):
    # Empty / missing publication numbers add nothing to the label.
    suffix = " (" + pub_number + ")" if pub_number else ""
    namedict[assembly_name] = f"V. {species}{suffix}"

# Manual override kept for reference:
# namedict["reference"] = "V. lautum 2"

colordata = colors[["sp","color"]]
colordict = {
    sp: str(color)
    for sp, color in zip(colordata["sp"], colordata["color"])
}

In [134]:
# RAxML bipartitions tree of the run above, as written to ./analysis-raxml.
treeFile = "./analysis-raxml/RAxML_bipartitions.fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021"
# Load it as a toytree object for rooting and drawing.
tre = toytree.tree(treeFile)

In [135]:
# Draw the full (uncollapsed) RAxML tree with publication names and support marks.

# Root the tree with the "dentatum" wildcard.
rtre = tre.root(wildcard="dentatum")

# Flip the child order at selected nodes so the layout follows geography.
# The coordinates are refreshed after every single flip, as the node indices
# looked up through idx_dict may depend on it.
# (Drawing with node_labels="idx" shows the node indices.)
for node_idx in (309, 262, 251, 252, 239, 233):
    rtre.idx_dict[node_idx].children.reverse()
    rtre._coords.update()

# Publication names, in the order of the tree's tip labels.
labels_updated = [namedict[tip] for tip in rtre.get_tip_labels()]

# Colour per label by leaf form: the last colordict key found inside the label
# wins, "Black" when none matches. (Currently not passed to draw.)
color_labels = []
for label in labels_updated:
    hits = [shade for key, shade in colordict.items() if label.find(key) > -1]
    color_labels.append(hits[-1] if hits else "Black")

# Nodes are not collapsed here; see the collapsed-tree cell for that variant.

# Support above this value is drawn as a filled marker without a label.
support_value_threshold = 84


def _well_supported(value):
    """Return True for a non-empty support value above support_value_threshold."""
    return bool(value) and int(value) > support_value_threshold


# Node values in the order expected by draw().
marker_flags = rtre.get_node_values()
support_for_colors = rtre.get_node_values('support', 1, 1)
support_for_labels = rtre.get_node_values('support', 1, 0)

canvas, axes, marks = rtre.draw(
    height=1800,
    width=600,
    use_edge_lengths=True,
    tip_labels_align=True,
    tip_labels_style={"font-size": "12px"},
    tip_labels=labels_updated,
    # markers only where get_node_values() returns a non-empty value
    node_sizes=[5 if flag else 0 for flag in marker_flags],
    # filled = above threshold, open = at or below threshold
    node_colors=[
        'black' if _well_supported(value) else 'white'
        for value in support_for_colors
    ],
    node_style={"stroke": "black", "stroke-width": 1},
    # print the support value only where it is not above the threshold
    node_labels=[
        '' if _well_supported(value) else value
        for value in support_for_labels
    ],
    node_labels_style={
        "-toyplot-anchor-shift": "10px",
        "baseline-shift": "0px",
        "text-shadow": "0.5px 0.5px #fff, -0.5px 0.5px #fff, 0.5px -0.5px #fff, -0.5px -0.5px #fff",
        "fill": "#000",
        "font-size": 8,
    },
);

In [136]:
# Save the canvas drawn above as an SVG file in the notebook directory.
toyplot.svg.render(canvas, "./RAxML_bipartitions.fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021.svg")

In [129]:
# Rename the tips in the tree itself (not only in the drawing): map each
# current tip label to its publication name, store it as the node "name",
# and write the renamed tree next to the original tree file.
updateddict = {tip: namedict[tip] for tip in rtre.get_tip_labels()}

testtre = rtre.set_node_values(feature="name", values=updateddict)

testtre.write(f"{treeFile}_RENAMED", tree_format=0)

In [131]:
# Draw the collapsed tree: same figure as the full tree, but nodes with support
# below the threshold are collapsed before drawing.
rtre = tre.root(wildcard="dentatum")

# Do some rotations to fit with geo. The node indices refer to the rooted,
# uncollapsed tree, so this has to happen before collapsing.
for i in [309,262,251,252,239,233]:
    rtre.idx_dict[i].children.reverse()
    rtre._coords.update()

# Define threshold (used both for collapsing and for the support markers)
support_value_threshold = 84

# Collapse weakly supported nodes
rtre = rtre.collapse_nodes(min_support=support_value_threshold)

# Set new names.
# collapse_nodes returns a new tree, so the labels are taken from that tree:
# the list passed to draw() is then guaranteed to line up with the tips that
# are actually drawn, even if collapsing changed the tip order.
labels_updated = [namedict[i] for i in rtre.get_tip_labels()]

# Set color based on leaf form: the last colordict key found inside a label
# wins, "Black" when none matches. (Currently not passed to draw.)
color_labels = []
for i in labels_updated:
    result = "Black"
    for key, item in colordict.items():
        if i.find(key) > -1:
            result = item
    color_labels.append(result)


canvas, axes, marks = rtre.draw(
    height=1800, width=600,
    use_edge_lengths=True,
    tip_labels_align=True,
    tip_labels_style={"font-size": "12px"},
    tip_labels=labels_updated,
#     tip_labels_colors=color_labels,
    # markers only where get_node_values() returns a non-empty value
    node_sizes=[5 if i else 0 for i in rtre.get_node_values()],
    # filled marker = support above threshold, open marker otherwise
    node_colors=['black' if (i and int(i) > support_value_threshold) else 'white' for i in rtre.get_node_values('support', 1, 1)],
    node_style={"stroke": "black", "stroke-width": 1},
    # print the support value only where it is not above the threshold
    node_labels=['' if (i and int(i) > support_value_threshold) else i for i in rtre.get_node_values('support', 1, 0)],
    node_labels_style= {
        "-toyplot-anchor-shift": "10px",
        "baseline-shift": "0px",
        "text-shadow": "0.5px 0.5px #fff, -0.5px 0.5px #fff, 0.5px -0.5px #fff, -0.5px -0.5px #fff",
        "fill": "#000",
        "font-size": 8,
    },
#     node_labels="idx",
);

In [132]:
# Save the collapsed-tree canvas drawn above as an SVG file in the notebook directory.
toyplot.svg.render(canvas, "./RAxML_bipartitions.fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021.COLLAPSED.svg")

## Extract specimens of this tree

In [4]:
# Flatten the IMAP groups into a single list of sample names, group by group.
list_names = [sample for members in IMAP.values() for sample in members]

list_names

['dentatum_ELS_004',
 'dentatum_ELS_015',
 'dentatum_ELS_027',
 'dentatum_ELS_052',
 'dentatum_ELS_072',
 'dentatum_ELS_082',
 '__EJE_617',
 '__EJE_619',
 '__EJE_629',
 '__EJE_630',
 '__EJE_631',
 'acutifolium_DRY3_MEX_006',
 'acutifolium_MEX_005',
 'acutifolium_MJD_011_tuton_111816',
 'acutifolium_MJD_012_tuton_111816',
 'acutifolium_MJD_60',
 'acutifolium_PWS_3050',
 'acutifolium_PWS_3059',
 'alpinum_PWS_3924',
 'anabaptista_PWS_2156',
 'anabaptista_PWS_2160',
 'anabaptista_PWS_2162',
 'anabaptista_PWS_2164',
 'anabaptista_PWS_2165',
 'anabaptista_PWS_2173',
 'ayavacense_PWS_3884',
 'ayavacense_PWS_3889',
 'ayavacense_PWS_3902',
 'ayavacense_PWS_3908',
 'blandum_EJE_618',
 'blandum_EJE_621',
 'blandum_PWS_3088',
 'caudatum_MJD_64',
 'caudatum_PWS_3211',
 'caudatum_PWS_3216',
 'caudatum_PWS_3221',
 'caudatum_PWS_3223',
 'caudatum_PWS_3223_M1',
 'ciliatum_C1',
 'ciliatum_PWS_3220',
 'ciliatum_PWS_3225',
 'ciliatum_SH2',
 'costaricanum_MJD_85',
 'disjunctum_EJE_615',
 'disjunctum_MJD_66

In [5]:
# Load the sample metadata ('sample-data') and the leaf-type colour table
# ('color_leaf_type') from the project spreadsheet through dbgdrive.
#
# The Google API key is a credential and must not live in the notebook, so it
# is read from the GOOGLE_API_KEY environment variable instead.
# NOTE(review): the key that used to be hardcoded here is exposed in the file
# history and should be revoked/rotated.
import os

import dbgdrive

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to the Google API key "
        "before loading the sample database."
    )

fulldata = dbgdrive.get_database(
    sheet_name='sample-data',
    id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs',
    api_key=GOOGLE_API_KEY,
)
colors = dbgdrive.get_database(
    sheet_name='color_leaf_type',
    id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs',
    api_key=GOOGLE_API_KEY,
)

In [25]:
# Build journal-friendly specimen names:
#   namedict  : assembly name -> "<first part>_<publication number>-<second part>",
#               where the two parts come from splitting UltimateName on "-"
#               (e.g. "acutifolium_1-MJD_011_tuton_111816"); the "_<number>"
#               piece is left out when the sample has no publication number
#   colordict : "sp" entry -> colour (stored as a string) from the colour sheet
sdata = fulldata[["NameInAssembly","UltimateName","Num_for_Publication"]]

namedict = {}
for assembly_name, ultimate_name, spnum in zip(
    sdata["NameInAssembly"],
    sdata["UltimateName"],
    sdata["Num_for_Publication"],
):
    part = ultimate_name.split("-")
    # A missing number (None) or an empty cell adds nothing; previously an
    # empty string produced a dangling "_" in the name.
    if not spnum:
        spnum = ""
    else:
        spnum = "_" + spnum
    namedict[assembly_name] = f"{part[0]}{spnum}-{part[1]}"

# Manual override kept for reference:
# namedict["reference"] = "V. lautum 2"

colordata = colors[["sp","color"]]
colordict = {
    sp: str(color)
    for sp, color in zip(colordata["sp"], colordata["color"])
}

In [26]:
# Publication names of all specimens in this tree, alphabetically.
sorted(namedict[sample] for sample in list_names)

['acutifolium_1-MJD_011_tuton_111816',
 'acutifolium_2-MJD_012_tuton_111816',
 'acutifolium_3-MJD_60',
 'acutifolium_4-DRY3_MEX_006',
 'acutifolium_5-MEX_005',
 'acutifolium_6-PWS_3050',
 'acutifolium_7-PWS_3059',
 'alpinum-PWS_3924',
 'ayavacense_1-PWS_4006',
 'ayavacense_2-PWS_4003',
 'ayavacense_3-PWS_4000',
 'blandum-EJE_630',
 'blandum_1-EJE_621',
 'blandum_2-PWS_3088',
 'caudatum_1-PWS_3223_M1',
 'caudatum_2-PWS_3223',
 'caudatum_3-MJD_64',
 'caudatum_4-PWS_3216',
 'caudatum_5-PWS_3211',
 'caudatum_6-PWS_3221',
 'ciliatum_1-PWS_3225',
 'ciliatum_2-SH2',
 'ciliatum_3-PWS_3220',
 'ciliatum_4-C1',
 'costaricanum-MJD_85',
 'dentatum_1-ELS_004',
 'dentatum_2-ELS_015',
 'dentatum_3-ELS_052',
 'dentatum_4-ELS_027',
 'dentatum_5-ELS_082',
 'dentatum_6-ELS_072',
 'discolor-EJE_629',
 'discolor-EJE_631',
 'disjunctum_1-MJD_66',
 'disjunctum_2-EJE_615',
 'dumatorum_1-PWS_3900',
 'dumatorum_2-PWS_3895',
 'fuscum_1-MJD_010_tuton_111816',
 'fuscum_2-EJE_604',
 'fuscum_3-MJD_009_tuton_111816',
