# Automated test generation for ABBA-BABA

This notebook contains a utility that creates n tests given the following conditions:
1. Three species to compare
2. Two species sharing a region
3. Two species sharing a morphology (in this case, pubescence)

It creates two sets of tests: morpho1-morpho2-morpho1 and morpho2-morpho1-morpho2.

To avoid distant comparisons (e.g. Mexico against Bolivia), a list of tuples of regions must be entered, as well as the number of tests to create.

In [1]:
import toytree
import pandas as pd
import numpy as np
import ipyrad.analysis as ipa

In [2]:
# Pairs of regions that are allowed to be compared with each other.
# Each tuple is (region_a, region_b); the test-generation cell iterates over
# these pairs and indexes them with 0/1, so every entry must have exactly
# two region names as they appear in the "Region" column of the regions sheet.
regions = [
    ("11th_area","mex_e"),
    ("11th_area","mex_w"),
    ("11th_area","central"),
    ("mex_e","mex_w"),
    ("mex_e","central"),
    ("mex_w","central"),
    ("central","costa"),
    ("central","jamaica"),
    ("col_e","col_w"),
    ("col_e","ecu_n"),
    ("col_w","ecu_n"),
    ("ecu_n","peru"),
    ("peru","bol"),
]

# bol
# central
# col_e
# col_w
# costa
# ecu_n
# jamaica
# mex_e
# mex_w
# peru
# 11th_area

In [3]:
# Load the RAxML bipartitions tree of the full dataset (with real ayava).
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
TREE = toytree.tree("/home/carlos/GDRIVE/viburnumThings/Viburnum-Oreinotinus/notebooks/Mar2021/RAxML_bipartitions.fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021")
# Rooted (on tips matching "dentatum") and ladderized copy of the tree.
rtree = TREE.root(wildcard = "dentatum").ladderize()

In [4]:
# Create one big imap: species name -> list of all sample names of that species
## import and load database
import dbgdrive
# NOTE(review): the API key below is hardcoded in the notebook; consider
# loading it from an environment variable and rotating the exposed key.
fulldata = dbgdrive.get_database(sheet_name='sample-data', 
                                 id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs', 
                                 api_key='AIzaSyCbfzhhYZg6f3nDSFF9hbHQOMWx2td611o')

## import collections to create an empty dict
from collections import defaultdict

## Get tips from TREE
tips = TREE.get_tip_labels()
# Set copy of the tip labels: membership is tested once per database row,
# and a set lookup avoids rescanning the whole label list every time.
tip_set = set(tips)

# species name (column "Lastest_SP_name") -> sample names ("NameInAssembly")
imap = defaultdict(list)
# go row by row
for index, row in fulldata.iterrows():
    # Keep only rows flagged as part of the full dataset whose sample is
    # present in the current tree; any other sample is skipped.
    # NOTE(review): the flag is tested by plain truthiness; if the sheet
    # delivers text such as "0" or "FALSE" those values are truthy - confirm.
    if row["full_dataset_withAyava"] and row["NameInAssembly"] in tip_set:
        imap[row["Lastest_SP_name"]].append(row["NameInAssembly"])

# imap

In [5]:
# Download the 'regions' sheet; later cells read its "Region", "Species"
# and "Pubescent" columns to build the tests.
# NOTE(review): the API key is hardcoded here; consider reading it from an
# environment variable and rotating the exposed key.
regions_sheet = dbgdrive.get_database(sheet_name='regions', 
                                id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs', 
                                api_key='AIzaSyCbfzhhYZg6f3nDSFF9hbHQOMWx2td611o')

In [6]:
#split dataframe in two dataframes
pubescent = regions_sheet[regions_sheet["Pubescent"] == "1"]
glabrous = regions_sheet[regions_sheet["Pubescent"] == "0"]

In [7]:
# Fixed seed for the random generator used to draw the tests below,
# so the random species picks are repeatable from a fresh kernel.
seed = 12345
rng = np.random.default_rng(seed)

In [8]:
# Create up to n_tests tests per region pair; for every test slot try at most
# max_tries random draws to obtain a test that was not generated before.
n_tests = 5
max_tries = 100

tests_ghg = {} # glabrous-(hairy-glabrous)
tests_hgh = {} # hairy-(glabrous-hairy)


def _species_in(sheet, region_name):
    """Return the "Species" column of the rows of `sheet` whose "Region" is `region_name`."""
    return sheet[sheet["Region"] == region_name]["Species"]


def _draw_unique_test(outer, inner, region, existing):
    """
    Draw one (p1, p2, p3) test for a pair of regions.

    p1 and p3 are taken from `outer` (one morph) and p2 from `inner` (the
    other morph). p1 comes from a randomly chosen region of the pair; p2 and
    p3 both come from the other region.

    Returns the new tuple, or None when no test could be produced: either a
    region has no species of the required morph (rng.choice raises ValueError
    on an empty selection) or max_tries draws only produced tests that are
    already in `existing` or that repeat a species.
    """
    for _ in range(max_tries):
        # 0 or 1: which region of the pair provides p1
        value = round(rng.random())
        try:
            candidate = (
                rng.choice(_species_in(outer, region[value])),
                rng.choice(_species_in(inner, region[1 - value])),
                rng.choice(_species_in(outer, region[1 - value])),
            )
        except ValueError:
            # Nothing to choose from in one of the regions: retrying cannot
            # help, so give up on this slot. Only this expected error is
            # silenced; anything else (e.g. a missing column) now surfaces.
            return None
        # A species listed in both regions can be drawn as p1 and p3 at once;
        # such a test is rejected here because create_imap_for_test raises
        # ValueError on repeated species and would abort the long baba run.
        if len(set(candidate)) == 3 and candidate not in existing:
            return candidate
    return None


for region in regions:
    tests_ghg[region] = []
    tests_hgh[region] = []

    for _ in range(n_tests):
        # The ghg draw is done before the hgh draw for every slot, keeping the
        # order in which random numbers are consumed.
        for outer, inner, bucket in (
            (glabrous, pubescent, tests_ghg[region]),
            (pubescent, glabrous, tests_hgh[region]),
        ):
            new_test = _draw_unique_test(outer, inner, region, bucket)
            if new_test is not None:
                bucket.append(new_test)

        

In [9]:
# Show the glabrous-(hairy-glabrous) tests generated for each region pair
tests_ghg

{('11th_area', 'mex_e'): [('stenocalyx', 'tiliaefolium', 'caudatum'),
  ('stenocalyx', 'tiliaefolium', 'hirsutum'),
  ('stenocalyx', 'microcarpum', 'hirsutum'),
  ('hirsutum', 'loeseneri', 'stenocalyx'),
  ('stenocalyx', 'tiliaefolium', 'ciliatum')],
 ('11th_area', 'mex_w'): [('membranaceum', 'loeseneri', 'stenocalyx'),
  ('acutifolium', 'loeseneri', 'stenocalyx'),
  ('microphyllum', 'loeseneri', 'stenocalyx'),
  ('stenocalyx', 'sulcatum', 'fuscum'),
  ('stenocalyx', 'sulcatum', 'microphyllum')],
 ('11th_area', 'central'): [('stenocalyx', 'disjunctum', 'obtusatum'),
  ('stenocalyx', 'disjunctum', 'lautum'),
  ('stenocalyx', 'disjunctum', 'hartwegii'),
  ('stenocalyx', 'jucundum', 'hartwegii'),
  ('stenocalyx', 'discolor', 'obtusatum')],
 ('mex_e', 'mex_w'): [('new_sp_2', 'microcarpum', 'hirsutum'),
  ('microphyllum', 'tiliaefolium', 'caudatum'),
  ('hirsutum', 'new_sp_1', 'microphyllum'),
  ('caudatum', 'sulcatum', 'new_sp_2'),
  ('ciliatum', 'sulcatum', 'new_sp_2')],
 ('mex_e', 'centr

In [10]:
# Show the hairy-(glabrous-hairy) tests generated for each region pair
tests_hgh

{('11th_area', 'mex_e'): [('tiliaefolium', 'stenocalyx', 'loeseneri'),
  ('microcarpum', 'stenocalyx', 'loeseneri'),
  ('loeseneri', 'caudatum', 'microcarpum'),
  ('loeseneri', 'caudatum', 'tiliaefolium'),
  ('loeseneri', 'ciliatum', 'microcarpum')],
 ('11th_area', 'mex_w'): [('loeseneri', 'acutifolium', 'new_sp_1'),
  ('sulcatum', 'stenocalyx', 'loeseneri'),
  ('new_sp_1', 'stenocalyx', 'loeseneri'),
  ('loeseneri', 'fuscum', 'sulcatum'),
  ('loeseneri', 'new_sp_2', 'new_sp_1')],
 ('11th_area', 'central'): [('loeseneri', 'hartwegii', 'discolor'),
  ('discolor', 'stenocalyx', 'loeseneri'),
  ('loeseneri', 'lautum', 'discolor'),
  ('loeseneri', 'obtusatum', 'discolor'),
  ('loeseneri', 'hartwegii', 'jucundum')],
 ('mex_e', 'mex_w'): [('tiliaefolium', 'fuscum', 'new_sp_1'),
  ('tiliaefolium', 'new_sp_2', 'new_sp_1'),
  ('tiliaefolium', 'acutifolium', 'new_sp_1'),
  ('sulcatum', 'ciliatum', 'tiliaefolium'),
  ('microcarpum', 'membranaceum', 'new_sp_1')],
 ('mex_e', 'central'): [('microcar

## Common function definitions

In [11]:
#Function that creates different imaps base on pair of species
import random

def create_imap_for_test(imap, p1, p2, p3, p4="dentatum"):
    """
    Build the four-population imap of a single test.

    Given the species names, fill a dictionary with all samples associated
    with each of those species in the main imap.

    Parameters
    ----------
    imap : mapping
        Main imap: species name -> list of sample names.
    p1, p2, p3 : str
        Species names of the three compared populations; they must all be
        different from each other.
    p4 : str
        Species name used as the fourth population (default "dentatum").

    Returns
    -------
    dict
        Keys "p4", "p3", "p2" and "p1", each holding the list stored in
        `imap` for the corresponding species (the same list objects, not
        copies).

    Raises
    ------
    ValueError
        If p1, p2 and p3 are not three different species.
    """
    # Validate before touching imap: when imap is a defaultdict, every lookup
    # of an unknown species inserts an empty entry, so failing first keeps
    # the main imap unchanged for rejected tests.
    if len({p1, p2, p3}) < 3:
        raise ValueError(f"Species must be different. sp1:{p1}, sp2:{p2}, and sp3:{p3}")
    return {
        "p4": imap[p4],
        "p3": imap[p3],
        "p2": imap[p2],
        "p1": imap[p1],
    }

In [12]:
#Create baba object with dataset of few specimens per species
SEQS = "/home/deren/Documents/Viburnum-Oreinotinus/assembly_hic_feb/full_dataset_outfiles/full_dataset.snps.hdf5"
baba_obj = ipa.baba2(SEQS)

In [13]:
# Load the RAxML bipartitions tree of the full dataset (with real ayava).
# NOTE(review): these two statements repeat, unchanged, the tree-loading cell
# near the top of the notebook; the path is absolute and machine-specific.
TREE = toytree.tree("/home/carlos/GDRIVE/viburnumThings/Viburnum-Oreinotinus/notebooks/Mar2021/RAxML_bipartitions.fulldataset_withAyava_10scaff_mcov025_rmcov01_mar2021")
# Rooted (on tips matching "dentatum") and ladderized copy of the tree.
rtree = TREE.root(wildcard = "dentatum").ladderize()

In [14]:
# Flatten both test dictionaries into a single list: every ghg test first
# (in region order), followed by every hgh test. Region pairs for which no
# test could be generated contribute nothing.
fulllist = [
    test
    for tests_by_region in (tests_ghg, tests_hgh)
    for region_tests in tests_by_region.values()
    if region_tests
    for test in region_tests
]

In [None]:
# Run the baba module in "slow mode": only batch_size tests are run at a time
# and their tables are written to disk before the next batch starts, so the
# whole job can run for days using only a few cores.
import time
from pathlib import Path

batch_size = 5
results_dir = Path("individual_ABBAs/result_tables")
taxon_dir = Path("individual_ABBAs/taxon_tables")

# to_csv does not create missing folders; make them before the first batch so
# a long computation is not lost when its results are about to be saved.
results_dir.mkdir(parents=True, exist_ok=True)
taxon_dir.mkdir(parents=True, exist_ok=True)

# 1-based batch number, used as the prefix of the output file names
count = 0

for i in range(0, len(fulllist), batch_size):
    # one imap per test of this batch
    imaps = [
        create_imap_for_test(imap, test[0], test[1], test[2])
        for test in fulllist[i:i + batch_size]
    ]
    # run baba just for this subset of elements
    count += 1
    baba_obj.run(imaps, nboots=100, retain_boots=True)
    baba_obj.results_table.to_csv(results_dir / f"{count}-results_table.csv")
    baba_obj.taxon_table.to_csv(taxon_dir / f"{count}-taxon_table.csv")

In [37]:
import abbaplot
import glob
import toyplot

In [38]:
def concat_csv_tables(pattern):
    """
    Read every CSV file matching the glob `pattern` and stack them row-wise.

    Files are read in sorted (lexicographic) path order and the result gets a
    fresh 0..n-1 index. An empty DataFrame is returned when nothing matches.

    DataFrame.append was removed in pandas 2.0, and growing a frame inside a
    loop copies it on every step, so all files are read first and joined with
    a single pd.concat call.
    """
    frames = [pd.read_csv(path) for path in sorted(glob.glob(pattern))]
    if not frames:
        # pd.concat raises on an empty list; keep the "no files yet" case usable
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Both folders hold one file per batch with the same numeric prefix, so the
# same sort order keeps the rows of the two tables aligned with each other.
rtable = concat_csv_tables("individual_ABBAs/result_tables/*")
ttable = concat_csv_tables("individual_ABBAs/taxon_tables/*")


print(rtable.shape, ttable.shape)

(114, 9) (114, 5)


In [61]:
# tests=[1,2]

# ttable = pd.read_csv(f"analysis_abba/{test_name}-taxon_table.csv")

# parse the taxon table and get a short version of the names (list of lists)
names, images = abbaplot.get_names_n_images_from_imap(
    ttable,
#     tests=tests,
    imap=imap,
    images_dir="/home/carlos/GDRIVE/viburnumThings/Viburnum-Oreinotinus/notebooks/silhouettes/",
    images_suffix=".png",
)


# Use abbreviations instead of full names (abbreviations from the database).
# The sheet is stored under its own name: the test-generation cell iterates
# over the list called `regions`, so rebinding `regions` to a DataFrame here
# would break any re-run of that cell.
# NOTE(review): hardcoded API key; consider an environment variable.
abbrev_sheet = dbgdrive.get_database(sheet_name='regions', 
                                id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs', 
                                api_key='AIzaSyCbfzhhYZg6f3nDSFF9hbHQOMWx2td611o')


for i, set_names in enumerate(names):
    for j, name in enumerate(set_names):
        matches = abbrev_sheet[abbrev_sheet["Species"] == name]["Abrev."]
        if matches.empty:
            # name the offending species instead of failing with a bare IndexError
            raise KeyError(f"Species {name!r} has no row in the 'regions' sheet")
        # first matching row wins when a species is listed more than once
        names[i][j] = matches.values[0]



canvas = toyplot.Canvas(1100, 150)

# load d-stats result table
# rtable = pd.read_csv(f"analysis_abba/{test_name}-results_table.csv")

# plot results (names/images are prepared above but currently not passed)
fig = abbaplot.abbaplot(rtable, 
#          tests=tests,
             canvas=canvas,
             forced_margin=0.00, 
#              names=names, 
#              images=images, 
             points_size=1, 
             xsizeylabel=0.1,
             z_threshold=5,
             max_value=0.8,
            )

In [62]:
# Write the object returned by abbaplot.abbaplot to an SVG file in the
# current working directory.
import toyplot.svg
toyplot.svg.render(fig, "114tests.svg")