## Tables

The tables generated below are used both as guide trees in the progressive
alignment procedure and as trees used in the clustering step
of the regressive alignment procedure.


The generated tables can be found in the `results/tables/` directory of this repository.



In [None]:
##########################
# Import python packages #
##########################

import os
import csv

In [None]:
################################################
# Change directory relative to repository home #
################################################

# Remember the directory the cell was started from, then step to its parent
# so that the relative paths used in the following cells resolve from there.
pwd = os.getcwd()
work_dir = "/".join([pwd, ".."])
os.chdir(work_dir)

# Last expression of the cell: shows the directory now in effect.
os.getcwd()

In [8]:
####################################################
# Read in the number of sequences for each dataset #
####################################################

# data/num_seqs.csv is read as tab-separated text: column 0 becomes the key
# and column 1 the value of sizes_dict.  Values are kept as the strings read
# from the file; code that needs a number converts them with int().
# newline='' is the mode the csv module asks for on files handed to it.
with open("data/num_seqs.csv", mode='r', newline='') as infile:
    reader = csv.reader(infile, delimiter='\t')
    # csv.reader yields an empty list for a blank line; skip those instead of
    # failing with IndexError on row[0].
    sizes_dict = {row[0]: row[1] for row in reader if row}

print(sizes_dict)

{'seatoxin': '93', 'hip': '167', 'scorptoxin': '363', 'cyt3': '385', 'rnasemam': '498', 'bowman': '499', 'toxin': '508', 'ghf11': '521', 'TNF': '556', 'sti': '613', 'Stap_Strp_toxin': '640', 'profilin': '687', 'ricin': '747', 'ghf22': '760', 'ChtBD': '774', 'ins': '793', 'trfl': '837', 'slectin': '932', 'phoslip': '946', 'ltn': '1068', 'il8': '1073', 'az': '1086', 'kringle': '1091', 'cryst': '1160', 'DEATH': '1183', 'cah': '1379', 'mmp': '1427', 'rub': '1435', 'ghf10': '1502', 'tgfb': '1606', 'sodcu': '2038', 'KAS': '2070', 'DMRL_synthase': '2099', 'tms': '2118', 'GEL': '2195', 'kunitz': '2266', 'Sulfotransfer': '2489', 'mofe': '2567', 'Ald_Xan_dh_2': '2589', 'ghf5': '2717', 'phc': '2957', 'aadh': '3127', 'annexin': '3139', 'serpin': '3144', 'cytb': '3206', 'asp': '3262', 'oxidored_q6': '3348', 'hpr': '3349', 'hormone_rec': '3509', 'hr': '3707', 'tim': '3904', 'glob': '3983', 'ace': '3989', 'cys': '4316', 'ghf1': '4358', 'sodfe': '4455', 'peroxidase': '4514', 'uce': '4558', 'flav': '46

In [9]:
#################################################
# Function to read in accuracy and scores files #
#################################################

def scores_to_dict(scores_dir, scores_dict, tag):
    """Load every score file found in ``scores_dir`` into ``scores_dict``.

    Each name in the directory must consist of exactly six dot-separated
    fields::

        <family>.<align_type>.<bucket>.<aligner>.<tree>.<score_type>

    The content of each file, with trailing whitespace removed, is stored as
    a string under the key
    ``(tag, align_type, aligner, tree, family, score_type)``.  The ``bucket``
    field is parsed but is not part of the key.

    Args:
        scores_dir: Directory holding the score files.  A trailing path
            separator is accepted but not required.
        scores_dict: Dictionary to fill; it is updated in place.
        tag: Value stored as the first element of every key added by this
            call.

    Returns:
        The same ``scores_dict`` object that was passed in.

    Raises:
        ValueError: If a name in ``scores_dir`` does not split into exactly
            six dot-separated fields.
    """
    for score_file in os.listdir(scores_dir):
        family, align_type, _bucket, aligner, tree, score_type = score_file.split('.')
        key = (tag, align_type, aligner, tree, family, score_type)
        # os.path.join works with or without a trailing separator on
        # scores_dir; plain string concatenation required the trailing "/".
        with open(os.path.join(scores_dir, score_file), 'r') as infile:
            scores_dict[key] = infile.read().rstrip()
    return scores_dict

In [11]:
##################################################
# Read in the full and reference datasets scores #
##################################################

# Directories that hold the individual score files of the two runs.
full_scores_dir = "results/individual_scores/"
ref_scores_dir = "results_reference/individual_scores/"

# Both directories are loaded into one dictionary; the tag ("full" / "ref")
# is passed to scores_to_dict, which stores it as the first key element.
scores_dict = {}
for tag, directory in (("full", full_scores_dir), ("ref", ref_scores_dir)):
    scores_dict = scores_to_dict(directory, scores_dict, tag)


In [21]:
#################################################################################
# Create a dictionary for average TC scores and CPU time when above 10,000 seqs #
#################################################################################

# Selection thresholds shared by the three averages computed below.
MIN_SEQS = 10000        # a family is included only above this many sequences
N_LARGE_FAMILIES = 20   # an average is kept only when exactly this many scores contribute


def _large_family_averages(scores, sizes, tag, score_type):
    """Average the scores of the large families for each method combination.

    Args:
        scores: Mapping of
            ``(tag, align_type, aligner, tree, family, score_type)`` to a
            score stored as a string.
        sizes: Mapping of family name to its number of sequences (anything
            ``int()`` accepts).
        tag: Only entries whose first key element equals this are used.
        score_type: Only entries whose last key element equals this are used.

    Returns:
        A dict mapping ``(align_type, aligner, tree)`` to the mean score over
        the families with more than ``MIN_SEQS`` sequences.  Combinations
        that do not have exactly ``N_LARGE_FAMILIES`` such scores are left
        out.

    Raises:
        KeyError: If a selected family is missing from ``sizes``.
        ValueError: If a selected size or score is not numeric.
    """
    # One pass over the scores, grouping by combination.  Values are appended
    # in the iteration order of ``scores``, so each sum adds its terms in the
    # same order as a per-combination scan of the dictionary would.
    grouped = {}
    for key, value in scores.items():
        entry_tag, align_type, aligner_name, tree_name, family, entry_type = key
        if entry_tag != tag or entry_type != score_type:
            continue
        if int(sizes[family]) <= MIN_SEQS:
            continue
        grouped.setdefault((align_type, aligner_name, tree_name), []).append(float(value))

    return {
        combo: sum(values) / float(len(values))
        for combo, values in grouped.items()
        if len(values) == N_LARGE_FAMILIES
    }


# Every align_type/aligner/tree combination present in the scores, as
# "align_type/aligner/tree" strings.  Not needed by the averages below; kept
# because other cells may refer to this name.
datasets = {'/'.join(k[1:4]) for k in scores_dict}

# Average TC score of the full datasets with over 10,000 sequences
full_top20_tc_avg_dict = _large_family_averages(scores_dict, sizes_dict, 'full', 'tc')

# Average TC score of the reference datasets (same family selection)
ref_top20_tc_avg_dict = _large_family_averages(scores_dict, sizes_dict, 'ref', 'tc')
for combo, avg in ref_top20_tc_avg_dict.items():
    # Echo the reference averages in the "[align_type, aligner, tree] value"
    # form this cell has always printed.
    print(list(combo), avg)

# Average CPU time of the full datasets with over 10,000 sequences
cpu_top20_dict = _large_family_averages(scores_dict, sizes_dict, 'full', 'cpu')

['default_align', 'MAFFT-SPARSECORE', 'DEFAULT'] 53.504999999999995
['dpa_align', 'MAFFT-SPARSECORE', 'MAFFT-FFTNS1'] 53.50500000000001
['std_align', 'MAFFT-GINSI', 'MAFFT-FFTNS1'] 52.605
['dpa_align', 'MAFFT-SPARSECORE', 'CLUSTALO'] 53.50500000000001
['dpa_align', 'UPP', 'MAFFT_PARTTREE'] 49.90500000000001
['dpa_align', 'MAFFT-FFTNS1', 'MAFFT-FFTNS1'] 47.989999999999995
['dpa_align', 'MAFFT-SPARSECORE', 'MAFFT_PARTTREE'] 53.504999999999995
['dpa_align', 'MAFFT-GINSI', 'MAFFT-FFTNS1'] 53.504999999999995
['std_align', 'MAFFT-GINSI', 'CLUSTALO'] 53.06999999999999
['std_align', 'MAFFT-FFTNS1', 'MAFFT-FFTNS1'] 47.99
['dpa_align', 'CLUSTALO', 'MAFFT-FFTNS1'] 53.71
['std_align', 'CLUSTALO', 'MAFFT-FFTNS1'] 50.53999999999999
['dpa_align', 'UPP', 'CLUSTALO'] 49.89
['std_align', 'MAFFT-GINSI', 'MAFFT_PARTTREE'] 49.459999999999994
['default_align', 'UPP', 'DEFAULT'] 49.78000000000001
['dpa_align', 'MAFFT-FFTNS1', 'CLUSTALO'] 47.99
['dpa_align', 'UPP', 'MAFFT-FFTNS1'] 49.845
['dpa_align', 'CLUSTA

In [22]:
#########################
# Export Table 1 as CSV #
#########################

f_dic, r_dic, cpu_dic = full_top20_tc_avg_dict, ref_top20_tc_avg_dict, cpu_top20_dict


def _table1_row(tree_label, aligner_label, base_key, reg_key, has_non_regressive=True):
    """Build one data row of Table 1.

    Args:
        tree_label: Text for the 'tree method' column.
        aligner_label: Text for the 'alignment method' column.
        base_key: ``(align_type, aligner, tree)`` key of the non-regressive
            run; also used to look up the reference score.
        reg_key: ``(align_type, aligner, tree)`` key of the regressive run.
        has_non_regressive: When False, the non-regressive score and CPU
            columns are written as '-' instead of being looked up.

    Returns:
        A list of seven cells in the column order of the Table 1 header.

    Raises:
        KeyError: If a required key is missing from one of the average
            dictionaries.
    """
    return [
        tree_label,
        aligner_label,
        f_dic[base_key] if has_non_regressive else '-',
        f_dic[reg_key],
        r_dic[base_key],
        cpu_dic[base_key] if has_non_regressive else '-',
        cpu_dic[reg_key],
    ]


# All rows are assembled BEFORE the output file is opened, so a missing key
# raises without truncating an existing table1.csv.
table1_rows = [
    ['', '', 'non-regressive', 'regressive', 'reference', 'non-regressive', 'regressive'],
    ['tree method', 'alignment method', 'score %', 'score %', 'score %', 'cpu time (ms)', 'cpu time (ms)'],

    _table1_row('parttree', 'fftns1',
                ('std_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE'),
                ('dpa_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE')),
    _table1_row('mbed', 'fftns1',
                ('std_align', 'MAFFT-FFTNS1', 'CLUSTALO'),
                ('dpa_align', 'MAFFT-FFTNS1', 'CLUSTALO')),
    _table1_row('parttree', 'clustalo',
                ('std_align', 'CLUSTALO', 'MAFFT_PARTTREE'),
                ('dpa_align', 'CLUSTALO', 'MAFFT_PARTTREE')),
    _table1_row('mbed', 'clustalo',
                ('std_align', 'CLUSTALO', 'CLUSTALO'),
                ('dpa_align', 'CLUSTALO', 'CLUSTALO')),

    # NOTE(review): the 'average' row is written with empty cells; no average
    # is computed in this cell.
    ['average', '', '', '', '', '', ''],
    ['', '', '', '', '', '', ''],

    # UPP and SPARSECORE: the non-regressive run uses the DEFAULT tree, the
    # regressive run uses the CLUSTALO tree (hence the 'default/mbed' label).
    _table1_row('default/mbed', 'upp',
                ('default_align', 'UPP', 'DEFAULT'),
                ('dpa_align', 'UPP', 'CLUSTALO')),
    _table1_row('default/mbed', 'sparsecore',
                ('default_align', 'MAFFT-SPARSECORE', 'DEFAULT'),
                ('dpa_align', 'MAFFT-SPARSECORE', 'CLUSTALO')),

    # GINSI: the non-regressive score and CPU columns are written as '-'
    # (presumably no non-regressive full-dataset result exists - confirm);
    # the reference score still comes from the std_align key.
    _table1_row('parttree', 'ginsi',
                ('std_align', 'MAFFT-GINSI', 'MAFFT_PARTTREE'),
                ('dpa_align', 'MAFFT-GINSI', 'MAFFT_PARTTREE'),
                has_non_regressive=False),
    _table1_row('mbed', 'ginsi',
                ('std_align', 'MAFFT-GINSI', 'CLUSTALO'),
                ('dpa_align', 'MAFFT-GINSI', 'CLUSTALO'),
                has_non_regressive=False),
]

# newline='' lets the csv writer control line endings itself; without it an
# extra blank line follows every row on platforms that translate '\n'.
with open('results/tables/table1.csv', 'w', newline='') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=',')
    filewriter.writerows(table1_rows)


In [24]:
#######################################
# Export all raw results as CSV table #
#######################################

# One CSV per score type: results/tables/full_table_<score_type>.csv.
# Each row is: tag, align_type, aligner, tree, family, number of sequences of
# the family (as stored in sizes_dict), score (as stored in scores_dict).
for score_type in ('sp', 'tc', 'cpu'):
    table_path = 'results/tables/full_table_{}.csv'.format(score_type)
    # newline='' lets the csv writer control line endings itself; without it
    # an extra blank line follows every row on platforms that translate '\n'.
    with open(table_path, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for e, score in scores_dict.items():
            # e[5] is the score type and e[4] the family of this entry.
            if e[5] == score_type:
                filewriter.writerow(list(e[:5]) + [sizes_dict[e[4]], score])

In [None]:
########################
# Export table for CCA #
########################

f_dic = full_top20_tc_avg_dict

# (reg_method, tree_method, align_method, key into f_dic), in output order.
# The key is (align_type, aligner, tree) as used by the average dictionaries.
cca_entries = [
    ('non-reg', 'parttree', 'fft-ns-1', ('std_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE')),
    ('non-reg', 'mBed', 'fft-ns-1', ('std_align', 'MAFFT-FFTNS1', 'CLUSTALO')),
    # Label was misspelled 'partree', which made this row a separate
    # tree_method category from every other parttree row.
    ('non-reg', 'parttree', 'clustalo', ('std_align', 'CLUSTALO', 'MAFFT_PARTTREE')),
    ('non-reg', 'mBed', 'clustalo', ('std_align', 'CLUSTALO', 'CLUSTALO')),

    ('non-reg', 'uppT', 'upp', ('default_align', 'UPP', 'DEFAULT')),
    ('non-reg', 'sparsecoreT', 'sparsecore', ('default_align', 'MAFFT-SPARSECORE', 'DEFAULT')),

    ('reg', 'parttree', 'fft-ns-1', ('dpa_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE')),
    ('reg', 'mBed', 'fft-ns-1', ('dpa_align', 'MAFFT-FFTNS1', 'CLUSTALO')),
    ('reg', 'parttree', 'clustalo', ('dpa_align', 'CLUSTALO', 'MAFFT_PARTTREE')),
    ('reg', 'mBed', 'clustalo', ('dpa_align', 'CLUSTALO', 'CLUSTALO')),

    ('reg', 'mBed', 'upp', ('dpa_align', 'UPP', 'CLUSTALO')),

    ('reg', 'mBed', 'sparsecore', ('dpa_align', 'MAFFT-SPARSECORE', 'CLUSTALO')),
    ('reg', 'parttree', 'sparsecore', ('dpa_align', 'MAFFT-SPARSECORE', 'MAFFT_PARTTREE')),

    ('reg', 'parttree', 'g-ins-1', ('dpa_align', 'MAFFT-GINSI', 'MAFFT_PARTTREE')),
    ('reg', 'mBed', 'g-ins-1', ('dpa_align', 'MAFFT-GINSI', 'CLUSTALO')),
]

# Resolve every accuracy BEFORE opening the output file, so a missing key
# raises KeyError without truncating an existing CSV.
cca_rows = [['reg_method', 'tree_method', 'align_method', 'accuracy']]
cca_rows.extend(
    [reg_method, tree_method, align_method, f_dic[key]]
    for reg_method, tree_method, align_method, key in cca_entries
)

# newline='' lets the csv writer control line endings itself; without it an
# extra blank line follows every row on platforms that translate '\n'.
with open('results/tables/Constrained_Correspondence_Analysis.csv', 'w', newline='') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=',')
    filewriter.writerows(cca_rows)
