## Tables

The tables generated below are used both as guide trees in the progressive
alignment procedure and as trees used in the clustering step
of the regressive alignment procedure.


The generated tables are written to the `results/tables/` directory of this repository.



In [1]:
import os
# Remember the directory the kernel started in, then make its parent the
# working directory so the relative paths used below resolve from there.
pwd = os.getcwd()
work_dir = "".join((pwd, "/.."))
os.chdir(work_dir)
os.getcwd()

'/nfs/users2/cn/efloden/projects/dpa-analysis'

In [3]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

import numpy as np
import pandas as pd
import os
import csv

1. Create a dictionary mapping each family to the number of sequences in its dataset.


In [4]:
with open("data/num_seqs.csv", mode='r') as infile:
    reader = csv.reader(infile, delimiter='\t')
    # First column becomes the key, second column the value (both kept as text)
    sizes_dict = dict((rows[0], rows[1]) for rows in reader)

In [5]:
# Display the directory captured in `pwd` before the os.chdir call in the first cell
pwd

'/nfs/users2/cn/efloden/projects/dpa-analysis/notebook'

In [6]:
# Function to read in the directory of scores to a dictionary
def scores_to_dict(scores_dir, scores_dict, tag):
    """Load every score file found in ``scores_dir`` into ``scores_dict``.

    Each file name must consist of exactly six dot-separated fields,
    ``family.align_type.bucket.aligner.tree.score_type``.  The file content,
    with trailing whitespace stripped, is stored as a string under the key
    ``(tag, align_type, aligner, tree, family, score_type)``; the ``bucket``
    field is not part of the key.

    Parameters
    ----------
    scores_dir : str
        Directory containing the score files, with or without a trailing
        path separator.
    scores_dict : dict
        Dictionary to fill; it is modified in place.
    tag : str
        Label stored as the first element of every key.

    Returns
    -------
    dict
        The same ``scores_dict`` object that was passed in.

    Raises
    ------
    ValueError
        If a file name does not split into exactly six fields.
    """
    for score_file in os.listdir(scores_dir):
        family, align_type, _bucket, aligner, tree, score_type = score_file.split('.')
        key = (tag, align_type, aligner, tree, family, score_type)
        # os.path.join works whether or not scores_dir ends with a separator
        with open(os.path.join(scores_dir, score_file), 'r') as infile:
            scores_dict[key] = infile.read().rstrip()
    return scores_dict

In [7]:
# Locations of the per-file scores for the full and the reference datasets
full_scores_dir="results/individual_scores/"
ref_scores_dir="results_reference/individual_scores/"

# Collect both sets of scores in one dictionary; the tag ("full" / "ref")
# passed to scores_to_dict records which directory an entry came from.
scores_dict = {}
for run_dir, run_tag in ((full_scores_dir, "full"), (ref_scores_dir, "ref")):
    scores_dict = scores_to_dict(run_dir, scores_dict, run_tag)

print(scores_dict)

{('full', 'dpa_align', 'UPP', 'MAFFT_PARTTREE', 'trfl', 'sp'): '43.1', ('full', 'dpa_align', 'MAFFT-GINSI', 'MAFFT_PARTTREE', 'icd', 'col'): '85.2', ('full', 'default_align', 'UPP', 'DEFAULT', 'kunitz', 'sp'): '92.2', ('full', 'dpa_align', 'MAFFT-GINSI', 'MAFFT_PARTTREE', 'gpdh', 'sp'): '95.0', ('full', 'dpa_align', 'MAFFT-SPARSECORE', 'MAFFT_PARTTREE', 'profilin', 'tc'): '80.5', ('full', 'dpa_align', 'UPP', 'CLUSTALO', 'mofe', 'tc'): '70.6', ('full', 'dpa_align', 'UPP', 'MAFFT_PARTTREE', 'Ald_Xan_dh_2', 'tc'): '17.7', ('full', 'std_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE', 'egf', 'sp'): '61.3', ('full', 'dpa_align', 'MAFFT-FFTNS1', 'MAFFT-FFTNS1', 'cys', 'sp'): '76.8', ('full', 'std_align', 'CLUSTALO', 'MAFFT_PARTTREE', 'cys', 'col'): '37.2', ('full', 'dpa_align', 'MAFFT-SPARSECORE', 'MAFFT-FFTNS1', 'cys', 'sp'): '88.0', ('full', 'std_align', 'MAFFT-FFTNS1', 'MAFFT-FFTNS1', 'scorptoxin', 'tc'): '76.7', ('full', 'std_align', 'MAFFT-FFTNS1', 'MAFFT-FFTNS1', 'ghf1', 'tc'): '60.0', ('full

In [16]:
# Dictionary of average scores
full_top20_tc_avg_dict={}
ref_top20_tc_avg_dict={}

# Take all datasets and create a dictionary of average scores
datasets=set([k[1]+'/'+k[2]+'/'+k[3] for k,v in scores_dict.items() ])


def _large_family_tc_average(tag, key, min_seqs=10000, n_families=20):
    """Average the 'tc' scores of one method combination over the large families.

    Parameters
    ----------
    tag : str
        First element of the ``scores_dict`` keys to select ('full' or 'ref').
    key : sequence of str
        ``[align_type, aligner, tree]`` identifying the method combination.
    min_seqs : int
        Only families whose ``sizes_dict`` entry is strictly greater than this
        value are included.
    n_families : int
        Number of scores that must be present for the average to be reported.

    Returns
    -------
    float or None
        The mean score, or ``None`` unless exactly ``n_families`` scores matched.
    """
    tc_scores = [float(v) for k, v in scores_dict.items()
                 if k[0] == tag and k[1] == key[0] and k[2] == key[1] and k[3] == key[2]
                 and int(sizes_dict[k[4]]) > min_seqs and k[5] == 'tc']
    if len(tc_scores) != n_families:
        return None
    return sum(tc_scores) / float(len(tc_scores))


# Calculate average for each combination on the full datasets
for dataset in datasets:
    key = dataset.split("/")
    avg = _large_family_tc_average('full', key)
    if avg is not None:
        print("full: " + dataset)
        full_top20_tc_avg_dict[tuple(key)]=avg

# Calculate average for each combination on the reference datasets
for dataset in datasets:
    key = dataset.split("/")
    avg = _large_family_tc_average('ref', key)
    if avg is not None:
        print(key)
        ref_top20_tc_avg_dict[tuple(key)]=avg

full: std_align/CLUSTALO/MAFFT-FFTNS1
full: std_align/CLUSTALO/CLUSTALO
full: std_align/CLUSTALO/MAFFT_PARTTREE
full: dpa_align/MAFFT-FFTNS1/MAFFT_PARTTREE
full: default_align/UPP/DEFAULT
full: dpa_align/CLUSTALO/MAFFT_PARTTREE
full: dpa_align/CLUSTALO/MAFFT-FFTNS1
full: dpa_align/MAFFT-FFTNS1/MAFFT-FFTNS1
full: dpa_align/MAFFT-FFTNS1/CLUSTALO
full: dpa_align/CLUSTALO/CLUSTALO
full: std_align/MAFFT-FFTNS1/CLUSTALO
full: std_align/MAFFT-FFTNS1/MAFFT_PARTTREE
full: std_align/MAFFT-FFTNS1/MAFFT-FFTNS1
['std_align', 'CLUSTALO', 'MAFFT-FFTNS1']
['default_align', 'MAFFT-SPARSECORE', 'DEFAULT']
['dpa_align', 'MAFFT-SPARSECORE', 'CLUSTALO']
['dpa_align', 'MAFFT-GINSI', 'CLUSTALO']
['std_align', 'CLUSTALO', 'CLUSTALO']
['dpa_align', 'MAFFT-SPARSECORE', 'MAFFT_PARTTREE']
['std_align', 'CLUSTALO', 'MAFFT_PARTTREE']
['std_align', 'MAFFT-GINSI', 'MAFFT_PARTTREE']
['dpa_align', 'UPP', 'MAFFT_PARTTREE']
['dpa_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE']
['default_align', 'UPP', 'DEFAULT']
['dpa_align', 

In [12]:
import csv
with open('results/tables/table1.csv', 'w') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=',')

    # Three stacked header rows
    filewriter.writerow(['','','Total Column Score (%)','','','','','CPU',''])

    filewriter.writerow(['Tree method','Alignment method','Non Regressive','','Regressive','','Reference','Non Regressive','Regressive'])

    filewriter.writerow(['','','Score','Relative Score','Score','Relative Score','Score','',''])

    f_dic, r_dic = full_top20_tc_avg_dict, ref_top20_tc_avg_dict

    # (tree label, aligner label, aligner key, tree key).  The labels written
    # to the first two columns now match the tree/aligner actually looked up;
    # previously every row was labelled 'parttree','fftns1'.
    table1_rows = [
        ('parttree', 'fftns1',   'MAFFT-FFTNS1', 'MAFFT_PARTTREE'),
        ('clustalo', 'fftns1',   'MAFFT-FFTNS1', 'CLUSTALO'),
        ('parttree', 'clustalo', 'CLUSTALO',     'MAFFT_PARTTREE'),
        ('clustalo', 'clustalo', 'CLUSTALO',     'CLUSTALO'),
    ]
    for tree_label, aligner_label, aligner, tree in table1_rows:
        std_score = f_dic['std_align', aligner, tree]
        dpa_score = f_dic['dpa_align', aligner, tree]
        filewriter.writerow([
            tree_label,
            aligner_label,
            std_score,
            # relative score = full-dataset average divided by reference average
            float(std_score / r_dic['std_align', aligner, tree]),
            dpa_score,
            float(dpa_score / r_dic['dpa_align', aligner, tree]),
            r_dic['dpa_align', aligner, tree],
            'CPU1',  # placeholder text, no CPU figures are computed here
            'CPU2',
        ])

    # Empty placeholder row for the averages
    filewriter.writerow(['Average','','','','','','','',''])
    
    

In [None]:
import csv

# Write one CSV per score type.  Every row holds the first five key fields of
# a scores_dict entry, the sizes_dict value for its family, and the score.
for table_path, wanted_type in (('results/tables/full_table_sp.csv', 'sp'),
                                ('results/tables/full_table_tc.csv', 'tc'),
                                ('results/tables/full_table_col.csv', 'col')):
    with open(table_path, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for e, score in scores_dict.items():
            if e[5] != wanted_type:
                continue
            filewriter.writerow(list(e[:5]) + [sizes_dict[e[4]], score])

In [32]:
# Table for Ionas figure
import csv
with open('results/tables/Constrained_Correspondence_Analysis.csv', 'w') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=',')

    f_dic = full_top20_tc_avg_dict

    filewriter.writerow(['reg_method','tree_method','align_method','accuracy'])

    # (reg_method, tree_method, align_method, key into f_dic).
    # A key of None writes an empty accuracy cell; the lookup that was left
    # disabled for that row is kept in the trailing comment.
    cca_rows = [
        ('pro', 'parttree', 'fftns1',     ('std_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE')),
        ('pro', 'clustalo', 'fftns1',     ('std_align', 'MAFFT-FFTNS1', 'CLUSTALO')),
        # tree_method was misspelled 'partree' here, creating a spurious category
        ('pro', 'parttree', 'clustalo',   ('std_align', 'CLUSTALO', 'MAFFT_PARTTREE')),
        ('pro', 'clustalo', 'clustalo',   ('std_align', 'CLUSTALO', 'CLUSTALO')),

        ('pro', 'default',  'upp',        ('default_align', 'UPP', 'DEFAULT')),
        ('pro', 'default',  'sparsecore', None),  # f_dic['default_align','MAFFT-SPARSECORE','DEFAULT']

        ('reg', 'parttree', 'fftns1',     ('dpa_align', 'MAFFT-FFTNS1', 'MAFFT_PARTTREE')),
        ('reg', 'clustalo', 'fftns1',     ('dpa_align', 'MAFFT-FFTNS1', 'CLUSTALO')),
        ('reg', 'parttree', 'clustalo',   ('dpa_align', 'CLUSTALO', 'MAFFT_PARTTREE')),
        ('reg', 'clustalo', 'clustalo',   ('dpa_align', 'CLUSTALO', 'CLUSTALO')),

        ('reg', 'clustalo', 'upp',        None),  # f_dic['dpa_align','UPP','CLUSTALO']
        ('reg', 'clustalo', 'sparsecore', None),  # f_dic['dpa_align','MAFFT-SPARSECORE','CLUSTALO']

        ('reg', 'parttree', 'ginsi',      None),  # f_dic['dpa_align','MAFFT-GINSI','MAFFT_PARTTREE']
        ('reg', 'clustalo', 'ginsi',      None),  # f_dic['dpa_align','MAFFT-GINSI','CLUSTALO']
    ]
    for reg_method, tree_method, align_method, avg_key in cca_rows:
        accuracy = '' if avg_key is None else f_dic[avg_key]
        filewriter.writerow([reg_method, tree_method, align_method, accuracy])


In [33]:
!cat Constrained_Correspondence_Analysis.csv

reg_method,tree_method,align_method,accuracy
pro,parttree,fftns1,29.635
pro,clustalo,fftns1,41.325
pro,partree,clustalo,26.939999999999998
pro,clustalo,clustalo,39.029999999999994
pro,default,upp,44.78
pro,default,sparsecore,
reg,parttree,fftns1,35.16000000000001
reg,clustalo,fftns1,37.94
reg,parttree,clustalo,42.209999999999994
reg,clustalo,clustalo,41.910000000000004
reg,clustalo,upp,
reg,clustalo,sparsecore,
reg,parttree,ginsi,
reg,clustalo,ginsi,
