In [24]:
from Bio import Phylo
import pandas as pd
from Bio.Phylo.TreeConstruction import DistanceCalculator 
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from ete3 import PhyloTree
from ete3 import Tree
from ete3 import NCBITaxa

In [25]:
# Handle to ete3's NCBI taxonomy interface.
# NOTE(review): per the ete3 docs this is backed by a local taxonomy database that
# may be downloaded on first use — confirm it is available before running offline.
ncbi = NCBITaxa()

In [26]:
# Read each histone multiple-sequence alignment (FASTA) into its own variable,
# opening one file at a time.
with open('new_alignments/H1.fasta') as h1_file:
    h1_al = AlignIO.read(h1_file, 'fasta')

with open('new_alignments/H2A.fasta') as h2a_file:
    h2a_al = AlignIO.read(h2a_file, 'fasta')

with open('new_alignments/H2B.fasta') as h2b_file:
    h2b_al = AlignIO.read(h2b_file, 'fasta')

with open('new_alignments/H3.fasta') as h3_file:
    h3_al = AlignIO.read(h3_file, 'fasta')

with open('new_alignments/H4.fasta') as h4_file:
    h4_al = AlignIO.read(h4_file, 'fasta')

# Parallel lists: the alignments and the Newick file name associated with each one.
al_refs = [h1_al, h2a_al, h2b_al, h3_al, h4_al]
names = ['H1.nwk', 'H2A.nwk', 'H2B.nwk', 'H3.nwk', 'H4.nwk']

In [27]:
# Build a neighbor-joining tree for every histone alignment, save it as Newick under
# trees/, and reload it as an ete3 PhyloTree linked to the alignment it came from.
# The calculator and constructor are created once and reused for every alignment.
calculator = DistanceCalculator('identity')
constructor = DistanceTreeConstructor()
phylo_trees = {}  # Newick file name -> PhyloTree linked to its alignment

for alignment, name in zip(al_refs, names):
    dm = calculator.get_distance(alignment)
    nj_tree = constructor.nj(dm)
    path = 'trees/' + name
    Phylo.write(nj_tree, path, 'newick')

    # Derive the matching alignment file from the tree file name ('H1.nwk' -> 'H1.fasta')
    # so each tree is linked to its own alignment.
    alignment_path = 'new_alignments/' + name.rsplit('.', 1)[0] + '.fasta'

    # NOTE(review): ete3 may warn that terminal nodes could not be found in the
    # alignment; presumably the leaf names written by Biopython differ from the FASTA
    # header names ete3 parses — confirm and normalise the headers if so.
    phylo_trees[name] = PhyloTree(newick=path, alignment=alignment_path, alg_format='fasta',
                                  format=3)

# `tree` is bound to the H1 tree so anything that reads it after this cell sees H1.
tree = phylo_trees['H1.nwk']

Warnning: [49] terminal nodes could not be found in the alignment.


In [28]:
# Load the histone table and the substitution-pair table.
his_df = pd.read_csv('histones.csv')
subs_df = pd.read_csv('subs_df.csv')

# Preview both frames: the first explicitly, the second as the cell's result.
display(his_df.head())
subs_df.head()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
0,XP_010685819.1,H2A,cH2A,cH2A,,731349093,,,3555,Beta vulgaris subsp. vulgaris,Streptophyta,Magnoliopsida,,,,MDSTAGGKAKKGAGGRKGGGPKKKPVSRSVKAGLQFPVGRIGRYLK...
1,NP_563627.1,H3,cenH3,cenH3,,18378832,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MARTKHRVTRSQPRNQTDAAGASSSQAAGPTTTPTRRGGEGGDNTQ...
2,NP_001190852.1,H2A,cH2A,cH2A,,334186954,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKQLGSGAAKKSTSRSSKAGLQFPVGRIARFLKAGKYAERVG...
3,NP_175517.1,H2A,cH2A,cH2A,,15223708,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKTLGSGSAKKATTRSSKAGLQFPVGRIARFLKKGKYAERVG...
4,NP_188703.1,H2A,cH2A,cH2A,,15232330,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKTLGSGVAKKSTSRSSKAGLQFPVGRIARFLKNGKYATRVG...


Unnamed: 0.1,Unnamed: 0,index,species,a_resid,b_resid,a_entity,b_entity,a_resname,b_resname,a_new_resname,b_new_resname,a_variant,b_variant,a_accsession,b_accsession,a_column,b_column
0,0,0,Drosophila melanogaster,48,117,H3,H2A,L,P,N,-,cenH3,H2A.Z,NP_523730.2,NP_524519.1,181,225
1,1,1,Drosophila melanogaster,58,104,H3,H2A,T,Q,P,G,cenH3,H2A.Z,NP_523730.2,NP_524519.1,200,190
2,2,4,Tetrahymena thermophila SB210,73,25,H3,H4,E,N,D,S,H3.3,cH4,XP_001008397.1,XP_001016593.1,215,26
3,3,5,Trypanosoma brucei brucei TREU927,74,62,H3,H4,I,L,V,V,cH3,cH4,XP_001218942.1,XP_951561.1,216,63
4,4,6,Trypanosoma brucei brucei TREU927,74,66,H3,H4,I,I,V,V,cH3,cH4,XP_001218942.1,XP_951561.1,216,67


In [29]:
# Collect the taxonomy id of every histone accession referenced by subs_df
# (both the 'a' and the 'b' partner of each row).
pair_accessions = pd.concat([subs_df['a_accsession'], subs_df['b_accsession']]).unique()

# Look all accessions up in a single vectorised pass instead of row by row.
acc_tax = his_df.loc[his_df['accession'].isin(pair_accessions),
                     ['accession', 'taxonomy_id']].drop_duplicates()

# Fail loudly, naming the offending accessions, when the lookup is not one-to-one.
missing = set(pair_accessions) - set(acc_tax['accession'])
if missing:
    raise KeyError('accessions from subs_df not found in his_df: {}'.format(missing))

ambiguous = acc_tax.loc[acc_tax['accession'].duplicated(), 'accession'].tolist()
if ambiguous:
    raise ValueError('accessions with more than one taxonomy_id in his_df: {}'.format(ambiguous))

# Plain Python ints, so the set prints cleanly and can be passed on as-is.
tax_ids = {int(tax_id) for tax_id in acc_tax['taxonomy_id']}

# This id is added explicitly; it is not derived from subs_df.
# NOTE(review): presumably Xenopus is wanted as a reference species — confirm.
xenopus_id = 8355
tax_ids.add(xenopus_id)
print(tax_ids)

{508771, 8355, 9606, 9031, 9544, 10090, 10029, 9646, 312017, 3702, 185431, 9913, 7227, 559292, 10141}


In [30]:
# Build the NCBI taxonomy topology for the collected taxonomy ids and print it as
# ASCII art, labelling every node with its scientific name and rank.
ncbi = NCBITaxa()

tree = ncbi.get_topology(tax_ids)
ascii_art = tree.get_ascii(attributes=["sci_name", "rank"])
print(ascii_art)


                                                                                                                                                                      /-Cricetulus griseus, species
                                                                                                                                                       /Muroidea, clade
                                                                                                                                        /Rodentia, order              \-Mus musculus, species
                                                                                                                                       |              |
                                                                                                            /Euarchontoglires, superorder              \-Cavia porcellus, species
                                                                                                           |       