In [1]:
from Bio import Phylo
import pandas as pd
from Bio.Phylo.TreeConstruction import DistanceCalculator 
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from ete3 import PhyloTree
from ete3 import Tree
from ete3 import NCBITaxa

In [2]:
ncbi = NCBITaxa()

In [3]:
with open('new_alignments/H1.fasta') as h1_file, open('new_alignments/H2A.fasta') as h2a_file,\
open('new_alignments/H2B.fasta') as h2b_file, open('new_alignments/H3.fasta') as h3_file,\
open('new_alignments/H4.fasta') as h4_file:

    h1_al = AlignIO.read(h1_file, 'fasta')
    h2a_al = AlignIO.read(h2a_file, 'fasta')
    h2b_al = AlignIO.read(h2b_file, 'fasta')
    h3_al = AlignIO.read(h3_file, 'fasta')
    h4_al = AlignIO.read(h4_file, 'fasta')
 

al_refs = [h1_al, h2a_al, h2b_al, h3_al, h4_al]
names = ['H1.nwk', 'H2A.nwk', 'H2B.nwk', 'H3.nwk', 'H4.nwk']

In [4]:
for alignment, name in zip(al_refs, names):
    dm = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()
    nj_tree = constructor.nj(dm)
    path = 'trees/' + name
    Phylo.write(nj_tree, path, 'newick')
    tree = PhyloTree(newick='trees/H1.nwk', alignment='new_alignments/H1.fasta', alg_format='fasta',
                     format=3)
    break

Warnning: [57] terminal nodes could not be found in the alignment.


In [6]:
his_df, subs_df = pd.read_csv('histones.csv'), pd.read_csv('subs_df_cleaned.csv')
display(his_df.head())
subs_df.head()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
0,XP_010685819.1,H2A,cH2A,cH2A,,731349093,,,3555,Beta vulgaris subsp. vulgaris,Streptophyta,Magnoliopsida,,,,MDSTAGGKAKKGAGGRKGGGPKKKPVSRSVKAGLQFPVGRIGRYLK...
1,NP_563627.1,H3,cenH3,cenH3,,18378832,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MARTKHRVTRSQPRNQTDAAGASSSQAAGPTTTPTRRGGEGGDNTQ...
2,NP_001190852.1,H2A,cH2A,cH2A,,334186954,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKQLGSGAAKKSTSRSSKAGLQFPVGRIARFLKAGKYAERVG...
3,NP_175517.1,H2A,cH2A,cH2A,,15223708,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKTLGSGSAKKATTRSSKAGLQFPVGRIARFLKKGKYAERVG...
4,NP_188703.1,H2A,cH2A,cH2A,,15232330,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKTLGSGVAKKSTSRSSKAGLQFPVGRIARFLKNGKYATRVG...


Unnamed: 0.1,Unnamed: 0,species,a_resid,b_resid,a_entity,b_entity,a_resname,b_resname,a_new_resname,b_new_resname,a_variant,b_variant,a_accsession,b_accsession,a_column,b_column
0,105,Tetrahymena thermophila SB210,25,73,H4,H3,N,E,S,D,cH4,cH3,XP_001016593.1,XP_001016594.3,27,378
1,106,Tetrahymena thermophila SB210,25,73,H4,H3,N,E,S,D,cH4,H3.3,XP_001016593.1,XP_001008397.1,27,378
2,109,Trypanosoma brucei brucei TREU927,41,105,H4,H3,G,E,A,A,cH4,cH3,XP_951561.1,XP_001218942.1,43,417
3,110,Tetrahymena thermophila SB210,49,121,H4,H3,L,P,F,S,cH4,cenH3,XP_001016593.1,XP_001011273.1,51,433
4,111,Tetrahymena thermophila SB210,49,121,H4,H3,L,P,F,T,cH4,cH3,XP_001016593.1,XP_001016594.3,51,433


In [7]:
tax_ids = set()

for i, row in subs_df.iterrows():
    a_acc, b_acc = row['a_accsession'], row['b_accsession']
    a_tax_id = his_df.loc[his_df['accession']==a_acc]['taxonomy_id']
    b_tax_id = his_df.loc[his_df['accession']==b_acc]['taxonomy_id']
    tax_ids.add(int(a_tax_id)), tax_ids.add(int(b_tax_id))
 
xenopus_id = 8355
tax_ids.add(xenopus_id)
print(tax_ids)

{9986, 9606, 9615, 10141, 8355, 10029, 9646, 13616, 9913, 9785, 7227, 559292, 10181, 9031, 9544, 312017, 185431, 9823, 6239, 508771, 10090, 423536, 3702}


In [10]:
ncbi = NCBITaxa()

tree = ncbi.get_topology(tax_ids)
with open ('pictures_for_article/tax_tree', 'w') as file:
    print(tree.get_ascii(attributes=["sci_name", "rank"]), file=file)

In [20]:
subs_df = pd.read_csv('subs_df_cleaned.csv')
df = pd.read_csv('histones.csv')
df.head()
subs_df.head()

Unnamed: 0.1,Unnamed: 0,species,a_resid,b_resid,a_entity,b_entity,a_resname,b_resname,a_new_resname,b_new_resname,a_variant,b_variant,a_accsession,b_accsession,a_column,b_column
0,105,Tetrahymena thermophila SB210,25,73,H4,H3,N,E,S,D,cH4,cH3,XP_001016593.1,XP_001016594.3,27,378
1,106,Tetrahymena thermophila SB210,25,73,H4,H3,N,E,S,D,cH4,H3.3,XP_001016593.1,XP_001008397.1,27,378
2,109,Trypanosoma brucei brucei TREU927,41,105,H4,H3,G,E,A,A,cH4,cH3,XP_951561.1,XP_001218942.1,43,417
3,110,Tetrahymena thermophila SB210,49,121,H4,H3,L,P,F,S,cH4,cenH3,XP_001016593.1,XP_001011273.1,51,433
4,111,Tetrahymena thermophila SB210,49,121,H4,H3,L,P,F,T,cH4,cH3,XP_001016593.1,XP_001016594.3,51,433


In [17]:
len(df['organism'].unique())

75

In [18]:
len(subs_df['species'].unique())

22

In [45]:
sp_variants = {}
variants = set()

for sp in subs_df['species'].unique():
    req_a = set(subs_df.loc[subs_df['species']==sp]['a_variant'].unique())
    req_b = set(subs_df.loc[subs_df['species']==sp]['b_variant'].unique())
    variants = variants.union(req_a.union(req_b))
    unique_variants = str(req_a.union(req_b)).replace('{', '').replace('}', '').replace(',', '').replace("'", '')
    print(sp, unique_variants)

Tetrahymena thermophila SB210 cH4 cH3 cenH3 H3.3
Trypanosoma brucei brucei TREU927 cH2A cH4 H2A.Z cH3 cH2B
Saccharomyces cerevisiae S288C cH4 H2A.X H3.3 cenH3 H2A.Z cH2B
Homo sapiens cH4 H2A.Z cH3 cH2B
Arabidopsis thaliana cH2A H2A.W H2A.X cenH3 H2A.Z cH2B
Caenorhabditis elegans cH2A cenH3
Drosophila melanogaster cH2A cenH3 H2A.Z cH2B
Perkinsus marinus ATCC 50983 cH2A H2A.X cenH3 H2A.Z
Mus musculus cH2A H2A.P H2A.1 H2A.B H2A.X H2A.L cenH3 subH2B
Macaca mulatta cH2A H2B.W H2A.P H2A.B subH2B
Canis lupus familiaris H2B.W H2A.P H2B.1 H2A.L
Ailuropoda melanoleuca H2B.W H2A.L H2A.P cH2B H2A.B subH2B
Loxodonta africana H2A.B subH2B
Sus scrofa H2A.L H2A.P cH2B H2A.B subH2B
Bos taurus H2B.W H2A.L cH2B H2A.B subH2B H2A.X
Cricetulus griseus cH2A H2A.L H2A.P cH2B H2A.B subH2B
Heterocephalus glaber H2A.P H2A.B subH2B H2A.L
Oryctolagus cuniculus H2B.W H2A.L
Cavia porcellus subH2B H2A.L
Toxoplasma gondii ME49 cH2A H2B.Z H2A.X H2A.Z
Monodelphis domestica cH2A H2A.X subH2B
Gallus gallus cH2B macroH2A


In [46]:
variants

{'H2A.1',
 'H2A.B',
 'H2A.L',
 'H2A.P',
 'H2A.W',
 'H2A.X',
 'H2A.Z',
 'H2B.1',
 'H2B.W',
 'H2B.Z',
 'H3.3',
 'cH2A',
 'cH2B',
 'cH3',
 'cH4',
 'cenH3',
 'macroH2A',
 'subH2B'}