In [253]:
from Bio import SeqIO
import pandas as pd
import time as t
from collections import Counter
import numpy as np

In [254]:
# implementation of avl tree

class Node(object):
    def __init__(self, data, val, res_ind, sp, var):
        self.variant = var
        self.species = sp
        self.data = data
        self.value = val
        self.res_ind = res_ind
        self.parent = None
        self.height = 0
        self.left_child = None
        self.right_child = None


class AVLTree(object):

    def __init__(self):
        self.root = None
            
            
    def insert(self, data, val, res_ind, sp, var):
        if self.root is None:
            self.root = Node(data, val, res_ind, sp, var)
        else:
            new_node = self._insert(self.root, data, val, res_ind, sp, var)
            self._walk_up(new_node)

    def _insert(self, node, data, val, res_ind, sp, var):
        if data <= node.data:
            if not node.left_child:
                node.left_child = Node(data, val, res_ind, sp, var)
                node.left_child.parent = node
                return node.left_child
            return self._insert(node.left_child, data, val, res_ind, sp, var)
        else:
            if not node.right_child:
                node.right_child = Node(data, val, res_ind, sp, var)
                node.right_child.parent = node
                return node.right_child
            return self._insert(node.right_child, data, val, res_ind, sp, var)

    def remove(self, data):
        if not self.root:
            raise ValueError('Tree is empty.')
        self._delete_value(data)

    def _delete_value(self, data):
        node = self.find(data, self.root)
        if node is False:
            raise ValueError('No node with value {}'.format(data))
        parent_of_deleted_node = self._delete_node(node)
        # if parent is None we know that we just deleted the root node,
        # but if root node had a child, that child is now the root node!
        if not parent_of_deleted_node and self.root:
            parent_of_deleted_node = self.root
        self._walk_up(parent_of_deleted_node)

    def find(self, value, node):
        if node is None:
            return False
        if node.data > value:
            return self.find(value, node.left_child)
        elif node.data < value:
            return self.find(value, node.right_child)
        return node

    def _delete_node(self, node):
        parent_node = node.parent
        num_child = self._num_children(node)

        if num_child == 0:
            # If there is no parent then 'node' is the root node.
            # 'node' has no children, so set root to point to None.
            if not parent_node:
                self.root = None
            elif parent_node.left_child == node:
                parent_node.left_child = None
            else:
                parent_node.right_child = None
            return parent_node

        elif num_child == 1:
            if node.left_child:
                child = node.left_child
            else:
                child = node.right_child

            if not parent_node:
                self.root = child
            elif parent_node.left_child == node:
                parent_node.left_child = child
            else:
                parent_node.right_child = child
            child.parent = parent_node
            return parent_node

        else:
            successor = self._max_node(node.left_child)
            node.data = successor.data
            self._delete_node(successor)

    @staticmethod
    def _num_children(node):
        num_children = 0
        if node.left_child:
            num_children += 1
        if node.right_child:
            num_children += 1
        return num_children

    def _walk_up(self, node):
        if not node:
            return
        else:
            self._check_node(node)
            return self._walk_up(node.parent)

    def _check_node(self, node):
        left_height = -1
        right_height = -1
        if node.left_child:
            left_height = node.left_child.height
        if node.right_child:
            right_height = node.right_child.height
        if abs(left_height - right_height) > 1:
            if left_height < right_height:
                self.left_rotate(node, node.right_child)
            else:
                self.right_rotate(node, node.left_child)
        else:
            node.height = max(left_height, right_height) + 1

    def left_rotate(self, node, child_node):
        if child_node.left_child:
            node.right_child = child_node.left_child
            node.right_child.parent = node
        else:
            node.right_child = None

        if node != self.root:
            child_node.parent = node.parent
            if node.parent.right_child == node:
                node.parent.right_child = child_node
            else:
                node.parent.left_child = child_node
        else:
            child_node.parent = None
            # because we are not replacing the parent node('node') with the
            # child node('child_node'), self.root is still pointing at the parent node.
            self.root = child_node

        child_node.left_child = node
        node.parent = child_node

        node.height -= 1

    def right_rotate(self, node, child_node):
        if child_node.right_child:
            node.left_child = child_node.right_child
            node.left_child.parent = node
        else:
            node.left_child = None

        if node != self.root:
            child_node.parent = node.parent
            if node.parent.right_child == node:
                node.parent.right_child = child_node
            else:
                node.parent.left_child = child_node
            # node.parent.left_child = child_node
        else:
            child_node.parent = None
            self.root = child_node

        child_node.right_child = node
        node.parent = child_node

        node.height -= 1

    def _min_node(self, node):
        if node.left_child:
            return self._min_node(node.left_child)
        return node

    def _max_node(self, node):
        if node.right_child:
            return self._max_node(node.right_child)
        return node

    def traverse(self, method='in'):
        if not self.root:
            return iter(())
        if method == 'in':
            return self._traverse_inorder(self.root)
        elif method == 'pre':
            return self._traverse_preorder(self.root)
        elif method == 'post':
            return self._traverse_postorder(self.root)
        else:
            raise ValueError('method must be either "in", "pre" or "post".')

    # left subtree -> root -> right subtree
    def _traverse_inorder(self, node):
        if node.left_child:
            yield from self._traverse_inorder(node.left_child)
        yield node
        if node.right_child:
            yield from self._traverse_inorder(node.right_child)

    # root -> left subtree -> right subtree
    def _traverse_preorder(self, node):
        yield node
        if node.left_child:
            yield from self._traverse_inorder(node.left_child)
        if node.right_child:
            yield from self._traverse_inorder(node.right_child)

    # left subtree -> right subtree -> root
    def _traverse_postorder(self, node):
        if node.left_child:
            yield from self._traverse_inorder(node.left_child)
        if node.right_child:
            yield from self._traverse_inorder(node.right_child)
        yield node

In [255]:
his_table = pd.read_csv('histones.csv')
his_table.head()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
0,XP_010685819.1,H2A,cH2A,cH2A,,731349093,,,3555,Beta vulgaris subsp. vulgaris,Streptophyta,Magnoliopsida,,,,MDSTAGGKAKKGAGGRKGGGPKKKPVSRSVKAGLQFPVGRIGRYLK...
1,NP_563627.1,H3,cenH3,cenH3,,18378832,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MARTKHRVTRSQPRNQTDAAGASSSQAAGPTTTPTRRGGEGGDNTQ...
2,NP_001190852.1,H2A,cH2A,cH2A,,334186954,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKQLGSGAAKKSTSRSSKAGLQFPVGRIARFLKAGKYAERVG...
3,NP_175517.1,H2A,cH2A,cH2A,,15223708,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKTLGSGSAKKATTRSSKAGLQFPVGRIARFLKKGKYAERVG...
4,NP_188703.1,H2A,cH2A,cH2A,,15232330,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,,MAGRGKTLGSGVAKKSTSRSSKAGLQFPVGRIARFLKNGKYATRVG...


In [256]:
def get_species(acc, df):
    request = df.loc[df['accession'] == acc]['organism']
    try:
        species = request.iloc[0]
    except IndexError:
        print('error, skip', acc, request)
        return False
    else:
        return species

a = get_species('NP_188703.1', his_table)
print(a)

Arabidopsis thaliana


In [258]:
def get_variant(acc, df):
    request = df.loc[df['accession'] == acc]['variant']
    try:
        variant = request.iloc[0]
    except IndexError:
        print('error, skip', acc, request)
        return False
    else:
        return variant


a = get_variant('NP_188703.1', his_table)
print(a)

cH2A


In [259]:
# open files aligned by its type

tree = AVLTree()

path = './new_alignments'
h1 = open(path + '/H1.fasta')
h2a = open(path + '/H2A.fasta')
h2b = open(path + '/H2B.fasta')
h3 = open(path + '/H3.fasta')
h4 = open(path + '/H4.fasta')


# add histone alignments from files into the avl_tree. Each H1 histone is assigned with id that gives
#remainder of 0 when devided by 5, H2A histone give remainders of 1, H2B - 2, H3 - 3, H4 - 4.
#After obtaining following id increase by 5

start = t.time()

files = [h1, h2a, h2b, h3, h4]
types = ['H1', 'H2A', 'H2B', 'H3', 'H4']
rests = {'H1': 0, 'H2A': 1, 'H2B': 2, 'H3':  3, 'H4': 4}
id_counter = {'H1': 0, 'H2A': 1, 'H2B': 2, 'H3':  3, 'H4': 4}
his_counter = 0
i = -1


for file in files:
    i += 1
    cur_his_type = types[i]
    for record in SeqIO.parse(file, 'fasta'):
        species = get_species(record.id, his_table)
        variant = get_variant(record.id, his_table)
        if species and variant:
            res_ind = [i for i in range(len(record.seq)) if record.seq[i] != '-']
            his_id = id_counter[cur_his_type]
            id_counter[cur_his_type] += 5
            his_counter += 1
            tree.insert(his_id, record, res_ind, species, variant)
        else:
            continue
        
                
print(t.time() - start)
print(his_counter)
print(id_counter)

error, skip sp|O17536.3|H14_CAEEL Series([], Name: organism, dtype: object)
error, skip sp|O17536.3|H14_CAEEL Series([], Name: variant, dtype: object)
error, skip sp|P23444.2|H1_MAIZE Series([], Name: organism, dtype: object)
error, skip sp|P23444.2|H1_MAIZE Series([], Name: variant, dtype: object)
error, skip sp|P02272.2|H2AV_CHICK Series([], Name: organism, dtype: object)
error, skip sp|P02272.2|H2AV_CHICK Series([], Name: variant, dtype: object)
error, skip sp|Q9LZ45.3|H2B9_ARATH Series([], Name: organism, dtype: object)
error, skip sp|Q9LZ45.3|H2B9_ARATH Series([], Name: variant, dtype: object)
error, skip sp|Q9FFC0.3|H2B10_ARATH Series([], Name: organism, dtype: object)
error, skip sp|Q9FFC0.3|H2B10_ARATH Series([], Name: variant, dtype: object)
error, skip sp|Q9LQQ4.3|H2B1_ARATH Series([], Name: organism, dtype: object)
error, skip sp|Q9LQQ4.3|H2B1_ARATH Series([], Name: variant, dtype: object)
error, skip sp|Q9LZT0.3|H2B7_ARATH Series([], Name: organism, dtype: object)
error, sk

In [260]:
# read contact table and delit letters p and d from histone names

conts = pd.read_csv('../histone_contacts.csv')
conts = conts[(conts['A_entity'] != conts['B_entity'])]

conts['A_entity'] = [his[:-1] for his in conts['A_entity']]
conts['B_entity'] = [his[:-1] for his in conts['B_entity']]
conts

Unnamed: 0.1,Unnamed: 0,A_segid,A_resid,A_resname,B_segid,B_resid,B_resname,num_int,A_entity,B_entity
202,202,A,44,G,B,44,K,2,H3,H4
227,227,A,47,A,B,39,R,5,H3,H4
228,228,A,47,A,B,44,K,1,H3,H4
238,238,A,48,L,B,44,K,2,H3,H4
239,239,A,48,L,G,115,L,2,H3,H2A
...,...,...,...,...,...,...,...,...,...,...
8294,8294,H,118,Y,G,20,R,2,H2B,H2A
8295,8295,H,118,Y,G,49,V,2,H2B,H2A
8317,8317,H,121,A,G,20,R,1,H2B,H2A
8324,8324,H,122,K,G,6,Q,2,H2B,H2A


In [261]:
# search for synchronous replacement of contacting residues in the aligned histone sequences

start = t.time()


res_dict = {key: [] for key in ['species', 'a_resid', 'b_resid', 'a_entity', 'b_entity', 'a_resname',\
                        'b_resname', 'a_new_resname', 'b_new_resname', 'a_variant', 'b_variant',\
                               'a_accsession', 'b_accsession']}

i = -1

for j, row in conts.iterrows():
    a_his, b_his = row[8], row[9]
    a_resi, b_resi = row[2] - 1, row[5] - 1
    a_resid, b_resid = row['A_resid'], row['B_resid']
    a_resname, b_resname = row[3], row[6]
    a_start, b_start = rests[a_his], rests[b_his]
    a_stop, b_stop = id_counter[a_his], id_counter[b_his]
    i += 1

    while a_start < a_stop:
        a_node = tree.find(a_start, tree.root)
        a_start += 5
        a_seq = a_node.value.seq
        try:
            a_new_resname = a_seq[a_node.res_ind[a_resi]]
        except IndexError:
            continue
        else:
            if a_new_resname != a_resname:
                
                while b_start < b_stop:
                    b_node = tree.find(b_start, tree.root)
                    b_start += 5
                    b_seq = b_node.value.seq
                    try:
                        b_new_resname = b_seq[b_node.res_ind[b_resi]]
                    except IndexError:
                        continue
                    else:
                        if b_new_resname != a_new_resname and a_node.species == b_node.species:
                            res_tup = (a_node.species, a_resid, b_resid, a_his, b_his,
                                                 a_resname, b_resname, a_new_resname, b_new_resname,
                                       a_node.variant, b_node.variant, a_node.value.id, b_node.value.id)
                            
                            for item, key in zip(res_tup, res_dict):
                                res_dict[key].append(item)
                                
        
print(t.time() - start)

# scrypt returns species, contacting histones in 1KX5, position and residue of each histone in 1KX5, 
# new residues and replaced hisone variants

0.7746615409851074


In [262]:
res = pd.DataFrame(res_dict).reset_index()
res

Unnamed: 0,index,species,a_resid,b_resid,a_entity,b_entity,a_resname,b_resname,a_new_resname,b_new_resname,a_variant,b_variant,a_accsession,b_accsession
0,0,Drosophila melanogaster,44,44,H3,H4,G,K,T,V,cenH3,cH4,NP_523730.2,NP_001027352.1
1,1,Drosophila melanogaster,47,39,H3,H4,A,R,Q,A,cenH3,cH4,NP_523730.2,NP_001027352.1
2,2,Drosophila melanogaster,47,44,H3,H4,A,K,Q,V,cenH3,cH4,NP_523730.2,NP_001027352.1
3,3,Arabidopsis thaliana,48,44,H3,H4,L,K,T,V,cenH3,cH4,NP_563627.1,NP_180441.1
4,4,Arabidopsis thaliana,48,115,H3,H2A,L,L,T,P,cenH3,H2A.Z,NP_563627.1,NP_193093.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1628,1628,Bos taurus,122,6,H2B,H2A,K,Q,M,H,H2B.W,H2A.L,DAA13058.1,NP_001071426.1
1629,1629,Bos taurus,122,20,H2B,H2A,K,R,M,S,H2B.W,H2A.X,DAA13058.1,NP_001073248.1
1630,1630,Bos taurus,122,20,H2B,H2A,K,R,M,S,H2B.W,cH2A,DAA13058.1,NP_001192525.1
1631,1631,Bos taurus,122,20,H2B,H2A,K,R,M,S,H2B.W,H2A.B,DAA13058.1,NP_001069373.1


In [263]:
res.info()
res['species'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1633 entries, 0 to 1632
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          1633 non-null   int64 
 1   species        1633 non-null   object
 2   a_resid        1633 non-null   int64 
 3   b_resid        1633 non-null   int64 
 4   a_entity       1633 non-null   object
 5   b_entity       1633 non-null   object
 6   a_resname      1633 non-null   object
 7   b_resname      1633 non-null   object
 8   a_new_resname  1633 non-null   object
 9   b_new_resname  1633 non-null   object
 10  a_variant      1633 non-null   object
 11  b_variant      1633 non-null   object
 12  a_accsession   1633 non-null   object
 13  b_accsession   1633 non-null   object
dtypes: int64(3), object(11)
memory usage: 178.7+ KB


array(['Drosophila melanogaster', 'Arabidopsis thaliana',
       'Trypanosoma brucei brucei TREU927',
       'Tetrahymena thermophila SB210', 'Giardia intestinalis',
       'Bos taurus', 'Homo sapiens'], dtype=object)