In [1]:
# import packages
import pandas as pd
import gget
import numpy as np
from Bio import SeqIO, AlignIO
from Bio.pairwise2 import align, format_alignment
from Bio.Entrez import efetch
from Bio.Align.Applications import MafftCommandline
from Bio.Align import AlignInfo
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from joblib import Parallel, delayed



In [30]:
# Read the per-node, per-site ancestral state table from the working directory
# and show it (pandas truncates the rendering of long frames on its own).
ancestral_states_list = pd.read_csv(
    filepath_or_buffer='ancestral_states_list.csv',
)
display(ancestral_states_list)

Unnamed: 0,Node,Site,State,A,R,N,D,C,Q,E,...,L,K,M,F,P,S,T,W,Y,V
0,Rodent_Node,1,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
1,Rodent_Node,2,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
2,Rodent_Node,3,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
3,Rodent_Node,4,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
4,Rodent_Node,5,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3870,Primate_Node,771,P,0.04786,0.01654,0.00977,0.00560,0.00474,0.03050,0.00869,...,0.05523,0.00885,0.00441,0.00645,0.62142,0.09058,0.03676,0.00129,0.00351,0.01369
3871,Primate_Node,772,P,0.04786,0.01654,0.00977,0.00560,0.00474,0.03050,0.00869,...,0.05523,0.00885,0.00441,0.00645,0.62142,0.09058,0.03676,0.00129,0.00351,0.01369
3872,Primate_Node,773,Q,0.00982,0.07070,0.00860,0.00933,0.00367,0.63315,0.04700,...,0.02786,0.04794,0.00334,0.00344,0.03345,0.01727,0.00865,0.00360,0.00559,0.00657
3873,Primate_Node,774,A,0.51646,0.00970,0.01152,0.01463,0.00543,0.00726,0.02257,...,0.02194,0.00930,0.00994,0.00561,0.03881,0.07952,0.10361,0.00110,0.00252,0.07444


In [34]:
# Keep only the rows that belong to the 'Rodent_Hamster_Node' node; the
# original row labels are retained, so the index starts where that node
# begins in the full table.
is_rodent_hamster = ancestral_states_list['Node'] == 'Rodent_Hamster_Node'
Rodent_Hamster_Node = ancestral_states_list.loc[is_rodent_hamster]
display(Rodent_Hamster_Node)

Unnamed: 0,Node,Site,State,A,R,N,D,C,Q,E,...,L,K,M,F,P,S,T,W,Y,V
775,Rodent_Hamster_Node,1,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
776,Rodent_Hamster_Node,2,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
777,Rodent_Hamster_Node,3,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
778,Rodent_Hamster_Node,4,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
779,Rodent_Hamster_Node,5,-,0.06800,0.05550,0.03629,0.04687,0.02143,0.05028,0.06893,...,0.10119,0.06004,0.01966,0.03624,0.05515,0.09686,0.05714,0.01178,0.02473,0.06622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,Rodent_Hamster_Node,771,P,0.05849,0.02341,0.01544,0.00974,0.00749,0.03753,0.01456,...,0.06961,0.01501,0.00714,0.01063,0.49738,0.10623,0.04624,0.00223,0.00601,0.02247
1546,Rodent_Hamster_Node,772,P,0.05849,0.02341,0.01544,0.00974,0.00749,0.03753,0.01456,...,0.06961,0.01501,0.00714,0.01063,0.49738,0.10623,0.04624,0.00223,0.00601,0.02247
1547,Rodent_Hamster_Node,773,Q,0.01605,0.08325,0.01338,0.01556,0.00622,0.51161,0.05866,...,0.03784,0.06212,0.00506,0.00631,0.04116,0.02781,0.01414,0.00520,0.00901,0.01153
1548,Rodent_Hamster_Node,774,A,0.39776,0.01550,0.01741,0.02017,0.00800,0.01187,0.02996,...,0.03548,0.01516,0.01366,0.00934,0.04744,0.09474,0.10917,0.00192,0.00449,0.08309


In [38]:
# create a function to extract the ancestral sequence for each node
def Node_Refinement(df, node, threshold):
    """Build the sequence for one node from its confidently called sites.

    Rows of ``df`` whose ``Node`` equals ``node`` are walked in their existing
    order. A row contributes its ``State`` character to the result unless the
    state is the gap symbol ``'-'`` or the value stored in the column named
    after that state is below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Node`` column, a ``State`` column and one column per
        possible state holding that state's score for the row (presumably a
        posterior probability -- confirm against whatever produced the file).
        ``df`` is not modified.
    node : str
        Value of ``Node`` selecting the rows to use.
    threshold : float
        Rows whose own-state score is below this value are dropped.

    Returns
    -------
    str
        Concatenation of the kept states; empty when ``node`` matches no row.

    Raises
    ------
    KeyError
        If a non-gap state has no column of the same name in ``df``.

    Notes
    -----
    Prints the length of the resulting sequence as a side effect.
    """
    # Select with .loc and renumber through a non-inplace reset_index: this
    # yields an independent frame instead of mutating a filtered slice in place.
    node_df = df.loc[df['Node'] == node].reset_index(drop=True)

    kept_states = []
    for row, state in enumerate(node_df['State']):
        # gaps never contribute to the sequence
        if state == '-':
            continue
        if state not in node_df.columns:
            raise KeyError(
                f"State {state!r} at row {row} of node {node!r} has no matching score column"
            )
        # same comparison as before: only a score strictly below the threshold rejects the site
        if node_df.at[row, state] < threshold:
            continue
        kept_states.append(state)

    # join once rather than growing a string inside the loop
    sequence = ''.join(kept_states)

    print('Length of ancestral sequence: ',len(sequence))

    return sequence

In [28]:
# Rodent_Node: drop gaps and sites whose called state scores below 0.9
Node_Refinement(df=ancestral_states_list, node='Rodent_Node', threshold=0.9)

Length of ancestral sequence:  212


'SPEIYKTVSAWKRQPVRVLSLFGNIDKELKSLGFLESGSGSEGGTLKYVEDVTNVVRRDVEKWGPFDLVYGSTQPLGYSCDRCPGWYMFQFHRILQYARPRQSQQPFFWIFVDNLLLTEDDQVTTVRFLQTEAVTLQDVRGRVLQNAVRVWSNIPGLKLSKHALTPKEEQSLQAQVRTRSKLAAQKVDPLVKNCLLPLREYFKYFSQNSLPL'

In [40]:
# Rodent_Hamster_Node: drop gaps and sites whose called state scores below 0.9
Node_Refinement(df=ancestral_states_list, node='Rodent_Hamster_Node', threshold=0.9)

Length of ancestral sequence:  213


'SPLEMYKTVSAWKRQPMRVLSLFGNIDKELKSLGFLESGSGSEEGRLKYLEDVTNVVRRDVEWGPFDLVYGSTQPLGYSCDCPGWYMFQFHRILQYARPHPGSQQPFFWIFVDNLLLTEDDQVTAARFQGEAVTLQDVRGRVLQNAVRVWSNIPGLKSLSKHALTPKEEQSLAQVRTRAKLAAQKVDPLVKNCLLPLREYFKYFSQNSLPLYK'

In [53]:
# Rodent_Chinchilla_Node: drop gaps and sites whose called state scores below 0.9565
Node_Refinement(df=ancestral_states_list, node='Rodent_Chinchilla_Node', threshold=0.9565)

Length of ancestral sequence:  213


'LLWLYESPLEMYKTVPWKREPRVLSLFGDIKKLELTSLGFLEGSGRLKHLDDVTDVVRRDVEEWGPFDLVYGSTPPLGHCDSPGWYLFQFHRLLQYARPPGSPQPFFWMFVDNLLLTDDQATATRFLPGEGPEPVTLQPDVRGRVLQNAVRVWSNIPAVKSSHALAPEEELSLLAQQRALQGPALVKNCWFLPLREYFKYFSQTNSLPLYKRR'

In [74]:
# Hs_Mm_LCA_Node: drop gaps and sites whose called state scores below 0.997
Node_Refinement(df=ancestral_states_list, node='Hs_Mm_LCA_Node', threshold=0.997)

Length of ancestral sequence:  228


'CFLCLPFRGLLQRKWRLKFYDREESPLEMYKTVPVWKREPVRVLSLFGDIKKELTSLGFLEGSDPGRLKHLDDVTDVVRRDVEEWGPFDLVYGSTPPLGHACDHPPGWYLFQFHRLQYARPRPGSQPFFWMFVDNLVLTDDRATRFLGEGPTPVTIQDCGRQNAVVWSNIPAVSRHSALSEELSLLAQDRQRAKPQGPALVKNCWFLPLREYFKYFSTELTSSLPLYK'

In [76]:
# Primate_Node: drop gaps and sites whose called state scores below 0.99
Node_Refinement(df=ancestral_states_list, node='Primate_Node', threshold=0.99)

Length of ancestral sequence:  236


'CFLCLPFRSGLLQRRKWRLKAFYDREESPLEMYKTVPVWKREPVRVLSLFGDIKKELTSLGFLEGSPGRLKHLDDVTDVVRRDVEEWGPFDLVYGSTPPLGHACDHPPGWYLFQFHRLLQYARPRPGSQPFFWMFVDNLVLTDRAATRFLGEGPTPVTIQDVCGRAQNAVVWSNIPAVKPSHSALEEELSLLAQDRQRAKPAQGGPALVKNCWFLPLREYFKYFSTELTSSLPLYK'