In [None]:
"""
# reranking code for the result
## get the probability of RNA(nucleotide, codon) mutation
## formula is aX + bY + cZ

|Probability||||
|---|---|---|---|
A to C	1.1309cZ882976023E-05
A to G	3.79592984444666E-05
A to T	2.17601073885478E-05
C to A	4.82364467163978E-05
C to G	1.43123713007997E-05
C to T	0.000615014695634072
G to A	0.000111490944193118
G to C	7.0550851521742E-05
G to T	0.000311342109361459
T to A	7.57183086181891E-06
T to C	5.12463411619656E-05
T to G	6.88348260165355E-06
	
A to C	0.000480370596901429
A to G	0.00161226521002784
A to T	0.000924228464347168
C to A	0.000214773685034829
C to G	6.37261020475316E-05
C to T	0.00273836448419408
G to A	0.000681726831066903
G to C	0.000431392960074668
G to T	0.00190374448013487
T to A	0.00034768052837326
T to C	0.00235311053529112
T to G	0.000316073207612054


total number of cases are 24187
for escape mutants priority rank applying independent prob ways, mean = 1854.3684210526317, std = 1399.3762185951248
"""
from Bio import SeqIO
from Bio.Data import CodonTable
import pandas as pd
cov_p_table = {"AA": 1, "AC": 0.000011309882976023, 
               "AG": 0.0000379592984444666, "AT": 0.0000217601073885478,
                "CA": 0.0000482364467163978, "CC": 1, 
                "CG": 0.0000143123713007997, "CT": 0.000615014695634072, 
                "GA": 0.000111490944193118, "GC": 0.000070550851521742, 
                "GG": 1, "GT": 0.000311342109361459,
                "TA": 0.00000757183086181891, "TC": 0.0000512463411619656, 
                "TG": 0.00000688348260165355, "TT": 1}

h1n1_p_table = {
                "AA": 1, "AC":1.1309882976023E-05,
                "AG":3.79592984444666E-05 , "AT":2.17601073885478E-05,
                "CA":4.82364467163978E-05 , "CC":1,
                "CG":1.43123713007997E-05 , "CT":0.000615014695634072,
                "GA":0.000111490944193118 , "GC":7.0550851521742E-05,
                "GG": 1, "GT":0.000311342109361459,
                "TA":7.57183086181891E-06 , "TC":5.12463411619656E-05,
                "TG":6.88348260165355E-06 , "TT":1
                }

In [3]:
# receive amino_acid tranlate it to 3char codon
def to_codons(ammino_acid):
    std_codon_table = CodonTable.unambiguous_dna_by_id[1]
    codons = [codon for codon, aa in std_codon_table.forward_table.items() \
        if aa == ammino_acid]
    assert len(codons) > 0, f'ammino_acid = {ammino_acid}'
    # print(f'{ammino_acid} is mapped to {codons}')
    return codons

In [11]:
def get_codon_aa_independent_probability(wt_codon, mut_aa, p_table):
    # 3 char for each codon
    def get_codon_probability_independent(wt_codon, mut_codon, p_table):
        p = 1
        for ch1, ch2 in zip(wt_codon, mut_codon):
            # print(f"prob {ch1} to {ch2}")
            p *= p_table[ch1 + ch2]
        # print(f'p = {p}')
        return p


    mut_codons = to_codons(mut_aa)
    prob_list = [get_codon_probability_independent(wt_codon, mut_codon, p_table) for mut_codon in mut_codons]
    # print(prob_list)
    # print(type(prob_list))
    assert len(prob_list) > 0, f"changed from {wt_codon}, \
        to {mut_codons}, mut aa is {mut_aa}"
    return sum(prob_list)

In [12]:
# df = pd.read_csv("wt_codon.csv")
get_codon_aa_independent_probability("GAC", "L", h1n1_p_table)

1.5366580550713715e-09

In [13]:
"""
rerank code
input file: prob table already built
file format: tab based csv like
"""
from enum import Enum
class Virus(Enum):
    COV = 1
    FLU = 2
from enum import Enum
class Virus(Enum):
    COV = 1
    FLU = 2
h1n1_df = pd.read_csv("results/flu/semantics/analyze_semantics_flu_h1_bilstm_512.txt", delimiter='\t')
cov_df = pd.read_csv("results/cov/semantics/analyze_semantics_cov_bilstm_512.txt", delimiter='\t')

In [14]:
cov_wt_codon_df = pd.read_csv("wt_codon.csv")
h1n1_wt_codon_df = pd.read_csv("h1n1_wt_codon.csv")

# fis pos = pFLUos - 1 (make it start from 0, not 1)
h1n1_wt_codon_df['pos'] -= 1

In [15]:
def wt_codon(position, wt_codon_df):
    try:
        codon = wt_codon_df[wt_codon_df['pos'] == position]['Codon'].values[0]
    except IndexError:
        print(f"Error position is {position}")
    return codon

In [16]:
def virus_table(virus_type:Virus):
    return (cov_df, cov_p_table, cov_wt_codon_df) if virus_type == Virus.COV \
        else (h1n1_df, h1n1_p_table, h1n1_wt_codon_df)

In [54]:
def process_viral_result_table(virus_type: Virus):
    rank_df, p_table, wt_codon_df = virus_table(virus_type)

    # get rid of junklike alphabet
    rank_df = rank_df[~rank_df['wt'].str.contains('X|B|Z|J|U', case=False, na=False)]
    rank_df = rank_df[~rank_df['mut'].str.contains('X|B|Z|J|U', case=False, na=False)]
    

    # translate aa to codon
    rank_df['codon'] = rank_df.apply(lambda row: wt_codon(row['pos'], wt_codon_df), axis=1)

    # get wild type codon to mutation probability
    rank_df['codon_prob'] = rank_df.apply(
        lambda row: get_codon_aa_independent_probability(wt_codon(row['pos'], wt_codon_df), 
                                        row['mut'], p_table), axis=1)

    # make rank columns
    rank_df['grammar_rank'] = rank_df['prob'].rank(method='min', ascending=False)
    rank_df['semantic_rank'] = rank_df['change'].rank(method='min', ascending=False)
    rank_df['codon_mut_rank'] = rank_df['codon_prob'].rank(method='min', ascending=False)
    rank_df['rank_sum'] = rank_df['grammar_rank'] + rank_df['semantic_rank'] + rank_df['codon_mut_rank']
    rank_df['total_rank'] = rank_df['rank_sum'].rank(method='min', ascending=True)
    return rank_df

In [55]:
# wt_codon(0, h1n1_wt_codon_df)
df = process_viral_result_table(Virus.COV)
df

Unnamed: 0,pos,wt,mut,prob,change,is_viable,is_escape,codon,codon_prob,grammar_rank,semantic_rank,codon_mut_rank,rank_sum,total_rank
0,0,M,A,2.190679e-06,1389.0,False,False,ATG,1.946235e-09,9919.0,23447.0,12638.0,46004.0,18714.0
2,0,M,C,1.529340e-06,3214.0,False,False,ATG,5.720196e-14,10773.0,12545.0,21897.0,45215.0,18274.0
3,0,M,D,6.757930e-09,1710.0,False,False,ATG,1.097642e-13,20644.0,22586.0,21464.0,64694.0,24017.0
4,0,M,E,1.990447e-07,1568.0,False,False,ATG,2.874534e-10,15302.0,23025.0,17568.0,55895.0,22721.0
5,0,M,F,1.245590e-07,1209.0,False,False,ATG,8.310032e-09,16250.0,23779.0,9655.0,49684.0,20619.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30544,0,M,S,8.232845e-08,769.5,False,False,ATG,3.744430e-09,17084.0,24137.0,11055.0,52276.0,21659.0
30545,0,M,T,2.829840e-05,701.5,False,False,ATG,5.127163e-05,4215.0,24155.0,2403.0,30773.0,8347.0
30547,0,M,V,8.076018e-06,765.0,False,False,ATG,3.797803e-05,6856.0,24138.0,3463.0,34457.0,10971.0
30548,0,M,W,1.012355e-09,1172.0,False,False,ATG,1.497853e-10,22265.0,23830.0,18554.0,64649.0,24015.0


In [56]:
df2 = df[df['is_escape'] == True]
mean, std = df2['total_rank'].mean(), df2['total_rank'].std()
df2
print(f'total number of cases are {len(df)}')
print(f'for escape mutants priority rank applying independent prob ways, mean = {mean}, std = {std}')
df2.to_csv("independent_prob_codon.csv")

total number of cases are 24187
for escape mutants priority rank applying independent prob ways, mean = 1854.3684210526317, std = 1399.3762185951248


In [24]:
import pandas as pd
from Bio.Data import CodonTable
cov_wt_codon_df = pd.read_csv('wt_codon.csv')
flu_wt_codon_df = pd.read_csv('h1n1_wt_codon.csv')
mut_prob_table = pd.read_excel('12276_2021_658_MOESM2_ESM.xlsx', sheet_name=3)
flu_wt_codon_df[flu_wt_codon_df['pos'] == 111]

Unnamed: 0,pos,Codon,1AA
111,111,CTC,L


In [39]:
# target_str format: "XX>XX"
def get_mutation_dependent_probability(target_str):
    if target_str[1] == target_str[3]:
        return 1
    sum = mut_prob_table['SARS-CoV-2'].sum()
    # sum = mut_prob_table['Influenza A'].sum()
    val = mut_prob_table[mut_prob_table['Substition type'] == target_str]\
        ['SARS-CoV-2'].values[0]
    # val = mut_prob_table[mut_prob_table['Substition type'] == target_str]\
    #     ['Influenza A'].values[0]
    # print(f'mutation_probability = {val / sum}')
    return val / sum

In [40]:
def make_string(a, b, c, d):
    return a + b + '>' + c + d

In [41]:
def codon_mutation_dependent_probability(prefix, wt_codon, mut_codon, postfix) -> list:
    # alias
    ms = make_string
    prob_sum = []
    # 0 1 2 
    str1 = ms(prefix, wt_codon[0], mut_codon[0], wt_codon[1])
    str2 = ms(mut_codon[0], wt_codon[1], mut_codon[1], wt_codon[2])
    str3 = ms(mut_codon[1], wt_codon[2], mut_codon[2], postfix)
    prob = 1
    for str in [str1, str2, str3]:
        prob *= get_mutation_dependent_probability(str)
    # print(f'for {str1}, {str2}, {str3}, prob = {prob}')
    if prob != 1:
        prob_sum.append(prob)

    # 0 2 1 
    str1 = ms(prefix, wt_codon[0], mut_codon[0], wt_codon[1])
    str2 = ms(wt_codon[1], wt_codon[2], mut_codon[2], postfix)
    str3 = ms(mut_codon[0], wt_codon[1], mut_codon[1], mut_codon[2])
    prob = 1
    for str in [str1, str2, str3]:
        prob *= get_mutation_dependent_probability(str)
    # print(f'for {str1}, {str2}, {str3}, prob = {prob}')
    if prob != 1:
        prob_sum.append(prob)


    # 1 0 2
    str1 = ms(wt_codon[0], wt_codon[1], mut_codon[1], wt_codon[2])
    str2 = ms(prefix, wt_codon[0], mut_codon[0], mut_codon[1])
    str3 = ms(mut_codon[1], wt_codon[2], mut_codon[2], postfix)
    prob = 1
    for str in [str1, str2, str3]:
        prob *= get_mutation_dependent_probability(str)
    # print(f'for {str1}, {str2}, {str3}, prob = {prob}')
    if prob != 1:
        prob_sum.append(prob)

    # 1 2 0
    str1 = ms(wt_codon[0], wt_codon[1], mut_codon[1], wt_codon[2])
    str2 = ms(mut_codon[1], wt_codon[2], mut_codon[2], postfix)
    str3 = ms(prefix, wt_codon[0], mut_codon[0], mut_codon[1])
    prob = 1
    for str in [str1, str2, str3]:
        prob *= get_mutation_dependent_probability(str)
    # print(f'for {str1}, {str2}, {str3}, prob = {prob}')
    if prob != 1:
        prob_sum.append(prob)
    
    # 2 0 1 
    str1 = ms(wt_codon[1], wt_codon[2], mut_codon[2], postfix)
    str2 = ms(prefix, wt_codon[0], mut_codon[0], wt_codon[1])
    str3 = ms(mut_codon[0], wt_codon[1], mut_codon[1], mut_codon[2])
    prob = 1
    for str in [str1, str2, str3]:
        prob *= get_mutation_dependent_probability(str)
    # print(f'for {str1}, {str2}, {str3}, prob = {prob}')
    if prob != 1:
        prob_sum.append(prob)

    # 2 1 0
    str1 = ms(wt_codon[1], wt_codon[2], mut_codon[2], postfix)
    str3 = ms(wt_codon[0], wt_codon[1], mut_codon[1], mut_codon[2])
    str3 = ms(prefix, wt_codon[0], mut_codon[0], mut_codon[1])
    prob = 1
    for str in [str1, str2, str3]:
        prob *= get_mutation_dependent_probability(str)
    # print(f'for {str1}, {str2}, {str3}, prob = {prob}')
    if prob != 1:
        prob_sum.append(prob)

    # print(f'prob_sum = {prob_sum}')
    return max(prob_sum)

In [42]:
def get_aa_from_position(codon_df, position):
    return codon_df.iloc[position]['1AA']

def get_codon_from_position(codon_df, position):
    return codon_df.iloc[position]['Codon']


# need to be careful boundary exception
def aa_mutation_dependent_probability(codon_df, starting_position, mut_aa):
    assert get_aa_from_position(codon_df, starting_position) != mut_aa,\
        f'starting position = {starting_position}, mut_aa = {mut_aa}'
    mut_codons = to_codons(mut_aa)

    pre = starting_position - 1
    post = starting_position + 1
    prefix = get_codon_from_position(codon_df, pre)[2]
    postfix = get_codon_from_position(codon_df, post)[0]
    wt_codon = get_codon_from_position(codon_df, starting_position)
    # print(prefix, wt_codon, mut_codons, postfix)
    
    probs = [codon_mutation_dependent_probability(prefix, wt_codon, mut_codon, postfix) for mut_codon in mut_codons]
    # print(probs)
    return sum(probs)

In [43]:
val = aa_mutation_dependent_probability(cov_wt_codon_df, 10, 'L')
val

0.0009600207340561401

In [44]:
cov_df = pd.read_csv("results/cov/semantics/analyze_semantics_cov_bilstm_512.txt", delimiter='\t')

flu_df = pd.read_csv("results/flu/semantics/analyze_semantics_flu_h1_bilstm_512.txt", delimiter='\t')
rank_df = cov_df
# get rid of start and last position
max_pos = rank_df['pos'].max()
rank_df = rank_df[(rank_df['pos'] != 0) & (rank_df['pos'] != max_pos)]

In [45]:
# get rid of junklike alphabet
rank_df = rank_df[~rank_df['wt'].str.contains('X|B|Z|J|U', case=False, na=False)]
rank_df = rank_df[~rank_df['mut'].str.contains('X|B|Z|J|U', case=False, na=False)]

In [47]:
# translate aa to codon
rank_df['codon'] = rank_df.apply(lambda row: wt_codon(row['pos'], cov_wt_codon_df), axis=1)
# get wild type codon to mutation probability
rank_df['codon_prob'] = rank_df.apply(
    lambda row: aa_mutation_dependent_probability(cov_wt_codon_df, row['pos'], row['mut']), axis=1)
rank_df

Unnamed: 0,pos,wt,mut,prob,change,is_viable,is_escape,codon,codon_prob
12,1,F,A,4.983475e-05,2056.0,False,False,TTT,5.100784e-06
14,1,F,C,2.952611e-05,2502.0,False,False,TTT,7.081138e-03
15,1,F,D,1.466931e-07,2048.0,False,False,TTT,2.538145e-07
16,1,F,E,5.211668e-06,2256.0,False,False,TTT,9.587636e-10
17,2,V,A,2.152248e-06,1576.0,False,False,GTT,1.436758e-02
...,...,...,...,...,...,...,...,...,...
30532,1,F,S,2.906836e-05,1823.0,False,False,TTT,1.616187e-02
30533,1,F,T,7.235685e-06,1750.0,False,False,TTT,1.158983e-06
30535,1,F,V,7.380600e-04,1897.0,False,False,TTT,1.068411e-03
30536,1,F,W,1.973972e-07,1649.0,False,False,TTT,6.591293e-04


In [48]:
# make rank columns
rank_df['grammar_rank'] = rank_df['prob'].rank(method='min', ascending=False)
rank_df['semantic_rank'] = rank_df['change'].rank(method='min', ascending=False)
rank_df['codon_mut_rank'] = rank_df['codon_prob'].rank(method='min', ascending=False)
rank_df['rank_sum'] = rank_df['grammar_rank'] + rank_df['codon_mut_rank'] + rank_df['semantic_rank'] 
# rank_df['rank_sum'] = rank_df['codon_mut_rank']
rank_df['total_rank'] = rank_df['rank_sum'].rank(method='min', ascending=True)
rank_df
#

Unnamed: 0,pos,wt,mut,prob,change,is_viable,is_escape,codon,codon_prob,grammar_rank,semantic_rank,codon_mut_rank,rank_sum,total_rank
12,1,F,A,4.983475e-05,2056.0,False,False,TTT,5.100784e-06,3212.0,20997.0,16096.0,40305.0,15236.0
14,1,F,C,2.952611e-05,2502.0,False,False,TTT,7.081138e-03,4118.0,18244.0,3884.0,26246.0,5139.0
15,1,F,D,1.466931e-07,2048.0,False,False,TTT,2.538145e-07,15891.0,21111.0,20649.0,57651.0,23127.0
16,1,F,E,5.211668e-06,2256.0,False,False,TTT,9.587636e-10,7827.0,19797.0,23844.0,51468.0,21482.0
17,2,V,A,2.152248e-06,1576.0,False,False,GTT,1.436758e-02,9945.0,23004.0,2076.0,35025.0,11384.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30532,1,F,S,2.906836e-05,1823.0,False,False,TTT,1.616187e-02,4157.0,22169.0,1867.0,28193.0,6387.0
30533,1,F,T,7.235685e-06,1750.0,False,False,TTT,1.158983e-06,7092.0,22442.0,18469.0,48003.0,19951.0
30535,1,F,V,7.380600e-04,1897.0,False,False,TTT,1.068411e-03,521.0,21876.0,7730.0,30127.0,7725.0
30536,1,F,W,1.973972e-07,1649.0,False,False,TTT,6.591293e-04,15296.0,22777.0,8588.0,46661.0,19248.0


|%%--%%| <kRzGntSq0H|8kwNSmwdI0>

# wt_codon(0, h1n1_wt_codon_df)
df = process_viral_result_table(Virus.COV)

In [51]:
df2 = rank_df[rank_df['is_escape'] == True]
mean, std = df2['total_rank'].mean(), df2['total_rank'].std()
print(f'total number of cases are {len(df)}')
print(f'for escape mutants priority rank applying independent prob ways, mean = {mean}, std = {std}')
df2.to_csv("dependent_prob_df.csv")

total number of cases are 24187
for escape mutants priority rank applying independent prob ways, mean = 2182.1052631578946, std = 1672.972500761472
