In [1]:
# Switch the kernel's working directory to the SSMuLA checkout in the home directory.
%cd ~/SSMuLA/

/disk2/fli/SSMuLA


In [4]:
import numpy as np
import pandas as pd

In [56]:
def read_res_pred(fin):
    """Read a per-residue prediction CSV and split its mutation labels.

    The file must provide a ``mut`` column whose entries look like ``"V39A"``:
    one wild-type letter, an integer position, one mutant letter.

    Args:
        fin: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        The loaded DataFrame with three columns appended:
        ``pos`` (int, the digits between first and last character),
        ``mut_aa`` (last character of the label) and
        ``wt_pos`` (label without its last character, e.g. ``"V39"``).
    """
    preds = pd.read_csv(fin)
    mut_labels = preds["mut"]

    # Original note on this column: "m1_indexed" (presumably numbering that
    # starts at M1 of the sequence -- confirm against the input file).
    preds["pos"] = mut_labels.str.slice(1, -1).astype(int)
    preds["mut_aa"] = mut_labels.str.get(-1).astype(str)
    preds["wt_pos"] = mut_labels.str.slice(0, -1).astype(str)
    return preds

In [5]:
def get_norm_probability_df(df, mutant_score, pos, t=1):
    """Return the site-normalised probability of one mutant score.

    Every score at position ``pos`` is turned into a log-weight
    ``-|score| / t``; the weight of ``mutant_score`` is then divided by the
    sum of the weights of all rows of ``df`` at that position.

    Args:
        df: DataFrame with ``pos`` and ``mean_x`` columns.
        mutant_score: Scalar score of the mutant of interest.
        pos: Position whose rows form the normalisation set.
        t: Temperature dividing the log-weights (default 1).

    Returns:
        Scalar weight of ``mutant_score`` relative to the summed weights
        at ``pos``.
    """
    site_scores = df[df["pos"] == pos]["mean_x"]

    site_log_w = -np.abs(site_scores) / t  # one entry per row at this site
    mut_log_w = -np.abs(mutant_score) / t  # scalar

    # Shift everything by the largest log-weight at the site before
    # exponentiating, so the exponentials do not underflow together.
    shift = max(site_log_w)
    site_w = np.exp(site_log_w - shift)
    mut_w = np.exp(mut_log_w - shift)

    # Divide by the site total so the weights at a site sum to one.
    return mut_w / np.sum(site_w)


In [6]:
def add_p_col_df_gvp_log(df_gvp_pred, t=1):
    """Add site-normalised probability and log-probability columns in place.

    For every row the log-weight is ``-|mean_x| / t``. Weights are normalised
    over all rows sharing the same ``pos``, which is the same quantity
    ``get_norm_probability_df`` yields for a single row, but computed for the
    whole frame with two group-wise reductions instead of re-filtering the
    frame once per row.

    The log-probability is computed directly in log space
    (``log_w - max - log(sum(exp(log_w - max)))``) rather than as
    ``log(p)``, so it stays finite even when ``p`` itself underflows to 0
    (e.g. small ``t`` with a strongly disfavoured residue).

    Args:
        df_gvp_pred: DataFrame with ``pos`` and ``mean_x`` columns; modified
            in place.
        t: Temperature dividing the log-weights (default 1). It is also
            embedded in the new column names.

    Returns:
        The same DataFrame object, with ``p_t{t}`` and ``log_p_t{t}`` added.
    """
    # Group by raw position values (positional, so a non-unique index is fine).
    site_keys = df_gvp_pred["pos"].to_numpy()

    log_w = -df_gvp_pred["mean_x"].abs() / t

    # Subtract the per-site maximum before exponentiating to avoid underflow
    # of the whole site.
    shifted = log_w - log_w.groupby(site_keys).transform("max")
    weights = np.exp(shifted)
    site_total = weights.groupby(site_keys).transform("sum")

    df_gvp_pred[f"p_t{t}"] = weights / site_total
    df_gvp_pred[f"log_p_t{t}"] = shifted - np.log(site_total)
    return df_gvp_pred


In [9]:

def get_joint_log_prob_mutants(df, muts, p_col="log_p_t0.1"):
    """Sum per-mutation log-probabilities for a combinatorial variant.

    Positions are treated as independent, so the joint score is the sum of
    the ``p_col`` values of the individual mutations. Scores of variants with
    different numbers of listed mutations are not comparable.

    Args:
        df: DataFrame with a ``mut`` column and the ``p_col`` column.
        muts: Mutations joined by ``":"``, e.g. ``"D5L:R6K"``.
        p_col: Name of the log-probability column to sum.

    Returns:
        The summed log-probability. If any mutation is absent from ``df`` a
        message is printed and NaN is returned, so an incomplete sum can
        never be mistaken for a (misleadingly high) valid score.
    """
    log_prob = 0
    for m in muts.split(":"):
        hits = df.loc[df["mut"] == m, p_col]
        if hits.empty:
            # Same message as before; the partial sum is discarded.
            print("index_error with mut:{}".format(muts))
            return float("nan")
        # Use the first match if the mutation is listed more than once.
        log_prob += hits.iloc[0]
    return log_prob


In [62]:
# Load the GB1 variant table; the columns used further down are `muts`
# (":"-joined mutation labels) and `pos` (position list stored as text).
zs_df = pd.read_csv("/disk2/fli/SSMuLA/ev_esm2/GB1/GB1.csv")
zs_df

Unnamed: 0,muts,fit,split,seq,combo,pos,esm_score,n_mut,ev_score
0,WT,1.000000,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYD...,['NA'],,,1,
1,V39A,0.061910,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGADGEWTYD...,['A'],[39],-4.358272,1,-6.846682
2,V39C,0.242237,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGCDGEWTYD...,['C'],[39],-6.405176,1,-6.846682
3,V39D,0.006472,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGDDGEWTYD...,['D'],[39],-5.624555,1,-6.846682
4,V39E,0.032719,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGEDGEWTYD...,['E'],[39],-5.173347,1,-6.846682
...,...,...,...,...,...,...,...,...,...
149356,V39Y:D40Y:G41Y:V54R,0.001350,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'R']","[39, 40, 41, 54]",-48.258465,4,-28.470377
149357,V39Y:D40Y:G41Y:V54S,0.004421,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'S']","[39, 40, 41, 54]",-49.006076,4,-28.470377
149358,V39Y:D40Y:G41Y:V54T,0.021200,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'T']","[39, 40, 41, 54]",-48.757724,4,-28.470377
149359,V39Y:D40Y:G41Y:V54W,0.009136,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'W']","[39, 40, 41, 54]",-48.818668,4,-28.470377


In [52]:
# Recover the library positions from the longest textual entry of `pos`
# (e.g. "[39, 40, 41, 54]") and parse it into an integer array.
# NOTE(review): `key=len` compares string lengths, not the number of
# positions -- this assumes the entry listing every site is also the longest
# string; confirm before reusing on other datasets.
site_list = np.array(
    [
        int(num)
        for num in max(
            (v for v in zs_df["pos"].unique() if pd.notna(v)),
            key=len,
        )
        .strip("[]")
        .split(", ")
    ]
)

site_list

array([39, 40, 41, 54])

In [57]:
# NOTE: despite its name, `coves_df` holds the *path* of the CSV, not a DataFrame.
coves_df = "/disk2/fli/SSMuLA/coves/100/GB1_A.csv"
# Scoring with CoVES

# Load the per-residue effect scores (inferred from each residue's structural
# surroundings) and split the mutation labels into pos / mut_aa / wt_pos.
df_gvp_pred = read_res_pred(coves_df)

# Normalise the scores at each position into probabilities and
# log-probabilities; the temperature `t` controls how sharply the
# normalisation weights score differences. Adds `p_t0.1` and `log_p_t0.1`.
df_gvp_pred=add_p_col_df_gvp_log(df_gvp_pred, t=0.1)
df_gvp_pred

Unnamed: 0,index,mut,mean_x,std_x,pos,mut_aa,wt_pos,p_t0.1,log_p_t0.1
0,0,M1A,-7.248457,0.0,1,A,M1,3.706373e-16,-35.531308
1,1,M1R,-5.380106,0.0,1,R,M1,4.820525e-08,-16.847798
2,2,M1N,-6.560279,0.0,1,N,M1,3.611358e-13,-28.649522
3,3,M1D,-6.611183,0.0,1,D,M1,2.170681e-13,-29.158565
4,4,M1C,-9.340178,0.0,1,C,M1,3.052952e-25,-56.448518
...,...,...,...,...,...,...,...,...,...
1115,15,E56S,-8.642832,0.0,56,S,E56,8.312300e-14,-30.118455
1116,16,E56T,-5.859381,0.0,56,T,E56,1.018816e-01,-2.283944
1117,17,E56W,-12.355153,0.0,56,W,E56,6.270681e-30,-67.241668
1118,18,E56Y,-12.268634,0.0,56,Y,E56,1.489577e-29,-66.376475


In [58]:
# Keep only the predictions at the library positions in `site_list`;
# `.copy()` makes the slice an independent frame.
sliced_df_gvp = df_gvp_pred[df_gvp_pred["pos"].isin(site_list)].copy()
sliced_df_gvp

Unnamed: 0,index,mut,mean_x,std_x,pos,mut_aa,wt_pos,p_t0.1,log_p_t0.1
760,0,V39A,-15.170280,0.0,39,A,V39,3.046234e-48,-1.094102e+02
761,1,V39R,-13.225900,0.0,39,R,V39,8.474176e-40,-8.996638e+01
762,2,V39N,-13.694249,0.0,39,N,V39,7.835769e-42,-9.464987e+01
763,3,V39D,-16.779905,0.0,39,D,V39,3.113488e-55,-1.255064e+02
764,4,V39C,-10.791400,0.0,39,C,V39,3.169535e-29,-6.562138e+01
...,...,...,...,...,...,...,...,...,...
1075,15,V54S,-12.546754,0.0,54,S,V54,1.529398e-38,-8.707336e+01
1076,16,V54T,-9.540358,0.0,54,T,V54,1.742343e-25,-5.700940e+01
1077,17,V54W,-11.640129,0.0,54,W,V54,1.324164e-34,-7.800711e+01
1078,18,V54Y,-11.950289,0.0,54,Y,V54,5.955737e-36,-8.110871e+01


In [59]:
# Wild-type residue + position labels of the library sites, in order of first appearance.
sliced_df_gvp["wt_pos"].unique()

array(['V39', 'D40', 'G41', 'V54'], dtype=object)

In [65]:
# Define the function to format the mutations
def format_mutations(muts, positions):
    """Expand a mutation string so that every library position is listed.

    Args:
        muts: Mutations joined by ``":"``, e.g. ``"V39A:G41C"``. Each entry
            is read as ``<wt residue + position><new residue>``.
        positions: Iterable of wild-type labels such as ``"V39"``; the first
            character of each label is the wild-type residue.

    Returns:
        A ``":"``-joined string with one entry per element of ``positions``,
        in that order. Positions not mentioned in ``muts`` keep their
        wild-type residue (``"V39V"``). An input such as ``"WT"`` therefore
        yields the all-wild-type string, provided no label equals ``"W"``.
    """
    # Map each mutated site label (e.g. "V39") to its replacement residue.
    new_residue_at = {}
    for entry in muts.split(':'):
        new_residue_at[entry[:-1]] = entry[-1]

    expanded = []
    for pos in positions:
        # Fall back to the wild-type letter when the site is not mutated.
        residue = new_residue_at.get(pos, pos[0])
        expanded.append(f"{pos}{residue}")

    return ':'.join(expanded)


In [66]:
# Expand every variant's `muts` string into a `verbose_muts` string that names
# all library positions (unmutated sites get their wild-type residue).
zs_df['verbose_muts'] = zs_df['muts'].apply(format_mutations, positions=sliced_df_gvp["wt_pos"].unique())
zs_df

Unnamed: 0,muts,fit,split,seq,combo,pos,esm_score,n_mut,ev_score,verbose_muts
0,WT,1.000000,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYD...,['NA'],,,1,,V39V:D40D:G41G:V54V
1,V39A,0.061910,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGADGEWTYD...,['A'],[39],-4.358272,1,-6.846682,V39A:D40D:G41G:V54V
2,V39C,0.242237,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGCDGEWTYD...,['C'],[39],-6.405176,1,-6.846682,V39C:D40D:G41G:V54V
3,V39D,0.006472,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGDDGEWTYD...,['D'],[39],-5.624555,1,-6.846682,V39D:D40D:G41G:V54V
4,V39E,0.032719,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGEDGEWTYD...,['E'],[39],-5.173347,1,-6.846682,V39E:D40D:G41G:V54V
...,...,...,...,...,...,...,...,...,...,...
149356,V39Y:D40Y:G41Y:V54R,0.001350,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'R']","[39, 40, 41, 54]",-48.258465,4,-28.470377,V39Y:D40Y:G41Y:V54R
149357,V39Y:D40Y:G41Y:V54S,0.004421,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'S']","[39, 40, 41, 54]",-49.006076,4,-28.470377,V39Y:D40Y:G41Y:V54S
149358,V39Y:D40Y:G41Y:V54T,0.021200,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'T']","[39, 40, 41, 54]",-48.757724,4,-28.470377,V39Y:D40Y:G41Y:V54T
149359,V39Y:D40Y:G41Y:V54W,0.009136,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'W']","[39, 40, 41, 54]",-48.818668,4,-28.470377,V39Y:D40Y:G41Y:V54W


In [67]:
# Compute the combinatorial variant effect score for each variant by summing
# the per-site amino-acid log-probabilities (t=0.1) over all positions listed
# in `verbose_muts`.
# NOTE(review): the previous comment here referred to an "antitoxi 3 position
# library"; this notebook loads the GB1 table with four library positions.
zs_df['coves_pred_t0.1'] = zs_df["verbose_muts"].apply(
    lambda m: get_joint_log_prob_mutants(df_gvp_pred, m, p_col = 'log_p_t0.1'))

zs_df


Unnamed: 0,muts,fit,split,seq,combo,pos,esm_score,n_mut,ev_score,verbose_muts,coves_pred_t0.1
0,WT,1.000000,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYD...,['NA'],,,1,,V39V:D40D:G41G:V54V,-19.324869
1,V39A,0.061910,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGADGEWTYD...,['A'],[39],-4.358272,1,-6.846682,V39A:D40D:G41G:V54V,-124.890427
2,V39C,0.242237,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGCDGEWTYD...,['C'],[39],-6.405176,1,-6.846682,V39C:D40D:G41G:V54V,-81.101631
3,V39D,0.006472,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGDDGEWTYD...,['D'],[39],-5.624555,1,-6.846682,V39D:D40D:G41G:V54V,-140.986685
4,V39E,0.032719,single,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGEDGEWTYD...,['E'],[39],-5.173347,1,-6.846682,V39E:D40D:G41G:V54V,-120.731138
...,...,...,...,...,...,...,...,...,...,...,...
149356,V39Y:D40Y:G41Y:V54R,0.001350,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'R']","[39, 40, 41, 54]",-48.258465,4,-28.470377,V39Y:D40Y:G41Y:V54R,-299.015156
149357,V39Y:D40Y:G41Y:V54S,0.004421,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'S']","[39, 40, 41, 54]",-49.006076,4,-28.470377,V39Y:D40Y:G41Y:V54S,-302.528969
149358,V39Y:D40Y:G41Y:V54T,0.021200,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'T']","[39, 40, 41, 54]",-48.757724,4,-28.470377,V39Y:D40Y:G41Y:V54T,-272.465007
149359,V39Y:D40Y:G41Y:V54W,0.009136,multi,MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGYYYEWTYD...,"['Y', 'Y', 'Y', 'W']","[39, 40, 41, 54]",-48.818668,4,-28.470377,V39Y:D40Y:G41Y:V54W,-293.462722
