This notebook generates the .mut file needed for ddG calculations using Triad.

In [2]:
# Import necessary modules
import pandas as pd
import numpy as np
from itertools import product
import re

In [4]:
# Read the processed DHFR table and show how many rows it contains
df = pd.read_csv("/disk2/fli/SSMuLA/data/DHFR/processed/DHFR.csv")
df.shape[0]

9261

In [6]:
# Number of entries in the "AAs" column (one per row of df)
len(df["AAs"])

9261

Load the Pgb fitness file (`Pgb_fitness.csv`) and build mutants using it.

In [2]:
# Read the Pgb fitness table, then collect its "Combo" column as a plain
# Python list (one entry per row, duplicates kept)
fitness_df = pd.read_csv("/disk1/jyang4/repos/data/Pgb_fitness.csv")
measured = fitness_df["Combo"].tolist()

In [7]:
# Display the loaded fitness table for inspection
fitness_df

Unnamed: 0.2,Unnamed: 0.1,Combo,Unnamed: 0,Alignment Probability,Alignment Count,Column,StdArea,Iso1Area,Iso2Area,num_wells,...,EVMutation Rank,Triad Score,Triad Rank,NormIso1_recomb,NormIso2_recomb,NormIso1_recomb_rank,NormIso2_recomb_rank,NormIso1,NormIso2,Diff
0,0,FRMNY,7.0,0.624151,339.0,2.0,484.0,3398.0,567.0,1,...,69.0,-648.97722,33,,,,,4.586441,0.765307,3.821134
1,1,SAFRY,2.0,0.755508,128.0,8.0,487.0,3778.0,963.0,1,...,118.0,-646.67160,48,0.835324,1.329356,20.0,70.0,5.067932,1.291800,3.776133
2,2,GIDLY,165.0,0.086459,197.0,5.0,501.0,2485.0,428.0,1,...,112.0,-642.32678,97,0.000000,0.000000,155.5,155.5,3.449043,0.594040,2.855002
3,3,TSGMY,28.0,0.244922,154.0,3.0,497.0,3058.0,1062.0,1,...,90.0,-645.07324,60,,,,,4.019563,1.395937,2.623626
4,5,TNMPY,331.0,0.867393,241.0,11.0,484.0,1788.0,367.0,1,...,146.0,-633.74115,162,0.262932,0.495441,52.0,132.0,3.260248,0.669190,2.591058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,183,LQSGA,353.0,0.172635,238.0,12.0,474.0,276.0,2725.0,1,...,130.0,-642.98517,84,0.000000,0.000000,155.5,155.5,0.513877,5.073604,-4.559727
212,186,QYKGD,286.0,0.001800,177.0,4.0,442.0,107.0,2436.0,1,...,80.0,-641.25164,113,0.390436,4.028886,38.0,2.0,0.213644,4.863887,-4.650243
213,213,PCLTD,251.0,0.717479,513.0,1.0,506.0,88.0,3513.0,1,...,216.0,-629.35627,187,0.075245,2.350554,87.0,19.0,0.120283,4.801761,-4.681477
214,190,NNVER,306.0,0.687294,230.0,7.0,509.0,106.0,2847.0,1,...,176.0,-640.64290,120,,,,,0.183788,4.936261,-4.752473


In [8]:
# List the column names available in the fitness table
fitness_df.columns

Index(['Unnamed: 0.1', 'Combo', 'Unnamed: 0', 'Alignment Probability',
       'Alignment Count', 'Column', 'StdArea', 'Iso1Area', 'Iso2Area',
       'num_wells', 'EVMutation', 'EVMutation Rank', 'Triad Score',
       'Triad Rank', 'NormIso1_recomb', 'NormIso2_recomb',
       'NormIso1_recomb_rank', 'NormIso2_recomb_rank', 'NormIso1', 'NormIso2',
       'Diff'],
      dtype='object')

In [4]:
# One-letter codes of the 20 amino acids used to enumerate combinations
AAs = tuple("ARNDCQEGHILKMFPSTWYV")

In [5]:
# Every possible 5-residue combination over AAs, joined into strings.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of
# the kernel session; it is kept because other cells may refer to it.
all = list(map("".join, product(AAs, repeat=5)))

In [6]:
# Alternative (disabled): every combination that was NOT measured
#variants = list(set(all) - set(measured))

# Unique measured combos; note that a set has no guaranteed iteration order
variants = {combo for combo in measured}

Now make the full mut file:

In [12]:
# Define the wildtype amino acids at each position
wt_aas = ("W", "Y", "W", "V", "F")

# Define the prefixes for mutagenesis, one per entry of wt_aas
# (presumably chain ID + residue number -- confirm against the Triad input structure)
prefixes = ("A_56", "A_57", "A_59", "A_60", "A_89")


def _encode_variant(variant, wt_aas, prefixes):
    """Build the mutation encoding for a single variant.

    Parameters
    ----------
    variant : str
        One character per mutated site, in the same order as `wt_aas`.
    wt_aas : sequence of str
        Wild-type amino acid at each site.
    prefixes : sequence of str
        Prefix written in front of the new amino acid for each site.

    Returns
    -------
    str
        The per-site encodings (prefix + new amino acid) joined with "+".
        An empty string means the variant is identical to wild type.

    Raises
    ------
    ValueError
        If `variant` does not have exactly one character per site; zip()
        would otherwise truncate silently and produce a wrong encoding.
    """
    if len(variant) != len(wt_aas):
        raise ValueError(
            f"Variant {variant!r} has {len(variant)} positions, expected {len(wt_aas)}"
        )

    return "+".join(
        prefix + var_char
        for prefix, var_char, wt_char in zip(prefixes, variant, wt_aas)
        if var_char != wt_char
    )


# Get the variants
all_variants = variants

# Loop over variants. `all_variants` is a set, whose iteration order for
# strings can change between Python sessions, so sort it to make the order of
# lines in the .mut file reproducible.
mutation_encodings = []
for variant in sorted(all_variants):
    encoding = _encode_variant(variant, wt_aas, prefixes)

    # An empty encoding is wild type, which is not written to the file
    if encoding:
        mutation_encodings.append(encoding + "\n")

# Save the mutants
with open("ParPgb_5site_round1.mut", "w") as f:
    f.writelines(mutation_encodings)
    
# # Save the mutants
# with open("2GI9_Test.mut", "w") as f:
#     f.writelines(mutation_encodings[:400])    

Sanity check to make sure the mutation_encodings map back to the mutations:

In [34]:
# Order-independent sanity check: every line of mutation_encodings must map
# back to exactly one of the variants that was encoded. Everything is derived
# from wt_aas / prefixes / all_variants so the check always matches the library
# that was actually written to the .mut file.

# Wild type produces no encoding, so it is excluded from the expected set
wt_combo = "".join(wt_aas)
expected_combos = set(all_variants) - {wt_combo}

# Assert that we have the correct number of mutations
assert len(mutation_encodings) == len(expected_combos)

# Compile a regular expression capturing the residue number and the new amino acid
position_re = re.compile("_([0-9]+)([A-Z]{1})")

# Define a dictionary linking each residue number back to its wt aa, and a
# second one giving the index of that residue within the combo string
wt_aa_dict = {prefix.split("_")[1]: wt_char for prefix, wt_char in zip(prefixes, wt_aas)}
site_index = {prefix.split("_")[1]: i for i, prefix in enumerate(prefixes)}

# Reconstruct variants
reconstructed_combos = set()
for mutation_encoding in mutation_encodings:

    # Start from wild type and apply every substitution found in the encoding
    combo = list(wt_aas)
    for pos, new_char in position_re.findall(mutation_encoding):

        # Only real substitutions are ever encoded
        assert new_char != wt_aa_dict[pos]
        combo[site_index[pos]] = new_char

    reconstructed_combos.add("".join(combo))

# Together with the length check above, this shows that the encodings and the
# non-wild-type input variants correspond one to one
assert reconstructed_combos == expected_combos