Working Jupyter notebook for building a function that creates a list of amino acids likely to be used for mutation.

To-dos:
- iterate the cutoff value from -0.05 to 0.05 in steps of 0.005 or 0.01
- or reintroduce the first_entry loop to force a mutation in case none was found (not sure how well that works)

In [None]:
# Locations used by the example run further down (directory roles presumed from the names).
path_pdb, path_pqr = './data/pdbs', './data/pqrs'
pdb_file = 'AF-C0H3V2-F1.pdb'
# NOTE: prot_mut reads `output_path` as a global when it calls AA2s4pred.
output_path = './data/test'



In [None]:
def _keep_best(heap, score, payload, limit, tiebreak, smallest=True):
    """Keep at most ``limit`` entries with the smallest (or largest) scores in ``heap``.

    Args:
        heap: list managed exclusively through this helper.
        score: numeric ranking value of the new entry.
        payload: tuple of data stored alongside the score; it is never compared.
        limit: maximum number of entries retained.
        tiebreak: iterator of unique, increasing integers (e.g. ``itertools.count()``).
        smallest: retain the lowest scores when True, the highest when False.
    """
    from heapq import heappush, heapreplace

    # heapq is a min-heap, so the root must be the *worst* retained entry in
    # order to evict it cheaply: negate the key when the lowest scores are kept.
    key = -score if smallest else score
    # The counter decides ties so the comparison never reaches the payload
    # (deviation lists / sequences are not meant to be ordered).
    entry = (key, next(tiebreak), score, payload)
    if len(heap) < limit:
        heappush(heap, entry)
    elif key > heap[0][0]:
        heapreplace(heap, entry)


def _ranked(heap):
    """Return the ``(score, *payload)`` tuples of a ``_keep_best`` heap, lowest score first."""
    ordered = sorted(heap, key=lambda entry: (entry[2], entry[1]))
    return [(score, *payload) for _, _, score, payload in ordered]


def prot_mut(pdb_path, pdb_file, pqr_output_path, Deep_mut=True, iterations=1, cutoff_value = -3, threshhold = 10, seed = 0):
    """Search for a mutated sequence whose feature deviation is lower than the wild type's.

    The wild-type (WT) sequence is read from the PDB file, random mutants are
    generated with ``mutator_rand`` and scored with ``diff_weighted``; the WT
    plus the 10 lowest-scoring mutants are then refined with up to
    ``iterations`` rounds of ``mutator_rational`` each.

    Args:
        pdb_path: directory containing ``pdb_file``.
        pdb_file: file name of the structure to mutate.
        pqr_output_path: path handed to ``functional_aa``.
        Deep_mut: currently unused; kept for backward compatibility.
        iterations: maximum number of rational-mutation rounds per starting sequence.
        cutoff_value: forwarded to ``mutator_rational`` as ``cutoff``.
        threshhold: forwarded to ``mutator_rand``.
        seed: forwarded to ``Subst_reducer`` and ``mutator_rand``.

    Returns:
        Tuple ``(Improvement, best_Mut_prot_list, aa_list, WT_dev_sum,
        best_Mut_dev_sum)`` where ``Improvement`` is
        ``WT_dev_sum - best_Mut_dev_sum``.
    """
    # project-local helpers (imported lazily, as the notebook does everywhere)
    from function import functional_aa, free_aa, AA2s4pred
    from function_mut import diff_weighted, mutator_rand, mutator_rational
    from SPARC import SPARC  # not used yet: the SPARC step below is still missing
    from helper_function import Subst_reducer, pdb2AA, ArraySlice

    from itertools import count

    # Define variables

    # top 20 features that correlate with melt point (positive and negative), based on prokaryotes323 columns, cutoff at +/- 0.32
    pos_corr = {'YR': 0.492442,
                'RP': 0.480594,
                'RG': 0.442533,
                'R': 0.432905,
                'WR': 0.392523,
                'YP': 0.390709,
                'LR': 0.386596,
                'FR': 0.376553,
                'VR': 0.370241,
                'ER': 0.369174,
                'RC': 0.357052,
                'Rhelix': 0.349204, # percentage of R in all helices
                'MR': 0.343496,
                'P': 0.340082,
                'PG': 0.338434,
                'EAmotif': 0.332202, # E directly followed by A
                'LP': 0.330337,
                'EP': 0.328569,
                'RH': 0.326193,
                'AR': 0.324355,
                'ARmotif': 0.3241, # A directly followed by R
                'NR': 0.323378}
    neg_corr = {'QT': -0.52935,
                'MQ': -0.507329,
                'QS': -0.502697,
                'QC': -0.493738,
                'Q': -0.469765,
                'QD': -0.466556,
                'QH': -0.455041,
                'NQ': -0.435562,
                'IQ': -0.429683,
                'FQ': -0.42363,
                'WQ': -0.420057,
                'QK': -0.41872,
                'PolarAA': -0.406035,
                'ST': -0.396185,
                'Qhelix': -0.37895,
                'TC': -0.364763,
                'MT': -0.359458,
                'TH': -0.346921,
                'TD': -0.34469,
                'SH': -0.327684,
                'SC': -0.321686,
                'T': -0.321208}

    # frequency of amino acids in pos_corr and neg_corr (deprecated, kept for reference)
    sorted_freq_pos = ['R', 'P', 'L', 'Y', 'E', 'G', 'A', 'V', 'M', 'F', 'W', 'N', 'H', 'C']
    sorted_freq_neg = ['K', 'N', 'W', 'F', 'I', 'D', 'M', 'C', 'H', 'S', 'T', 'Q']

    # ideal values for each feature taken from top 10 % of prokaryotes323 columns
    ideal_pos_value = {'AR': 0.19577028919092312,
                       'VR': 0.16570774850840858,
                       'LR': 0.22057443760531548,
                       'LP': 0.19913083779456836,
                       'MR': 0.10189569954318338,
                       'FR': 0.1214807452476434,
                       'WR': 0.09732041140609597,
                       'NR': 0.10350733491352844,
                       'YR': 0.11505796446768121,
                       'YP': 0.09361436465693411,
                       'ER': 0.1750096588731577,
                       'EP': 0.15356605906241058,
                       'RH': 0.10505828276892132,
                       'RC': 0.0895071066024632,
                       'RP': 0.14867680267495723,
                       'RG': 0.1743851008086472,
                       'PG': 0.15294150099790005,
                       'PolarAA': 0.14402202391640712,
                       'R': 0.08506020124285214,
                       'P': 0.06361660143210504,
                       'ARmotif': 0.010758762261349177,
                       'EAmotif': 0.01633251571740483,
                       'Rhelix': 0.10373774208196636
                       }

    ideal_neg_value = {'IQ': 0.05490211183163266,
                       'MQ': 0.03973022298244349,
                       'MT': 0.054568274035922376,
                       'FQ': 0.05931526868690351,
                       'WQ': 0.03515493484535609,
                       'NQ': 0.041341858352788544,
                       'QS': 0.057844351285310555,
                       'QT': 0.06062750041770345,
                       'QD': 0.06216042713743332,
                       'QH': 0.04289280620818142,
                       'QK': 0.06281534035966867,
                       'QC': 0.027341630041723304,
                       'ST': 0.07268240233878943,
                       'SH': 0.054947708129267414,
                       'SC': 0.03939653196280929,
                       'TD': 0.0769984781909122,
                       'TH': 0.0577308572616603,
                       'TC': 0.04217968109520218,
                       'Q': 0.022894724682112268,
                       'T': 0.03773277573559115,
                       'PolarAA': 0.14402202391640712,
                       'Qhelix': 0.02831461743013483
                       }

    # possible substitutions for each amino acid, taken from literature
    # https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003674
    conserv_subst = {
        'A': ['D', 'E', 'G', 'S', 'T'],
        'C': ['G', 'R', 'S', 'W', 'Y'],
        'D': ['A', 'E', 'G', 'H', 'N', 'V', 'Y'],
        'E': ['A', 'D', 'G', 'K', 'Q', 'V'],
        'F': ['I', 'L', 'Y'],
        'G': ['A', 'C', 'D', 'E', 'R'],
        'H': ['D', 'L', 'N', 'P', 'Q', 'R', 'Y'],
        'I': ['F', 'L', 'M', 'N', 'V'],
        'K': ['E', 'M', 'N', 'Q', 'R', 'T'],
        'L': ['F', 'H', 'I', 'M', 'P', 'Q', 'R', 'V', 'W'],
        'M': ['I', 'K', 'L', 'R', 'T', 'V'],
        'N': ['D', 'H', 'I', 'K', 'S', 'T', 'Y'],
        'P': ['H', 'L', 'Q', 'R', 'S'],
        'Q': ['E', 'H', 'K', 'L', 'P', 'R'],
        'R': ['C', 'G', 'H', 'K', 'L', 'M', 'P', 'Q', 'T', 'W'],
        'S': ['A', 'C', 'N', 'P', 'T', 'W', 'Y'],
        'T': ['A', 'K', 'M', 'N', 'R', 'S'],
        'V': ['D', 'E', 'I', 'L', 'M'],
        'W': ['C', 'L', 'R', 'S'],
        'Y': ['C', 'D', 'F', 'H', 'N'],
        }
    # currently not in use
    non_conservative_substitutions = {
        'A': ['P', 'V'],
        'C': ['F'],
        'F': ['C', 'S', 'V'],
        'G': ['S', 'V', 'W'],
        'I': ['K', 'R', 'S', 'T'],
        'K': ['I'],
        'L': ['S'],
        'P': ['A', 'T'],
        'Q': ['E', 'K'],
        'R': ['I', 'S'],
        'S': ['F', 'G', 'I', 'L', 'R'],
        'T': ['I', 'P'],
        'V': ['A', 'F', 'G'],
        'W': ['G'],
        }

    # extract protein features
    aa_list = pdb2AA(pdb_path, pdb_file)
    aa_locked = functional_aa(pdb_path, pdb_file, pqr_output_path)
    aa_free = free_aa(pdb_path, pdb_file, aa_locked)
    aa_str = ''.join(aa_list)
    # key: column 2 of aa_free, value: column 1 (per the original note: absolute position -> amino acid)
    free_AA_dict = dict(zip(aa_free[:, 2], aa_free[:, 1]))
    # NOTE(review): `output_path` is not a parameter; it is read from the notebook's
    # global namespace. Confirm whether `pqr_output_path` was intended here.
    sec_prediction = AA2s4pred('./data/s4pred', output_path, aa_str, pdb_file)
    possible_substitutions = Subst_reducer(sec_prediction, conserv_subst, free_AA_dict, seed = seed)

    # calculate WT deviations
    WT_dev_sum, WT_dev = diff_weighted(pos_corr, neg_corr, aa_list, ideal_pos_value, ideal_neg_value, sec_prediction)

    # randomly mutate protein (within given constraints), keep the 10 best mutants
    # TODO: implement selection of the 10 sequences which vary most from each other / WT
    tiebreak = count()
    top_variations = []      # 10 lowest deviation sums among the random mutants
    largest_variations = []  # 10 mutants with most positions changed vs. WT (not consumed yet, see TODO)

    Mut_seq_str = mutator_rand(aa_list, possible_substitutions, threshhold = threshhold, seed = seed)
    for Mut_prot in Mut_seq_str:
        Mut_dev_sum, Mut_dev = diff_weighted(pos_corr, neg_corr, Mut_prot, ideal_pos_value, ideal_neg_value, sec_prediction, sort = True)
        # number of positions at which the mutant differs from the wild type
        Str_dev = sum(c1 != c2 for c1, c2 in zip(''.join(Mut_prot), aa_str))

        _keep_best(top_variations, Mut_dev_sum, (Mut_dev, Mut_prot), 10, tiebreak, smallest=True)
        _keep_best(largest_variations, Str_dev, (Mut_dev, Mut_prot), 10, tiebreak, smallest=False)

    print('Random mutation finished')

    # Starting points for the rational refinement: the wild type plus the best
    # random mutants, ordered by deviation sum (lowest first).
    candidates = [(WT_dev_sum, WT_dev, aa_str)] + _ranked(top_variations)
    candidates.sort(key=lambda candidate: candidate[0])

    # Initialise the running best from ONE candidate so that sequence, score
    # and deviation list always belong together.
    best_Mut_dev_sum, best_Mut_dev, best_start_seq = candidates[0]
    best_Mut_prot_list = list(best_start_seq)

    # collect top 5 best variations of rational improvement to calculate melt point
    # (consumer not implemented yet, see the SPARC note below)
    top_top_variations = []

    # use the selected sequences as starting points for rational improvement
    for _start_dev_sum, start_dev, start_seq in candidates:
        prev_Mut_prot_list = list(start_seq)
        prev_Mut_dev = start_dev

        for k in range(iterations):
            Mut_prot_list, possible_mutations = mutator_rational(
                AA_list = prev_Mut_prot_list,
                free_AA = aa_free,
                deviation = prev_Mut_dev,
                pos_corr = pos_corr,
                neg_corr = neg_corr,
                conserv_substitution = possible_substitutions,
                ideal_pos_value = ideal_pos_value,
                ideal_neg_value = ideal_neg_value,
                cutoff = cutoff_value,
                sec_prediction = sec_prediction
                )

            # deviation of the newly mutated sequence
            Mut_dev_sum, Mut_dev = diff_weighted(pos_corr, neg_corr, Mut_prot_list, ideal_pos_value, ideal_neg_value, sec_prediction)

            _keep_best(top_top_variations, Mut_dev_sum, (Mut_dev, Mut_prot_list), 5, tiebreak, smallest=True)

            # NOTE(review): the random phase ranks by signed score while this test
            # uses the absolute value; the two agree only for non-negative sums.
            if abs(best_Mut_dev_sum) > abs(Mut_dev_sum):
                best_Mut_prot_list = Mut_prot_list
                best_Mut_dev_sum = Mut_dev_sum
                best_Mut_dev = Mut_dev
            elif Mut_dev[0][0] == prev_Mut_dev[0][0] and abs(Mut_dev[0][1]-prev_Mut_dev[0][1]) < 0.001:
                # first deviation entry is (almost) unchanged from the previous round: stop refining this start
                break

            # update variables for next iteration
            prev_Mut_prot_list = Mut_prot_list
            prev_Mut_dev = Mut_dev

    #-------SPARC implementation missing (would consume top_top_variations)--------#

    Improvement = WT_dev_sum - best_Mut_dev_sum

    return Improvement, best_Mut_prot_list, aa_list, WT_dev_sum, best_Mut_dev_sum

In [None]:
# Structure files available for trial runs.
test_list = [
    'AF-P0AGD1-F1.pdb',
    'AF-O34633-F1.pdb',
    'AF-Q72L06-F1.pdb',
    'AF-Q746J6-F1.pdb',
    'AF-R4YU54-F1.pdb',
    'AF-Q72L88-F1.pdb',
    'AF-P21340-F1.pdb',
    'AF-Q745V2-F1.pdb',
    'AF-C0H3V2-F1.pdb',
]
pdb_file = test_list[8]
Improv_dict = {}
# Disabled batch run over all files. NOTE: prot_mut returns five values, so the
# two-name unpacking below must be adapted before this is re-enabled.
""" for i in test_list:
    Improv, best_it = prot_mut(path_pdb, i, path_pqr, Deep_mut=True, iterations=5, cutoff_value = 0.01)
    Improv_dict[i] = [Improv, best_it] """

# Single run on the selected structure.
Improvement, Mutated_prot, WT_prot, WT_dev_sum, best_Mut_dev_sum = prot_mut(
    path_pdb,
    pdb_file,
    path_pqr,
    Deep_mut=True,
    iterations=100,
    cutoff_value=-0.05,
    threshhold=10000,
    seed=0,
)

In [None]:
# Report the scores and both sequences (as returned and as joined strings),
# one value per line, with a blank gap between the two groups.
print(WT_dev_sum, best_Mut_dev_sum, Improvement, Mutated_prot, WT_prot, sep='\n')
print('\n')
print(''.join(Mutated_prot), ''.join(WT_prot), sep='\n')

In [None]:
# Average improvement over the proteins that actually improved
# (entries with an improvement of 0 add nothing to the total and are not counted).
print(Improv_dict)
# Deliberately not named `sum`: a global of that name would shadow the builtin
# that prot_mut relies on when it is re-run in the same session.
total_improvement = 0
count = 0
for entry in Improv_dict.values():
    total_improvement += entry[0]
    if entry[0] != 0:
        count += 1

# Improv_dict is empty unless the batch loop is enabled, so guard the division.
if count:
    print(total_improvement / count)
else:
    print('No non-zero improvements recorded')

Get the list of features that correlate positively/negatively with the melting point

In [None]:
import pandas as pd
import os
# Load the prokaryote feature table and drop the two 'Unnamed' columns
# (presumably index columns left over from earlier CSV exports).
pro_df: pd.DataFrame = pd.read_csv(
    os.path.join('./data', 'prokaryotes_323columns.csv')
).drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [None]:
import numpy as np
# Rows whose melting point is at or above the 90th percentile.
top_df = pro_df.loc[pro_df['meltPoint'] >= pro_df['meltPoint'].quantile(0.9)]
Ideal_pos_value, Ideal_neg_value = {}, {}

# Mean of the helix features within that top group.
keys = ['Rhelix', 'Qhelix']
for key in keys:
    Ideal_pos_value[key] = np.mean(top_df[key])

# Disabled: the same computation for the negatively correlated features.
""" for key in neg_corr.keys():
    Ideal_neg_value[key] = np.mean(top_df[key]) """

print(Ideal_pos_value, Ideal_neg_value, sep='\n')


In [None]:
# Correlation of every numeric column with the melting point, then the
# features beyond the +/- 0.32 cutoff on either side.
pro_corr = pro_df.corr(numeric_only=True).loc[:, 'meltPoint']
pos_corr = pro_corr[pro_corr.gt(0.32)]
neg_corr = pro_corr[pro_corr.lt(-0.32)]

In [None]:
# Reduce both filtered correlation series to plain lists of feature names.
# pos_corr is taken from the filtered series (not from pro_corr, which holds
# every feature), mirroring the neg_corr line.
pos_corr = list(pos_corr.index)
neg_corr = list(neg_corr.index)
print(neg_corr)

Needs to be mutated

In [None]:
# Show the features whose correlation with the melting point is below -0.32.
print(pro_corr.loc[pro_corr.lt(-0.32)])