In [1]:

path_pdb = './data/pdbs'
path_pqr=  './data/pqrs'
pdb_file = 'AF-C0H3V2-F1.pdb'
output_path = './data/test'

prot_list = ['AF-P0AGD1-F1.pdb', 'AF-O34633-F1.pdb', 'AF-Q72L06-F1.pdb', 'AF-Q746J6-F1.pdb', 'AF-R4YU54-F1.pdb', 'AF-Q72L88-F1.pdb','AF-P21340-F1.pdb' , 'AF-Q745V2-F1.pdb', 'AF-C0H3V2-F1.pdb']


In [61]:
def prot_mut(pdb_path, pdb_file, pqr_output_path, seed = 0, threshhold = 10000):
    
    import os
    from helper_function import pdb2AA
    from function import functional_aa
    from function import free_aa
    from helper_function import Subst_reducer 
    from function import AA2s4pred
    from function_mut import diff_weighted
    from heapq import heappop, heappush
    
    
    # top 20 features that correlate with melt point (positive and negative), based on prokaryotes323 columns, cutoff at +/- 0.32
    pos_corr = {'YR': 0.492442, 
                'RP': 0.480594, 
                'RG': 0.442533, 
                'R': 0.432905, 
                'WR': 0.392523, 
                'YP': 0.390709, 
                'LR': 0.386596, 
                'FR': 0.376553, 
                'VR': 0.370241, 
                'ER': 0.369174, 
                'RC': 0.357052, 
                'Rhelix': 0.349204, # percentage of R in all helices
                'MR': 0.343496, 
                'P': 0.340082, 
                'PG': 0.338434, 
                'EAmotif': 0.332202, # EA hinterenander
                'LP': 0.330337, 
                'EP': 0.328569, 
                'RH': 0.326193, 
                'AR': 0.324355, 
                'ARmotif': 0.3241, # AR hinterenander
                'NR': 0.323378}
    
    neg_corr = {'QT': -0.52935, 
                'MQ': -0.507329, 
                'QS': -0.502697, 
                'QC': -0.493738, 
                'Q': -0.469765, 
                'QD': -0.466556, 
                'QH': -0.455041, 
                'NQ': -0.435562, 
                'IQ': -0.429683, 
                'FQ': -0.42363, 
                'WQ': -0.420057, 
                'QK': -0.41872, 
                'PolarAA': -0.406035, 
                'ST': -0.396185, 
                'Qhelix': -0.37895, 
                'TC': -0.364763, 
                'MT': -0.359458, 
                'TH': -0.346921, 
                'TD': -0.34469, 
                'SH': -0.327684, 
                'SC': -0.321686, 
                'T': -0.321208}
    
    #sorted aminoacids by frequency in pos_corrm neg_corr
    sorted_freq_pos = ['R', 'P', 'L', 'Y', 'E', 'G', 'A', 'V', 'M', 'F', 'W', 'N', 'H', 'C']
    sorted_freq_neg = ['K', 'N', 'W', 'F', 'I', 'D', 'M', 'C', 'H', 'S', 'T', 'Q']


    ideal_pos_value = {'AR': 0.19577028919092312, 
                       'VR': 0.16570774850840858,
                       'LR': 0.22057443760531548,
                       'LP': 0.19913083779456836,
                       'MR': 0.10189569954318338,
                       'FR': 0.1214807452476434,
                       'WR': 0.09732041140609597,
                       'NR': 0.10350733491352844,
                       'YR': 0.11505796446768121,
                       'YP': 0.09361436465693411,
                       'ER': 0.1750096588731577,
                       'EP': 0.15356605906241058,
                       'RH': 0.10505828276892132,
                       'RC': 0.0895071066024632,
                       'RP': 0.14867680267495723,
                       'RG': 0.1743851008086472,
                       'PG': 0.15294150099790005,
                       'PolarAA': 0.14402202391640712,
                       'R': 0.08506020124285214,
                       'P': 0.06361660143210504,
                       'ARmotif': 0.010758762261349177,
                       'EAmotif': 0.01633251571740483,
                       'Rhelix': 0.10373774208196636
                       }
    
    ideal_neg_value = {'IQ': 0.05490211183163266,
                       'MQ': 0.03973022298244349,
                       'MT': 0.054568274035922376,
                       'FQ': 0.05931526868690351,
                       'WQ': 0.03515493484535609,
                       'NQ': 0.041341858352788544,
                       'QS': 0.057844351285310555,
                       'QT': 0.06062750041770345,
                       'QD': 0.06216042713743332,
                       'QH': 0.04289280620818142,
                       'QK': 0.06281534035966867,
                       'QC': 0.027341630041723304,
                       'ST': 0.07268240233878943,
                       'SH': 0.054947708129267414,
                       'SC': 0.03939653196280929,
                       'TD': 0.0769984781909122,
                       'TH': 0.0577308572616603,
                       'TC': 0.04217968109520218,
                       'Q': 0.022894724682112268,
                       'T': 0.03773277573559115,
                       'PolarAA': 0.14402202391640712, 
                       'Qhelix': 0.02831461743013483
                       }
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

    #https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003674
    conserv_subst = {
        'A': ['D', 'E', 'G', 'S', 'T'],
        'C': ['G', 'R', 'S', 'W', 'Y'],
        'D': ['A', 'E', 'G', 'H', 'N', 'V', 'Y'],
        'E': ['A', 'D', 'G', 'K', 'Q', 'V'],
        'F': ['I', 'L', 'Y'],
        'G': ['A', 'C', 'D', 'E', 'R'],
        'H': ['D', 'L', 'N', 'P', 'Q', 'R', 'Y'],
        'I': ['F', 'L', 'M', 'N', 'V'],
        'K': ['E', 'M', 'N', 'Q', 'R', 'T'],
        'L': ['F', 'H', 'I', 'M', 'P', 'Q', 'R', 'V', 'W'],
        'M': ['I', 'K', 'L', 'R', 'T', 'V'],
        'N': ['D', 'H', 'I', 'K', 'S', 'T', 'Y'],
        'P': ['H', 'L', 'Q', 'R', 'S'],
        'Q': ['E', 'H', 'K', 'L', 'P', 'R'],
        'R': ['C', 'G', 'H', 'K', 'L', 'M', 'P', 'Q', 'T', 'W'],
        'S': ['A', 'C', 'N', 'P', 'T', 'W', 'Y'],
        'T': ['A', 'K', 'M', 'N', 'R', 'S'],
        'V': ['D', 'E', 'I', 'L', 'M'],
        'W': ['C', 'L', 'R', 'S'],
        'Y': ['C', 'D', 'F', 'H', 'N'],
        }

    non_conservative_substitutions = {
        'A': ['P', 'V'],
        'C': ['F'],
        'F': ['C', 'S', 'V'],
        'G': ['S', 'V', 'W'],
        'I': ['K', 'R', 'S', 'T'],
        'K': ['I'],
        'L': ['S'],
        'P': ['A', 'T'],
        'Q': ['E', 'K'],
        'R': ['I', 'S'],
        'S': ['F', 'G', 'I', 'L', 'R'],
        'T': ['I', 'P'],
        'V': ['A', 'F', 'G'],
        'W': ['G'],
        }


    #extract aminoacid list
    aa_list = pdb2AA(pdb_path, pdb_file)
    aa_str = ''.join(aa_list)
    aa_locked = functional_aa(pdb_path, pdb_file, pqr_output_path)
    aa_free = free_aa(pdb_path, pdb_file, aa_locked)
    free_AA_dict = {a: b for a, b in zip(aa_free[:,2], aa_free[:,1] )} # create dictionary from array the key is the absolute aminoacid position and value is the aminoacid

    sec_prediction = AA2s4pred('./data/s4pred', output_path, aa_str, pdb_file)
    
    WT_dev_sum = diff_weighted(pos_corr, neg_corr, aa_list, ideal_pos_value, ideal_neg_value, sec_prediction, sum_only = True)
    Best_dev_sum = WT_dev_sum
    best_Mut_prot = aa_str
    possible_substitutions = Subst_reducer(sec_prediction, conserv_subst, free_AA_dict, seed = seed)
    
    
    #------------------------create random mutatiosn -------------------------
    
    # Top 5 best variations with their scores (min-heap for lowest energy)
    top_variations = []
    heappush(top_variations, (float('inf'), None))  # Placeholder for lowest score
    
    Mut_seq_str = mutator_rand(aa_list, possible_substitutions, threshhold = threshhold, seed = seed)
    for Mut_prot in Mut_seq_str:
        Mut_dev_sum = diff_weighted(pos_corr, neg_corr, Mut_prot, ideal_pos_value, ideal_neg_value, sec_prediction, sum_only = True)

        #save the top 5 scores
        if len(top_variations) < 6:
            heappush(top_variations, (Mut_dev_sum, Mut_prot))  
                  
        elif Mut_dev_sum < top_variations[0][0]:
            heappush(top_variations, (Mut_dev_sum, Mut_prot))
            heappop(top_variations)
    
    #top_proteins = [variation for _, variation in top_variations]

            
        

    
    
    return Best_dev_sum, WT_dev_sum, top_variations

threshhold=1 -> 12.2s
threshhold=10 -> 29 s

In [62]:
Best_version, WT, best_versions = prot_mut(path_pdb, 'AF-P39846-F1.pdb', path_pqr, seed = 0, threshhold = 10)


Pqr file already exists
Salt_bridge finished


  distance[:,0] = distance[:,0].astype('int')
  theta = np.arccos((d_DH[n,1,1]**2 + d_HA[n,1:]**2 - d_DA[1:,1:][n,:]**2)/(2*d_DH[n,1,1]*d_HA[1:,1:][n,:]))


H_bond finished
VdW_interaction finished


  array[:,0] = array[:,0].astype('int')


c:\Users\marik\OneDrive - bwedu\Uni HD\FS 4\Bioinfo\topic04_02\data\test\AF-P39846-F1.pdb.fasta
c:\Users\marik\OneDrive - bwedu\Uni HD\FS 4\Bioinfo\topic04_02\data\test\AF-P39846-F1.pdb.fas
fasta file already exists ./data/test\AF-P39846-F1.pdb.fasta
fas file already exists


In [63]:
print(Best_version)
#print(WT)
print(best_versions)
print(f'\n')
for score in best_versions:
    if score[0] != float('inf'):
        print(score[0])
    

3.3432069164104483
[(3.343206916410448, 'MAQSAQIQDIYPLSHMQEGMLFHSLMDFSSKAYIEQTSFTITGNLCVDSFQKSLNLLVSRYDIFRTIFIKEVPDLTGPQQVVLSNRELTVYREDISRLADQEQQTLIDAFMTKDREKGFDLQKDPLMRLALFDRGDSQYTCVWTHHHIIMDGWCLGIILKEFFSMYDSLKNNSPVQLGSTVPYSRYIEWLGEQDQEETAAYWSEYLKEYGNTASIPRIKRRTADGNYKADQVSFSLAPDMVEKLTEAAQNWGVTLNTLFMSIWGVLLHRYNAADDAVFGSVISGRPSAIDGIESMVGLFINTVPVRIRSAEGITFSSLVKAVQEDILSSEQHGYYPLYEIQNHSPLKQGLIDHIFVFENYPVQLHQALSVESENDEGALKLSDISMSEQTNYDFNIVIVPGESFYIKFSYNADVYEREEMLRIQGHLKQALDCILTNPDVAVSDINIVPPEEQQVIQLFNETERPYVNKTIPQLFEEQAHKTPEAAALKMGNECWTYRQLQVRANQIAHALIEKGVGSGDIVAVMMGRSMEMPAALLGIWKAGGAYMPLDPHFPAERLSFLLKDSQAAQLLIEEDLISLIPPSYEGNTITIEHTESYQTEAPNMPPGDLAYLIYTSGTTGRPKGVLVDHHGIANTLQWRREEYSMTEQDISLHLFSYVFDGCVTSLFTPLLSGACVLLTTDDEAKDVLALKRKIARYKVSHMIIVPSLYRVLLEVMTADDAKSLRIVTFAGEAVTPDLLELNQIICPSAELANEYGPTENSVATTILRHLNKKERITIGHPIRNTKVFVLHGNQMQPIGAAGELCISGAGLARGYYKQQELTQKAFSDHPFLEGERLYRTGDAGRFLPDGTIEYIGRFDDQVKIRGYRIELREIETVLRQAPGVKEAAVLARDVSAEEKELVAYIVPEKGNSLPDLYQHLAGTLPSYMIPASIINISQMPLTSSGKLDRFALPEPENNT

In [None]:
protein = ['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A']
substitutions = {"1": ["B", "C"], "2": ["X", "Y", "Z"], "5" : ["M", "N", "O", "P"], "7": ["Q", "R", "S", "T"], "10": ["U", "V", "W", "X", "Y", "Z"]}
variations = mutator5(protein, substitutions, threshhold = 10000, seed = 1)
for variation in variations:
    print(f'{count} -- {variation}')


In [10]:

def mutator_rand(AAs_list, substitutions, threshhold = 100, seed = 0):
    from itertools import product
    import random
    count = 0
    random.seed(seed)
    
    keys = list(substitutions.keys())
    random.shuffle(keys)
    # Create a list of lists, each containing tuples of (position, substitution)
    while count < threshhold:

        subst_options = [[(pos, subst) for subst in [AAs_list[int(pos)-1]] + substitutions[pos]] for pos in keys]
        if not subst_options:
            break
        
        # Generate all combinations of substitutions
        for combination in product(*subst_options):
            # Start with the original protein sequence
            prot_variation = list(AAs_list)
            # Apply each substitution in the combination
            for pos, subst in combination:
                prot_variation[int(pos)-1] = subst
            # Yield the new protein variation as a string
            yield ''.join(prot_variation)
            count += 1
            
            if count >= threshhold:
                break