Working notebook for developing functions that build a list of amino acids likely to be used for mutation.

In [1]:
import re
# Sample ATOM record in which the chain letter ("A") runs directly into the
# residue number ("1000"), so plain whitespace splitting would fuse the two.
line = 'ATOM   7949  N   ASN A1000     -16.817  14.355  -9.711  1.00 97.93           N'

# Put a blank between every letter that is immediately followed by a digit.
letter_then_digit = re.compile(r'([A-Za-z])(\d)')
line = letter_then_digit.sub(r'\1 \2', line)

fields = line.split()
print(fields[5])  # sixth whitespace-separated field of the repaired record
print(line)


1000
ATOM   7949  N   ASN A 1000     -16.817  14.355  -9.711  1.00 97.93           N


In [125]:
import os
# Project-relative locations used by the cells further down.
pdb_file = 'AF-C0H3V2-F1.pdb'  # structure file used as the running example
path_pdb, path_pqr = './data/pdbs', './data/pqrs'
output_path = './data/test'

# Show the joined path; the separator depends on the operating system.
pdb_location = os.path.join(path_pdb, pdb_file)
print(pdb_location)


./data/pdbs\AF-C0H3V2-F1.pdb


In [10]:
def fasparse(faspath):
    """Extract helix and sheet segments from a secondary-structure table.

    The file is read with ``numpy.loadtxt`` as six whitespace-separated
    columns: an integer index, two single-character columns and three floats.
    Lines starting with ``#`` are skipped by ``loadtxt``'s default comment
    handling.  The second character column carries the state label: rows
    labelled ``H`` are collected as helix, rows labelled ``E`` as sheet.
    (Presumably this is the S4PRED ``.fas`` output -- verify against the caller.)

    Args:
        faspath: Path of the table to parse.

    Returns:
        list: ``[helix, sheet]``.  Each element is a list of 1-D integer
        arrays, one array per run of consecutive index values.  When a label
        never occurs, the corresponding list holds a single empty array
        (this is what ``numpy.split`` returns for an empty input).
    """
    import numpy as np

    def consecutive_runs(indices):
        # A new run starts right after every position where the step to the
        # next index is not exactly 1.
        breaks = np.where(np.diff(indices) != 1)[0] + 1
        return np.split(indices, breaks)

    # atleast_1d: loadtxt returns a 0-d array for a table with a single row.
    data = np.atleast_1d(np.loadtxt(
        faspath,
        dtype={'names': ('index', 'col1', 'col2', 'val1', 'val2', 'val3'),
               'formats': ('i4', 'S1', 'S1', 'f4', 'f4', 'f4')},
    ))

    labels = data['col2']
    helix = consecutive_runs(data['index'][labels == b'H'])
    sheet = consecutive_runs(data['index'][labels == b'E'])
    return [helix, sheet]

In [15]:
import os
def AA2s4pred(directory_S4pred, output_path, AA_seq, prot):
    """Run the S4PRED model for one sequence and parse the predicted segments.

    A FASTA file ``<prot>.fasta`` and a prediction file ``<prot>.fas`` are
    kept in ``output_path``; each is only (re)created when it does not exist
    yet.  The prediction is produced by running ``python3 run_model.py`` with
    ``directory_S4pred`` as the working directory of the child process and its
    standard output redirected into the ``.fas`` file.

    The working directory of the calling process is not changed, and the
    child receives an absolute FASTA path, so relative ``output_path`` values
    work regardless of where ``directory_S4pred`` is located.

    Args:
        directory_S4pred: Directory that contains ``run_model.py``.
        output_path: Existing directory for the ``.fasta`` / ``.fas`` files.
        AA_seq: Sequence text written as the single FASTA record.
        prot: Identifier used for the FASTA header and both file names.

    Returns:
        The value returned by ``fasparse`` for the ``.fas`` file.

    Raises:
        subprocess.CalledProcessError: If ``run_model.py`` exits with a
            non-zero status.  The partially written ``.fas`` file is removed
            first so that a later call does not mistake it for a finished run.
        OSError: If the interpreter or the output file cannot be opened.
    """
    import os
    import subprocess

    fastapath = os.path.join(output_path, f'{prot}.fasta')
    faspath = os.path.join(output_path, f'{prot}.fas')

    if os.path.isfile(fastapath):
        print(f'fasta file already exists {fastapath}')
    else:
        with open(fastapath, "w") as fasta_file:
            fasta_file.write(f">{prot}\n{AA_seq}\n")
        print(f'fasta file created {fastapath}')

    if os.path.isfile(faspath):
        print('fas file already exists')
    else:
        try:
            with open(faspath, "w") as fas_file:
                # Argument list instead of a shell string: no quoting issues
                # with unusual characters in the paths.
                subprocess.run(
                    ['python3', 'run_model.py', os.path.abspath(fastapath)],
                    cwd=directory_S4pred,
                    stdout=fas_file,
                    check=True,
                )
        except (OSError, subprocess.CalledProcessError):
            # Do not leave an empty/partial prediction behind.
            if os.path.isfile(faspath):
                os.remove(faspath)
            raise

    # Read the prediction file and return the parsed segments.
    sec_pred = fasparse(faspath)

    return sec_pred

In [94]:
# Scratch check: print 0..9, skipping negative values (range() yields none).
for i in range(10):
    if i < 0:
        continue
    print(i)

0
1
2
3
4
5
6
7
8
9


In [132]:
def prot_mut(pdb_path, pdb_file, pqr_output_path, deep_mut=None, iterations=1):
    """Iteratively mutate a protein sequence and report the change in deviation.

    Workflow, as far as it is visible in this function:

    1. The sequence is read with ``pdb2AA``; ``functional_aa`` and ``free_aa``
       determine which residues may be mutated.
    2. ``diff_weighted`` scores the wild type against the reference values
       below; it returns a deviation sum and a per-feature deviation object.
    3. Guided phase: up to ``iterations`` rounds of ``mutator2``.  The phase
       stops early when the deviation sum rises by at least ``cutoff`` or
       when the first entry of the deviation object keeps the same feature.
    4. Random phase: ``random_mutate`` perturbs the last accepted sequence,
       then up to ``iterations`` further ``mutator2`` rounds are run.
    5. Whichever phase ended with the lower deviation sum is reported.

    Args:
        pdb_path: Directory containing the PDB file.
        pdb_file: File name of the PDB file.
        pqr_output_path: Directory handed to ``functional_aa``.
        deep_mut: Accepted for compatibility; not used by this function.
        iterations: Maximum number of rounds per phase.  With a value below 1
            no mutation is attempted.

    Returns:
        tuple: ``(improvement, Dev_list)``.  ``improvement`` is the final
        deviation sum minus the wild-type deviation sum, so a negative value
        means the deviation went down.  ``Dev_list`` holds the deviation
        object of the wild type followed by one entry per evaluated round,
        including rounds that were rejected.
    """
    from helper_function import pdb2AA
    from helper_function import ArraySlice
    from function import functional_aa
    from function import free_aa
    from function_mut import diff_weighted
    from function_mut import mutator2

    # top 20 features that correlate with melt point (positive and negative), based on prokaryotes323 columns, cutoff at +/- 0.32
    pos_corr = {'YR': 0.492442, 
                'RP': 0.480594, 
                'RG': 0.442533, 
                'R': 0.432905, 
                'WR': 0.392523, 
                'YP': 0.390709, 
                'LR': 0.386596, 
                'FR': 0.376553, 
                'VR': 0.370241, 
                'ER': 0.369174, 
                'RC': 0.357052, 
                #'Rhelix': 0.349204, # percentage of R in all helices
                'MR': 0.343496, 
                'P': 0.340082, 
                'PG': 0.338434, 
                'EAmotif': 0.332202, # E and A in direct succession
                'LP': 0.330337, 
                'EP': 0.328569, 
                'RH': 0.326193, 
                'AR': 0.324355, 
                'ARmotif': 0.3241, # A and R in direct succession
                'NR': 0.323378}
    
    neg_corr = {'QT': -0.52935, 
                'MQ': -0.507329, 
                'QS': -0.502697, 
                'QC': -0.493738, 
                'Q': -0.469765, 
                'QD': -0.466556, 
                'QH': -0.455041, 
                'NQ': -0.435562, 
                'IQ': -0.429683, 
                'FQ': -0.42363, 
                'WQ': -0.420057, 
                'QK': -0.41872, 
                #'PolarAA': -0.406035, 
                'ST': -0.396185, 
                #'Qhelix': -0.37895, 
                'TC': -0.364763, 
                'MT': -0.359458, 
                'TH': -0.346921, 
                'TD': -0.34469, 
                'SH': -0.327684, 
                'SC': -0.321686, 
                'T': -0.321208}
    sorted_freq_pos = ['R', 'P', 'L', 'Y', 'E', 'G', 'A', 'V', 'M', 'F', 'W', 'N', 'H', 'C']
    # Not used by this function at the moment; kept as reference data.
    sorted_freq_neg = ['K', 'N', 'W', 'F', 'I', 'D', 'M', 'C', 'H', 'S', 'T', 'Q']

    # Reference values per feature (same numbers as printed by the
    # "Ideal_pos_value" / "Ideal_neg_value" cell of this notebook).
    ideal_pos_value = {'AR': 0.19577028919092312, 'VR': 0.16570774850840858, 'LR': 0.22057443760531548, 'LP': 0.19913083779456836, 'MR': 0.10189569954318338, 'FR': 0.1214807452476434, 'WR': 0.09732041140609597, 'NR': 0.10350733491352844, 'YR': 0.11505796446768121, 'YP': 0.09361436465693411, 'ER': 0.1750096588731577, 'EP': 0.15356605906241058, 'RH': 0.10505828276892132, 'RC': 0.0895071066024632, 'RP': 0.14867680267495723, 'RG': 0.1743851008086472, 'PG': 0.15294150099790005, 'R': 0.08506020124285214, 'P': 0.06361660143210504}
    ideal_neg_value = {'IQ': 0.05490211183163266, 'MQ': 0.03973022298244349, 'MT': 0.054568274035922376, 'FQ': 0.05931526868690351, 'WQ': 0.03515493484535609, 'NQ': 0.041341858352788544, 'QS': 0.057844351285310555, 'QT': 0.06062750041770345, 'QD': 0.06216042713743332, 'QH': 0.04289280620818142, 'QK': 0.06281534035966867, 'QC': 0.027341630041723304, 'ST': 0.07268240233878943, 'SH': 0.054947708129267414, 'SC': 0.03939653196280929, 'TD': 0.0769984781909122, 'TH': 0.0577308572616603, 'TC': 0.04217968109520218, 'Q': 0.022894724682112268, 'T': 0.03773277573559115}

    #https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003674
    conserv_subst = {
        'A': ['D', 'E', 'G', 'S', 'T'],
        'C': ['G', 'R', 'S', 'W', 'Y'],
        'D': ['A', 'E', 'G', 'H', 'N', 'V', 'Y'],
        'E': ['A', 'D', 'G', 'K', 'Q', 'V'],
        'F': ['I', 'L', 'Y'],
        'G': ['A', 'C', 'D', 'E', 'R'],
        'H': ['D', 'L', 'N', 'P', 'Q', 'R', 'Y'],
        'I': ['F', 'L', 'M', 'N', 'V'],
        'K': ['E', 'M', 'N', 'Q', 'R', 'T'],
        'L': ['F', 'H', 'I', 'M', 'P', 'Q', 'R', 'V', 'W'],
        'M': ['I', 'K', 'L', 'R', 'T', 'V'],
        'N': ['D', 'H', 'I', 'K', 'S', 'T', 'Y'],
        'P': ['H', 'L', 'Q', 'R', 'S'],
        'Q': ['E', 'H', 'K', 'L', 'P', 'R'],
        'R': ['C', 'G', 'H', 'K', 'L', 'M', 'P', 'Q', 'T', 'W'],
        'S': ['A', 'C', 'N', 'P', 'T', 'W', 'Y'],
        'T': ['A', 'K', 'M', 'N', 'R', 'S'],
        'V': ['D', 'E', 'I', 'L', 'M'],
        'W': ['C', 'L', 'R', 'S'],
        'Y': ['C', 'D', 'F', 'H', 'N'],
        }

    # Not used by this function at the moment; kept as reference data.
    non_conservative_substitutions = {
        'A': ['P', 'V'],
        'C': ['F'],
        'F': ['C', 'S', 'V'],
        'G': ['S', 'V', 'W'],
        'I': ['K', 'R', 'S', 'T'],
        'K': ['I'],
        'L': ['S'],
        'P': ['A', 'T'],
        'Q': ['E', 'K'],
        'R': ['I', 'S'],
        'S': ['F', 'G', 'I', 'L', 'R'],
        'T': ['I', 'P'],
        'V': ['A', 'F', 'G'],
        'W': ['G'],
        }

    # Extract the sequence and the residues that may be mutated.
    aa_list = pdb2AA(pdb_path, pdb_file)
    aa_locked = functional_aa(pdb_path, pdb_file, pqr_output_path)
    aa_free = free_aa(pdb_path, pdb_file, aa_locked)

    # Deviation of the wild type from the reference values.
    WT_dev_sum, WT_dev = diff_weighted(pos_corr, neg_corr, aa_list, ideal_pos_value, ideal_neg_value)

    # State carried from one round to the next.
    cutoff = 0.01                  # tolerated increase of the deviation sum per round
    random_mutation_rate = 0.01    # per-candidate probability passed to random_mutate
    prev_Mut_list = aa_list
    prev_Mut_dev_sum = WT_dev_sum
    prev_Mut_dev = WT_dev
    aa_available = aa_free

    Dev_list = [WT_dev]

    # Without at least one round neither phase can run; report the untouched
    # wild type instead of failing on names that the loops would have defined.
    if iterations < 1:
        output_message = 'no iterations requested: sequence left unchanged'
        print( f'########## \n {output_message} \n ##########')
        return prev_Mut_dev_sum - WT_dev_sum, Dev_list

    # ---- guided phase -------------------------------------------------
    for i in range(iterations):
        Mut_list, possible_mutations = mutator2(prev_Mut_list, aa_available, prev_Mut_dev ,pos_corr, sorted_freq_pos, neg_corr, conserv_subst, ideal_pos_value, ideal_neg_value) # calculates list of possible mutations
        Mut_dev_sum, Mut_dev = diff_weighted(pos_corr, neg_corr, Mut_list, ideal_pos_value, ideal_neg_value) # calculate deviation of mutated protein sequence
        Dev_list.append(Mut_dev)
        if Mut_dev_sum - cutoff >= prev_Mut_dev_sum:
            # Round rejected: the "prev_*" state still describes the last accepted sequence.
            output_message = f' After {i+1} iterations: Mutation increased deviation of protein sequence from ideal values \n{WT_dev_sum} \n {prev_Mut_dev_sum} : {i} iterations -> {Mut_dev_sum} : {i+1} iterations'
            break
        elif prev_Mut_dev[0][0] == Mut_dev[0][0]:
            output_message = f' After {i+1} no change in the most deviating feature'
            break

        # Accept the round and update the state for the next one.
        prev_Mut_list = Mut_list
        prev_Mut_dev = Mut_dev
        prev_Mut_dev_sum = Mut_dev_sum
        aa_available = ArraySlice(aa_available, possible_mutations) # updates available amino acids, so that each amino acid can only be mutated once
        if i == (iterations-1):
            output_message = f'maximum amount of iterations reached: {i+1} \n deviation from {WT_dev_sum},  to {Mut_dev_sum}'

    # ---- random phase -------------------------------------------------
    # Start from the last accepted sequence with random substitutions applied.
    # NOTE(review): the list of random mutations is discarded, so randomly
    # mutated positions stay in R_aa_available -- confirm this is intended.
    R_prev_Mut_list, _ = random_mutate(prev_Mut_list, aa_available, conserv_subst, random_mutation_rate)

    R_aa_available = aa_available
    # NOTE(review): the first round below is steered by the deviation of the
    # sequence *before* the random substitutions -- confirm this is intended.
    R_prev_Mut_dev = prev_Mut_dev
    R_prev_Mut_dev_sum = prev_Mut_dev_sum

    for j in range(iterations):
        R_Mut_list, R_possible_mutations = mutator2(R_prev_Mut_list, R_aa_available, R_prev_Mut_dev ,pos_corr, sorted_freq_pos, neg_corr, conserv_subst, ideal_pos_value, ideal_neg_value) # calculates list of possible mutations
        R_Mut_dev_sum, R_Mut_dev = diff_weighted(pos_corr, neg_corr, R_Mut_list, ideal_pos_value, ideal_neg_value) # calculate deviation of mutated protein sequence
        Dev_list.append(R_Mut_dev)

        # The first round is always accepted: its reference value belongs to
        # the sequence without random substitutions.  The "most deviating
        # feature unchanged" stop criterion is not applied in this phase.
        if j > 0 and R_Mut_dev_sum - cutoff >= R_prev_Mut_dev_sum:
            output_message = f'RAND After {j+1} iterations: Mutation increased deviation of protein sequence from ideal values \n{WT_dev_sum} \n {R_prev_Mut_dev_sum} : {j} iterations -> {R_Mut_dev_sum} : {j+1} iterations'
            break

        # Accept the round and update the state for the next one.
        R_prev_Mut_list = R_Mut_list
        R_prev_Mut_dev = R_Mut_dev
        R_prev_Mut_dev_sum = R_Mut_dev_sum
        R_aa_available = ArraySlice(R_aa_available, R_possible_mutations) # updates available amino acids, so that each amino acid can only be mutated once
        if j == (iterations-1):
            # Report the deviation of this phase (R_Mut_dev_sum), not the
            # last value of the guided phase.
            output_message = f'RAND maximum amount of iterations reached: {j+1} \n deviation from {WT_dev_sum},  to {R_Mut_dev_sum}'

    print( f'########## \n {output_message} \n ##########')

    if R_prev_Mut_dev_sum < prev_Mut_dev_sum:
        R_Improvement = R_prev_Mut_dev_sum - WT_dev_sum
        return R_Improvement, Dev_list
    else:
        print('no improvement with random mutations')
        Improvement = prev_Mut_dev_sum - WT_dev_sum
        return Improvement, Dev_list

In [131]:
def random_mutate(AA_list, free_AA, mutation_dict, mutation_rate):
    """Randomly substitute residues, restricted to the entries of ``mutation_dict``.

    Every row of ``free_AA`` is visited once.  If the residue of the row has
    an entry in ``mutation_dict``, it is replaced with probability
    ``mutation_rate`` by a uniformly chosen element of that entry.  Accepted
    mutations are applied one after another with ``mut_apply``, so the
    returned sequence contains all of them.

    Args:
        AA_list: Sequence handed to ``mut_apply``.
        free_AA: 2-D array-like with one row per mutable residue.  Column
            index 2 is looked up in ``mutation_dict`` (i.e. treated as the
            one-letter residue) and column index 1 is written into the
            mutation string as the position.
            NOTE(review): other code in this notebook reads the two columns
            the other way round -- verify against the output of ``free_aa``.
        mutation_dict: Maps a residue to the list of allowed replacements.
        mutation_rate: Probability (0..1) of mutating each candidate row.

    Returns:
        tuple: ``(aa_mut_list, mut_list)`` -- the sequence after all accepted
        mutations (``AA_list`` itself when nothing was mutated) and the list
        of mutation strings of the form ``'<residue> - <position> - <new>'``.
    """
    import random
    from function_mut import mut_apply

    mut_list = []
    aa_mut_list = AA_list

    # Iterate over rows rather than over a dict built from the columns: a
    # dict would silently drop rows that share the same key.
    for row in free_AA:
        position, aminoacid = row[1], row[2]
        if aminoacid not in mutation_dict:
            continue
        possible_mutation = mutation_dict[aminoacid]
        if random.random() < mutation_rate:
            mutation = random.choice(possible_mutation)
            # NOTE(review): this format has blanks around '-', unlike the
            # 'X-pos-Y' strings built elsewhere in this notebook -- confirm
            # that mut_apply accepts it.
            mut_aa = f'{aminoacid} - {position} - {mutation}'
            mut_list.append(mut_aa)
            # Apply on top of the mutations accepted so far.
            aa_mut_list = mut_apply(aa_mut_list, [mut_aa])
    return aa_mut_list, mut_list

In [133]:
# Structures to evaluate. Larger set used previously:
# ['AF-P0AGD1-F1.pdb', 'AF-O34633-F1.pdb', 'AF-Q72L06-F1.pdb', 'AF-Q746J6-F1.pdb', 'AF-R4YU54-F1.pdb', 'AF-C0H3V2-F1.pdb']
test_list = ['AF-C0H3V2-F1.pdb']

results = {}          # file name -> value returned by prot_mut
Improvement_sum = 0   # running total over all structures

for n in test_list:
    pdb_file = n
    imporvement, DEV_list = prot_mut(path_pdb, n, path_pqr, deep_mut=None, iterations=10)
    results[n] = imporvement
    Improvement_sum = Improvement_sum + imporvement

# Mean over the evaluated structures.
re_improvement = Improvement_sum / len(test_list)
    

Pqr file already exists
Salt_bridge finished
H_bond finished
VdW_interaction finished


  distance[:,0] = distance[:,0].astype('int')
  theta = np.arccos((d_DH[n,1,1]**2 + d_HA[n,1:]**2 - d_DA[1:,1:][n,:]**2)/(2*d_DH[n,1,1]*d_HA[1:,1:][n,:]))
  array[:,0] = array[:,0].astype('int')


########## 
 RAND maximum amount of iterations reached: 10 
 deviation from 2.406399223985011,  to 5.062299068488399 
 ##########
no improvement with random mutations


In [134]:
# Mean change, summed change and the per-file values, one per line.
for reported in (re_improvement, Improvement_sum, results):
    print(reported)

-0.2607254885162451
-0.2607254885162451
{'AF-C0H3V2-F1.pdb': -0.2607254885162451}


In [130]:
def mutator3(AA_list:list, free_AA, deviation,pos_corr:dict, sorted_freq_pos, neg_corr:dict, conserv_substitution, ideal_pos_value, ideal_neg_value):
    """
    Propose point mutations aimed at the most deviating feature and apply them.

    Only the first entry of ``deviation`` (``deviation[0][0]``, a feature
    name) selects the strategy:

    * a one-letter feature: single-residue handling,
    * a two-letter feature: residue-pair handling,
    * a name containing ``'motif'``: neighbour-based handling.

    Every candidate mutation string has the form ``'<old>-<position>-<new>'``
    and is kept only if ``mut_live_test`` returns a value above
    ``Diff_cutoff`` (0).

    Args:
        AA_list (list): The amino acid sequence as a list of one-letter codes
            (it is joined into a string for composition lookups).
        free_AA (np.ndarray): 2-D array of mutable residues.  The code uses
            column index 2 as the position key and column index 1 as the
            one-letter residue.
            NOTE(review): ``random_mutate`` in this notebook treats the two
            columns the other way round -- verify against ``free_aa``.
        deviation (list): Sequence whose items start with a feature name;
            presumably ordered by deviation, largest first -- TODO confirm.
        pos_corr (dict): Features that correlate positively; only the keys
            are used.
        sorted_freq_pos (list): Not used by this function.
        neg_corr (dict): Features that correlate negatively; only the keys
            are used.
        conserv_substitution (dict): Allowed substitutions per amino acid.
        ideal_pos_value (dict): Reference value per positive feature.
        ideal_neg_value (dict): Reference value per negative feature (only
            passed on to ``mut_live_test``).

    Returns:
        tuple: ``(AA_mut_list, mut_list)`` -- the sequence returned by the
        last ``mut_apply`` call (``AA_list`` itself if no mutation was
        accepted) and the list of accepted mutation strings.
    """
    
    import itertools
    from function import rel_aa_comp
    from function_mut import mut_apply
    from function_mut import mut_live_test
    
    pos_corr_list = list(pos_corr.keys())
    neg_corr_list = list(neg_corr.keys())
    AA_mut_list = AA_list
    AAs = ''.join(AA_list)
    free_AA_dict = {a: b for a, b in zip(free_AA[:,2], free_AA[:,1] )} # dictionary built from the array: key = column index 2 (used as position), value = column index 1 (used as amino acid)
    mut_list = []
    Diff_cutoff = 0
    
    # NOTE(review): throughout this function mut_apply receives the original
    # AA_list, not AA_mut_list.  Unless mut_apply modifies its argument in
    # place, AA_mut_list only reflects the most recently accepted mutation
    # while mut_list accumulates all of them -- confirm.
    for n in range(1):
        first_entry = deviation[n][0]
        # determine possible substitutions if the first entry is a single amino acid
        if len(first_entry) == 1:
            
            if first_entry in pos_corr_list: # the amount of this amino acid should be increased
                for key in free_AA_dict:
                    aminoacid = free_AA_dict[key]
                    AA_subst = conserv_substitution[aminoacid]
                    
                    
                    for k in pos_corr_list: # scan all positively correlated features
                        if len(k) == 1:     # only single-residue features are considered
                            if rel_aa_comp(AAs, k) < ideal_pos_value[k]:    # relative composition is below the reference value
                                if k in AA_subst:
                                    mut_aa = (aminoacid + '-' + key + '-' + k)
                                    Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                    if Diff > Diff_cutoff:
                                        mut_list.append(mut_aa)
                                        AA_mut_list = mut_apply(AA_list, [mut_aa]) # applies the mutation to the protein sequence
                                        break # leaves the feature loop only; the next position is still examined


            elif first_entry in neg_corr_list: # the amount of this amino acid should be decreased
                for key in free_AA_dict:
                    aminoacid = free_AA_dict[key]
                    if aminoacid == first_entry:
                        AA_subst = conserv_substitution[aminoacid]
                        
                        # NOTE(review): the loop below only proceeds when the
                        # residue equals the feature k, so the mutation string
                        # replaces a residue by itself; AA_subst and
                        # sorted_freq_pos are not consulted -- confirm intent.
                        for k in pos_corr_list:
                            if aminoacid == k:
                                if rel_aa_comp(AAs, k) < ideal_pos_value[k]:
                                    mut_aa = aminoacid + '-' + key + '-' + k
                                    Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                    if Diff > Diff_cutoff: # keep only beneficial mutations
                                        AA_mut_list = mut_apply(AA_list, [mut_aa]) # applies the mutation to the protein sequence
                                        mut_list.append(mut_aa)
                                        break
                                
                                
    
        # determine possible substitutions if the first entry is a pair of amino acids
        elif len(first_entry) == 2:
            
            if first_entry in pos_corr_list: # the amount of these amino acids should be increased
                for key in free_AA_dict:
                    aminoacid = free_AA_dict[key]
                    #if aminoacid not in sorted_freq_pos: #checks if the aminoacid overall positively contributes to one of the pos_corr features
                    AA_subst = conserv_substitution[aminoacid] # list of possible substitutions for the current amino acid
                    
                    # mutation
                    # NOTE(review): this inner loop reuses the name n of the outer loop.
                    for n in range(len(deviation)): # walks through all deviations; stops at the first positive feature that yields an accepted mutation
                        entry = deviation[n][0]
                        if entry in pos_corr_list:
                            subst = [] # intended: substitutions that add one of the amino acids of the top feature
                            for k in AA_subst:
                                if k in first_entry:
                                    # NOTE(review): rebinds subst to a string (last match wins),
                                    # whereas the parallel negative branch appends to the list
                                    # and tests membership in `entry` -- confirm which is intended.
                                    subst = k
                                    
                            if len(subst) == 1: # exactly one candidate character: use it
                                mut_aa = aminoacid + '-' + key + '-' + subst
                                Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                if Diff > Diff_cutoff:
                                    AA_mut_list = mut_apply(AA_list, [mut_aa]) # applies the mutation to the protein sequence
                                    mut_list.append(mut_aa)
                                    break
                                
                            elif len(subst) == 2: # two candidates: choose the one with the lowest relative composition (presumably not reached while subst is a single character -- TODO confirm)
                                comp = [(aa, rel_aa_comp(AAs, aa)) for aa in subst] # relative composition of each candidate
                                lowest_comp = min(comp, key=lambda pair: pair[1]) # candidate with the lowest relative composition
                                mut_aa = aminoacid + '-' + key + '-' + lowest_comp[0]
                                Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                if Diff > Diff_cutoff:
                                    AA_mut_list = mut_apply(AA_list, [mut_aa])
                                    mut_list.append(mut_aa) # append the mutation to the mutation list
                                    break

            
                        
            elif first_entry in neg_corr_list: # the amount of these amino acids should be decreased
                for key in free_AA_dict:
                    aminoacid = free_AA_dict[key]
                    if aminoacid in first_entry: # the residue is one of the two letters of the top feature
                        AA_subst = conserv_substitution[aminoacid] # list of possible substitutions for the current amino acid
                        
                        # mutation
                        for n in range(len(deviation)): # walks through all deviations; stops at the first positive feature that yields an accepted mutation
                            entry = deviation[n][0]
                            if entry in pos_corr_list: 
                                subst = [] # allowed substitutions that are part of the positive feature `entry`
                                for k in AA_subst:
                                    if k in entry:
                                        subst.append(k)
                                if len(subst) == 1: # exactly one candidate: use it
                                    mut_aa = aminoacid + '-' + key + '-' + subst[0]
                                    Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                    if Diff > Diff_cutoff:
                                        AA_mut_list = mut_apply(AA_list, [mut_aa])
                                        mut_list.append(mut_aa) 
                                        break
                                    
                                elif len(subst) == 2: # two candidates: choose the one with the lowest relative composition
                                    comp = [(aa, rel_aa_comp(AAs, aa)) for aa in subst] # relative composition of each candidate
                                    lowest_comp = min(comp, key=lambda pair: pair[1]) # candidate with the lowest relative composition
                                    mut_aa = aminoacid + '-' + key + '-' + lowest_comp[0]
                                    Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                    if Diff > Diff_cutoff:
                                        AA_mut_list = mut_apply(AA_list, [mut_aa])
                                        mut_list.append(mut_aa) # append the mutation to the mutation list
                                        break
        
        
        # motif features: the first two characters of the name are the two motif residues
        elif 'motif' in first_entry:
            if first_entry in pos_corr_list: # the number of motif occurrences should be increased
                # NOTE(review): key is used as an integer index here, while the
                # branches above concatenate it with strings -- confirm its type.
                # key-1 wraps around for key 0 and key+1 can run past the end
                # of the sequence.
                for key in free_AA_dict:
                        aa_back = AAs[key-1]    # amino acid before the current one
                        aa_current = AAs[key]   # current amino acid
                        aa_for = AAs[key+1]     # amino acid after the current one
                        
                        if (aa_current in first_entry[0]) and (aa_for not in first_entry[1]):    # current residue is the first motif letter, the next residue is not the second
                            # NOTE(review): substitutions are looked up for aa_back, but the
                            # mutation string addresses position key+1 -- confirm.
                            for_subst = conserv_substitution[aa_back]
                            for s in for_subst: # iterate over the possible substitutions
                                if s in first_entry[1]:
                                    mut_aa = f'{aa_back}-{key+1}-{s}'
                                    Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                    if Diff > Diff_cutoff:
                                        AA_mut_list = mut_apply(AA_list, [mut_aa])
                                        mut_list.append(mut_aa)
                                        break  
                                            
                        if (aa_current in first_entry[1]) and (aa_back not in first_entry[0]):   # current residue is the second motif letter, the previous residue is not the first
                            # NOTE(review): substitutions are looked up for aa_for, but the
                            # mutation string addresses position key-1 -- confirm.
                            back_subst = conserv_substitution[aa_for]
                            for b in back_subst: # iterate over the possible substitutions
                                if b in first_entry[0]:
                                    mut_aa = f'{aa_for}-{key-1}-{b}'
                                    Diff = mut_live_test(AA_mut_list, [mut_aa], pos_corr, neg_corr, ideal_pos_value, ideal_neg_value) # live test whether the mutation is beneficial
                                    if Diff > Diff_cutoff:
                                        AA_mut_list = mut_apply(AA_list, [mut_aa])
                                        mut_list.append(mut_aa)
                                        break
        
    return AA_mut_list, mut_list

Get the lists of features that correlate positively and negatively with the melting point (`meltPoint`).

In [4]:
import pandas as pd
import os
# Load the feature table and discard the two "Unnamed" columns it contains.
csv_path = os.path.join('./data', 'prokaryotes_323columns.csv')
pro_df: pd.DataFrame = pd.read_csv(csv_path).drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

  pro_df: pd.DataFrame = pd.read_csv(os.path.join('./data', 'prokaryotes_323columns.csv'))


In [52]:
# Reference ("ideal") feature values: the per-feature mean over the rows whose
# melting point is at or above the 90th percentile of the table.
melt_cutoff = pro_df['meltPoint'].quantile(0.9)
top_df = pro_df[pro_df['meltPoint'] >= melt_cutoff]
AR_mean = top_df['AR'].mean()

# Series.mean() is used instead of np.mean so that this cell does not depend
# on a NumPy import; no notebook-level cell shown here provides one.
# NOTE(review): pos_corr / neg_corr must offer .keys() (dict or Series); the
# cell that turns them into plain lists must not have run before this one.
Ideal_pos_value = {key: top_df[key].mean() for key in pos_corr.keys()}
Ideal_neg_value = {key: top_df[key].mean() for key in neg_corr.keys()}

print(Ideal_pos_value)
print(Ideal_neg_value)


{'AR': 0.19577028919092312, 'VR': 0.16570774850840858, 'LR': 0.22057443760531548, 'LP': 0.19913083779456836, 'MR': 0.10189569954318338, 'FR': 0.1214807452476434, 'WR': 0.09732041140609597, 'NR': 0.10350733491352844, 'YR': 0.11505796446768121, 'YP': 0.09361436465693411, 'ER': 0.1750096588731577, 'EP': 0.15356605906241058, 'RH': 0.10505828276892132, 'RC': 0.0895071066024632, 'RP': 0.14867680267495723, 'RG': 0.1743851008086472, 'PG': 0.15294150099790005, 'R': 0.08506020124285214, 'P': 0.06361660143210504}
{'IQ': 0.05490211183163266, 'MQ': 0.03973022298244349, 'MT': 0.054568274035922376, 'FQ': 0.05931526868690351, 'WQ': 0.03515493484535609, 'NQ': 0.041341858352788544, 'QS': 0.057844351285310555, 'QT': 0.06062750041770345, 'QD': 0.06216042713743332, 'QH': 0.04289280620818142, 'QK': 0.06281534035966867, 'QC': 0.027341630041723304, 'ST': 0.07268240233878943, 'SH': 0.054947708129267414, 'SC': 0.03939653196280929, 'TD': 0.0769984781909122, 'TH': 0.0577308572616603, 'TC': 0.04217968109520218, 'Q

In [20]:
# Correlation of every numeric column with the melting point.
CORR_CUTOFF = 0.32
pro_corr = pro_df.corr(numeric_only=True)['meltPoint']

# 'meltPoint' correlates perfectly with itself and would always pass the
# positive threshold, so it is excluded before the features are selected.
# pro_corr itself is left complete for the cells that print it.
feature_corr = pro_corr.drop('meltPoint')
pos_corr = feature_corr[feature_corr > CORR_CUTOFF]
neg_corr = feature_corr[feature_corr < -CORR_CUTOFF]

In [21]:
# Reduce both selections to their feature names.  The positive list is taken
# from pos_corr (the thresholded selection), not from pro_corr, which would
# return every feature in the table.
# Note: this rebinds both names from Series to list, so the cell cannot be
# run twice without re-running the selection cell first.
pos_corr = list(pos_corr.index)
neg_corr = list(neg_corr.index)
print(neg_corr)

['IQ', 'MQ', 'MT', 'FQ', 'WQ', 'NQ', 'QS', 'QT', 'QD', 'QH', 'QK', 'QC', 'ST', 'SH', 'SC', 'TD', 'TH', 'TC', 'Qhelix', 'Q', 'T', 'PolarAA']


Features that need to be mutated: those that correlate negatively with the melting point (correlation below -0.32).

In [24]:
# Show the features whose correlation with the melting point is below -0.32.
strongly_negative = pro_corr < -0.32
print(pro_corr[strongly_negative])

IQ        -0.429683
MQ        -0.507329
MT        -0.359458
FQ        -0.423630
WQ        -0.420057
NQ        -0.435562
QS        -0.502697
QT        -0.529350
QD        -0.466556
QH        -0.455041
QK        -0.418720
QC        -0.493738
ST        -0.396185
SH        -0.327684
SC        -0.321686
TD        -0.344690
TH        -0.346921
TC        -0.364763
Qhelix    -0.378950
Q         -0.469765
T         -0.321208
PolarAA   -0.406035
Name: meltPoint, dtype: float64
