Working notebook: build a function that lists the amino acids that are likely candidates for mutation.

In [13]:
# import packages
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt


Finished function: `functional_aa`

In [79]:
def functional_aa(input_path, pdb_file, output_path, df=False):
    """
    Collect the atoms of a protein that take part in salt bridges, hydrogen bonds
    or van der Waals clusters, together with the residue each atom belongs to.

    The PQR file for the protein is looked up in `output_path`; if it is missing,
    `pdb2pqr` is called to create it. Atom and residue information is then read
    from the `ATOM` records of that PQR file.

    Args:
        input_path (str): Path to the directory containing the protein structure file.
        pdb_file (str): Filename of the PDB file. The name must contain at least one
            '-' (e.g. 'AF-C0SP86-F1.pdb'); the part after the first '-' is used as
            the protein name and as the stem of the PQR file name.
        output_path (str): Path to the directory where the PQR file is stored.
        df (bool, optional): If True, returns a pandas DataFrame.
            Defaults to False (returns a NumPy array).

    Returns:
        np.ndarray | pd.DataFrame: One row per selected atom with the columns
            'Protein', 'Aminoacid', 'Aminoacid_number', 'Atom_number', 'Feature'.
            All values are strings; the two number columns are float-formatted
            (e.g. '12.0'). If no atom is selected, an empty (0, 5) array is used.

    Raises:
        IndexError: If `pdb_file` does not contain a '-'.
    """

    # Project-local helpers are imported here so the function is self-contained.
    from function import salt_bridge
    from function import H_bond_calc
    from function import VdW_interaction
    from function import pdb2pqr
    from helper_function import remove_nan
    import os
    import numpy as np
    import pandas as pd

    columns = ['Protein', 'Aminoacid', 'Aminoacid_number', 'Atom_number', 'Feature']

    # Protein name (key into the feature dictionaries) and PQR file name.
    # The PQR name is built by concatenation instead of a nested-quote f-string,
    # which is only valid syntax from Python 3.12 on.
    prot_name = pdb_file.split('-')[1]
    pqr_file = pdb_file.split('.')[0].split('-')[1] + '.pqr'
    pqr_path = os.path.join(output_path, pqr_file)

    # Create the PQR file only if it is not there yet.
    if os.path.isfile(pqr_path):
        print('Pqr file already exists')
    else:
        pdb2pqr(input_path, output_path, pdb_file)

    # Calculate atom features.
    Salt_bridge = salt_bridge(input_path, pdb_file)
    print('Salt_bridge finished')
    H_bond = H_bond_calc(output_path, pqr_file)
    print('H_bond finished')
    # The second return value (per-atom volume) is not needed here.
    VdW_clust, _ = VdW_interaction(input_path, pdb_file, by_atom=True)
    print('VdW_interaction finished')

    # Extract the values for this protein and drop atoms without a feature.
    Salt_bridge = remove_nan(Salt_bridge[prot_name])
    H_bond = remove_nan(H_bond[prot_name][:, :, 0])
    VdW_clust = VdW_clust[prot_name]

    # Atom numbers that are part of a feature.
    # Presumably the first row/column of each matrix holds the atom numbers
    # (acceptors along the first row, donors down the first column) - TODO confirm.
    atom_S = list(Salt_bridge[0, 1:])
    atom_HA = list(H_bond[0, 1:])
    atom_HD = list(H_bond[1:, 0])

    # Map atom number -> feature label(s).
    # NOTE(review): an atom with several features gets a nested list
    # ([[first, second], third]); kept as is so the 'Feature' text is unchanged.
    atom_dict = {}
    labelled_atoms = [(atom_S, "Salt_bridge"), (atom_HA, "Hbond_acc"), (atom_HD, "Hbond_don")]
    for atoms, identifier in labelled_atoms:
        for atom_number in atoms:
            if atom_number in atom_dict:
                atom_dict[atom_number] = [atom_dict[atom_number], identifier]
            else:
                atom_dict[atom_number] = identifier

    # Add van der Waals interaction information to the dictionary.
    for atom_number, cluster in VdW_clust.items():
        if atom_number in atom_dict:
            atom_dict[atom_number] = [atom_dict[atom_number], cluster]
        else:
            atom_dict[atom_number] = cluster

    # Collect one row per feature atom. Rows are gathered in a list and converted
    # once, instead of calling np.append (which copies the whole array) per atom.
    rows = []
    with open(pqr_path) as f:
        for line in f:
            if not line.startswith('ATOM'):
                continue
            # Put whitespace in front of every '-' so that negative numbers that
            # touch the previous column are split into separate fields.
            fields = line.replace('-', '  -').split()
            atom_number = int(fields[1])
            if atom_number not in atom_dict:
                continue
            rows.append([
                str(prot_name),
                str(fields[3]),
                str(float(fields[4])),
                str(float(fields[1])),
                str(atom_dict[atom_number]),
            ])

    Protein_array = np.array(rows) if rows else np.empty((0, 5))

    # Return the DataFrame if df is True, otherwise the raw array.
    if df:
        return pd.DataFrame(Protein_array, columns=columns)
    return Protein_array


In [None]:
# Example run of `functional_aa` on a single AlphaFold structure.
path_pdb = './data/pdbs'
path_pqr=  './data/pqrs'
pdb_file = 'AF-C0SP86-F1.pdb'
# NOTE(review): this PQR name (P39846) does not match the accession in `pdb_file`
# (C0SP86) - confirm which protein is intended.
pqr_file = 'P39846.pqr'
output_path = './data/test'
name = pqr_file.split('.')[0]


# The function defined above is called `functional_aa`; the previous call to
# `aa_feature_select` referred to a name that is not defined in this notebook.
prot_arr = functional_aa(path_pdb, pdb_file, output_path, df =False)

Function to mutate amino acids to increase correlation

In [121]:
def free_aa (path, file, prot_arr):
    """
    Identifies and collects free (non-functional) amino acids from a PQR file.

    The file is read line by line. Every residue number found in an `ATOM` record
    that is not listed in the residue-number column of `prot_arr`, and has not been
    collected yet, is reported once as a free amino acid.

    Residue numbers are compared numerically: `prot_arr` stores them as
    float-formatted strings (e.g. '12.0') while the file contains '12', so a plain
    string comparison would never match and no residue would be excluded.

    Args:
        path (str): Path to the directory containing the PQR file.
        file (str): Filename of the PQR file; the part before the first '.' is
            used as the protein name.
        prot_arr (np.ndarray): Array of functional atoms with the residue number in
            the third column (index 2).

    Returns:
        np.ndarray: One row per free amino acid with protein name, residue name and
            residue number (as written in the file), in order of first appearance.
            An empty (0, 3) array is returned if there is none.
    """

    def _residue_key(value):
        # Numeric key so that '12', '12.0' and 12.0 compare equal; tokens that are
        # not numbers fall back to their string form.
        try:
            return float(value)
        except (TypeError, ValueError):
            return str(value)

    functional_residues = {_residue_key(number) for number in prot_arr[:, 2]}
    prot_name = file.split('.')[0]

    seen = set()
    rows = []
    with open(os.path.join(path, str(file))) as f:
        for line in f:
            if not line.startswith('ATOM'):
                continue
            # Put whitespace in front of every '-' so that negative numbers that
            # touch the previous column are split into separate fields.
            fields = line.replace('-', '  -').split()
            # assumes the file has no chain-ID column, so field 3 is the residue
            # name and field 4 the residue number - TODO confirm
            aa_number = fields[4]
            key = _residue_key(aa_number)
            if key in functional_residues or key in seen:
                continue
            seen.add(key)
            rows.append([str(prot_name), fields[3], aa_number])

    # Build the array once instead of appending row by row.
    if not rows:
        return np.empty((0, 3))
    return np.array(rows)

Get the lists of features that correlate positively and negatively with the melting point (`meltPoint`)

In [6]:
import pandas as pd
import os
# Load the prokaryote feature table and discard the two leftover index columns
# that were written into the CSV.
pro_df: pd.DataFrame = pd.read_csv(
    os.path.join('./data', 'prokaryotes_323columns.csv')
).drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

  pro_df: pd.DataFrame = pd.read_csv(os.path.join('./data', 'prokaryotes_323columns.csv'))


In [16]:
# Show the NumPy version this notebook was run with.
import numpy
numpy.__version__


'1.26.4'

In [39]:
# Correlation of every numeric column with the melting point, then the subsets
# above / below the +-0.32 cut-off.
pro_corr = pro_df.corr(numeric_only=True).loc[:, 'meltPoint']
pos_corr = pro_corr.loc[pro_corr.gt(0.32)]
neg_corr = pro_corr.loc[pro_corr.lt(-0.32)]

In [40]:
# Feature names above / below the correlation cut-off.
# Both lists are derived from `pro_corr` directly: the previous version built
# `pos_corr` from the unfiltered index (every feature, not only the positively
# correlated ones) and failed on a second run because the Series names had
# already been rebound to lists.
pos_corr = list(pro_corr[pro_corr > 0.32].index)
#print(pos_corr[3:])
neg_corr = list(pro_corr[pro_corr < -0.32].index)
print(neg_corr)

['IQ', 'MQ', 'MT', 'FQ', 'WQ', 'NQ', 'QS', 'QT', 'QD', 'QH', 'QK', 'QC', 'ST', 'SH', 'SC', 'TD', 'TH', 'TC', 'Qhelix', 'Q', 'T', 'PolarAA']


Features that need to be mutated (negatively correlated with `meltPoint`)

In [14]:
# Negative correlations with meltPoint below the -0.32 cut-off.
print(pro_corr.loc[pro_corr.lt(-0.32)])

IQ        -0.429683
MQ        -0.507329
MT        -0.359458
FQ        -0.423630
WQ        -0.420057
NQ        -0.435562
QS        -0.502697
QT        -0.529350
QD        -0.466556
QH        -0.455041
QK        -0.418720
QC        -0.493738
ST        -0.396185
SH        -0.327684
SC        -0.321686
TD        -0.344690
TH        -0.346921
TC        -0.364763
Qhelix    -0.378950
Q         -0.469765
T         -0.321208
PolarAA   -0.406035
Name: meltPoint, dtype: float64
