In [179]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import Bio
from localcider.sequenceParameters import SequenceParameters
import scipy
from scipy.spatial.distance import cdist

In [180]:
# Directory that holds the input tables used by this notebook.
path: str = './data/'

# Locations of the two CSV tables inside that directory.
prokaryotes_csv: str = os.path.join(path, "prokaryotes_unique_prot.csv")
prokaryotes_all_csv: str = os.path.join(path, "prokaryotes_all.csv")

# Every column is read as text (dtype=str); nothing is converted to numbers here.
prokaryotes: pd.DataFrame = pd.read_csv(prokaryotes_csv, dtype=str)
prokaryotes_all: pd.DataFrame = pd.read_csv(prokaryotes_all_csv, dtype=str)

In [None]:
# Residue name -> atom-name prefixes of the side-chain atoms that are treated
# as the two ends of a potential salt bridge.
_ACIDIC_ATOM_PREFIXES = {'ASP': ('OD',), 'GLU': ('OE',)}
_BASIC_ATOM_PREFIXES = {'LYS': ('NZ',), 'ARG': ('NH',), 'HIS': ('NE', 'ND')}


def _read_charged_atoms(lines):
    """Return two (n, 4) float arrays of [serial, x, y, z]: acidic atoms, basic atoms."""
    acidic, basic = [], []
    for line in lines:
        if not line.startswith('ATOM'):
            continue
        # PDB ATOM records are fixed-width: atom name in columns 13-16 and
        # residue name in columns 18-20 (1-based).
        atom_name = line[12:16].strip()
        residue = line[17:20].strip()
        for prefix_table, rows in ((_ACIDIC_ATOM_PREFIXES, acidic),
                                   (_BASIC_ATOM_PREFIXES, basic)):
            prefixes = prefix_table.get(residue)
            if prefixes and atom_name.startswith(prefixes):
                # Serial: columns 7-11; x, y, z: columns 31-38, 39-46, 47-54.
                # Reading the full 8-character coordinate fields keeps the sign
                # of values such as -100.000.
                rows.append([float(line[6:11]), float(line[30:38]),
                             float(line[38:46]), float(line[46:54])])
    # reshape keeps the (0, 4) shape when no matching atom was found.
    return (np.array(acidic, dtype='float64').reshape(-1, 4),
            np.array(basic, dtype='float64').reshape(-1, 4))


def _contact_table(acidic, basic, cutoff):
    """Build the labelled distance table, keeping only rows/columns with a contact."""
    if acidic.shape[0] == 0 or basic.shape[0] == 0:
        # No possible pair: only the NaN corner cell remains.
        return np.full((1, 1), np.nan)

    distance = cdist(acidic[:, 1:], basic[:, 1:], metric='euclidean')
    distance[distance > cutoff] = np.nan
    no_contact = np.isnan(distance)
    # Both masks are computed before any filtering.
    keep_rows = ~no_contact.all(axis=1)
    keep_cols = ~no_contact.all(axis=0)
    distance = distance[keep_rows][:, keep_cols]

    table = np.full((distance.shape[0] + 1, distance.shape[1] + 1), np.nan)
    table[0, 1:] = basic[keep_cols, 0]   # header row: basic-atom serial numbers
    table[1:, 0] = acidic[keep_rows, 0]  # header column: acidic-atom serial numbers
    table[1:, 1:] = distance
    return table


def _structure_key(pdb_file):
    """Dictionary key for a file: the part between the first two '-' characters."""
    name = str(pdb_file)
    parts = name.split('-')
    if len(parts) > 1:
        return parts[1]
    # File names without a hyphen fall back to the name without its extension.
    return os.path.splitext(os.path.basename(name))[0]


def salt_bridge(path, pdb_files=None, cutoff=4.0):
    """Find candidate salt bridges in one or more PDB files.

    For every file, distances are computed between the side-chain oxygens of
    ASP (OD*) / GLU (OE*) and the side-chain nitrogens of LYS (NZ),
    ARG (NH*) and HIS (NE*, ND*). Pairs further apart than ``cutoff`` are
    discarded.

    Parameters
    ----------
    path : str
        Directory containing the PDB files.
    pdb_files : None, str or iterable of str, optional
        File name(s) inside ``path``. ``None`` processes every ``*.pdb`` file
        found in ``path``; a single string is treated as one file name.
    cutoff : float, optional
        Maximum distance for a pair to be kept, in the units of the PDB
        coordinates. Defaults to 4.

    Returns
    -------
    dict
        Maps each file's key (the text between the first two ``-`` in the file
        name, e.g. ``'C0H3V8'`` for ``'AF-C0H3V8-F1.pdb'``) to a 2-D float
        array. Row 0 holds the serial numbers of the basic atoms, column 0 the
        serial numbers of the acidic atoms, cell ``[0, 0]`` is NaN, and the
        remaining cells hold the distance or NaN when above ``cutoff``.
        Rows/columns without any contact are removed, so a structure with no
        contact yields ``[[nan]]``.

    Raises
    ------
    ValueError
        If a selected ATOM record has non-numeric serial or coordinate fields.
    """
    if pdb_files is None:
        # Sorted so the result order does not depend on directory listing order.
        pdb_files = sorted(f for f in os.listdir(path) if f.endswith('.pdb'))
    elif isinstance(pdb_files, str):
        pdb_files = [pdb_files]

    # Created once, outside the loop, so every file's result is kept.
    Salt = dict()
    for pdb_file in pdb_files:
        with open(os.path.join(path, str(pdb_file))) as f:
            Asp_Glu_array, Lys_Arg_His_array = _read_charged_atoms(f)
        Salt[_structure_key(pdb_file)] = _contact_table(
            Asp_Glu_array, Lys_Arg_His_array, cutoff)

    return Salt


In [181]:
# Directory with the PDB structure files.
path_pdb = './data/pdbs/'

# Candidate structure files; only the third one is analysed below.
file = [
    'AF-C0H3Q1-F1.pdb',
    'AF-C0H3V2-F1.pdb',
    'AF-C0H3V8-F1.pdb',
    'AF-C0H3Y1-F1.pdb',
]

# Run the salt-bridge search on a single structure and show the result.
test = salt_bridge(path_pdb, pdb_files='AF-C0H3V8-F1.pdb')
print(test)


{'C0H3V8': array([[         nan, 381.        , 442.        , 459.        ,
        468.        , 499.        ],
       [ 37.        ,          nan,          nan,          nan,
                 nan,   2.906544  ],
       [100.        ,          nan,          nan,   2.73139873,
                 nan,          nan],
       [101.        ,          nan,          nan,          nan,
          3.79779752,          nan],
       [401.        ,          nan,   2.57065809,          nan,
                 nan,          nan],
       [421.        ,   3.79115616,          nan,          nan,
                 nan,          nan],
       [527.        ,          nan,          nan,          nan,
                 nan,   2.87607702]])}


In [None]:
# Single-structure salt-bridge check. The parsing and distance code that used
# to be repeated in this cell lives in salt_bridge(); calling it keeps the two
# in sync and derives the dictionary key from the file name rather than from
# character positions in the printed form of the list.
path_pdb = './data/pdbs/'
pdb_file = ['AF-C0H3V8-F1.pdb']
Salt = salt_bridge(path_pdb, pdb_file)
print(Salt)

In [None]:
# Build the localcider sequence object, then call save_zscoresAndPlots with
# 100000 scrambles. random_seed=None means no fixed seed is passed, so
# repeated runs presumably give different scrambles -- TODO confirm.
SeqObj = SequenceParameters(
    'MADKDFGLNDIVEMKKPHPCGANSWKIIRMGMDIRIKCEGCSHSVMIPRREFERKLKKVLVKHEEPTS'
)
SeqObj.save_zscoresAndPlots(random_seed=None, num_scrambles=100000)


In [None]:
# Second sequence object for the same amino-acid sequence; the value of
# get_kappa() is the cell's displayed result.
SecObj2 = SequenceParameters(
    'MADKDFGLNDIVEMKKPHPCGANSWKIIRMGMDIRIKCEGCSHSVMIPRREFERKLKKVLVKHEEPTS'
)
SecObj2.get_kappa()