Secondary structure metrics with BioPython/DSSP 

In [10]:
import os

from Bio import BiopythonWarning
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore', BiopythonWarning)

from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio.PDB.DSSP import ss_to_index

import functools
from collections import Counter
import pandas as pd
import numpy as np

from pathlib import Path
#from multiprocessing import Pool

In [11]:
#cpus = 80

In [22]:
dataset_name = 'vhh_twist'
# structure_path = '/data/localhost/gordon/TNP_Project/results/debugged_nbb2_results/new_'+dataset_name+'_output/Raw_Model_Outputs/'
# structures = [str(s).split('/')[-1] for s in list(Path(structure_path).glob('**/*.pdb'))]

structure_path = '/data/localhost/gordon/TNP_Project/RESULTS/greiff_results/reduced_strucs/reduced_'+dataset_name+'/'
structures = os.listdir(structure_path)
#files = [structure_path + structure for structure in structures]


In [23]:
def dssp_to_dict(structure, path = structure_path):

    try:
        #print(structure[:-4])
        p = PDBParser(QUIET=True)
        path_to_pdb = path + structure
        #print(path_to_pdb)
        model = p.get_structure("", path_to_pdb)[0] # GG modded here bc nested dirs
        #print(model)
        dssp = DSSP(model, path_to_pdb)
        #print(dssp)

        data = np.array(list(dict(dssp).values()))

        init_c = Counter({
            'H': 0,
            'B': 0,
            'E': 0,
            'G': 0,
            'I': 0,
            'T': 0,
            'S': 0,
            '-': 0
        })

        get_c = Counter(data[..., 2])

        ss_c = dict(functools.reduce(lambda a, b: a.update(b) or a, [init_c, get_c], Counter()))

        values = np.mean(data[..., 3:].astype(np.float32), axis=0)

        ss_d = {}
        keys = ['relative_ASA', 'phi', 'psi', 'NH_O_1_relidx', 
                'NH_O_1_energy', 'O_NH_1_relidx', 'O_NH_1_energy',
                'NH_O_2_relidx', 'NH_O_2_energy', 'O_NH_2_relidx', 
                'O_NH_2_energy']

        ss_d.update(ss_c)

        for n, key in enumerate(keys):
            ss_d.update({key: values[n]})
        
        return ss_d

    except Exception:
        print('Error:', structure[:-4])
        ss_d = {
            'H': 0,
            'B': 0,
            'E': 0,
            'G': 0,
            'I': 0,
            'T': 0,
            'S': 0,
            '-': 0,
            'relative_ASA': 0, 
            'phi': 0, 
            'psi': 0, 
            'NH_O_1_relidx': 0, 
            'NH_O_1_energy': 0,
            'O_NH_1_relidx': 0, 
            'O_NH_1_energy': 0,
            'NH_O_2_relidx': 0,
            'NH_O_2_energy': 0, 
            'O_NH_2_relidx': 0, 
            'O_NH_2_energy': 0
        }
        return ss_d

In [24]:
all_results = dict()
for structure in structures:
    results = dssp_to_dict(structure, path = structure_path)
    all_results[structure[:-4]] = results

In [25]:
len(all_results)

108

In [26]:
df = pd.DataFrame(all_results).reset_index()
#print(df.shape)

#df.insert(0, 'ID', [s[:-4] for s in structures])
# df.to_csv('./ABB_paired_dssp_2.csv', index=None)
#print(df.shape)

In [27]:
df

Unnamed: 0,index,seq_TBC-101-VA10040_VHH,seq_D8D85764-3574-KV-cFR2muts,seq_Ozoralizumab_VHH1,seq_D9D88792-5767-N,seq_Brivekimig2_VHH1,seq_TBC-107-11_VHH,seq_TBC-106-01_VHH,seq_D9D88792-8800-KV,seq_Rimteravimab_VHH1,...,seq_TBC-105-01_VHH,seq_D12D77826-11122-KV,seq_Enristomig_VHH2,seq_TBC-101-VA10006_VHH,seq_Envafolimab_VHH1,seq_TBC-103-VA10005_VHH,seq_D9D88792-13450-N-cFR2muts,seq_TBC-107-01_VHH,seq_D9D88792-13450-KV,seq_TBC-109-VA10008_VHH
0,H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,E,57.0,56.0,55.0,56.0,55.0,62.0,64.0,50.0,59.0,...,55.0,53.0,57.0,58.0,55.0,65.0,58.0,60.0,56.0,61.0
3,G,9.0,9.0,6.0,3.0,6.0,9.0,9.0,6.0,12.0,...,3.0,9.0,9.0,9.0,6.0,3.0,11.0,9.0,9.0,9.0
4,I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,T,16.0,19.0,20.0,17.0,17.0,15.0,10.0,18.0,15.0,...,22.0,16.0,13.0,18.0,20.0,22.0,17.0,15.0,16.0,16.0
6,S,11.0,9.0,7.0,8.0,10.0,8.0,9.0,10.0,8.0,...,6.0,7.0,9.0,13.0,10.0,4.0,4.0,11.0,7.0,7.0
7,-,32.0,37.0,27.0,35.0,27.0,30.0,26.0,39.0,31.0,...,26.0,29.0,33.0,35.0,37.0,31.0,28.0,29.0,29.0,33.0
8,relative_ASA,0.325164,0.337642,0.331212,0.337,0.356303,0.352723,0.34855,0.339736,0.323166,...,0.341781,0.331213,0.364823,0.323948,0.326611,0.321532,0.339954,0.344536,0.334653,0.327027
9,phi,-82.412018,-77.747696,-77.047821,-81.664703,-79.169556,-76.990311,-76.689812,-77.660477,-81.743195,...,-78.366364,-77.611397,-78.696701,-75.616547,-78.923431,-76.008789,-80.268646,-81.748405,-78.096611,-82.545235


In [28]:
df.to_csv('/data/localhost/gordon/TNP_Project/RESULTS/greiff_results/sec_struc/VHH_TWIST_dssp.csv', index=False)