In [1]:
import os
import sys
import  urllib.request
import pandas as pd
from tqdm import tqdm
import numpy as np
from pymol import *

In [2]:
# Residue-name lookup: three-letter residue name -> one-letter code.
# The first group holds the 20 standard amino acids.
aa_dict = dict(
    GLY='G', ALA='A', VAL='V', LEU='L', ILE='I', PRO='P', PHE='F', TYR='Y', TRP='W',
    SER='S', THR='T', CYS='C', MET='M', ASN='N', GLN='Q', ASP='D', GLU='E', LYS='K',
    ARG='R', HIS='H',
)
# Every other residue name known to this table is collapsed to the placeholder 'X'.
aa_dict.update(dict.fromkeys(('NAG', 'DG', 'DT', 'DC', 'DA', 'UNK', 'GLX'), 'X'))

In [3]:

def get_chain_ca_coordinates(pdb_path, chain_id):
    """
    Collect the C-alpha (CA) coordinates of every residue on one chain.

    The current PyMOL session is cleared, the structure at ``pdb_path`` is
    loaded, and solvent and HETATM records are removed before the CA atoms
    of ``chain_id`` are selected.

    Parameters
    ----------
    pdb_path : str
        Path of the structure file handed to ``cmd.load``.
    chain_id : str
        Chain identifier used in the PyMOL selection ``chain <chain_id>``.

    Returns
    -------
    dict
        Maps ``'<resi>-<one-letter code>'`` to ``[x, y, z]``. ``resi`` is the
        residue identifier string as reported by PyMOL. Residue names that
        are absent from ``aa_dict`` are encoded as ``'X'``, the same
        placeholder the table already uses for UNK/GLX, instead of raising
        ``KeyError``. If several CA atoms produce the same key, the last one
        returned by PyMOL wins.
    """
    # Start from an empty session so atoms of a previously loaded file
    # cannot leak into the selection below.
    cmd.delete('all')
    cmd.load(pdb_path)
    cmd.remove('solvent')
    cmd.remove('hetatm')

    pos_dict = dict()
    model = cmd.get_model(f"chain {chain_id} and name CA")  # CA atoms of the chain
    for atom in model.atom:
        resn = atom.resn      # residue name (three-letter code)
        resi = atom.resi      # residue identifier
        x, y, z = atom.coord  # CA coordinates

        # Unknown residue names fall back to 'X' rather than aborting the run.
        ID = '{}-{}'.format(resi, aa_dict.get(resn, 'X'))
        pos_dict[ID] = [x, y, z]

    return pos_dict



In [4]:
## Biounit chains: PDB id -> list of single-letter chain ids
# Baseline taken from the S4169 table: column 0 is used as the PDB id and
# column 1 as the chain string, with '_' separators stripped.
tempdata = pd.read_csv('../../SKEMPI2/S4169.csv').values.tolist()
biounit_chains_dict = {
    row[0]: list(row[1].replace('_', ''))
    for row in tempdata
}

# Hand-written entries; they replace whatever the table provided for these ids.
biounit_chains_dict.update({
    '2QJB': ['A', 'B', 'C', 'D'],
    '2QJA': ['A', 'B', 'C', 'D'],
    '2QJ9': ['A', 'B', 'C', 'D'],
    '1MQ8': ['A', 'B'],
    '1XXM': ['A', 'C'],
    '2PYE': ['D', 'E'],
    '3HG1': ['A', 'C', 'D', 'E'],
    '1QAB': ['A', 'B', 'C', 'D', 'E', 'F'],
    '4UYP': ['A', 'B', 'C', 'D'],
    '4UYQ': ['A', 'B'],
    '2VN5': ['A', 'B'],
    '3BDY': ['H', 'L', 'V'],
    '3BE1': ['H', 'L', 'A'],
    '2NY7': ['H', 'L', 'G'],
    '3IDX': ['H', 'L', 'G'],
    '2C5D': ['A', 'B', 'C', 'D'],
    '1OHZ': ['A', 'B'],
    '1Y4A': ['E', 'I'],
    '2NOJ': ['A', 'B'],
    '2ABZ': ['B', 'E'],
    '2CCL': ['A', 'B'],
    '3UIH': ['A', 'B'],
    '3SE4': ['A', 'B', 'C'],
    '4GU0': ['A', 'C', 'E'],
})




In [5]:
# Build the per-chain C-alpha coordinate table for the SM_ZEMu dataset and
# check that every listed mutation points at a residue present in its structure.
aa_coor_dict = dict()  # '<PDB>_<chain>' -> {'<resi>-<aa>': [x, y, z]}

dataname = 'SM_ZEMu'

no_error_count = 0  # rows whose mutations all match the loaded structure
data = pd.read_csv('./{}.csv'.format(dataname)).values.tolist()
for item in tqdm(data):
    # Seven columns are expected per row; the unpacking raises if the count differs.
    pdb_chains, num_mut, multiple_mutations, ddG, mmCSM_PPI, Zemu, DDMut_PPI = item

    pdb_name = pdb_chains[:4]
    biounit_chains = biounit_chains_dict[pdb_name]
    mutations = multiple_mutations.replace('_', ',')

    # Load coordinates once per chain; later rows of the same PDB reuse them.
    for chain in biounit_chains:
        subunit_ID = pdb_name + '_' + chain
        if subunit_ID not in aa_coor_dict:
            pos_data = get_chain_ca_coordinates('./PDBs/{}.pdb'.format(pdb_name), chain)
            aa_coor_dict[subunit_ID] = pos_data

    # Each mutation is parsed as <source aa><chain><position><target aa>.
    wrong_mutations = []
    for mutation in mutations.split(','):
        src_aa = mutation[0]
        mut_chain = mutation[1]
        pos = mutation[2:-1]

        src_aa_pos = '{}-{}'.format(pos, src_aa)
        mut_subunit = pdb_name + '_' + mut_chain

        # .get(): a chain outside the biounit list is reported as a mismatch
        # instead of aborting the whole run with a KeyError.
        if src_aa_pos.upper() not in aa_coor_dict.get(mut_subunit, {}):
            wrong_mutations.append((mut_chain, src_aa_pos))

    if wrong_mutations:
        # Report every failing mutation of the row, not only the last one.
        for mut_chain_wrong, src_aa_pos_wrong in wrong_mutations:
            print("Something is wrong for {}-{}-{}".format(pdb_name, mut_chain_wrong, src_aa_pos_wrong))
    else:
        no_error_count += 1


print(no_error_count)
# np.save pickles the dict into a 0-d object array; read it back with
# np.load(path, allow_pickle=True).item().
np.save('./{}-position.npy'.format(dataname), aa_coor_dict)
print('{} done!'.format(dataname))


  0%|          | 0/270 [00:00<?, ?it/s]

 26%|██▌       | 70/270 [00:01<00:02, 77.94it/s]

Something is wrong for 1DVF-D-106-R


100%|██████████| 270/270 [00:02<00:00, 99.54it/s] 

269
dataname done!





In [6]:
# Build the per-chain C-alpha coordinate table for the SM595 dataset and
# check that every listed mutation points at a residue present in its structure.
aa_coor_dict = dict()  # '<PDB>_<chain>' -> {'<resi>-<aa>': [x, y, z]}

dataname = 'SM595'

no_error_count = 0  # rows whose mutations all match the loaded structure
data = pd.read_csv('./{}.csv'.format(dataname)).values.tolist()
for item in tqdm(data):
    # Eight columns are expected per row; the unpacking raises if the count differs.
    pdb_chains, num_mut, multiple_mutations, ddG, mmCSM_PPI, discovery_studio, foldx, DDMut_PPI = item

    pdb_name = pdb_chains[:4]
    biounit_chains = biounit_chains_dict[pdb_name]
    mutations = multiple_mutations.replace('_', ',')

    # Load coordinates once per chain; later rows of the same PDB reuse them.
    for chain in biounit_chains:
        subunit_ID = pdb_name + '_' + chain
        if subunit_ID not in aa_coor_dict:
            pos_data = get_chain_ca_coordinates('./PDBs/{}.pdb'.format(pdb_name), chain)
            aa_coor_dict[subunit_ID] = pos_data

    # Each mutation is parsed as <source aa><chain><position><target aa>.
    wrong_mutations = []
    for mutation in mutations.split(','):
        src_aa = mutation[0]
        mut_chain = mutation[1]
        pos = mutation[2:-1]

        src_aa_pos = '{}-{}'.format(pos, src_aa)
        mut_subunit = pdb_name + '_' + mut_chain

        # .get(): a chain outside the biounit list is reported as a mismatch
        # instead of aborting the whole run with a KeyError.
        if src_aa_pos.upper() not in aa_coor_dict.get(mut_subunit, {}):
            wrong_mutations.append((mut_chain, src_aa_pos))

    if wrong_mutations:
        # Report every failing mutation of the row, not only the last one.
        for mut_chain_wrong, src_aa_pos_wrong in wrong_mutations:
            print("Something is wrong for {}-{}-{}".format(pdb_name, mut_chain_wrong, src_aa_pos_wrong))
    else:
        no_error_count += 1


print(no_error_count)
# np.save pickles the dict into a 0-d object array; read it back with
# np.load(path, allow_pickle=True).item().
np.save('./{}-position.npy'.format(dataname), aa_coor_dict)
print('{} done!'.format(dataname))


 85%|████████▍ | 503/595 [00:06<00:01, 66.93it/s] 

Something is wrong for 3BDY-L-98-T
Something is wrong for 3BDY-H-101-G
Something is wrong for 3BE1-L-98-T
Something is wrong for 3BE1-H-102-D
Something is wrong for 3BDY-L-98-T
Something is wrong for 3BDY-H-101-G
Something is wrong for 3BDY-H-60-R
Something is wrong for 3BE1-L-98-T
Something is wrong for 3BE1-H-102-D
Something is wrong for 3BE1-H-102-D
Something is wrong for 3BE1-L-33-I


100%|██████████| 595/595 [00:11<00:00, 49.87it/s]

584
dataname done!





In [7]:
# Build the per-chain C-alpha coordinate table for the SM1124 dataset and
# check that every listed mutation points at a residue present in its structure.
aa_coor_dict = dict()  # '<PDB>_<chain>' -> {'<resi>-<aa>': [x, y, z]}

dataname = 'SM1124'

no_error_count = 0  # rows whose mutations all match the loaded structure
data = pd.read_csv('./{}.csv'.format(dataname)).values.tolist()
for item in tqdm(data):
    # Six columns are expected per row; the unpacking raises if the count differs.
    pdb_chains, num_mut, multiple_mutations, ddG, mmCSM_PPI, DDMut_PPI = item

    pdb_name = pdb_chains[:4]
    biounit_chains = biounit_chains_dict[pdb_name]
    mutations = multiple_mutations.replace('_', ',')

    # Load coordinates once per chain; later rows of the same PDB reuse them.
    for chain in biounit_chains:
        subunit_ID = pdb_name + '_' + chain
        if subunit_ID not in aa_coor_dict:
            pos_data = get_chain_ca_coordinates('./PDBs/{}.pdb'.format(pdb_name), chain)
            aa_coor_dict[subunit_ID] = pos_data

    # Each mutation is parsed as <source aa><chain><position><target aa>.
    wrong_mutations = []
    for mutation in mutations.split(','):
        src_aa = mutation[0]
        mut_chain = mutation[1]
        pos = mutation[2:-1]

        src_aa_pos = '{}-{}'.format(pos, src_aa)
        mut_subunit = pdb_name + '_' + mut_chain

        # .get(): a chain outside the biounit list is reported as a mismatch
        # instead of aborting the whole run with a KeyError.
        if src_aa_pos.upper() not in aa_coor_dict.get(mut_subunit, {}):
            wrong_mutations.append((mut_chain, src_aa_pos))

    if wrong_mutations:
        # Report every failing mutation of the row, not only the last one.
        for mut_chain_wrong, src_aa_pos_wrong in wrong_mutations:
            print("Something is wrong for {}-{}-{}".format(pdb_name, mut_chain_wrong, src_aa_pos_wrong))
    else:
        no_error_count += 1


print(no_error_count)
# np.save pickles the dict into a 0-d object array; read it back with
# np.load(path, allow_pickle=True).item().
np.save('./{}-position.npy'.format(dataname), aa_coor_dict)
print('{} done!'.format(dataname))


  7%|▋         | 74/1124 [00:00<00:12, 84.08it/s] 

Something is wrong for 1DVF-D-106-R


100%|██████████| 1124/1124 [00:22<00:00, 49.62it/s]

1123
dataname done!



