In [1]:
import os
import numpy as np
import pandas as pd

import json

from params import *

from utils.common import load_tab, df2json, save_tab, read_json2dict
from loop.dssp import generate_pdp_chain_pairs
from loop.pdb_files import get_uniprot_info

import requests

# 0. Important information

> Here, we complete the Uniprot sequence length by accessing the Uniprot URL
>
> Unfortunately, some sequences have been deleted from the UniProt database (these come back with len_seq_unp=0). For those entries we look up the sequence length on the UniParc website and fill it in afterwards.

# Functions

In [2]:
def get_protein_length(uniprot_accession, timeout=30):
    """Return the sequence length of a UniProt entry, fetched as FASTA.

    Parameters
    ----------
    uniprot_accession : str
        UniProt accession, e.g. ``'P0AF28'``.
    timeout : float, optional
        Seconds to wait for the server; passed straight to ``requests.get``.
        Without it a stalled connection would block the notebook forever.
        When the limit is hit, ``requests`` raises its timeout exception.

    Returns
    -------
    int or None
        Number of residues in the FASTA body when the server answers with
        HTTP 200. This is ``0`` when the response carries no sequence lines
        (the notebook sees ``len_seq_unp=0`` for entries deleted from
        UniProt). ``None`` for any other status code.
    """
    # UniProt URL for retrieving the protein sequence in FASTA format
    url = f"https://www.uniprot.org/uniprot/{uniprot_accession}.fasta"

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # The first line is the FASTA header; everything after it is sequence.
    # splitlines() copes with both LF and CRLF line endings, so carriage
    # returns are never counted as residues.
    sequence_lines = response.text.splitlines()[1:]
    return sum(len(line.strip()) for line in sequence_lines)

# 1. extract PDB <-> Uniprot mapping info
> start & end of Uniprot index for PDB chain

In [3]:
# Nested mapping read from JSON: {pdb_id: {chain_id: [uniprot_accession, ...]}},
# e.g. {'1A04': {'A': ['P0AF28']}, ...} -- it is iterated as pdb -> chain -> accessions
# when the seq_ids are built.
dict_pdb_chain_unp = read_json2dict(path_pdb_chain_unp)
# Number of PDB entries in the mapping
len(dict_pdb_chain_unp)

2565

**Generate list of seq_id**

In [4]:
# Flatten the nested {pdb_id: {chain: [accessions]}} mapping into seq_ids of the
# form '<pdbid>_<chain>_<accession>' with PDB id and chain lower-cased:
# ['1a04_a_uniprotID', '1a0i_a_uniprotID', '1a0p_a_uniprotID', ...]
list_pidChainUnp = [
    f'{pdb_id.lower()}_{chain_id.lower()}_{accession}'
    for pdb_id, chain_to_accessions in dict_pdb_chain_unp.items()
    for chain_id, accessions in chain_to_accessions.items()
    for accession in accessions
]

In [5]:
# Build one record per seq_id: the first and last UniProt residue number found
# in the chain's JSON table, plus the full UniProt sequence length.
list_idx_mapping = []

# accession -> sequence length. The same accession can occur under several
# seq_ids; caching avoids downloading the same entry from UniProt repeatedly.
# Failed lookups (None) are not cached, so they are retried on the next hit.
unp_length_cache = {}

for seq_id in list_pidChainUnp:
    df_uniprot = pd.read_json(os.path.join(path_dssp_full_uniprot_chainid, seq_id + '.json'))

    # Convert the column once and take both ends from the same list
    unp_nums = list(df_uniprot['unp_num'])
    uniprot_start = unp_nums[0]
    uniprot_end = unp_nums[-1]

    # seq_id is '<pdbid>_<chain>_<accession>'; maxsplit=2 keeps the accession
    # intact even if it were to contain an underscore itself.
    pdbid, chainid, acc = seq_id.split('_', 2)

    len_seq_unp = unp_length_cache.get(acc)
    if len_seq_unp is None:
        len_seq_unp = get_protein_length(acc)
        if len_seq_unp is not None:
            unp_length_cache[acc] = len_seq_unp

    list_idx_mapping.append({
        'seq_id': seq_id,
        'pdb_id': pdbid,
        'chain_id': chainid,
        'acc': acc,
        'len_seq_unp': len_seq_unp,
        'uniprot_start': uniprot_start,
        'uniprot_end': uniprot_end,
    })

In [8]:
# Turn the collected records into a table and drop rows that are exact repeats
# of an earlier row (the first occurrence is kept).
df_pdbUniMap = pd.DataFrame(data=list_idx_mapping).drop_duplicates(keep='first')
df_pdbUniMap

Unnamed: 0,seq_id,pdb_id,chain_id,acc,len_seq_unp,uniprot_start,uniprot_end
0,1a04_a_P0AF28,1a04,a,P0AF28,216,2,216
2,1a0i_a_P00969,1a0i,a,P00969,359,2,349
4,1a0p_a_P0A8P8,1a0p,a,P0A8P8,298,3,292
6,1a21_a_P24055,1a21,a,P24055,292,33,251
8,1a3q_a_Q00653,1a3q,a,Q00653,900,37,327
...,...,...,...,...,...,...,...
6051,7pvc_a_P0ADE6,7pvc,a,P0ADE6,149,1,149
6053,7req_a_P11653,7req,a,P11653,728,2,728
6055,7req_b_P11652,7req,b,P11652,638,2,638
6057,8ruc_a_P00875,8ruc,a,P00875,475,1,475


In [9]:
# Number of distinct seq_ids in the mapping table
df_pdbUniMap['seq_id'].nunique(dropna=False)

2649

In [10]:
# Write the PDB chain <-> UniProt index mapping table to path_idxMap_pdb_chain_unp
save_tab(df_pdbUniMap, path_idxMap_pdb_chain_unp)

## checking PDB chain & Uniprot sequence mapping

In [24]:
# Read the mapping table back from path_idxMap_pdb_chain_unp, so the checks
# below run on the stored table instead of the in-memory records.
df_pdbUniMap = load_tab(path_idxMap_pdb_chain_unp)

**The following sequences have been deleted from Uniprot**

In [29]:
# Rows whose mapped chain ends beyond the reported UniProt sequence length.
# Vectorised column comparison instead of a Python-level iterrows() loop.
df_pdbUniMap[df_pdbUniMap['uniprot_end'] > df_pdbUniMap['len_seq_unp']]

Unnamed: 0,seq_id,pdb_id,chain_id,acc,len_seq_unp,uniprot_start,uniprot_end
2141,3n91_a_A7M003,3n91,a,A7M003,0,23,344
2151,3oqq_a_A7LT28,3oqq,a,A7LT28,0,23,456
2154,3p02_a_A7LR42,3p02,a,A7LR42,0,23,338
2155,3p1v_a_A7LS78,3p1v,a,A7LS78,0,20,425
2176,3qwn_h_A5ZK25,3qwn,h,A5ZK25,0,23,235
2194,3sot_e_A7LZP6,3sot,e,A7LZP6,0,22,339
2262,4aur_a_E3PN25,4aur,a,E3PN25,0,1,577
2285,4c91_a_A7M022,4c91,a,A7M022,0,1,856
2302,4dqa_a_A7M0D0,4dqa,a,A7M0D0,0,27,380
2447,4pqx_a_A5ZGW9,4pqx,a,A5ZGW9,0,27,242


## Manually check and update the sequence length

In [36]:
# Deleted sequence lengths, taken from https://www.uniprot.org/uniparc
# !!! All the PDB chains end at the end of the Uniprot sequences.
#
#  'A7M003', 344
#  'A7LT28', 456
#  'A7LR42', 338
#  'A7LS78', 425
#  'A5ZK25', 235
#  'A7LZP6', 339
#  'E3PN25', 577
#  'A7M022', 856
#  'A7M0D0', 380
#  'A5ZGW9', 242
#  'A5ZGP5', 418
#  'A5ZHK4', 312
#  'A7AJI6', 261
#  'R4YEY9', 477
#  'K0A9N9', 275
#
# Since every affected chain ends on the last UniProt residue, uniprot_end is
# the sequence length and is copied into len_seq_unp for those rows.
# The mask is computed once, as a vectorised column comparison.
needs_length_fix = df_pdbUniMap['uniprot_end'] > df_pdbUniMap['len_seq_unp']

# .to_numpy() makes the assignment positional, independent of index labels
df_pdbUniMap.loc[needs_length_fix, 'len_seq_unp'] = (
    df_pdbUniMap.loc[needs_length_fix, 'uniprot_end'].to_numpy()
)


In [38]:
# Check again: no row should still end beyond its UniProt sequence length,
# so this is expected to display an empty table.
df_pdbUniMap[df_pdbUniMap['uniprot_end'] > df_pdbUniMap['len_seq_unp']]

Unnamed: 0,seq_id,pdb_id,chain_id,acc,len_seq_unp,uniprot_start,uniprot_end


In [39]:
# Save the mapping information again, now with the corrected len_seq_unp values
# (same path as before, path_idxMap_pdb_chain_unp)
save_tab(df_pdbUniMap, path_idxMap_pdb_chain_unp)

# 2. Dependent Domain Linker

In [13]:
# Load the dependent-domain-linker table from path_dependent_domain_linker
df_DDL = load_tab(path_dependent_domain_linker)
# List the available columns for reference
df_DDL.columns

Index(['linkerID', 'start_loop', 'end_loop', 'seq_id', 'start_unp', 'end_unp',
       'seq_id_unp', 'seq', 'seq_unp', 'dssp_key_str', 'missing_loop',
       'miss_length', 'miss_percentage', 'unp_acc', 'length', 'domain1',
       'domain2', 'pdbid_loop', 'chainid_loop', 'hbonds', 'c_c_contacts',
       'num_hbonds', 'num_contacts', 'pdbid_domain', 'chainid_domain',
       'start_domain', 'end_domain', 'dist', 'missing_domain'],
      dtype='object')

In [14]:
# Show the linker rows that belong to UniProt accession P0ABH9
is_p0abh9 = df_DDL['unp_acc'].eq('P0ABH9')
df_DDL[is_p0abh9]

Unnamed: 0,linkerID,start_loop,end_loop,seq_id,start_unp,end_unp,seq_id_unp,seq,seq_unp,dssp_key_str,...,hbonds,c_c_contacts,num_hbonds,num_contacts,pdbid_domain,chainid_domain,start_domain,end_domain,dist,missing_domain


In [15]:
# Domain PDB id next to the UniProt accession, for every linker row
df_DDL[['pdbid_domain', 'unp_acc']]

Unnamed: 0,pdbid_domain,unp_acc
0,1A65,Q9Y780
1,1A6C,Q88894
2,1A8D,P04958
3,1ABR,P11140
4,1AIS,P62001
...,...,...
644,6L7J,M1HE54
645,6OP5,A0A384E132
646,6SDR,Q72EJ1
647,7BUR,P24826
