In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from loop.dssp import generate_pdp_chain_pairs
from utils.common import load_tab, save_tab, dump_dicts2jsons, lower_pdbid, read_json2dict, dump_dict2json
from utils.protein import protein_letters_3to1

from utils.loops import print_loop_count

from domain_linker.cc_contact import contact, getCoords
from domain_linker.extract_linkers import get_2domain_start_end
from domain_linker.hbond import check_2Hbond
from params import *


# DLD regions

In [2]:
# 1. Load the five datasets. Each table is read with `load_tab` from a path
#    constant (the `path_*` names are presumably supplied by `params` — confirm).
# 1.1. domain
df_domain_dataset = load_tab(path_domain_dataset_tab)
# 1.2. intra-domain loop
df_intra_domain_loop = load_tab(path_loops_intra_domain_smoothSS_first_2step_tab)
# 1.3. dependent domain linker
df_dependent_domain_linker = load_tab(path_dependent_domain_linker)
# 1.4. independent domain linker
df_independent_domain_linker = load_tab(path_independent_domain_linker)
# 1.5. terminus
df_terminus = load_tab(path_terminus_tab)

# Display the (rows, columns) shape of every table, in load order.
tuple(
    frame.shape
    for frame in (
        df_domain_dataset,
        df_intra_domain_loop,
        df_dependent_domain_linker,
        df_independent_domain_linker,
        df_terminus,
    )
)

((6044, 18), (44896, 18), (647, 31), (1640, 31), (2991, 21))

In [5]:
# Independent-domain linker rows whose `miss_length` is strictly positive.
df_independent_domain_linker.loc[df_independent_domain_linker['miss_length'].gt(0)]

Unnamed: 0,linkerID,start_loop,end_loop,seq_id,start_unp,end_unp,seq_id_unp,seq,seq_unp,dssp_key_str,...,num_hbonds,num_contacts,pdbid_domain,chainid_domain,start_domain,end_domain,dist,missing_domain,pdb_chainid,seqID
0,0,142,157,"[142, 143, 144, 145, 146, 147, 148, 149, 150, ...",142,157,"[142, 143, 144, 145, 146, 147, 148, 149, 150, ...",L R A N R A T T E R D V N Q L T,L R A N R A T T E R D V N Q L T,"['A_142_ ', 'A_143_ ', 'A_144_ ', 'A_145_ ', '...",...,1,2,1A04,A,143,149,7,1,1A04_A,1a04_a_P0AF28
1,1,89,114,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...",89,114,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...",E K F R E D D P S A H L A S P K L P Q R L P K ...,E K F R E D D P S A H L A S P K L P Q R L P K ...,"['A_89_ ', 'A_90_ ', 'A_91_ ', 'A_92_ ', 'A_93...",...,0,0,1A0P,A,101,110,10,1,1A0P_A,1a0p_a_P0A8P8
17,24,245,260,"[245, 246, 247, 248, 249, 250, 251, 252, 253, ...",245,260,"[245, 246, 247, 248, 249, 250, 251, 252, 253, ...",L S R N E A L A A L L R D G E T,L S R N E A L A A L L R D G E T,"['A_245_ ', 'A_246_ ', 'A_247_ ', 'A_248_ ', '...",...,0,1,1AOA,A,252,259,8,1,1AOA_A,1aoa_a_P13797
20,29,74,109,"[74, 75, 76, 87, 88, 89, 90, 91, 92, 93, 94, 9...",197,222,"[197, 198, 199, 200, 201, 202, 203, 204, 205, ...",A E Q V G A L Y N E K V G A N E R K R K R R T ...,A E Q V G A L Y N E K V G A N E R K R K R R T ...,"['A_74_ ', 'A_75_ ', 'A_76_ ', 'A_87_ ', 'A_88...",...,0,0,1AU7,A,77,107,31,0,1AU7_A,1au7_a_P10037
27,37,63,75,"[63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 7...",63,75,"[63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 7...",D E K H E T E Y D G Q L D,D E K H E T E Y D G Q L D,"['A_63_ ', 'A_64_ ', 'A_65_ ', 'A_66_ ', 'A_67...",...,0,0,1B0N,A,69,73,5,1,1B0N_A,1b0n_a_P06533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,2182,865,877,"[865, 866, 867, 868, 869, 870, 871, 872, 873, ...",865,877,"[865, 866, 867, 868, 869, 870, 871, 872, 873, ...",D Q R K T Y K K N P S L I,D Q R K T Y K K N P S L I,"['A_865_ ', 'A_866_ ', 'A_867_ ', 'A_868_ ', '...",...,0,1,5JC7,A,868,876,9,1,5JC7_A,5jc7_a_D9N195
1601,2246,376,488,"[376, 377, 378, 379, 380, 381, 382, 383, 384, ...",376,488,"[376, 377, 378, 379, 380, 381, 382, 383, 384, ...",N Q R F I Y G N Q D L F A T S Q N K E F D P L ...,N Q R F I Y G N Q D L F A T S Q N K E F D P L ...,"['A_376_ ', 'A_377_ ', 'A_378_ ', 'A_379_ ', '...",...,0,0,5XMC,A,382,483,102,1,5XMC_A,5xmc_a_Q8C863
1606,2251,266,291,"[266, 267, 268, 269, 270, 271, 272, 273, 274, ...",266,291,"[266, 267, 268, 269, 270, 271, 272, 273, 274, ...",H P W L Q G V D P S P A T K Y N I P L V S Y K ...,H P W L Q G V D P S P A T K Y N I P L V S Y K ...,"['A_266_ ', 'A_267_ ', 'A_268_ ', 'A_269_ ', '...",...,0,1,5YKS,A,274,289,16,0,5YKS_A,5yks_a_Q9NRH2
1620,2269,148,160,"[148, 149, 150, 151, 152, 153, 154, 155, 156, ...",148,160,"[148, 149, 150, 151, 152, 153, 154, 155, 156, ...",G N N L G S T D G Y L A T,G N N L G S T D G Y L A T,"['A_148_ ', 'A_149_ ', 'A_150_ ', 'A_151_ ', '...",...,0,0,6I97,A,150,154,5,1,6I97_A,6i97_a_Q9I116


In [13]:
# Number of distinct `pdb_chainid` values in the terminus, dependent-linker and
# independent-linker tables (dropna=False keeps the count equal to len(unique())).
tuple(
    frame['pdb_chainid'].nunique(dropna=False)
    for frame in (df_terminus, df_dependent_domain_linker, df_independent_domain_linker)
)

(1751, 618, 1405)

In [3]:
# Independent-domain linker rows whose `length` is at least 10.
df_independent_domain_linker.loc[lambda linkers: linkers['length'] >= 10]

Unnamed: 0,linkerID,start_loop,end_loop,seq_id,start_unp,end_unp,seq_id_unp,seq,seq_unp,dssp_key_str,...,num_hbonds,num_contacts,pdbid_domain,chainid_domain,start_domain,end_domain,dist,missing_domain,pdb_chainid,seqID
0,0,142,157,"[142, 143, 144, 145, 146, 147, 148, 149, 150, ...",142,157,"[142, 143, 144, 145, 146, 147, 148, 149, 150, ...",L R A N R A T T E R D V N Q L T,L R A N R A T T E R D V N Q L T,"['A_142_ ', 'A_143_ ', 'A_144_ ', 'A_145_ ', '...",...,1,2,1A04,A,143,149,7,1,1A04_A,1a04_a_P0AF28
1,1,89,114,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...",89,114,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...",E K F R E D D P S A H L A S P K L P Q R L P K ...,E K F R E D D P S A H L A S P K L P Q R L P K ...,"['A_89_ ', 'A_90_ ', 'A_91_ ', 'A_92_ ', 'A_93...",...,0,0,1A0P,A,101,110,10,1,1A0P_A,1a0p_a_P0A8P8
3,3,220,229,"[220, 221, 222, 223, 224, 225, 226, 227, 228, ...",220,229,"[220, 221, 222, 223, 224, 225, 226, 227, 228, ...",S K S P G A S N L K,S K S P G A S N L K,"['A_220_ ', 'A_221_ ', 'A_222_ ', 'A_223_ ', '...",...,0,0,1A3Q,A,-1,-1,0,0,1A3Q_A,1a3q_a_Q00653
5,5,50,63,"[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 6...",126,139,"[126, 127, 128, 129, 130, 131, 132, 133, 134, ...",H N H L N P E V K K S S W T,H N H L N P E V K K S S W T,"['A_50_ ', 'A_51_ ', 'A_52_ ', 'A_53_ ', 'A_54...",...,0,0,1A5J,A,-1,-1,0,0,1A5J_A,1a5j_a_Q03237
6,7,285,326,"[285, 286, 287, 288, 289, 290, 291, 292, 293, ...",303,344,"[303, 304, 305, 306, 307, 308, 309, 310, 311, ...",A G A A N A D P T T S A N P N P A Q L N E A D ...,A G A A N A D P T T S A N P N P A Q L N E A D ...,"['A_285_ ', 'A_286_ ', 'A_287_ ', 'A_288_ ', '...",...,1,1,1A65,A,-1,-1,0,0,1A65_A,1a65_a_Q9Y780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1633,2287,519,551,"[519, 520, 521, 522, 523, 524, 525, 526, 527, ...",546,578,"[546, 547, 548, 549, 550, 551, 552, 553, 554, ...",H P P K D S S G Q R V D V S P T S Q R L Q L L ...,H P P K D S S G Q R V D V S P T S Q R L Q L L ...,"['A_519_ ', 'A_520_ ', 'A_521_ ', 'A_522_ ', '...",...,1,1,7ACN,A,-1,-1,0,0,7ACN_A,7acn_a_P16276
1634,2289,128,149,"[128, 129, 130, 131, 132, 133, 134, 135, 136, ...",128,149,"[128, 129, 130, 131, 132, 133, 134, 135, 136, ...",R D K V G P V G A S G L T V G T A A D G N A,R D K V G P V G A S G L T V G T A A D G N A,"['A_128_ ', 'A_129_ ', 'A_130_ ', 'A_131_ ', '...",...,0,0,7OA5,A,134,148,15,1,7OA5_A,7oa5_a_P40832
1636,2291,557,568,"[557, 558, 559, 560, 561, 562, 563, 564, 565, ...",557,568,"[557, 558, 559, 560, 561, 562, 563, 564, 565, ...",G R Y T A Q I R T I S G,G R Y T A Q I R T I S G,"['A_557_ ', 'A_558_ ', 'A_559_ ', 'A_560_ ', '...",...,0,0,7REQ,A,-1,-1,0,0,7REQ_A,7req_a_P11653
1638,2294,75,89,"[75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 8...",102,116,"[102, 103, 104, 105, 106, 107, 108, 109, 110, ...",G A G C Q G G P C R A D I K C,G A G C Q G G P C R A D I K C,"['A_75_ ', 'A_76_ ', 'A_77_ ', 'A_78_ ', 'A_79...",...,0,0,9WGA,A,-1,-1,0,0,9WGA_A,9wga_a_P02876


# two meta files

In [7]:
# Chain identifiers (lower-case) singled out for inspection below;
# presumably chains with unusually short domains — confirm.
list_short_domain = ['1e3h_a', '3ttc_a', '4g9i_a']

# Meta file 1: mapping read from JSON into a dict.
dict_pdb_chain_unp = read_json2dict(path_pdb_chain_unp)

# Meta file 2: multi-domain table. Add a lower-cased copy of `seq_id` as
# `pdb_chainid` so it can be matched against `list_short_domain`.
multi_domain = load_tab(path_tab_scop_FA_continuous_uni_multi_parse_domian)
multi_domain['pdb_chainid'] = [chain_id.lower() for chain_id in multi_domain['seq_id']]

In [9]:
# Count the multi-domain rows whose `pdb_chainid` is listed in `list_short_domain`
# by summing the boolean membership mask.
int(multi_domain['pdb_chainid'].isin(list_short_domain).sum())

15

In [6]:
# Number of top-level entries in `dict_pdb_chain_unp`.
len(dict_pdb_chain_unp)

2565

In [10]:
# Total number of domain linkers (independent + dependent). Computed from the
# loaded tables instead of hand-copied row counts (1640 and 647) so the sum
# stays correct when the underlying datasets change.
len(df_independent_domain_linker) + len(df_dependent_domain_linker)

2287