# Disordered parts of protein



In [1]:
import gzip
import time
from multiprocessing import Pool
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import utils.gcs_utils as gcs
import utils.spark_utils as sprk
from utils.proteins import *

In [2]:
# List every object key stored under the human-proteome CIF prefix.
CIF_KEY_PREFIX = "UP000005640_9606_HUMAN/cif"
files = gcs.list_keys(CIF_KEY_PREFIX)

In [5]:
# Page through the key listing 1000 entries at a time; `i` selects the page (0-based).
i = 0
files[1000 * i:1000 * (i + 1)]

['UP000005640_9606_HUMAN/cif/AF-A0A024R1R8-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A024RBG1-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A024RCN7-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6H5-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6H7-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6H8-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6H9-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6I0-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6I1-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6I3-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6I4-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6I6-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6I7-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6I9-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6J1-F1-model_v1.cif.gz',
 'UP000005640_9606_HUMAN/cif/AF-A0A075B6

In [6]:
# Download one gzipped CIF object, parse it, and derive the file's base name
# (no directory part, no ".cif.gz" extension) for use as an identifier below.
key = 'UP000005640_9606_HUMAN/cif/AF-A0A024R1R8-F1-model_v1.cif.gz'
content = gcs.download_gzip_to_string(key)
parsed_cif = parse_cif(content)

# `str.strip(".cif.gz")` would remove any of the characters ". c i f g z" from
# BOTH ends of the key rather than removing the extension, silently truncating
# names whose stem starts or ends with one of those characters. Drop the
# directory part first, then remove the exact suffix only if it is present.
file_name = key.split("/")[-1]
if file_name.endswith(".cif.gz"):
    file_name = file_name[:-len(".cif.gz")]

In [7]:
# Display the derived base name to confirm the directory and extension were removed.
file_name

'AF-A0A024R1R8-F1-model_v1'

In [8]:
# Build the sequence table for this protein and tag it with the source file name,
# the protein id derived from that name, and the global confidence read from the CIF.
sequence = get_protein_sequence_from_cif(parsed_cif).assign(
    protein_filename=file_name,
    protein_id=get_protein_id_from_filename(file_name),
    confidence_pLDDT=get_global_confidence_from_cif(parsed_cif),
)

sequence.head()

Unnamed: 0,db_code,db_name,entity_id,id,pdbx_align_begin,pdbx_db_accession,pdbx_db_isoform,pdbx_seq_one_letter_code,protein_filename,protein_id,confidence_pLDDT
0,A0A024R1R8_HUMAN,UNP,1,1,1,A0A024R1R8,,MSSHEGGKKKALKQPKKQAKEMDEEEKAFKQKQKEEQKKLEVLKAK...,AF-A0A024R1R8-F1-model_v1,A0A024R1R8,73.21


In [11]:
# Check the dimensions (rows, columns) of the sequence table.
sequence.shape

(1, 11)

In [10]:
# Build the atom-site table for this protein and tag every row with the source
# file name and the protein id derived from that name.
atoms = get_atom_sites_from_cif(parsed_cif).assign(
    protein_filename=file_name,
    protein_id=get_protein_id_from_filename(file_name),
)

atoms.head()

Unnamed: 0,group_PDB,id,type_symbol,label_atom_id,label_alt_id,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,...,auth_comp_id,auth_asym_id,auth_atom_id,pdbx_PDB_model_num,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res,protein_filename,protein_id
0,ATOM,1,N,N,,MET,A,1,1,,...,MET,A,N,1,A0A024R1R8,UNP,1,M,AF-A0A024R1R8-F1-model_v1,A0A024R1R8
1,ATOM,2,C,CA,,MET,A,1,1,,...,MET,A,CA,1,A0A024R1R8,UNP,1,M,AF-A0A024R1R8-F1-model_v1,A0A024R1R8
2,ATOM,3,C,C,,MET,A,1,1,,...,MET,A,C,1,A0A024R1R8,UNP,1,M,AF-A0A024R1R8-F1-model_v1,A0A024R1R8
3,ATOM,4,C,CB,,MET,A,1,1,,...,MET,A,CB,1,A0A024R1R8,UNP,1,M,AF-A0A024R1R8-F1-model_v1,A0A024R1R8
4,ATOM,5,O,O,,MET,A,1,1,,...,MET,A,O,1,A0A024R1R8,UNP,1,M,AF-A0A024R1R8-F1-model_v1,A0A024R1R8


In [21]:
# Show every field of the first atom record (the wide table elides columns in `.head()`).
atoms.iloc[0]

group_PDB                                       ATOM
id                                                 1
type_symbol                                        N
label_atom_id                                      N
label_alt_id                                        
label_comp_id                                    MET
label_asym_id                                      A
label_entity_id                                    1
label_seq_id                                       1
pdbx_PDB_ins_code                               None
Cartn_x                                      -52.339
Cartn_y                                       -6.285
Cartn_z                                       37.051
occupancy                                        1.0
B_iso_or_equiv                                 59.87
pdbx_formal_charge                              None
auth_seq_id                                        1
auth_comp_id                                     MET
auth_asym_id                                  

In [23]:
# NOTE(review): per the recorded output this prints only the default
# `<... object at 0x...>` repr, so the cell confirms the object's type and nothing more.
print(parsed_cif)

<pdbx.containers.DataContainer object at 0x15be96b50>


In [12]:
# Extract the shape (conformation) annotations from the parsed CIF. Judging by the
# rows inspected further down, each annotated range appears once per `label_seq_id`
# it covers -- presumably that is what "explode" refers to; TODO confirm in utils.proteins.
struct_shape_explode = get_protein_shapes_from_cif(parsed_cif)
# Attach the shape annotations to the atom records. The join key is not visible
# here -- presumably `label_seq_id`; verify against join_atoms_with_shape.
atoms_w_shape = join_atoms_with_shape(atoms, struct_shape_explode)

In [34]:
# Plain-text view of rows 5-14 of the shape table (print already stringifies its argument).
print(struct_shape_explode[5:15])

   shape.beg_label_comp_id shape.beg_label_seq_id shape.conf_type_id  \
5                      ALA                     11         TURN_TY1_P   
6                      LYS                     13               BEND   
7                      LYS                     13               BEND   
8                      PRO                     15       HELX_RH_AL_P   
9                      PRO                     15       HELX_RH_AL_P   
10                     PRO                     15       HELX_RH_AL_P   
11                     PRO                     15       HELX_RH_AL_P   
12                     PRO                     15       HELX_RH_AL_P   
13                     PRO                     15       HELX_RH_AL_P   
14                     PRO                     15       HELX_RH_AL_P   

   shape.end_label_comp_id shape.end_label_seq_id       shape.id  label_seq_id  
5                      LEU                     12    TURN_TY1_P1            12  
6                      GLN                   

In [15]:
# Check the dimensions (rows, columns) of the shape table.
struct_shape_explode.shape

(49, 7)

In [16]:
# List the column names of the shape table.
struct_shape_explode.columns

Index(['shape.beg_label_comp_id', 'shape.beg_label_seq_id',
       'shape.conf_type_id', 'shape.end_label_comp_id',
       'shape.end_label_seq_id', 'shape.id', 'label_seq_id'],
      dtype='object')

In [19]:
# List the distinct `shape.conf_type_id` values present for this protein;
# each value keeps the index label of its first occurrence.
struct_shape_explode['shape.conf_type_id'].drop_duplicates()

0             BEND
4       TURN_TY1_P
8     HELX_RH_AL_P
45    HELX_RH_3T_P
Name: shape.conf_type_id, dtype: object

In [25]:
# Inspect the first row of the shape table in full.
struct_shape_explode.iloc[0]

shape.beg_label_comp_id      GLY
shape.beg_label_seq_id         7
shape.conf_type_id          BEND
shape.end_label_comp_id      LYS
shape.end_label_seq_id        10
shape.id                   BEND1
label_seq_id                   7
Name: 0, dtype: object

In [26]:
# Inspect the second row for comparison with the first: in the recorded output it
# carries the same `shape.id` and range, with only `label_seq_id` advanced.
struct_shape_explode.iloc[1]

shape.beg_label_comp_id      GLY
shape.beg_label_seq_id         7
shape.conf_type_id          BEND
shape.end_label_comp_id      LYS
shape.end_label_seq_id        10
shape.id                   BEND1
label_seq_id                   8
Name: 1, dtype: object

In [36]:
# Dump the raw decompressed CIF text to see the layout that `parse_cif` consumes.
# NOTE(review): this prints the entire file into the notebook output; consider
# printing only a leading slice before sharing or committing the notebook.
print(content)

data_AF-A0A024R1R8-F1
#
_entry.id AF-A0A024R1R8-F1
#
_af_target_ref_db_details.gene                         hCG_2014768
_af_target_ref_db_details.seq_db_sequence_checksum     C00D8704F604CA72
_af_target_ref_db_details.seq_db_sequence_version_date 2014-07-09
#
loop_
_atom_type.symbol
C 
N 
O 
S 
#
loop_
_audit_author.name
_audit_author.pdbx_ordinal
"Jumper, John"               1  
"Evans, Richard"             2  
"Pritzel, Alexander"         3  
"Green, Tim"                 4  
"Figurnov, Michael"          5  
"Ronneberger, Olaf"          6  
"Tunyasuvunakool, Kathryn"   7  
"Bates, Russ"                8  
"Zidek, Augustin"            9  
"Potapenko, Anna"            10 
"Bridgland, Alex"            11 
"Meyer, Clemens"             12 
"Kohl, Simon A. A."          13 
"Ballard, Andrew J."         14 
"Cowie, Andrew"              15 
"Romera-Paredes, Bernardino" 16 
"Nikolov, Stanislav"         17 
"Jain, Rishub"               18 
"Adler, Jonas"               19 
"Back, Trevor"         