In [10]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from biotite.sequence.io import fasta
from biotite.database import uniprot

from rocketshp import config

In [2]:
def uniprot_fetch(upid, start=1, stop=None):
    """
    Fetch the sequence of a protein from UniProt.

    Parameters
    ----------
    upid : str
        The UniProt ID of the protein.
    start : int, optional
        The first residue of the protein to fetch (1-based).
        The default is 1.
    stop : int, optional
        The last residue of the protein to fetch (1-based, inclusive).
        The default is None, which fetches up to the end of the sequence.

    Returns
    -------
    seq : str
        The sequence of the protein from start to stop inclusive.
        An empty string is returned when the requested range selects
        no residues (e.g. stop < start).

    Raises
    ------
    ValueError
        If start is smaller than 1. Without this check, start=0 would
        turn into the slice ``seq[-1:stop]`` and silently return the wrong
        residues.
    """
    if start < 1:
        raise ValueError(
            f"start must be >= 1 (1-based residue index), got {start}"
        )

    # Fetch the FASTA record from UniProt; the returned object is read as text.
    fasta_text = uniprot.fetch(upid, "fasta").read()

    # The first line is the FASTA header; the remaining lines hold the sequence.
    # splitlines() + strip() tolerate "\r\n" line endings and stray whitespace,
    # which a plain split("\n") would leave embedded in the sequence.
    seq_lines = fasta_text.splitlines()[1:]
    seq = "".join(line.strip() for line in seq_lines)

    if stop is None:
        stop = len(seq)

    # Convert the 1-based inclusive range to a 0-based half-open slice.
    return seq[start - 1:stop]

In [3]:
# Directory with the DisProt files, located under the project's raw-data directory.
DISPROT_DATA_DIR = config.RAW_DATA_DIR / "disprot"
# Show which files are present (os.listdir returns entries in arbitrary order).
os.listdir(DISPROT_DATA_DIR)

['DisProt_2024_12_structural_state.tsv',
 'DisProt_2024_12_structural_state_transition.tsv',
 'DisProt_2024_12.tsv',
 'DisProt_2024_12.fasta']

In [4]:
# Paths of the two DisProt release files inside the raw-data directory.
disprot_tsv_path = DISPROT_DATA_DIR / "DisProt_2024_12.tsv"
disprot_fasta_path = DISPROT_DATA_DIR / "DisProt_2024_12.fasta"

# Tab-separated annotation table.
full_df = pd.read_csv(disprot_tsv_path, sep="\t")
# FASTA records of the same release.
sequences = fasta.FastaFile.read(disprot_fasta_path)

In [36]:
# Structural-state annotations only, one row per (entry, start, end) region.
df_ss = full_df[full_df["term_namespace"] == "Structural state"]
df_ss = df_ss.drop_duplicates(["disprot_id","start","end"])

# Structural-state AND structural-transition annotations.
df_ss_trans = full_df[full_df["term_namespace"].isin(["Structural state", "Structural transition"])]
# Deduplicate the combined selection itself. The previous code deduplicated
# df_ss here, which discarded the filter above and made this table (and the
# exported "..._structural_state_transition.tsv") a copy of df_ss.
# NOTE(review): drop_duplicates keeps the first row per key, so a state and a
# transition annotated on the same region collapse into one row — confirm
# that this is intended.
df_ss_trans = df_ss_trans.drop_duplicates(["disprot_id","start","end"])

# Persist both filtered tables next to the raw DisProt files.
df_ss.to_csv(DISPROT_DATA_DIR / "DisProt_2024_12_structural_state.tsv", sep="\t", index=False)
df_ss_trans.to_csv(DISPROT_DATA_DIR / "DisProt_2024_12_structural_state_transition.tsv", sep="\t", index=False)

In [6]:
# Peek at the first ten record headers of the DisProt FASTA file.
list(sequences.keys())[:10]

['disprot|DP00003r002 pos=294-334 term=IDPO:00076 ec=ECO:0006220 pmid=8632448',
 'disprot|DP00003r004 pos=454-464 term=IDPO:00076 ec=ECO:0006220 pmid=8632448',
 'disprot|DP00004r001 pos=134-170 term=IDPO:00076 ec=ECO:0006206 pmid=9452503',
 'disprot|DP00004r002 pos=134-170 term=IDPO:00050 ec=ECO:0006206 pmid=9452503',
 'disprot|DP00004r004 pos=134-170 term=GO:0019835 ec=ECO:0007634 pmid=9452503',
 'disprot|DP00004r005 pos=150-162 term=GO:1990000 ec=ECO:0006003 pmid=32753597',
 'disprot|DP00004r006 pos=150-162 term=GO:1990000 ec=ECO:0005670 pmid=32753597',
 'disprot|DP00005r001 pos=1-107 term=IDPO:00076 ec=ECO:0006165 pmid=9659923',
 'disprot|DP00005r004 pos=1-107 term=IDPO:00076 ec=ECO:0006210 pmid=21936008',
 'disprot|DP00005r005 pos=1-107 term=IDPO:00076 ec=ECO:0006204 pmid=9063900']

In [7]:
# Sanity check: fetch the full-length sequence for accession P03265.
uniprot_fetch("P03265")

'MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKRMRRRIESEDEEDSSQDALVPRTPSPRPSTSAADLAIAPKKKKKRPSPKPERPPSPEVIVDSEEEREDVALQMVGFSNPPVLIKHGKGGKRTVRRLNEDDPVARGMRTQEEEEEPSEAESEITVMNPLSVPIVSAWEKGMEAARALMDKYHVDNDLKANFKLLPDQVEALAAVCKTWLNEEHRGLQLTFTSKKTFVTMMGRFLQAYLQSFAEVTYKHHEPTGCALWLHRCAEIEGELKCLHGSIMINKEHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNTDARCCVHDAACPANQFSGKSCGMFFSEGAKAQVAFKQIKAFMQALYPNAQTGHGHLLMPLRCECNSKPGHAPFLGRQLPKLTPFALSNAEDLDADLISDKSVLASVHHPALIVFQCCNPVYRNSRAQGGGPNCDFKISAPDLLNALVMVRSLWSENFTELPRMVVPEFKWSTKHQYRNVSLPVAHSDARQNPFDF'

In [8]:
# Sanity check: fetch residues 294-334 (1-based, inclusive) of P03265; this is
# the range DisProt lists for region DP00003r002.
uniprot_fetch("P03265", 294, 334)

'EHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNT'

In [53]:
# Term names in sorted order; a term's class id is its position + 1, so that
# 0 stays available for residues without any annotation.
label_options = sorted(df_ss["term_name"].unique())
label_ids = {term: idx for idx, term in enumerate(label_options, start=1)}

records = []
for upid in tqdm(df_ss["acc"].unique()):
    # Full-length sequence of this accession, fetched from UniProt.
    full_seq = uniprot_fetch(upid)
    # One integer label per residue, initialised to 0 (unlabeled).
    label_vec = np.zeros(len(full_seq), dtype=int)

    subdf = df_ss[df_ss["acc"] == upid]
    region_columns = zip(subdf["start"], subdf["end"], subdf["term_name"])
    for region_start, region_end, term in region_columns:
        # Regions are 1-based and inclusive; rows processed later overwrite
        # earlier ones where regions overlap.
        label_vec[region_start - 1:region_end] = label_ids[term]

    records.append(dict(acc=upid, sequence=full_seq, label=label_vec))

# One row per accession: id, sequence and per-residue label vector.
proc_df = pd.DataFrame(records)

100%|██████████| 30/30 [00:09<00:00,  3.30it/s]


In [57]:
# Spot-check a single processed record (positional row 10).
proc_df.iloc[10]

acc                                                    P61926
sequence    MTDVETTYADFIASGRTGRRNAIHDILVSSASGNSNELALKLAGLD...
label       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
Name: 10, dtype: object

In [35]:
# Compare the row count after dropping duplicate regions with the current df_ss.
# NOTE(review): the recorded output carries execution count 35, i.e. it appears
# to predate the cell that deduplicates df_ss (count 36); on a fresh
# top-to-bottom run both shapes should be equal.
df_ss.drop_duplicates(["disprot_id","start","end"]).shape, df_ss.shape

((5586, 18), (7129, 18))

In [65]:
# Show every structural-state row whose term name is "pre-molten globule".
df_ss[df_ss.term_name == "pre-molten globule"]

Unnamed: 0,acc,name,disorder_content,organism,ncbi_taxon_id,disprot_id,region_id,start,end,term_namespace,term,term_name,ec,ec_name,reference,region_sequence,confidence,obsolete
1952,P08083,Colicin-N,23.0,Escherichia coli,562,DP00461,DP00461r018,40,76,Structural state,IDPO:00078,pre-molten globule,ECO:0006165,nuclear magnetic resonance spectroscopy eviden...,pmid:12679333,NSNGWSWSNKPHKNDGFHSDGSYHITFHGDNNSKPKP,,
2079,P10275,Androgen receptor,58.26,Homo sapiens,9606,DP00492,DP00492r026,142,448,Structural state,IDPO:00078,pre-molten globule,ECO:0006165,nuclear magnetic resonance spectroscopy eviden...,pmid:27356095,SKGLPQQLPAPPDEDDSAAPSTLSLLGPTFPGLSSCSADLKDILSE...,,
2298,Q9NX55,Huntingtin-interacting protein K,99.22,Homo sapiens,9606,DP00546,DP00546r001,1,129,Structural state,IDPO:00078,pre-molten globule,ECO:0006204,far-UV circular dichroism evidence used in man...,pmid:18076027,MRRRGEIDMATEGDVELELETETSGPERPPEKPRKHDSGAADLERV...,,
2933,Q9IK91,Phosphoprotein,57.12,Nipah virus,121791,DP00699,DP00699r001,1,406,Structural state,IDPO:00078,pre-molten globule,ECO:0006198,proton-based nuclear magnetic resonance eviden...,pmid:20657787,MDKLELVNDGLNIIDFIQKNQKEIQKTYGRSSIQQPSIKDQTKAWE...,,
2952,O55778,Phosphoprotein,57.28,Hendra virus (isolate Horse/Autralia/Hendra/1994),928303,DP00700,DP00700r001,1,404,Structural state,IDPO:00078,pre-molten globule,ECO:0006204,far-UV circular dichroism evidence used in man...,pmid:20657787,MDKLDLVNDGLDIIDFIQKNQKEIQKTYGRSSIQQPSTKDRTRAWE...,,
4180,P19599,Merozoite surface antigen 2,82.2,Plasmodium falciparum (isolate FC27 / Papua Ne...,5837,DP01067,DP01067r001,21,238,Structural state,IDPO:00078,pre-molten globule,ECO:0006204,far-UV circular dichroism evidence used in man...,pmid:18440022,KNESKYSNTFINNAYNMSIRRSMANEGSNTNSVGANAPNADTIASG...,,
4285,Q9XES8,Seed maturation protein PM28,98.88,Glycine max,3847,DP01088,DP01088r001,1,89,Structural state,IDPO:00078,pre-molten globule,ECO:0006204,far-UV circular dichroism evidence used in man...,pmid:20071374,MAKSKEDITYATSQARLSEDEAVRVAYEHGSPLEGGKIADSQPVDL...,,
4451,Q16236,Nuclear factor erythroid 2-related factor 2,99.83,Homo sapiens,9606,DP01115,DP01115r003,1,605,Structural state,IDPO:00078,pre-molten globule,ECO:0007689,sodium dodecyl sulfate polyacrylamide gel elec...,pmid:34299054,MMDLELPPPGLPSQQDMDLIDILWRQDIDLGVSREVFDFSQRRKEY...,,
5094,O60828,Polyglutamine-binding protein 1,99.62,Homo sapiens,9606,DP01308,DP01308r006,1,265,Structural state,IDPO:00078,pre-molten globule,ECO:0001249,fluorescence evidence used in manual assertion,pmid:22500761,MPLPVALQTRLAKRGILKHLEPEPEEEIIAEDYDDDPVDYEATRLE...,,
7553,P06837,Neuromodulin,99.56,Mus musculus,10090,DP02342,DP02342r003,1,227,Structural state,IDPO:00078,pre-molten globule,ECO:0006204,far-UV circular dichroism evidence used in man...,pmid:23462742,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,,


In [62]:
# Show the structural-state rows recorded for accession P12520.
df_ss[df_ss["acc"] == "P12520"]

Unnamed: 0,acc,name,disorder_content,organism,ncbi_taxon_id,disprot_id,region_id,start,end,term_namespace,term,term_name,ec,ec_name,reference,region_sequence,confidence,obsolete
10768,P12520,Protein Vpr,16.67,Human immunodeficiency virus type 1 group M su...,11698,DP03534,DP03534r008,1,17,Structural state,IDPO:00076,disorder,ECO:0006222,X-ray crystallography-based structural model w...,pmid:27571178,MEQAPEDQGPQREPYNE,,


In [13]:
# Preview the first lines of the exported state + transition table via the shell.
!head $DISPROT_DATA_DIR/DisProt_2024_12_structural_state_transition.tsv

acc	name	disorder_content	organism	ncbi_taxon_id	disprot_id	region_id	start	end	term_namespace	term	term_name	ec	ec_name	reference	region_sequence	confidence	obsolete
P03265	DNA-binding protein	9.45	Human adenovirus C serotype 5	28285	DP00003	DP00003r002	294	334	Structural state	IDPO:00076	disorder	ECO:0006220	X-ray crystallography-based structural model with missing residue coordinates used in manual assertion	pmid:8632448	EHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNT		
P03265	DNA-binding protein	9.45	Human adenovirus C serotype 5	28285	DP00003	DP00003r004	454	464	Structural state	IDPO:00076	disorder	ECO:0006220	X-ray crystallography-based structural model with missing residue coordinates used in manual assertion	pmid:8632448	VYRNSRAQGGG		
P49913	Cathelicidin antimicrobial peptide	21.18	Homo sapiens	9606	DP00004	DP00004r001	134	170	Structural state	IDPO:00076	disorder	ECO:0006206	near-UV circular dichroism evidence used in manual assertion	pmid:9452503	LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTE