In [2]:
import os
import numpy as np
import pandas as pd
from Bio import SeqIO

disprot_regions_file = "/biodata/franco/datasets/disprot/DisProt_release_2023_12_with_ambiguous_evidences.tsv"
df = pd.read_table(disprot_regions_file, header=0)

In [6]:
disprot_annotations_file = "/biodata/franco/datasets/disprot/DisProt_release_2023_12_with_ambiguous_evidences_consensus_regions.tsv"

df_annot = pd.read_table(disprot_annotations_file, header=0)
annot_dict = dict(zip(df_annot['acc'], df_annot['consensus']))

In [13]:
df

Unnamed: 0,acc,name,organism,ncbi_taxon_id,disprot_id,region_id,start,end,term_namespace,term,term_name,ec,ec_name,reference,region_sequence,confidence,obsolete
0,P03265,DNA-binding protein,Human adenovirus C serotype 5,28285,DP00003,DP00003r002,294,334,Structural state,IDPO:00076,disorder,ECO:0006220,X-ray crystallography-based structural model w...,pmid:8632448,EHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNT,,
1,P03265,DNA-binding protein,Human adenovirus C serotype 5,28285,DP00003,DP00003r004,454,464,Structural state,IDPO:00076,disorder,ECO:0006220,X-ray crystallography-based structural model w...,pmid:8632448,VYRNSRAQGGG,,
2,P49913,Cathelicidin antimicrobial peptide,Homo sapiens,9606,DP00004,DP00004r001,134,170,Structural state,IDPO:00076,disorder,ECO:0006206,near-UV circular dichroism evidence used in ma...,pmid:9452503,LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES,,
3,P49913,Cathelicidin antimicrobial peptide,Homo sapiens,9606,DP00004,DP00004r002,134,170,Structural transition,IDPO:00050,disorder to order,ECO:0006206,near-UV circular dichroism evidence used in ma...,pmid:9452503,LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES,,
4,P49913,Cathelicidin antimicrobial peptide,Homo sapiens,9606,DP00004,DP00004r004,134,170,Biological process,GO:0019835,cytolysis,ECO:0007634,experimental phenotypic evidence used in manua...,pmid:9452503,LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11662,P19597,Circumsporozoite protein,Plasmodium falciparum (isolate NF54),5843,DP04138,DP04138r001,19,105,Structural state,IDPO:00076,disorder,ECO:0001184,gel-filtration evidence used in manual assertion,pmid:38059674,LFQEYQCYGSSSNTRVLNELNYDNAGTNLYNELEMNYYGKQENWYS...,,
11663,P19597,Circumsporozoite protein,Plasmodium falciparum (isolate NF54),5843,DP04138,DP04138r002,19,105,Structural state,IDPO:00076,disorder,ECO:0007064,dynamic light scattering assay evidence used i...,pmid:38059674,LFQEYQCYGSSSNTRVLNELNYDNAGTNLYNELEMNYYGKQENWYS...,,
11664,P19597,Circumsporozoite protein,Plasmodium falciparum (isolate NF54),5843,DP04138,DP04138r003,19,105,Structural state,IDPO:00076,disorder,ECO:0006204,far-UV circular dichroism evidence used in man...,pmid:38059674,LFQEYQCYGSSSNTRVLNELNYDNAGTNLYNELEMNYYGKQENWYS...,,
11665,P19597,Circumsporozoite protein,Plasmodium falciparum (isolate NF54),5843,DP04138,DP04138r004,19,105,Structural state,IDPO:00076,disorder,ECO:0006210,small-angle X-ray scattering evidence used in ...,pmid:38059674,LFQEYQCYGSSSNTRVLNELNYDNAGTNLYNELEMNYYGKQENWYS...,,


In [12]:
set(df["term_namespace"])

{'Biological process',
 'Cellular component',
 'Disorder function',
 'Molecular function',
 'Structural state',
 'Structural transition'}

In [14]:
dataset_dict = dict()

target_namespaces  = ['Disorder function', 'Structural state', 'Structural transition']

namespace_skip = 0
for i in range(df.shape[0]):
#     print(df.iloc[i])
    uniprot = df.iloc[i]["acc"]
    seq     = df.iloc[i]["region_sequence"]
    regionid= df.iloc[i]["region_id"]
    start   = int(df.iloc[i]['start']) - 1
    end     = int(df.iloc[i]['end'])
    namespace = df.iloc[i]['term_namespace']

    if namespace not in target_namespaces:
        namespace_skip += 1
        continue

    if uniprot in dataset_dict:
        if regionid in dataset_dict[uniprot]:
            print(f"Region {regionid} exists?")
            raise
        else:
            found = False
            for region in dataset_dict[uniprot]:
                if seq == dataset_dict[uniprot][region]['seq']:
                    found = True
                    dataset_dict[uniprot][region]['ids'].append(regionid)
            if not found:
                dataset_dict[uniprot][regionid] = dict()
                dataset_dict[uniprot][regionid]["seq"] = seq
                dataset_dict[uniprot][regionid]["ids"] = [regionid]
                dataset_dict[uniprot][regionid]["consensus"] = annot_dict[uniprot][start:end]
    else:
        dataset_dict[uniprot] = dict()
        dataset_dict[uniprot][regionid] = dict()
        dataset_dict[uniprot][regionid]["seq"] = seq
        dataset_dict[uniprot][regionid]["ids"] = [regionid]
        dataset_dict[uniprot][regionid]["consensus"] = annot_dict[uniprot][start:end]

print(f"Skipped {namespace_skip} regions")

Skipped 3375 regions


In [16]:
with open("/biodata/franco/datasets/disprot/disprot_regions_seq_2023_12_Structure_namespaces.fasta", 'w') as outf:
    with open("/biodata/franco/datasets/disprot/disprot_regions_annot_2023_12_Structure_namespaces.fasta", 'w') as outg:
        for unip in dataset_dict:
            for region in dataset_dict[unip]:
                outf.write(f">{unip}_{region}|{','.join(dataset_dict[unip][region]['ids'])}\n")
                outg.write(f">{unip}_{region}|{','.join(dataset_dict[unip][region]['ids'])}\n")
                outf.write(f"{dataset_dict[unip][region]['seq']}\n")
                outg.write(f"{dataset_dict[unip][region]['consensus']}\n")

In [19]:
# After clustering with CD-HIT at 100% identity, check which regions remain. 
# How many are longer than 1022 aminoacids?

with open("/biodata/franco/datasets/disprot/disprot_regions_seq_2023_12_Structure_namespaces_CDHIT_1.0") as infile:
    seqs = list(SeqIO.parse(infile, 'fasta'))

longer = 0
OK_ids = []
for seq in seqs:
    if len(seq) > 1022:
        longer += 1
        print(len(seq))
    else:
        OK_ids.append(seq.id)
print(f"Longer than 1022: {longer}")

2152
1550
1045
1226
1249
1408
1129
1483
2726
1259
Longer than 1022: 10


In [20]:
with open("/biodata/franco/datasets/disprot/disprot_regions_seq_2023_12_Structure_namespaces_CDHIT_1.0") as infile:
    seqs = list(SeqIO.parse(infile, 'fasta'))

with open("/biodata/franco/datasets/disprot/disprot_regions_annot_2023_12_Structure_namespaces.fasta") as infile:
    annots = list(SeqIO.parse(infile, 'fasta'))

with open("/biodata/franco/datasets/disprot/disprot_regions_seq_2023_12_Structure_namespaces_CDHIT_1.0_OK.fasta", 'w') as outf:
    with open("/biodata/franco/datasets/disprot/disprot_regions_annot_2023_12_Structure_namespaces_CDHIT_1.0_OK.fasta", 'w') as outg:
        for seq in seqs:
            if seq.id in OK_ids:
                outf.write(f">{seq.id}\n{seq.seq}\n")
        for annot in annots:
            if annot.id in OK_ids:
                outg.write(f">{annot.id}\n{annot.seq}\n")

In [14]:
##### Build Disprot dataset of full proteins 2023
import os
import numpy as np
import pandas as pd
import time
import json
import requests
from Bio import SeqIO
import collections

disprot_annot_file = "/biodata/franco/datasets/disprot/DisProt_release_2023_12_with_ambiguous_evidences_consensus_regions.fasta"
disprot_json_file = "/biodata/franco/datasets/disprot/DisProt_release_2023_12_with_ambiguous_evidences.json"

with open(disprot_json_file) as f:
    new_disprot_data = json.load(f)

new_datadict = {}
new_datadict = collections.defaultdict(dict)
for record in SeqIO.parse(disprot_annot_file, "fasta"):
    acc = record.description.split(" ")[1].split("=")[1]
    seq = str(record.seq)
    new_datadict[acc]['disorder'] = seq

# new_disprot_data['data'][0].keys()
# print(new_disprot_data['data'][0]['disprot_id'])
# print(new_disprot_data['data'][0]['acc'])
# print(new_disprot_data['data'][0]['length'])
# print(new_disprot_data['data'][0]['sequence'])

skip_length = 0
OK_datadict = collections.defaultdict(dict)
for protein in new_disprot_data['data']:
    acc = protein['acc']
    if protein['length'] < 1022:
        OK_datadict[acc]['seq'] = protein['sequence']
        OK_datadict[acc]['disorder'] = new_datadict[acc]['disorder']
        if len(OK_datadict[acc]['disorder']) == len(OK_datadict[acc]['seq']):
            pass #print(f"OK {acc}")
        else:
            print(f"ERROR {acc}")
    else:
        print(f"Skipping {acc}, protein length: {protein['length']}={len(protein['sequence'])}")
        skip_length += 1
print(f"Skipped {skip_length} proteins")

Skipping P13569, protein length: 1480=1480
Skipping P70475, protein length: 1187=1187
Skipping O86488, protein length: 1315=1315
Skipping Q8WZ42, protein length: 34350=34350
Skipping P06786, protein length: 1428=1428
Skipping P07248, protein length: 1323=1323
Skipping P51123, protein length: 2129=2129
Skipping Q53654, protein length: 1183=1183
Skipping P10587, protein length: 1979=1979
Skipping P15146, protein length: 1861=1861
Skipping O95405, protein length: 1425=1425
Skipping P08775, protein length: 1970=1970
Skipping P21513, protein length: 1061=1061
Skipping P07293, protein length: 1873=1873
Skipping P38398, protein length: 1863=1863
Skipping Q24298, protein length: 1507=1507
Skipping Q04656, protein length: 1500=1500
Skipping P00533, protein length: 1210=1210
Skipping Q9Y6Q9, protein length: 1424=1424
Skipping P45481, protein length: 2441=2441
Skipping P09983, protein length: 1023=1023
Skipping P12497, protein length: 1435=1435
Skipping P78504, protein length: 1218=1218
Skipping 

In [15]:
print(f"Kept {len(list(new_datadict.keys()))} proteins")

Kept 2896 proteins


In [16]:
## Write fasta files for the dataset
with open("/biodata/franco/datasets/disprot/disprot_OK_fullset_2023_12.fasta", 'w') as outstrm:
    for i, uniprot_id in enumerate(OK_datadict.keys()):
        #print(i, uniprot_id)
        outstrm.write(f">{uniprot_id}\n{OK_datadict[uniprot_id]['seq']}\n")

with open("/biodata/franco/datasets/disprot/disprot_OK_fullset_annotations_2023_12.fasta", 'w') as outstrm:
    for i, uniprot_id in enumerate(OK_datadict.keys()):
        outstrm.write(f">{uniprot_id}\n{OK_datadict[uniprot_id]['disorder']}\n")