In [48]:
import matplotlib.pyplot as plt
import requests
import json
from Bio import SeqIO
import os
import time
import collections
os.chdir("/biodata/franco/zsuzsa_lab/jupyter")

# Read the file disprot_OK_fullset.fasta and get all the identifiers
identifiers = []

disprot_dict = collections.defaultdict(dict)

with open("/biodata/franco/datasets/disprot/disprot_OK_fullset_2023_12.fasta") as instrm:
    for record in SeqIO.parse(instrm, "fasta"):
            disprot_dict[record.id]['seq'] = str(record.seq)
            identifiers.append(record.id)
with open("/biodata/franco/datasets/disprot/disprot_OK_fullset_annotations_2023_12.fasta") as instrm:
    for record in SeqIO.parse(instrm, "fasta"):
            disprot_dict[record.id]['disorder'] = str(record.seq)

In [49]:
import re
def remove_letters(num):
    return int(re.sub("[^0-9]", "", str(num)))

def parse_pdb_json(datadict, pdbid, chain):
    annots = []

    if "rcsb_polymer_instance_feature" not in datadict:
        print(f"Error with {pdbid}_{chain}")
        return False, False
    else:
        # get sequence numbering
        seqres_num = datadict['rcsb_polymer_entity_instance_container_identifiers']['auth_to_entity_poly_seq_mapping']
        seqres_num = [remove_letters(n) for n in seqres_num]
        for e in datadict["rcsb_polymer_instance_feature"]:
            if 'provenance_source' in e:
                if e['provenance_source'] == "PROMOTIF":
                    ss_type  = e['type']
                    for ee in e['feature_positions']:
                        ss_begin = remove_letters(ee["beg_seq_id"])
                        ss_end   = remove_letters(ee["end_seq_id"])
                        #print(ss_type, ss_begin, ss_end)
                        annots.append([ss_type, ss_begin, ss_end])
                if e['provenance_source'] == "PDB":
                    if e["type"] == "UNOBSERVED_RESIDUE_XYZ":
                        for ee in e['feature_positions']:
                            miss_begin = remove_letters(ee["beg_seq_id"])
                            miss_end   = remove_letters(ee["end_seq_id"])
                            #print("MISSING", miss_begin, miss_end)
                            annots.append(["MISSING", miss_begin, miss_end])
        return seqres_num, annots

def parse_annots_to_seq(annot, disprot_dict, uniprot_id, seqres_num):
    ss_placeholder = ["-" for i in range(len(disprot_dict[uniprot_id]['seq']))]
    #seqres_num = ssresnum_list[i]
    for a in annot:
        letter = "-"
        if a[0].startswith("UNASSIGNED"):
            letter = "U"
        if a[0].startswith("MISSING"):
            letter = "M"
        if a[0].startswith("HELIX"):
            letter = "H"
        if a[0].startswith("SHEET"):
            letter = "B"
        for j in range(int(a[1])-1, int(a[2])): # numbering starts at 1 in pdb, so start counting one before, ie 1 is 0
            try:
                ss_placeholder[j] = letter
            except:
                print(f"Error with {uniprot_id}")
                print(f"Annot: {a}")
                print(f"Seq: {disprot_dict[uniprot_id]['seq']}")
                print(f"SS: {len(ss_placeholder)}")
                return False
    return ss_placeholder

def parse_annots_to_seq2(annot, disprot_dict, uniprot_id, seqres_num):
    ss_placeholder = ["-" for i in range(len(disprot_dict[uniprot_id]['seq']))]
    #seqres_num = ssresnum_list[i]
    for a in annot:
        letter = "-"
        if a[0].startswith("UNASSIGNED"):
            letter = "U"
        if a[0].startswith("MISSING"):
            letter = "M"
        if a[0].startswith("HELIX"):
            letter = "H"
        if a[0].startswith("SHEET"):
            letter = "B"
        #print(a[1], a[2], "-->", int(seqres_num[int(a[1])-1]), int(seqres_num[int(a[2])-1]))
        for j in range(int(remove_letters(seqres_num[int(a[1])-1])), int(remove_letters(seqres_num[int(a[2])-1]))): # numbering starts at 1 in pdb, so start counting one before, ie 1 is 0
            try:
                ss_placeholder[j] = letter
            except:
                print(f"index: {j}")
                print(f"seqres: {seqres_num[j]}")
                print(f"Error with {uniprot_id}")
                print(f"Annot: {a}")
                print(f"Seq: {disprot_dict[uniprot_id]['seq']}")
                print(f"SS: {len(ss_placeholder)}")
                return False
    return ss_placeholder

In [53]:
import numpy as np

mobifiles = os.listdir("mobidb_dev_10_2024")

os.makedirs("mobidb_dev_10_2024_SSannotated", exist_ok=True)

stats_dict = dict()

for i, mobifile in enumerate(mobifiles):

    uniprot_id = mobifile.split(".")[0]
    output_json = f"mobidb_dev_10_2024_SSannotated/{uniprot_id}.json"

    if not os.path.exists(output_json):
        #read json
        with open(f"mobidb_dev_10_2024/{mobifile}") as f:
            data = json.load(f)
        
        stats_dict[uniprot_id] = collections.defaultdict(int)
        ss_annot_seqs = collections.defaultdict(dict)
        ss_annot_seqs_list = []
        for k in data.keys():
            if k.startswith("derived-observed-mobi-"):
                #print(k)
                for l in data[k].keys():
                    #print(l, data[k][l])
                    if l == "source_id":
                        pdbid, chain = data[k][l].split("_")
                        #print(pdbid, chain)
                        
                        ### NEW: Omit download part, these are pre-downloaded on get_mobidb_annotations.ipynb
                        #res = requests.get(f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdbid.upper()}/{chain}")
                        pdb_json = f"mobidb_dev_10_2024/PDB_data/{pdbid}_{chain}.json"
                        with open(pdb_json) as f:
                            datadict = json.load(f)
                        #datadict = json.loads(res.content.decode())
                        seqres_num, annots = parse_pdb_json(datadict, pdbid, chain)
                        if annots:
                            if len(seqres_num) == len(disprot_dict[uniprot_id]['seq']):
                                #print("classic ",pdbid,chain)
                                stats_dict[uniprot_id]["complete"] += 1
                                ss_seq_list = parse_annots_to_seq(annots, disprot_dict, uniprot_id, seqres_num)
                            else:
                                seqnum_max = np.max([int(i) for i in seqres_num])
                                seqlen = len(disprot_dict[uniprot_id]['seq'])
                                if seqnum_max <= seqlen:
                                    stats_dict[uniprot_id]["incomplete"] += 1
                                    ss_seq_list = parse_annots_to_seq2(annots, disprot_dict, uniprot_id, seqres_num)
                                else:
                                    #print(f"length mismatch: {seqlen} and {seqnum_max}")
                                    #### PDB chain is longer than uniprot
                                    stats_dict[uniprot_id]["fail_length"] += 1
                                    continue
                            ss_seq = "".join(ss_seq_list)
                            ss_annot_seqs[uniprot_id][f"{pdbid}_{chain}"] = ss_seq
                        else:
                            #### PDB chain has no annotation
                            stats_dict[uniprot_id]["fail_noannot"] += 1
        print(f'{uniprot_id} : complete={stats_dict[uniprot_id]["complete"]}, incomplete={stats_dict[uniprot_id]["incomplete"]}, fail_length={stats_dict[uniprot_id]["fail_length"]}, fail_noannot={stats_dict[uniprot_id]["fail_noannot"]}')
        
        ss_annot_seqs[uniprot_id]['stats'] = stats_dict[uniprot_id]

        with open(output_json, 'w') as outf:
            json.dump(ss_annot_seqs, outf)


Q9UKV8 : complete=31, incomplete=34, fail_length=0, fail_noannot=0
Q9HD36 : complete=0, incomplete=1, fail_length=0, fail_noannot=0
Q8K419 : complete=0, incomplete=2, fail_length=0, fail_noannot=0
P05059 : complete=0, incomplete=0, fail_length=0, fail_noannot=1
B8YB65 : complete=8, incomplete=0, fail_length=0, fail_noannot=0
Q90VU7 : complete=4, incomplete=3, fail_length=79, fail_noannot=0
P27782 : complete=0, incomplete=3, fail_length=0, fail_noannot=0
P52564 : complete=0, incomplete=13, fail_length=2, fail_noannot=0
Q9UKT5 : complete=0, incomplete=2, fail_length=0, fail_noannot=0
F0ZBA6 : complete=0, incomplete=1, fail_length=0, fail_noannot=0
Q9NQA5 : complete=0, incomplete=1, fail_length=0, fail_noannot=0
O61667 : complete=0, incomplete=2, fail_length=0, fail_noannot=0
P69697 : complete=0, incomplete=1, fail_length=0, fail_noannot=0
Q9UHK0 : complete=0, incomplete=1, fail_length=0, fail_noannot=0
O00308 : complete=0, incomplete=19, fail_length=0, fail_noannot=0
P0CF51 : complete=0,

In [61]:
## implement majority voting, only if 90% or 75% of majority agrees
annotfiles = [f for f in os.listdir("mobidb_dev_10_2024_SSannotated") if f.endswith("json")]

outdir_consensus= "mobidb_dev_10_2024_SSannotated/consensus"
os.makedirs(outdir_consensus, exist_ok=True)


def get_majority_seq(disprot_dict, uniprot_id, ss_annot_seqs_list, majority_value=0.75, use_incomplete=True):
    if use_incomplete:
        total_seqs = len(ss_annot_seqs_list)
        ss_seqs_list = ss_annot_seqs_list
    else:
        ss_seqs_list = [s for s in ss_annot_seqs_list if "-" not in s]
        total_seqs = len(ss_seqs_list)
    #print(uniprot_id, total_seqs)
    seq = disprot_dict[uniprot_id]['seq']
    letter_ix = ["M", "U", "H", "B"]
    majority_seq = []
    majority_value = 0.75
    if len(ss_seqs_list) == 0:
        return ["-" for k in range(len(seq))]
    else:
        for i in range(len(seq)):
            counts = np.array([0,0,0,0]) # M, U, H, B
            for j in range(total_seqs):
                letter = ss_seqs_list[j][i]
                if letter == "M":
                    counts[0] += 1
                elif letter == "U":
                    counts[1] += 1
                elif letter == "H":
                    counts[2] += 1
                elif letter == "B":
                    counts[3] += 1
            majority = np.max(counts)
            if majority >= majority_value*total_seqs:
                winner = np.where(counts == np.max(counts))[0][0]
                majority_seq.append(letter_ix[winner])
            else:
                majority_seq.append("-")
        return majority_seq

for i, mobifile in enumerate(annotfiles):
    ss_annot_seqs_cons = collections.defaultdict(dict)

    uniprot_id = mobifile.split(".")[0]
    # if uniprot_id.endswith("consensus"):
    #     continue
    with open(f"mobidb_dev_10_2024_SSannotated/{mobifile}") as f:
        ss_annot_dict = json.load(f)
    ss_annot_seqs_list = [ss_annot_dict[uniprot_id][k] for k in ss_annot_dict[uniprot_id].keys() if k!="stats"]
    
    mv = 0.75
    use_incomp = True
    label=f"consensus_{mv:.2f}_inc"
    majority_seq = get_majority_seq(disprot_dict, uniprot_id, ss_annot_seqs_list, use_incomplete=use_incomp)
    ss_annot_seqs_cons[uniprot_id][label] = "".join(majority_seq)

    mv = 0.75
    use_incomp = False
    label=f"consensus_{mv:.2f}"
    majority_seq = get_majority_seq(disprot_dict, uniprot_id, ss_annot_seqs_list, use_incomplete=use_incomp)
    ss_annot_seqs_cons[uniprot_id][label] = "".join(majority_seq)

    with open(f"{outdir_consensus}/{uniprot_id}_consensus.json", 'w') as outf:
        json.dump(ss_annot_seqs_cons, outf)

In [60]:

def get_majority_seq(disprot_dict, uniprot_id, ss_annot_seqs_list, majority_value=0.75, use_incomplete=True):
    if use_incomplete:
        total_seqs = len(ss_annot_seqs_list)
        ss_seqs_list = ss_annot_seqs_list
    else:
        print("using only complete")
        ss_seqs_list = [s for s in ss_annot_seqs_list if "-" not in s]
        total_seqs = len(ss_seqs_list)
    #print(uniprot_id, total_seqs)
    seq = disprot_dict[uniprot_id]['seq']
    letter_ix = ["M", "U", "H", "B"]
    majority_seq = []
    majority_value = 0.75
    if len(ss_seqs_list) == 0:
        return ["-" for k in range(len(seq))]
    else:
        for i in range(len(seq)):
            counts = np.array([0,0,0,0]) # M, U, H, B
            for j in range(total_seqs):
                letter = ss_seqs_list[j][i]
                if letter == "M":
                    counts[0] += 1
                elif letter == "U":
                    counts[1] += 1
                elif letter == "H":
                    counts[2] += 1
                elif letter == "B":
                    counts[3] += 1
            majority = np.max(counts)
            if majority >= majority_value*total_seqs:
                winner = np.where(counts == np.max(counts))[0][0]
                majority_seq.append(letter_ix[winner])
            else:
                majority_seq.append("-")
        return majority_seq

mv = 0.75
use_incomp = False
label=f"consensus_{mv:.2f}"
uniprot_id="P03265"

with open(f"mobidb_dev_10_2024_SSannotated/{uniprot_id}.json") as f:
    ss_annot_dict = json.load(f)
ss_annot_seqs_list = [ss_annot_dict[uniprot_id][k] for k in ss_annot_dict[uniprot_id].keys() if k!="stats"]


majority_seq = get_majority_seq(disprot_dict, uniprot_id, ss_annot_seqs_list, use_incomplete=use_incomp)
print(majority_seq)

using only complete
[]
['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '

In [70]:
#### Example of incomplete pdb protein

def parse_annots_to_seq2(annot, disprot_dict, uniprot_id, seqres_num):
    ss_placeholder = ["-" for i in range(len(disprot_dict[uniprot_id]['seq']))]
    #seqres_num = ssresnum_list[i]
    for a in annot:
        letter = "-"
        if a[0].startswith("UNASSIGNED"):
            letter = "U"
        if a[0].startswith("MISSING"):
            letter = "M"
        if a[0].startswith("HELIX"):
            letter = "H"
        if a[0].startswith("SHEET"):
            letter = "B"
        #print(a[1], a[2], "-->", int(seqres_num[int(a[1])-1]), int(seqres_num[int(a[2])-1]))
        for j in range(int(seqres_num[int(a[1])-1]), int(seqres_num[int(a[2])-1])): # numbering starts at 1 in pdb, so start counting one before, ie 1 is 0
            try:
                ss_placeholder[j] = letter
            except:
                print(f"index: {j}")
                print(f"seqres: {seqres_num[j]}")
                print(f"Error with {uniprot_id}")
                print(f"Annot: {a}")
                print(f"Seq: {disprot_dict[uniprot_id]['seq']}")
                print(f"SS: {len(ss_placeholder)}")
                return False
    return ss_placeholder

pdbid = "4fro"
chain = "A"
print(pdbid, chain)
res = requests.get(f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdbid.upper()}/{chain}")
datadict = json.loads(res.content.decode())
seqres_num, annots = parse_pdb_json(datadict, pdbid, chain)
# try:
#     assert len(seqres_num) == len(disprot_dict[uniprot_id]['seq'])
# except AssertionError:
#     print("---> ERROR:", uniprot_id, pdbid, chain, len(seqres_num), len(disprot_dict[uniprot_id]['seq']))
#     continue      
if annots:
    ss_seq_list = parse_annots_to_seq(annots, disprot_dict, uniprot_id, seqres_num)
    seqnum_max = np.max([int(i) for i in seqres_num])
    seqlen = len(disprot_dict[uniprot_id]['seq'])
    if seqnum_max > seqlen:
        print(f"length mismatch: {seqlen} and {seqnum_max}")
        raise
    ss_seq_list2 = parse_annots_to_seq2(annots, disprot_dict, uniprot_id, seqres_num)
    ss_seq = "".join(ss_seq_list)
    ss_seq2 = "".join(ss_seq_list2)
    print(ss_seq)
    print(ss_seq2)

4fro A
length mismatch: 354 and 355


RuntimeError: No active exception to reraise