In [4]:
from Bio import SeqIO

# Input FASTA files: one sequence file and one annotation file per dataset.
disprot_sequencefile = "/biodata/franco/zsuzsa_lab/disprot_OK_fullset_12_2023.fasta"
disprot_annotationsfile = "/biodata/franco/zsuzsa_lab/disprot_OK_fullset_annotations_12_2023.fasta"

monomer_sequencefile = "/biodata/franco/zsuzsa_lab/monomer_OK_fullset.fasta"
monomer_annotationsfile = "/biodata/franco/zsuzsa_lab/monomer_OK_fullset_annotations.fasta"


def _fasta_to_strings(fasta_path):
    """Return the sequence of every record in ``fasta_path`` as a plain ``str``, in file order."""
    return [str(record.seq) for record in SeqIO.parse(fasta_path, "fasta")]


# The annotation files are parsed as FASTA too, so each annotation track comes
# back as a plain string exactly like a sequence does.
disprot_sequences = _fasta_to_strings(disprot_sequencefile)
disprot_annotations = _fasta_to_strings(disprot_annotationsfile)

monomer_sequences = _fasta_to_strings(monomer_sequencefile)
monomer_annotations = _fasta_to_strings(monomer_annotationsfile)

In [5]:
from collections import defaultdict


def _count_residues_by_state(sequences, annotations, label):
    """Tally residues by the annotation character found at the same position.

    Sequences and annotations are paired by their position in the two lists.
    For every pair of equal length, each residue is counted as disordered when
    the annotation character at the same index is anything other than "-",
    and as ordered when it is "-". Pairs whose lengths differ are reported on
    stdout and skipped.

    Parameters
    ----------
    sequences : list of str
        Residue strings, one per record.
    annotations : list of str
        Annotation strings, paired with ``sequences`` by list position.
    label : str
        Dataset name inserted into the error messages.

    Returns
    -------
    tuple of (defaultdict, defaultdict)
        ``(disorder_counts, order_counts)``, each mapping a residue character
        to the number of times it was seen in that state.
    """
    disorder_counts = defaultdict(int)
    order_counts = defaultdict(int)

    # Pairing by position is only meaningful when both lists have the same
    # number of records. Report a mismatch instead of failing with a bare
    # IndexError (too few annotations) or silently ignoring the surplus
    # (too many annotations); only the paired records are tallied.
    if len(sequences) != len(annotations):
        print("Error: number of sequences and annotations differ for", label, "dataset:",
              len(sequences), "sequences vs", len(annotations), "annotations")

    for i, (this_seq, this_annot) in enumerate(zip(sequences, annotations)):
        if len(this_seq) != len(this_annot):
            print("Error: sequence and annotation length mismatch for", label, "sequence", i)
            continue
        for residue, state in zip(this_seq, this_annot):
            if state != "-":
                disorder_counts[residue] += 1
            else:
                order_counts[residue] += 1

    return disorder_counts, order_counts


disprot_disorder_distrib_dict, disprot_order_distrib_dict = _count_residues_by_state(
    disprot_sequences, disprot_annotations, "disprot"
)

monomer_disorder_distrib_dict, monomer_order_distrib_dict = _count_residues_by_state(
    monomer_sequences, monomer_annotations, "monomer"
)

In [6]:
import json

# Output file name -> residue tally. The insertion order is the order in which
# the tallies are printed and then written.
distributions_by_file = {
    "disprot_disorder_distrib_dict.json": disprot_disorder_distrib_dict,
    "disprot_order_distrib_dict.json": disprot_order_distrib_dict,
    "monomer_order_distrib_dict.json": monomer_order_distrib_dict,
    "monomer_disorder_distrib_dict.json": monomer_disorder_distrib_dict,
}

# Show the raw counts first.
for distribution in distributions_by_file.values():
    print(distribution)

## write dictionaries to json
for json_path, distribution in distributions_by_file.items():
    with open(json_path, "w") as f:
        json.dump(distribution, f)

defaultdict(<class 'int'>, {'M': 3821, 'Q': 9339, 'R': 9875, 'G': 14592, 'N': 8027, 'F': 4730, 'K': 13346, 'E': 16201, 'A': 14951, 'L': 12092, 'I': 6058, 'W': 1257, 'P': 13251, 'S': 17279, 'Y': 3731, 'D': 11718, 'T': 10274, 'H': 3740, 'V': 8773, 'C': 1638, 'Z': 1})
defaultdict(<class 'int'>, {'M': 14810, 'G': 41374, 'A': 44377, 'R': 34195, 'S': 45756, 'V': 39505, 'L': 59063, 'E': 43459, 'D': 33692, 'W': 7413, 'K': 38011, 'I': 31689, 'P': 34852, 'Y': 19034, 'H': 14402, 'F': 24168, 'N': 26161, 'T': 32925, 'C': 10887, 'Q': 27832, 'U': 1, 'X': 1})
defaultdict(<class 'int'>, {'R': 29641, 'S': 36004, 'L': 54126, 'T': 31958, 'A': 46284, 'D': 35565, 'E': 39551, 'Y': 21425, 'K': 35487, 'I': 33111, 'Q': 22839, 'P': 26680, 'C': 7858, 'W': 8569, 'M': 11499, 'V': 41267, 'G': 41335, 'H': 13779, 'F': 24344, 'N': 25958, 'X': 286, 'B': 1})
defaultdict(<class 'int'>, {'S': 4209, 'D': 1968, 'T': 1829, 'M': 1806, 'A': 2842, 'Q': 1365, 'N': 1600, 'P': 1855, 'K': 2098, 'W': 162, 'C': 174, 'G': 3802, 'H': 49

In [8]:
# Percentage share of each residue among all positions counted in
# disprot_disorder_distrib_dict, printed in the dictionary's insertion order.
total = sum(disprot_disorder_distrib_dict.values())
for aa, count in disprot_disorder_distrib_dict.items():
    print(aa, count / total * 100)

M 2.06882735768352
Q 5.056471785764562
R 5.346681538111688
G 7.900635645987417
N 4.346107615840255
F 2.5609927772423577
K 7.226006259001375
E 8.771806339133919
A 8.095011207727376
L 6.547045383174331
I 3.2800199248486686
W 0.6805851841424193
P 7.174569829014478
S 9.35547446045892
Y 2.0200981082222484
D 6.344548279857494
T 5.562714544056655
H 2.024971033168376
V 4.750018950263679
C 0.8868723401951335
Z 0.0005414361051252342
