In [3]:
import json
import os
import subprocess
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import umap
from Bio import SeqIO
from matplotlib import cm
from matplotlib.patches import Patch

import mpl_stylesheet
# Configure plot styling through mpl_stylesheet's `banskt_presentation` preset
# (monospace font family, font size 20, 'banskt' colour set, 300 dpi).
# NOTE(review): presumably this updates matplotlib's global rcParams - confirm in mpl_stylesheet.
mpl_stylesheet.banskt_presentation(fontfamily = 'mono', fontsize = 20, colors = 'banskt', dpi = 300)

# UniProt accessions used further down to subset the DisProt sequences and annotations.
target_uniprots = [
    "P37840",
    "P04637",
    "P02686",
    "P07305",
    "O00488",
    "Q9NYB9",
    "P06401",
    "Q16186",
    "S6B291",
    "P23441",
]


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [4]:
### Load sequences and annotations for disprot

def get_sequences(fastadir=None, fastafile=None):
    """Load FASTA records from a directory of single-record files or from one multi-FASTA file.

    Exactly one of the two sources must be given.

    Parameters
    ----------
    fastadir : str or None
        Directory whose entries are each parsed as a FASTA file holding at most
        one record. Entries are visited in ``os.listdir`` order.
    fastafile : str or None
        Path to a FASTA file; every record in it is returned.

    Returns
    -------
    list
        The records yielded by ``SeqIO.parse``, in the order they were read.

    Raises
    ------
    RuntimeError
        If neither or both sources are given, or if a file inside ``fastadir``
        contains more than one record.
    """
    # RuntimeError is the type the former bare `raise` statements surfaced as
    # ("No active exception to reraise"); it is kept, now with a usable message.
    if fastadir is None and fastafile is None:
        raise RuntimeError("No fasta dir or file given: pass either fastadir or fastafile")
    if fastadir is not None and fastafile is not None:
        raise RuntimeError("Choose one: a fasta dir or a multi fasta file, not both")

    # A single multi-FASTA file: return all of its records.
    if fastafile is not None:
        return list(SeqIO.parse(fastafile, "fasta"))

    # A directory with individual FASTA files: one record per file is expected.
    sequences = []
    for f in os.listdir(fastadir):
        records = list(SeqIO.parse(os.path.join(fastadir, f), "fasta"))
        if len(records) > 1:
            raise RuntimeError(f"More than one fasta record in {f}")
        sequences.extend(records)
    return sequences

# DisProt inputs, given relative to the notebook's working directory:
#   - a directory that get_sequences reads as one FASTA record per file
#   - a single FASTA file holding the per-residue annotation strings
fasta_dir = "./disprot/fasta/"
annotfile = "./disprot/DisProt_release_2022_06_reformat_annot.fasta"
annotdir = None

# File names the target proteins would have inside fasta_dir.
# NOTE(review): not used by the calls below, which read the whole directory.
fasta_files = [accession + ".fasta" for accession in target_uniprots]
counter = 0

# Annotation records first, then the protein sequences.
annots = get_sequences(fastafile=annotfile)
sequences = get_sequences(fastadir=fasta_dir)

In [5]:
### Load monomer test proteins

# Monomers
# Monomer test set: sequences and their annotation strings, both read as multi-FASTA files.
fasta_file = "./disprot/monomers.fasta"
annotfile = "./disprot/monomers_annot.fasta"
annotdir  = None
mono_rannots    = get_sequences(fastadir=None, fastafile=annotfile)
mono_rsequences = get_sequences(fastadir=None, fastafile=fasta_file)

# Number of monomers to keep. The cap is applied to the records actually kept,
# so it no longer depends on where length mismatches fall in the file.
MAX_MONOMERS = 11

mono_seqs = list()
mono_annots = list()
mono_ids = list()

# Records are paired by position in the two files.
# NOTE(review): ids of the paired records are not compared - confirm both files share the same order.
for seq_record, annot_record in zip(mono_rsequences, mono_rannots):
    if len(mono_ids) >= MAX_MONOMERS:
        break
    aa_seq = str(seq_record.seq).upper()
    annot_seq = str(annot_record.seq)
    if len(aa_seq) != len(annot_seq):
        # Report the skipped record instead of dropping it silently.
        print("Seq length and annot do not match")
        print(len(annot_seq), seq_record.id, len(aa_seq))
        continue
    mono_ids.append(seq_record.id)
    mono_seqs.append(aa_seq)
    mono_annots.append(annot_seq)

In [7]:
### Load more detailed annotation for disprot

# `json` is already imported in the first cell, so it is not imported again here.
disprot_json_file = "./disprot/DisProt_release_2022_06_with_ambiguous_evidences.json"
with open(disprot_json_file, encoding="utf-8") as infmt:
    json_dict = json.load(infmt)

# Quick look at the structure: accession of the second entry under 'data'.
print(json_dict['data'][1]["acc"])

P49913


In [8]:
### Subset target test proteins
# Entries of the DisProt JSON whose accession is one of the targets, keyed by accession.
disprot_datadict = {
    entry["acc"]: entry
    for entry in json_dict['data']
    if entry["acc"] in target_uniprots
}

In [9]:
### Make sure seq and annotation lengths match for the target proteins

def _uniprot_id_from_name(record_name):
    """Extract the identifier from a FASTA record name.

    For names containing "|" the second "|"-separated field is used; otherwise
    the first whitespace-separated token. Returns "" when the name holds no token.
    """
    if "|" in record_name:
        return record_name.split("|")[1].strip()
    fields = record_name.split()
    return fields[0] if fields else ""


# Annotation string per identifier.
annot_dict = dict()
for record in annots:
    name = _uniprot_id_from_name(record.name)
    if name == "":
        print("Name is empty",record.name)
        continue
    annot_dict[name] = str(record.seq)

msequences  = list()
mannots     = list()
uniprots    = list()
counter = 0
for s in sequences:
    uniprot_id = _uniprot_id_from_name(s.name)
    if uniprot_id not in target_uniprots:
        continue
    print(uniprot_id)
    aa_sequence = str(s.seq).upper()
    annotation = annot_dict.get(uniprot_id)
    if annotation is None:
        # A target without an annotation record is reported and skipped
        # instead of stopping the cell with a KeyError.
        print("No annotation found for", uniprot_id)
        continue
    if len(aa_sequence) == len(annotation):
        uniprots.append(uniprot_id)
        msequences.append(aa_sequence)
        mannots.append(annotation)
        counter += 1
    else:
        print("Seq length and annot do not match")
        print(len(annotation), uniprot_id, len(aa_sequence))
print(f"Loaded {counter} proteins")

S6B291
P04637
P23441
Q16186
P06401
O00488
P02686
P37840
Q9NYB9
P07305
Loaded 10 proteins


In [10]:
### Explore some detailed annotations
# Index (into `uniprots`) of the protein to inspect.
i = 1
selected_entry = disprot_datadict[uniprots[i]]

print(selected_entry['disprot_consensus'])
print("##########")

# One block per annotated region: three descriptive fields, then the start/end positions.
region_fields = ("term_namespace", "disprot_namespace", "term_name")
for region in selected_entry['regions']:
    for field in region_fields:
        print(region[field])
    print(region["start"], region["end"])
    print("########")

print(mannots[i])
## Legend for the 'type' letters:
##   F: disorder Function, molecular Function
##   D: disordered
##   T: Structural Transition

{'full': [{'start': 1, 'end': 62, 'type': 'T'}, {'start': 63, 'end': 93, 'type': 'D'}, {'start': 291, 'end': 312, 'type': 'D'}, {'start': 359, 'end': 360, 'type': 'F'}, {'start': 361, 'end': 365, 'type': 'D'}, {'start': 366, 'end': 388, 'type': 'T'}, {'start': 389, 'end': 393, 'type': 'D'}], 'Structural state': [{'start': 1, 'end': 93, 'type': 'D'}, {'start': 291, 'end': 312, 'type': 'D'}, {'start': 361, 'end': 393, 'type': 'D'}], 'Molecular function': [{'start': 1, 'end': 93, 'type': 'F'}, {'start': 359, 'end': 393, 'type': 'F'}], 'Structural transition': [{'start': 1, 'end': 62, 'type': 'T'}, {'start': 366, 'end': 388, 'type': 'T'}], 'Disorder function': [{'start': 1, 'end': 61, 'type': 'F'}, {'start': 368, 'end': 372, 'type': 'F'}, {'start': 380, 'end': 384, 'type': 'F'}]}
##########
Structural state
Structural state
disorder
1 12
########
Structural state
Structural state
disorder
1 93
########
Structural state
Structural state
disorder
291 312
########
Structural state
Structural 

In [11]:
### Secondary structure annotation

## Get PDBs and calculate Secondary structure for each using mkdssp
# what a pain to install!!

## Info about DSSP output
# https://pdb-redo.eu/dssp/about


pdb_dir = os.path.join("disprot", "pdbs")
os.makedirs(pdb_dir, exist_ok=True)  # the download below needs the directory to exist

for pdbidchain in mono_ids:
    pdbid = pdbidchain[:4]
    chain = pdbidchain[4]
    print(pdbid, chain)
    pdbfile = os.path.join(pdb_dir, pdbid + ".pdb")
    dsspfile = os.path.join(pdb_dir, pdbid + ".dssp")

    if not os.path.exists(pdbfile):
        res = requests.get(f"https://files.rcsb.org/download/{pdbid}.pdb", timeout=60)
        # Fail here rather than saving an HTTP error page as a PDB file.
        res.raise_for_status()
        with open(pdbfile, 'wb') as outfmt:
            outfmt.write(res.content)
    else:
        print(f"File exists: {pdbfile}")

    # Checked independently of the download, so a cached PDB whose DSSP output
    # is missing still gets processed.
    if not os.path.exists(dsspfile):
        # Argument list instead of a shell string: file names are passed to mkdssp verbatim.
        result = subprocess.run(["mkdssp", "--output-format", "dssp", pdbfile, dsspfile])
        if result.returncode != 0:
            print(f"mkdssp failed for {pdbfile} (exit code {result.returncode})")

1AE9 A
File exists: disprot/pdbs/1AE9.pdb
1AH7 A
File exists: disprot/pdbs/1AH7.pdb
1AHO A
File exists: disprot/pdbs/1AHO.pdb
1AOC A
File exists: disprot/pdbs/1AOC.pdb
1AOL A
File exists: disprot/pdbs/1AOL.pdb
1AQZ A
File exists: disprot/pdbs/1AQZ.pdb
1ATG A
File exists: disprot/pdbs/1ATG.pdb
1ATZ A
File exists: disprot/pdbs/1ATZ.pdb
1AYO B
File exists: disprot/pdbs/1AYO.pdb
1AZO A
File exists: disprot/pdbs/1AZO.pdb
1B9W A
File exists: disprot/pdbs/1B9W.pdb


In [12]:
## Code to parse mkdssp output, not very optimal

def offset_loop(this_annot, counter, offset):
    """Advance ``offset`` over the run of "D" annotations that follows the current position.

    Starting from position ``counter + offset``, ``offset`` is increased by one for
    every consecutive "D" found at the next position. The scan stops at the first
    non-"D" character or at the end of ``this_annot``.

    Parameters
    ----------
    this_annot : str
        Per-residue annotation string.
    counter : int
        Number of DSSP records consumed so far by the caller.
    offset : int
        Current shift between DSSP records and annotation positions.

    Returns
    -------
    tuple
        ``(offset, ss_list)``: the updated offset and a list of "-" placeholders,
        one for the starting position plus one per "D" that was skipped.
    """
    dlen = 1
    annot_len = len(this_annot)
    # The bound check lets a "D" run that reaches the end of the annotation
    # terminate instead of raising IndexError.
    while counter + offset + 1 < annot_len and this_annot[counter + offset + 1] == "D":
        offset += 1
        dlen += 1
    return offset, ["-"] * dlen

def parseDSSP(lines, targ_chain, this_seq, this_annot, debug=False):
    """Align the secondary-structure column of a DSSP residue table to a reference sequence.

    Lines are ignored until a header line whose first non-blank character is "#".
    Every following line is read as a fixed-width record: chain id at column 11,
    residue letter at 13, chain-break flag at 14 and secondary-structure code at 16.
    Only the records of ``targ_chain`` are used; records of chains listed before it
    are skipped, and parsing stops at the "*" break record that ends the target chain.

    Parameters
    ----------
    lines : list of str
        Lines of a DSSP output file.
    targ_chain : str
        Chain identifier to extract.
    this_seq : str
        Reference amino-acid sequence the DSSP residues are compared against.
    this_annot : str
        Per-residue annotation string; "D" marks positions that may be absent
        from the DSSP table.
    debug : bool
        Print a trace of every record and alignment decision.

    Returns
    -------
    tuple
        ``(ss_seq, aa_seq)``: lists of secondary-structure codes and residue letters
        ("-" for positions filled in from the annotation). ``ss_seq`` is padded with
        "-" at the end when it is shorter than ``this_seq``.

    Raises
    ------
    RuntimeError
        If a residue of another chain appears inside the target chain, or a "!"
        gap record is not followed by a "D" annotation.
    """
    flagstart = False
    chain_started = False
    ss_seq = list()
    aa_seq = list()
    offset = 0
    counter = 0
    # NOTE: `targ_chain` used to be overwritten with "A" here, so the first chain of
    # the file was parsed whatever the caller asked for. The argument is now honoured.
    for line in lines:
        if not flagstart:
            # The residue table starts after the "#" header line.
            if line.strip().startswith("#"):
                flagstart = True
            continue
        # Lines read from a file keep their newline, so test the stripped content.
        if line.strip() == "":
            break
        resnum = line[:5]
        pdbresnum = line[5:10]
        chain = line[11]
        resname = line[13]
        ss = line[16]
        end = line[14]
        if debug:
            print(f"--> counter:{counter}, offset:{offset}",resnum, pdbresnum, chain, resname, ss, "||", line[1:17])
        if end == "*":
            # Chain-break record: ends the target chain, or separates chains listed before it.
            if chain_started:
                break
            continue
        if not chain_started:
            if resname == "!" or chain != targ_chain:
                # Still inside a chain that precedes the target one.
                continue
            chain_started = True
        if resname != "!":
            if chain != targ_chain:
                raise RuntimeError(
                    f"PANIC: residue of chain {chain!r} found while parsing chain {targ_chain!r} "
                    f"(record {line[1:17]!r})"
                )
            if resname == this_seq[counter+offset]:
                if ss == " ":
                    ss = "-"
                ss_seq.append(ss)
                aa_seq.append(resname)
                if debug:
                    print(f"---> {resname} === {this_seq[counter+offset]}")
            elif this_annot[counter+offset] == "D":
                if debug:
                    print(f" {resname} != {this_seq[counter+offset]} -->OFFSET LOOP: counter:{counter}, offset:{offset}")
                # Check the length before indexing the next annotation position.
                nxt = counter + offset + 1
                if nxt < len(this_annot) and this_annot[nxt] == "D":
                    offset, ss_list = offset_loop(this_annot, counter, offset)
                    if debug:
                        print(f"END OFFSET LOOP: counter:{counter}, offset:{offset}")
                    for s in ss_list:
                        ss_seq.append(s)
                        aa_seq.append("-")
                    if counter == 0:
                        # Disordered run at the very start: step past it and retry this residue.
                        offset += 1
                        if resname == this_seq[counter+offset]:
                            if ss == " ":
                                ss = "-"
                            ss_seq.append(ss)
                            aa_seq.append(resname)
            else:
                # Residue differs from the reference at a non-"D" position: keep it, without a code.
                ss = "-"
                ss_seq.append(ss)
                aa_seq.append(resname)
                if debug:
                    print(f"MISMATCH?? {resname} ?? {this_seq[counter+offset]}")
        else:
            # "!" record: a gap inside the chain.
            if debug:
                print(f"resname == '!' --> counter:{counter}, offset:{offset} '-' {this_seq[counter+offset]}")
            nxt = counter + offset + 1
            if nxt < len(this_annot) and this_annot[nxt] == "D":
                offset, ss_list = offset_loop(this_annot, counter, offset)
                for s in ss_list:
                    ss_seq.append(s)
                    aa_seq.append("-")
            else:
                raise RuntimeError(
                    f"PANIC: gap record at counter {counter}, offset {offset} "
                    "is not followed by a 'D' annotation"
                )
        counter += 1

    if not chain_started:
        print(f"Chain {targ_chain} not found in DSSP output")

    diff = len(ss_seq) - len(this_seq)
    if diff > 0:
        print("DSSP is longer than seq?")
    if diff < 0:
        print(f"DSSP is missing some residues? missing:{diff}")
        ## If it's missing some residue ss, just add it to the end. This is not very good
        ss_seq.extend(["-"] * -diff)
    return ss_seq, aa_seq

In [13]:
## parse output files
## Parse secondary structure
# make sure sequences matches
# One secondary-structure string per monomer, in the same order as mono_ids.
mono_dssps = []
for idx in range(len(mono_ids)):
    entry_id = mono_ids[idx]
    pdb_code = entry_id[:4]
    dssp_path = os.path.join("disprot", "pdbs", pdb_code + ".dssp")
    with open(dssp_path) as handle:
        dssp_lines = handle.readlines()
    chain_id = entry_id[4]
    print(pdb_code, chain_id)
    ss_letters, _aligned_residues = parseDSSP(dssp_lines, chain_id, mono_seqs[idx], mono_annots[idx])
    mono_dssps.append("".join(ss_letters))

1AE9 A
1AH7 A
1AHO A
1AOC A
1AOL A
DSSP is missing some residues? missing:-1
1AQZ A
1ATG A
1ATZ A
DSSP is missing some residues? missing:-2
1AYO B
1AZO A
1B9W A
DSSP is missing some residues? missing:-4


In [14]:
## Check ss and seq len are the same at least
# Iterate over whatever was loaded rather than a fixed count of 11.
for mono_id, mono_seq, mono_dssp in zip(mono_ids, mono_seqs, mono_dssps):
    print(f"### {mono_id}", end=" ")
    if len(mono_seq) == len(mono_dssp):
        print("OK")
    else:
        print("seqlen:", len(mono_seq),"dssp_len:", len(mono_dssp))

### 1AE9A OK
### 1AH7A OK
### 1AHOA OK
### 1AOCA OK
### 1AOLA OK
### 1AQZA OK
### 1ATGA OK
### 1ATZA OK
### 1AYOB OK
### 1AZOA OK
### 1B9WA OK


In [15]:
## Some debugging
# i = 9
# pdbidchain = mono_ids[i]
# pdbid = pdbidchain[:4]
# print(pdbidchain)
# dsspfile = os.path.join("disprot","pdbs",pdbid+".dssp")
# with open(dsspfile) as infmt:
#     lines = infmt.readlines()
# ss_seq, _aa_seq = parseDSSP(lines, pdbidchain[4], mono_seqs[i], mono_annots[i], debug=True)

# print(f"### {mono_ids[i]}")
# print(mono_annots[i])
# print("".join(mono_dssps[i]))
# print("seqlen:", len(mono_seqs[i]),"dssp_len:", len("".join(mono_dssps[i])))

In [16]:
### Obtain SS annotations from PDB website

# Per monomer: a list of [type, begin, end] regions taken from the RCSB feature records.
ssannots_list = list()
for pdbidchain in mono_ids:
    pdbid = pdbidchain[:4]
    chain = pdbidchain[4]
    print(pdbid, chain)
    res = requests.get(
        f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdbid.upper()}/{chain.upper()}",
        timeout=60,
    )
    # Stop on an HTTP error instead of parsing the error body.
    res.raise_for_status()
    datadict = res.json()

    # Named `ss_annots` so the global `annots` (annotation records used when matching
    # sequences to annotations) is not overwritten by this cell.
    ss_annots = list()
    for e in datadict.get("rcsb_polymer_instance_feature", []):
        source = e.get('provenance_source')
        if source == "PROMOTIF":
            # Secondary-structure elements.
            ss_type = e['type']
            for ee in e['feature_positions']:
                ss_annots.append([ss_type, ee["beg_seq_id"], ee["end_seq_id"]])
        elif source == "PDB" and e["type"] == "UNOBSERVED_RESIDUE_XYZ":
            # Residues without coordinates.
            for ee in e['feature_positions']:
                ss_annots.append(["MISSING", ee["beg_seq_id"], ee["end_seq_id"]])
    ssannots_list.append(ss_annots)

1AE9 A
1AH7 A
1AHO A
1AOC A
1AOL A
1AQZ A
1ATG A
1ATZ A
1AYO B
1AZO A
1B9W A


In [17]:
## Change the annotation from regions to letter sequences
def _regions_to_ss_string(seq_len, regions):
    """Turn [type, begin, end] regions into a per-residue letter string of length ``seq_len``.

    Types starting with "HELIX" are written as "H", types starting with "SHEET"
    as "B", and every other type as "-". ``begin`` and ``end`` are treated as
    1-based, inclusive sequence ids (RCSB ``beg_seq_id`` / ``end_seq_id``), so they
    map to the 0-based positions ``begin - 1 .. end - 1``; the range is clipped to
    the sequence length.
    NOTE(review): assumes the sequences use the same numbering as the RCSB seq ids - confirm.
    """
    letters = ["-"] * seq_len
    for ss_type, begin, end in regions:
        letter = "-"
        if ss_type.startswith("HELIX"):
            letter = "H"
        if ss_type.startswith("SHEET"):
            letter = "B"
        first = max(int(begin) - 1, 0)
        last = min(int(end), seq_len)
        for pos in range(first, last):
            letters[pos] = letter
    return "".join(letters)


mono_easydssp = [
    _regions_to_ss_string(len(mono_seq), regions)
    for mono_seq, regions in zip(mono_seqs, ssannots_list)
]

In [18]:
# Comparison with the mkdssp output: the two are broadly similar. DSSP gives more
# detailed per-residue information, but the overall ranges match.

# for i in range(len(mono_ids)):
#     print(mono_dssps[i])
#     print(mono_easydssp[i])
#     print("~~~")

In [31]:
# Per monomer id: sequence, secondary-structure string and disorder annotation.
data_dict = {
    prot: {
        'seq': mono_seqs[idx],
        'ss': mono_easydssp[idx],
        'disorder': mono_annots[idx],
    }
    for idx, prot in enumerate(mono_ids)
}

with open("monomer_datadict.json", 'w') as outfmt:
    json.dump(data_dict, outfmt)