# **Motifs**

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
data_dir = '/content/drive/MyDrive/BioInformatics/final'

In [3]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [4]:
from Bio import SeqIO
from Bio.Blast import NCBIXML
import pandas as pd
import requests

In [8]:
# Install clustalo
!apt-get -qq update
!apt-get -qq install -y clustalo

# Input: verified family FASTA
input_fasta = f"{data_dir}/data/O43099_verified_family.fasta"

# Output: MSA file (NEW file!)
msa_output = f"{data_dir}/data/O43099_verified_family_msa.fasta"

# Run Clustal Omega
!clustalo -i "{input_fasta}" -o "{msa_output}" --force --outfmt=fasta --output-order=input-order

print("MSA written to:", msa_output)


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
✅ MSA written to: /content/drive/MyDrive/BioInformatics/final/data/O43099_verified_family_msa.fasta


In [11]:
#look into new fasta
msa_output = f"{data_dir}/data/O43099_verified_family_msa.fasta"

# Show the first ~60 lines of the alignment file
!head -n 60 "{msa_output}"


>O43099_TIR
KAGDSFPSDVVFSYIPWSEDKGEITACGIPINYNASKEWADKKVILFALPGAFTPVCSAR
HVPEYIEKLPEIRAKGVDVVAVLAYNDAYVMSAWGKANQVTGDDILFLSDPDARFSKSIG
WADEEGRTKRYALVIDHGKITYAALEPAKNHLEFSSAETVL
>UR50:UniRef50_O43099 Peroxiredoxin Asp f3 n=482 Tax=Eukaryota TaxID=2759 RepID=PRX5_ASPFU
KAGDSFPSDVVFSYIPWSEDKGEITACGIPINYNASKEWADKKVILFALPGAFTPVCSAR
HVPEYIEKLPEIRAKGVDVVAVLAYNDAYVMSAWGKANQVTGDDILFLSDPDARFSKSIG
WADEEGRTKRYALVIDHGKITYAALEPAKNHLEFSSAETVL
>UR50:UniRef50_P56577 Putative peroxiredoxin n=32 Tax=Ustilaginomycotina TaxID=452284 RepID=MALF2_MALFU
--GNEIPDT-LMGYIPWTPELDSGEVCGIPTTFKTRDEWKGKKVVIVSIPGAYTPICHQQ
HIPPLVKRVDELKAKGVDAVYVIASNDPFVMAAWGNFN-NAKDKVVFATDIDLAFSKALG
ATIDLSRTARYALIIDDNKIVFASDEGDTGKLQNASIDTIL
>UR50:UniRef50_P56578 Putative peroxiredoxin (Fragment) n=56 Tax=Dikarya TaxID=451864 RepID=MALF3_MALFU
--GSTIPNA-TFAYVPYSPELEDHKVCGMPTSFQSHERWKGKKVVIVAVPGAFTPTCTAN
HVPPYVEKIQELKSKGVDEVVVISANDPFVLSAWGITE-HAKDNLTFAQDVNCEFSKHFN
ATSSKGRTARYALIANDLKVEYFGIDE--GEPKQSSAATVL
>UR50:UniRef50_Q69TY4 Peroxiredox

In [13]:
#Remove the gaps and make this more searchable
from collections import Counter

msa_output = f"{data_dir}/data/O43099_verified_family_msa.fasta"

# read aligned fasta
seqs = []
with open(msa_output) as f:
    cur = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if cur:
                seqs.append("".join(cur))
                cur = []
        else:
            cur.append(line)
    if cur:
        seqs.append("".join(cur))

L = len(seqs[0])
assert all(len(s) == L for s in seqs), "Alignment lengths differ — something went wrong."

# build consensus (ignore gaps when choosing the most common aa)
cons = []
cons_pct = []
for i in range(L):
    col = [s[i] for s in seqs if s[i] != "-"]
    if not col:
        cons.append("-")
        cons_pct.append(0.0)
        continue
    c = Counter(col)
    aa, n = c.most_common(1)[0]
    cons.append(aa)
    cons_pct.append(n / len(col))

consensus = "".join(cons)

print("Alignment length:", L)
print("Num sequences:", len(seqs))
print("\nConsensus (first 200 aa):")
print(consensus[:200])

# show top conserved positions (excluding gaps-only)
top = sorted([(i, p, consensus[i]) for i,p in enumerate(cons_pct) if consensus[i] != "-"],
             key=lambda x: x[1], reverse=True)[:30]
print("\nTop conserved positions (0-based index, conservation, aa):")
for i,p,aa in top:
    print(i, f"{p:.2f}", aa)


Alignment length: 161
Num sequences: 20

Consensus (first 200 aa):
KVGDAIPSDEVFEYIPWSKDKGEITVCGIPINVNLSELFAGKKVVLFGVPGAFTPTCSQKHVPGYIEKAPELKAKGVDVVACLSVNDPFVMKAWGKALQGAKDKVKFLSDGNGEFTKSLGLLDDKGRSKRYALVVDDGKVKYANIEPGGGEFTVSSAETIL

Top conserved positions (0-based index, conservation, aa):
0 1.00 K
2 1.00 G
6 1.00 P
29 1.00 P
41 1.00 K
49 1.00 P
51 1.00 A
56 1.00 C
109 1.00 D
50 0.95 G
60 0.95 H
62 0.95 P
75 0.95 G
86 0.95 D
93 0.95 W
126 0.95 R
129 0.95 R
145 0.95 E
15 0.90 P
42 0.90 K
53 0.90 T
54 0.90 P
85 0.90 N
154 0.89 S
156 0.89 A
160 0.89 L
52 0.85 F
88 0.85 F
26 0.83 C
74 0.80 K


In [15]:
from collections import Counter

msa_output = f"{data_dir}/data/O43099_verified_family_msa.fasta"

# read aligned fasta
seqs = []
with open(msa_output) as f:
    cur = []
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if cur:
                seqs.append("".join(cur))
                cur = []
        else:
            cur.append(line)
    if cur:
        seqs.append("".join(cur))

L = len(seqs[0])

# per-position conservation (ignore gaps)
cons = []
cons_pct = []
for i in range(L):
    col = [s[i] for s in seqs if s[i] != "-"]
    if not col:
        cons.append("-")
        cons_pct.append(0.0)
        continue
    c = Counter(col)
    aa, n = c.most_common(1)[0]
    cons.append(aa)
    cons_pct.append(n / len(col))

consensus = "".join(cons)

def window_score(start, w):
    vals = cons_pct[start:start+w]
    # exclude windows with lots of gaps in consensus (means alignment is sparse there)
    gap_frac = sum(1 for x in consensus[start:start+w] if x == "-") / w
    return sum(vals)/w, gap_frac

results = []
for w in range(3, 13):  # 3..12 aa
    for start in range(0, L-w+1):
        score, gap_frac = window_score(start, w)
        if gap_frac <= 0.2:  # keep mostly non-gap windows
            results.append((score, w, start, consensus[start:start+w]))

# show top windows (deduplicate by same sequence+start)
results.sort(reverse=True)
seen = set()
top = []
for score, w, start, seq in results:
    key = (start, w, seq)
    if key in seen:
        continue
    seen.add(key)
    top.append((score, w, start, seq))
    if len(top) == 25:
        break

print("Top conserved windows (score, length, start_index, consensus_window):")
for score, w, start, seq in top:
    print(f"{score:.2f}", w, start, seq)


Top conserved windows (score, length, start_index, consensus_window):
0.98 3 49 PGA
0.95 4 49 PGAF
0.94 5 49 PGAFT
0.93 6 49 PGAFTP
0.93 3 50 GAF
0.93 4 50 GAFT
0.92 5 50 GAFTP
0.92 3 51 AFT
0.91 4 51 AFTP
0.89 8 49 PGAFTPTC
0.88 3 52 FTP
0.87 9 49 PGAFTPTCS
0.87 7 50 GAFTPTC
0.87 7 49 PGAFTPT
0.87 7 48 VPGAFTP
0.87 6 48 VPGAFT
0.87 3 41 KKV
0.86 4 48 VPGA
0.86 3 40 GKK
0.86 5 48 VPGAF
0.86 6 51 AFTPTC
0.86 8 50 GAFTPTCS
0.85 6 50 GAFTPT
0.85 3 84 VND
0.84 9 48 VPGAFTPTC


In [23]:
import re
from Bio import SeqIO

input_fasta = f"{data_dir}/data/O43099_verified_family.fasta"

def extract_accession(header: str) -> str:
    # 1) UniRef50 headers: UR50:UniRef50_P30044 -> P30044
    m = re.search(r"UniRef50_([A-Z0-9]{6,10})\b", header)
    if m:
        return m.group(1)

    # 2) If it’s like "sp|P30044|..." or "tr|Q9XXXX|..."
    m = re.search(r"\b(?:sp|tr)\|([A-Z0-9]{6,10})\|", header)
    if m:
        return m.group(1)

    # 3) First token before whitespace, e.g. "O43099_TIR" -> "O43099"
    first = header.split()[0]
    first = first.split("_")[0]
    if re.fullmatch(r"[A-Z0-9]{6,10}", first):
        return first

    return None

accs = []
for r in SeqIO.parse(input_fasta, "fasta"):
    acc = extract_accession(r.description)
    if acc is None:
        print("⚠️ Could not extract accession from:", r.description)
    else:
        accs.append(acc)

# unique, keep order
accs = list(dict.fromkeys(accs))
print("Accessions found:", accs)
print("Count:", len(accs))


✅ Accessions found: ['O43099', 'P56577', 'P56578', 'Q69TY4', 'Q949U7', 'P99029', 'Q9XEX2', 'P14293', 'P73728', 'A9PCL4', 'Q9BGI1', 'P30044', 'P38013', 'B3EWI1', 'Q9SDD6', 'Q9M7T0', 'P44758', 'O14313', 'Q7G959']
Count: 19


In [24]:
import time
import pandas as pd
import requests

data_dir = '/content/drive/MyDrive/BioInformatics/final'
out_tsv = f"{data_dir}/data/O43099_family_iupred2a_long_api.tsv"

accs = ['O43099', 'P56577', 'P56578', 'Q69TY4', 'Q949U7', 'P99029', 'Q9XEX2', 'P14293',
        'P73728', 'A9PCL4', 'Q9BGI1', 'P30044', 'P38013', 'B3EWI1', 'Q9SDD6', 'Q9M7T0',
        'P44758', 'O14313', 'Q7G959']

rows = []
failed = []

for acc in accs:
    url = f"https://iupred2a.elte.hu/iupred2a/long/{acc}.json"
    r = requests.get(url, timeout=60)
    if r.status_code != 200:
        failed.append((acc, r.status_code))
        continue

    js = r.json()
    seq = js.get("sequence")
    scores = js.get("iupred") or js.get("iupred2")

    if seq is None or scores is None:
        failed.append((acc, "bad_json"))
        continue

    for i, (aa, sc) in enumerate(zip(seq, scores), start=1):
        rows.append({"accession": acc, "pos": i, "aa": aa, "iupred_long": float(sc)})

    time.sleep(0.2)

df = pd.DataFrame(rows)
df.to_csv(out_tsv, sep="\t", index=False)

print("Wrote:", out_tsv)
print("Rows:", len(df), "Proteins:", df["accession"].nunique())
print("Failed:", failed[:10], ("... (more)" if len(failed) > 10 else ""))
df.head(10)


✅ Wrote: /content/drive/MyDrive/BioInformatics/final/data/O43099_family_iupred2a_long_api.tsv
Rows: 4072 Proteins: 19
Failed: [] 


Unnamed: 0,accession,pos,aa,iupred_long
0,O43099,1,M,0.216401
1,O43099,2,S,0.247041
2,O43099,3,G,0.308712
3,O43099,4,L,0.236433
4,O43099,5,K,0.182256
5,O43099,6,A,0.098513
6,O43099,7,G,0.120615
7,O43099,8,D,0.074921
8,O43099,9,S,0.050641
9,O43099,10,F,0.078022


In [25]:
#identifying disordered regions
import pandas as pd

data_dir = '/content/drive/MyDrive/BioInformatics/final'
infile = f"{data_dir}/data/O43099_family_iupred2a_long_api.tsv"

df = pd.read_csv(infile, sep="\t")

threshold = 0.5

intervals = []

for acc, group in df.groupby("accession"):
    group = group.sort_values("pos")
    in_disorder = False
    start = None

    for _, row in group.iterrows():
        if row["iupred_long"] > threshold:
            if not in_disorder:
                start = row["pos"]
                in_disorder = True
        else:
            if in_disorder:
                end = prev_pos
                intervals.append({"accession": acc, "start": start, "end": end})
                in_disorder = False
        prev_pos = row["pos"]

    # close if disorder until end
    if in_disorder:
        intervals.append({"accession": acc, "start": start, "end": prev_pos})

dis_df = pd.DataFrame(intervals)

print("Number of disordered regions:", len(dis_df))
print("\nFirst few disordered intervals:")
print(dis_df.head(10))


Number of disordered regions: 23

First few disordered intervals:
  accession  start  end
0    B3EWI1      1    2
1    B3EWI1      4    4
2    B3EWI1     12   13
3    B3EWI1    112  112
4    B3EWI1    150  150
5    B3EWI1    164  164
6    B3EWI1    166  166
7    P14293      5    5
8    P44758      1    1
9    P44758      3    3


These Results show we only have very few disoredered Regions.

In [26]:
#So here we filter for longer disordered regions
# Filter intervals >= 10 residues
long_disorder = dis_df.copy()
long_disorder["length"] = long_disorder["end"] - long_disorder["start"] + 1

long_disorder = long_disorder[long_disorder["length"] >= 10]

print("Number of long disorder regions (>=10 aa):", len(long_disorder))
print("\nLong disorder regions:")
print(long_disorder)


Number of long disorder regions (>=10 aa): 2

Long disorder regions:
   accession  start  end  length
13    P56577      1   11      11
18    Q7G959    544  553      10


We got 2 disordered regions, that were now gonna extract the actual disordered sequences for those regions

In [27]:
from Bio import SeqIO

data_dir = '/content/drive/MyDrive/BioInformatics/final'
input_fasta = f"{data_dir}/data/O43099_verified_family.fasta"

# load sequences by accession
seq_dict = {}

def extract_accession(header: str) -> str:
    import re
    m = re.search(r"UniRef50_([A-Z0-9]{6,10})\b", header)
    if m: return m.group(1)
    m = re.search(r"\b(?:sp|tr)\|([A-Z0-9]{6,10})\|", header)
    if m: return m.group(1)
    first = header.split()[0].split("_")[0]
    if re.fullmatch(r"[A-Z0-9]{6,10}", first): return first
    return None

for r in SeqIO.parse(input_fasta, "fasta"):
    acc = extract_accession(r.description)
    if acc:
        seq_dict[acc] = str(r.seq)

# your long disorder intervals
long_regions = [
    ("P56577", 1, 11),
    ("Q7G959", 544, 553)
]

for acc, start, end in long_regions:
    seq = seq_dict.get(acc)
    if seq is None:
        print(f"sequence not found for {acc}")
        continue
    frag = seq[start-1:end]  # 1-based -> 0-based
    print(f"{acc}  {start}-{end}  (len={len(frag)}): {frag}")


P56577  1-11  (len=11): --GNEIP-DTL
Q7G959  544-553  (len=0): 


In [28]:
#Fetchin the uniprot full accessions for these two regions
import requests

def fetch_uniprot_fasta(acc):
    url = f"https://rest.uniprot.org/uniprotkb/{acc}.fasta"
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    return r.text

def parse_fasta(fasta_text):
    lines = fasta_text.strip().splitlines()
    header = lines[0]
    seq = "".join(lines[1:]).replace(" ", "").replace("\n", "")
    return header, seq

# long disorder intervals (from your IUPred API table)
regions = {
    "P56577": (1, 11),
    "Q7G959": (544, 553),
}

for acc, (start, end) in regions.items():
    fasta = fetch_uniprot_fasta(acc)
    header, seq = parse_fasta(fasta)
    frag = seq[start-1:end]  # 1-based -> 0-based
    print(header)
    print(f"{acc} length = {len(seq)}")
    print(f"{acc} {start}-{end} (len={len(frag)}): {frag}")
    print("-"*60)


>sp|P56577|MALF2_MALFU Putative peroxiredoxin OS=Malassezia furfur OX=55194 PE=1 SV=1
P56577 length = 177
P56577 1-11 (len=11): MPGDPTATAKG
------------------------------------------------------------
>sp|Q7G959|PRX2A_ARATH Peroxiredoxin-2A OS=Arabidopsis thaliana OX=3702 GN=PRXIIA PE=2 SV=1
Q7G959 length = 553
Q7G959 544-553 (len=10): TKEQSDYKRH
------------------------------------------------------------


Now  we have to scan these fragments for PROSITE patterns (PA lines only)

In [31]:
import re
import requests

# Your two disordered fragments (from UniProt sequences, ungapped)
fragments = {
    "P56577_disorder_1_11": "MPGDPTATAKG",
    "Q7G959_disorder_544_553": "TKEQSDYKRH",
}

# 1) Download PROSITE patterns data
prosite_url = "https://ftp.expasy.org/databases/prosite/prosite.dat"
txt = requests.get(prosite_url, timeout=60).text
print("Downloaded prosite.dat characters:", len(txt))

# 2) Parse entries, keep only PA patterns
# Prosite entry format: ID, AC, DE, PA (pattern), ...
entries = []
current = {"id": None, "ac": None, "de": None, "pa": []}

for line in txt.splitlines():
    tag = line[:2]
    content = line[5:].strip()

    if line.startswith("//"):
        if current["pa"]:  # only entries with PA lines
            entries.append(current)
        current = {"id": None, "ac": None, "de": None, "pa": []}
        continue

    if tag == "ID":
        current["id"] = content.split()[0]
    elif tag == "AC":
        current["ac"] = content.replace(";", "").strip()
    elif tag == "DE":
        current["de"] = content
    elif tag == "PA":
        # PA lines can span multiple lines; store raw pattern text (without trailing '.')
        current["pa"].append(content)

print("Entries with PA patterns:", len(entries))

# 3) Convert PROSITE pattern -> Python regex
# PROSITE patterns basics:
#  - x = any aa
#  - [ABC] = one of
#  - {ABC} = any except
#  - A(2) = AA
#  - x(3,5) = .{3,5}
#  - '-' separators are removed
def prosite_to_regex(pat: str) -> str:
    pat = pat.strip()
    if pat.endswith("."):
        pat = pat[:-1]
    pat = pat.replace("-", "")

    # handle repeats like A(2) or x(3,5)
    def repl_repeat(m):
        token = m.group(1)
        rep = m.group(2)
        if token == "x":
            base = "."
        else:
            base = re.escape(token)
        if "," in rep:
            a,b = rep.split(",")
            return f"{base}{{{int(a)},{int(b)}}}"
        else:
            return f"{base}{{{int(rep)}}}"

    pat = re.sub(r"(x|[A-Z])\((\d+(?:,\d+)?)\)", repl_repeat, pat)

    # x -> .
    pat = pat.replace("x", ".")

    #
    pat = re.sub(r"\{([A-Z]+)\}", r"[^\1]", pat)

    # Anchor full regex (we'll search, not fullmatch)
    return pat

# 4) Scan fragments for PA pattern matches
hits = []
for ent in entries:
    for raw_pat in ent["pa"]:
        rx = prosite_to_regex(raw_pat)
        try:
            cre = re.compile(rx)
        except re.error:
            continue

        for frag_id, seq in fragments.items():
            for m in cre.finditer(seq):
                hits.append({
                    "fragment": frag_id,
                    "match": m.group(0),
                    "start": m.start()+1,  # 1-based within fragment
                    "end": m.end(),        # 1-based end
                    "prosite_id": ent["id"],
                    "prosite_ac": ent["ac"],
                    "description": ent["de"],
                    "pattern": raw_pat
                })

print("PROSITE PA hits found:", len(hits))
# Show up to 20 hits
for h in hits[:20]:
    print(f"{h['fragment']}: {h['match']} ({h['start']}-{h['end']})  {h['prosite_id']} {h['prosite_ac']}  {h['description']}")


✅ Downloaded prosite.dat characters: 25164461
✅ Entries with PA patterns: 1311
✅ PROSITE PA hits found: 379
P56577_disorder_1_11: TAK (8-10)  PKC_PHOSPHO_SITE; PS00005  Protein kinase C phosphorylation site.
P56577_disorder_1_11: PTATAK (5-10)  PHOSPHOPANTETHEINE; PS00012  Phosphopantetheine attachment site.
P56577_disorder_1_11: TA (6-7)  PHOSPHOPANTETHEINE; PS00012  Phosphopantetheine attachment site.
P56577_disorder_1_11: TA (8-9)  PHOSPHOPANTETHEINE; PS00012  Phosphopantetheine attachment site.
Q7G959_disorder_544_553: DY (6-7)  EF_HAND_1; PS00018  EF-hand calcium-binding domain.
P56577_disorder_1_11: M (1-1)  HEMOPEXIN; PS00024  Hemopexin domain signature.
Q7G959_disorder_544_553: Y (7-7)  HEMOPEXIN; PS00024  Hemopexin domain signature.
Q7G959_disorder_544_553: KR (8-9)  NUCLEAR_REC_DBD_1; PS00031  Nuclear hormones receptors DNA-binding region signature.
P56577_disorder_1_11: M (1-1)  ADH_SHORT; PS00061  Short-chain dehydrogenases/reductases family signature.
P56577_disorder_1_11:

Now we filter these PROSITE hits down to ones that are actually meaningful for your family

In [32]:
# Put your hits into a DataFrame (reusing the 'hits' list from the previous cell)
import pandas as pd

hits_df = pd.DataFrame(hits)
hits_df["match_len"] = hits_df["match"].str.len()

# Filter: keep only matches length >= 4
filt = hits_df[hits_df["match_len"] >= 4].copy()

print("All hits:", len(hits_df))
print("Hits with match_len >= 4:", len(filt))

# Show unique remaining hits (fragment + prosite + match)
show = (filt[["fragment","match","start","end","match_len","prosite_id","prosite_ac","description","pattern"]]
        .drop_duplicates()
        .sort_values(["fragment","match_len"], ascending=[True, False]))

print("\nFiltered hits (unique):")
print(show.to_string(index=False))


All hits: 379
Hits with match_len >= 4: 29

Filtered hits (unique):
               fragment   match  start  end  match_len            prosite_id prosite_ac                                                         description                                                                pattern
   P56577_disorder_1_11 MPGDPTA      1    7          7    SUGAR_TRANSPORT_1;    PS00216                               Sugar transport proteins signature 1.                                                         x(4,6)-[GSTA].
   P56577_disorder_1_11  PTATAK      5   10          6   PHOSPHOPANTETHEINE;    PS00012                                 Phosphopantetheine attachment site. [STAGCPQLIVMF]-[LIVMATN]-[DENQGTAKRHLM]-[LIVMWSTA]-[LIVGSTACR]-{LPIY}-
   P56577_disorder_1_11  MPGDPT      1    6          6     CARBOXYPEPT_ZN_1;    PS00132            Zinc carboxypeptidases, zinc-binding region 1 signature.                                                       x(5)-[LIVMFYTA].
   P56577_disorder_1_11 

Now applying a “motif-like only” filter and map to UniProt coordinates

In [33]:
# Start from your 'hits_df' (all PROSITE hits) and your disordered fragment info
import pandas as pd

# Recreate fragment → (accession, fragment_start_uniprot)
frag_info = {
    "P56577_disorder_1_11": ("P56577", 1),
    "Q7G959_disorder_544_553": ("Q7G959", 544),
}

hits_df = pd.DataFrame(hits)
hits_df["match_len"] = hits_df["match"].str.len()

# Keep motif-like descriptions (simple keyword filter)
keywords = [
    "phosph", "myrist", "glycosyl", "amidation", "prenyl", "sumo",
    "acetyl", "methyl", "ubiquitin", "nuclear localization", "peroxisomal",
    "signal peptide", "targeting", "cleavage", "kinase"
]

def is_motif_like(desc):
    d = str(desc).lower()
    return any(k in d for k in keywords)

motif_like = hits_df[hits_df["description"].apply(is_motif_like)].copy()

# Also require match length >= 3 (since real PTM motifs can be short)
motif_like = motif_like[motif_like["match_len"] >= 3].copy()

# Map to UniProt coordinates
def map_uniprot(row):
    acc, base = frag_info[row["fragment"]]
    u_start = base + row["start"] - 1
    u_end = base + row["end"] - 1
    return pd.Series({"accession": acc, "uniprot_start": u_start, "uniprot_end": u_end})

motif_like = pd.concat([motif_like, motif_like.apply(map_uniprot, axis=1)], axis=1)

cols = ["accession","fragment","match","match_len","uniprot_start","uniprot_end",
        "prosite_id","prosite_ac","description","pattern"]
motif_like = motif_like[cols].drop_duplicates().sort_values(["accession","uniprot_start","match_len"], ascending=[True, True, False])

print("Motif-like PROSITE hits in disordered regions:", len(motif_like))
print(motif_like.to_string(index=False))


Motif-like PROSITE hits in disordered regions: 17
accession                fragment  match  match_len  uniprot_start  uniprot_end             prosite_id prosite_ac                                              description                                                                pattern
   P56577    P56577_disorder_1_11    MPG          3              1            3    GLUTAMATE_5_KINASE;    PS00902                            Glutamate 5-kinase signature.                                                        x(2)-[GALVCMI].
   P56577    P56577_disorder_1_11  GDPTA          5              3            7      TRANSGLYCOSYLASE;    PS00922                 Prokaryotic transglycosylases signature.                                                            x(4)-[SAG].
   P56577    P56577_disorder_1_11   GDPT          4              3            6                TRMA_1;    PS01230           RNA methyltransferase trmA family signature 1.                                           [NAMDSYHKGQ

To validate we run a ELM scan for our two disordered segments

In [35]:
!pip -q install gget

import gget

# Download ELM database locally (only needs to be done once)
gget.setup("elm")

print("ELM database setup complete.")


INFO:gget.utils:ELM data can be downloaded & distributed for non-commercial use according to the following license: http://elm.eu.org/media/Elm_academic_license.pdf
INFO:gget.utils:Downloading ELM database files (requires curl to be installed)...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 88298  100 88298    0     0  74444      0  0:00:01  0:00:01 --:--:-- 74450
  % Total    % Received % Xferd  Average Spe

✅ ELM database setup complete.


In [37]:
from gget import elm
import pandas as pd

# Your disordered fragments (ungapped UniProt coords)
fragments = {
    "P56577_disorder_1_11": "MPGDPTATAKG",
    "Q7G959_disorder_544_553": "TKEQSDYKRH",
}

all_hits = []

for frag_id, seq in fragments.items():
    # returns two dataframes; we want the regex-based motif matches
    regex_df, diamond_df = elm(seq, json=False)
    if regex_df is None or len(regex_df) == 0:
        print(f"{frag_id}: no ELM regex hits")
        continue
    regex_df["fragment"] = frag_id
    all_hits.append(regex_df)

if all_hits:
    elm_hits = pd.concat(all_hits, ignore_index=True)
    print("Total ELM regex hits:", len(elm_hits))
    display(elm_hits.head(30))
else:
    print("Total ELM regex hits: 0")


INFO:gget.utils:ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2026-02-18 16:15:53.190570
INFO:gget.utils:ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2026-02-19 12:40:08.619824
INFO:gget.utils:ORTHO Compiling ortholog information...
INFO:gget.utils:ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
INFO:gget.utils:Creating DIAMOND database and initiating alignment...
INFO:gget.utils:DIAMOND alignment complete.
INFO:gget.utils:REGEX Finding regex motif matches...
INFO:gget.utils:ELM_Classes_Download_Version: 1.4, ELM_Classes_Download_Date: 2026-02-18 16:15:53.190570
INFO:gget.utils:ELM_Instance_Download_Version: 1.4, ELM_Instance_Download_Date: 2026-02-19 12:40:08.619824
INFO:gget.utils:ORTHO Compiling ortholog information...
INFO:gget.utils:ORTHO Performing pairwise sequence alignment against ELM database using DIAMOND for 1 sequence(s)...
INFO:gget.utils:Creating DIAMOND database and initiating alignm

P56577_disorder_1_11: no ELM regex hits


INFO:gget.utils:DIAMOND alignment complete.
INFO:gget.utils:REGEX Finding regex motif matches...


Q7G959_disorder_544_553: no ELM regex hits
Total ELM regex hits: 0
