In [57]:
import requests
import time
import random
from bs4 import BeautifulSoup

In [14]:
# Root URL of dictyBase; request URLs are built by appending a path to it.
BASE = "http://dictybase.org"

# get curator notes

In [47]:
# One shared Session for every request (good for large numbers of queries);
# each request it sends carries a custom User-Agent header.
session = requests.Session()
session.headers["User-Agent"] = "dictybase-curator-notes/0.1"


In [48]:
def get_curator_notes_html(gene_id: str, timeout: float = 15.0) -> str | None:
    """
    Return curator notes as HTML-ish string (with <i>, <br>, etc.),
    or None if 404 / no notes.

    Parameters
    ----------
    gene_id : gene identifier inserted into the summary URL.
    timeout : timeout in seconds passed to the HTTP request.

    Returns
    -------
    The concatenated "text" / "caption" fragments of the notes row, stripped,
    or None when the server answers 404, the JSON does not have the expected
    nesting, or the resulting string is empty.

    Raises
    ------
    requests.HTTPError for any non-404 error status (via raise_for_status);
    other requests exceptions raised by the GET itself propagate unchanged.
    """
    url = f"{BASE}/gene/{gene_id}/gene/summary.json"
    r = session.get(url, timeout=timeout)

    if r.status_code == 404:
        return None
    r.raise_for_status()

    data = r.json()

    # The notes sit at a fixed position in the nested layout; any deviation
    # from that shape is treated as "no notes".
    try:
        col0 = data[0]["items"][0]
        col_items = col0["content"][0]["items"]
        content_row = col_items[1]                  # after "Curator Notes" title
        tokens = content_row["content"][0]["items"]
    except (KeyError, IndexError, TypeError):
        return None

    # A non-list here (None, dict, string, ...) would otherwise either crash
    # the loop below or be iterated in a meaningless way.
    if not isinstance(tokens, list):
        return None

    fragments = []
    for t in tokens:
        if not isinstance(t, dict):
            continue
        # "text" takes precedence over "caption", as before.
        if "text" in t:
            value = t["text"]
        elif "caption" in t:
            value = t["caption"]
        else:
            continue
        # Skip null / non-string values instead of letting "".join() raise.
        if isinstance(value, str):
            fragments.append(value)

    html = "".join(fragments).strip()
    return html or None


def get_curator_notes_plain(gene_id: str, timeout: float = 15.0) -> str | None:
    """
    Curator notes with the markup removed.

    Fetches the HTML-ish notes for `gene_id` and returns only their text,
    with text pieces joined by single spaces; returns None when no notes
    are available.
    """
    markup = get_curator_notes_html(gene_id, timeout=timeout)
    if markup is not None:
        return BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
    return None

In [49]:
# Example: a gene that has curator notes
gid = "DDB_G0283907"
note_html = get_curator_notes_html(gid)
note_plain = get_curator_notes_plain(gid)

# Both helpers return None on 404 / missing notes, so guard before slicing
# (slicing None would raise TypeError).
if note_html is None or note_plain is None:
    print(f"{gid}: no curator notes (404 or empty)")
else:
    print("HTML version:\n", note_html[:300], "...\n")
    print("Plain text:\n", note_plain[:300], "...")

HTML version:
 <i>pkaC</i> encodes the catalytic subunit of the cAMP dependent protein kinase PKA  (Mann <i>et al.</i> 1992 , Anjard <i>et al.</i> 1993). In the absence of cAMP, PKA (pkaC, pkaR)  exists as an inactive complex of the catalytic and regulatory subunits. cAMP binds cooperatively to two sites on PkaR w ...

Plain text:
 pkaC encodes the catalytic subunit of the cAMP dependent protein kinase PKA  (Mann et al. 1992 , Anjard et al. 1993). In the absence of cAMP, PKA (pkaC, pkaR)  exists as an inactive complex of the catalytic and regulatory subunits. cAMP binds cooperatively to two sites on PkaR which leads to the rel ...


In [51]:
# Example: a gene ID for which no curator notes come back
gid = "DDB_G3946984"
note = get_curator_notes_html(gid)

if note is not None:
    print("HTML version:\n", note[:300], "...\n")
else:
    print(f"{gid}: no curator notes (404 or empty)")

DDB_G3946984: no curator notes (404 or empty)


# get full protein list

In [2]:
import polars as pl

In [34]:
# Reviews and their associated genes (file is updated monthly).
# Tab-separated, read without a header row.
df_review = pl.read_csv(
    "dictybase_files/Reviews.txt",
    separator="\t",
    has_header=False,
)
df_review.head()

column_1,column_2,column_3
str,i64,str
"""DDB_G0267374""",26183444,"""Chemotaxis/Motility, Reviews"""
"""DDB_G0267376""",26284972,"""Signal Transduction, Reviews, …"
"""DDB_G0267376""",26013485,"""Signal Transduction, Reviews, …"
"""DDB_G0267376""",18779059,"""Signal Transduction, Protein F…"
"""DDB_G0267376""",27318097,"""Signal Transduction, Reviews, …"


In [8]:
# Distinct values of the first column (gene IDs) of the reviews table
genes_review = df_review.select(pl.col(df_review.columns[0])).unique()
genes_review.height

1057

In [35]:
# Curation status per DDB_G gene ID (file is updated monthly).
# Tab-separated, read without a header row; truncate_ragged_lines=True lets
# polars cut over-long rows down to the schema instead of raising.
df_status = pl.read_csv(
    "dictybase_files/DDB_G-curation_status.txt",
    separator="\t",
    has_header=False,
    truncate_ragged_lines=True,
)
df_status.head()

column_1,column_2
str,str
"""DDB_G0267212""","""Basic annotations have been ad…"
"""DDB_G0267280""","""Basic annotations have been ad…"
"""DDB_G0267304""","""Basic annotations have been ad…"
"""DDB_G0267338""","""Basic annotations have been ad…"
"""DDB_G0267356""","""Basic annotations have been ad…"


In [11]:
# Distinct values of the first column (gene IDs) of the curation-status table
genes_status = df_status.select(pl.col(df_status.columns[0])).unique()
genes_status.height

18058

In [43]:
# Distinct gene IDs whose status text (second column) is neither null nor ""
status_id_col, status_text_col = df_status.columns[:2]
has_status_text = (
    pl.col(status_text_col).is_not_null()
    & (pl.col(status_text_col) != "")
)
genes_status_nonempty = (
    df_status.filter(has_status_text).select(pl.col(status_id_col)).unique()
)
genes_status_nonempty.height

13339

In [36]:
# dictyBase ID, gene names, synonyms, and gene products (file is updated monthly).
# Tab-separated, first row is the header.
df_gene = pl.read_csv(
    "dictybase_files/gene_information.txt",
    separator="\t",
    has_header=True,
)
df_gene.head()

GENE ID,Gene Name,Synonyms,Gene products
str,str,str,str
"""DDB_G0267364""","""DDB_G0267364_RTE""",,"""Skipper GAG-PRO"""
"""DDB_G0267372""","""DDB_G0267372_RTE""",,"""TRE5-A ORF1"""
"""DDB_G0267380""","""argE""","""P52D""","""acetylornithine deacetylase"""
"""DDB_G0267304""","""DDB_G0267304_RTE""",,"""DIRS1 ORF3 fragment"""
"""DDB_G0267338""","""DDB_G0267338_RTE""",,"""DIRS1 ORF3"""


In [24]:
# Distinct values of the first column (gene IDs) of the gene-information table
genes_gene = df_gene.select(pl.col(df_gene.columns[0])).unique()
genes_gene.height

14222

In [37]:
# DDB / DDB_G / UniProt ID mapping (file is updated monthly).
# Tab-separated, first row is the header.
df_mapping = pl.read_csv(
    "dictybase_files/DDB-GeneID-UniProt.txt",
    separator="\t",
    has_header=True,
)
df_mapping.head()

DDB ID,DDB_G ID,Name,UniProt ID
str,str,str,str
"""DDB0250764""","""DDB_G0267212""","""DDB_G0267212_RTE""","""Q55H52"""
"""DDB0216487""","""DDB_G0267280""","""DDB_G0267280_TE""","""Q55H16"""
"""DDB0216498""","""DDB_G0267304""","""DDB_G0267304_RTE""","""Q55H05"""
"""DDB0202373""","""DDB_G0267338""","""DDB_G0267338_RTE""","""Q55GY9"""
"""DDB0216520""","""DDB_G0267356""","""DDB_G0267356_RTE""","""Q55GX9"""


In [27]:
# Distinct values of the second column (DDB_G gene IDs) of the mapping table
genes_mapping = df_mapping.select(pl.col(df_mapping.columns[1])).unique()
genes_mapping.height

14192

In [38]:
# Row counts of the four tables, printed on one line.
# NOTE(review): the first value is the total row count of df_review, whereas
# the other three are counts of unique gene IDs - confirm this is intended.
table_heights = (
    df_review.height,
    genes_status.height,
    genes_gene.height,
    genes_mapping.height,
)
print(*table_heights)


3542 18058 14222 14192


In [31]:
# Pool the gene-ID columns of the three tables into one Series, then count
# how many distinct IDs the pool contains.
id_columns = [
    frame.to_series(0)
    for frame in (genes_status, genes_gene, genes_mapping)
]
s = pl.concat(id_columns)

s.n_unique()

18058

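The DDB_G curation status file contains all gene IDs: the union of the three ID sets has 18058 unique IDs, the same number as the curation status file alone.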

# Loop to get all curator notes

In [52]:
# Preview the first rows of the de-duplicated gene-ID frame
genes_status.head()

column_1
str
"""DDB_G3973843"""
"""DDB_G0290933"""
"""DDB_G0288271"""
"""DDB_G0276409"""
"""DDB_G0276769"""


In [54]:
def polite_sleep(base=0.2, jitter=0.10):
    """Pause for `base` seconds plus a random extra in [0, `jitter`) seconds."""
    delay = base + random.random() * jitter
    time.sleep(delay)

In [60]:
# Fetch curator notes for a small sample of genes; one record per gene,
# with None for both note fields when the request fails or nothing is found.
rows = []
sample_ids = genes_status.head(20).to_series()  #<-----------------remove head for extract all
for gid in sample_ids:
    notes_html = notes_plain = None
    try:
        notes_html = get_curator_notes_html(gid)
        if notes_html:
            soup = BeautifulSoup(notes_html, "html.parser")
            notes_plain = soup.get_text(" ", strip=True)
    except requests.RequestException as e:
        # Request-level failures are reported and the gene is kept with empty notes.
        print(f"{gid}: failed ({e})")

    record = {
        "gene_id": gid,
        "curator_notes_html": notes_html,
        "curator_notes_plain": notes_plain,
    }
    rows.append(record)

    # Pause between requests.
    polite_sleep()

DDB_G3973843: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3973843/gene/summary.json)
DDB_G3969885: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3969885/gene/summary.json)
DDB_G3971383: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3971383/gene/summary.json)
DDB_G3973327: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3973327/gene/summary.json)


In [61]:
# Show the records collected by the loop (one dict per gene ID)
rows

[{'gene_id': 'DDB_G3973843',
  'curator_notes_html': None,
  'curator_notes_plain': None},
 {'gene_id': 'DDB_G0290933',
  'curator_notes_html': 'Gene has been comprehensively annotated, 02-MAY-2011 PF',
  'curator_notes_plain': 'Gene has been comprehensively annotated, 02-MAY-2011 PF'},
 {'gene_id': 'DDB_G0288271',
  'curator_notes_html': 'Basic annotations have been added to this gene 4-MAY-2009 PF',
  'curator_notes_plain': 'Basic annotations have been added to this gene 4-MAY-2009 PF'},
 {'gene_id': 'DDB_G0276409',
  'curator_notes_html': 'Gene has been comprehensively annotated, 24-MAR-2011 RD',
  'curator_notes_plain': 'Gene has been comprehensively annotated, 24-MAR-2011 RD'},
 {'gene_id': 'DDB_G0276769',
  'curator_notes_html': 'A curated model has been added, 30-APR-2010 PG',
  'curator_notes_plain': 'A curated model has been added, 30-APR-2010 PG'},
 {'gene_id': 'DDB_G0289621',
  'curator_notes_html': 'A curated model has been added, 27-SEP-2010 RD',
  'curator_notes_plain': '

In [67]:
#Optional (but recommended): incremental save / resume
from pathlib import Path

OUT = Path("curator_notes.parquet")

if OUT.exists():
    df_done = pl.read_parquet(OUT)
    done_ids = set(df_done["gene_id"].to_list())
    print(f"Resuming: {len(done_ids)} genes already processed")
else:
    df_done = None
    done_ids = set()
    print("Starting fresh")

def polite_sleep(base=0.15, jitter=0.10):
    time.sleep(base + random.random() * jitter)

rows_buffer = []
BATCH_SIZE = 200   # write every 200 genes

for gid in genes_status.head(100).select("column_1").to_series():   #<-----------------remove head for extract all
    if gid in done_ids:
        continue   # ← THIS enables resume

    try:
        html = get_curator_notes_html(gid)
        plain = (
            BeautifulSoup(html, "html.parser").get_text(" ", strip=True)
            if html else None
        )
    except Exception as e:
        print(f"{gid}: failed ({e})")
        html = None
        plain = None

    rows_buffer.append({
        "gene_id": gid,
        "curator_notes_html": html,
        "curator_notes_plain": plain,
    })

    # periodically flush to disk
    if len(rows_buffer) >= BATCH_SIZE:
        df_batch = pl.DataFrame(rows_buffer)

        if OUT.exists():
            df_batch.write_parquet(OUT, append=True)
        else:
            df_batch.write_parquet(OUT)

        rows_buffer.clear()

    polite_sleep()
    
if rows_buffer:
    df_batch = pl.DataFrame(rows_buffer)
    if OUT.exists():
        df_batch.write_parquet(OUT, append=True)
    else:
        df_batch.write_parquet(OUT)

Starting fresh
DDB_G3973843: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3973843/gene/summary.json)
DDB_G3969885: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3969885/gene/summary.json)
DDB_G3971383: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3971383/gene/summary.json)
DDB_G3973327: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3973327/gene/summary.json)
DDB_G3971993: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3971993/gene/summary.json)
DDB_G0290665: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G0290665/gene/summary.json)
DDB_G3972953: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G3972953/gene/summary.json)
DDB_G0279847: failed (500 Server Error:  for url: http://dictybase.org/gene/DDB_G0279847/gene/summary.json)


In [69]:
# Load the saved curator notes back from the parquet file and display them
df = pl.read_parquet("curator_notes.parquet")
df

gene_id,curator_notes_html,curator_notes_plain
str,str,str
"""DDB_G3973843""",,
"""DDB_G0290933""","""Gene has been comprehensively …","""Gene has been comprehensively …"
"""DDB_G0288271""","""Basic annotations have been ad…","""Basic annotations have been ad…"
"""DDB_G0276409""","""Gene has been comprehensively …","""Gene has been comprehensively …"
"""DDB_G0276769""","""A curated model has been added…","""A curated model has been added…"
…,…,…
"""DDB_G0273081""","""Gene has been comprehensively …","""Gene has been comprehensively …"
"""DDB_G0288895""","""Basic annotations have been ad…","""Basic annotations have been ad…"
"""DDB_G0295611""","""Basic annotations have been ad…","""Basic annotations have been ad…"
"""DDB_G0292844""","""A curated model has been added…","""A curated model has been added…"
