In [5]:
import requests
from bs4 import BeautifulSoup

def extract_curator_notes(gene_id, timeout=10.0):
    """
    Scrape the text of the element that follows the "Curator" header on a
    dictyBase gene page.

    Parameters
    ----------
    gene_id : str
        Gene identifier inserted into the page URL, e.g. "DDB_G0283907".
    timeout : float, optional
        Seconds to wait for the server before requests gives up
        (default 10). Without it a stalled connection blocks forever.

    Returns
    -------
    str or None
        Whitespace-stripped text of the element after the header, or None
        when the fetched HTML has no matching header or nothing follows it.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via raise_for_status).
    """
    url = f"http://dictybase.org/gene/{gene_id}"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")

    def mentions_curator(tag):
        return tag.name in ("h2", "h3", "div") and "Curator" in tag.get_text()

    # Look for the Curator Notes header.
    # A wrapper <div> enclosing the whole section also contains the word and
    # comes first in document order, so keep only a candidate that has no
    # matching descendant (i.e. the innermost one).
    header = next(
        (
            tag
            for tag in soup.find_all(mentions_curator)
            if tag.find(mentions_curator) is None
        ),
        None,
    )
    if header is None:
        return None

    # Curator notes typically follow immediately after the header
    notes_container = header.find_next()
    if notes_container is None:
        # Header was the last element in the document.
        return None

    return notes_container.get_text(strip=True)

# Try the scraper on a single gene page and print whatever it extracts
# (None means no "Curator" header was found in the fetched HTML).
print(extract_curator_notes("DDB_G0283907"))

None


In [6]:
# Fetch and parse the gene page at top level so the response and the parsed
# tree can be inspected interactively.
url = "http://dictybase.org/gene/DDB_G0283907"
r = requests.get(url, timeout=10)  # bound the wait instead of hanging on a stalled server
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

In [4]:
# Display the Response object to check the HTTP status code.
r

<Response [200]>

In [7]:
# Display the parsed document to see which markup the server actually returned.
# NOTE(review): this dumps the entire page into the cell output; consider
# showing only a slice, e.g. soup.prettify()[:2000].
soup

<!DOCTYPE html>

<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<link href="/gene/javascripts/yui/build/fonts/fonts-min.css?1296150189" media="screen" rel="stylesheet" type="text/css"/>
<link href="/gene/javascripts/yui/build/assets/skins/sam/skin.css?1296150188" media="screen" rel="stylesheet" type="text/css"/>
<link href="/gene/javascripts/yui/plugins/accordion/assets/accordion.css?1296150189" media="screen" rel="stylesheet" type="text/css"/>
<link href="/gene/javascripts/yui/build/container/assets/skins/sam/container.css?1296150188" media="screen" rel="stylesheet" type="text/css"/>
<link href="/gene/javascripts/yui/build/menu/assets/skins/sam/menu.css?1296150189" media="screen" rel="stylesheet" type="text/css"/>
<link href="/gene/javascripts/yui/build/button/assets/skins/sam/button.css?1296150188" media="screen" rel="stylesheet" type="text/css"/>
<link href="/gene/stylesheets/header.css?1296150189" media="screen" rel="stylesheet" type="text/css"/>
<link 

In [9]:
import requests
from bs4 import BeautifulSoup

# Root URL of the dictyBase site; endpoint paths are appended to it.
BASE = "http://dictybase.org"

def get_curator_notes_html(gene_id: str, timeout: float = 10.0) -> str | None:
    """
    Return Curator Notes as an HTML-ish string (with <i>, <br>, etc.),
    or None if no notes found.

    The notes are read from the gene's ``summary.json`` endpoint and
    assembled by concatenating the ``text`` / ``caption`` values of the
    tokens in the first content block.

    Parameters
    ----------
    gene_id : str
        Gene identifier inserted into the endpoint URL, e.g. "DDB_G0283907".
    timeout : float, optional
        Seconds to wait for the server before requests gives up (default 10).

    Returns
    -------
    str or None
        The concatenated, stripped fragments; None when the HTTP status is
        not 200, the body is not JSON, the JSON layout differs from the
        expected one, or the result is empty. A short diagnostic is printed
        for the first three cases.

    Notes
    -----
    Network-level errors raised by ``requests.get`` (e.g. timeouts) are not
    caught here and propagate to the caller.
    """
    url = f"{BASE}/gene/{gene_id}/gene/summary.json"
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print(f"{gene_id}: HTTP {r.status_code}")
        return None

    try:
        data = r.json()
    except ValueError as e:
        # A 200 response with a non-JSON body (e.g. an HTML error page).
        print(f"{gene_id}: response is not valid JSON ({e})")
        return None

    try:
        # First column in summary.json = Curator Notes block
        col0 = data[0]["items"][0]                       # first column
        col_items = col0["content"][0]["items"]          # title + content rows
        content_row = col_items[1]                       # index 0 = title "Curator Notes"
        tokens = content_row["content"][0]["items"]      # list of text/caption tokens
        if not isinstance(tokens, list):
            raise TypeError("token list is missing")
    except (KeyError, IndexError, TypeError) as e:
        print(f"{gene_id}: structure not as expected ({e})")
        return None

    fragments = []
    for token in tokens:
        if not isinstance(token, dict):
            continue
        # "text" takes precedence over "caption". For caption tokens only the
        # caption is kept; to keep the link as well, append
        # f'{token["caption"]} ({BASE}{token["url"]})' instead.
        value = token["text"] if "text" in token else token.get("caption")
        if isinstance(value, str):
            # Skip null / non-string values that would make join() fail.
            fragments.append(value)

    html = "".join(fragments).strip()
    return html or None

def get_curator_notes_plain(gene_id: str) -> str | None:
    """
    Plain-text variant of the curator notes.

    Fetches the HTML-ish string via get_curator_notes_html and removes the
    markup. None is passed through when no notes are available.
    """
    markup = get_curator_notes_html(gene_id)
    if markup is not None:
        # A single-space separator keeps neighbouring fragments (such as
        # citations) from fusing together once the tags are gone.
        return BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
    return None


# Example
gid = "DDB_G0283907"
# Both helpers can return None (HTTP error, unexpected JSON, empty notes);
# fall back to a placeholder so the preview slices below cannot fail.
html_preview = get_curator_notes_html(gid) or "(no curator notes)"
plain_preview = get_curator_notes_plain(gid) or "(no curator notes)"
print("HTML version:\n", html_preview[:300], "...\n")
print("Plain text:\n", plain_preview[:300], "...")


HTML version:
 <i>pkaC</i> encodes the catalytic subunit of the cAMP dependent protein kinase PKA  (Mann <i>et al.</i> 1992 , Anjard <i>et al.</i> 1993). In the absence of cAMP, PKA (pkaC, pkaR)  exists as an inactive complex of the catalytic and regulatory subunits. cAMP binds cooperatively to two sites on PkaR w ...

Plain text:
 pkaC encodes the catalytic subunit of the cAMP dependent protein kinase PKA  (Mann et al. 1992 , Anjard et al. 1993). In the absence of cAMP, PKA (pkaC, pkaR)  exists as an inactive complex of the catalytic and regulatory subunits. cAMP binds cooperatively to two sites on PkaR which leads to the rel ...
