<h1>Retrieving DONSON information</h1>

In [2]:
# Gene symbol that the NCBI, MyGene and UniProt lookups in this notebook are run for.
GENE_SYM = "DONSON"

# Setup

In [3]:
%pip install biopython --quiet
%pip install pybiomart --quiet
%pip install biomart --quiet
%pip install httpx --quiet
# https://pypi.org/project/ncbi-datasets-pylib/
# https://www.ncbi.nlm.nih.gov/datasets/docs/v1/languages/python/
%pip install ncbi-datasets-pylib --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.3/60.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.5/331.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m2.8 MB/s[0m e

In [4]:
import polars as pl
import httpx
from Bio import Entrez
from Bio import SeqIO
from pybiomart import Server
from biomart import BiomartServer
from google.colab import userdata

import ncbi.datasets.openapi
from ncbi.datasets.openapi.api import gene_api
from ncbi.datasets.openapi.models import V1GeneMatch
from ncbi.datasets.openapi import ApiException as DatasetsApiException

from IPython.display import HTML, display
# NOTE(review): presumably raises the number of characters polars shows per string value to 100 — confirm.
pl.Config.set_fmt_str_lengths(100)


def set_css():
    """Display a <style> element that sets `white-space: pre-wrap` on <pre> blocks."""
    wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
    display(wrap_rule)


# Hook the styling into IPython's 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)

# NCBI
Here we get the RefSeq accession numbers for the gene symbol and retrieve the gene's information from the NCBI database.

In [6]:
# NOTE(review): a personal address is hard-coded here. Presumably this is the contact
# e-mail Biopython passes to NCBI with Entrez requests — confirm, and consider loading
# it from a secret or configuration value instead.
Entrez.email = "dutking@gmail.com"

def get_gene_record_from_entrez(gene_symbol: str, organism: str="Homo sapiens") -> list:
    """
    Fetches the Entrez Gene record of the first search hit for a gene symbol.

    Arguments:
    param gene_symbol: str
    The symbol of the gene (e.g., "BRCA1").

    param organism: str
    The name of the organism (default is "Homo sapiens").

    return: list
    The parsed result of Entrez.efetch for the first gene ID returned by the
    search, or an empty list when the search matches nothing.
    """

    query = f"{gene_symbol}[Gene Name] AND {organism}[Organism]"
    with Entrez.esearch(db="gene", term=query, retmax=5) as search_handle:
        search_results = Entrez.read(search_handle)

    gene_ids = search_results.get("IdList") or []
    if not gene_ids:
        # Nothing matched: return an empty, iterable result instead of None so
        # callers can loop over it without a special case.
        return []

    # Only the first hit is fetched; any further IDs from the search are ignored.
    gene_id = gene_ids[0]
    print(f'GeneID: {gene_id}')
    print('='*10)
    with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as fetch_handle:
        return Entrez.read(fetch_handle)


def get_refseq_accessions(gene_records) -> list[str]:
    """
    Collects the product accession numbers listed in Entrez gene records.

    Arguments:
    gene_records: Bio.Entrez.Parser.ListElement
    List returned by Entrez.efetch. None or an empty list is accepted and
    yields an empty result.

    return: list
    A list of RefSeq accession numbers in the order they appear in the
    records. Duplicates are kept.
    """

    # Walk records -> 'Entrezgene_locus' entries -> 'Gene-commentary_products',
    # skipping any level where the expected key is absent.
    return [
        product['Gene-commentary_accession']
        for gene_record in (gene_records or [])
        if 'Entrezgene_locus' in gene_record
        for commentary in gene_record['Entrezgene_locus']
        if 'Gene-commentary_products' in commentary
        for product in commentary['Gene-commentary_products']
        if 'Gene-commentary_accession' in product
    ]



def get_gene_information(gene_record: V1GeneMatch) -> dict:
    """
    Prints the diagnostics of one NCBI Datasets gene match and returns its gene data.

    Arguments:
    gene_record: V1GeneMatch
    A gene record from the NCBI Datasets API.

    return: dict
    The matched gene converted with to_dict(). None is returned (after a
    message is printed) when the record carries no gene.
    """

    print("query", gene_record.query)

    # Show warnings first, then errors, but only when they are non-empty.
    for attribute in ("warnings", "errors"):
        messages = getattr(gene_record, attribute)
        if messages:
            print(messages)

    if not gene_record.gene:
        print(f"No gene found for {gene_record.query}")
        return None

    return gene_record.gene.to_dict()

In [7]:
# Look up the gene record, then collect its unique RefSeq accessions.
gene_record = get_gene_record_from_entrez(gene_symbol=GENE_SYM, organism="Homo sapiens")
# `or []` covers a lookup that found nothing. Sorting the de-duplicated set keeps
# the printed order, and the list reused by later cells, the same on every run.
gene_accessions = sorted(set(get_refseq_accessions(gene_record or [])))
if gene_accessions:
    print(f"RefSeq Accession numbers for {GENE_SYM}: {', '.join(gene_accessions)}")
else:
    print(f"No RefSeq Accession numbers found for {GENE_SYM}")

GeneID: 29980
RefSeq Accession numbers for DONSON: NM_017613


In [8]:
configuration = ncbi.datasets.openapi.Configuration(
    host = "https://api.ncbi.nlm.nih.gov/datasets/v1"
)

# The API key is read through `userdata.get` rather than being written into the notebook.
configuration.api_key['ApiKeyAuthHeader'] = userdata.get('NCBI_API_SECRET_KEY')

# Start from an empty result so the name is always bound for later cells, even
# when the request fails or the reply contains no usable gene.
ncbi_gene_data = {}

with ncbi.datasets.openapi.ApiClient(configuration) as api_client:
    api_instance = gene_api.GeneApi(api_client)

    try:
        gene_reply = api_instance.gene_metadata_by_accession(gene_accessions)
        for gene in gene_reply.genes or []:
            gene_info = get_gene_information(gene)
            # Keep the last match that actually carries gene data; a match
            # without a gene must not wipe out an earlier good result.
            if gene_info:
                ncbi_gene_data = gene_info
    except DatasetsApiException as e:
        print(f"Exception when calling GeneApi: {e}\n")

query ['NM_017613']
 'message': 'The current accession.version will be returned.',
 'reason': 'The accession.version you requested is no longer current or '
           'otherwise unrecognized.',
 'replaced_id': {'requested': 'NM_017613', 'returned': 'NM_017613.4'}}]


In [9]:
# Print every top-level field of the NCBI gene data, separated by blank lines.
for field_name, field_value in ncbi_gene_data.items():
    print(f"{field_name}: {field_value}", end='\n\n')

gene_id: 29980

symbol: DONSON

description: DNA replication fork stabilization factor DONSON

tax_id: 9606

taxname: Homo sapiens

type: PROTEIN_CODING

orientation: minus

genomic_ranges: [{'accession_version': 'NC_000021.9', 'range': [{'begin': '33577551', 'end': '33588684', 'orientation': 'minus'}]}, {'accession_version': 'NC_060945.1', 'range': [{'begin': '31959392', 'end': '31970473', 'orientation': 'minus'}]}]

transcripts: [{'accession_version': 'NM_017613.4', 'length': 2500, 'genomic_range': {'accession_version': 'NC_000021.9', 'range': [{'begin': '33577551', 'end': '33588684', 'orientation': 'minus'}]}, 'exons': {'accession_version': 'NC_000021.9', 'range': [{'begin': '33588321', 'end': '33588684', 'order': 1}, {'begin': '33587522', 'end': '33587602', 'order': 2}, {'begin': '33585978', 'end': '33586181', 'order': 3}, {'begin': '33584590', 'end': '33584768', 'order': 4}, {'begin': '33583488', 'end': '33583666', 'order': 5}, {'begin': '33582165', 'end': '33582246', 'order': 6},

# MyGene

In [10]:
!pip install mygene --quiet
from mygene import MyGeneInfo

In [11]:
# Query MyGene for GENE_SYM, requesting all fields; later cells read the
# first element of the result as mg_query[0].
mg = MyGeneInfo()
mg_query = mg.querymany(
    qterms=[GENE_SYM],
    scopes="symbol",  # the query term is matched as a gene symbol
    species='human',
    as_dataframe=False,  # later cells index the result as mg_query[0][...]
    df_index=True,  # NOTE(review): presumably only relevant when as_dataframe=True — confirm
    fields='all'
)


INFO:biothings.client:querying 1-1...
INFO:biothings.client:done.
INFO:biothings.client:Finished.


## General data on DONSON

In [12]:
# Show the gene's name followed by its summary text.
for field in ('name', 'summary'):
    print(mg_query[0][field])

DNA replication fork stabilization factor DONSON
This gene lies downstream of the SON gene and spans 10 kb on chromosome 21. The function of this gene is unknown. [provided by RefSeq, Jul 2008].


## GO terms

### Biological processes

In [13]:
bp_terms = mg_query[0]['go']['BP']
# Anything that is not a list is treated as one annotation entry, so a single
# loop can print both shapes.
bp_entries = bp_terms if isinstance(bp_terms, list) else [bp_terms]
for bp_entry in bp_entries:
    print(f"{bp_entry['qualifier'].replace('_', ' ')} {bp_entry['term']}")

involved in DNA damage checkpoint signaling
involved in DNA replication
involved in mitotic G2 DNA damage checkpoint signaling
involved in replication fork processing
involved in nuclear DNA replication
involved in mitotic DNA replication checkpoint signaling


### Cellular components

In [14]:
cc_terms = mg_query[0]['go']['CC']
# Anything that is not a list is treated as one annotation entry, so a single
# loop can print both shapes.
cc_entries = cc_terms if isinstance(cc_terms, list) else [cc_terms]
for cc_entry in cc_entries:
    print(f"{cc_entry['qualifier'].replace('_', ' ')} {cc_entry['term']}")

is active in nucleus
located in nucleus
located in replication fork
part of replisome


### Molecular functions

In [15]:
mf_terms = mg_query[0]['go']['MF']
# Anything that is not a list is treated as one annotation entry, so a single
# loop can print both shapes.
mf_entries = mf_terms if isinstance(mf_terms, list) else [mf_terms]
for mf_entry in mf_entries:
    print(f"{mf_entry['qualifier'].replace('_', ' ')} {mf_entry['term']}")

enables protein binding


## Publications

In [16]:
# Number and print the text of every entry stored under the record's 'generif' key.
pubs = mg_query[0]['generif']
print(f"{len(pubs)} publications on {GENE_SYM} in PubMed:", end="\n\n")
for position, entry in enumerate(pubs, start=1):
    print(f"{position}. {entry['text']}", end='\n\n')

9 publications on DONSON in PubMed:

1. Aberrant splicing and a noncoding mutation in DONSON gene is the cause of microcephaly-micromelia syndrome

2. we present the clinical data of siblings with microcephaly, short stature, and limb abnormalities syndrome (MISSLA) featuring a novel DONSON variant and summarize the current literature on MISSLA.

3. four unrelated families with five affected individuals having biallelic or de novo variants in DONSON presenting with a core phenotype of severe short stature (z score < -3 SD), additional skeletal abnormalities, and microcephaly, were identified.

4. Linked-read genome sequencing identifies biallelic pathogenic variants in DONSON as a novel cause of Meier-Gorlin syndrome.

5. the antitumor miR-101-5p/DONSON axis and its modulated replisome genes might be a novel diagnostic and therapeutic target for clear cell renal cell carcinoma

6. Circ-DONSON promotes malignant progression of glioma through modulating FOXO3.

7. DONSON and FANCM associ

# UniProt

In [17]:
def query_uniprot(gene_symbol: str) -> dict:
    """
    Queries UniProtKB for the reviewed human entries of a gene.

    Arguments:
    gene_symbol: str
    The gene symbol to query.

    return: dict
    The decoded JSON body of the response. The calling code reads the
    matching entries from its 'results' list.

    raises: httpx.HTTPStatusError
    If UniProt answers with a 4xx or 5xx status code.
    """

    url = 'https://rest.uniprot.org/uniprotkb/stream'
    query = {
        'gene': gene_symbol,
        'organism_id': 9606,  # taxonomy ID of Homo sapiens
        'reviewed': 'true'
    }
    # Builds e.g. "gene:DONSON AND organism_id:9606 AND reviewed:true"
    query_string = ' AND '.join(f"{k}:{v}" for k, v in query.items())
    params = {'query': query_string, 'format': 'json'}
    res = httpx.get(url, params=params)
    # Fail here on an HTTP error instead of later, when the caller would hit
    # a missing 'results' key or an undecodable body.
    res.raise_for_status()
    return res.json()

In [18]:
uniprot_data = query_uniprot(GENE_SYM)
uniprot_entries = uniprot_data.get('results') or []
if not uniprot_entries:
    # Without this guard an empty result list raises IndexError below.
    print(f"No reviewed UniProt entry found for {GENE_SYM}")
else:
    # Only the first matching entry is reported; comments without free text
    # (no 'texts' key) are skipped.
    for comment in uniprot_entries[0].get('comments', []):
        if 'texts' in comment:
            print(comment['commentType'])
            for text in comment['texts']:
                print(text['value'], end='\n\n')

FUNCTION
Replisome component that maintains genome stability by protecting stalled or damaged replication forks. After the induction of replication stress, required for the stabilization of stalled replication forks, the efficient activation of the intra-S-phase and G/2M cell-cycle checkpoints and the maintenance of genome stability

SUBUNIT
Component of the replisome complex composed of at least DONSON, MCM2, MCM7, PCNA and TICRR; interaction at least with PCNA occurs during DNA replication

TISSUE SPECIFICITY
Expressed in the brain, with higher levels in prenatal compared to adult brain

DEVELOPMENTAL STAGE
Expressed during embryonic development. At Carnegie stage 22 (about 7.5 weeks gestation), expressed in numerous tissues, including brain, heart, lung, gastrointestinal tract, kidney, hind limb and forelimb digits. Similar expression is observed at 9 weeks of gestation. In the brain of a 9-week old fetus, prominently expressed in the neocortex subventricular zone and in the cortica