In [1]:
import os
from io import StringIO
from pathlib import Path
from Bio import Entrez, SeqIO
from Bio import Blast
from Bio.Blast import NCBIWWW, NCBIXML

In [20]:
EMAIL = "himanshu.joshi@sydney.edu.au"          # <-- put a real address (NCBI requirement)
API_KEY = 'b787fe7fdc45dcbdbd6a29b3b495cf033c09' # os.getenv("NCBI_API_KEY")       # optional but recommended
NUCCORE_ID = "NW_019805496.1"
GENE_SYMBOL = "MTCH2"
ENTREZ_DB = "nuccore"
BLAST_DB = "Genome (GRCh38.p14 reference assembly RS_2024_08)"               # then restricted to GRCh38 with entrez_query
ENTREZ_QUERY = '"Homo sapiens"[Organism] AND "GRCh38.p14"[Assembly]'  #  [oai_citation:0‡biostars.org](https://www.biostars.org/p/129932/?utm_source=chatgpt.com)
PERCENT_LOW, PERCENT_HIGH = 99.0, 100.0

In [5]:
Entrez.email = EMAIL
if API_KEY:
    Entrez.api_key = API_KEY

In [6]:
def fetch_genbank_record(acc):
    """Download GenBank record as a SeqRecord object."""
    with Entrez.efetch(db=ENTREZ_DB, id=acc, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    return record  #  [oai_citation:1‡biopython.org](https://biopython.org/docs/1.75/api/Bio.Entrez.html?utm_source=chatgpt.com)


def find_gene_location(record, gene_symbol):
    """Return (start, end, strand) for the requested gene feature."""
    for feat in record.features:
        if feat.type == "gene" and "gene" in feat.qualifiers:
            if gene_symbol in feat.qualifiers["gene"]:
                loc = feat.location
                return int(loc.start) + 1, int(loc.end), loc.strand  # NCBI is 1-based
    raise ValueError(f"Gene {gene_symbol} not found in {record.id}")  #  [oai_citation:2‡biopython.org](https://biopython.org/docs/1.76/api/Bio.SeqFeature.html?utm_source=chatgpt.com)


def fetch_fasta_slice(acc, start, end, strand):
    """Retrieve a subsequence in FASTA format."""
    with Entrez.efetch(
        db=ENTREZ_DB,
        id=acc,
        strand=1 if strand >= 0 else 2,
        seq_start=start,
        seq_stop=end,
        rettype="fasta",
        retmode="text",
    ) as handle:
        fasta = handle.read()
    return fasta


In [7]:
record = fetch_genbank_record(NUCCORE_ID)
start, end, strand = find_gene_location(record, GENE_SYMBOL)
fasta = fetch_fasta_slice(NUCCORE_ID, start, end, strand)
print(f"> GCNT2 coordinates on {NUCCORE_ID}: {start}-{end} (strand {strand})")

> GCNT2 coordinates on NW_019805496.1: 32135-74925 (strand -1)


In [10]:
import requests
import urllib.parse
import re
import time
import logging

# ——— Configure verbose logging ———
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)

In [27]:
BLAST_URL = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"


def submit_blast_job(fasta: str) -> (str, int, str):
    """
    Submit a BLASTn megablast job and return (RID, RTOE, query_id).
    """
    # Extract query_id from FASTA header
    header = fasta.strip().splitlines()[0]
    if not header.startswith(">"):
        raise ValueError("FASTA must start with '>' header line")
    query_id = header[1:].split()[0]
    logger.debug(f"Parsed query_id = {query_id}")

    # Build payload
    payload = {
        "CMD": "Put",
        "PROGRAM": "blastn",
        "MEGABLAST": "on",
        "DATABASE": "human_genome",
        "QUERY": fasta
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}

    # Submit
    logger.debug("Submitting BLAST job (CMD=Put)...")
    resp = requests.post(BLAST_URL, data=payload, headers=headers)
    resp.raise_for_status()
    text = resp.text
    logger.debug(f"POST response:\n{text}")

    # Parse RID and RTOE
    rid_m = re.search(r"RID = (\S+)", text)
    rtoe_m = re.search(r"RTOE = (\d+)", text)
    if not rid_m or not rtoe_m:
        raise RuntimeError("Failed to parse RID/RTOE from BLAST response")
    rid = rid_m.group(1)
    rtoe = int(rtoe_m.group(1))
    logger.info(f"Received RID={rid}, RTOE={rtoe}s")

    return rid, rtoe, query_id


def poll_blast_job(rid: str, initial_wait: int) -> None:
    """
    Wait initial_wait seconds, then poll every 5s until Status=READY.
    Raises on FAILED or UNKNOWN.
    """
    logger.debug(f"Sleeping for initial RTOE of {initial_wait}s")
    time.sleep(initial_wait)

    while True:
        logger.debug("Polling for completion (CMD=Get FORMAT_OBJECT=SearchInfo)...")
        params = {"CMD": "Get", "FORMAT_OBJECT": "SearchInfo", "RID": rid}
        resp = requests.get(BLAST_URL, params=params)
        resp.raise_for_status()
        info = resp.text
        logger.debug(f"Poll response:\n{info}")

        if re.search(r"Status=WAITING", info):
            logger.debug("Status=WAITING; sleeping 5s")
            time.sleep(5)
            continue
        if re.search(r"Status=FAILED", info):
            raise RuntimeError(f"BLAST search {rid} failed")
        if re.search(r"Status=UNKNOWN", info):
            raise RuntimeError(f"BLAST search {rid} expired")
        if re.search(r"Status=READY", info):
            if re.search(r"ThereAreHits=no", info):
                logger.warning("No hits found")
            else:
                logger.info("Hits found; ready to retrieve")
            return


def fetch_and_parse_results(rid: str) -> list[dict]:
    """
    Retrieve the final TEXT results for RID and parse hits into
    [{'description': ..., 'percent_identity': ...}, ...].
    """
    logger.debug("Retrieving final results (CMD=Get FORMAT_TYPE=Text)...")
    params = {"CMD": "Get", "FORMAT_TYPE": "Text", "RID": rid}
    resp = requests.get(BLAST_URL, params=params)
    resp.raise_for_status()
    text = resp.text
    logger.debug(f"Final BLAST output:\n{text}")

    hits = []
    desc = None
    for line in text.splitlines():
        if line.startswith(">"):
            desc = line[1:].strip()
            logger.debug(f"Found hit: {desc}")
        elif desc and "Identities =" in line:
            m = re.search(r"Identities = [\d/]+ \((\d+%)\)", line)
            if m:
                hits.append({"description": desc, "percent_identity": m.group(1)})
                logger.debug(f"Parsed % identity: {m.group(1)}")
            desc = None

    logger.info(f"Total hits parsed: {len(hits)}")
    return hits


def run_blast(fasta: str):
    rid, rtoe, qid = submit_blast_job(fasta)
    poll_blast_job(rid, rtoe)
    hits = fetch_and_parse_results(rid)
    return rid, qid, hits

In [28]:
rid, qid, hits = run_blast(fasta)

2025-07-08 02:28:17,191 DEBUG Parsed query_id = NW_019805496.1:c74925-32135
2025-07-08 02:28:17,194 DEBUG Submitting BLAST job (CMD=Put)...
2025-07-08 02:28:17,197 DEBUG Starting new HTTPS connection (1): blast.ncbi.nlm.nih.gov:443
2025-07-08 02:28:19,362 DEBUG https://blast.ncbi.nlm.nih.gov:443 "POST /blast/Blast.cgi HTTP/1.1" 200 None
2025-07-08 02:28:19,957 DEBUG POST response:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="jig" content="ncbitoggler ncbiautocomplete"/>
<meta name="ncbi_app" content="static" />
<meta name="ncbi_pdid" content="blastformatreq" />
<meta name="ncbi_stat" content="false" />
<meta name="ncbi_sessionid" content="2196A6B186C82421_0000SID" />
<meta name="ncbi_phid" content="2196A6B186C824210000000000000001" />
<title>NCBI Blast</ti