### Programming for Biomedical Informatics
#### Week 3 - Data Integration & Summary Analysis

This week we're first going to practice using a range of the eUtilities end-points to search, fetch, and link data.

In [1]:
# Preliminaries
from Bio import Entrez
import urllib.request
import json
import xml.etree.ElementTree as ET

# load my API key from the file
with open('../api_keys/ncbi.txt', 'r') as file:
    api_key = file.read().strip()

with open('../api_keys/ncbi_email.txt', 'r') as file:
    email = file.read().strip()

Entrez.api_key = api_key
Entrez.email = email

FileNotFoundError: [Errno 2] No such file or directory: '../api_keys/ncbi.txt'

In [None]:
# Step One - Example Using eInfo

# let's use biopython Entrez module to do an eInfo query
# this will tell us what databases are available
handle = Entrez.einfo()

# read the handle
record = Entrez.read(handle)

# print the record in an easy to read format
# print(json.dumps(record, indent=4))

# print out some useful information about each of these databases
# NB this is incredibly useful but long so best to do it for a particular database
for db in record['DbList']:
    print(f"Database: {db}")
    # get the database info
    db_info = Entrez.read(Entrez.einfo(db=db))
    # print the database info
    print(json.dumps(db_info, indent=4))

# lets do this just for the gene database
gene_info = Entrez.read(Entrez.einfo(db='gene'))

# this is a nice way to print out nested XML structures in a readable way
print(json.dumps(gene_info, indent=4))

In [None]:
# Step Two - Example using eSearch to search the gene database for a particular gene
# search for the gene 'BRCA1'
handle = Entrez.esearch(db='gene', term='BRCA1', retmode='xml')

#retreive the record
record = Entrez.read(handle)

# find the count of records
count = record['Count']
print(f"Number of records found: {count}")

# why are there so many?

In [None]:
# print the record in an easy to read format
print(json.dumps(record, indent=4))

# NB the [All Fields] search is a broad search that will return many results

In [None]:
# Step Three - Example using eSummary to get some metadata about the first 5 of these records
# this time we're going to do this using urllib.requests to show how tou can do this independenly of biopython
# we will request xml and parse that using ElementTree

# get the first 5 ids (to then brute force a query for each of these)
ids = record['IdList'][:5]

eUtils_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
eSummary = "esummary.fcgi"

# don't forget to add the API key and email
#for each id in the list pull the summary
for id in ids:
    url = f"{eUtils_base}{eSummary}?db=gene&id={id}&api_key={api_key}&email={email}"
    with urllib.request.urlopen(url) as response:
        xml = response.read()
        root = ET.fromstring(xml)
        # find the <Organsim> tag
        organism = root.find('DocumentSummarySet/DocumentSummary/Organism/').text
        name = root.find('DocumentSummarySet/DocumentSummary/').text
        print(id,name,organism)

# OK what's going on here? None of them say "BRCA1"! Default search is [All Fields], so any field with 
# a reference to BRCA1 will be returned. This is why we get so many results.

In [None]:
# Step Four - Lets look again but add the [Gene] field to the search
handle = Entrez.esearch(db='gene', term='BRCA1[Gene]', retmode='xml')

#retreive the record
record = Entrez.read(handle)

# find the count of records
count = record['Count']
print(f"Number of records found: {count}")

# print the record in an easy to read format
print(json.dumps(record, indent=4))

In [None]:
# get the first 5 ids
ids = record['IdList'][:5]

eUtils_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
eSummary = "esummary.fcgi"

# don't forget to add the API key and email
#for each id in the list pull the summary
for id in ids:
    url = f"{eUtils_base}{eSummary}?db=gene&id={id}&api_key={api_key}&email={email}"
    with urllib.request.urlopen(url) as response:
        xml = response.read()
        root = ET.fromstring(xml)
        # find the <Organsim> tag
        organism = root.find('DocumentSummarySet/DocumentSummary/Organism/').text
        name = root.find('DocumentSummarySet/DocumentSummary/').text
        print(id,name,organism)

# Ah that's better, now we've got BRCA1 but from lots of different organisms

In [None]:
# Step Five - Finally, let's get this right (narrow down to human)

# search for the gene 'BRCA1' in the human genome using eSearch
handle = Entrez.esearch(db='gene', term='BRCA1[Gene], human[Organism]', retmode='xml')

#retreive the record
record = Entrez.read(handle)

# find the count of records
count = record['Count']
print(f"Number of records found: {count}")

# print the record in an easy to read format
print(json.dumps(record, indent=4))

# get the only id
id = record['IdList'][0]

In [None]:
# now use eSummary to get the metadata for this gene
url = f"{eUtils_base}{eSummary}?db=gene&id={id}&api_key={api_key}&email={email}"
response = urllib.request.urlopen(url)
root = ET.fromstring(response.read())
organism = root.find('DocumentSummarySet/DocumentSummary/Organism/').text
name = root.find('DocumentSummarySet/DocumentSummary/').text
print(id,name,organism)

In [None]:
# Step Six - Use eFetch to get the full record

# lets modify the code above to use eFetch to get the full record
eFetch = "efetch.fcgi"

url = f"{eUtils_base}{eFetch}?db=gene&id={id}&api_key={api_key}&email={email}"
response = urllib.request.urlopen(url)
xml = response.read()

# print the xml
print(xml.decode('utf-8'))

In [None]:
# Step Seven - Use eLink to get the associated nucleotide sequence

# lets go back to using the Entrez module to get the sequence
# get the link specifically for the refseq gene nucleotide sequence
links = Entrez.read(Entrez.elink(dbfrom='gene', id=id, linkname='gene_nuccore_refseqgene'))

# get the id of the nucleotide sequence
nuccore_id = links[0]['LinkSetDb'][0]['Link'][0]['Id']

# get the sequence
handle = Entrez.efetch(db='nuccore', id=nuccore_id, rettype='fasta', retmode='text')
sequence = handle.read()

# print the sequence
print(sequence)