# E-utilities = API for Entrez (the NCBI web portal)

E-utilities:
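- E-info: database names and fields
- E-search: search a particular database; returns primary IDs
- E-fetch: download database entries by primary ID
- E-link: find records linked to given IDs, in the same or another database
- E-post: upload a list of primary IDs to the history server for later use
- E-summary: download document summaries by primary ID
- E-GQuery: count the hits for one query across all Entrez databases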

In [76]:
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import os.path

In [77]:
# Contact address Biopython attaches to every Entrez request made below
# (NCBI asks E-utilities users to identify themselves).
Entrez.email = 'michael.chambers2@nih.gov'

In [78]:
# E-info with no arguments: list the name of every Entrez database.
handle = Entrez.einfo()
result = Entrez.read(handle)
handle.close()  # the XML is fully parsed, so release the connection
for db_name in result['DbList']:
    print(db_name)

pubmed
protein
nuccore
ipg
nucleotide
structure
sparcle
genome
annotinfo
assembly
bioproject
biosample
blastdbinfo
books
cdd
clinvar
gap
gapplus
grasp
dbvar
gene
gds
geoprofiles
homologene
medgen
mesh
ncbisearch
nlmcatalog
omim
orgtrack
pmc
popset
probe
proteinclusters
pcassay
biosystems
pccompound
pcsubstance
seqannot
snp
sra
taxonomy
biocollections
gtr


In [46]:
# E-info for one database: returns metadata about PubMed (description,
# record count, searchable fields, available links).
# `usehistory` is a history-server option of E-search/E-post and is not an
# E-info parameter, so it is not passed here.
handle = Entrez.einfo(db='pubmed')
result = Entrez.read(handle)
handle.close()  # the XML is fully parsed, so release the connection

In [47]:
# Show which top-level entries the E-info record for the database contains.
# Iterating the mapping itself yields its keys.
for field_name in result['DbInfo']:
    print(field_name)

DbName
MenuName
Description
DbBuild
Count
LastUpdate
FieldList
LinkList


In [79]:
# E-search: query PubMed for a term and list the fields of the parsed reply.
handle = Entrez.esearch(db='pubmed', term="BRCA1")
result = Entrez.read(handle)
handle.close()  # the XML is fully parsed, so release the connection
for key in result:
    print(key)


Count
RetMax
RetStart
IdList
TranslationSet
TranslationStack
QueryTranslation


In [80]:
# Total number of records matching the search (Entrez returns it as a string).
result['Count']

'16369'

In [81]:
# Primary IDs returned by this request; only a subset of the `Count` matches
# comes back in a single call.
result['IdList']

['31729099', '31727767', '31727117', '31724318', '31723001', '31721094', '31719105', '31719101', '31717415', '31711518', '31706893', '31706282', '31706072', '31705765', '31705130', '31704974', '31701019', '31700994', '31700549', '31699802']

In [102]:
# E-search in the protein database, narrowing the query with Entrez field
# tags ([Organism], [Gene Name]) combined with AND.
handle = Entrez.esearch(
    db='protein',
    term='"Homo sapiens"[Organism] AND BRCA1[Gene Name] AND REFSEQ'
)
result = Entrez.read(handle)
handle.close()  # the XML is fully parsed, so release the connection
result['IdList']

['6552299', '237681125', '237681123', '237681121', '237681119']

In [67]:
# E-fetch: download full records by primary ID; several IDs can be passed as
# one comma-separated string. rettype='gb' requests GenBank format.
handle = Entrez.efetch(
    db='nucleotide',
    id='186972394,187372713',
    rettype='gb'
)
# To print the raw GenBank files, uncomment the next line:
# print(handle.read())
handle.close()  # nothing else reads this response, so release the connection

In [82]:
# Search for the human BRCA1 RefSeq proteins, then fetch and parse the hits.
handle = Entrez.esearch(
    db='protein',
    term='"Homo sapiens"[Organism] AND BRCA1[Gene Name] AND REFSEQ'
)
result = Entrez.read(handle)
handle.close()  # done with the search response

# E-fetch takes the IDs as a single comma-separated string.
id_list = ','.join(result['IdList'])
handle = Entrez.efetch(
    db='protein',
    id=id_list,
    rettype='gb'
)
for record in SeqIO.parse(handle, 'genbank'):
    print(record.id)
    print(record.description)
handle.close()  # all records have been read from the fetch response

NP_009225.1
breast cancer type 1 susceptibility protein isoform 1 [Homo sapiens]
NP_009230.2
breast cancer type 1 susceptibility protein isoform 5 [Homo sapiens]
NP_009229.2
breast cancer type 1 susceptibility protein isoform 4 [Homo sapiens]
NP_009228.2
breast cancer type 1 susceptibility protein isoform 3 [Homo sapiens]
NP_009231.2
breast cancer type 1 susceptibility protein isoform 2 [Homo sapiens]


In [83]:
# The comma-separated ID string that was passed to E-fetch.
id_list

'6552299,237681125,237681123,237681121,237681119'

In [84]:
# Same protein search as before, but with the Entrez history feature enabled
# (usehistory='y'), so the reply also carries 'WebEnv' and 'QueryKey'.
handle = Entrez.esearch(
    usehistory='y',
    db='protein',
    term='"Homo sapiens"[Organism] AND BRCA1[Gene Name] AND REFSEQ',
)
result = Entrez.read(handle)
handle.close()

# Keep the values that follow-up requests need.
count = int(result['Count'])
id_list = result['IdList']
session_cookie, query_key = result['WebEnv'], result['QueryKey']

for value in (count, id_list):
    print(value)

5
['6552299', '237681125', '237681123', '237681121', '237681119']


In [85]:
# Download the stored search results in chunks of `cnk_size` records, using
# the WebEnv / QueryKey values from the history-enabled search instead of
# passing the IDs explicitly.
cnk_size = 100
for cnk_start in range(0, count, cnk_size):
    handle = Entrez.efetch(
        db='protein',
        rettype='gb',
        retstart=cnk_start,  # offset of the first record in this chunk
        retmax=cnk_size,     # at most this many records per request
        webenv=session_cookie,
        query_key=query_key
    )
    try:
        for record in SeqIO.parse(handle, 'genbank'):
            print(record.id, record.description)
    finally:
        # Close the response even if parsing fails part-way through a chunk.
        handle.close()

NP_009225.1 breast cancer type 1 susceptibility protein isoform 1 [Homo sapiens]
NP_009230.2 breast cancer type 1 susceptibility protein isoform 5 [Homo sapiens]
NP_009229.2 breast cancer type 1 susceptibility protein isoform 4 [Homo sapiens]
NP_009228.2 breast cancer type 1 susceptibility protein isoform 3 [Homo sapiens]
NP_009231.2 breast cancer type 1 susceptibility protein isoform 2 [Homo sapiens]


In [100]:
# BLAST one of the BRCA1 protein hits against mouse RefSeq proteins and save
# the XML result to disk so it can be re-parsed without re-running the search.
id = '6552299'
# NOTE(review): the file name is tagged "e-7" but the search below uses
# expect=1e-5 — confirm which threshold is intended.
file = f'blastp-np-{id}_e-7.xml'

result_handle = NCBIWWW.qblast(
    'blastp',
    'refseq_protein',
    id,
    expect=1e-5,  # E-value cutoff passed to BLAST
    entrez_query='"Mus musculus"[Organism]'  # restrict the search to mouse
)
blast_results = result_handle.read()
result_handle.close()

# The `with` block flushes and closes the file, so the complete XML is on
# disk before any later cell opens it for parsing.
with open(file, 'w') as save_file:
    save_file.write(blast_results)

27761

In [101]:
# Parse the saved BLAST XML and print the title and E-value of every hit.
# NOTE(review): the original comment said "(was 38)" — presumably an earlier
# hit count; confirm or drop.
with open(file) as result_handle:  # closed automatically after parsing
    for blast_result in NCBIXML.parse(result_handle):
        for desc in blast_result.descriptions:
            print('***Alignment***')
            print('sequence:', desc.title)
            print('evalue:', desc.e)
            print()

***Alignment***
sequence: ref|NP_033894.3| breast cancer type 1 susceptibility protein homolog [Mus musculus] >ref|XP_030101355.1| breast cancer type 1 susceptibility protein homolog isoform X1 [Mus musculus]
evalue: 0.0

***Alignment***
sequence: ref|XP_006532127.1| breast cancer type 1 susceptibility protein homolog isoform X2 [Mus musculus]
evalue: 0.0

***Alignment***
sequence: ref|XP_017169722.1| breast cancer type 1 susceptibility protein homolog isoform X3 [Mus musculus]
evalue: 0.0

***Alignment***
sequence: ref|XP_006532131.1| breast cancer type 1 susceptibility protein homolog isoform X4 [Mus musculus]
evalue: 5.77857e-168

***Alignment***
sequence: ref|XP_011240255.1| tripartite motif-containing protein 30A-like isoform X3 [Mus musculus]
evalue: 3.3666e-08

***Alignment***
sequence: ref|XP_006508443.1| tripartite motif-containing protein 30A-like isoform X2 [Mus musculus]
evalue: 5.28179e-08

***Alignment***
sequence: ref|XP_006508442.1| tripartite motif-containing protein 3

In [98]:
# Name of the XML file the BLAST results are written to / parsed from.
file

'blastp-np-6552299.xml'

In [104]:
# Display the raw BLAST XML text (the stray trailing dot was a syntax error).
blast_results

'<?xml version="1.0"?>\n<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n<BlastOutput>\n  <BlastOutput_program>blastp</BlastOutput_program>\n  <BlastOutput_version>BLASTP 2.10.0+</BlastOutput_version>\n  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n  <BlastOutput_db>refseq_protein_v5</BlastOutput_db>\n  <BlastOutput_query-ID>NP_009225.1</BlastOutput_query-ID>\n  <BlastOutput_query-def>breast cancer type 1 susceptibility protein isoform 1 [Homo sapiens]</BlastOutput_query-def>\n  <BlastOutput_query-len>1863</BlastOutput_query-len>\n  <BlastOutput_param>\n    <Parameters>\n      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n      <Parameters_expect>1e-0

In [None]:
# Run a nucleotide BLAST for one sequence ID and save the XML result to disk.
id = '8332116'
file = f'blastn-nr-{id}.xml'
# NOTE(review): 'nr' is normally the protein database; nucleotide searches
# usually target 'nt' — confirm the intended database.
result_handle = NCBIWWW.qblast(
    'blastn',
    'nr',
    id
    # optional organism filter:
    # entrez_query='"Mus musculus"[Organism]'
)
blast_results = result_handle.read()
result_handle.close()

# The `with` block flushes and closes the file once the XML is written.
with open(file, 'w') as save_file:
    save_file.write(blast_results)
