In [27]:
import pandas as pd
from Bio import Entrez
from Bio import SeqIO
# Set the contact e-mail on the Entrez module before any of the queries below are issued.
Entrez.email = "edoardo.giacopuzzi@well.ox.ac.uk"

In [None]:
# Retrieve the list of available databases
handle = Entrez.einfo()
record = Entrez.read(handle)
# Close the response handle as soon as it has been parsed (same pattern as the efetch cell)
handle.close()
record["DbList"]

In [12]:
# See the descriptors available for a single database.
# The record is fetched here so the cell works on a top-to-bottom run: the
# database-list response read in the previous cell is accessed through "DbList",
# whereas the "DbInfo" entry used below comes from a per-database einfo query.
handle = Entrez.einfo(db="nucleotide")
record = Entrez.read(handle)
handle.close()
record["DbInfo"].keys()

dict_keys(['DbName', 'MenuName', 'Description', 'DbBuild', 'Count', 'LastUpdate', 'FieldList', 'LinkList'])

In [9]:
# Access a specific database: load the einfo record for "nucleotide"
handle = Entrez.einfo(db="nucleotide")
record = Entrez.read(handle)
# Close the response handle once the record has been parsed
handle.close()

In [None]:
# List the searchable fields of the database described by `record`:
# one line per field, formatted as "Name, FullName, Description".
field_template = "%(Name)s, %(FullName)s, %(Description)s"
searchable_fields = record["DbInfo"]["FieldList"]
for field in searchable_fields:
    line = field_template % field
    print(line)

In [15]:
# Perform a search in the gene database
# See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch for detailed options
handle = Entrez.esearch(db="gene", term="Homo sapiens[ORGN] AND NEU1[GENE]")
record = Entrez.read(handle)
# Close the response handle once the result has been parsed
handle.close()
record["IdList"]

['4758', '51162', '9148']

In [33]:
# Get summary info for a single gene record
# See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary for detailed options
handle = Entrez.esummary(db="gene", id="4758")
record = Entrez.read(handle)
# Close the response handle once the record has been parsed
handle.close()

# Look up the nested entries once instead of repeating the full path on every line:
# the first document summary of the response, and its first genomic placement.
gene_doc = record['DocumentSummarySet']['DocumentSummary'][0]
gene_locus = gene_doc['GenomicInfo'][0]

# For a gene get symbol, description, location, summary
gene_symbol = gene_doc['Name']
gene_desc = gene_doc['Description']
gene_chr = gene_locus['ChrLoc']
# Coordinates are used as returned (strings); ChrStart can be larger than ChrStop,
# as it is for this gene, so the printed range is not necessarily ascending.
gene_start = gene_locus['ChrStart']
gene_end = gene_locus['ChrStop']
gene_summ = gene_doc['Summary']

print("Symbol: "+gene_symbol+"\nLocation: "+gene_chr+":"+gene_start+"-"+gene_end+"\n"+gene_summ)

Symbol: NEU1
Location: 6:31862820-31857658
The protein encoded by this gene is a lysosomal enzyme that cleaves terminal sialic acid residues from substrates such as glycoproteins and glycolipids. In the lysosome, this enzyme is part of a heterotrimeric complex together with beta-galactosidase and cathepsin A (the latter is also referred to as 'protective protein'). Mutations in this gene can lead to sialidosis, a lysosomal storage disease that can be type 1 (cherry red spot-myoclonus syndrome or normosomatic type), which is late-onset, or type 2 (the dysmorphic type), which occurs at an earlier age with increased severity. [provided by RefSeq, Jul 2008]


In [3]:
# For a list of genes, collect summary information into a pandas table
mygenes = ['NEU1','DOCK9','TTN']
myorgn = "Homo sapiens"

# Column order of the resulting table
genetab_columns = ['ID', 'symbol', 'desc', 'location', 'MIM', 'length', 'summ']


def _read_entrez(handle):
    """Parse an Entrez response with Entrez.read and close the handle, even if parsing fails.

    Parameters
    ----------
    handle : file-like object returned by an Entrez query function.

    Returns
    -------
    The parsed record produced by Entrez.read.
    """
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


# Rows are accumulated in a plain list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole table on every call.
gene_rows = []

for g in mygenes:
    # A symbol can match several gene records (the NEU1 search above returns three IDs),
    # so every returned ID is kept as its own row.
    search_result = _read_entrez(
        Entrez.esearch(db="gene", term=myorgn+"[ORGN] AND "+g+"[GENE]")
    )

    # Retrieve info for every found ID
    for ID in search_result["IdList"]:
        record = _read_entrez(Entrez.esummary(db="gene", id=ID))
        gene_doc = record['DocumentSummarySet']['DocumentSummary'][0]

        # Build "chr:start-stop" from the first genomic placement; leave the location
        # empty instead of raising IndexError when a record carries no placement.
        genomic_info = gene_doc['GenomicInfo']
        if genomic_info:
            placement = genomic_info[0]
            location = placement['ChrLoc']+":"+placement['ChrStart']+"-"+placement['ChrStop']
        else:
            location = ""

        gene_rows.append({
            'ID': ID,
            'symbol': gene_doc['Name'],
            'desc': gene_doc['Description'],
            'location': location,
            'MIM': gene_doc['Mim'],
            # NOTE(review): 'length' is filled from GeneWeight, whose values do not match
            # the ChrStart/ChrStop span of the same record - it looks like a relevance
            # weight rather than a length in bp; confirm the intended meaning.
            'length': gene_doc['GeneWeight'],
            'summ': gene_doc['Summary'],
        })

# Passing the column list keeps the column order and yields an empty, correctly
# labelled table when no gene was found.
genetab = pd.DataFrame(gene_rows, columns=genetab_columns)

genetab

Unnamed: 0,ID,symbol,desc,location,MIM,length,summ
0,4758,NEU1,neuraminidase 1,6:31862820-31857658,[608272],10269,The protein encoded by this gene is a lysosoma...
1,51162,EGFL7,EGF like domain multiple 7,9:136654752-136672677,[608582],6960,This gene encodes a secreted endothelial cell ...
2,9148,NEURL1,neuralized E3 ubiquitin protein ligase 1,10:103493704-103592545,[603804],1526,
3,23348,DOCK9,dedicator of cytokinesis 9,13:99086693-98793428,[607325],1963,
4,7273,TTN,titin,2:178807422-178525988,[188840],32078,This gene encodes a large abundant protein of ...


In [24]:
# Search the nucleotide database
handle = Entrez.esearch(db="nucleotide", term="Homo sapiens[ORGN] AND SLC7A4[GENE]")
record = Entrez.read(handle)
# Close the response handle once the result has been parsed
handle.close()
record["IdList"]

['568815576', '1519313486', '123998448', '74273660', '74230000', '76827291', '76825405', '38512229', '33873956', '66904765', '261857719', '47678690', '109452089', '109451493', '123984506']

In [33]:
# Fetch one nucleotide entry as GenBank flat text and parse it into a single SeqIO record.
# Available rettype/retmode values are listed at
# https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly
handle = Entrez.efetch(
    db="nucleotide",
    id="568815576",
    rettype="gb",
    retmode="text",
)
record = SeqIO.read(handle, "genbank")
handle.close()
print(record)

ID: NC_000022.11
Name: NC_000022
Description: Homo sapiens chromosome 22, GRCh38.p13 Primary Assembly
Database cross-references: BioProject:PRJNA168, Assembly:GCF_000001405.39
Number of features: 1
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=14-JUN-2019
/accessions=['NC_000022']
/sequence_version=11
/keywords=['RefSeq']
/source=Homo sapiens (human)
/organism=Homo sapiens
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
/references=[Reference(title='Finishing the finished human chromosome 22 sequence', ...), Reference(title='Finishing the euchromatic sequence of the human genome', ...), Reference(title='Initial sequencing and analysis of the human genome', ...)]
/comment=REFSEQ INFORMATION: The reference sequence is identical to
CM000684.2.
On Feb 3, 2014 this sequence version replaced NC_000022.10.
Assembly Name: GRCh38.