In [1]:
import os
import pandas as pd
from Bio import Entrez

In [2]:
# Contact address that Biopython sends along with every E-utilities request.
# It can be overridden through the ENTREZ_EMAIL environment variable so the
# notebook can be re-run under a different address without editing this cell;
# when the variable is unset the original address is used.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'chen1i6c04@gmail.com')

def einfo(db=None):
    """Query Entrez EInfo.

    Without ``db`` (or with a falsy ``db``) the ``DbList`` entry of the EInfo
    reply is returned. With ``db``, the ``Name`` and ``Description`` of every
    entry in that database's ``FieldList`` is printed, one field per line, and
    nothing is returned.
    """
    if not db:
        with Entrez.einfo() as handle:
            listing = Entrez.read(handle)
        return listing['DbList']

    with Entrez.einfo(db=db) as handle:
        details = Entrez.read(handle)
    for field in details['DbInfo']['FieldList']:
        print(field['Name'], field['Description'])

def egquery(term, db='sra'):
    """Return the EGQuery hit count of ``term`` for one database.

    Runs a global Entrez query for ``term`` and looks for the first result row
    whose ``DbName`` equals ``db``. That row's ``Count`` is returned as an
    ``int``; if no row matches, ``None`` is returned.
    """
    with Entrez.egquery(term=term) as handle:
        rows = Entrez.read(handle)["eGQueryResult"]
    # Lazy generator: only the first matching row's count is converted.
    counts = (int(entry["Count"]) for entry in rows if entry["DbName"] == db)
    return next(counts, None)

def esearch(term, db="sra", retmax=10000):
    """Search an Entrez database and return the matching UIDs.

    ``term`` is the query string, ``db`` the database to search and ``retmax``
    the maximum number of UIDs requested. Returns the ``IdList`` entry of the
    parsed ESearch reply.
    """
    with Entrez.esearch(db=db, term=term, retmax=retmax) as handle:
        reply = Entrez.read(handle)
    return reply['IdList']

def efetch(uid, db, rettype='xml', retmode="xml", **kwargs):
    """Fetch one or more records from Entrez and return the raw response body.

    Parameters
    ----------
    uid : identifier(s) passed to EFetch as ``id``.
    db : Entrez database name.
    rettype, retmode : EFetch return type / mode (both default to ``'xml'``).
    **kwargs : any further EFetch parameters, forwarded unchanged — for
        example ``seq_start``, ``seq_stop`` and ``strand``, which the sequence
        downloads in this notebook need and which this wrapper previously
        could not pass on.

    Returns
    -------
    Whatever ``handle.read()`` yields for the response.
    """
    with Entrez.efetch(db=db, id=uid, rettype=rettype, retmode=retmode, **kwargs) as handle:
        return handle.read()

In [3]:
# Entrez database name passed to the einfo / efetch calls in the cells below.
db = 'nuccore'

In [4]:
# Print the Name and Description of every entry in the selected database's FieldList.
einfo(db)

ALL All terms from all searchable fields
UID Unique number assigned to each sequence
FILT Limits the records
WORD Free text associated with record
TITL Words in definition line
KYWD Nonstandardized terms provided by submitter
AUTH Author(s) of publication
JOUR Journal abbreviation of publication
VOL Volume number of publication
ISS Issue number of publication
PAGE Page number(s) of publication
ORGN Scientific and common names of organism, and all higher levels of taxonomy
ACCN Accession number of sequence
PACC Does not include retired secondary accessions
GENE Name of gene associated with sequence
PROT Name of protein associated with sequence
ECNO EC number for enzyme or CAS registry number
PDAT Date sequence added to GenBank
MDAT Date of last update
SUBS CAS chemical name or MEDLINE Substance Name
PROP Classification by source qualifiers and molecule type
SQID String identifier for sequence
GPRJ BioProject
SLEN Length of sequence
FKEY Feature annotated on sequence
PORG Scientific and 

In [9]:
# Request positions 2889-4262 of MT663352.1 as FASTA text. strand=2 is the same
# code the cmeC download loop uses for genes that are not on the 'plus' strand.
# The handle is left open here; the next cell reads from it.
handle = Entrez.efetch(
    db,
    id='MT663352.1',
    rettype='fasta',
    retmode='text',
    seq_start=2889,
    seq_stop=4262,
    strand=2,
)

In [10]:
# Drain and close the Entrez handle BEFORE opening the output file.
# Previously the handle was never closed, and re-running this cell read nothing
# from the already-consumed handle and overwrote the FASTA file with an empty
# string. With the handle closed, a re-run fails on read() before the existing
# file is touched, and a failed download no longer leaves a truncated file.
with handle:
    fasta_text = handle.read()

with open('/media/NGS/Data_Analysis/20210220_Campylobacter_resistance/tet(L)_varine.fasta', 'w') as f:
    f.write(fasta_text)

In [5]:
from collections import defaultdict
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# ElementTree is the supported module and exposes the same API under the
# same ET alias.
from xml.etree import ElementTree as ET

In [6]:
# XML file to parse. NOTE(review): the element names read in the next cell
# (Gene-track_geneid, Gene-commentary_accession, ...) suggest an NCBI Gene
# export — confirm. The path is absolute and specific to one machine.
filepath = "/home/chen1i6c04/下載/gene_result.xml"

tree = ET.parse(filepath)
# Root element of the document; the next cell iterates over its direct children.
root = tree.getroot()

In [7]:
# (column name, XPath) pairs for the fields copied verbatim from an element's
# text, listed in the order the columns should appear after 'accession'.
text_fields = (
    ('species', '*//Org-ref_taxname'),
    ('from', '*//Seq-interval_from'),
    ('to', '*//Seq-interval_to'),
)

# gene id -> {accession, species, from, to, strand}; every value is the raw
# text (or attribute value) found in the XML, so 'from'/'to' are still strings.
summary = defaultdict(dict)
for entrezgene in root:
    geneid = entrezgene.find('*//Gene-track_geneid').text
    accession = entrezgene.find('*//Gene-commentary_accession').text
    version = entrezgene.find('*//Gene-commentary_version').text

    entry = summary[geneid]
    entry['accession'] = accession + '.' + version
    # The 'product' column (Prot-ref_desc) is not extracted; it was disabled
    # in the original cell.
    for column, xpath in text_fields:
        entry[column] = entrezgene.find(xpath).text
    # Strand is stored as the 'value' attribute of the first Na-strand element.
    entry['strand'] = entrezgene.find('*//Na-strand').attrib['value']

In [8]:
# `summary` maps gene id -> field dict, so DataFrame(summary) has one column
# per gene id; transposing yields one row per gene id with the fields as columns.
df = pd.DataFrame(summary).T
df

Unnamed: 0,accession,species,from,to,strand
904688,NC_002163.1,Campylobacter jejuni subsp. jejuni NCTC 11168 ...,331124,332602,minus
56644357,NZ_VOPG01000001.1,Campylobacter coli,1441295,1442773,plus


In [9]:
# Convert 'from'/'to' to integers and shift them up by one.
# NOTE(review): the +1 presumably turns 0-based interval coordinates from the
# XML into the 1-based seq_start/seq_stop that efetch expects — confirm.
#
# The values arrive as text (they are read from element .text), so a column
# that already has an integer dtype has been shifted by an earlier run of this
# cell. Skipping such columns makes the cell idempotent: previously every
# re-run added another +1 and silently moved the fetched region.
for col in ('from', 'to'):
    if not pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].astype(int) + 1

In [10]:
# Drop rows whose 'accession' repeats that of an earlier row (pandas keeps the
# first occurrence by default). `df` is rebound, so the unfiltered frame is lost.
df = df.drop_duplicates('accession')

In [11]:
# Display the frame after the coordinate shift and de-duplication.
df

Unnamed: 0,accession,species,from,to,strand
904688,NC_002163.1,Campylobacter jejuni subsp. jejuni NCTC 11168 ...,331125,332603,minus
56644357,NZ_VOPG01000001.1,Campylobacter coli,1441296,1442774,plus


In [12]:
# Download the FASTA text of each gene's region and concatenate the responses
# in row order.
fasta_chunks = []
for _, row in df.iterrows():
    # efetch strand code: 1 for 'plus', 2 for anything else.
    strand = 1 if row['strand'] == 'plus' else 2
    region = dict(seq_start=row['from'], seq_stop=row['to'], strand=strand)
    with Entrez.efetch(db, id=row['accession'], rettype='fasta', retmode='text',
                       **region) as handle:
        fasta_chunks.append(handle.read())

fetch_results = "".join(fasta_chunks)

In [13]:
# Save the concatenated FASTA text held in `fetch_results`.
cmec_fasta_path = "/media/NGS/Data_Analysis/20210220_Campylobacter_resistance/cmeC/campylobacter_cmeC.fna"
with open(cmec_fasta_path, 'w') as f:
    f.write(fetch_results)