In [1]:
pip install biopython

In [2]:
# Load libraries
import pandas as pd

from Bio import Entrez, SeqIO

In [3]:
# Set your email (required by NCBI).
# This is a module-wide Bio.Entrez setting; NCBIProteinFetcher.__init__ assigns
# the same attribute again when a fetcher is created.
Entrez.email = "James.Chang@bcm.edu"

In [4]:
# Search parameters and queries
query = "antiphage[Title]"
return_maximum = 15000

# Run the search; the handle is closed even if parsing the response raises.
handle = Entrez.esearch(db="protein", term=query, retmax=return_maximum)
try:
    record = Entrez.read(handle)
finally:
    handle.close()

# Get list of sequence IDs from search results
protein_ids = record["IdList"]

# "Count" is the total number of hits for the query, while "IdList" holds only
# the IDs actually returned. Report a shortfall instead of truncating silently.
total_hits = int(record["Count"])
if len(protein_ids) < total_hits:
    print(
        f"Warning: query matched {total_hits} records but only "
        f"{len(protein_ids)} IDs were returned; page with retstart to get the rest."
    )

print(len(protein_ids))

In [5]:
# Define a class to fetch and process protein sequences from NCBI
class NCBIProteinFetcher:
    """Fetch GenBank protein records from NCBI and prepare them for FASTA output.

    Args:
        email: Address assigned to ``Entrez.email``. This is a module-wide
            setting, so it affects every later ``Bio.Entrez`` call.
        return_maximum: Value forwarded as ``retmax`` on each ``efetch`` call.
    """

    # NOTE: the default for ``return_maximum`` is read from the notebook-level
    # variable of the same name once, when this class statement executes.
    # Re-run this cell after changing that variable for the default to follow.
    def __init__(self, email: str, return_maximum: int = return_maximum):
        Entrez.email = email
        self.return_maximum = return_maximum

    # Fetch sequences from NCBI using protein IDs
    def fetch_sequences(self, protein_ids):
        """Fetch GenBank-format protein records from NCBI.

        Args:
            protein_ids: Iterable of protein IDs (strings or integers), or a
                single ID given as a string.

        Returns:
            A list of the records parsed from the response. An empty input
            yields an empty list without contacting NCBI.
        """
        # A bare string would otherwise be joined character by character.
        if isinstance(protein_ids, str):
            protein_ids = [protein_ids]
        id_list = [str(protein_id) for protein_id in protein_ids]
        if not id_list:
            return []

        handle = Entrez.efetch(
            db="protein",
            id=",".join(id_list),
            rettype="gb",
            retmode="text",
            retmax=self.return_maximum
        )
        # Close the handle even when parsing the response fails part-way.
        try:
            return list(SeqIO.parse(handle, "genbank"))
        finally:
            handle.close()

    # Clean up and process the fetched records
    def update_record_descriptions(self, records):
        """Modify record.id to include description and clean it for FASTA headers.

        Each record is changed in place: spaces in the description become
        underscores, commas and semicolons are dropped, the result is appended
        to ``record.id`` after an underscore, and ``record.description`` is
        emptied. Calling this twice on the same records appends a second
        (empty) suffix, so apply it once per record.

        Args:
            records: Iterable of records exposing ``id`` and ``description``.

        Returns:
            The same ``records`` object that was passed in.
        """
        for record in records:
            clean_desc = record.description.replace(" ", "_").replace(",", "").replace(";", "")
            record.id = f"{record.id}_{clean_desc}"
            record.description = ""
        return records

In [6]:
# Fetch and update records
def fetch_and_update_sequences(protein_ids):
    """Fetch the records for ``protein_ids`` and rewrite their IDs in one call.

    Relies on the notebook-level ``fetcher`` object, which must be defined
    before this function is called.

    Args:
        protein_ids: IDs passed straight to ``fetcher.fetch_sequences``.

    Returns:
        The value returned by ``fetcher.update_record_descriptions`` for the
        fetched records.
    """
    return fetcher.update_record_descriptions(fetcher.fetch_sequences(protein_ids))

In [8]:
# Retrieve sequences from NCBI
# Instantiate the fetcher; fetch_and_update_sequences reads it as a global,
# so one instance serves every batch below.
fetcher = NCBIProteinFetcher(email="James.Chang@bcm.edu", return_maximum=return_maximum)

step = 100  # number of IDs requested per batch
records_all = []  # accumulates the processed records from every batch

# Retrieve and process sequences in increments
for i in range(0, len(protein_ids), step):
    print(f"Processing records from {i} to {min(i + step, len(protein_ids))}")
    # Slicing clamps at the end of the list, so the final batch may be shorter.
    protein_ids_chunk = protein_ids[i:i + step]
    records_chunk = fetch_and_update_sequences(protein_ids_chunk)
    records_all += records_chunk

In [10]:
# Write to FASTA file
# Destination is an absolute path; the file is created or overwritten.
output_file = "/home/azureuser/cloudfiles/code/Users/jc62/projects/direct_sequence_analysis/data/antiphage_sequences.fasta"
with open(output_file, mode="w") as fasta_out:
    # Serialise every collected record in FASTA format.
    SeqIO.write(sequences=records_all, handle=fasta_out, format="fasta")