In [3]:
pip install biopython
from Bio import Entrez
from Bio import SeqIO
from prettytable import PrettyTable
import pandas as pd

Collecting biopython
  Downloading biopython-1.81.tar.gz (19.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: biopython
  Building wheel for biopython (setup.py) ... [?25ldone
[?25h  Created wheel for biopython: filename=biopython-1.81-cp311-cp311-macosx_11_0_arm64.whl size=2651285 sha256=2e9fa4d11e00d549dd9a0596d7e6904e4b9e66420f0721e806616c789d74c30c
  Stored in directory: /Users/davisdunk/Library/Caches/pip/wheels/89/c6/87/163bb8067147772bc1df3e3ba97e4b253cdf64aee4d805385c
Successfully built biopython
Installing collected packages: biopython
Successfully installed biopython-1.81
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Find the most recent genome record for a given species and genome type

def find_latest_genome(species, genome_type):
    """Fetch the first GenBank record NCBI returns for a species and genome type.

    Prompts for a contact e-mail on every call and stores it in
    ``Entrez.email``, searches the ``nucleotide`` database for
    ``<species>[Organism] AND <genome_type>[Title]`` and downloads the first
    hit in GenBank format.

    Args:
        species: Organism name placed in the ``[Organism]`` search field.
        genome_type: Text matched against the ``[Title]`` search field.

    Returns:
        The object produced by ``SeqIO.read(handle, "genbank")`` for the first
        ID in the search result, or ``None`` when the search returns no IDs.

    Raises:
        Any exception raised by the Entrez calls or the parsers propagates to
        the caller; the open handle is closed first.
    """
    # Define the search term for the species and genome type
    search_term = f"{species}[Organism] AND {genome_type}[Title]"

    # Use the Entrez API to search for genomes
    Entrez.email = input("Please enter your email: ")

    # Only the first ID is used below, so ask the server for a single hit.
    # NOTE(review): "most recent" is passed through unchanged as the sort key;
    # confirm it is a sort order the nucleotide database actually honours.
    handle = Entrez.esearch(db="nucleotide", term=search_term, sort="most recent", retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the connection even when parsing the response fails
        handle.close()

    # Get the list of matching genome IDs
    genome_ids = record["IdList"]

    if not genome_ids:
        return None  # No genomes found

    # The first ID is the top hit under the requested sort order
    latest_genome_id = genome_ids[0]

    # Retrieve the genome record
    handle = Entrez.efetch(db="nucleotide", id=latest_genome_id, rettype="gb", retmode="text")
    try:
        genome_record = SeqIO.read(handle, "genbank")
    finally:
        handle.close()

    return genome_record

def get_reference_sequence(genome_record):
    """Pick the identifier to report as the record's reference sequence.

    Walks ``genome_record.features`` in order. The first feature whose type is
    ``"source"`` and whose qualifiers contain a ``"refseq"`` key supplies the
    first value stored under that key. If no feature qualifies, the record's
    own ``id`` is returned instead.
    """
    not_found = object()  # sentinel so a stored None value is still returned as-is
    refseq_values = (
        feat.qualifiers["refseq"][0]
        for feat in genome_record.features
        if feat.type == "source" and "refseq" in feat.qualifiers
    )
    first_value = next(refseq_values, not_found)
    if first_value is not_found:
        # No source feature carries a refseq qualifier: fall back to the record ID
        return genome_record.id
    return first_value

if __name__ == "__main__":
    # Read the query interactively; surrounding whitespace is not part of the search terms
    species = input("Please enter the species in 'Genus species' format: ").strip()
    genome_type = input("Please enter the type of genome (e.g., 'whole' or 'partial'): ").strip()

    if not species or not genome_type:
        # A blank field would produce a malformed query such as "[Organism] AND [Title]"
        print("Both a species and a genome type are required; no search was run.")
    else:
        latest_genome = find_latest_genome(species, genome_type)

        # find_latest_genome returns None when the search has no hits
        if latest_genome is not None:
            reference_sequence = get_reference_sequence(latest_genome)
            publication_date = latest_genome.annotations.get("date", "Date not available")

            print(f"Most recent {genome_type} genome for {species}: {latest_genome.description}")
            print(f"Ideal reference sequence: {reference_sequence}")
            print(f"Publication Date: {publication_date}")
        else:
            print(f"No {genome_type} genomes found for {species}")

        

KeyboardInterrupt: Interrupted by user

Please enter the species in 'Genus species' format:  


In [None]:
# Find the most recent genome for each species/genome-type pair and show the results in a table

def find_latest_genome(email, species, genome_type):
    """Fetch the first GenBank record NCBI returns for a species and genome type.

    Stores ``email`` in ``Entrez.email``, searches the ``nucleotide`` database
    for ``<species>[Organism] AND <genome_type>[Title]`` and downloads the
    first hit in GenBank format.

    Args:
        email: Contact address assigned to ``Entrez.email`` before the requests.
        species: Organism name placed in the ``[Organism]`` search field.
        genome_type: Text matched against the ``[Title]`` search field.

    Returns:
        The object produced by ``SeqIO.read(handle, "genbank")`` for the first
        ID in the search result, or ``None`` when the search returns no IDs.

    Raises:
        Any exception raised by the Entrez calls or the parsers propagates to
        the caller; the open handle is closed first.
    """
    # Define the search term for the species and genome type
    search_term = f"{species}[Organism] AND {genome_type}[Title]"

    # Use the Entrez API to search for genomes
    Entrez.email = email

    # Only the first ID is used below, so ask the server for a single hit.
    # NOTE(review): "most recent" is passed through unchanged as the sort key;
    # confirm it is a sort order the nucleotide database actually honours.
    handle = Entrez.esearch(db="nucleotide", term=search_term, sort="most recent", retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the connection even when parsing the response fails
        handle.close()

    # Get the list of matching genome IDs
    genome_ids = record["IdList"]

    if not genome_ids:
        return None  # No genomes found

    # The first ID is the top hit under the requested sort order
    latest_genome_id = genome_ids[0]

    # Retrieve the genome record
    handle = Entrez.efetch(db="nucleotide", id=latest_genome_id, rettype="gb", retmode="text")
    try:
        genome_record = SeqIO.read(handle, "genbank")
    finally:
        handle.close()

    return genome_record

def get_reference_sequence(genome_record):
    """Return the reference-sequence identifier to report for a record.

    The features of ``genome_record`` are examined in order. A feature counts
    only if its type is ``"source"`` and its qualifiers include a ``"refseq"``
    key; the first such feature yields the first value under that key. With no
    qualifying feature the function returns ``genome_record.id``.
    """
    for feat in genome_record.features:
        # Skip anything that is not a source feature
        if feat.type != "source":
            continue
        quals = feat.qualifiers
        # Skip source features without a refseq qualifier
        if "refseq" not in quals:
            continue
        return quals["refseq"][0]

    # Nothing matched: report the record's own identifier
    return genome_record.id

if __name__ == "__main__":
    # Prompt the user for their email
    email = input("Please enter your email: ").strip()

    # Enter multiple species and genome types.
    # Entries are trimmed (users usually type a space after each comma) and blanks are dropped.
    species_list = [
        entry.strip()
        for entry in input("Please enter the species in 'Genus species' format, separated by commas: ").split(',')
        if entry.strip()
    ]
    genome_type_list = [
        entry.strip()
        for entry in input("Please enter the types of genomes (e.g., 'whole' or 'partial'), separated by commas: ").split(',')
        if entry.strip()
    ]

    # A single genome type is applied to every species listed
    if len(genome_type_list) == 1 and len(species_list) > 1:
        genome_type_list = genome_type_list * len(species_list)

    # zip() would silently drop the unmatched tail, so refuse mismatched lists up front
    if len(species_list) != len(genome_type_list):
        raise ValueError(
            f"Got {len(species_list)} species but {len(genome_type_list)} genome types; "
            "enter one genome type per species, or a single type for all of them."
        )

    # Create a PrettyTable for the output
    table = PrettyTable()
    table.field_names = ["Species", "Genome Type", "Description", "Reference Sequence", "Publication Date"]

    for species, genome_type in zip(species_list, genome_type_list):
        latest_genome = find_latest_genome(email, species, genome_type)

        # find_latest_genome returns None when the search has no hits
        if latest_genome is not None:
            reference_sequence = get_reference_sequence(latest_genome)
            publication_date = latest_genome.annotations.get("date", "Date not available")

            # Add a row to the table
            table.add_row([species, genome_type, latest_genome.description, reference_sequence, publication_date])
        else:
            # Add a row indicating no genomes found
            table.add_row([species, genome_type, "No genome found", "", ""])

    # Print the table
    print(table)


In [None]:
# Find the most recent genome for each species/genome-type pair and save the results to an Excel file

def find_latest_genome(email, species, genome_type):
    """Fetch the first GenBank record NCBI returns for a species and genome type.

    Stores ``email`` in ``Entrez.email``, searches the ``nucleotide`` database
    for ``<species>[Organism] AND <genome_type>[Title]`` and downloads the
    first hit in GenBank format.

    Args:
        email: Contact address assigned to ``Entrez.email`` before the requests.
        species: Organism name placed in the ``[Organism]`` search field.
        genome_type: Text matched against the ``[Title]`` search field.

    Returns:
        The object produced by ``SeqIO.read(handle, "genbank")`` for the first
        ID in the search result, or ``None`` when the search returns no IDs.

    Raises:
        Any exception raised by the Entrez calls or the parsers propagates to
        the caller; the open handle is closed first.
    """
    # Define the search term for the species and genome type
    search_term = f"{species}[Organism] AND {genome_type}[Title]"

    # Use the Entrez API to search for genomes
    Entrez.email = email

    # Only the first ID is used below, so ask the server for a single hit.
    # NOTE(review): "most recent" is passed through unchanged as the sort key;
    # confirm it is a sort order the nucleotide database actually honours.
    handle = Entrez.esearch(db="nucleotide", term=search_term, sort="most recent", retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the connection even when parsing the response fails
        handle.close()

    # Get the list of matching genome IDs
    genome_ids = record["IdList"]

    if not genome_ids:
        return None  # No genomes found

    # The first ID is the top hit under the requested sort order
    latest_genome_id = genome_ids[0]

    # Retrieve the genome record
    handle = Entrez.efetch(db="nucleotide", id=latest_genome_id, rettype="gb", retmode="text")
    try:
        genome_record = SeqIO.read(handle, "genbank")
    finally:
        handle.close()

    return genome_record

def get_reference_sequence(genome_record):
    """Choose the reference-sequence identifier shown for a record.

    Returns the first ``"refseq"`` qualifier value of the first feature that
    is of type ``"source"`` and has a ``"refseq"`` qualifier. Features are
    checked in the order they appear in ``genome_record.features``. When none
    match, ``genome_record.id`` is returned.
    """
    def has_refseq(feat):
        # Type is checked first so qualifiers are only read on source features
        return feat.type == "source" and "refseq" in feat.qualifiers

    for source_feat in filter(has_refseq, genome_record.features):
        # Only the first qualifying feature matters
        return source_feat.qualifiers["refseq"][0]

    # No qualifying feature: use the record identifier instead
    return genome_record.id

if __name__ == "__main__":
    # Prompt the user for their email
    email = input("Please enter your email: ").strip()

    # Enter multiple species and genome types.
    # Entries are trimmed (users usually type a space after each comma) and blanks are dropped.
    species_list = [
        entry.strip()
        for entry in input("Please enter the species in 'Genus species' format, separated by commas: ").split(',')
        if entry.strip()
    ]
    genome_type_list = [
        entry.strip()
        for entry in input("Please enter the types of genomes (e.g., 'whole' or 'partial'), separated by commas: ").split(',')
        if entry.strip()
    ]

    # A single genome type is applied to every species listed
    if len(genome_type_list) == 1 and len(species_list) > 1:
        genome_type_list = genome_type_list * len(species_list)

    # zip() would silently drop the unmatched tail, so refuse mismatched lists up front
    if len(species_list) != len(genome_type_list):
        raise ValueError(
            f"Got {len(species_list)} species but {len(genome_type_list)} genome types; "
            "enter one genome type per species, or a single type for all of them."
        )

    # Collect one row per species/genome-type pair
    data = []

    for species, genome_type in zip(species_list, genome_type_list):
        latest_genome = find_latest_genome(email, species, genome_type)

        # find_latest_genome returns None when the search has no hits
        if latest_genome is not None:
            reference_sequence = get_reference_sequence(latest_genome)
            publication_date = latest_genome.annotations.get("date", "Date not available")

            # Add data to the list
            data.append([species, genome_type, latest_genome.description, reference_sequence, publication_date])
        else:
            # Add data indicating no genomes found
            data.append([species, genome_type, "No genome found", "", ""])

    # Create a DataFrame from the list
    df = pd.DataFrame(data, columns=["Species", "Genome Type", "Description", "Reference Sequence", "Publication Date"])

    # Save the DataFrame to an Excel file
    excel_filename = "genome_data.xlsx"
    df.to_excel(excel_filename, index=False)

    # Read the Excel file back to confirm what was written
    df_read = pd.read_excel(excel_filename)

    # Display the DataFrame read from the Excel file
    print(df_read)
