In [1]:
pip install biopython

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting biopython
  Downloading biopython-1.81-cp311-cp311-win_amd64.whl (2.7 MB)
     ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
     --- ------------------------------------ 0.3/2.7 MB 7.9 MB/s eta 0:00:01
     ----------------- ---------------------- 1.2/2.7 MB 15.4 MB/s eta 0:00:01
     ---------------------------------------  2.7/2.7 MB 21.6 MB/s eta 0:00:01
     ---------------------------------------- 2.7/2.7 MB 19.2 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.81


In [5]:
from Bio import Entrez
from Bio import SeqIO

def find_latest_genome(species, genome_type):
    # Define the search term for the species and genome type
    search_term = f"{species}[Organism] AND {genome_type}[Title]"

    # Use the Entrez API to search for genomes
    Entrez.email = input("Please enter your email: ")
    handle = Entrez.esearch(db="nucleotide", term=search_term, sort="most recent")
    record = Entrez.read(handle)
    handle.close()

    # Get the list of matching genome IDs
    genome_ids = record["IdList"]

    if not genome_ids:
        return None  # No genomes found

    # Get the most recent genome record
    latest_genome_id = genome_ids[0]

    # Retrieve the genome record
    handle = Entrez.efetch(db="nucleotide", id=latest_genome_id, rettype="gb", retmode="text")
    genome_record = SeqIO.read(handle, "genbank")
    handle.close()

    return genome_record

def get_reference_sequence(genome_record):
    # Determine the ideal reference sequence (usually the primary assembly)
    for feature in genome_record.features:
        if feature.type == "source" and "refseq" in feature.qualifiers:
            return feature.qualifiers["refseq"][0]

    # If no ideal reference sequence is found, return the first sequence in the record
    return genome_record.id

if __name__ == "__main__":
    species = input("Please enter the species in 'Genus species' format: ")  # Replace with your species or genus of interest
    genome_type = input("Please enter the type of genome (e.g., 'whole' or 'partial'): ")
    
    latest_genome = find_latest_genome(species, genome_type)

    if latest_genome:
        reference_sequence = get_reference_sequence(latest_genome)
        publication_date = latest_genome.annotations.get("date", "Date not available")
        
        print(f"Most recent {genome_type} genome for {species}: {latest_genome.description}")
        print(f"Ideal reference sequence: {reference_sequence}")
        print(f"Publication Date: {publication_date}")
    else:
        print(f"No {genome_type} genomes found for {species}")

        

Please enter the species in 'Genus species' format:  Sebastes nigrocinctus
Please enter the type of genome (e.g., 'whole' or 'partial'):  whole
Please enter your email:  arlee@ou.edu


Most recent whole genome for Sebastes nigrocinctus: Sebastes nigrocinctus, whole genome shotgun sequencing project
Ideal reference sequence: CAKANB000000000.1
Publication Date: 07-OCT-2021


In [8]:
from Bio import Entrez
from Bio import SeqIO
from prettytable import PrettyTable

def find_latest_genome(email, species, genome_type):
    # Define the search term for the species and genome type
    search_term = f"{species}[Organism] AND {genome_type}[Title]"

    # Use the Entrez API to search for genomes
    Entrez.email = email
    handle = Entrez.esearch(db="nucleotide", term=search_term, sort="most recent")
    record = Entrez.read(handle)
    handle.close()

    # Get the list of matching genome IDs
    genome_ids = record["IdList"]

    if not genome_ids:
        return None  # No genomes found

    # Get the most recent genome record
    latest_genome_id = genome_ids[0]

    # Retrieve the genome record
    handle = Entrez.efetch(db="nucleotide", id=latest_genome_id, rettype="gb", retmode="text")
    genome_record = SeqIO.read(handle, "genbank")
    handle.close()

    return genome_record

def get_reference_sequence(genome_record):
    # Determine the ideal reference sequence (usually the primary assembly)
    for feature in genome_record.features:
        if feature.type == "source" and "refseq" in feature.qualifiers:
            return feature.qualifiers["refseq"][0]

    # If no ideal reference sequence is found, return the first sequence in the record
    return genome_record.id

if __name__ == "__main__":
    # Prompt the user for their email
    email = input("Please enter your email: ")

    # Enter multiple species and genome types
    species_list = input("Please enter the species in 'Genus species' format, separated by commas: ").split(',')
    genome_type_list = input("Please enter the types of genomes (e.g., 'whole' or 'partial'), separated by commas: ").split(',')

    # Create a PrettyTable for the output
    table = PrettyTable()
    table.field_names = ["Species", "Genome Type", "Description", "Reference Sequence", "Publication Date"]

    for species, genome_type in zip(species_list, genome_type_list):
        latest_genome = find_latest_genome(email, species, genome_type)

        if latest_genome:
            reference_sequence = get_reference_sequence(latest_genome)
            publication_date = latest_genome.annotations.get("date", "Date not available")

            # Add a row to the table
            table.add_row([species, genome_type, latest_genome.description, reference_sequence, publication_date])
        else:
            # Add a row indicating no genomes found
            table.add_row([species, genome_type, "No genome found", "", ""])

    # Print the table
    print(table)


Please enter your email:  arlee@ou.edu
Please enter the species in 'Genus species' format, separated by commas:  Canis lupus, Sebastes iracundus
Please enter the types of genomes (e.g., 'whole' or 'partial'), separated by commas:  whole


+-------------+-------------+-------------------------------------------------------------------------------------------------------+--------------------+------------------+
|   Species   | Genome Type |                                              Description                                              | Reference Sequence | Publication Date |
+-------------+-------------+-------------------------------------------------------------------------------------------------------+--------------------+------------------+
| Canis lupus |    whole    | Canis lupus familiaris breed Irish Wolfhound isolate N220234, whole genome shotgun sequencing project | JASJHM000000000.1  |   20-SEP-2023    |
+-------------+-------------+-------------------------------------------------------------------------------------------------------+--------------------+------------------+


In [7]:
from Bio import Entrez
from Bio import SeqIO
import pandas as pd

def find_latest_genome(email, species, genome_type):
    # Define the search term for the species and genome type
    search_term = f"{species}[Organism] AND {genome_type}[Title]"

    # Use the Entrez API to search for genomes
    Entrez.email = email
    handle = Entrez.esearch(db="nucleotide", term=search_term, sort="most recent")
    record = Entrez.read(handle)
    handle.close()

    # Get the list of matching genome IDs
    genome_ids = record["IdList"]

    if not genome_ids:
        return None  # No genomes found

    # Get the most recent genome record
    latest_genome_id = genome_ids[0]

    # Retrieve the genome record
    handle = Entrez.efetch(db="nucleotide", id=latest_genome_id, rettype="gb", retmode="text")
    genome_record = SeqIO.read(handle, "genbank")
    handle.close()

    return genome_record

def get_reference_sequence(genome_record):
    # Determine the ideal reference sequence (usually the primary assembly)
    for feature in genome_record.features:
        if feature.type == "source" and "refseq" in feature.qualifiers:
            return feature.qualifiers["refseq"][0]

    # If no ideal reference sequence is found, return the first sequence in the record
    return genome_record.id

if __name__ == "__main__":
    # Prompt the user for their email
    email = input("Please enter your email: ")

    # Prompt the user for the type of genome ('whole' or 'partial')
    genome_type = input("Please enter the type of genome (e.g., 'whole' or 'partial'): ")

    # Enter multiple species
    species_list = input("Please enter the species in 'Genus species' format, separated by commas: ").split(',')

    # Create a list to store data
    data = []

    for species in species_list:
        latest_genome = find_latest_genome(email, species, genome_type)

        if latest_genome:
            reference_sequence = get_reference_sequence(latest_genome)
            publication_date = latest_genome.annotations.get("date", "Date not available")

            # Add data to the list
            data.append([species, genome_type, latest_genome.description, reference_sequence, publication_date])
        else:
            # Add data indicating no genomes found
            data.append([species, genome_type, "No genome found", "", ""])

    # Create a DataFrame from the list
    df = pd.DataFrame(data, columns=["Species", "Genome Type", "Description", "Reference Sequence", "Publication Date"])

    # Save the DataFrame to an Excel file
    excel_filename = "genome_data.xlsx"
    df.to_excel(excel_filename, index=False)

    # Read the Excel file back into the code
    df_read = pd.read_excel(excel_filename)

    # Display the DataFrame read from the Excel file
    print(df_read)

Please enter your email:  arlee@ou.edu
Please enter the type of genome (e.g., 'whole' or 'partial'):  whole
Please enter the species in 'Genus species' format, separated by commas:  Sebastes iracundus, Sebastes nigrocinctus


                  Species Genome Type  \
0      Sebastes iracundus       whole   
1   Sebastes nigrocinctus       whole   

                                         Description Reference Sequence  \
0  Sebastes iracundus, whole genome shotgun seque...  CAKANE000000000.1   
1  Sebastes nigrocinctus, whole genome shotgun se...  CAKANB000000000.1   

  Publication Date  
0      07-OCT-2021  
1      07-OCT-2021  
