In [1]:
import os
import gzip
import re

from pathlib import Path

In [5]:
# Root directory that holds one sub-directory per assembly (each containing
# *.gbff.gz / *.fna.gz files, as globbed by the cells below).
# The location can be overridden with the REFSEQ_VIRAL_ROOT environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original path is used unchanged.
DATA_ROOT = Path(
    os.environ.get(
        "REFSEQ_VIRAL_ROOT",
        "/home/ubuntu/brandon-again/evilevo/refseq/viral",
    )
)

In [6]:
# Collect the names of the assembly directories under DATA_ROOT whose first
# /host qualifier in the GenBank flat file is exactly "Homo sapiens"
# (case-insensitive). Directories whose first host is something else are
# reported; directories with no host qualifier at all are silently skipped.
homo_sapiens = set()

# Matches a GenBank qualifier line such as
#     /host="Homo sapiens"
# Anchoring on "/host=" at the start of the (indented) line keeps
# /lab_host="..." qualifiers and free text that merely contains "host=" from
# being mistaken for the natural host. Capturing up to the closing quote
# (instead of splitting on "=") keeps values that themselves contain "=".
host_qualifier = re.compile(r'^\s*/host="?([^"\n]*)"?')

# Sorted so the printed log is in a reproducible order.
for virus in sorted(DATA_ROOT.iterdir()):
    gbff_files = list(virus.glob("*.gbff.gz"))
    if not gbff_files:
        # Only the GenBank file is looked up here, so report just that.
        print(f"{virus.name} is missing gbff files")
        continue

    # NOTE(review): if a directory ever holds several *.gbff.gz files, only
    # the first glob result is inspected - confirm there is exactly one.
    gbff = gbff_files[0]
    with gzip.open(gbff, "rt") as f:
        for line in f:
            found = host_qualifier.match(line)
            if found is None:
                continue

            host = found.group(1).strip()
            if host.lower() == "homo sapiens":
                homo_sapiens.add(virus.name)
            else:
                print(f"{virus.name} is not Homo sapiens, is {host}")
            # Only the first host qualifier in the file decides the outcome.
            break

homo_sapiens

GCF_000836805.1 is not Homo sapiens, is Chlamydia psittaci
GCF_000836845.1 is not Homo sapiens, is Chemopodium quinoa
GCF_000836905.1 is not Homo sapiens, is Escherichia coli
GCF_000836925.1 is not Homo sapiens, is Escherichia coli
GCF_000836945.1 is not Homo sapiens, is Escherichia coli
GCF_000837165.1 is not Homo sapiens, is Vibrio parahaemolyticus
GCF_000837225.1 is not Homo sapiens, is Escherichia coli
GCF_000837245.1 is not Homo sapiens, is Mycobacterium smegmatis
GCF_000837285.6 is not Homo sapiens, is Discula destructiva Redlin
GCF_000837325.1 is not Homo sapiens, is Anser sp.
GCF_000837365.1 is not Homo sapiens, is Streptococcus thermophilus
GCF_000837385.1 is not Homo sapiens, is Staphylococcus aureus
GCF_000837405.1 is not Homo sapiens, is Staphylococcus aureus E-1
GCF_000837425.1 is not Homo sapiens, is Sinorhizobium meliloti
GCF_000837445.1 is not Homo sapiens, is Nicotiana edwardsonii
GCF_000837465.1 is not Homo sapiens, is Mycobacterium avium
GCF_000837485.1 is not Homo s

{'GCF_000838265.1',
 'GCF_000841965.1',
 'GCF_000845085.1',
 'GCF_000845245.1',
 'GCF_000845685.2',
 'GCF_000845985.1',
 'GCF_000846365.1',
 'GCF_000846685.1',
 'GCF_000846805.1',
 'GCF_000847345.1',
 'GCF_000848125.1',
 'GCF_000854765.1',
 'GCF_000857045.1',
 'GCF_000857085.1',
 'GCF_000858285.1',
 'GCF_000858385.2',
 'GCF_000858765.1',
 'GCF_000859305.1',
 'GCF_000860145.1',
 'GCF_000861005.1',
 'GCF_000861885.1',
 'GCF_000865085.1',
 'GCF_000866645.1',
 'GCF_000870545.1',
 'GCF_000871625.2',
 'GCF_000872045.1',
 'GCF_000873605.1',
 'GCF_000874285.1',
 'GCF_000874865.1',
 'GCF_000879235.1',
 'GCF_000880515.1',
 'GCF_000882595.1',
 'GCF_000882675.1',
 'GCF_000882855.1',
 'GCF_000884175.1',
 'GCF_000884395.1',
 'GCF_000884955.1',
 'GCF_000885035.1',
 'GCF_000885555.1',
 'GCF_000885595.1',
 'GCF_000885815.1',
 'GCF_000886375.1',
 'GCF_000886455.1',
 'GCF_000886475.1',
 'GCF_000886535.1',
 'GCF_000887215.1',
 'GCF_000887335.1',
 'GCF_000887355.1',
 'GCF_000887495.1',
 'GCF_000888015.1',


In [7]:
# Count the assembly names collected in the homo_sapiens set.
len(homo_sapiens)

266

In [8]:
# Concatenate one FASTA file per assembly listed in homo_sapiens into a single
# uncompressed file, homo_sapiens.fasta, in the current working directory.
single_fasta = []

# Sorted so the record order in the output file is the same on every run
# (iteration order of a set of strings is not stable across kernels).
for virus in sorted(homo_sapiens):
    virus_dir = DATA_ROOT / virus
    fasta_files = list(virus_dir.glob("*.fna.gz"))
    if not fasta_files:
        # `virus` is a plain directory-name string here (not a Path), so it is
        # formatted directly; `virus.name` would raise AttributeError.
        print(f"{virus} is missing fasta files")
        continue

    # NOTE(review): if a directory holds several *.fna.gz files, only the
    # first glob result is used - confirm that is the intended sequence file.
    fasta = fasta_files[0]
    with gzip.open(fasta, "rt") as f:
        records = f.read()

    # Guarantee a newline between files so the last sequence line of one file
    # can never run into the ">" header of the next.
    if records and not records.endswith("\n"):
        records += "\n"
    single_fasta.append(records)

with open("homo_sapiens.fasta", "w") as f:
    f.write("".join(single_fasta))

In [None]:
# Gather every distinct /host qualifier value found in any *.gbff.gz file
# anywhere below DATA_ROOT.

# Pattern for lines like '                     /host="Homo sapiens"'.
# Group 1 is the text between the quotes (or after the "=" if unquoted).
host_pattern = re.compile(r'^\s*/host="?([^"\n]+)"?')
hosts = set()

for dirpath, _subdirs, filenames in os.walk(DATA_ROOT):
    for filename in filenames:
        if not filename.endswith(".gbff.gz"):
            continue

        filepath = os.path.join(dirpath, filename)
        try:
            with gzip.open(filepath, "rt", encoding="latin-1") as handle:
                for text_line in handle:
                    found = host_pattern.match(text_line)
                    if found is None:
                        continue
                    hosts.add(found.group(1))
        except Exception as err:
            # Best effort: report the unreadable file and keep scanning.
            print(f"Error reading {filepath}: {err}")

hosts

{'ferret badger',
 'Klebsiella pneumoniae 151522',
 'Chaetocoelopa sydneyensis',
 'Ameiva ameiva ameiva',
 'peanut worms mix Beihai',
 'horse',
 'Streptococcus thermophilus ST67009',
 'Culex globocoxitus',
 'Streptococcus thermophilus DGCC7854',
 'Lactobacillus jensenii',
 'Phaseolus lunatus',
 'Burkholderia cenocepacia',
 'Coccinia grandis (ivy gourd)',
 'Phormidium foveolarum',
 'Leonurus sibiricus (Lamiaceae)',
 'Asterias forbesi (sea star)',
 'Costus spiralis (spiral ginger)',
 'Pseudomonas aeruginosa Kutter',
 'Neodon clarkei',
 'Chlorocebus sabaeus (African green monkey); female;',
 'Anopheles sp.',
 'Lens culinaris',
 'freshwater atyid shrimp',
 'Apium graveolens',
 'Odonata',
 'Siegesbeckia glabrescens',
 'Zantedeschia aethiopica',
 'Bacillus sp.',
 'Vibrio alginolyticus',
 'Penicillium aurantiogriseum isolate MUT4330-b',
 'Lactococcus sp. 936',
 'Solanum tuberosum ssp. andigena',
 'Phaseolus vulgaris cv. Closeau',
 'Ixodes uriae (putus)',
 'Vicugna pacos (alpaca)',
 'Ixodes sc

In [6]:
# Show every collected host label that, ignoring case, is spelled exactly
# "homo sapiens" or "human".
for host_label in hosts:
    if host_label.lower() not in ("homo sapiens", "human"):
        continue
    print(host_label)

Homo sapiens
