# 1b. Download (first-pass) filtered genomes

In this notebook, we will use __`pyphylon`__'s `downloads` module to download candidate genomes for pangenome generation.

In this example, we download the genomes listed in `1a_genome_summary.csv` from [BV-BRC](https://www.bv-brc.org/).

## Setup

In [None]:
import os
import yaml
import pandas as pd

from tqdm.notebook import tqdm

from pyphylon.downloads import download_genomes_bvbrc
from pyphylon.util import remove_empty_files, load_config

In [None]:
# Load the project configuration. WORKDIR and DEBUG are read with [] and must
# therefore be present in config.yml.
CONFIG = load_config("config.yml")
WORKDIR = CONFIG["WORKDIR"]
DEBUG = CONFIG["DEBUG"]

# Defaults that apply when config.yml does not override them.
reuse_temp = False
temp_folder = os.path.join("../temp/")

# Re-read the raw YAML for the optional temp-directory settings.
# `or {}` covers an empty config file, for which safe_load returns None.
with open("config.yml", "r") as f:
    config = yaml.safe_load(f) or {}

# .get() keeps the defaults above when a key is omitted instead of raising
# KeyError; a present-but-falsy value also leaves the default in place.
if config.get("REUSE_TEMP"):
    reuse_temp = True
if config.get("REUSE_TEMP_DIR"):
    temp_folder = config["REUSE_TEMP_DIR"]

input_folder = os.path.join("../input/")
output_folder = os.path.join("../output/")

print(f"Temp folder: {temp_folder}")
print(f"Input folder: {input_folder}")
print(f"Output folder: {output_folder}")

In [None]:
# Raw genome downloads are kept under temp/1b_protected/, which cleanup.sh
# leaves alone unless it is run with --force.
protected_dir = os.path.join(temp_folder, "1b_protected")
RAW_GENOMES = os.path.join(protected_dir, "raw", "genomes")

# Create the full directory chain; a pre-existing directory is not an error.
os.makedirs(RAW_GENOMES, exist_ok=True)

In [None]:
# Load the 1a genome summary table from the temp folder.
# genome_id is forced to str so pandas does not parse the IDs as numbers.
summary_csv = os.path.join(temp_folder, "1a_genome_summary.csv")
filtered_species_summary = pd.read_csv(summary_csv, dtype={"genome_id": str})
filtered_species_summary

In [None]:
# Load the 1a genome metadata table from the temp folder.
# genome_id is forced to str so pandas does not parse the IDs as numbers.
metadata_csv = os.path.join(temp_folder, "1a_genome_metadata.csv")
filtered_species_metadata = pd.read_csv(metadata_csv, dtype={"genome_id": str})
filtered_species_metadata

## Download

In [None]:
# Request the 'fna' filetype for every genome ID in the summary table and
# write the results under RAW_GENOMES.
# The return value is kept as bad_genomes: later cells report its length and
# subtract it from the set of genome IDs (presumably the IDs that failed to
# download -- confirm against pyphylon.downloads).
bad_genomes = download_genomes_bvbrc(
    genomes=filtered_species_summary.genome_id,
    output_dir=RAW_GENOMES,
    filetypes=["fna"],
)

In [None]:
# Run remove_empty_files on every entry of RAW_GENOMES and flatten whatever
# each call returns into a single list (tqdm shows progress per entry).
# NOTE(review): os.listdir also yields plain files, not just sub-directories;
# confirm remove_empty_files tolerates a non-directory path.
empty_files = [
    removed
    for subdir in tqdm(os.listdir(RAW_GENOMES))
    for removed in remove_empty_files(os.path.join(RAW_GENOMES, subdir))
]

In [None]:
# Report how many genomes were flagged by the download step and how many
# entries the empty-file sweep returned, one count per line.
print(
    f"bad genomes: {len(bad_genomes)}",
    f"empty genomes: {len(empty_files)}",
    sep="\n",
)

## Update genome info files

In [None]:
# Make sure genome IDs are strings before doing set arithmetic on them.
filtered_species_summary["genome_id"] = filtered_species_summary["genome_id"].astype("str")

# Every ID in the summary that is not listed in bad_genomes.
# NOTE(review): entries collected in empty_files are not excluded here --
# confirm whether genomes with removed empty files should also be dropped.
downloaded_genomes = set(filtered_species_summary["genome_id"]).difference(bad_genomes)

# One row per genome, restricted to the downloaded IDs and ordered by ID.
filtered_species_summary = (
    filtered_species_summary
    .drop_duplicates(subset=["genome_id"])
    .set_index("genome_id")
    .loc[sorted(downloaded_genomes)]
    .reset_index()
)

display(
    filtered_species_summary.shape,
    filtered_species_summary.head(),
)

In [None]:
# Make sure genome IDs are strings so they match the entries of downloaded_genomes.
filtered_species_metadata["genome_id"] = filtered_species_metadata["genome_id"].astype("str")

# One row per genome, restricted to the downloaded IDs and ordered by ID.
# .loc with a list of labels raises KeyError if any downloaded ID is missing
# from the metadata table, so the two tables must cover the same genomes.
filtered_species_metadata = (
    filtered_species_metadata
    .drop_duplicates(subset=["genome_id"])
    .set_index("genome_id")
    .loc[sorted(downloaded_genomes)]
    .reset_index()
)

display(
    filtered_species_metadata.shape,
    filtered_species_metadata.head(),
)

In [None]:
# Persist the filtered tables for later steps.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first column of each file.
summary_out = os.path.join(temp_folder, "1b_genome_summary.csv")
metadata_out = os.path.join(temp_folder, "1b_genome_metadata.csv")

filtered_species_summary.to_csv(summary_out)
filtered_species_metadata.to_csv(metadata_out)