In [None]:
# Download publicly available assemblies of common bee-associated bacteria
# We will use these assemblies for CRISPR spacer analysis later
    # We are casting a fairly large net here. 
    # Basically, if a bacterial genus has members that are known to be associated with bees, we want it
# To do this, we will...
    # 1) Download metadata for all the bacteria we care about
        # These are listed in a file named: taxa_to_download.txt
    # 2) We will then pull some useful pieces of data from these
        # Accession number, Species, Host, Completeness, Contamination, N50, and Coverage
    # 3) We then use R to pick through this and subset to the assemblies which meet our criteria
    

In [1]:
# Collect assembly metadata for every taxon of interest into a JSON-lines file
conda activate ncbi_datasets

# Print the NCBI genome summary (one JSON record per line, on stdout) for each
# taxon named in a list file.
#   $1 - path to a text file with one taxon per line
fetch_taxa_metadata() {
  local taxa_file=$1 taxon
  # The list is read on file descriptor 3 so that `datasets` never shares
  # (and cannot consume) the list through stdin.
  # `read -r` keeps backslashes literal; the `|| [[ -n ... ]]` clause still
  # processes a final line that lacks a trailing newline.
  while read -r -u 3 taxon || [[ -n "$taxon" ]]; do
    [[ -n "$taxon" ]] || continue # skip blank lines
    # Quoted so that a multi-word taxon stays a single argument.
    datasets summary genome taxon "$taxon" --as-json-lines
  done 3< "$taxa_file"
}

# NOTE: this appends (>>), so re-running the cell adds the same records again;
# remove taxa_meta.jsonl first if a fresh file is wanted.
fetch_taxa_metadata taxa_to_download.txt >> taxa_meta.jsonl

# Parse that .jsonl file and select just the pieces of info we want
#
# Convert datasets JSON-lines records (files given as arguments, or stdin) into
# CSV rows on stdout with the columns:
#   Accession,Species,Host,Completeness,Contamination,N50,Coverage
# A single jq process handles the whole file instead of seven per record, and
# the JSON never passes through shell word splitting, globbing, or backslash
# processing.
# Cell rules:
#   - a missing scalar field is written as the literal text "null"
#   - Host is empty when the biosample has no "host" attribute; several host
#     attributes are joined with a space
#   - runs of whitespace are collapsed to one space and trimmed
#   - a cell containing a comma or a double quote is wrapped in double quotes
#     (inner quotes doubled) so it cannot shift the columns
# NOTE: jq stops at the first line that is not valid JSON.
meta_jsonl_to_csv() {
  jq -r '
    def cell:
      tostring
      | gsub("\\s+"; " ")
      | sub("^ "; "")
      | sub(" $"; "")
      | if test("[\",]") then "\"" + gsub("\""; "\"\"") + "\"" else . end;
    [
      .accession,
      .assembly_info.biosample.description.organism.organism_name,
      ( [ .assembly_info.biosample.attributes[]?
          | select(.name == "host")
          | .value
          | tostring ]
        | join(" ") ),
      .checkm_info.completeness,
      .checkm_info.contamination,
      .assembly_stats.contig_n50,
      .assembly_stats.genome_coverage
    ]
    | map(cell)
    | join(",")
  ' "$@"
}

# Appends (>>): re-running the cell adds the same rows again.
meta_jsonl_to_csv taxa_meta.jsonl >> taxa_meta_table.csv

# After using R to subset these potential assemblies to just the ones that meet our criteria, download the actual genomes
    # Note, this is a lot of data!!!! 
        # Some taxa (e.g. Lactobacillus, Bifidobacterium, etc.) are very well described (there is a mountain of assemblies on NCBI)
        # And we are basically scraping all of their assemblies
    # The file produced by R is named: good_accessions.csv
    
# Download em
# The file produced by R cannot be fed directly to NCBI datasets because it has
# two columns, whereas NCBI datasets expects just a list of accessions.
# So, start by making a new file called "good_accessions_only.csv" which
# contains only the first column (the accessions). Rows whose first column is
# empty (e.g. blank lines) are dropped.
awk -F"," '$1 != "" {print $1}' good_accessions.csv > good_accessions_only.csv

# Now start the download.
# `mkdir -p` lets the cell be re-run, and the download only starts once we are
# really inside by_accession/ (otherwise the ../ input path and the output
# location would both be wrong). The working directory is left at
# by_accession/ for the following steps.
if mkdir -p by_accession && cd by_accession; then
  conda activate ncbi_datasets
  # --dehydrated: the zip is unpacked and rehydrated in the next step
  datasets download genome accession \
    --inputfile ../good_accessions_only.csv \
    --dehydrated \
    --include genome \
    --filename quality_assemblies.zip
else
  printf 'Could not enter by_accession/; download skipped\n' >&2
fi

# Once downloaded, unzip and rehydrate the files
# `mkdir -p` so that re-running the cell does not fail on an existing directory.
# The extraction directory name ("quality_assembelies") is used verbatim by the
# later steps, so its spelling is kept as is.
mkdir -p unzipped
unzip quality_assemblies.zip -d ./unzipped/quality_assembelies
# Then rehydrate - only if we actually managed to enter unzipped/, so that
# `datasets rehydrate` is never pointed at a directory relative to the wrong place.
# The working directory is left at unzipped/ for the following steps.
cd unzipped && datasets rehydrate --directory quality_assembelies
# 29,584  (NOTE(review): presumably the number of assemblies retrieved - confirm)

# Now, we want to move the actual assembly/genome files to a new directory
# (they are renamed to reflect their taxonomy in the next step)

# Copy every <data_dir>/<accession>/*.fna file to <dest_dir>/<accession>.fna.
#   $1 - directory holding one sub-directory per accession
#   $2 - destination directory (created if missing)
# Returns non-zero if any copy failed. If one accession directory holds several
# .fna files, the last one copied wins.
collect_genomes() {
  local data_dir=$1 dest_dir=$2
  local fasta acc_dir accession status=0
  mkdir -p "$dest_dir" || return
  for fasta in "$data_dir"/*/*.fna; do
    # When nothing matches, the glob stays a literal pattern; skip it instead
    # of handing a non-existent path to cp.
    [[ -e "$fasta" ]] || continue
    # The accession is the name of the directory that contains the file.
    acc_dir=${fasta%/*}
    accession=${acc_dir##*/}
    cp -- "$fasta" "$dest_dir/$accession.fna" || status=1
  done
  return "$status"
}

# move genomes to new dir
collect_genomes quality_assembelies/ncbi_dataset/data ./genomes
# Rename
# We will use their accession numbers to match them up with their Genus using good_accessions.csv

# Copy <src_dir>/<accession>.fna to <dest_dir>/<genus>_<accession>.fna for every
# row of a two-column "accession,genus" table.
#   $1 - CSV table (column 1: accession, column 2: genus)
#   $2 - directory holding the <accession>.fna files
#   $3 - destination directory (created if missing)
# All whitespace (including a Windows carriage return) is removed from both
# columns. Rows with an empty accession are ignored; rows with no matching
# genome file (for example a header row) are reported on stderr and skipped.
# Returns non-zero if the table cannot be read or a copy failed.
rename_genomes() {
  local table=$1 src_dir=$2 dest_dir=$3
  local acc gen _rest status=0
  mkdir -p "$dest_dir" || return
  # Splitting on "," with the shell avoids four subprocesses per row; the
  # `|| [[ -n ... ]]` clause keeps a last row that has no trailing newline.
  while IFS=, read -r acc gen _rest || [[ -n "$acc" ]]; do
    acc=${acc//[[:space:]]/}
    gen=${gen//[[:space:]]/}
    [[ -n "$acc" ]] || continue
    if [[ ! -f "$src_dir/$acc.fna" ]]; then
      printf 'rename_genomes: no genome file for "%s"; skipped\n' "$acc" >&2
      continue
    fi
    cp -- "$src_dir/$acc.fna" "$dest_dir/${gen}_${acc}.fna" || status=1
  done < "$table"
  return "$status"
}

rename_genomes ../../good_accessions.csv ./genomes ./renamed
# Done! We have downloaded (a lot of) bee-associated bacterial genomes/assemblies and given them useful names

SyntaxError: invalid syntax (3582804122.py, line 2)