In [None]:
# parameters
# rclone-style `remote:path` location of the ICTV VMR spreadsheet (copied locally with `rclone copy` later on)
ictv_vmr_file = "reference_databases:Phylogenic-Databases/ICTV/VMR/VMR_21-221122_MSL37.xlsx"
# pattern source that is turned into a Regex and matched against the "Genome composition" column
genome_type_regex = "DNA"
# host category of interest
host_type = "bacteria"

# I was unable to download the entire DB in one go
# when asked for help, NCBI recommended that we download by date range and periodically download the next batch
# ncbi_viruses_file = "reference_databases:NCBI/Viruses/20221018-ncbi-viruses-metadata.csv"
# when subsetting to bacteriophage, I was not getting a set that sufficiently overlapped with ICTV
# rclone-style `remote:path` directory holding the per-batch NCBI Viruses metadata exports
ncbi_viruses_directory = "reference_databases:Phylogenic-Databases/NCBI/Viruses"

# k-mer size passed to the counts-table builder
k=7

In [None]:
# The scratch directory is named after the dated task identifier; all outputs land there.
DATE_TASK = "2023-02-23-INFO-488-phage-phylogenetic-classification"
DIR = mkpath(string(homedir(), "/workspace/scratch/", DATE_TASK))
cd(DIR)
# Split the identifier into its leading YYYY-MM-DD date and the remaining task name.
(TODAY, TASK) = match(r"^(\d{4}-\d{2}-\d{2})-(.*)$", DATE_TASK).captures

In [None]:
import Pkg
Pkg.update()
# Packages that are installed and then imported by name (qualified access only).
pkgs = [
    "Revise",
    "uCSV",
    "DataFrames",
    "StatsBase",
    "ProgressMeter",
    "FASTX",
    "BioSequences",
    "JSON",
    "Kmers",
    "Clustering",
    "Random",
    # "Arrow"
]
Pkg.add(pkgs)
# Build each `import` expression from a Symbol instead of parsing a code string.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

# Mycelia is used from a local development checkout.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

In [None]:
# if isdir - need to add a check here or just comment out
# run(`rclone copy locus_google_drive:scratch/$(DATE_TASK) $(DIR)`)

In [None]:
# Local folder (under DIR) that receives a copy of the remote NCBI Viruses metadata directory
ncbi_viruses_directory_local = mkpath(joinpath(DIR, "ncbi-viruses"))
# Copy the remote directory into the local folder with rclone
run(`rclone copy $(ncbi_viruses_directory) $(ncbi_viruses_directory_local)`)

In [None]:
# Column types used when parsing the NCBI Viruses metadata files.
# Every column allows `missing` because empty fields are encoded as `missing` below.
types=Dict(
    "Accession" => Union{String, Missing},
    "SRA_Accession" => Union{String, Missing},
    "Submitters" => Union{String, Missing},
    "Organization" => Union{String, Missing},
    "Org_location" => Union{String, Missing},
    "Release_Date" => Union{String, Missing},
    "Isolate" => Union{String, Missing},
    "Species" => Union{String, Missing},
    "Genus" => Union{String, Missing},
    "Family" => Union{String, Missing},
    "Molecule_type" => Union{String, Missing},
    "Length" => Union{Int, Missing},
    "Sequence_Type" => Union{String, Missing},
    "Nuc_Completeness" => Union{String, Missing},
    "Genotype" => Union{String, Missing},
    "Segment" => Union{String, Missing},
    "Publications" => Union{Int, Missing},
    "Geo_Location" => Union{String, Missing},
    "Country" => Union{String, Missing},
    "USA" => Union{String, Missing},
    "Host" => Union{String, Missing},
    "Isolation_Source" => Union{String, Missing},
    "Collection_Date" => Union{String, Missing},
    "BioSample" => Union{String, Missing},
    "GenBank_Title" => Union{String, Missing}
)

# Concatenate every per-batch metadata file into one table, then drop duplicate rows.
ncbi_viruses_table = DataFrames.DataFrame()
ncbi_viruses_files = filter(x -> occursin(r"\.ncbi-viruses\.csv", x), readdir(ncbi_viruses_directory_local, join=true))
ProgressMeter.@showprogress for f in ncbi_viruses_files
    # `mlr --c2t` re-emits the CSV as tab-delimited text, which is then parsed with delim='\t'.
    # The do-block form of `open` closes the stream, waits for the process and raises
    # if `mlr` fails, instead of leaving an unchecked process handle behind.
    df = open(`mlr --c2t cat $(f)`) do io
        DataFrames.DataFrame(uCSV.read(io, types=types, header=1, delim='\t', encodings=Dict("" => missing))...)
    end
    append!(ncbi_viruses_table, df, promote=true)
end
unique!(ncbi_viruses_table);

In [None]:
# acquire metadata
ictv_vmr_file_local = joinpath(DIR, basename(ictv_vmr_file))
# only fetch the spreadsheet when there is no local copy yet
isfile(ictv_vmr_file_local) || run(`rclone copy $(ictv_vmr_file) $(dirname(ictv_vmr_file_local))`)
# the two sheets are written out to the _0 and _1 files, respectively
@assert readlines(`in2csv --names $(ictv_vmr_file_local)`) == ["VMRb37", "Column definitions"]
run(pipeline(`in2csv --write-sheets - $(ictv_vmr_file_local)`, devnull))
# the first sheet (suffix _0) holds the metadata table itself
ictv_vmr_table = DataFrames.DataFrame(
    uCSV.read(replace(ictv_vmr_file_local, ".xlsx" => "_0.csv"), header=1, quotes='"')...
)
# some have multiple equivalent genome submissions, only take first
ictv_vmr_table[!, "Virus GENBANK accession"] = map(
    accessions -> string(first(split(accessions, ';'))),
    ictv_vmr_table[!, "Virus GENBANK accession"],
);

In [None]:
# join ICTV & NCBI
# columns that exist in both tables get a source suffix so the join keeps both versions
conflict_columns = intersect(names(ictv_vmr_table), names(ncbi_viruses_table))
DataFrames.rename!(ictv_vmr_table, [column => column * "_ICTV" for column in conflict_columns])
DataFrames.rename!(ncbi_viruses_table, [column => column * "_NCBI" for column in conflict_columns])

# 3707
joint_metadata_table = DataFrames.innerjoin(ictv_vmr_table, ncbi_viruses_table, on = "Virus GENBANK accession" => "Accession")
# 3720
# joint_metadata_table = DataFrames.leftjoin(ictv_vmr_table, ncbi_viruses_table, on = "Virus GENBANK accession" => "Accession")
joint_metadata_table = DataFrames.rename!(joint_metadata_table, "Virus GENBANK accession" => "Accession")

# clear the prior tables from memory
ncbi_viruses_table = ictv_vmr_table = nothing
GC.gc()

# filter down to phage with DNA genomes since we don't work with RNA phage
# the host and genome-type criteria come from the `host_type` / `genome_type_regex` parameters
genome_type_pattern = Regex(genome_type_regex) # compiled once rather than once per row
is_right_host = joint_metadata_table[!, "Host source"] .== host_type
is_right_genome_type = map(x -> occursin(genome_type_pattern, x), joint_metadata_table[!, "Genome composition"])
has_genbank_accession = map(!isempty, joint_metadata_table[!, "Accession"])
filter_mask = is_right_host .& is_right_genome_type .& has_genbank_accession
joint_metadata_table = joint_metadata_table[filter_mask, :]
# later cells assert that the table is sorted by accession
sort!(joint_metadata_table, "Accession")

In [None]:
# Download one FASTA file per reference accession (skipping files that already exist and
# are non-empty) and record the paths of every usable file in `reference_list`.
fasta_directory = mkpath(joinpath(DIR, "reference-fastas"))
reference_fastas = String[]
ProgressMeter.@showprogress for accession in joint_metadata_table[!, "Accession"]
    fastx_file = joinpath(fasta_directory, accession * ".fna")
    if !isfile(fastx_file) || (filesize(fastx_file) == 0)
        # Records are written to a side file and renamed once complete, so an interrupted
        # write can never leave a truncated, non-empty file that later runs would accept.
        partial_file = fastx_file * ".partial"
        try
            fastx_records = collect(Mycelia.get_sequence(db="nuccore", accession=accession))
            if isempty(fastx_records)
                @info "trying again"
                fastx_records = collect(Mycelia.get_sequence(db="nuccore", accession=accession))
            end
            isempty(fastx_records) && error("no sequence records returned for accession $(accession)")
            open(partial_file, "w") do io
                fastx_io = FASTX.FASTA.Writer(io)
                for record in fastx_records
                    write(fastx_io, record)
                end
                close(fastx_io)
            end
            mv(partial_file, fastx_file, force=true)
        catch e
            # best effort: report which accession failed and keep going with the rest
            @warn "failed to download reference sequence" accession exception=(e, catch_backtrace())
            rm(partial_file, force=true)
        end
    end
    # only accessions with a non-empty FASTA on disk take part in downstream comparisons
    if isfile(fastx_file) && (filesize(fastx_file) != 0)
        push!(reference_fastas, fastx_file)
    end
end

# one FASTA path per line
reference_list = joinpath(DIR, "reference_list.txt")
open(reference_list, "w") do io
    for x in reference_fastas
        println(io, x)
    end
end

In [None]:
# pull in Benchling file
# NOTE(review): `benchling_dna_sequences_jsonl_local` and `benchling_schema_of_interest` are not
# defined in any visible cell of this notebook — confirm they are set before this cell runs.
# Each line of the JSONL file is parsed into one record.
@time benchling_sequences_json = [JSON.parse(line) for line in eachline(benchling_dna_sequences_jsonl_local)];

@time benchling_sequences_json = filter!(x -> 
    !isnothing(x["registryId"]) && # should be registered
    (x["schema"]["name"] in keys(benchling_schema_of_interest)) && # should be in schema of interest
    (!isempty(x["bases"])) # should have bases
    , benchling_sequences_json)

# we checked against the keys, so let's confirm that the schema values match too
@assert all(x -> x["schema"]["id"] in values(benchling_schema_of_interest), benchling_sequences_json)

"""
    benchling_json_to_fasta(json_record)

Build a FASTA record from one parsed Benchling sequence entry, using its
`"entityRegistryId"` as the identifier, its `"name"` as the description and its
`"bases"` as the sequence.
"""
benchling_json_to_fasta(json_record) = FASTX.FASTA.Record(
    json_record["entityRegistryId"],
    json_record["name"],
    json_record["bases"],
)

# Convert every Benchling entry to a FASTA record and order the records by identifier.
locus_records = map(benchling_json_to_fasta, benchling_sequences_json)
sort!(locus_records, by=FASTX.identifier)

# Write each record to its own FASTA file, named after the record identifier.
query_fastas_dir = mkpath(joinpath(DIR, "query-fastas"))
locus_fasta_files = String[]
for record in locus_records
    record_path = joinpath(query_fastas_dir, FASTX.identifier(record) * ".fna")
    open(record_path, "w") do io
        writer = FASTX.FASTA.Writer(io)
        write(writer, record)
        close(writer)
    end
    push!(locus_fasta_files, record_path)
end

# One query FASTA path per line.
query_list = joinpath(DIR, "query_list.txt")
open(query_list, "w") do io
    foreach(path -> println(io, path), locus_fasta_files)
end

In [None]:
# NOTE, THIS SHOULD BE THE TIE-IN POINT FOR A COMMAND LINE CALL

In [None]:
# fastANI results are written to this file under DIR
fasta_ani_outfile = joinpath(DIR, "fastani.txt")
# 7 min
# run fastANI with the query and reference path lists written in earlier cells
Mycelia.fastani(query_list=query_list, reference_list=reference_list, outfile=fasta_ani_outfile)
# presumably parses the fastANI output into a table — confirm against Mycelia.read_fastani
ani_table = Mycelia.read_fastani(fasta_ani_outfile)

In [None]:
# Keep only the highest-identity fastANI hit within each "identifier" group.
ani_top_hits = DataFrames.DataFrame()
for hits in DataFrames.groupby(ani_table, "identifier")
    ranked_hits = sort(hits, "% identity", rev=true)
    push!(ani_top_hits, first(ranked_hits))
end
# Reduce the FASTA paths in both columns to bare identifiers (file name without ".fna").
for column in ("identifier", "closest_reference")
    ani_top_hits[!, column] = map(x -> replace(basename(x), ".fna" => ""), ani_top_hits[!, column])
end
ani_top_hits

In [None]:
# determine kmer saturation rate of reference genomes
# Mycelia.assess_dnamer_saturation(readdir(fasta_directory, join=true))
# don't hit predicted saturation until k=29 which is way too high
# just use 7, 8192 is plenty of features for classification. Could probably even get away with 5 but we've been using 7 historically

In [None]:
# Load each reference FASTA; only files holding exactly one record are kept.
# Files with any other record count are reported and skipped.
reference_records = FASTX.FASTA.Record[]
for (i, reference_fasta) in enumerate(reference_fastas)
    records = collect(Mycelia.open_fastx(reference_fasta))
    if length(records) == 1
        push!(reference_records, first(records))
    else
        @show reference_fasta
        @show i
        display(records)
    end
end
# drop duplicated records
unique!(reference_records)

In [None]:
# Write every reference record into a single multi-FASTA file ...
joint_reference_sequences_file = joinpath(DIR, "joint-reference-sequences.fna")
open(joint_reference_sequences_file, "w") do io
    fastx_writer = FASTX.FASTA.Writer(io)
    foreach(record -> write(fastx_writer, record), reference_records)
    close(fastx_writer)
end

# ... and build a nucleotide BLAST database from it, using the FASTA path as the database name.
run(`makeblastdb -parse_seqids -dbtype nucl -in $(joint_reference_sequences_file) -out $(joint_reference_sequences_file)`)

In [None]:
# Write every query (locus) record into a single multi-FASTA file.
joint_query_sequences_file = joinpath(DIR, "joint-query-sequences.fna")

open(joint_query_sequences_file, "w") do io
    fastx_writer = FASTX.FASTA.Writer(io)
    foreach(record -> write(fastx_writer, record), locus_records)
    close(fastx_writer)
end

In [None]:
# 3 min
# blastn of the joint query FASTA against the reference database built from the joint reference FASTA
blast_report = Mycelia.run_blast(out_dir = DIR, fasta = joint_query_sequences_file, blast_db = joint_reference_sequences_file, blast_command = "blastn")

In [None]:
# presumably returns one table row per BLAST hit — confirm against Mycelia.parse_blast_report
blast_hits = Mycelia.parse_blast_report(blast_report)

In [None]:
# take only the best hit for each query, ranked by bit score and then by % identity
blast_top_hits = DataFrames.DataFrame()
for g in DataFrames.groupby(blast_hits, "query id")
    # non-mutating sort: `g` is a view into `blast_hits`, and the parent table should not be
    # reordered while its groups are being iterated
    sorted_g = sort(g, ["bit score", "% identity"], rev=true)
    top_hit = sorted_g[1, :]
    push!(blast_top_hits, top_hit)
end
blast_top_hits

In [None]:
# note, here is where we would read back in locus records from fastas but we already have them in memory from above

In [None]:
# reference records first, followed by the locus (query) records
joint_records = vcat(reference_records, locus_records)

In [None]:
# positions of the reference records within joint_records
reference_record_range = 1:length(reference_records)

In [None]:
# positions of the locus records within joint_records (everything after the references)
locus_record_range = length(reference_records)+1:length(joint_records)

In [None]:
# sanity check: the metadata table is expected to be sorted by accession
@assert issorted(joint_metadata_table[!, "Accession"])

In [None]:
# let's make an index map for fast association
# maps each accession to its row number in joint_metadata_table
accession_index_map = Dict(accession => i for (i, accession) in enumerate(joint_metadata_table[!, "Accession"]))

In [None]:
# k-mer counts for every record in joint_records; the second return value is presumably the
# path of a file the counts were saved to — confirm against Mycelia.fasta_list_to_counts_table
counts_matrix, counts_matrix_file = Mycelia.fasta_list_to_counts_table(fasta_list=joint_records, k=k, alphabet=:DNA)

In [None]:
"""
    append_closest_references!(table, distance_matrix, distance_metric, records, reference_range, query_range)

For every index in `query_range`, find the entry of `reference_range` with the smallest
value in that query's column of `distance_matrix`, and push one row describing the match
onto `table`.

# Arguments
- `table`: table with the columns `identifier`, `name`, `distance_metric`,
  `closest_reference` and `distance`; it is mutated in place.
- `distance_matrix`: square matrix indexed by positions in `records`.
- `distance_metric`: label stored in the `distance_metric` column of each pushed row.
- `records`: FASTA records, references and queries together.
- `reference_range`, `query_range`: positions of the reference and query records in `records`.

Returns `nothing`.
"""
function append_closest_references!(table, distance_matrix, distance_metric, records, reference_range, query_range)
    for query_index in query_range
        query_record = records[query_index]
        # `index` is relative to `reference_range`, not to the full matrix
        value, index = findmin(view(distance_matrix, reference_range, query_index))
        reference_identifier = FASTX.identifier(records[reference_range[index]])
        push!(table, (
            identifier = FASTX.identifier(query_record),
            name = FASTX.description(query_record),
            distance_metric = distance_metric,
            # keep only the part of the identifier before the first '.'
            closest_reference = first(split(reference_identifier, '.')),
            distance = value,
        ))
    end
    return nothing
end

# one row per (query record, distance metric) naming the nearest reference record
closest_match_table = DataFrames.DataFrame(
    identifier = String[],
    name = String[],
    distance_metric = String[],
    closest_reference = String[],
    distance = Float64[]
)

In [None]:
# euclidean_distance_matrix
# consider writing me out, looks like I'll take 10 minutes for Endeavor reference phage 2022-10-21 (420 phage)
# consider writing me out, looks like I'll take 12 minutes for all reference phage 2022-10-21 (1400 phage)
@time euclidean_distance_matrix = Mycelia.frequency_matrix_to_euclidean_distance_matrix(counts_matrix)

In [None]:
distance_matrix = euclidean_distance_matrix
distance_metric = "euclidean"

append_closest_references!(closest_match_table, distance_matrix, distance_metric, joint_records, reference_record_range, locus_record_range)

In [None]:
# 13 minutes for 5k phage
@time cosine_distance_matrix = Mycelia.frequency_matrix_to_cosine_distance_matrix(counts_matrix)

In [None]:
distance_matrix = cosine_distance_matrix
distance_metric = "cosine"

append_closest_references!(closest_match_table, distance_matrix, distance_metric, joint_records, reference_record_range, locus_record_range)

In [None]:
# Cosine distances convert to a percent identity as (1 - distance) * 100;
# rows from any other metric get `missing`.
closest_match_table[!, "% identity"] = [
    occursin(r"cosine"i, metric) ? (1 - distance) * 100 : missing
    for (metric, distance) in zip(closest_match_table[!, "distance_metric"], closest_match_table[!, "distance"])
]

In [None]:
# Tag each top-hit table with the method that produced it.
ani_top_hits[!, "distance_metric"] .= "fastANI"
blast_top_hits[!, "distance_metric"] .= "blast"

# Give the BLAST table the same column names as the fastANI table.
blast_top_hits = DataFrames.rename!(
    blast_top_hits,
    "query id" => "identifier",
    "subject id" => "closest_reference",
)

# Keep only the part of the BLAST subject id before the first '.'.
blast_top_hits[!, "closest_reference"] = [string(first(split(x, '.'))) for x in blast_top_hits[!, "closest_reference"]]

# Stack the fastANI and BLAST top hits using their shared columns.
joint_top_hits_table = vcat(
    (hits[!, ["identifier", "closest_reference", "% identity", "distance_metric"]] for hits in (ani_top_hits, blast_top_hits))...
)

# Express percent identity as a distance on a 0-1 scale.
joint_top_hits_table[!, "distance"] = (100 .- joint_top_hits_table[!, "% identity"]) ./ 100

# Look up the description of each query record by its identifier.
locus_id_to_name_map = Dict(FASTX.identifier(x) => FASTX.description(x) for x in locus_records)
joint_top_hits_table[!, "name"] = [locus_id_to_name_map[id] for id in joint_top_hits_table[!, "identifier"]]

# Combine the k-mer based matches with the fastANI/BLAST top hits.
joint_match_table = vcat(closest_match_table, joint_top_hits_table)

# merge joint lineage table with joint metadata data & write out to disk
joint_lineage_table = DataFrames.innerjoin(joint_match_table, joint_metadata_table, on=:closest_reference => :Accession)

sort!(joint_lineage_table, "identifier")

# Render every cell as text, with missing values written as empty strings.
for col in names(joint_lineage_table)
    joint_lineage_table[!, col] = [ismissing(x) ? "" : string(x) for x in joint_lineage_table[!, col]]
end

In [None]:
# Write the joint lineage table to a date-stamped, tab-delimited file under DIR
phylogenetic_classifications_table_file = joinpath(DIR, "$(TODAY)-phage-phylogenetic-classifications.tsv")
uCSV.write(phylogenetic_classifications_table_file, joint_lineage_table, delim='\t')
# uCSV.write(joinpath(DIR, "$(TODAY)-phage-phylogenetic-classifications.csv"), joint_lineage_table, quotes='"')

In [None]:
# looks like Species ICTV has duplicates, so just take the first
# for g in DataFrames.groupby(joint_lineage_table[joint_lineage_table[!, "distance_metric"] .== "euclidean", :], "identifier")
#     if DataFrames.nrow(g) > 1
#         display(g)
#     end
# end

# Keep the first euclidean row for each identifier.
euclidean_joint_lineage_table = DataFrames.DataFrame()
for g in DataFrames.groupby(filter("distance_metric" => ==("euclidean"), joint_lineage_table), "identifier")
    push!(euclidean_joint_lineage_table, first(g))
end
# the node labels built later rely on this table being ordered by identifier
@assert issorted(euclidean_joint_lineage_table[!, "identifier"])

In [None]:
# the lineage rows must line up one-to-one, in order, with the locus records
@assert all(euclidean_joint_lineage_table[!, "identifier"] .== FASTX.identifier.(locus_records))

In [None]:
# restrict the distance matrix to locus-vs-locus comparisons
euclidean_distance_matrix_subset = euclidean_distance_matrix[locus_record_range, locus_record_range]
# UPGMA
# can also do linkage = single
@time hclust_result = Clustering.hclust(euclidean_distance_matrix_subset, linkage=:average, branchorder=:optimal)

In [None]:
# check that the alternative property names on the result refer to the same data
@assert hclust_result.heights == hclust_result.height

@assert hclust_result.merge == hclust_result.merges

@assert hclust_result.method == hclust_result.linkage

In [None]:
# One tree label per locus record: identifier, name and ICTV genus joined by "__".
# Whitespace and the characters Newick reserves for structure ( ) [ ] , : ; '
# are replaced with "_" so that a label can never break the tree syntax.
node_labels = [replace(join([row["identifier"], row["name"], row["Genus_ICTV"]], "__"), r"[\s()\[\],:;']" => "_") for row in DataFrames.eachrow(euclidean_joint_lineage_table)]

In [None]:
# Build a Newick string from the hierarchical clustering result.
# `newick[i]` holds the subtree created by merge row `i`. In the merges matrix a negative
# entry refers to an original observation (by index) and a positive entry to an earlier
# merge row.
newick = Dict{Int,String}()
for row in axes(hclust_result.merges, 1)
    left, right = hclust_result.merges[row, :]
    height = hclust_result.heights[row]
    # A Newick branch length is the length of the edge to the parent node, so a child that
    # is itself a subtree gets (parent height - child height); a leaf sits at height 0.
    # Using the parent height for every child would accumulate heights along each path.
    if left < 0
        l = node_labels[abs(left)]
        left_length = height
    else
        l = newick[left]
        left_length = height - hclust_result.heights[left]
    end
    if right < 0
        r = node_labels[abs(right)]
        right_length = height
    else
        r = newick[right]
        right_length = height - hclust_result.heights[right]
    end
    newick[row] = "($l:$left_length, $r:$right_length)"
end

# the last merge row is the root of the tree
newick_file = "$DIR/$(TODAY)-reference-phage.newick"
open(newick_file, "w") do io
    println(io, newick[size(hclust_result.merges, 1)] * ";")
end

In [None]:
# run(`rclone lsf locus_genomics_storefront:Phylogenies`)
# copy results to the storefront under phylogenies
# uploads the Newick tree and the classification table written in earlier cells
run(`rclone copy $(newick_file) locus_genomics_storefront:Phylogenies`)
run(`rclone copy $(phylogenetic_classifications_table_file) locus_genomics_storefront:Phylogenies`)

In [None]:
# find all nearly identical by blast, cosine distance, fastANI

In [None]:
# blastn has lots of multi-map hits that I'm not sure how to handle immediately, so skipping for now
# redo the blast steps above but reciprocal blast of internal phage

# build a nucleotide BLAST database from the query sequences themselves
run(`makeblastdb -parse_seqids -dbtype nucl -in $(joint_query_sequences_file) -out $(joint_query_sequences_file)`)

In [None]:
# 3 min
# blastn of the query sequences against the database built from the same file
reciprocal_blast_report = Mycelia.run_blast(out_dir = DIR, fasta = joint_query_sequences_file, blast_db = joint_query_sequences_file, blast_command = "blastn")

In [None]:
# presumably returns one table row per BLAST hit — confirm against Mycelia.parse_blast_report
reciprocal_blast_hits = Mycelia.parse_blast_report(reciprocal_blast_report)

In [None]:
# take only the best hit for each PAIR, ranked by bit score and then by % identity
reciprocal_blast_top_hits = DataFrames.DataFrame()
for g in DataFrames.groupby(reciprocal_blast_hits, ["query id", "subject id"])
    # non-mutating sort: `g` is a view into `reciprocal_blast_hits`, and the parent table
    # should not be reordered while its groups are being iterated
    sorted_g = sort(g, ["bit score", "% identity"], rev=true)
    top_hit = sorted_g[1, :]
    push!(reciprocal_blast_top_hits, top_hit)
end

# description of each query record, keyed by its identifier
locus_id_to_name_map = Dict(FASTX.identifier(x) => FASTX.description(x) for x in locus_records)

# use the column names shared by the other pairwise similarity tables
DataFrames.rename!(reciprocal_blast_top_hits,
    ["query id" => "query_identifier",
     "subject id" => "reference_identifier",
     "subject title" => "reference_name",
     "% identity" => "%_identity"])

reciprocal_blast_top_hits[!, "query_name"] = map(x -> locus_id_to_name_map[x], reciprocal_blast_top_hits[!, "query_identifier"])
reciprocal_blast_top_hits[!, "distance_metric"] .= "blast"
# distance on a 0-1 scale, so it is comparable with the cosine distances it is later stacked
# with (whose "%_identity" is (1 - distance) * 100)
reciprocal_blast_top_hits[!, "distance"] = map(x -> (100 - x) / 100, reciprocal_blast_top_hits[!, "%_identity"])
reciprocal_blast_top_hits = reciprocal_blast_top_hits[!, ["query_identifier", "reference_identifier", "distance_metric", "%_identity", "query_name", "reference_name", "distance"]]

In [None]:
# redo the ani steps but reciprocal blast of internal phage
# the query list is passed as both the query and the reference list, so every query is compared against every query
reciprocal_fasta_ani_outfile = joinpath(DIR, "reciprocal-fastani.txt")
Mycelia.fastani(query_list=query_list, reference_list=query_list, outfile=reciprocal_fasta_ani_outfile)

In [None]:
# NOTE(review): this cell reads the columns "query_identifier", "reference_identifier" and
# "%_identity" from Mycelia.read_fastani, while the reference-comparison cell reads
# "identifier", "closest_reference" and "% identity" from the same function — confirm which
# column names the installed Mycelia version actually returns.
reciprocal_ani_table = Mycelia.read_fastani(reciprocal_fasta_ani_outfile)

# only take the first example of each pair, since there should be reciprocal matches
# reciprocal_ani_table = reciprocal_ani_table[reciprocal_ani_table[!, "query identifier"] .< reciprocal_ani_table[!, "reference identifier"], :]
# reduce the FASTA paths to bare identifiers (file name without ".fna")
reciprocal_ani_table[!, "query_identifier"] = map(x -> replace(basename(x), ".fna" => ""), reciprocal_ani_table[!, "query_identifier"])
reciprocal_ani_table[!, "reference_identifier"] = map(x -> replace(basename(x), ".fna" => ""), reciprocal_ani_table[!, "reference_identifier"])
sort!(reciprocal_ani_table);

reciprocal_ani_table[!, "distance_metric"] .= "fastANI"
reciprocal_ani_table = reciprocal_ani_table[!, ["query_identifier", "reference_identifier", "distance_metric", "%_identity"]]


# both sides of each pair are query records, so both names come from the same lookup
reciprocal_ani_table[!, "query_name"] = map(x -> locus_id_to_name_map[x], reciprocal_ani_table[!, "query_identifier"])
reciprocal_ani_table[!, "reference_name"] = map(x -> locus_id_to_name_map[x], reciprocal_ani_table[!, "reference_identifier"])
# distance on a 0-1 scale, so it is comparable with the cosine distances it is later stacked
# with (whose "%_identity" is (1 - distance) * 100)
reciprocal_ani_table[!, "distance"] = map(x -> (100 - x) / 100, reciprocal_ani_table[!, "%_identity"])

In [None]:
# One row per ordered pair of locus records (self-pairs included) with their cosine distance.
cosine_distance_table = DataFrames.DataFrame()

for i1 in locus_record_range
    record_1 = joint_records[i1]
    for i2 in locus_record_range
        record_2 = joint_records[i2]
        row = (
            query_identifier = FASTX.identifier(record_1),
            query_name = FASTX.description(record_1),
            reference_identifier = FASTX.identifier(record_2),
            reference_name = FASTX.description(record_2),
            distance_metric = "cosine",
            distance = cosine_distance_matrix[i1, i2]
        )
        push!(cosine_distance_table, row)
    end
end
# express the cosine distance as a percent identity
cosine_distance_table[!, "%_identity"] = map(x -> ((1 - x) * 100), cosine_distance_table[!, "distance"])
cosine_distance_table

In [None]:
# stack the three pairwise similarity tables (cosine, fastANI, blast) into one
phage_similarity_table = vcat(cosine_distance_table, reciprocal_ani_table, reciprocal_blast_top_hits)
sort!(phage_similarity_table, ["query_identifier", "reference_identifier", "distance_metric"])
# drop self-comparisons
phage_similarity_table = phage_similarity_table[phage_similarity_table[!, "query_identifier"] .!= phage_similarity_table[!, "reference_identifier"], :]

In [None]:
# includes the p00jc & 7225 similarity that the team was frustrated by
# # /home/jovyan/workspace/scratch/2022-11-22-INFO-488-phylogenetic-classification-benchling-upload/query-fastas/rPHAGE000150.fna
# # /home/jovyan/workspace/scratch/2022-11-22-INFO-488-phylogenetic-classification-benchling-upload/query-fastas/rPHAGE007225.fna
# # 99.9983
# reciprocal_ani_table[findall(x -> occursin("rPHAGE000150", x), reciprocal_ani_table[!, "identifier"]), :]

# reciprocal_ani_table[reciprocal_ani_table[!, "% identity"] .>= 99.99, :]

# phage_similarity_table = phage_similarity_table[phage_similarity_table[!, "% identity"] .>= 99.99, :]
# 95% identity is the batch release cutoff, so set that as the minimum hard filter and then we can increase filtering stringency later
# keep only rows at or above 95 percent identity
phage_similarity_table = phage_similarity_table[phage_similarity_table[!, "%_identity"] .>= 95, :]

In [None]:
# only take groups where all 3 metrics hit the threshold
# The required metrics are listed explicitly. Counting the metrics still present in the
# already-filtered table would silently relax the rule whenever one metric has no
# surviving rows at all.
required_distance_metrics = ["blast", "cosine", "fastANI"]
num_distance_metrics = length(required_distance_metrics)
phage_similarity_table_subset = DataFrames.DataFrame()
for g in DataFrames.groupby(phage_similarity_table, ["query_identifier", "reference_identifier"])
    # keep the pair only when every required metric contributed a row
    if issubset(required_distance_metrics, g[!, "distance_metric"])
        for row in DataFrames.eachrow(g)
            push!(phage_similarity_table_subset, row)
        end
    end
end
phage_similarity_table_subset

In [None]:
# Write the highly-similar pairs to a date-stamped, tab-delimited file under DIR and upload it
phage_similarity_table_subset_file = joinpath(DIR, "$(TODAY)-highly-similar-reference-phage.tsv")
uCSV.write(phage_similarity_table_subset_file, phage_similarity_table_subset, delim='\t')
run(`rclone copy $(phage_similarity_table_subset_file) locus_genomics_storefront:Phylogenies`)

In [None]:
# phage_to_archive = sort(unique(phage_similarity_table_subset[!, ["reference_identifier", "reference_name"]]))

In [None]:
# phage_to_archive_file = joinpath(DIR, "$(TODAY)-phage-to-archive.tsv")
# uCSV.write(phage_to_archive_file, phage_to_archive, delim='\t')
# run(`rclone copy $(phage_to_archive_file) locus_genomics_storefront:Phylogenies`)

In [None]:
# for each entity, if we have uniform agreement on ID, upload the genus and species information to Benchling for that reference phage

In [None]:
# TODO write classifications to Benchling

In [None]:
# TODO flag redundant phage in benchling

In [None]:
# run(`rclone copy $(DIR) locus_google_drive:scratch/$(DATE_TASK)`)

In [None]:
# run(`rclone copy $(DIR) locus_google_drive:scratch/$(DATE_TASK)`)