In [1]:
# Bring the packages listed in `pkgs` into scope. `import` (rather than `using`)
# is what gets evaluated, so later cells refer to them by qualified name
# (e.g. `DataFrames.DataFrame`, `uCSV.read`).
import Pkg
pkgs = [
"DataFrames",
"uCSV",
"ProgressMeter",
"StatsPlots",
"StatsBase",
"FASTX"
]
for pkg in pkgs
    # Build the statement `import <pkg>` from the package name and evaluate it.
    eval(Meta.parse("import $pkg"))
end

In [None]:
# viral_tax_ids = Set(parse.(Int, filter(!isempty, readlines(`conda run -n taxonkit taxonkit list --ids 10239 --indent ""`))))

In [2]:
# Load the NCBI, ICTV and VirusHostDB host-metadata tables from ../metadata and
# derive the taxid collections (`vertebrate_taxids`, `human_taxids`) from them.

# NCBI host metadata
ncbi_metadata_file = joinpath(dirname(pwd()), "metadata", "NCBI-virus-refseq.transformed.tsv")
ncbi_host_metadata = DataFrames.DataFrame(
    uCSV.read(
        ncbi_metadata_file;
        header=1,
        delim='\t',
        encodings=Dict("false" => false, "true" => true),
    ),
)

# ICTV host metadata
ictv_metadata_file = joinpath(dirname(pwd()), "metadata", "VMR_MSL38_v1 - VMR MSL38 v1.transformed.tsv")
ictv_host_metadata = DataFrames.DataFrame(
    uCSV.read(ictv_metadata_file; header=1, delim='\t', typedetectrows=100),
)
# Drop rows whose taxid field is empty, then convert the remaining taxids to Int.
ictv_host_metadata = ictv_host_metadata[map(!isempty, ictv_host_metadata[!, "taxid"]), :]
ictv_host_metadata[!, "taxid"] = map(taxid -> parse(Int, taxid), ictv_host_metadata[!, "taxid"])

# VirusHostDB metadata
virushostdb_metadata_file = joinpath(dirname(pwd()), "metadata", "virushostdb.transformed.tsv")
virushostdb_metadata = DataFrames.DataFrame(
    uCSV.read(
        virushostdb_metadata_file;
        header=1,
        delim='\t',
        typedetectrows=1086,
        encodings=Dict("missing" => missing, "false" => false, "true" => true),
    ),
)

# Taxids flagged as vertebrate-hosted in any of the three sources.
vertebrate_taxids = let
    ictv_is_vertebrate = ictv_host_metadata[!, "Host source"] .== "vertebrates"
    ncbi_is_vertebrate = ncbi_host_metadata[!, "host_is_vertebrate"]
    vhdb_is_vertebrate = virushostdb_metadata[!, "host_is_vertebrate"]
    union(
        ictv_host_metadata[ictv_is_vertebrate, "taxid"],
        ncbi_host_metadata[ncbi_is_vertebrate, "taxid"],
        virushostdb_metadata[vhdb_is_vertebrate, "virus_taxid"],
    )
end

# Taxids flagged as human-hosted by NCBI or VirusHostDB.
human_taxids = let
    ncbi_is_human = ncbi_host_metadata[!, "host_is_human"]
    vhdb_is_human = virushostdb_metadata[!, "host_is_human"]
    union(
        ncbi_host_metadata[ncbi_is_human, "taxid"],
        virushostdb_metadata[vhdb_is_human, "virus_taxid"],
    )
end

[33m[1m└ [22m[39m[90m@ uCSV ~/.julia/packages/uCSV/goLRH/src/read.jl:213[39m


1559-element Vector{Int64}:
  351073
 3051992
 3052643
 3052759
 2795181
 2304647
 2825847
   11029
 3052040
 2845899
 2055263
   95342
 1965018
       ⋮
  356663
  356664
  373193
   11837
   38804
  132475
   11089
  617102
 2053026
  186538
 1764086
   64320

In [5]:
# Locate the per-sample directories under ../data/SRA that contain a MEGAHIT
# contig FASTA.
data_dir = joinpath(dirname(pwd()), "data")
sample_directories = filter(readdir(joinpath(data_dir, "SRA"), join=true)) do candidate
    # Ignore Jupyter checkpoint folders and samples without assembled contigs.
    !occursin(".ipynb_checkpoints", candidate) &&
        isfile(joinpath(candidate, "megahit", "final.contigs.fastg.gfa.fna"))
end

594-element Vector{String}:
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399459"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399460"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399461"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399462"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399463"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399464"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399465"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399466"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399467"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399468"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399469"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/

In [6]:
# Gather the megablast and blastn report paths found in each sample's `blastn`
# directory.
megablast_results = String[]
blastn_results = String[]
for sample_directory in sample_directories
    blastn_directory = joinpath(sample_directory, "blastn")
    # `readdir` throws on a missing directory, which would abort the whole scan
    # for a sample that has contigs but no BLAST output; skip it with a warning.
    if !isdir(blastn_directory)
        @warn "no blastn directory for sample; skipping" sample_directory
        continue
    end
    # List the directory once and classify each entry by its file-name suffix.
    for report in readdir(blastn_directory, join=true)
        if occursin(r"\.nt\.megablast\.txt$", report)
            push!(megablast_results, report)
        elseif occursin(r"\.nt\.blastn\.txt$", report)
            push!(blastn_results, report)
        end
    end
end

In [9]:
# Empty accumulator table; the per-report loops in later cells `append!` their
# filtered BLAST hits to it.
joint_blast_results_table = DataFrames.DataFrame()

# Column-name => element-type lookup for BLAST tabular reports, grouped by type.
# NOTE(review): the previous literal listed "subject com names" and
# "subject title" twice each, so it only ever held 30 distinct keys. Confirm
# whether one of the "subject com names" entries was meant to be a different
# column name.
blast_col_types = let
    text_columns = [
        "query id",
        "query title",
        "subject id",
        "subject gi",
        "subject acc.",
        "subject acc.ver",
        "subject title",
        "subject sci name",
        "subject com names",
        "subject blast name",
        "subject super kingdom",
        "subject tax ids",
        "subject sci names",
        "subject blast names",
        "subject super kingdoms",
        "subject titles",
    ]
    integer_columns = [
        "query length",
        "subject length",
        "q. start",
        "q. end",
        "s. start",
        "s. end",
        "alignment length",
        "identical",
        "mismatches",
        "subject tax id",
    ]
    float_columns = [
        "evalue",
        "bit score",
        "score",
        "% identity",
    ]
    merge(
        Dict{String,DataType}(name => String for name in text_columns),
        Dict{String,DataType}(name => Int for name in integer_columns),
        Dict{String,DataType}(name => Float64 for name in float_columns),
    )
end

Dict{String, DataType} with 30 entries:
  "q. end"                 => Int64
  "query length"           => Int64
  "subject tax ids"        => String
  "subject title"          => String
  "subject length"         => Int64
  "subject blast names"    => String
  "alignment length"       => Int64
  "subject id"             => String
  "bit score"              => Float64
  "identical"              => Int64
  "query title"            => String
  "subject tax id"         => Int64
  "% identity"             => Float64
  "subject acc.ver"        => String
  "subject blast name"     => String
  "subject super kingdoms" => String
  "query id"               => String
  "s. start"               => Int64
  "s. end"                 => Int64
  "q. start"               => Int64
  "subject super kingdom"  => String
  "subject sci name"       => String
  "mismatches"             => Int64
  "score"                  => Float64
  "subject acc."           => String
  ⋮                        => ⋮

In [None]:
# Shell note (not Julia code): grep -P "fixed_string1|fixed_string2|..." < input_file

In [13]:
# Show the first collected megablast report path.
first(megablast_results)

"/oak/stanford/scg/lab_mpsnyder/cjprybol/Mycelia/projects/viral-exposome-discovery/data/SRA/SRR6399459/blastn/final.contigs.fastg.gfa.fna.blastn.nt.megablast.txt"

In [11]:
# Scan every megablast report, keep the rows whose subject tax ids include one of
# `human_taxids`, and append them to `joint_blast_results_table`.
# Membership is tested against a Set so each lookup is O(1) instead of a linear
# scan of the `human_taxids` vector for every row.
human_taxid_lookup = Set(human_taxids)
ProgressMeter.@showprogress for blast_report in megablast_results
    header = nothing
    viral_lines = String[]
    # Single pass over the file: take the column names from the first
    # "# Fields:" comment and collect matching data rows as we go.
    for line in eachline(blast_report)
        if startswith(line, '#')
            # Only lines that *start* with '#' are comments; a '#' inside a data
            # row (e.g. in a subject title) must not cause the row to be dropped.
            if header === nothing && occursin("# Fields:", line)
                header = split(last(split(line, ": ")), ", ")
            end
        elseif !isempty(line)
            # The last tab-separated field holds ';'-separated subject tax ids.
            # Non-numeric entries (e.g. "N/A") make `tryparse` return `nothing`
            # and are ignored rather than aborting the whole run.
            tax_id_field = last(split(line, '\t'))
            is_hit = any(split(tax_id_field, ';')) do token
                tax_id = tryparse(Int, token)
                tax_id !== nothing && tax_id in human_taxid_lookup
            end
            is_hit && push!(viral_lines, line)
        end
    end
    # A report with no field header or no matching rows has nothing to append.
    if header !== nothing && !isempty(viral_lines)
        data, _ = uCSV.read(IOBuffer(join(viral_lines, '\n')), delim='\t', typedetectrows=100, encodings=Dict("N/A" => missing))
        results_table = DataFrames.DataFrame(data, header)
        results_table[!, "method"] .= "megablast"
        # Reports live at <sample_directory>/blastn/<file>, and every file shares
        # the same base name, so the sample id is the grandparent directory name.
        results_table[!, "sample"] .= basename(dirname(dirname(blast_report)))
        append!(joint_blast_results_table, results_table, promote=true)
    end
end

LoadError: InterruptException:

In [None]:
# Scan every blastn report, keep the rows whose subject tax ids include one of
# `human_taxids`, and append them to `joint_blast_results_table`. A report that
# fails to parse is skipped (best effort) with the cause logged.
# Membership is tested against a Set so each lookup is O(1) instead of a linear
# scan of the `human_taxids` vector for every row.
human_taxid_lookup = Set(human_taxids)
ProgressMeter.@showprogress for blast_report in blastn_results
    try
        header = nothing
        viral_lines = String[]
        # Single pass over the file: take the column names from the first
        # "# Fields:" comment and collect matching data rows as we go.
        for line in eachline(blast_report)
            if startswith(line, '#')
                # Only lines that *start* with '#' are comments; a '#' inside a
                # data row must not cause the row to be dropped.
                if header === nothing && occursin("# Fields:", line)
                    header = split(last(split(line, ": ")), ", ")
                end
            elseif !isempty(line)
                # The last tab-separated field holds ';'-separated subject tax
                # ids; non-numeric entries (e.g. "N/A") are ignored.
                tax_id_field = last(split(line, '\t'))
                is_hit = any(split(tax_id_field, ';')) do token
                    tax_id = tryparse(Int, token)
                    tax_id !== nothing && tax_id in human_taxid_lookup
                end
                is_hit && push!(viral_lines, line)
            end
        end
        # A report with no field header or no matching rows has nothing to append.
        if header !== nothing && !isempty(viral_lines)
            data, _ = uCSV.read(IOBuffer(join(viral_lines, '\n')), delim='\t', types=[blast_col_types[h] for h in header])
            results_table = DataFrames.DataFrame(data, header)
            results_table[!, "method"] .= "blastn"
            # Reports live at <sample_directory>/blastn/<file>, and every file
            # shares the same base name, so the sample id is the grandparent
            # directory name.
            results_table[!, "sample"] .= basename(dirname(dirname(blast_report)))
            append!(joint_blast_results_table, results_table, promote=true)
        end
    catch err
        # Never swallow a user interrupt: the loop must stay cancellable.
        err isa InterruptException && rethrow()
        # Keep the best-effort skip, but record why the report was skipped.
        @warn "skipping $(blast_report)" exception = (err, catch_backtrace())
    end
end

In [None]:
# Summary statistics of the "alignment length" column over all collected hits.
StatsBase.describe(joint_blast_results_table[!, "alignment length"])

In [None]:
# StatsPlots.histogram(joint_blast_results_table[!, "alignment length"])

In [None]:
# Reorder the table in place so the longest alignments come first.
sort!(joint_blast_results_table, "alignment length", rev=true)

In [None]:
# Keep only the hits whose alignment is longer than 1000 positions.
joint_blast_results_table_filtered = filter("alignment length" => >(1000), joint_blast_results_table)

In [None]:
# Distinct query ids among the long-alignment hits (a Set de-duplicates on its own).
sequence_identifiers_of_interest = Set(joint_blast_results_table_filtered[!, "query id"])

In [None]:
# Collect the FASTA records whose identifier is in
# `sequence_identifiers_of_interest`. The FASTA path for each report is the
# report path with the ".blastn.nt.megablast.txt" suffix removed.
# NOTE(review): identifiers are matched without regard to sample, so a record
# with the same identifier in a different sample's FASTA is also collected —
# confirm that identifiers are unique across samples.
fasta_sequences = FASTX.FASTA.Record[]
ProgressMeter.@showprogress for fasta_file in replace.(megablast_results, ".blastn.nt.megablast.txt" => "")
    reader = FASTX.FASTA.Reader(open(fasta_file))
    try
        for record in reader
            if FASTX.identifier(record) in sequence_identifiers_of_interest
                push!(fasta_sequences, record)
            end
        end
    finally
        # Close the reader (and its file handle) even if iteration fails, so
        # looping over many files does not leak open handles.
        close(reader)
    end
end
fasta_sequences

In [None]:
# Print one collected FASTA record for manual inspection. The list below records
# a hand-written annotation for each index `i` into `fasta_sequences`.
# NOTE(review): `i` is not assigned anywhere in the visible notebook; it
# presumably has to be set by hand before running this cell.
# i = 1 labrador retriever
# i = 2 unknown
# i = 3 bacteriophage
# i = 4 bacteriophage
# i = 5 dog
# i = 6 dog
# i = 7 dog
# i = 8 dog
# i = 9 dog
# i = 10 dog
# i = 11 e. coli or e. coli phage
# i = 12 e. coli or e. coli phage
# i = 13 e. coli or e. coli phage
# i = 14  e. coli or e. coli phage
# i = 15  e. coli or e. coli phage
# i = 16  e. coli or e. coli phage
record = fasta_sequences[i]
println(record)

In [None]:
# for x in sort(unique(joint_blast_results_table_filtered[!, "subject title"]))
#     println(x)
# end