In [None]:
# run(`conda create --channel conda-forge --channel bioconda --channel defaults --strict-channel-priority --name blast blast`)

In [None]:
import Pkg
# Third-party packages this notebook relies on.
pkgs = [
    "Revise",
    "DataFrames",
    "uCSV",
    "StatsPlots",
    "FASTX",
    "Conda",
    "XAM",
    "StatsBase"
]
# Pkg.add(pkgs)
# Import each package by name. `import` (rather than `using`) keeps every call
# site module-qualified, e.g. `DataFrames.DataFrame`. The import expression is
# built directly from a Symbol instead of round-tripping a string through the parser.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end
import Mycelia

In [None]:
# The data directory is a sibling of the current working directory:
# <parent of pwd>/data
data_dir = let parent_dir = dirname(pwd())
    joinpath(parent_dir, "data")
end

In [None]:
# Gzipped FASTA of the reference contigs.
db_fasta = joinpath(data_dir, "results/high-confidence-hiv-contigs.fna.gz")
# Path of the uncompressed copy: strip only the trailing ".gz" extension.
# An unanchored ".gz" => "" replacement would also rewrite any ".gz" that
# happens to occur earlier in the path (e.g. inside a directory name).
ungzipped_db_fasta = replace(db_fasta, r"\.gz$" => "")
# Database label and BLAST task; both are embedded in the BLAST report file names.
db = "hiv-high-confidence"
# db_fasta = joinpath(data_dir, "results/high-confidence-viral-contigs.fna.gz")
# db = "viral-high-confidence"
blast_task = "megablast"

In [None]:
# Full paths of every `*.fasta` file in the joint-reads directory, ignoring
# anything whose path mentions a Jupyter checkpoint folder.
joint_reads_dir = joinpath(data_dir, "exposome_data", "joint-reads")
fasta_paths = filter(readdir(joint_reads_dir, join=true)) do path
    occursin(r"\.fasta$", path) && !occursin(".ipynb_checkpoints", path)
end
# sample_paths = filter(x -> isfile(joinpath.(x, "megahit", "final.contigs.fastg.gfa.fna")), sample_paths)

In [None]:
# Concatenate every available BLAST report for the input FASTA files into one table.
joint_blast_reports = DataFrames.DataFrame()
for fasta_path in fasta_paths
    # Reports are looked up next to their FASTA file:
    # <fasta name>.blastn.<db>.<blast task>.txt
    report_name = basename(fasta_path) * ".blastn.$(db).$(blast_task).txt"
    blast_report_file = joinpath(dirname(fasta_path), report_name)
    if isfile(blast_report_file)
        append!(joint_blast_reports, Mycelia.parse_blast_report(blast_report_file))
    else
        # Keep going, but make the gap visible instead of skipping silently.
        @warn "BLAST report not found; skipping" fasta_path blast_report_file
    end
end

In [None]:
# Keep a single row per query: the hit with the smallest e-value.
top_hits_table = DataFrames.DataFrame()
for query_hits in DataFrames.groupby(joint_blast_reports, "query id")
    ranked_hits = sort(query_hits, "evalue")
    best_hit = first(ranked_hits)
    push!(top_hits_table, best_hit)
end
top_hits_table

In [None]:
# Scatter plot of alignment length (x) against percent identity (y),
# one point per top hit. The figure is 4:3, scaled by 200 px per unit.
ratio = (4, 3)
alignment_lengths = top_hits_table[!, "alignment length"]
percent_identities = top_hits_table[!, "% identity"]
StatsPlots.scatter(
    alignment_lengths,
    percent_identities;
    xlabel = "alignment length",
    ylabel = "% identity",
    title = "PacBio HiFi alignments against 'HIV-like' short-read assembly contigs",
    title_font_size = 12,
    legend = false,
    size = ratio .* 200,
    dpi = 300,
)

In [None]:
# Load the reference records, then gather every read (across all input FASTA
# files) whose identifier is the query of at least one top BLAST hit.
reference_fasta_records = collect(Mycelia.open_fastx(db_fasta))
joint_observations = []
query_ids_of_interest = Set(top_hits_table[!, "query id"])
for fasta_path in fasta_paths, record in Mycelia.open_fastx(fasta_path)
    # Skip reads that never produced a top hit.
    FASTX.identifier(record) in query_ids_of_interest || continue
    push!(joint_observations, record)
end
joint_observations

In [None]:
# Write the selected reads to a FASTA file named after the reference FASTA.
blast_hits_fasta_file = db_fasta * ".pacbio-blast-hits.fasta"
open(blast_hits_fasta_file, "w") do output_stream
    fasta_writer = FASTX.FASTA.Writer(output_stream)
    foreach(record -> write(fasta_writer, record), joint_observations)
    close(fasta_writer)
end

In [None]:
# Create a dedicated Conda.jl environment and install minimap2 into it.
let env_name = :minimap2
    Conda.create(env_name)
    Conda.runconda(`install --channel conda-forge --channel bioconda --strict-channel-priority minimap2`, env_name)
end

In [None]:
# SAM output path: the hits FASTA path with ".sam" appended.
blast_hits_alignments_file = string(blast_hits_fasta_file, ".sam")

In [None]:
# Align the BLAST-hit reads against the uncompressed reference with minimap2
# (`-a` requests SAM output) and capture stdout in the SAM file.
# NOTE(review): no visible cell creates the uncompressed reference FASTA —
# confirm it exists before running this.
minimap2_command = `conda run -n minimap2 --live-stream minimap2 -a $(ungzipped_db_fasta) $(blast_hits_fasta_file)`
run(pipeline(minimap2_command, stdout=blast_hits_alignments_file))

In [None]:
# Output path: swap only the trailing ".sam" extension. An unanchored
# ".sam" => ... replacement would also rewrite any ".sam" occurring earlier in
# the path (for example inside a directory name).
filtered_blast_hits_alignment_file = replace(blast_hits_alignments_file, r"\.sam$" => ".filtered-and-sorted.sam")
# Parse the alignments, keeping mapped, primary records with MAPQ >= 30 and an
# alignment length >= 100 (as requested through the keyword arguments).
# NOTE(review): the output name says "sorted"; whether the records come back
# sorted depends on Mycelia.parse_xam — confirm.
xam = Mycelia.parse_xam(blast_hits_alignments_file, filter_unmapped=true, primary_only=true, min_mapping_quality=30, min_align_length=100)
# Re-emit the retained records, with the original header, as a new SAM file.
open(filtered_blast_hits_alignment_file, "w") do io
    sam_writer = XAM.SAM.Writer(io, xam.header)
    for record in xam.records
        write(sam_writer, record)
    end
    close(sam_writer)
end

In [None]:
# Build the FASTA index for the uncompressed reference with samtools faidx.
faidx_command = `conda run --live-stream -n samtools samtools faidx $(ungzipped_db_fasta)`
run(faidx_command)

In [None]:
# Convert the filtered SAM to BAM. `samtools sort` is used instead of
# `samtools view -bh` so the BAM is guaranteed to be coordinate-sorted, which
# `samtools index` requires; sorting an already-sorted file is harmless.
# Writing with `-o` also avoids piping binary BAM through `conda run` stdout.
filtered_blast_hits_alignment_file_bam = filtered_blast_hits_alignment_file * ".bam"
run(`conda run --live-stream -n samtools samtools sort -O bam -o $(filtered_blast_hits_alignment_file_bam) $(filtered_blast_hits_alignment_file)`)

In [None]:
# Index the BAM file with samtools index.
index_command = `conda run --live-stream -n samtools samtools index $(filtered_blast_hits_alignment_file_bam)`
run(index_command)

In [None]:
# Count the retained alignments per reference name and list the
# (reference name => count) pairs from most to least aligned.
contig_alignments = let
    reference_names = XAM.SAM.refname.(xam.records)
    alignments_per_contig = StatsBase.countmap(reference_names)
    sort(collect(alignments_per_contig), by=last, rev=true)
end

In [None]:
# Map each reference contig that received at least one alignment to its sequence.
contigs_with_alignments = Set(first.(contig_alignments))
ref_sequence_hits = let
    db_records = collect(Mycelia.open_fastx(db_fasta))
    hit_records = filter(record -> FASTX.identifier(record) in contigs_with_alignments, db_records)
    Dict(FASTX.identifier(record) => FASTX.sequence(record) for record in hit_records)
end

In [None]:
# Annotated listing that appears to be a pasted copy of the `contig_alignments`
# output (reference contig id => number of alignments). The trailing notes are
# presumably manual annotations of what each contig matched — confirm the source.
# 11-element Vector{Pair{String, Int64}}:
  # "3965" => 69 # human
   # "982" => 47 # human
 # "14459" => 9 # human
 # "13041" => 3 # human
 # "42270" => 1 # human
  # "1101" => 1 # human
 # "18373" => 1 # human
  # "7830" => 1 # human

# "96992" => 2 # human - LINE, endo/exo nuclease, RNA directed DNA polymerase
 # "75088" => 1 # human, conserved uncharacterized protein
# NOTE(review): unlike the entries above, the next line is NOT commented out, so
# it is evaluated as a Pair expression — this looks unintentional; confirm.
 "19160" => 1 # human, DNA directed RNA polymerase

In [None]:
# println(ref_sequence_hits["7830"])