In [None]:
# If the plotting libraries fail to load, try resetting the LD path for julia.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# An unset variable is accepted as well as an empty one (indexing ENV directly
# would throw a KeyError when the variable is missing).
if !isempty(get(ENV, "LD_LIBRARY_PATH", ""))
    error("LD_LIBRARY_PATH must be empty or unset, found: " * ENV["LD_LIBRARY_PATH"])
end
import Pkg
# Work in a throwaway environment so the notebook does not touch the default one.
Pkg.activate(; temp=true)
Pkg.add("Revise")
import Revise

# Alternative Mycelia checkouts, kept for reference:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Packages that are installed into the temporary environment and then imported by name.
pkgs = String[
    "DataFrames",
    "uCSV",
    "OrderedCollections",
    "CSV",
    "XAM",
    "CodecZlib",
    "ProgressMeter",
    "StatsPlots"
]
Pkg.add(pkgs)
for pkg in pkgs
    # Build the `import <pkg>` expression directly instead of parsing a string.
    @eval import $(Symbol(pkg))
end

In [None]:
# The project root is taken to be the parent of the current working directory.
project_dir = pwd() |> dirname
# `mkpath` creates the data directory when it is missing and returns its path.
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# Full paths of every entry found directly under <data_dir>/samples.
pilot_samples = readdir(
    joinpath(data_dir, "samples");
    join = true,
)

In [None]:
# One row per QC-filtered batch: the sample path, its paired read files and
# the combined size in bytes of the two read files.
pilot_sample_table = DataFrames.DataFrame(
    sample_ID = String[],
    forward_reads = String[],
    reverse_reads = String[],
    filesize = Int[]
)

# run on just non-human reads first
for pilot_sample in pilot_samples
    qc_filtered_batches = filter(x -> occursin(r"trimgalore$", x), readdir(pilot_sample, join=true))
    for qc_filtered_batch in qc_filtered_batches
        # List the batch directory once and reuse the listing for both mates.
        batch_files = readdir(qc_filtered_batch, join=true)

        forward_matches = filter(x -> occursin(r"\.unmapped\.1\.fq\.gz$", x), batch_files)
        # forward_matches = filter(x -> occursin(r"1_val_1\.fq\.gz$", x), batch_files)
        # Explicit checks instead of @assert so a failure names the offending directory.
        if length(forward_matches) != 1
            error("expected exactly one forward reads file in $(qc_filtered_batch), found $(length(forward_matches))")
        end
        forward_reads = first(forward_matches)

        reverse_matches = filter(x -> occursin(r"\.unmapped\.2\.fq\.gz$", x), batch_files)
        # reverse_matches = filter(x -> occursin(r"2_val_2\.fq\.gz$", x), batch_files)
        if length(reverse_matches) != 1
            error("expected exactly one reverse reads file in $(qc_filtered_batch), found $(length(reverse_matches))")
        end
        reverse_reads = first(reverse_matches)

        joint_filesize = filesize(forward_reads) + filesize(reverse_reads)
        # sample_ID holds the full sample path because the listing was made with join=true.
        row = (sample_ID = pilot_sample, forward_reads = forward_reads, reverse_reads = reverse_reads, filesize = joint_filesize)
        push!(pilot_sample_table, row)
    end
end
pilot_sample_table

In [None]:
# Reorder the table in place by ascending combined read file size.
sort!(pilot_sample_table, :filesize)

In [None]:
# The Kraken output directory name is derived from the forward reads path by
# swapping the ".1.fq.gz" part for "_kraken".
kraken_directories = map(pilot_sample_table[!, "forward_reads"]) do reads_path
    replace(reads_path, ".1.fq.gz" => "_kraken")
end

# Take the first file in each Kraken directory whose name matches the report pattern.
pilot_sample_table[!, "kraken_results"] = map(kraken_directories) do kraken_directory
    directory_listing = readdir(kraken_directory, join=true)
    report_candidates = filter(x -> occursin(r"k2_pluspfp_20231009\.kraken\-report\.tsv$", x), directory_listing)
    first(report_candidates)
end

# The alignment file path is derived from the forward reads path in the same way.
pilot_sample_table[!, "mapped_reads"] = map(pilot_sample_table[!, "forward_reads"]) do reads_path
    replace(reads_path, ".1.fq.gz" => ".nt.fna.gz.xsr.I51G.mmi.minimap2.sam.gz")
end
pilot_sample_table

In [None]:
# Start both percentage columns at 0.0; only the Kraken column is filled in here.
pilot_sample_table[!, "%classified_kraken"] .= 0.0
pilot_sample_table[!, "%classified_mapping"] .= 0.0
for (i, row) in enumerate(DataFrames.eachrow(pilot_sample_table))
    kraken_report = Mycelia.read_kraken_report(row["kraken_results"])
    # The row with taxon ID 1 (the NCBI root) carries the share of fragments
    # classified at or below the root, i.e. everything that was classified.
    root_row = findfirst(==(1), kraken_report[!, "ncbi_taxonid"])
    if isnothing(root_row)
        # No root row was found (presumably a report with no classified
        # fragments — TODO confirm); keep the 0.0 default rather than failing
        # on indexing with `nothing`.
        @warn "no row with ncbi_taxonid == 1 in Kraken report; leaving %classified_kraken at 0.0" report = row["kraken_results"]
        continue
    end
    pilot_sample_table[i, "%classified_kraken"] = kraken_report[root_row, "percentage_of_fragments_at_or_below_taxon"]
end
pilot_sample_table[!, "%classified_kraken"]

In [None]:
"""
    parse_xam_to_classification_stats(xam)

Collect the template (read) names found in the alignment file `xam` and split
them into names with at least one mapped record and names with none.

Supported file extensions are `.bam`, `.sam` and `.sam.gz`.

# Returns
A named tuple `(; mapped_ids, unmapped_ids)` of `Set{String}`. A name that has
any mapped record appears only in `mapped_ids`.

# Throws
An `ErrorException` when the extension of `xam` is not recognized.
"""
function parse_xam_to_classification_stats(xam)
    # Breadcrumb: this function was merged from `xam_records_to_dataframe(records)`;
    # the per-record table that version built was never used here and is gone.
    is_bam = occursin(r"\.bam$", xam)
    is_plain_sam = occursin(r"\.sam$", xam)
    # The dot before "gz" is escaped so that only a literal ".sam.gz" suffix matches.
    is_gzipped_sam = occursin(r"\.sam\.gz$", xam)
    if !(is_bam || is_plain_sam || is_gzipped_sam)
        error("unrecognized file extension in file: $xam")
    end

    # Use the reader and the record accessors of the module matching the format.
    format = is_bam ? XAM.BAM : XAM.SAM

    mapped_ids = Set{String}()
    unmapped_ids = Set{String}()

    file_io = open(xam)
    io = is_gzipped_sam ? CodecZlib.GzipDecompressorStream(file_io) : file_io
    try
        reader = if is_bam
            # BAM is binary and cannot be filtered line by line; read it directly.
            format.Reader(io)
        else
            # filter out header lines
            # NOTE: the remaining alignment text is held in memory as one string.
            alignment_lines = Iterators.filter(line -> !startswith(line, '@'), eachline(io))
            format.Reader(IOBuffer(join(alignment_lines, '\n')))
        end
        for record in reader
            if format.ismapped(record)
                push!(mapped_ids, format.tempname(record))
            else
                push!(unmapped_ids, format.tempname(record))
            end
        end
    finally
        # Release the file handle even when parsing fails part-way through.
        close(io)
    end

    # A template counts as unmapped only if none of its records is mapped.
    setdiff!(unmapped_ids, mapped_ids)
    return (; mapped_ids, unmapped_ids)
end

# Try the parser on the alignment file of the first table row before running it on every sample.
mapping_id_sets = parse_xam_to_classification_stats(
    first(pilot_sample_table[!, "mapped_reads"]),
)

In [None]:
# For every sample, record the percentage of read names that have at least one mapped record.
ProgressMeter.@showprogress for i in 1:DataFrames.nrow(pilot_sample_table)
    mapping_id_sets = parse_xam_to_classification_stats(pilot_sample_table[i, "mapped_reads"])
    n_mapped = length(mapping_id_sets.mapped_ids)
    n_unmapped = length(mapping_id_sets.unmapped_ids)
    # Fraction of all read names that mapped, scaled to a percentage.
    pilot_sample_table[i, "%classified_mapping"] = n_mapped / (n_mapped + n_unmapped) * 100
end

pilot_sample_table[!, "%classified_mapping"]

In [None]:
# Scatter of per-sample classification percentages: Kraken on x, read mapping on y,
# with both axes fixed to the 0-100 range.
p = StatsPlots.scatter(
    pilot_sample_table[!, "%classified_kraken"],
    pilot_sample_table[!, "%classified_mapping"];
    title = "relative classification gains for NONHUMAN reads",
    xlabel = "% classified kraken NT (OLD)",
    ylabel = "% classified mapping NT (NEW)",
    xlims = [0, 100],
    ylims = [0, 100],
)
# Draw the y = x diagonal on top; points above it are samples where mapping
# classified a larger share of reads than Kraken.
StatsPlots.plot(p, 0:100, 0:100)