In [None]:
# Notebook environment setup: activate the project found in the current
# directory, install the analysis packages, and import each of them by name.
import Pkg
Pkg.activate(".")

# not for the faint of heart!
# Pkg.update()

# Packages used by this notebook. They are brought in with `import` (not
# `using`), so every call below is written with its module prefix,
# e.g. `DataFrames.DataFrame`.
pkgs = [
"ProgressMeter",
"uCSV",
"DataFrames",
"StatsPlots",
"StatsBase",
"Statistics"
]
Pkg.add(pkgs)
for pkg in pkgs
    # Builds the statement `import <pkg>` from the package name and evaluates it.
    eval(Meta.parse("import $pkg"))
end

# Register the package located three directories up as a development
# dependency of this environment, then load it.
Pkg.develop(path="../../..")
import Mycelia

In [None]:
# Project root: the parent of the directory this notebook is run from.
PROJECT_BASE = pwd() |> dirname

In [None]:
# Directory holding one sub-directory per SRA run.
SRA_DIR = joinpath(PROJECT_BASE, joinpath("data", "SRA"))

In [None]:
# Names of the entries in SRA_DIR that start with "SRR" (one per run).
SRRs = filter(readdir(SRA_DIR)) do entry
    occursin(r"^SRR", entry)
end

In [None]:
# Build one table holding the "Viruses" row of every sample's Kraken report,
# with a `sample_id` column recording which run each row came from.

# NCBI taxonomy ID used to pick out the Viruses row of each report.
VIRUSES_TAXON_ID = 10239

# Column names applied to every Kraken report, in file order. Defined once
# here because the list is identical for every sample.
# ranks: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
KRAKEN_REPORT_COLUMNS = [
    "percentage_of_fragments_at_or_below_taxon",
    "number_of_fragments_at_or_below_taxon",
    "number_of_fragments_assigned_directly_to_taxon",
    "rank",
    "ncbi_taxonid",
    "scientific_name"
]

joint_virus_table = DataFrames.DataFrame()
ProgressMeter.@showprogress for SRR in SRRs
    kraken_dir = joinpath(SRA_DIR, SRR, "kraken")
    kraken_reports = filter(x -> occursin(r"\.kraken-report\.tsv$", x), readdir(kraken_dir))
    # Exactly one report is expected per sample. Raise an explicit error that
    # names the directory rather than using `@assert`, which is meant for
    # internal invariants and may be disabled.
    if length(kraken_reports) != 1
        error("expected exactly one *.kraken-report.tsv in $(kraken_dir), found $(length(kraken_reports))")
    end
    kraken_report = first(kraken_reports)

    # The header returned by uCSV is discarded; columns are named from
    # KRAKEN_REPORT_COLUMNS instead.
    data, _ = uCSV.read(joinpath(kraken_dir, kraken_report), delim='\t')
    kraken_report_table = DataFrames.DataFrame(data, KRAKEN_REPORT_COLUMNS)
    kraken_report_table[!, "sample_id"] .= SRR

    # Append the matching row(s) in one call instead of pushing row by row.
    is_viruses_row = kraken_report_table[!, "ncbi_taxonid"] .== VIRUSES_TAXON_ID
    append!(joint_virus_table, kraken_report_table[is_viruses_row, :])
end
joint_virus_table

In [None]:
# Keep only the samples in which at least one fragment was classified at or
# below the Viruses taxon.
has_viral_fragments = joint_virus_table[!, "number_of_fragments_at_or_below_taxon"] .> 0
filtered_joint_virus_table = joint_virus_table[has_viral_fragments, :]

In [None]:
# Display the per-sample Viruses rows that have a non-zero fragment count.
filtered_joint_virus_table

In [None]:
# Samples whose viral share of fragments is at least 1 %, highest share first.
percent_column = "percentage_of_fragments_at_or_below_taxon"
at_least_one_percent = filtered_joint_virus_table[!, percent_column] .>= 1
sort(filtered_joint_virus_table[at_least_one_percent, :], percent_column, rev=true)

In [None]:
# Samples whose full Kraken reports are loaded in the next cell.
# NOTE(review): presumably the sample IDs returned by the >= 1 % viral-fragment
# filter in the preceding cell. The list is hand-entered, so confirm it still
# matches that output whenever the underlying data changes.
one_percent_viral_samples = [
    "SRR7365459",
    "SRR6399596",
    "SRR6399584",
    "SRR6399485"
]

In [None]:
# Concatenate the complete Kraken reports of the selected samples into one
# table, with a `sample_id` column recording which run each row came from.
# Reports are appended one after another, so each sample's rows stay contiguous.

# Column names applied to every Kraken report, in file order. Defined once
# here because the list is identical for every sample.
# ranks: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
KRAKEN_REPORT_COLUMNS = [
    "percentage_of_fragments_at_or_below_taxon",
    "number_of_fragments_at_or_below_taxon",
    "number_of_fragments_assigned_directly_to_taxon",
    "rank",
    "ncbi_taxonid",
    "scientific_name"
]

targeted_sample_table = DataFrames.DataFrame()
ProgressMeter.@showprogress for SRR in one_percent_viral_samples
    kraken_dir = joinpath(SRA_DIR, SRR, "kraken")
    kraken_reports = filter(x -> occursin(r"\.kraken-report\.tsv$", x), readdir(kraken_dir))
    # Exactly one report is expected per sample. Raise an explicit error that
    # names the directory rather than using `@assert`, which is meant for
    # internal invariants and may be disabled.
    if length(kraken_reports) != 1
        error("expected exactly one *.kraken-report.tsv in $(kraken_dir), found $(length(kraken_reports))")
    end
    kraken_report = first(kraken_reports)

    # The header returned by uCSV is discarded; columns are named from
    # KRAKEN_REPORT_COLUMNS instead.
    data, _ = uCSV.read(joinpath(kraken_dir, kraken_report), delim='\t')
    kraken_report_table = DataFrames.DataFrame(data, KRAKEN_REPORT_COLUMNS)
    kraken_report_table[!, "sample_id"] .= SRR

    # Append the whole report at once; pushing row by row is needlessly slow
    # for reports with many rows.
    append!(targeted_sample_table, kraken_report_table)
end
targeted_sample_table

In [None]:
# unique(targeted_sample_table[!, "rank"])
# Row indices of every entry whose rank code is exactly "D" (domain).
findall(rank -> rank == "D", targeted_sample_table[!, "rank"])

In [None]:
# Row indices of the Viruses entry (NCBI taxon ID 10239) of each sample.
findall(taxon_id -> taxon_id == 10239, targeted_sample_table[!, "ncbi_taxonid"])

In [None]:
# Row index of the Viruses entry (NCBI taxon ID 10239) in each sample's report.
viral_starts = findall(targeted_sample_table[!, "ncbi_taxonid"] .== 10239)

"""
    viral_clade_range(ranks, sample_ids, start)

Return the row range that begins at `start` and extends until just before the
next row whose rank is exactly `"D"`, or until the last row carrying the same
sample ID as `start`, whichever comes first.

# Arguments
- `ranks`: rank code of every row of the concatenated report table.
- `sample_ids`: sample ID of every row, parallel to `ranks`.
- `start`: row index of the clade's first row.
"""
function viral_clade_range(ranks, sample_ids, start)
    stop = start
    # Capping at the sample boundary matters: the first rank-"D" row of the
    # following sample is preceded by that sample's own leading rows, which
    # must not be attributed to this sample's viral clade.
    while stop < lastindex(ranks) &&
          sample_ids[stop + 1] == sample_ids[start] &&
          ranks[stop + 1] != "D"
        stop += 1
    end
    return start:stop
end

# One row range per sample, computed from the table rather than typed in by
# hand, so it stays valid when the sample list or the reports change.
# NOTE(review): assumes each sample's rows are contiguous and that the rows of
# the Viruses clade directly follow the Viruses row up to the next rank-"D"
# row; confirm against the Kraken report layout.
viral_ranges = [
    viral_clade_range(
        targeted_sample_table[!, "rank"],
        targeted_sample_table[!, "sample_id"],
        start,
    )
    for start in viral_starts
]

In [None]:
# Flatten the per-sample ranges into a single vector of row indices.
viral_rows = mapreduce(collect, vcat, viral_ranges)

# Restrict the table to those rows, then drop every taxon that has no
# fragments at or below it.
viral_subset = targeted_sample_table[viral_rows, :]
has_fragments = viral_subset[!, "number_of_fragments_at_or_below_taxon"] .>= 1
targeted_sample_table = viral_subset[has_fragments, :]

In [None]:
# Show the working directory; the TSV written in the next cell uses a relative
# file name and is therefore created here.
pwd()

In [None]:
# Save the filtered viral rows as a tab-separated file in the working directory.
output_path = "targetted_viral_samples.tsv"
uCSV.write(output_path, targeted_sample_table; delim='\t')

In [None]:
# Display the filtered viral rows of the targeted samples.
targeted_sample_table

In [None]:
# sort(filtered_joint_virus_table[filtered_joint_virus_table[!, "percentage_of_fragments_at_or_below_taxon"] .>= 0.1, :], "percentage_of_fragments_at_or_below_taxon", rev=true)

In [None]:
# sort(filtered_joint_virus_table, "percentage_of_fragments_at_or_below_taxon", rev=true)[1:100, :]

In [None]:
# sort(filtered_joint_virus_table, "number_of_fragments_at_or_below_taxon", rev=true)[1:10, :]

In [None]:
# StatsBase.describe(filtered_joint_virus_table[!, "percentage_of_fragments_at_or_below_taxon"])

# # Summary Stats:
# # Length:         434
# # Missing Count:  0
# # Mean:           0.076336
# # Minimum:        0.000000
# # 1st Quartile:   0.000000
# # Median:         0.010000
# # 3rd Quartile:   0.030000
# # Maximum:        7.420000

In [None]:
# StatsBase.describe(filtered_joint_virus_table[!, "number_of_fragments_at_or_below_taxon"])

# # Summary Stats:
# # Length:         434
# # Missing Count:  0
# # Mean:           36341.534562
# # Minimum:        85.000000
# # 1st Quartile:   1951.750000
# # Median:         5345.000000
# # 3rd Quartile:   13846.750000
# # Maximum:        2694174.000000
# # Type:           Int64

# # Maximum:        2,694,174.000000

In [None]:
# StatsPlots.histogram(
#     filtered_joint_virus_table[!, "percentage_of_fragments_at_or_below_taxon"],
#     ylabel = "# of samples",
#     xlabel = "% of reads classified as viral",
#     title = "Relative abundance of viral sequences in exposome",
#     legend = false
# )

In [None]:
# StatsPlots.histogram(
#     filtered_joint_virus_table[!, "number_of_fragments_at_or_below_taxon"],
#     ylabel = "# of samples",
#     xlabel = "# of reads classified as viral",
#     title = "Absolute abundance of viral sequences in exposome",
#     legend = false
# )

In [None]:
# sort(filtered_joint_virus_table, "percentage_of_fragments_at_or_below_taxon", rev=true)[1:10, :]

In [None]:
# sort(filtered_joint_virus_table, "number_of_fragments_at_or_below_taxon", rev=true)[1:10, :]