The objective of this notebook is to:
- join all of the kraken results with the sample metadata
- subset to samples with metadata
- collapse participants P3 and beyond into "Other" (they have too few samples to analyse individually)
- do PCA, coloring by metadata
- do GLM modelling to determine whether any of the metadata variables have a significant effect

In [None]:
# Environment check and package loading for the whole notebook.
#
# Don't try to install the plotting libraries unless LD_LIBRARY_PATH is empty.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/.
# An unset variable is accepted as well: indexing ENV directly would raise a
# KeyError in that case even though nothing is on the library path.
isempty(get(ENV, "LD_LIBRARY_PATH", "")) ||
    error("LD_LIBRARY_PATH must be empty or unset, got: $(ENV["LD_LIBRARY_PATH"])")

import Pkg
pkgs = [
    "Revise",
    "DataFrames",
    "StatsBase",
    "StatsPlots",
    "uCSV",
    "ProgressMeter",
    "Distances",
    "Clustering",
    "Colors",
    "MultivariateStats",
    "Dates",
    "CategoricalArrays",
    "GLM",
    "Statistics",
    "DelimitedFiles",
    "PlotlyJS"
]
# Pkg.add(pkgs)

# `import` (not `using`) keeps every call below module-qualified.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end
import Mycelia

In [None]:
# "data" directory located in the parent of the current working directory
data_dir = joinpath(dirname(pwd()), "data")

In [None]:
# "results" directory inside data_dir
results_dir = joinpath(data_dir, "results")

In [None]:
# Load the two metadata tables, join them on the library name, coerce the
# date and numeric columns, and keep only the rows without missing values.
metadata_dir = joinpath(dirname(pwd()), "metadata")

# Both files are tab-delimited with a single header row.
exposome_environmental_data = DataFrames.DataFrame(
    uCSV.read(
        joinpath(metadata_dir, "metadata_exposome.rds.tsv");
        delim='\t',
        header=1,
        typedetectrows=300,
    ),
)

joint_sample_metadata = DataFrames.DataFrame(
    uCSV.read(
        joinpath(metadata_dir, "exposome/joint_sample_metadata.tsv");
        delim='\t',
        header=1,
        typedetectrows=300,
    ),
)

# The sample table carries the library name twice; make sure both copies agree
# before using one of them as the join key.
@assert joint_sample_metadata[!, "Library Name"] == joint_sample_metadata[!, "LibraryName"]
joint_metadata = DataFrames.innerjoin(
    joint_sample_metadata,
    exposome_environmental_data;
    on="Library Name" => "samplenames",
);

# # recode P3 and beyond to Other, since they don't have enough samples to do much analysis on
# joint_metadata[!, "aownership"] = map(x -> x in Set(["P1", "P2"]) ? x : "Others", joint_metadata[!, "aownership"])

# Parse the sampling window boundaries into Dates.
for date_column in ("date.start", "date.end")
    joint_metadata[!, date_column] = Dates.Date.(joint_metadata[!, date_column], "yyyy-mm-dd")
end
# use this if we want a unified axis across all participants, which I don't think we do
# joint_metadata[!, "date.start_relative"] = joint_metadata[!, "date.start"] .- first(joint_metadata[!, "date.start"])
# joint_metadata[!, "date.end_relative"] = joint_metadata[!, "date.end"] .- first(joint_metadata[!, "date.start"])
joint_metadata[!, "duration"] = joint_metadata[!, "date.end"] .- joint_metadata[!, "date.start"]

# Numeric measurements: anything that does not parse as a Float64 becomes `missing`.
for numeric_column in ("temperature", "humid", "particle", "latitude", "longitude")
    parsed = tryparse.(Float64, joint_metadata[!, numeric_column])
    joint_metadata[!, numeric_column] = something.(parsed, missing)
end

# Rows with a missing value in any column are discarded.
joint_metadata = DataFrames.dropmissing(joint_metadata)

In [None]:
# Locate one kraken report per run and stack all of them into a single long table.
sample_paths = sort(joinpath.(data_dir, "SRA", joint_metadata[!, "Run"]))
kraken_db = "k2_pluspfp"
# database name followed by an 8-digit suffix
kraken_db_regex = Regex("$(kraken_db)_\\d{8}")
kraken_reports = map(sample_paths) do sample_path
    candidates = readdir(joinpath(sample_path, "kraken"), join=true)
    matching_reports = filter(candidates) do candidate
        occursin(kraken_db_regex, candidate) && occursin(r"kraken-report\.tsv$", candidate)
    end
    # the first match (in readdir order) is used; errors if the run has no matching report
    first(matching_reports)
end

# create a full joint table so that we can subset dynamically down below without needing to re-read all of them over and over again
joint_report_table = DataFrames.DataFrame()
ProgressMeter.@showprogress for kraken_report in kraken_reports
    report_table = Mycelia.read_kraken_report(kraken_report)
    # remember which file every row came from
    report_table[!, "report"] = fill(basename(kraken_report), DataFrames.nrow(report_table))
    append!(joint_report_table, report_table)
end
# "<taxid>_<scientific name>" label for each row
joint_report_table[!, "taxon"] =
    string.(joint_report_table[!, "ncbi_taxonid"]) .* "_" .* joint_report_table[!, "scientific_name"]
# everything before the first '.' of the report file name; joined against "Run" further down
joint_report_table[!, "sample_identifier"] = map(joint_report_table[!, "report"]) do report_name
    string(first(split(report_name, '.')))
end
joint_report_table

In [None]:
# StatsPlots.plotlyjs()
#default
# StatsPlots.gr()

In [None]:
# For ranks 3 through 8 of `Mycelia.list_ranks()`: restrict the kraken results to
# viral taxa at that rank, build a taxa x samples relative-abundance matrix, run
# PCA on it, and save one PC1/PC2 scatter plot per metadata feature into
# results_dir.
taxon_levels = Mycelia.list_ranks()
viral_tax_ids = Mycelia.list_subtaxa(10239)
# Set copy so that the per-row "viral only" membership test is a hash lookup
# regardless of the container type returned by list_subtaxa.
viral_tax_id_set = Set(viral_tax_ids)

# Loop-invariant selections, built once instead of on every iteration.
columns_of_interest = [
    "Run",
    "altitude",
    "geo_loc_name",
    "location",
    "geo",
    "geo2",
    "duration",
    "date.month",
    "season",
    "particle",
    "temperature",
    "humid",
    "weekend",
    "aownership",
    "latitude",
    "longitude",
    "ncbi_taxonid",
    "scientific_name",
    "taxon",
    "number_of_fragments_at_or_below_taxon",
]

# Metadata columns (after renaming) that get their own colored plot.
feature_columns = [
    # "geo_location",
    # "location",
    "region",
    # "month(1-12)",
    "season",
    # "weekend",
    "participant"
]

# Make sure the output directory exists before any plot is saved into it.
mkpath(results_dir)

for rank_level in 3:8

    taxon_index = rank_level
    taxon_level = taxon_levels[rank_level]
    println("$(taxon_index) - $(taxon_level)")
    rank_table = Mycelia.list_rank(taxon_level)

    # filter the kraken results to only those at this level
    taxids_at_this_rank = Set(rank_table[!, "taxid"])
    at_this_rank = in.(joint_report_table[!, "ncbi_taxonid"], Ref(taxids_at_this_rank))
    rank_report_table = joint_report_table[at_this_rank, :]
    rank_joint_table = DataFrames.innerjoin(joint_metadata, rank_report_table, on="Run" => "sample_identifier")
    # find all columns with invariant metadata, and drop them
    # NOTE(review): if a column listed in columns_of_interest happens to be
    # invariant it is removed here and the selection below fails - confirm intended.
    rank_joint_table = rank_joint_table[!, [n for n in names(rank_joint_table) if length(unique(rank_joint_table[!, n])) > 1]]

    # println("[")
    # for n in names(rank_joint_table)
    #     println("\t\"$(n)\",")
    # end
    # println("]")

    rank_joint_table = rank_joint_table[!, columns_of_interest]

    # viral only
    is_viral = in.(rank_joint_table[!, "ncbi_taxonid"], Ref(viral_tax_id_set))
    rank_joint_viral_table = rank_joint_table[is_viral, :]
    # require at least 3 reads of support
    rank_joint_viral_table = rank_joint_viral_table[rank_joint_viral_table[!, "number_of_fragments_at_or_below_taxon"] .>= 3, :]

    unique_samples = unique(sort(rank_joint_viral_table[!, "Run"]))
    unique_taxa = unique(sort(rank_joint_viral_table[!, "taxon"]))
    if isempty(unique_samples)
        @warn "No viral taxa with >= 3 supporting fragments at rank $(taxon_level); skipping"
        continue
    end
    sample2index = Dict(s => i for (i, s) in enumerate(unique_samples))
    taxa2index = Dict(t => i for (i, t) in enumerate(unique_taxa))

    # rows are taxa, columns are samples
    counts_matrix = zeros(length(unique_taxa), length(unique_samples))
    for row in DataFrames.eachrow(rank_joint_viral_table)
        taxon_row = taxa2index[row["taxon"]]
        sample_column = sample2index[row["Run"]]
        counts_matrix[taxon_row, sample_column] = row["number_of_fragments_at_or_below_taxon"]
    end
    # normalise every sample (column) to sum to 1; every sample kept above has at
    # least one count >= 3, so the column sums are never zero
    relative_abundance_matrix = counts_matrix ./ sum(counts_matrix, dims=1)

    fit_pca = MultivariateStats.fit(MultivariateStats.PCA, relative_abundance_matrix)
    # components x samples, column order identical to unique_samples
    transformed_observations = MultivariateStats.predict(fit_pca, relative_abundance_matrix)

    # PCA only keeps as many components as it needs to reach its variance
    # threshold, so PC2 and PC3 are not guaranteed to exist.
    n_components = size(transformed_observations, 1)
    if n_components < 2
        @warn "PCA retained only $(n_components) component(s) at rank $(taxon_level); skipping 2D plots"
        continue
    end

    rank_joint_viral_table = DataFrames.rename(rank_joint_viral_table,
        [
            "geo_loc_name" => "geo_location",
            "geo2" => "region",
            "date.month" => "month(1-12)",
            "aownership" => "participant"
        ]
    )

    for feature in feature_columns
        # one row per distinct (Run, feature value) pair
        sample2feature_table = sort(unique(rank_joint_viral_table[!, ["Run", feature]]))

        unique_features = sort(unique(rank_joint_viral_table[!, feature]))
        # seed with white and black (and drop them) so neither is used as a series color
        colorscheme = Colors.distinguishable_colors(length(unique_features), [Colors.RGB(1,1,1), Colors.RGB(0,0,0)], dropseed=true)
        feature2index = Dict(f => i for (i, f) in enumerate(unique_features))

        # one coordinate series per feature value
        xs = [Float64[] for _ in unique_features]
        ys = [Float64[] for _ in unique_features]
        zs = [Float64[] for _ in unique_features]

        for row in DataFrames.eachrow(sample2feature_table)
            feature_index = feature2index[row[feature]]
            # Look the PCA column up by sample rather than by the row number of
            # sample2feature_table: the two only coincide when every run has
            # exactly one value for this feature.
            sample_index = sample2index[row["Run"]]
            push!(xs[feature_index], transformed_observations[1, sample_index])
            push!(ys[feature_index], transformed_observations[2, sample_index])
            # PC3 is only needed for the (currently disabled) 3D variant
            if n_components >= 3
                push!(zs[feature_index], transformed_observations[3, sample_index])
            end
        end

        pca_plot =
        StatsPlots.scatter(
            xs,
            ys,
            # zs,
            xlabel = "PC1",
            ylabel = "PC2",
            # zlabel = "PC3",
            labels = permutedims(unique_features),
            title = "Viral abundance profiles - kraken - $(kraken_db) - $(taxon_level) - $(feature)",
            # legend = :outertopright,
            size = (1920/2, 1080/2),
            margins = 10StatsPlots.px,
            seriescolor = permutedims(colorscheme),
            # alpha=0.5,
            # size=1,
            markersize=3,
            dpi=300
        )

        display(pca_plot)
        for extension in [".png"]
            file = joinpath(results_dir, "taxonomic-breakdowns.kraken.$(kraken_db).$(taxon_index).$(taxon_level).$(feature).2d.pca") * extension
            StatsPlots.savefig(pca_plot, file)
        end
    end
end