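The objectives of this notebook are to:
- join all of the Kraken results with the sample metadata
- subset to the samples that have metadata
- collapse P3 and beyond into "Other", since they have too few samples to analyze on their own
- run PCA, coloring the samples by metadata
- fit linear models (GLM) to determine whether any metadata features are significantly associated with taxon abundance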

In [None]:
# don't try and install plotting libraries unless LD_LIBRARY_PATH is empty
# can set in ~/.local/share/jupyter/kernels/
# An unset variable is treated the same as an empty one; indexing ENV directly
# would raise a KeyError before the check could even run.
isempty(get(ENV, "LD_LIBRARY_PATH", "")) ||
    error("LD_LIBRARY_PATH must be empty, got \"$(ENV["LD_LIBRARY_PATH"])\"")
import Pkg
pkgs = [
    "Revise",
    "DataFrames",
    "StatsBase",
    "StatsPlots",
    "uCSV",
    "ProgressMeter",
    "Distances",
    "Clustering",
    "Colors",
    "MultivariateStats",
    "Dates",
    "CategoricalArrays",
    "GLM",
    "Statistics"
]
# Pkg.add(pkgs)
# Import every package by name; interpolating a Symbol into the import
# expression avoids building and parsing source code from strings.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end
import Mycelia

In [None]:
# Input data directory: the `data` folder inside the parent of the current working directory.
data_dir = joinpath(dirname(pwd()), "data")

In [None]:
# Output directory for result tables, nested under `data_dir`.
results_dir = joinpath(data_dir, "results")

In [None]:
# Load the two metadata tables, join them per library, and clean up column types.
metadata_dir = joinpath(dirname(pwd()), "metadata")

# Both metadata files are tab-delimited with a single header row.
read_metadata_tsv(path) = DataFrames.DataFrame(
    uCSV.read(path, delim='\t', header=1, typedetectrows=300)
)

exposome_environmental_data = read_metadata_tsv(joinpath(metadata_dir, "metadata_exposome.rds.tsv"))
joint_sample_metadata = read_metadata_tsv(joinpath(metadata_dir, "exposome/joint_sample_metadata.tsv"))

# the two library-name columns are expected to be exact duplicates of each other
@assert joint_sample_metadata[!, "Library Name"] == joint_sample_metadata[!, "LibraryName"]
joint_metadata = DataFrames.innerjoin(
    joint_sample_metadata,
    exposome_environmental_data;
    on = "Library Name" => "samplenames",
);

# # recode P3 and beyond to Other, since they don't have enough samples to do much analysis on
# joint_metadata[!, "aownership"] = map(x -> x in Set(["P1", "P2"]) ? x : "Others", joint_metadata[!, "aownership"])

# parse the sampling window into dates and derive how long each sample was collected
for date_column in ("date.start", "date.end")
    joint_metadata[!, date_column] = Dates.Date.(joint_metadata[!, date_column], "yyyy-mm-dd")
end
# use this if we want to unified axis across all participants, which I don't think we do
# joint_metadata[!, "date.start_relative"] = joint_metadata[!, "date.start"] .- first(joint_metadata[!, "date.start"])
# joint_metadata[!, "date.end_relative"] = joint_metadata[!, "date.end"] .- first(joint_metadata[!, "date.start"])
joint_metadata[!, "duration"] = joint_metadata[!, "date.end"] .- joint_metadata[!, "date.start"]

# numeric measurements arrive as text; anything that does not parse becomes `missing`
for numeric_column in ("temperature", "humid", "particle", "latitude", "longitude", "altitude")
    parsed_values = tryparse.(Float64, joint_metadata[!, numeric_column])
    joint_metadata[!, numeric_column] = something.(parsed_values, missing)
end
DataFrames.rename!(joint_metadata, "date.month" => "month")

# keep only the rows that are complete across every column
joint_metadata = DataFrames.dropmissing(joint_metadata)

In [None]:
# Locate one kraken report per sample for the chosen database and stack them all
# into a single table, tagged with the report file name and sample identifier.
sample_paths = sort(joinpath.(data_dir, "SRA", joint_metadata[!, "Run"]))
kraken_db = "k2_pluspfp"
# database name followed by an 8-digit suffix
kraken_db_regex = Regex("$(kraken_db)_\\d{8}")
kraken_reports = map(sample_paths) do sample_path
    kraken_dir = joinpath(sample_path, "kraken")
    candidates = filter(readdir(kraken_dir, join=true)) do file
        occursin(kraken_db_regex, file) && occursin(r"kraken-report\.tsv$", file)
    end
    # fail with the offending directory instead of an opaque BoundsError from `first`
    isempty(candidates) && error("no $(kraken_db) kraken report found in $(kraken_dir)")
    # when several reports match, the first one in `readdir` order is used
    first(candidates)
end

# create a full joint table so that we can subset dynamically down below without needing to re-read all of them over and over again
joint_report_table = DataFrames.DataFrame()
ProgressMeter.@showprogress for kraken_report in kraken_reports
    report_table = Mycelia.read_kraken_report(kraken_report)
    report_table[!, "report"] .= basename(kraken_report)
    append!(joint_report_table, report_table)
end
# "<taxid>_<scientific name>" label for each row
joint_report_table[!, "taxon"] = map(
    row -> string(row["ncbi_taxonid"]) * "_" * row["scientific_name"],
    DataFrames.eachrow(joint_report_table),
)
# the sample identifier is the part of the report file name before the first '.'
joint_report_table[!, "sample_identifier"] = string.(first.(split.(joint_report_table[!, "report"], '.')))
joint_report_table

In [None]:
# StatsPlots.plotlyjs()
#default
# StatsPlots.gr()

In [None]:
# Empty accumulator for the modelling results: one row per (rank, taxon, model term)
# holding that term's unadjusted p-value.
linear_model_results = DataFrames.DataFrame(
    "rank" => String[],
    "taxon" => String[],
    "feature" => String[],
    "rawpvalue" => Float64[],
)

In [None]:
# For each taxonomic rank (levels 3 through 8 of `Mycelia.list_ranks()`):
#   1. keep the kraken rows whose taxon sits at that rank and join them to the metadata
#   2. restrict to viral taxa with at least `min_fragment_support` fragments
#   3. convert fragment counts to within-sample proportions
#   4. fit one linear model per taxon and append every coefficient's p-value
#      to `linear_model_results`
taxon_levels = Mycelia.list_ranks()
ranked_levels = collect(taxon_levels)
# subtaxa of taxid 10239, used below as the set of viral taxa
viral_tax_ids = Mycelia.list_subtaxa(10239)
# Set membership keeps the per-row viral filter cheap regardless of how the IDs are stored
viral_tax_id_set = Set(viral_tax_ids)

# require at least this many reads of support
min_fragment_support = 3

columns_of_interest = [
    "Run",
    "altitude",
    "geo_loc_name",
    "location",
    "geo",
    "geo2",
    "duration",
    "month",
    "season",
    "particle",
    "temperature",
    "humid",
    "weekend",
    "aownership",
    "latitude",
    "longitude",
    "ncbi_taxonid",
    "scientific_name",
    "taxon",
    "number_of_fragments_at_or_below_taxon",
]

categorical_columns = [
    "geo_loc_name",
    "location",
    "geo2",
    "month",
    "season",
    "weekend",
    "aownership"
]

for rank_level in 3:8

    taxon_index = rank_level
    taxon_level = ranked_levels[rank_level]
    println("$(taxon_index) - $(taxon_level)")
    rank_table = Mycelia.list_rank(taxon_level)

    # filter the kraken results to only those at this level
    taxids_at_this_rank = Set(rank_table[!, "taxid"])
    at_this_rank = in.(joint_report_table[!, "ncbi_taxonid"], Ref(taxids_at_this_rank))
    rank_report_table = joint_report_table[at_this_rank, :]
    rank_joint_table = DataFrames.innerjoin(joint_metadata, rank_report_table, on="Run" => "sample_identifier")
    # Select the fixed analysis columns directly. Dropping invariant columns first
    # added nothing and made this selection throw whenever one of them was invariant.
    rank_joint_table = rank_joint_table[!, columns_of_interest]

    # viral only, with enough read support
    is_viral = in.(rank_joint_table[!, "ncbi_taxonid"], Ref(viral_tax_id_set))
    has_support = rank_joint_table[!, "number_of_fragments_at_or_below_taxon"] .>= min_fragment_support
    rank_joint_viral_table = rank_joint_table[is_viral .& has_support, :]

    if DataFrames.nrow(rank_joint_viral_table) == 0
        # nothing to model at this rank; skip it rather than abort the remaining ranks
        @warn "no supported viral taxa at this rank; skipping" rank = "$(taxon_index)-$(taxon_level)"
        continue
    end

    # Convert counts to per-sample proportions. Each group is copied first so the
    # new column is added to the copy rather than to the parent table via a view.
    normalized_rank_joint_viral_table = DataFrames.DataFrame()
    for run_rows in DataFrames.groupby(rank_joint_viral_table, "Run")
        run_table = DataFrames.DataFrame(run_rows)
        fragment_counts = run_table[!, "number_of_fragments_at_or_below_taxon"]
        run_table[!, "proportion_of_fragments_at_or_below_taxon"] = fragment_counts ./ sum(fragment_counts)
        append!(normalized_rank_joint_viral_table, run_table)
    end
    # drop the raw counts and the identifier columns
    normalized_rank_joint_viral_table = normalized_rank_joint_viral_table[
        !,
        DataFrames.Not(["number_of_fragments_at_or_below_taxon", "ncbi_taxonid", "scientific_name", "Run"]),
    ]

    for categorical_column in categorical_columns
        normalized_rank_joint_viral_table[!, categorical_column] = CategoricalArrays.categorical(normalized_rank_joint_viral_table[!, categorical_column])
    end

    taxon_tables = DataFrames.groupby(normalized_rank_joint_viral_table, "taxon")

    for taxon_table in taxon_tables
        taxon = taxon_table[1, "taxon"]
        try
            # full candidate formula, kept for reference:
            # model = GLM.lm(GLM.@formula(proportion_of_fragments_at_or_below_taxon ~ altitude + geo_loc_name + location + geo2 + duration + month + season + particle + temperature + humid + weekend +  aownership + latitude + longitude ), taxon_table)
            model = GLM.lm(GLM.@formula(proportion_of_fragments_at_or_below_taxon ~ geo2 + season + aownership), taxon_table)

            coefficients = GLM.coeftable(model)

            for (feature, pval) in zip(coefficients.rownms, coefficients.cols[coefficients.pvalcol])
                row = (
                    rank = "$(taxon_index)-$(taxon_level)",
                    taxon = taxon,
                    feature = feature,
                    rawpvalue = pval
                    )
                push!(linear_model_results, row)
            end
        catch err
            # let the user interrupt the cell; everything else is best-effort per taxon
            err isa InterruptException && rethrow()
            # report which taxon failed and why instead of printing only a row count
            @warn "linear model failed; skipping taxon" rank = "$(taxon_index)-$(taxon_level)" taxon n_rows = DataFrames.nrow(taxon_table) exception = (err, catch_backtrace())
        end
    end
end

In [None]:
# Bonferroni correction: multiply each raw p-value by the number of tests
# (rows in the results table) and cap at 1 so the result stays a valid probability.
n_tests = DataFrames.nrow(linear_model_results)
linear_model_results[!, "adjusted_pvalue"] = min.(1.0, linear_model_results[!, "rawpvalue"] .* n_tests)
# capping creates ties at 1.0, so break them with the raw p-value to keep a meaningful order
sort!(linear_model_results, ["adjusted_pvalue", "rawpvalue"])
linear_model_results

In [None]:
# Create the output directory if needed so the write cannot fail on a fresh checkout
# after the long modelling run; `mkpath` is a no-op when the directory already exists.
mkpath(results_dir)
# uCSV.write(joinpath(results_dir, "20240101.kraken-abundance-associations.linear-modelling-results.tsv"), linear_model_results, delim='\t')
uCSV.write(joinpath(results_dir, "20240101.kraken-abundance-associations.linear-modelling-results.1.tsv"), linear_model_results, delim='\t')

In [None]:
# bonferroni correct by # of tests
# sort by p-value

In [None]:
# # Evaluate the model
# predictions = GLM.predict(model, taxon_table)