The objective of this notebook is to load the outputs of all six classification methods:
- mmseqs
    - UniRef50
    - UniRef90
    - UniRef100
    
- geNomad
- kraken
- blast

- and then for each sample and each assembled contig, determine what the call was for each tool
- visualize a correlation matrix showing the average agreement and disagreement rates between tools

In [1]:
import Pkg

# Registered packages this notebook imports by name. Un-comment the two Pkg
# calls below to install them into a throwaway environment first.
pkgs = ["Revise", "DataFrames", "StatsBase", "ProgressMeter", "uCSV"]
# Pkg.activate(; temp=true)
# Pkg.add(pkgs)

# `import` (rather than `using`) keeps every call module-qualified,
# e.g. `DataFrames.DataFrame`.
foreach(pkgs) do pkg
    eval(Meta.parse("import $pkg"))
end
import Mycelia

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mPrecompiling Mycelia [453d265d-8292-4a7b-a57c-dce3f9ae6acd]
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mSkipping precompilation since __precompile__(false). Importing Mycelia [453d265d-8292-4a7b-a57c-dce3f9ae6acd].


In [None]:
# Load the two metadata tables from `<parent of working dir>/metadata` and
# inner-join them into `joint_metadata`.
metadata_dir = joinpath(dirname(pwd()), "metadata")

# Tab-delimited table whose first row is the header.
exposome_environmental_data = DataFrames.DataFrame(uCSV.read(
    joinpath(metadata_dir, "metadata_exposome.rds.tsv");
    delim='\t',
    header=1,
    typedetectrows=300
))

# Tab-delimited table whose first row is the header.
joint_sample_metadata = DataFrames.DataFrame(uCSV.read(
    joinpath(metadata_dir, "exposome/joint_sample_metadata.tsv");
    delim='\t',
    header=1,
    typedetectrows=300
))

# The join below keys on "Library Name", so make sure it agrees with the
# "LibraryName" column of the same table. This is a check on loaded data, so it
# throws explicitly instead of using `@assert`, which may be disabled.
# `isequal` is used so that `missing` entries compare as a plain Bool.
if !isequal(
    joint_sample_metadata[!, "Library Name"],
    joint_sample_metadata[!, "LibraryName"],
)
    error(
        "joint_sample_metadata: columns \"Library Name\" and \"LibraryName\" " *
        "do not contain identical values",
    )
end

# Inner join: only rows whose "Library Name" matches a "samplenames" value in
# the environmental table are kept.
joint_metadata = DataFrames.innerjoin(
    joint_sample_metadata,
    exposome_environmental_data;
    on="Library Name" => "samplenames"
)

In [2]:
# The project root is taken to be the parent of the current working directory.
base_dir = dirname(pwd())
data_dir = joinpath(base_dir, "data")

# Full path of every entry found directly under `data/SRA`.
sample_directories = let sra_dir = joinpath(data_dir, "SRA")
    readdir(sra_dir; join=true)
end

594-element Vector{String}:
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399459"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399460"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399461"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399462"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399463"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399464"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399465"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399466"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399467"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399468"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/SRR6399469"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 39 bytes ⋯ "e-discovery/data/SRA/

In [3]:
# Gather the mmseqs easy-taxonomy LCA report paths from every sample directory.
#
# A report is a file inside `<sample>/mmseqs_easy_taxonomy` whose *file name*
# contains "final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy." and ends with
# "_lca.tsv". Matching on the file name (not the whole path) and anchoring the
# suffix keeps out look-alikes such as "..._lca.tsv.tmp" and avoids accidental
# matches against parent directory names.
easy_taxonomy_lca_reports = String[]
for sample_directory in sample_directories
    mmseqs_directory = joinpath(sample_directory, "mmseqs_easy_taxonomy")
    # Skip (loudly) any sample without mmseqs output rather than aborting the
    # whole scan from inside `readdir`.
    if !isdir(mmseqs_directory)
        @warn "No mmseqs_easy_taxonomy directory found; skipping sample" sample_directory
        continue
    end
    sample_lca_reports = filter(readdir(mmseqs_directory; join=true)) do path
        name = basename(path)
        return occursin("final.contigs.fastg.gfa.fna.mmseqs_easy_taxonomy.", name) &&
               endswith(name, "_lca.tsv")
    end
    append!(easy_taxonomy_lca_reports, sample_lca_reports)
end
easy_taxonomy_lca_reports

1782-element Vector{String}:
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 128 bytes ⋯ "asy_taxonomy.UniRef100_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRef50_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRef90_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 128 bytes ⋯ "asy_taxonomy.UniRef100_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRef50_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRef90_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 128 bytes ⋯ "asy_taxonomy.UniRef100_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRef50_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRef90_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 128 bytes ⋯ "asy_taxonomy.UniRef100_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRef50_lca.tsv"
 "/oak/stanford/scg/lab_mpsnyder/" ⋯ 127 bytes ⋯ "easy_taxonomy.UniRe

In [4]:
# Stack every LCA report into one long table, tagging each row with the sample
# and the method it came from.
joint_lca_table = DataFrames.DataFrame()
ProgressMeter.@showprogress for report_path in easy_taxonomy_lca_reports
    # Method label = file name with the assembly prefix and "_lca.tsv" removed.
    report_name = basename(report_path)
    without_prefix = replace(report_name, "final.contigs.fastg.gfa.fna." => "")
    method_label = replace(without_prefix, "_lca.tsv" => "")

    # Sample label = name of the directory two levels above the report file.
    sample_label = basename(dirname(dirname(report_path)))

    report_table = Mycelia.parse_mmseqs_easy_taxonomy_lca_tsv(report_path)
    report_table[!, "sample"] .= sample_label
    report_table[!, "method"] .= method_label
    append!(joint_lca_table, report_table)
end
joint_lca_table

Progress: 100%|█████████████████████████████████████████| Time: 0:05:00


Row,contig_id,taxon_id,taxon_rank,taxon_name,fragments_retained,fragments_taxonomically_assigned,fragments_in_agreement_with_assignment,support -log(E-value),sample,method
Unnamed: 0_level_1,Int64,Int64,String,String,Int64,Int64,Int64,Float64,String,String
1,6611,147550,class,Sordariomycetes,1,1,1,1.0,SRR6399459,mmseqs_easy_taxonomy.UniRef100
2,6612,229535,species,Penicillium nordicum,1,1,1,1.0,SRR6399459,mmseqs_easy_taxonomy.UniRef100
3,6617,470,species,Acinetobacter baumannii,1,1,1,1.0,SRR6399459,mmseqs_easy_taxonomy.UniRef100
4,6620,9597,species,Pan paniscus,3,2,1,0.61,SRR6399459,mmseqs_easy_taxonomy.UniRef100
5,6621,469,genus,Acinetobacter,1,1,1,1.0,SRR6399459,mmseqs_easy_taxonomy.UniRef100
6,6622,286661,species,Peltaster fructicola,3,3,1,0.91,SRR6399459,mmseqs_easy_taxonomy.UniRef100
7,6623,207598,subfamily,Homininae,5,5,3,0.61,SRR6399459,mmseqs_easy_taxonomy.UniRef100
8,6625,5139,order,Sordariales,1,1,1,1.0,SRR6399459,mmseqs_easy_taxonomy.UniRef100
9,6626,1,no rank,root,5,4,4,1.0,SRR6399459,mmseqs_easy_taxonomy.UniRef100
10,6628,222544,subclass,Sordariomycetidae,8,6,4,0.75,SRR6399459,mmseqs_easy_taxonomy.UniRef100
