In [None]:
# Fail fast unless LD_LIBRARY_PATH is empty. `get` with a default is used so
# that an unset variable counts as empty instead of raising a KeyError.
isempty(get(ENV, "LD_LIBRARY_PATH", "")) ||
    error("LD_LIBRARY_PATH must be empty, got: ", repr(ENV["LD_LIBRARY_PATH"]))
import Pkg
# Packages this notebook imports by name.
pkgs = [
    "Revise",
    "DataFrames",
    "uCSV"
]
# Uncomment to install the packages listed above.
# Pkg.add(pkgs)
for pkg in pkgs
    # `import` needs a literal module name, so the statement is built per package.
    @eval import $(Symbol(pkg))
end
import Mycelia

In [None]:
# The project root is the parent of the current working directory; the `data`
# directory beneath it is created if it does not exist yet.
project_dir = pwd() |> dirname
data_dir = joinpath(project_dir, "data") |> mkpath

In [None]:
# Directory for the viral reference FASTA files, created under `data_dir` if missing.
viral_reference_fastas_dir = joinpath(data_dir, "viral-reference-fastas") |> mkpath

In [None]:
# Collect the gzipped FASTA files (`.fna.gz` / `.fasta.gz`) in the reference
# directory as full paths, then order them from smallest to largest file.
reference_fastas = filter(readdir(viral_reference_fastas_dir, join=true)) do path
    occursin(r"\.f(na|asta)\.gz$", path)
end
sort!(reference_fastas, by=filesize)

In [None]:
"""
    read_tsv_table(path)

Read the tab-delimited file at `path` with `uCSV.read` (header on line 1,
`typedetectrows=300`) and wrap the result in a `DataFrames.DataFrame`.
"""
read_tsv_table(path) = DataFrames.DataFrame(
    uCSV.read(path; delim='\t', header=1, typedetectrows=300)
)

# Log directory passed to the SLURM submissions later in the notebook.
slurm_logdir = mkpath("$(homedir())/workspace/slurmlogs")
exposome_metadata_dir = joinpath(project_dir, "metadata")

exposome_environmental_data = read_tsv_table(
    joinpath(exposome_metadata_dir, "metadata_exposome.rds.tsv")
)
joint_sample_metadata = read_tsv_table(
    joinpath(exposome_metadata_dir, "exposome/joint_sample_metadata.tsv")
)

# The two library-name columns must agree row for row before one of them is
# used as the join key.
@assert joint_sample_metadata[!, "Library Name"] == joint_sample_metadata[!, "LibraryName"]

# Keep only the samples that appear in both tables.
joint_metadata = DataFrames.innerjoin(
    joint_sample_metadata,
    exposome_environmental_data;
    on = "Library Name" => "samplenames",
)

# One directory per run under <project>/data/SRA, in sorted order.
sample_directories = sort(
    [joinpath(project_dir, "data", "SRA", run_id) for run_id in joint_metadata[!, "Run"]]
)

In [None]:
# Make sketches for references.
# Select the reference FASTA to work with: exactly one `reference_fasta`
# assignment below should be active at a time.
#
# Option 1 (active): IMG/VR high-confidence nucleotide sequences.
# Recorded timing, presumably for the sketch command at the bottom of this cell:
# 7384.737911 seconds (338.18 k allocations: 15.754 MiB, 0.00% compilation time)
# ~2 hours
reference_fasta = "/oak/stanford/scg/lab_mpsnyder/cjprybol/Mycelia/projects/viral-exposome/data/viral-reference-fastas/IMGVR_all_nucleotides-high_confidence.fna.gz"
# Option 2 (disabled): nt_viruses.
# 16107.376116 seconds (654.58 k allocations: 28.929 MiB, 0.00% gc time)
# ~4 hours
# reference_fasta = "/oak/stanford/scg/lab_mpsnyder/cjprybol/Mycelia/projects/viral-exposome/data/viral-reference-fastas/nt_viruses.fasta.gz"
#
# Sketch command (disabled).
# NOTE(review): this writes `<reference_fasta>.sig`, whereas the following cell
# expects `<reference_fasta>.sourmash-sketch.zip` — confirm which one is current.
# @time run(`$(Mycelia.MAMBA) run --live-stream -n sourmash sourmash sketch dna -p scaled=1000,k=31 $(reference_fasta) -o $(reference_fasta).sig`)

In [None]:
# Path of the reference sketch: the reference FASTA path plus a
# `.sourmash-sketch.zip` suffix.
sourmash_sketch = string(reference_fasta, ".sourmash-sketch.zip")

In [None]:
# For each selected sample, hand a `sourmash sketch dna` command for its paired
# trim_galore read files to `Mycelia.sbatch`, unless a non-empty sketch file
# already exists for that sample.
# 1-4 already submitted for IMG/VR, none for nt_viral
for sample in sample_directories[1:1]
    run_id = basename(sample)
    trimmed_dir = joinpath(sample, "trim_galore")
    forward_reads = joinpath(trimmed_dir, "$(run_id)_1_val_1.fq.gz")
    # Named `reverse_reads` so that `Base.reverse` is not shadowed.
    reverse_reads = joinpath(trimmed_dir, "$(run_id)_2_val_2.fq.gz")
    outdir = mkpath(joinpath(sample, "sourmash"))
    # NOTE(review): `sample_id` embeds the reference sketch's file name, which
    # already ends in `.sourmash-sketch.zip`, so `file_name` carries that suffix
    # twice. Left unchanged so existing outputs keep their names — confirm intent.
    sample_id = run_id * "." * basename(sourmash_sketch)
    file_name = sample_id * ".sourmash-sketch.zip"
    file_path = joinpath(outdir, file_name)
    # A missing or zero-byte file is treated as "not sketched yet".
    if !isfile(file_path) || filesize(file_path) == 0
        cmd =
        """
        $(Mycelia.MAMBA) run --live-stream -n sourmash sourmash sketch dna -p k=17,k=23,k=31,abund $(forward_reads) $(reverse_reads) --name $(sample_id) -o $(file_path)
        """
        # @info "submitting `$cmd`"
        Mycelia.sbatch(
            job_name = file_name,
            mail_user = "cameron.prybol@gmail.com",
            logdir = slurm_logdir,
            partition = "batch",
            account = "mpsnyder",
            mem_gb = 16,
            cpus_per_task = 2,
            cmd = cmd)
    else
        # Report the existing sketch; the previous message interpolated an
        # undefined variable and raised UndefVarError on this branch.
        @info "$(file_path) already present..."
    end
end

In [None]:
# @time run(`$(Mycelia.MAMBA) run --live-stream -n sourmash sourmash gather -o $(file_path).sourmash-gather.csv $(file_path) $(sourmash_sketch)`)

In [None]:
# # sketch reads for each sample

# sourmash sketch dna -p k=21,k=31,k=51,abund metagenome.fq.gz

# sourmash sketch dna -p k=31 sample_R1.fq.gz sample_R2.fq.gz --name "sample" -o sample.zip

# gunzip -c sample_R?.fq.gz | sourmash sketch dna -p k=31 - \
#     -o sample.zip

In [None]:
# # merge all of the read sketches
# sourmash signature merge file1.sig file2.sig -o merged.sig

In [None]:
# gather each sample

In [None]:
# 

In [None]:
# gather the merged signature

In [None]:
# review outputs

In [None]:
# filter and then map reads if I want more detail