In [None]:
# Run metadata: the date stamp and task name together name this notebook's output directory.
DATE = "2021-08-25"
TASK = "k-medoids-error-cluster-detection-multi-entity-graph-aligner-test"
# Create the output directory under the user's home directory (no-op if it already exists).
DIR = mkpath(string(homedir(), "/", DATE, "-", TASK))

In [None]:
import Pkg
pkgs = [
#     "BioAlignments",
    "BioSequences",
    "Clustering",
#     "CodecZlib",
#     "Colors",
#     "Combinatorics",
    "DataFrames",
#     "DataStructures",
    "Dates",
#     "DelimitedFiles",
    "Distances",
#     "EzXML",
    "FASTX",
#     "GFF3",
#     "HTTP",
#     "Impute",
#     "JSON",
    "LightGraphs",
    "LSHFunctions",
#     "Measures",
    "MetaGraphs",
    "https://github.com/cjprybol/Mycelia.git",
#     "NumericIO",
#     "PlotlyJS",
#     "Plots",
    "Primes",
#     "Printf",
    "ProgressMeter",
    "Random",
    "Revise",
    "SparseArrays",
    "Statistics",
    "StatsBase",
    "StatsPlots",
#     "StringDistances",
    "uCSV",
#     "XLSX",
]

# Split the package list: entries that start with "https" or end with "git" are treated as
# repository URLs; everything else is treated as a registered package name.
unregistered_packages = filter(pkg -> occursin(r"(^https|git$)", pkg), pkgs)
registered_packages = setdiff(pkgs, unregistered_packages)

# Import each registered package; if the import throws for any reason, add and build the
# package and then retry the import once.
for pkg in registered_packages
    try
        eval(Meta.parse("import $(pkg)"))
    catch
        Pkg.add(pkg)
        Pkg.build(pkg)
        eval(Meta.parse("import $(pkg)"))
    end
end

# Same import-or-install flow for URL-based packages, installed in development mode.
for pkg_url in unregistered_packages
    # The module name is the last URL component with ".git" removed.
    pkg_name = replace(basename(pkg_url), ".git" => "")
    try
        eval(Meta.parse("import $(pkg_name)"))
    catch
        Pkg.develop(url=pkg_url)
        Pkg.build(pkg_name)
        eval(Meta.parse("import $(pkg_name)"))
    end
end

In [None]:
# Bandage executable: the binary inside the macOS app bundle on Apple systems,
# otherwise the bare command name.
BANDAGE = Sys.isapple() ? "/Applications/Bandage.app/Contents/MacOS/Bandage" : "Bandage"

In [None]:
"""
    assess_kmer_sparsity_in_reads(k, observations)

Return the percentage (rounded to 3 significant digits) of the `4^k / 2` possible
canonical k-mers of length `k` that `Mycelia.count_canonical_kmers` reports for
`observations`.
"""
function assess_kmer_sparsity_in_reads(k, observations)
    canonical_kmer_counts = Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, observations)
    total_observed_canonical_kmers = length(keys(canonical_kmer_counts))
    # 4^k / 2 == 2^(2k - 1). Computed in floating point because the integer expression
    # `4^k` silently wraps around to 0 for k >= 32, which made the percentage `Inf`.
    total_possible_canonical_kmers = exp2(2k - 1)
    sparsity = round(total_observed_canonical_kmers / total_possible_canonical_kmers * 100, sigdigits=3)
    return sparsity
end

In [None]:
"""
    sequence_to_canonical_kmers(kmer_type, sequence)

Return a lazy generator over the canonical form of the forward (`.fw`) k-mer of every
element produced by `BioSequences.each(kmer_type, sequence)`.
"""
function sequence_to_canonical_kmers(kmer_type, sequence)
    kmer_iterator = BioSequences.each(kmer_type, sequence)
    return (BioSequences.canonical(mer.fw) for mer in kmer_iterator)
end

In [None]:
"""
    determine_edge_weights(graph)

Return a `Dict` keyed by strand (`true`, then `false`) whose values are the edge-weight
matrices computed by `determine_edge_weights(graph, strand)`.
"""
function determine_edge_weights(graph)
    return Dict(strand => determine_edge_weights(graph, strand) for strand in (true, false))
end

In [None]:
"""
    determine_edge_weights(graph, strand)

Build a sparse vertex-by-vertex matrix of edge weights for walks that leave each k-mer
in the given orientation (`strand == true` uses the stored k-mer, `false` uses its
reverse complement).

Entry `[i, j]` is set to `graph.eprops[Edge(i, j)][:weight]` when vertex `j` holds the
canonical form of a neighbor of the oriented k-mer `i`, the edge exists, and the edge's
`:orientations` contains `(source_orientation = strand, destination_orientation = ...)`.
All other entries stay zero.
"""
function determine_edge_weights(graph, strand)
    kmers = [graph.vprops[v][:kmer] for v in LightGraphs.vertices(graph)]
    outgoing_edge_weights = SparseArrays.spzeros(length(kmers), length(kmers))

    for (kmer_index, kmer) in enumerate(kmers)
        oriented_kmer = strand ? kmer : BioSequences.reverse_complement(kmer)

        for neighbor in BioSequences.neighbors(oriented_kmer)
            canonical_neighbor = BioSequences.canonical(neighbor)
            # true when the neighbor is already in canonical orientation
            neighbor_orientation = neighbor == canonical_neighbor

            # searchsorted needs `kmers` in sorted order — assumes vertices are numbered
            # in sorted k-mer order; TODO confirm against the graph constructor
            neighbor_index_range = searchsorted(kmers, canonical_neighbor)
            isempty(neighbor_index_range) && continue
            @assert length(neighbor_index_range) == 1
            neighbor_index = first(neighbor_index_range)

            edge = LightGraphs.Edge(kmer_index, neighbor_index)
            LightGraphs.has_edge(graph, edge) || continue

            edge_properties = graph.eprops[edge]
            this_edge_orientation = (source_orientation = strand, destination_orientation = neighbor_orientation)
            if this_edge_orientation in edge_properties[:orientations]
                outgoing_edge_weights[kmer_index, neighbor_index] = edge_properties[:weight]
            end
        end
    end
    return outgoing_edge_weights
end

In [None]:
"""
    determine_edge_probabilities(edge_weights)

Return a deep copy of `edge_weights` (a collection indexed by `true` and `false`, each
holding a weight matrix) in which every row has been divided by its row sum. Rows whose
sum is not positive are filled with `0.0`. The input is left unmodified.
"""
function determine_edge_probabilities(edge_weights)
    probabilities = deepcopy(edge_weights)
    for orientation in (true, false)
        matrix = probabilities[orientation]
        for row in 1:size(matrix, 1)
            # snapshot of the row taken before it is overwritten below
            row_weights = matrix[row, :]
            row_total = sum(row_weights)
            for (col, weight) in enumerate(row_weights)
                matrix[row, col] = row_total > 0 ? weight / row_total : 0.0
            end
        end
    end
    return probabilities
end

In [None]:
"""
    random_step(current_vertex, current_orientation, step_probabilities; graph = observation_graph)

Take one weighted random step from `current_vertex` while facing `current_orientation`.

The destination vertex is sampled from the stored entries of row `current_vertex` of
`step_probabilities[current_orientation]`, weighted by their values. The destination
orientation is drawn uniformly from the orientations recorded on the chosen edge whose
source orientation matches `current_orientation`.

Returns a named tuple `(vertex = ..., orientation = ...)`.

`graph` defaults to the global `observation_graph`, which is what this function always
read before the keyword was added.
"""
function random_step(current_vertex, current_orientation, step_probabilities; graph = observation_graph)
    # The original read the undefined name `initial_orientation` here.
    outgoing_edge_likelihoods = step_probabilities[current_orientation][current_vertex, :]
    chosen_step = StatsBase.sample(outgoing_edge_likelihoods.nzind, StatsBase.weights(outgoing_edge_likelihoods.nzval))
    possible_orientations = graph.eprops[LightGraphs.Edge(current_vertex, chosen_step)][:orientations]
    possible_orientations = filter(o -> o.source_orientation == current_orientation, possible_orientations)
    chosen_orientation = rand(possible_orientations).destination_orientation
    return (vertex = chosen_step, orientation = chosen_orientation)
end

In [None]:
"""
    random_walk(observation_graph, initial_vertex, initial_orientation, step_probabilities; n_steps)

Perform a weighted random walk and return the visited steps as a vector of
`(vertex, orientation)` named tuples (the starting vertex is not included).

At each step the next vertex is sampled from the stored entries of the current vertex's
row in `step_probabilities[orientation]`, and the next orientation is drawn uniformly
from the edge orientations whose source orientation matches the current one. The walk
stops after `n_steps` steps (default: the number of rows of `step_probabilities[true]`)
or as soon as the current row sums to zero.
"""
function random_walk(observation_graph, initial_vertex, initial_orientation, step_probabilities; n_steps=size(step_probabilities[true], 1))
    walk = Vector{NamedTuple{(:vertex, :orientation), Tuple{Int64, Bool}}}()
    position = initial_vertex
    facing = initial_orientation
    likelihoods = step_probabilities[facing][position, :]
    steps_taken = 0
    while steps_taken < n_steps && sum(likelihoods) != 0
        next_vertex = StatsBase.sample(likelihoods.nzind, StatsBase.weights(likelihoods.nzval))
        edge_orientations = observation_graph.eprops[LightGraphs.Edge(position, next_vertex)][:orientations]
        candidates = filter(o -> o.source_orientation == facing, edge_orientations)
        next_orientation = rand(candidates).destination_orientation
        push!(walk, (vertex = next_vertex, orientation = next_orientation))
        position = next_vertex
        facing = next_orientation
        likelihoods = step_probabilities[facing][position, :]
        steps_taken += 1
    end
    return walk
end

In [None]:
# km = vmax/2 = optimal number of reads
# slope is less than 10%?

In [None]:
# generate genomes
# One random DNA sequence per entry of `sequence_lengths`; the matching entry of `weights`
# is stored as the record description and later used as the sampling weight.
sequence_lengths = [100, 1000]
weights = [10, 1]
# Typed container instead of an untyped `[]` (which is a Vector{Any}).
fasta_records = FASTX.FASTA.Record[]
for (sequence_length, depth) in zip(sequence_lengths, weights)
    # The global RNG is re-seeded with the sequence length, so each sequence is reproducible.
    sequence = BioSequences.randdnaseq(Random.seed!(sequence_length), sequence_length)
    sequence_id = string(hash(sequence))
    description = string(depth)
    fasta_record = FASTX.FASTA.Record(sequence_id, description, sequence)
    push!(fasta_records, fasta_record)
end

In [None]:
# The file name encodes the length of every reference record, joined with underscores.
file_basename = join(("L-$(length(FASTX.sequence(record)))" for record in fasta_records), '_')
reference_fasta_file = "$(DIR)/$(file_basename).fasta"
# Write all reference records to a single FASTA file.
open(reference_fasta_file, "w") do io
    fasta_writer = FASTX.FASTA.Writer(io)
    foreach(fasta_record -> write(fasta_writer, fasta_record), fasta_records)
    close(fasta_writer)
end

In [None]:
# randomly sample reads with errors from the genome
error_rate = 0.01
n_reads = 10_000

In [None]:
# Each read is observed from a reference record drawn with probability proportional to `weights`.
observations = map(1:n_reads) do _
    template = StatsBase.sample(fasta_records, StatsBase.Weights(weights))
    Mycelia.observe(template, error_rate = error_rate)
end;

In [None]:
# determine distribution of read lengths

In [None]:
# Read lengths in ascending order.
sorted_read_lengths = sort(length.(FASTX.sequence.(observations)))

In [None]:
# Read length at the 1st percentile. The index is clamped to 1 because with fewer than
# 50 reads the rounded index is 0, which raised a BoundsError.
first_percentile_index = max(1, round(Int, length(sorted_read_lengths) * 0.01))
first_percentile_read_length = sorted_read_lengths[first_percentile_index]

In [None]:
# determine optimal k
# Candidate k values are the primes between 5 and 61, both capped at the 1st-percentile read length.
min_k = min(5, first_percentile_read_length)
max_k = min(61, first_percentile_read_length)
k_options = Primes.primes(min_k, max_k)
isempty(k_options) && error("no prime k available between $(min_k) and $(max_k)")
# sparsity_threshold = 0.01
# sparsity_threshold = 0.1
sparsity_threshold = 1
# sparsity_threshold = 10
# Pick the first (smallest) k whose observed percentage of possible canonical k-mers
# drops below the threshold.
k_index = nothing
for (i, k) in enumerate(k_options)
    sparsity = assess_kmer_sparsity_in_reads(k, observations)
    @show k, sparsity
    if sparsity < sparsity_threshold
        k_index = i
        break
    end
end
# k_index = findfirst(k -> assess_kmer_sparsity_in_reads(k, observations) <= sparsity_threshold, k_options)
# Fall back to the largest candidate when none met the threshold. `k_index` is updated too,
# so later cells that index `k_options[k_index]` keep working.
if isnothing(k_index)
    k_index = lastindex(k_options)
end
k = k_options[k_index]
@show k

In [None]:
# Write the simulated reads to a FASTQ file named after the reference and the read count.
observations_file = "$(DIR)/$(file_basename)-DEPTH-$(n_reads).fastq"
open(observations_file, "w") do io
    fastq_writer = FASTX.FASTQ.Writer(io)
    foreach(record -> write(fastq_writer, record), observations)
    close(fastq_writer)
end

# Begin resolving the observation graph

In [None]:
# assess kmer counts

In [None]:
# fit linear trendline to kmer counts

In [None]:
# apply thresholding when constructing kmer graph at the trendline

In [None]:
# Count canonical k-mers of the chosen length in the observations file.
# (A stray `k = k_options[k_index]` had been pasted into the middle of the function name.)
kmer_counts = Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, observations_file)

In [None]:
# Pairs of (k-mer count => number of k-mers observed that many times), ordered by count.
kmer_counts_histogram = sort(collect(StatsBase.countmap(values(kmer_counts))), by=first)

In [None]:
scale = 250
Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2 * scale, scale), log_scale=log2, title="kmer frequencies")

In [None]:
# log2 of every distinct k-mer count, plotted in ascending order
StatsPlots.scatter(log2.(map(first, kmer_counts_histogram)))

# Fit error cluster

In [None]:
# log2 of each distinct k-mer count in the histogram
kmer_depth_of_coverage_bins = log2.(first.(kmer_counts_histogram))

In [None]:
# Pairwise absolute differences between the log2 coverage bins (row = first bin, column = second bin)
distance_matrix = [
    abs(bin_1 - bin_2)
    for bin_1 in kmer_depth_of_coverage_bins, bin_2 in kmer_depth_of_coverage_bins
]

In [None]:
# distance_matrix = zeros((length(kmer_counts_histogram), length(kmer_counts_histogram)))
# for (row, kmer_count_pair_1) in enumerate(kmer_counts_histogram)
#     for (col, kmer_count_pair_2) in enumerate(kmer_counts_histogram)
# #         distance = abs(depth_of_coverage_bin_1 - depth_of_coverage_bin_2)
#         distance = Distances.euclidean([kmer_count_pair_1...], [kmer_count_pair_2...])
#         distance_matrix[row, col] = distance
#     end
# end
# distance_matrix

In [None]:
# Candidate cluster counts: primes from 2 to 17, capped at the number of coverage bins
# (the same cap the later, larger-k pass applies) so k-medoids is never asked for more
# clusters than there are points.
max_kmedoids_k = min(length(kmer_depth_of_coverage_bins), 17)
ks = Primes.primes(2, max_kmedoids_k)
# For each candidate, average the mean silhouette score over 100 k-medoids runs.
ys = map(ks) do n_clusters
    Statistics.mean(
        Statistics.mean(Clustering.silhouettes(Clustering.kmedoids(distance_matrix, n_clusters), distance_matrix))
        for i in 1:100
    )
end

In [None]:
StatsPlots.plot(ks, ys, label="silhouette score", ylabel = "silhouette score", xlabel = "number of clusters")

In [None]:
# Best average silhouette score and its position in `ys`
ymax, ymax_index = findmax(ys)

In [None]:
optimal_k = ks[ymax_index]

In [None]:
# Run k-medoids 10 times at the chosen cluster count and keep the run with the best mean silhouette score.
clusterings = map(_ -> Clustering.kmedoids(distance_matrix, optimal_k), 1:10)

max_value, max_value_index = findmax(clusterings) do clustering
    Statistics.mean(Clustering.silhouettes(clustering, distance_matrix))
end

In [None]:
optimal_clustering = clusterings[max_value_index]

In [None]:
optimal_clustering.assignments

In [None]:
# Cluster whose medoid has the smallest point index. The bins come from the count-sorted
# histogram, so this is the cluster with the lowest-coverage medoid.
min_medoid_value, min_medoid_index = findmin(optimal_clustering.medoids)

In [None]:
# true for every bin that is NOT assigned to that lowest cluster
indices_to_include = map(assignment -> assignment != min_medoid_index, optimal_clustering.assignments)

In [None]:
kmer_depth_of_coverage_bins

In [None]:
# One more than the largest k-mer count in the excluded cluster. The bins are log2 of
# integer counts, so `round` recovers the count exactly; `ceil` could overshoot by one
# when the log2/exp2 round trip lands slightly above the integer.
threshold = round(Int, exp2(maximum(kmer_depth_of_coverage_bins[.!indices_to_include]))) + 1

In [None]:
scale = 250
p = Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), log_scale = log2, size=(2scale,scale), title="kmer frequencies")
# mark the threshold on the log2-scaled frequency spectrum
StatsPlots.vline!(p, [log2(threshold)])

# raw graph

In [None]:
# # make me faster by counting edges ahead of time
# @time observation_graph = Mycelia.fastx_to_simple_kmer_graph(BioSequences.DNAMer{k}, observations_file)

In [None]:
# kmer_counts = Dict(observation_graph.vprops[v][:kmer] => observation_graph.vprops[v][:weight] for v in LightGraphs.vertices(observation_graph))
# total_observed_kmers = sum(values(kmer_counts))
# kmer_probabilities = Dict(k => v/total_observed_kmers for (k,v) in kmer_counts)
# scale = 250
# Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2scale,scale), title="kmer frequencies")

In [None]:
# reference_kmers = keys(Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, reference_fasta_file))
# distance_to_reference_graph = 1 - LSHFunctions.jaccard(Set(reference_kmers), Set(keys(kmer_counts)))

In [None]:
# kmer_counts_histogram = sort(collect(StatsBase.countmap(collect(values(kmer_counts)))), by=x->x[1])

In [None]:
# # visualize
# gfa_file = observations_file * ".k-$k.gfa"
# Mycelia.graph_to_gfa(observation_graph, gfa_file)

# run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# # --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# # --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# # --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
# html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

# x = "<img src=$(html_path_to_svg)>"
# display("text/html", x)
# # display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# threshold filtered graph

In [None]:
# make me faster by counting edges ahead of time
# Build the k-mer graph, passing the error-cluster threshold as the minimum coverage.
@time observation_graph = Mycelia.fastx_to_simple_kmer_graph(BioSequences.DNAMer{k}, observations_file, minimum_coverage=threshold)

In [None]:
# k-mer => weight for every vertex of the graph
kmer_counts = Dict(
    vertex_properties[:kmer] => vertex_properties[:weight]
    for vertex_properties in (observation_graph.vprops[v] for v in LightGraphs.vertices(observation_graph))
)
total_observed_kmers = sum(values(kmer_counts))
kmer_probabilities = Dict(kmer => count / total_observed_kmers for (kmer, count) in kmer_counts)
scale = 250
Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2 * scale, scale), title="kmer frequencies")

In [None]:
# One minus the Jaccard value between the reference k-mer set and the graph's k-mer set
reference_kmers = keys(Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, reference_fasta_file))
distance_to_reference_graph = 1 - LSHFunctions.jaccard(Set(reference_kmers), Set(keys(kmer_counts)))

In [None]:
kmer_counts_histogram = sort(collect(StatsBase.countmap(collect(values(kmer_counts)))), by=first)

In [None]:
# visualize
gfa_file = observations_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(observation_graph, gfa_file)

# Render the GFA to an SVG next to it with Bandage.
run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# Path to the SVG relative to the working directory. `relpath` replaces a hand-built chain
# of "../" that only worked when the home directory sat exactly two levels below the root.
html_path_to_svg = relpath("$(gfa_file).svg", pwd())

x = "<img src=$(html_path_to_svg)>"
display("text/html", x)
# display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# this didn't work because it uses longer minimizers than the kmers were set to?
# run(`GraphAligner -g $(gfa_file) -f $(observations_file) -a $(observations_file).gaf -x dbg`)

In [None]:
# try again using simplified graph

In [None]:
simplified_graph = Mycelia.simplify_kmer_graph(observation_graph)

In [None]:
# write simplified graph to gfa
# visualize
gfa_file = observations_file * ".k-$k.simplified.gfa"
Mycelia.graph_to_gfa(simplified_graph, gfa_file)

# Render the GFA to an SVG next to it with Bandage.
run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# Path to the SVG relative to the working directory. `relpath` replaces a hand-built chain
# of "../" that only worked when the home directory sat exactly two levels below the root.
html_path_to_svg = relpath("$(gfa_file).svg", pwd())

x = "<img src=$(html_path_to_svg)>"
display("text/html", x)
# display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# Align the reads to the simplified graph with the external GraphAligner tool; the
# alignments are written to "<observations_file>.gaf".
run(`GraphAligner -g $(gfa_file) -f $(observations_file) -a $(observations_file).gaf -x vg`)

In [None]:
# Same path that was passed to GraphAligner's -a option above.
alignments_file = "$(observations_file).gaf"

In [None]:
# conda install -c conda-forge -c bioconda bcalm

In [None]:
# Peek at the first alignment record.
line = readline(alignments_file)

In [None]:
# Show its tab-separated fields.
split(line, '\t')

In [None]:

# first(uCSV.read(alignments_file, delim='\t'))

In [None]:
# https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-graph-alignment-format-gaf
# Query sequence name
# Query sequence length
# Query start (0-based; closed)
# Query end (0-based; open)
# Strand relative to the path: "+" or "-"
# Path matching /([><][^\s><]+(:\d+-\d+)?)+|([^\s><]+)/
# Path length
# Start position on the path (0-based)
# End position on the path (0-based)
# Number of residue matches
# Alignment block length
# Mapping quality (0-255; 255 for missing)
#  "NM:i:0"
#  "AS:f:100"
#  "dv:f:0"
#  "id:f:1"
#  "cg:Z:100="

# Column names for the GAF table: twelve fixed fields followed by the five tag columns.
header = [
    "Query sequence name", "Query sequence length",
    "Query start (0-based; closed)", "Query end (0-based; open)",
    "Strand", "Path", "Path length",
    "Start position on the path (0-based)", "End position on the path (0-based)",
    "Number of residue matches", "Alignment block length",
    "Mapping quality (0-255; 255 for missing)",
    "NM", "AS", "dv", "id", "cg",
]
# uCSV.read returns a tuple; its first element holds the parsed columns.
mapping_results = DataFrames.DataFrame(first(uCSV.read(alignments_file, delim='\t')), header)

In [None]:
# take only the first entry for each read
alignments_by_read = DataFrames.groupby(mapping_results, "Query sequence name")
mapping_results = DataFrames.DataFrame([read_alignments[1, :] for read_alignments in alignments_by_read])

In [None]:
corrected_observations_file = replace(observations_file, ".fastq" => ".k$(k).fasta")

In [None]:
# Replace each aligned read with the graph sequence it aligned to and write the result as FASTA.
open(corrected_observations_file, "w") do io
    fastx_io = FASTX.FASTA.Writer(io)
    try
        # NOTE(review): only rows 10 through 20 are written, which looks like a leftover
        # debugging subset (and throws when there are fewer than 20 rows) — confirm.
        for mapping_result in DataFrames.eachrow(mapping_results[10:20, :])
            # Split a path such as ">12<7>3" into one token per oriented node.
            steps = split(replace(mapping_result["Path"], r"(>|<)" => s" \1"))
            oriented_steps = [(vertex = parse(Int, replace(step, r">|<" => "")), orientation = first(step) == '>') for step in steps]

            # Node sequences along the path; '<' nodes are reverse-complemented.
            path_sequences = BioSequences.LongDNASeq[]
            for step in oriented_steps
                seq = simplified_graph.vprops[step.vertex][:sequence]
                if !step.orientation
                    seq = BioSequences.reverse_complement(seq)
                end
                push!(path_sequences, seq)
            end
            # NOTE(review): node sequences are concatenated as-is; this presumes adjacent
            # nodes of the simplified graph do not overlap — confirm against graph_to_gfa.
            path_sequence = reduce(*, path_sequences)

            # GAF path coordinates are 0-based, end-exclusive, and refer to the path as
            # written, so slice first and only then flip for a "-" strand alignment.
            # (The original reverse-complemented the whole path before slicing.)
            start = mapping_result["Start position on the path (0-based)"] + 1
            stop = mapping_result["End position on the path (0-based)"]
            path_replacement = path_sequence[start:stop]
            if mapping_result["Strand"] == "-"
                path_replacement = BioSequences.reverse_complement(path_replacement)
            end

            # Identifier: hash of the sequence plus a random barcode, so that identical
            # sequences still get distinct record identifiers.
            sequence_hash = string(hash(path_replacement))
            barcode_id = string(hash(Dates.now(), hash(Random.randstring(32))))
            record_identifier = "$(sequence_hash)-$(barcode_id)"

            record = FASTX.FASTA.Record(
                record_identifier,
                mapping_result["Query sequence name"],
                path_replacement
            )
            write(fastx_io, record)
        end
    finally
        # close (and flush) the writer even if a record fails to convert
        close(fastx_io)
    end
end

In [None]:
# Build a k-mer graph from the corrected reads.
corrected_observation_graph = Mycelia.fastx_to_kmer_graph(BioSequences.DNAMer{k}, corrected_observations_file)

In [None]:
# visualize
gfa_file = corrected_observations_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(corrected_observation_graph, gfa_file)

# Render the GFA to an SVG next to it with Bandage.
run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# Path to the SVG relative to the working directory. `relpath` replaces a hand-built chain
# of "../" that only worked when the home directory sat exactly two levels below the root.
html_path_to_svg = relpath("$(gfa_file).svg", pwd())

x = "<img src=$(html_path_to_svg)>"
display("text/html", x)
# display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# k-mer => weight for every vertex of the corrected graph
kmer_counts = Dict(
    vertex_properties[:kmer] => vertex_properties[:weight]
    for vertex_properties in (corrected_observation_graph.vprops[v] for v in LightGraphs.vertices(corrected_observation_graph))
)
total_observed_kmers = sum(values(kmer_counts))
kmer_probabilities = Dict(kmer => count / total_observed_kmers for (kmer, count) in kmer_counts)
scale = 250
Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2 * scale, scale), title="kmer frequencies")

In [None]:
# compare the error-corrected graph against the error-free reference graph

In [None]:
# One minus the Jaccard value between the reference k-mer set and the corrected graph's k-mer set
reference_kmers = keys(Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, reference_fasta_file))
distance_to_reference_graph = 1 - LSHFunctions.jaccard(Set(reference_kmers), Set(keys(kmer_counts)))

# Repeat with the next larger k

In [None]:
# Start again from the k chosen by the sparsity scan. When no candidate met the threshold
# (`k_index` is `nothing`), use the largest candidate, as the scan itself does.
k = isnothing(k_index) ? k_options[end] : k_options[k_index]

In [None]:
# Advance to the next larger candidate, failing with a clear message when there is none
# instead of indexing `k_options` with `nothing`.
next_k_index = findfirst(new_k -> new_k > k, k_options)
isnothing(next_k_index) && error("no candidate k larger than $(k) in k_options")
k = k_options[next_k_index]

In [None]:
# The corrected reads become the input of the next round.
observations_file = corrected_observations_file

In [None]:
# Count canonical k-mers of the new length in the current observations file.
kmer_counts = Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, observations_file)

In [None]:
# Pairs of (k-mer count => number of k-mers observed that many times), ordered by count.
kmer_counts_histogram = sort(collect(StatsBase.countmap(values(kmer_counts))), by=first)

In [None]:
scale = 250
Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2 * scale, scale), log_scale=log2, title="kmer frequencies")

In [None]:
# log2 of every distinct k-mer count, plotted in ascending order
StatsPlots.scatter(log2.(map(first, kmer_counts_histogram)))

# Fit error cluster

In [None]:
# log2 of each distinct k-mer count in the histogram
kmer_depth_of_coverage_bins = log2.(first.(kmer_counts_histogram))

In [None]:
# Pairwise absolute differences between the log2 coverage bins (row = first bin, column = second bin)
distance_matrix = [
    abs(bin_1 - bin_2)
    for bin_1 in kmer_depth_of_coverage_bins, bin_2 in kmer_depth_of_coverage_bins
]

In [None]:
# distance_matrix = zeros((length(kmer_counts_histogram), length(kmer_counts_histogram)))
# for (row, kmer_count_pair_1) in enumerate(kmer_counts_histogram)
#     for (col, kmer_count_pair_2) in enumerate(kmer_counts_histogram)
# #         distance = abs(depth_of_coverage_bin_1 - depth_of_coverage_bin_2)
#         distance = Distances.euclidean([kmer_count_pair_1...], [kmer_count_pair_2...])
#         distance_matrix[row, col] = distance
#     end
# end
# distance_matrix

In [None]:
# Candidate cluster counts: primes from 2 up to the smaller of 7 and the number of coverage bins.
max_kmedoids_k = min(length(kmer_depth_of_coverage_bins), 7)
ks = Primes.primes(2, max_kmedoids_k)
# For each candidate, average the mean silhouette score over 100 k-medoids runs.
ys = map(ks) do n_clusters
    Statistics.mean(
        Statistics.mean(Clustering.silhouettes(Clustering.kmedoids(distance_matrix, n_clusters), distance_matrix))
        for i in 1:100
    )
end

In [None]:
StatsPlots.plot(ks, ys, label="silhouette score", ylabel = "silhouette score", xlabel = "number of clusters")

In [None]:
# Best average silhouette score and its position in `ys`
ymax, ymax_index = findmax(ys)

In [None]:
optimal_k = ks[ymax_index]

In [None]:
# Run k-medoids 10 times at the chosen cluster count and keep the run with the best mean silhouette score.
clusterings = map(_ -> Clustering.kmedoids(distance_matrix, optimal_k), 1:10)

max_value, max_value_index = findmax(clusterings) do clustering
    Statistics.mean(Clustering.silhouettes(clustering, distance_matrix))
end

In [None]:
optimal_clustering = clusterings[max_value_index]

In [None]:
optimal_clustering.assignments

In [None]:
# Cluster whose medoid has the smallest point index. The bins come from the count-sorted
# histogram, so this is the cluster with the lowest-coverage medoid.
min_medoid_value, min_medoid_index = findmin(optimal_clustering.medoids)

In [None]:
# true for every bin that is NOT assigned to that lowest cluster
indices_to_include = map(assignment -> assignment != min_medoid_index, optimal_clustering.assignments)

In [None]:
kmer_depth_of_coverage_bins

In [None]:
# One more than the largest k-mer count in the excluded cluster. The bins are log2 of
# integer counts, so `round` recovers the count exactly; `ceil` could overshoot by one
# when the log2/exp2 round trip lands slightly above the integer.
threshold = round(Int, exp2(maximum(kmer_depth_of_coverage_bins[.!indices_to_include]))) + 1

In [None]:
scale = 250
p = Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), log_scale = log2, size=(2scale,scale), title="kmer frequencies")
# mark the threshold on the log2-scaled frequency spectrum
StatsPlots.vline!(p, [log2(threshold)])

# raw graph

In [None]:
# # make me faster by counting edges ahead of time
# @time observation_graph = Mycelia.fastx_to_simple_kmer_graph(BioSequences.DNAMer{k}, observations_file)

In [None]:
# kmer_counts = Dict(observation_graph.vprops[v][:kmer] => observation_graph.vprops[v][:weight] for v in LightGraphs.vertices(observation_graph))
# total_observed_kmers = sum(values(kmer_counts))
# kmer_probabilities = Dict(k => v/total_observed_kmers for (k,v) in kmer_counts)
# scale = 250
# Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2scale,scale), title="kmer frequencies")

In [None]:
# reference_kmers = keys(Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, reference_fasta_file))
# distance_to_reference_graph = 1 - LSHFunctions.jaccard(Set(reference_kmers), Set(keys(kmer_counts)))

In [None]:
# kmer_counts_histogram = sort(collect(StatsBase.countmap(collect(values(kmer_counts)))), by=x->x[1])

In [None]:
# # visualize
# gfa_file = observations_file * ".k-$k.gfa"
# Mycelia.graph_to_gfa(observation_graph, gfa_file)

# run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# # --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# # --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# # --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# html_path_to_svg = "./" * repeat("../", length(split(pwd(), '/')) - 3)
# html_path_to_svg *= replace("$(gfa_file).svg", "$(homedir())/" => "")

# x = "<img src=$(html_path_to_svg)>"
# display("text/html", x)
# # display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# threshold filtered graph

In [None]:
# make me faster by counting edges ahead of time
# NOTE(review): minimum_coverage is hard-coded to 1 here, so the `threshold` computed for
# this round is not applied when building the graph — confirm this is intentional.
@time observation_graph = Mycelia.fastx_to_simple_kmer_graph(BioSequences.DNAMer{k}, observations_file, minimum_coverage=1)

In [None]:
# k-mer => weight for every vertex of the graph
kmer_counts = Dict(
    vertex_properties[:kmer] => vertex_properties[:weight]
    for vertex_properties in (observation_graph.vprops[v] for v in LightGraphs.vertices(observation_graph))
)
total_observed_kmers = sum(values(kmer_counts))
kmer_probabilities = Dict(kmer => count / total_observed_kmers for (kmer, count) in kmer_counts)
scale = 250
Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2 * scale, scale), title="kmer frequencies")

In [None]:
# One minus the Jaccard value between the reference k-mer set and the graph's k-mer set
reference_kmers = keys(Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, reference_fasta_file))
distance_to_reference_graph = 1 - LSHFunctions.jaccard(Set(reference_kmers), Set(keys(kmer_counts)))

In [None]:
kmer_counts_histogram = sort(collect(StatsBase.countmap(collect(values(kmer_counts)))), by=first)

In [None]:
# visualize
gfa_file = observations_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(observation_graph, gfa_file)

# Render the GFA to an SVG next to it with Bandage.
run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# Path to the SVG relative to the working directory. `relpath` replaces a hand-built chain
# of "../" that only worked when the home directory sat exactly two levels below the root.
html_path_to_svg = relpath("$(gfa_file).svg", pwd())

x = "<img src=$(html_path_to_svg)>"
display("text/html", x)
# display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# this didn't work because it uses longer minimizers than the kmers were set to?
# run(`GraphAligner -g $(gfa_file) -f $(observations_file) -a $(observations_file).gaf -x dbg`)

In [None]:
# try again using simplified graph

In [None]:
simplified_graph = Mycelia.simplify_kmer_graph(observation_graph)

In [None]:
# write simplified graph to gfa
# visualize
gfa_file = observations_file * ".k-$k.simplified.gfa"
Mycelia.graph_to_gfa(simplified_graph, gfa_file)

# Render the GFA to an SVG next to it with Bandage.
run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# Path to the SVG relative to the working directory. `relpath` replaces a hand-built chain
# of "../" that only worked when the home directory sat exactly two levels below the root.
html_path_to_svg = relpath("$(gfa_file).svg", pwd())

x = "<img src=$(html_path_to_svg)>"
display("text/html", x)
# display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# Align the reads to the simplified graph with the external GraphAligner tool; the
# alignments are written to "<observations_file>.gaf".
run(`GraphAligner -g $(gfa_file) -f $(observations_file) -a $(observations_file).gaf -x vg`)

In [None]:
# Same path that was passed to GraphAligner's -a option above.
alignments_file = "$(observations_file).gaf"

In [None]:
# conda install -c conda-forge -c bioconda bcalm

In [None]:
# Peek at the first alignment record.
line = readline(alignments_file)

In [None]:
# Show its tab-separated fields.
split(line, '\t')

In [None]:

# first(uCSV.read(alignments_file, delim='\t'))

In [None]:
# https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-graph-alignment-format-gaf
# Query sequence name
# Query sequence length
# Query start (0-based; closed)
# Query end (0-based; open)
# Strand relative to the path: "+" or "-"
# Path matching /([><][^\s><]+(:\d+-\d+)?)+|([^\s><]+)/
# Path length
# Start position on the path (0-based)
# End position on the path (0-based)
# Number of residue matches
# Alignment block length
# Mapping quality (0-255; 255 for missing)
#  "NM:i:0"
#  "AS:f:100"
#  "dv:f:0"
#  "id:f:1"
#  "cg:Z:100="

# Column names for the GAF table: twelve fixed fields followed by the five tag columns.
header = [
    "Query sequence name", "Query sequence length",
    "Query start (0-based; closed)", "Query end (0-based; open)",
    "Strand", "Path", "Path length",
    "Start position on the path (0-based)", "End position on the path (0-based)",
    "Number of residue matches", "Alignment block length",
    "Mapping quality (0-255; 255 for missing)",
    "NM", "AS", "dv", "id", "cg",
]
# uCSV.read returns a tuple; its first element holds the parsed columns.
mapping_results = DataFrames.DataFrame(first(uCSV.read(alignments_file, delim='\t')), header)

In [None]:
# take only the first entry for each read
alignments_by_read = DataFrames.groupby(mapping_results, "Query sequence name")
mapping_results = DataFrames.DataFrame([read_alignments[1, :] for read_alignments in alignments_by_read])

In [None]:
# Strip a trailing ".fastq"/".fasta" and append the k-specific suffix. In this round the
# input is already a ".fasta" file, so replacing only ".fastq" left the name unchanged and
# the cell below truncated its own input file.
corrected_observations_file = replace(observations_file, r"\.(fastq|fasta)$" => "") * ".k$(k).fasta"

In [None]:
# Replace each aligned read with the graph sequence it aligned to and write the result as FASTA.
open(corrected_observations_file, "w") do io
    fastx_io = FASTX.FASTA.Writer(io)
    try
        # NOTE(review): only rows 10 through 20 are written, which looks like a leftover
        # debugging subset (and throws when there are fewer than 20 rows) — confirm.
        for mapping_result in DataFrames.eachrow(mapping_results[10:20, :])
            # Split a path such as ">12<7>3" into one token per oriented node.
            steps = split(replace(mapping_result["Path"], r"(>|<)" => s" \1"))
            oriented_steps = [(vertex = parse(Int, replace(step, r">|<" => "")), orientation = first(step) == '>') for step in steps]

            # Node sequences along the path; '<' nodes are reverse-complemented.
            path_sequences = BioSequences.LongDNASeq[]
            for step in oriented_steps
                seq = simplified_graph.vprops[step.vertex][:sequence]
                if !step.orientation
                    seq = BioSequences.reverse_complement(seq)
                end
                push!(path_sequences, seq)
            end
            # NOTE(review): node sequences are concatenated as-is; this presumes adjacent
            # nodes of the simplified graph do not overlap — confirm against graph_to_gfa.
            path_sequence = reduce(*, path_sequences)

            # GAF path coordinates are 0-based, end-exclusive, and refer to the path as
            # written, so slice first and only then flip for a "-" strand alignment.
            # (The original reverse-complemented the whole path before slicing.)
            start = mapping_result["Start position on the path (0-based)"] + 1
            stop = mapping_result["End position on the path (0-based)"]
            path_replacement = path_sequence[start:stop]
            if mapping_result["Strand"] == "-"
                path_replacement = BioSequences.reverse_complement(path_replacement)
            end

            # Identifier: hash of the sequence plus a random barcode, so that identical
            # sequences still get distinct record identifiers.
            sequence_hash = string(hash(path_replacement))
            barcode_id = string(hash(Dates.now(), hash(Random.randstring(32))))
            record_identifier = "$(sequence_hash)-$(barcode_id)"

            record = FASTX.FASTA.Record(
                record_identifier,
                mapping_result["Query sequence name"],
                path_replacement
            )
            write(fastx_io, record)
        end
    finally
        # close (and flush) the writer even if a record fails to convert
        close(fastx_io)
    end
end

In [None]:
# Build a k-mer graph from the corrected reads.
corrected_observation_graph = Mycelia.fastx_to_kmer_graph(BioSequences.DNAMer{k}, corrected_observations_file)

In [None]:
# visualize
gfa_file = corrected_observations_file * ".k-$k.gfa"
Mycelia.graph_to_gfa(corrected_observation_graph, gfa_file)

# Render the GFA to an SVG next to it with Bandage.
run(`$(BANDAGE) image $(gfa_file) $(gfa_file).svg --depwidth .9 --deppower .9`)
# --nodewidth <float> Average node width (0.5 to 1000, default: 5)
# --depwidth <float>  Depth effect on width (0 to 1, default: 0.5)
# --deppower <float>  Power of depth effect on width (0 to 1, default: 0.5)

# Path to the SVG relative to the working directory. `relpath` replaces a hand-built chain
# of "../" that only worked when the home directory sat exactly two levels below the root.
html_path_to_svg = relpath("$(gfa_file).svg", pwd())

x = "<img src=$(html_path_to_svg)>"
display("text/html", x)
# display("image/svg+xml", read("$(gfa_file).svg", String))

In [None]:
# k-mer => weight for every vertex of the corrected graph
kmer_counts = Dict(
    vertex_properties[:kmer] => vertex_properties[:weight]
    for vertex_properties in (corrected_observation_graph.vprops[v] for v in LightGraphs.vertices(corrected_observation_graph))
)
total_observed_kmers = sum(values(kmer_counts))
kmer_probabilities = Dict(kmer => count / total_observed_kmers for (kmer, count) in kmer_counts)
scale = 250
Mycelia.plot_kmer_frequency_spectra(values(kmer_counts), size=(2 * scale, scale), title="kmer frequencies")

In [None]:
# compare the error-corrected graph against the error-free reference graph

In [None]:
# One minus the Jaccard value between the reference k-mer set and the corrected graph's k-mer set
reference_kmers = keys(Mycelia.count_canonical_kmers(BioSequences.DNAMer{k}, reference_fasta_file))
distance_to_reference_graph = 1 - LSHFunctions.jaccard(Set(reference_kmers), Set(keys(kmer_counts)))