In [None]:
# If plotting libraries fail to load, try clearing LD_LIBRARY_PATH for the Julia kernel
# (it can be set in the kernel spec under ~/.local/share/jupyter/kernels/).
# The check below fails fast when LD_LIBRARY_PATH is set to anything other than "".
haskey(ENV, "LD_LIBRARY_PATH") && @assert ENV["LD_LIBRARY_PATH"] == ""
import Pkg
# Activate a temporary environment so the installs below are scoped to this session.
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# Alternative Mycelia checkout locations, kept for reference:
# Pkg.develop(path="/global/cfs/projectdirs/m4269/cjprybol/Mycelia")
# Pkg.develop(path="../../..")
# Use the development checkout of Mycelia under the user's home directory.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Packages this notebook depends on; each one is installed into the active
# environment and then imported by name (qualified access only, e.g. `DataFrames.groupby`).
pkgs = String[
    "DataFrames", "FASTX", "XAM", "uCSV",
    "CodecZlib", "ProgressMeter", "StatsBase", "Statistics",
    "CSV", "Random", "Distributions", "Plots",
    "OrderedCollections", "StatsPlots", "Colors", "Clustering",
]
Pkg.add(pkgs)
for package_name in pkgs
    # Build the `import <name>` expression from a Symbol rather than parsing a string.
    @eval import $(Symbol(package_name))
end

In [None]:
# The project root is the parent of the current working directory;
# input data lives in its `data` subdirectory.
basedir = pwd() |> dirname
data_dir = joinpath(basedir, "data")

In [None]:
# Directory of per-strain FASTA files; created (with parents) if it does not exist yet.
locus_c_strain_directory = joinpath(data_dir, "locus-c-strains") |> mkpath

In [None]:
# Collect the full paths of every file in the strain directory whose name matches
# Mycelia's FASTA regex.
in_fastas = [
    path for path in readdir(locus_c_strain_directory; join=true)
    if occursin(Mycelia.FASTA_REGEX, path)
]
# locus_c_strain_fasta = joinpath(data_dir, "locus-c-strains.fna")

# Path of the fastANI results file and of the list file naming every input FASTA.
outfile = joinpath(basedir, "results", "20240702.c-strain-ani-analysis.txt")
fasta_list_file = joinpath(data_dir, "locus-c-strain-file-list.txt")

# Write one FASTA path per line, then read the file back to show what was written.
open(fasta_list_file, "w") do io
    foreach(path -> println(io, path), in_fastas)
end
readlines(fasta_list_file)

In [None]:
# The fastANI all-vs-all run is left commented out; this cell reads results that
# were previously written to `outfile`. Uncomment to regenerate them.
# (Original author note: defaults to using all cores in the system.)
# Mycelia.fastani_list(query_list = fasta_list_file, reference_list = fasta_list_file, threads=8, outfile = outfile)

"""
    strain_id_from_path(path)

Return the lowercased strain identifier (`c` followed by six digits, matched
case-insensitively) found in the file name of `path`.

Throws an `ArgumentError` when the file name contains no such identifier, so a
stray input file is reported by name instead of failing on a `nothing` match.
"""
function strain_id_from_path(path::AbstractString)
    filename = basename(path)
    m = match(r"(c\d{6})"i, filename)
    if m === nothing
        throw(ArgumentError("no strain identifier (c followed by 6 digits) found in file name: $filename"))
    end
    return lowercase(m.captures[1])
end

fastani_results = Mycelia.read_fastani(outfile)
# Annotate every hit with the strain IDs parsed from the query and reference file names.
fastani_results[!, "query_strain"] = map(strain_id_from_path, fastani_results[!, "query"])
fastani_results[!, "reference_strain"] = map(strain_id_from_path, fastani_results[!, "reference"])
# Sorted list of every strain seen on either side of a comparison.
unique_strains = sort(union(fastani_results[!, "query_strain"], fastani_results[!, "reference_strain"]))

In [None]:
# Map each strain ID to its position in `unique_strains` (used as its matrix row/column).
strain_to_index_map = Dict(strain => index for (index, strain) in pairs(unique_strains))

In [None]:
# Build a symmetric strain-by-strain ANI distance matrix (distance = 1 - %ANI/100).
#
# Stage 1: mean % identity per (query strain, reference strain) pair.
# NaN marks pairs that have no row in `fastani_results`.
n_strains = length(unique_strains)
percent_identity_matrix = fill(NaN, n_strains, n_strains)
for group in DataFrames.groupby(fastani_results, ["query_strain", "reference_strain"])
    row_index = strain_to_index_map[group[1, "query_strain"]]
    column_index = strain_to_index_map[group[1, "reference_strain"]]
    percent_identity_matrix[row_index, column_index] = Statistics.mean(group[!, "%_identity"])
end

# Stage 2: symmetrize by averaging the two directions of each pair.
# Only directions that were actually reported take part in the average, so a pair
# seen in a single direction keeps its measured value instead of being discarded.
for i in 1:n_strains
    for j in (i + 1):n_strains
        reported = filter(!isnan, [percent_identity_matrix[i, j], percent_identity_matrix[j, i]])
        symmetric_identity = isempty(reported) ? NaN : Statistics.mean(reported)
        percent_identity_matrix[i, j] = percent_identity_matrix[j, i] = symmetric_identity
    end
end

# Stage 3: convert % ANI into a distance; pairs with no reported ANI get the
# maximum distance of 1.
ani_distance_matrix = map(percent_identity_matrix) do percent_identity
    isnan(percent_identity) ? 1.0 : 1 - (percent_identity / 100)
end

# A strain is at distance 0 from itself even if no self-comparison was reported.
for i in 1:n_strains
    if isnan(percent_identity_matrix[i, i])
        ani_distance_matrix[i, i] = 0.0
    end
end
ani_distance_matrix

In [None]:
# Ask Mycelia to assess cluster counts for the ANI distance matrix. The result's
# `optimal_number_of_clusters` field is used as k for the k-means call in the next cell.
optimal_clustering_assessment = Mycelia.fit_optimal_number_of_clusters(ani_distance_matrix)

In [None]:
# Cluster the strains with k-means, using the cluster count chosen above.
# NOTE(review): the input is a pairwise distance matrix, so k-means treats each
# strain's column of distances as its feature vector — confirm this is intended
# (Clustering.jl also offers k-medoids, which takes a distance matrix directly).
n_clusters = optimal_clustering_assessment.optimal_number_of_clusters
clustering_result = Clustering.kmeans(ani_distance_matrix, n_clusters)

In [None]:
# Display the cluster assignments; they are indexed in the same order as `unique_strains`.
clustering_result.assignments

In [None]:
# Strains whose cluster assignment is 1.
group_1_strains = unique_strains[findall(==(1), clustering_result.assignments)]

In [None]:
# Strains whose cluster assignment is 2.
group_2_strains = unique_strains[findall(==(2), clustering_result.assignments)]

In [None]:
# Strains whose cluster assignment is 3 (empty when fewer than three clusters were fit).
group_3_strains = unique_strains[findall(==(3), clustering_result.assignments)]

In [None]:
# Pairwise % ANI among the members of cluster 1, recovered from the distances.
let in_cluster = clustering_result.assignments .== 1
    (1 .- ani_distance_matrix[in_cluster, in_cluster]) .* 100
end

In [None]:
# Pairwise % ANI among the members of cluster 2, recovered from the distances.
let in_cluster = clustering_result.assignments .== 2
    (1 .- ani_distance_matrix[in_cluster, in_cluster]) .* 100
end

In [None]:
# Pairwise % ANI among the members of cluster 3, recovered from the distances.
let in_cluster = clustering_result.assignments .== 3
    (1 .- ani_distance_matrix[in_cluster, in_cluster]) .* 100
end

In [None]:
# The Newick tree is written next to the fastANI results, with a `.newick` suffix.
newick_outfile = string(outfile, ".newick")

In [None]:
# Export the ANI distance matrix, labelled with the plain strain IDs, to `newick_outfile`.
Mycelia.distance_matrix_to_newick(;
    distance_matrix=ani_distance_matrix,
    labels=unique_strains,
    outfile=newick_outfile,
)

In [None]:
# Display the sorted strain IDs; their positions are the matrix row/column indices.
unique_strains

In [None]:
# Matrix indices of the five strains of interest (c000835 through c000839).
strain_ids_of_interest = ("c000835", "c000836", "c000837", "c000838", "c000839")
strains_of_interest = [strain_to_index_map[strain_id] for strain_id in strain_ids_of_interest]

# Pairwise % ANI among the strains of interest, recovered from the distances.
let selected = strains_of_interest
    (1 .- ani_distance_matrix[selected, selected]) .* 100
end

In [None]:
# Convert every distance back into a % ANI similarity.
ani_similarity_matrix = @. (1 - ani_distance_matrix) * 100

In [None]:
# Indices of every strain sharing at least 99.5% ANI with any strain of interest,
# de-duplicated and in ascending order.
similar_indices = Int[]
for strain_index in strains_of_interest
    close_matches = findall(>=(99.5), ani_similarity_matrix[strain_index, :])
    union!(similar_indices, close_matches)
end
sort!(similar_indices)

In [None]:
# Copy the labels and append "_99.5" to each strain found in `similar_indices`,
# leaving `unique_strains` itself untouched.
tagged_strains = copy(unique_strains)
tagged_strains[similar_indices] .*= "_99.5"
tagged_strains

In [None]:
# Export the same tree with the "_99.5"-tagged labels to its own file.
# The tagged path gets a dedicated variable so `newick_outfile` keeps pointing at
# the untagged tree; re-running the untagged export cell cannot overwrite this file.
tagged_newick_outfile = outfile * ".tagged.newick"
tagged_newick = Mycelia.distance_matrix_to_newick(
    distance_matrix = ani_distance_matrix,
    labels = tagged_strains,
    outfile = tagged_newick_outfile,
)

In [None]:
# fastANI command-line usage for a list-vs-list comparison:
# ./fastANI --ql [QUERY_LIST] --rl [REFERENCE_LIST] -o [OUTPUT_FILE]