In [None]:
import Pkg
Pkg.update()

pkgs = [
    "Revise",
    "MetaGraphs",
    "Graphs",
    "JSON",
    "uCSV",
    "DataFrames",
    "Dates",
    "Primes",
    "Kmers",
    "BioSequences",
    "FASTX",
    "ProgressMeter",
    "Random",
    "StatsBase"
]

Pkg.add(pkgs)
# Import every package listed in `pkgs` by name. The `import` expression is built from a
# Symbol rather than by parsing a string of source text.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

# Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

In [None]:
#papermill parameters
base_directory = "$(homedir())/workspace/sars-cov2-pangenome-analysis/"
sequences_directory = "$(base_directory)/data/sequences"
metadata_file = "$(base_directory)/metadata/sequences.csv"
working_directory = "$(homedir())/workspace/scratch/$(Dates.today())"
# Fixed-width, digits-only timestamp (yyyymmddHHMMSSsss). Stripping characters from
# `string(Dates.now())` gave stamps of varying length, because the default DateTime
# printing omits or shortens the millisecond field when it ends in zeros.
datetime = Dates.format(Dates.now(), "yyyymmddHHMMSSsss")

In [None]:
neo4j_username = "neo4j"

# remote_neo4j_address = ENV["NEO4J_URL"]
# remote_neo4j_password = ENV["NEO4J_PASSWORD"]
local_neo4j_bolt_address = "bolt://localhost:7687"
# NOTE(review): 7474 is Neo4j's default HTTP port while `neo4j://` is a Bolt routing
# scheme - confirm the intended scheme before using this address.
local_neo4j_http_address = "neo4j://localhost:7474"
# local_neo4j_password = "neo4j"
# local_neo4j_password = Random.randstring(7)
# Prefer a password supplied through the environment so the credential does not have to
# live in the notebook; the literal remains only as a fallback for existing local setups.
local_neo4j_password = get(ENV, "LOCAL_NEO4J_PASSWORD", "ii0sRIc")

neo4j_local_import_directory = "/home/jovyan/.local/neo4j-community-4.4.11/import"

In [None]:
# Root of the local Neo4j installation; the `neo4j` and `neo4j-admin` executables used
# in this notebook are invoked from its `bin/` subdirectory.
NEO4J_PATH = "/home/jovyan/.local/neo4j-community-4.4.11"

In [None]:
# Restart the local Neo4j server and print its status. The pause between `start` and
# `status` is presumably there to give the server time to come up.
let neo4j = "$(NEO4J_PATH)/bin/neo4j"
    run(`$(neo4j) stop`)
    run(`$(neo4j) start`)
    sleep(5)
    run(`$(neo4j) status`)
end

In [None]:
# Update the password - this is only needed on initial setup
# cmd = "ALTER CURRENT USER SET PASSWORD FROM 'neo4j' TO '$(local_neo4j_password)'"
# run(Mycelia.cypher(cmd, address=local_neo4j_bolt_address, password="neo4j", username="neo4j", database="system"))

In [None]:
# Create the scratch directory (including parents) if needed and make it the current
# directory; `mkpath` returns the path it was given.
cd(mkpath(working_directory))

In [None]:
total_n = countlines(metadata_file)
# Number of metadata records to load; one extra line is read for the header row.
# n = 10^1 # complete
# n = 10^2 # complete
n = 10^4
# n = 10^5
# n = 10^6
# n = total_n - 1
# Stream only the first n + 1 lines through `head`. Passing the reader as a function to
# `open` closes the pipe and waits for the `head` process once parsing is done, and
# raises if `head` fails, instead of leaving an open process handle in a global.
@time sequence_metadata = open(
    io -> DataFrames.DataFrame(uCSV.read(io, quotes='"', header=1, typedetectrows=100)...),
    `head -n $(n+1) $metadata_file`,
)

In [None]:
# One `.fna` path per metadata row, derived from the "Accession" column.
fastx_files = map(sequence_metadata[!, "Accession"]) do accession
    "$(sequences_directory)/$(accession).fna"
end

In [None]:
# Build the in-memory graph in three steps: initialize it, add the sequence files, then
# add the metadata table using "Accession" as the identifier column. `graph` is rebound
# after every step, so a failure part-way leaves the result of the last completed step.
graph = Mycelia.initialize_graph()
graph = Mycelia.add_fastx_to_graph!(graph, fastx_files)
graph = Mycelia.add_metadata_from_table!(graph, sequence_metadata, identifier_column = "Accession")

In [None]:
# check memory usage

In [None]:
# Earlier experiments with a range of k values (kept for reference):
# min_k = max_k = Mycelia.assess_dnamer_saturation(fastx_files)
# min_k = max_k = 11
# max_k = 31
# min_k=17
# max_k=17
# min_k=max_k=31
# kmer_sizes = Primes.primes(min_k, max_k)
# Single k-mer length passed to the k-mer insertion and GFA export cells below.
kmer_size = 11

In [None]:
# Run Mycelia's k-mer insertion on `graph` for the chosen `kmer_size` and report the
# elapsed time. The commented-out loop is the variant that iterated over several k values.
# for kmer_size in kmer_sizes
    # @show kmer_size
@time Mycelia.add_fasta_record_kmers_to_graph!(graph, kmer_size)
# end

In [None]:
# Run Mycelia's GFA export of `graph` for the chosen `kmer_size` and report the elapsed
# time. The commented-out loop is the variant that iterated over several k values.
# for kmer_size in kmer_sizes
    # @show kmer_size
@time Mycelia.graph_to_gfa(graph, kmer_size)
# end

In [None]:
# Render every `.gfa` file in the working directory with Bandage at three node widths.
# Images that already exist on disk are not regenerated.
for gfa in filter(path -> occursin(r"\.gfa$", path), readdir(working_directory, join=true))
    @show gfa
    for nodewidth in (100, 500, 1000)
        gfa_img = "$(gfa).$(nodewidth).jpg"
        isfile(gfa_img) && continue
        run(`Bandage image $gfa $gfa_img --deppower 1 --depwidth 1 --nodewidth $(nodewidth)`)
    end
end

In [None]:
# Restart the local Neo4j server before uploading and print its status. The short pause
# is presumably there to give the server time to come up before `status` is queried.
let neo4j = "$(NEO4J_PATH)/bin/neo4j"
    run(`$(neo4j) stop`)
    run(`$(neo4j) start`)
    sleep(3)
    run(`$(neo4j) status`)
end

In [None]:
# Mycelia.list_databases(address=local_neo4j_bolt_address, password=local_neo4j_password)

In [None]:
# Create the node constraints for `graph` in the local Neo4j database.
Mycelia.create_node_constraints(
    graph;
    address=local_neo4j_bolt_address,
    password=local_neo4j_password,
)

In [None]:
# Start from an empty database: delete every node together with its relationships, then
# print the remaining node count as a check.
run(Mycelia.cypher("MATCH (n) DETACH DELETE n", address=local_neo4j_bolt_address, password=local_neo4j_password))
run(Mycelia.cypher("MATCH (n) RETURN count(n) as count", address=local_neo4j_bolt_address, password=local_neo4j_password))
# run(Mycelia.cypher("MATCH (n) DETACH DELETE n", address=local_neo4j_bolt_address, password=local_neo4j_password))

In [None]:
# Upload the graph's nodes to the local Neo4j database via its import directory.
Mycelia.upload_nodes_to_neo4j(
    graph=graph,
    address=local_neo4j_bolt_address,
    password=local_neo4j_password,
    neo4j_import_directory=neo4j_local_import_directory,
)

In [None]:
# Print the number of nodes now stored in Neo4j.
run(Mycelia.cypher("MATCH (n) RETURN count(n) as count", address=local_neo4j_bolt_address, password=local_neo4j_password))

In [None]:
# Number of vertices in the in-memory graph, for comparison with the Neo4j node count.
Graphs.nv(graph)

In [None]:
"""
    upload_edges_to_neo4j(; graph, address, username="neo4j", password, format="auto",
                          database="neo4j", neo4j_import_directory)

Upload every edge of `graph` to Neo4j, one edge type at a time.

Edges are grouped by their `:TYPE` property. Each group is converted to a table with
`edge_type_to_dataframe` and sent with `upload_edge_table`.

# Keywords
- `graph`: graph whose edges carry a `:TYPE` property.
- `address`, `username`, `password`, `database`: connection settings, all forwarded to
  `upload_edge_table` (previously `username` and `database` were accepted but dropped).
- `format`: accepted for backward compatibility; not used by this function.
- `neo4j_import_directory`: directory into which the intermediate TSV files are written.

# Returns
`nothing`.
"""
function upload_edges_to_neo4j(;graph, address, username="neo4j", password, format="auto", database="neo4j", neo4j_import_directory)
    edge_types = unique(MetaGraphs.props(graph, e)[:TYPE] for e in Graphs.edges(graph))
    for edge_type in edge_types
        @info "uploading edge_type => $(Mycelia.type_to_string(edge_type))..."
        edge_type_table = edge_type_to_dataframe(edge_type=edge_type, graph=graph)
        upload_edge_table(
            table=edge_type_table,
            address=address,
            username=username,
            password=password,
            database=database,
            neo4j_import_dir=neo4j_import_directory,
        )
    end
    @info "done!"
    return nothing
end

"""
    edge_type_to_dataframe(; edge_type, graph)

Collect all edges of `graph` whose `:TYPE` property equals `edge_type` into a table.

The table has one column per edge property found on those edges, plus `src` and `dst`
columns holding the stringified `:identifier` property of the edge's end vertices. Every
column is finally converted to JSON-encoded strings; dictionary-valued columns are
JSON-encoded twice.
"""
function edge_type_to_dataframe(;edge_type, graph)
    matching_edges = filter(collect(Graphs.edges(graph))) do e
        MetaGraphs.props(graph, e)[:TYPE] == edge_type
    end

    # Union of the property names seen on any matching edge.
    keys_per_edge = map(e -> collect(keys(MetaGraphs.props(graph, e))), matching_edges)
    property_names = unique(reduce(vcat, keys_per_edge))

    table = DataFrames.DataFrame(Dict(p => [] for p in property_names))
    table[!, "src"] = String[]
    table[!, "dst"] = String[]

    for e in matching_edges
        row = copy(MetaGraphs.props(graph, e))
        row[:src] = string(MetaGraphs.props(graph, e.src)[:identifier])
        row[:dst] = string(MetaGraphs.props(graph, e.dst)[:identifier])
        push!(table, row)
    end

    # normalize: edge types become strings, then every column becomes JSON text
    table[!, "TYPE"] = Mycelia.type_to_string.(table[!, "TYPE"])
    for column in names(table)
        values = table[!, column]
        holds_only_dicts = all(v -> v isa AbstractDict, values)
        table[!, column] = if holds_only_dicts
            map(values) do d
                JSON.json(string(JSON.json(d)))
            end
        else
            JSON.json.(string.(values))
        end
    end
    return table
end

"""
    upload_edge_table(; table, window_size=1000, address, password, username="neo4j",
                      database="neo4j", neo4j_import_dir)

Upload the rows of `table` to Neo4j as relationships, `window_size` rows at a time.

Each window of rows is written as `edge<i>.tsv` into `neo4j_import_dir` and loaded with a
`LOAD CSV` Cypher statement that matches the end nodes on their `identifier` property and
creates one relationship per row. All columns other than `TYPE` become relationship
properties. The TSV files are left in place afterwards.

# Keywords
- `table`: data frame with a `TYPE` column holding a single distinct value, plus the `src`
  and `dst` columns referenced by the Cypher statement.
- `window_size`: number of rows per TSV file / Cypher statement; must be positive.
- `address`, `username`, `password`, `database`: passed to `Mycelia.cypher`.
- `neo4j_import_dir`: directory the TSV files are written to.

# Returns
`nothing`. An empty `table` is a no-op.

# Throws
- `ArgumentError` if `window_size` is not positive or `table` mixes several edge types.
"""
function upload_edge_table(;table, window_size=1000, address, password, username="neo4j", database="neo4j", neo4j_import_dir)
    window_size > 0 || throw(ArgumentError("window_size must be positive, got $(window_size)"))

    nrows = DataFrames.nrow(table)
    # Nothing to upload; an empty table used to trip the single-edge-type assertion.
    nrows == 0 && return nothing

    edge_types = unique(table[!, "TYPE"])
    if length(edge_types) != 1
        throw(ArgumentError("table must contain exactly one edge TYPE, found $(length(edge_types))"))
    end
    EDGE_TYPE = first(edge_types)

    # Every column except TYPE becomes a relationship property read from the CSV row.
    property_columns = filter(!=("TYPE"), names(table))
    parameters = "{" * join(("$(c): row.$(c)" for c in property_columns), ", ") * "}"

    windows = (i:min(i+window_size-1,nrows) for i in 1:window_size:nrows)
    ProgressMeter.@showprogress for (i, window) in enumerate(windows)
        df_sub = table[window, :]
        f = "edge$i.tsv"
        local_f_path = "$(neo4j_import_dir)/$(f)"
        uCSV.write(local_f_path, df_sub, delim='\t')
        # Make the file accessible to everyone - presumably so the Neo4j server process
        # can read it when it runs as a different user.
        chmod(local_f_path, 0o777)
        f_url = "file:///$(f)"
        cmd =
        """
        LOAD CSV WITH HEADERS FROM '$(f_url)' AS row FIELDTERMINATOR '\t'
        MATCH (src {identifier: row.src})
        MATCH (dst {identifier: row.dst})
        CREATE (src)-[p:`$(EDGE_TYPE)` $(parameters)]->(dst)
        """
        # CREATE replaced an earlier MERGE; observed progress-meter readings:
        # create Progress:   2%|▊                                        |  ETA: 1:04:39
        # merge Progress:   3%|█▍                                       |  ETA: 1:25:12
        # The statement is collapsed onto a single line before being handed to Mycelia.cypher.
        cmd = rstrip(replace(cmd, '\n' => ' '))
        cypher_cmd = Mycelia.cypher(cmd, address = address, username = username, password = password, database = database)
        run(cypher_cmd)
        # rm(local_f_path)
    end
    return nothing
end

In [None]:
# Upload all edges of the in-memory graph to the local Neo4j database.
upload_edges_to_neo4j(
    graph=graph,
    address=local_neo4j_bolt_address,
    password=local_neo4j_password,
    neo4j_import_directory=neo4j_local_import_directory,
)

In [None]:
# Print the number of relationships now stored in Neo4j.
run(Mycelia.cypher("MATCH (n1)-[r]->(n2) return count(r)", address=local_neo4j_bolt_address, password=local_neo4j_password))

In [None]:
# Print up to five relationships as a spot check of the upload.
run(Mycelia.cypher("MATCH (n1)-[r]->(n2) return r LIMIT 5", address=local_neo4j_bolt_address, password=local_neo4j_password))

In [None]:
# Number of edges in the in-memory graph, for comparison with the Neo4j relationship count.
Graphs.ne(graph)

In [None]:

# Dump file path inside the working directory; the name encodes the record count `n`
# and the run timestamp `datetime`.
neo4j_dump = "$(working_directory)/neo4j-covid-database-$(n)-$(datetime).dump"

In [None]:
# Dump the `neo4j` database to `neo4j_dump`. The server is stopped around the dump, and
# the restart sits in `finally` so a failed dump no longer leaves the server down.
run(`$(NEO4J_PATH)/bin/neo4j stop`)
try
    run(`$(NEO4J_PATH)/bin/neo4j-admin dump --database=neo4j --to=$(neo4j_dump)`)
finally
    run(`$(NEO4J_PATH)/bin/neo4j start`)
end

In [None]:
# Print (but do not run) the scp command that copies the dump to the remote import directory.
println("scp $(neo4j_dump) cameron@odin.lcfta.com:/mnt/coldstorage/neo4j/import/$(basename(neo4j_dump))")

```
scp /home/jovyan/workspace/scratch/2022-10-29/neo4j-covid-database-11-20221029211438544.dump cameron@odin.lcfta.com:/mnt/coldstorage/neo4j/import/neo4j-covid-database-11-20221029211438544.dump

scp /home/jovyan/workspace/scratch/2022-10-29/neo4j-covid-database-101-20221030135057320.dump cameron@odin.lcfta.com:/mnt/coldstorage/neo4j/import/neo4j-covid-database-101-20221030135057320.dump

scp /home/jovyan/workspace/scratch/2022-10-29/neo4j-covid-database-1000-20221203204005977.dump cameron@odin.lcfta.com:/mnt/coldstorage/neo4j/import/neo4j-covid-database-1000-20221203204005977.dump
```

To reload a new database on the remote server:
- `scp local/path/to/db.dump cameron@odin.lcfta.com:/mnt/coldstorage/neo4j/import/db.dump`
- `ssh cameron@odin.lcfta.com`
- `sudo docker stop neo4j && sudo docker rm neo4j`
- `sudo docker stop neo4jloader && sudo docker rm neo4jloader`
- `sudo chmod -R 777 /mnt/coldstorage/neo4j && sudo bash /mnt/coldstorage/git/docker-neo4j/loaddb.sh /var/lib/neo4j/import/neo4j-covid-database-1000-20221203204005977.dump`
- `cd /mnt/coldstorage/git/docker-neo4j && sudo ./reinit.sh`
- `cd /mnt/coldstorage/git/docker-neo4j && sudo ./run.sh`

To get the container ID:
- `sudo docker container ls`