In [None]:
# Environment guard: the notebook refuses to run unless LD_LIBRARY_PATH is
# empty. An unset variable is treated the same as an empty one; indexing `ENV`
# directly would raise a `KeyError` in that case instead of passing the check.
# NOTE(review): presumably this avoids system libraries shadowing the ones
# bundled with Julia packages — confirm.
@assert get(ENV, "LD_LIBRARY_PATH", "") == ""
import Pkg
# Work in a throwaway environment so nothing here touches the default project.
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# Use the local development checkout of Mycelia.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

# Registry packages used by the cells below.
pkgs = String[
    "JSON",
    "ProgressMeter",
    "SHA"
]
Pkg.add(pkgs)
# `import` needs a literal module name, so each statement is built from the
# package name and evaluated at top level.
for pkg in pkgs
    eval(Meta.parse("import $pkg"))
end

In [None]:
# The project root is the parent of the current working directory.
project_dir = dirname(pwd())
# Data lives under <project>/data; create the directory if it is missing.
data_dir = joinpath(project_dir, "data")
mkpath(data_dir)

In [None]:
# Local directory holding the SRA files; it is compared against the Google
# Drive listing and passed to rclone in later cells.
sra_dir = joinpath(data_dir, "SRA")

In [None]:
# Step 1: List Existing Files on Google Drive

# We assume you have rclone installed and configured.
# Note: this cell shells out to rclone using an absolute path to the binary.

# Run the rclone command to list files from Google Drive.
# The listing is cached in `drive_file`; delete that file to force a refresh.
drive_file = "drive_file_list.json"
if !isfile(drive_file)
    println("run me")
    # several minutes
    # Write to a temporary name first and rename only after rclone exits
    # successfully. If rclone fails part-way, `drive_file` is never created, so
    # the next run retries instead of trusting a truncated listing.
    partial_drive_file = drive_file * ".partial"
    @time run(pipeline(`/scg/apps/software/rclone/1.67.0/rclone lsjson --hash -R snyder-virome:viral-exposome/data/SRA`, partial_drive_file))
    mv(partial_drive_file, drive_file; force=true)
else
    println("$(drive_file) already present")
end

In [None]:
# Parse the cached Google Drive listing, keeping only non-directory entries.
# quick 160seconds
@time gdrive_files = filter(entry -> entry["IsDir"] == false, JSON.parsefile(drive_file))
# Narrow the container's element type to the type of the parsed entries.
entry_type = first(unique(typeof.(gdrive_files)))
gdrive_files = convert(Vector{entry_type}, gdrive_files)
# Order the entries from smallest to largest file.
gdrive_files = sort(gdrive_files; by=entry -> entry["Size"])

In [None]:
import Printf
"""
    human_readable(size)

Format a byte count `size` as a short human-readable string.

Values below 1024 are printed verbatim with a `Bytes` suffix. Larger values are
scaled by powers of 1024 and printed with two decimals as `KiB`, `MiB` or
`GiB`; `GiB` is the largest unit used.
"""
function human_readable(size)
    # Guard clauses, smallest unit first; the first bound that holds wins.
    size < 1024 && return "$size Bytes"
    size < 1024^2 && return Printf.@sprintf("%.2f KiB", size / 1024)
    size < 1024^3 && return Printf.@sprintf("%.2f MiB", size / 1024^2)
    return Printf.@sprintf("%.2f GiB", size / 1024^3)
end

In [None]:
# Check and remove local files that match Google Drive files.
# A local file is deleted only when both its size and its SHA-256 digest equal
# the values recorded in the Google Drive listing.
local_dir = sra_dir

# Initialize counters for total files removed and total disk space reclaimed
total_files_removed = 0
total_space_freed = 0

# Totals recorded from an earlier run:
# Total files removed: 3626
# Total space freed: 327.15 GiB

# The listing is processed in batches: `current_stop` is the last index handled
# so far and the batch covers `next_start:next_stop`.
# current_stop = 131000
current_stop = 140000
# current_stop = next_stop
next_start = current_stop + 1

next_stop = next_start + 2^15
# Never run past the end of the listing.
next_stop = min(length(gdrive_files), next_stop)
# next_stop = next_start + 2^10
# 20 minutes in the later stages
# next_stop = next_start + 2^12
# next_stop = length(gdrive_files)
# next_stop = 2^1
println("current_stop = $(current_stop)")
println("next_start = $(next_start)")
println("next_stop = $(next_stop)")

ProgressMeter.@showprogress for gdrive_file in gdrive_files[next_start:next_stop]
    file_path = gdrive_file["Path"]
    local_file_path = joinpath(local_dir, file_path)
    # Files that are already gone locally need no work (and no stat call).
    if isfile(local_file_path)
        local_size = filesize(local_file_path)
        println("file_size = $(human_readable(local_size))" * "\t" * "file_path = $(file_path)")

        # Cheap check first: only hash the file when the sizes already agree.
        if local_size == gdrive_file["Size"]
            # A listing entry may lack a SHA-256 digest; without one the file
            # cannot be verified, so it is kept rather than aborting the batch.
            remote_hashes = get(gdrive_file, "Hashes", nothing)
            remote_sha256 = remote_hashes === nothing ? nothing : get(remote_hashes, "sha256", nothing)
            if remote_sha256 === nothing
                println("No sha256 recorded for $file_path; keeping local copy")
            else
                # Hash from an open stream instead of `read`-ing the whole file,
                # so memory use does not grow with file size.
                @time local_sha256 = bytes2hex(open(SHA.sha256, local_file_path))
                if local_sha256 == remote_sha256
                    # Remove the local file
                    rm(local_file_path)
                    total_files_removed += 1
                    total_space_freed += local_size
                    # Print details
                    println("Removed file: $file_path")
                    println("Freed: $(human_readable(local_size))")
                end
            end
        end
    end
end
println("Total files removed: $total_files_removed")
println("Total space freed: $(human_readable(total_space_freed))\n")

In [None]:
# Step 2: List Local Files

# Recursive rclone JSON listing (with hashes) of the local SRA directory,
# written to `local_file`.
local_file = "local_file_list.json"
list_local_cmd = `/scg/apps/software/rclone/1.67.0/rclone lsjson --hash -R $sra_dir`
# this seems much slower - probably because we're computing SHA256 on the fly whereas google drive just has it precomputed?
@time run(pipeline(list_local_cmd; stdout=local_file))

# Shell equivalent:
# time /scg/apps/software/rclone/1.67.0/rclone lsjson --hash -R /oak/stanford/scg/lab_mpsnyder/cjprybol/Mycelia/projects/viral-exposome/data/SRA > local_file_list.json

In [None]:

# Load the local listing from the JSON file.
# `JSON.parsefile` is the parser this notebook imports; no `load_json` function
# is defined here. Directory entries are dropped so that only real files are
# considered for upload.
@time local_files = filter(entry -> entry["IsDir"] == false, JSON.parsefile(local_file))

In [None]:
# Step 3: Compute File Diff Locally

# Return the SHA-256 digest recorded for one listing entry, or `nothing` when
# the entry carries no such hash. Both the lowercase key ("sha256") and the
# "SHA-256" spelling are accepted.
function _listing_sha256(file)
    hashes = get(file, "Hashes", nothing)
    hashes === nothing && return nothing
    for key in ("sha256", "SHA-256")
        haskey(hashes, key) && return hashes[key]
    end
    return nothing
end

"""
    get_file_details(file_list)

Build a lookup table from an rclone `lsjson` listing.

# Arguments
- `file_list`: iterable of dictionary-like entries with at least the keys
  `"Path"` and `"Size"`, and optionally `"Hashes"`.

# Returns
A `Dict` mapping each entry's `"Path"` to a `(size, sha256)` tuple. The digest
is `nothing` for entries that have no SHA-256 hash recorded.
"""
function get_file_details(file_list)
    return Dict(file["Path"] => (file["Size"], _listing_sha256(file)) for file in file_list)
end

# Get details of files: path => (size, sha256) for each side.
local_file_details = get_file_details(local_files)
gdrive_file_details = get_file_details(gdrive_files)

# Determine the files that need to be uploaded: every local file that is either
# absent from Google Drive or whose (size, hash) pair differs from the remote
# one. This must be a Dict (not a Set) because entries are stored by path.
files_to_upload = Dict{String,Any}()
for (file, local_details) in local_file_details
    remote_details = get(gdrive_file_details, file, nothing)
    if remote_details === nothing || remote_details != local_details
        files_to_upload[file] = local_details
    end
end

# Optional: Write the list of files to upload to a file, one path per line.
# Paths are sorted so repeated runs produce the same file.
open("files_to_upload.txt", "w") do f
    for file in sort!(collect(keys(files_to_upload)))
        println(f, file)
    end
end

In [None]:
# Step 4: Upload Files with Exponential Backoff and Delete After Upload

"""
    upload_file_with_retry(file, local_dir, remote_dir; retries=10, delay=10)

Upload `file` (a path relative to `local_dir`) to the same relative path under
the rclone destination `remote_dir`, then delete the local copy.

The transfer uses `rclone copyto`, which copies a single file to exactly the
given destination path. Plain `rclone copy` would treat the destination as a
directory and place the file one level too deep.

# Keywords
- `retries`: maximum number of upload attempts.
- `delay`: wait in seconds before the first retry; it doubles after every
  further failure.

# Returns
`nothing`. The local file is removed only after a successful upload; after
`retries` failed attempts a message is printed and the file is left in place.

# Throws
`InterruptException` is re-thrown immediately so the loop can be cancelled.
"""
function upload_file_with_retry(file, local_dir, remote_dir; retries=10, delay=10)
    local_path = joinpath(local_dir, file)
    remote_path = "$remote_dir/$file"
    attempt = 0
    success = false

    while attempt < retries && !success
        try
            run(`rclone copyto $local_path $remote_path --tpslimit 10 --tpslimit-time 10s`)
            success = true
        catch e
            # Let a user interrupt stop the upload instead of being retried.
            e isa InterruptException && rethrow()
            attempt += 1
            if attempt < retries
                wait_seconds = delay * (2 ^ (attempt - 1))
                println("Error uploading $file: $e. Retrying in $(wait_seconds) seconds.")
                sleep(wait_seconds)
            else
                # Final attempt: nothing left to wait for.
                println("Error uploading $file: $e.")
            end
        end
    end

    if success
        # Delete local file
        rm(local_path)
    else
        println("Failed to upload $file after $(retries) attempts.")
    end
    return nothing
end

# Paths to transfer, one per line, read back from the list file.
files_to_copy = readlines("files_to_upload.txt")

# Source directory and rclone destination for the transfer.
# NOTE(review): both values look like template placeholders — confirm the real
# local directory and remote before running this cell.
local_dir = "/path/to/your/local/directory"
remote_dir = "gdrive:your_target_folder"

# Upload every listed file; the helper retries with exponential backoff and
# removes the local copy once the upload succeeds.
foreach(files_to_copy) do file
    upload_file_with_retry(file, local_dir, remote_dir)
end