# Setup

In [1]:
import Pkg
Pkg.activate(".")

[32m[1m  Activating[22m[39m environment at `~/3Dto2D/v1/Project.toml`


In [2]:
import ArgParse
import Dates
import NIfTI
import NPZ
import Statistics
import Flux
import CUDA
import Zygote
import Plots
import BSON
import Augmentor
include("/home/johjo50/.julia/mycode/codeinfo.jl")

Main.CodeInfo

# Commandline Interface
In anticipation of migrating to a normal Julia file that will be run from the command line or by another programme.

In [3]:
# Emulating command line arguments to facilitate later migration from jupyter
# The string is split on whitespace into a vector of argument tokens (the role ARGS would play
# in a script); it is handed to ArgParse.parse_args together with `settings` further down.
cli_args = split("-g 0 -m model_isotropic_02.jl -l /flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan_csv/label_data.csv /flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan")

7-element Vector{SubString{String}}:
 "-g"
 "0"
 "-m"
 "model_isotropic_02.jl"
 "-l"
 "/flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan_csv/label_data.csv"
 "/flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan"

In [4]:
# ArgParse: definition of the command line interface.
#   -g / --training-gpu   Int, defaults to 0
#   -s / --secondary-gpu  Int, optional (no default, so it parses to `nothing` when omitted)
#   -m / --model          String
#   -l / --label-file     String
#   data-dir              positional argument
# NOTE(review): the notebook currently passes the hardcoded `data_dir` / `label_file` constants
# to load_data, so the parsed "data-dir" and "label-file" values (and the *.csv fallback that the
# help text describes) appear not to be wired up yet — confirm before relying on them.
settings = ArgParse.ArgParseSettings()
ArgParse.@add_arg_table! settings begin
    "--training-gpu", "-g"
        help = "gpu to use for training model"
        arg_type = Int
        default = 0
    "--secondary-gpu", "-s"
        help = "gpu to use for other tasks such as validation and testing (optional)"
        arg_type = Int
    "--model", "-m"
        help = "which model to load"
        arg_type = String
    "--label-file", "-l"
        help = "the file with the labels for the data; if a data directory is given and this option isn't used, <data-dir>/*.csv and <data-dir>/../*.csv will be tried"
        arg_type = String
    "data-dir"
        help = "directory where the data is located"
end

ArgParseSettings(
  prog=
  description=
  epilog=
  usage=
  version=Unspecified version
  add_help=true
  add_version=false
  fromfile_prefix_chars=Set{Char}()
  autofix_names=false
  error_on_conflict=true
  allow_ambiguous_opts=false
  commands_are_required=true
  default_group=
  exc_handler=default_handler
  preformatted_description=false
  preformatted_epilog=false
  exit_after_help=false
  >> usage: <PROGRAM> [-g TRAINING-GPU] [-s SECONDARY-GPU] [-m MODEL]
                 [-l LABEL-FILE] [data-dir]
  )

# Hardcoded Paths
These paths are hardcoded for now; they might become configurable via the CLI in the future.

In [5]:
# Hardcoded locations used by the rest of the notebook.
# label_file : CSV file with the labels, read line by line in load_data
# data_dir   : directory scanned for *.nii.gz volumes (load_data also creates a ".projections" cache here)
# model_dir  : directory where save_weights writes its *_weights.bson files
# result_dir : parent directory for the per-run result folders created in train_and_evaluate
# NOTE(review): label_file and data_dir repeat the values given in `cli_args`; the parsed
# command line values are not what load_data is called with in this notebook.
const label_file = "/flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan_csv/label_data.csv"
const data_dir = "/flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan"
const model_dir = "/home/johjo50/3Dto2D/v1/models"
const result_dir = "/home/johjo50/3Dto2D/v1/results"

"/home/johjo50/3Dto2D/v1/results"

# General functions

In [6]:
# Pool of additional output streams that mprint/mprintln copy their output to.
# It is filled and emptied by withmultipleoutput; stdout is always written to and
# must not be added to the pool.
global out_fds = IO[]

# Shared implementation of mprint and mprintln.
# `printer` is `print` or `println`. The text is first written to every stream in the
# output pool (each one is flushed so that log files stay current during long runs)
# and finally to stdout. Writing directly to each stream avoids temporarily swapping
# the process-wide stdout, and works for any writable IO (e.g. an IOBuffer), not only
# for the stream types that redirect_stdout accepts.
function _mwrite(printer, args...; kwargs...)
    global out_fds
    if isdefined(@__MODULE__, :out_fds) && !isnothing(out_fds)
        for ofd in out_fds
            printer(ofd, args...; kwargs...)
            flush(ofd)
        end
    end
    return printer(args...; kwargs...)
end

# Like `print`, but also copies the output to every stream in `out_fds`.
function mprint(args...; kwargs...)
    return _mwrite(print, args...; kwargs...)
end

# Like `println`, but also copies the output to every stream in `out_fds`.
function mprintln(args...; kwargs...)
    return _mwrite(println, args...; kwargs...)
end

mprintln (generic function with 1 method)

In [7]:
# openfile(target) turns one output target into a writable stream, or returns `nothing`
# when the target cannot be used. It is applied to each argument of withmultipleoutput.

# Fallback: anything that is neither a path nor an IO object is ignored.
function openfile(o)
    println("Ignoring request to open $o")
    return nothing
end

# A path: the file is opened for writing (truncating existing content) provided that
# its parent directory exists. Returns the open stream, or `nothing` otherwise.
function openfile(pfile::AbstractString)
    println("Received request to open $pfile")
    parent = dirname(pfile)
    # dirname returns "" for a bare file name; that means the current directory,
    # which must not be rejected by the isdir test.
    if isempty(parent) || isdir(parent)
        rv = open(pfile, "w")
        println("File opened, returning filehandle")
        return rv
    end
    println("Directory $parent does not exist, ignoring")
    return nothing
end

# An already open stream: handed back unchanged when writable, otherwise ignored.
function openfile(pfd::IO)
    println("Received request to open $pfd")
    if iswritable(pfd)
        println("Is writable, returning as is")
        return pfd
    end
    println("Not writable, ignoring")
    return nothing
end

openfile (generic function with 3 methods)

In [8]:
# closefile(target): counterpart of openfile, applied to every entry of the output pool.

# Anything that is not an IOStream is left alone; only a notice is printed.
closefile(o) = println("Ignoring request to close $o")

# File streams are announced and then closed.
function closefile(fd::IOStream)
    println("Closing $fd")
    return close(fd)
end

closefile (generic function with 2 methods)

In [9]:
# withmultipleoutput(f, args...; kwargs...)
#
# Runs `f()` while mprint/mprintln additionally write to every target in `args`.
#   f      : zero-argument function (intended for `do`-block use)
#   args   : file names or IO objects; stdout should not be added (it is included by default).
#            Targets that openfile rejects are skipped.
#   kwargs : currently ignored
# Returns whatever `f()` returns. Exceptions from `f` propagate after the clean-up.
#
# N.B. Don't nest! The files to write to are in a global variable!
#
function withmultipleoutput(f, args...; kwargs...)
    global out_fds
    try
        # Register the streams one at a time inside the `try`, so that streams opened
        # before a failure are still closed by the `finally` clause, and so that an
        # empty or fully rejected target list is not an error.
        for target in args
            fd = openfile(target)
            isnothing(fd) || push!(out_fds, fd)
        end
        println("Entering \"with-multiple-output\"-context.")
        return f()
    finally
        println("Leaving \"with-multiple-output\"-context. Closing appropriate files and resetting the output pool")
        foreach(closefile, out_fds)
        empty!(out_fds)
    end
end

withmultipleoutput (generic function with 1 method)

In [10]:
# Report, via mprintln, that a file has been written to the path `fn`.
# The leading carriage return makes the message start at the beginning of the
# current terminal line (which may hold a progress counter).
fw_msg(fn) = mprintln("\rFile written to path:\n  '$fn'")

fw_msg (generic function with 1 method)

In [11]:
# Rate-limited `print`, used for the progress output in load_data.
# Flux.throttle(f, timeout) wraps `f` so that it fires at most once per `timeout`
# (100 here; Flux documents the unit as seconds).
tprint = Flux.throttle(print, 100)

(::Flux.var"#throttled#122"{Flux.var"#throttled#118#123"{Bool, Bool, typeof(print), Int64}}) (generic function with 1 method)



In [12]:
# assign(dct, key, def)
#
# Look up `key` in `dct`; if it is absent, try `Symbol(key)` instead, and if that is
# absent too return the default `def`. This lets hyperparameters be given with either
# string or symbol keys.
function assign(dct, key, def)
    haskey(dct, key) && return dct[key]
    symkey = Symbol(key)
    return haskey(dct, symkey) ? dct[symkey] : def
end

            

assign (generic function with 1 method)

# Data loading functions

In [13]:
# keyrecord(x)
#
# Parse one line of the label file, "<integer key>,<number>,<number>,...", into the
# pair (key::Int32, values::Vector{Float32}).
# Lines that do not have this form (e.g. a header line) yield the sentinel record
# (Int32(0), Float32[0.0, 0.0]); load_data removes the key 0 after reading the file.
function keyrecord(x)
    fields = split(chomp(x), ",")
    # tryparse returns `nothing` instead of throwing, so no blanket try/catch
    # (which would also swallow unrelated errors) is needed.
    key = tryparse(Int32, first(fields))
    values = map(field -> tryparse(Float32, field), fields[2:end])
    if isnothing(key) || any(isnothing, values)
        return (Int32(0), Float32[0.0, 0.0])
    end
    return (key, convert(Vector{Float32}, values))
end

keyrecord (generic function with 1 method)

In [14]:
# load_data is defined inside a `let` block so that `partition` and `vol_file_list` act as
# private state that survives between calls: successive calls return the training, validation
# and test partitions in that order, after which the cycle starts over with training.
let partition = :train, vol_file_list = nothing
    global load_data
    # load_data(data_dir, label_file) -> (X_trans, X_coron, X_sagit, y)
    #
    # Loads the next partition (see above) of the data set.
    #   data_dir   : directory with the *.nii.gz volumes; projections are cached as .npy
    #                files in the sub-directory ".projections", which is created if missing
    #   label_file : text file parsed line by line with keyrecord
    # Returns three 4-d Float32 arrays whose last dimension indexes the subject and whose third
    # dimension holds two channels (mean and standard deviation of the volume along one axis),
    # plus `y`, a 2 × subjects Float32 matrix of labels.
    function load_data(data_dir::String, label_file::String)
        if !isdir(joinpath(data_dir, ".projections"))
            mkdir(joinpath(data_dir, ".projections"))
        end
        # Map subject id => label vector; unparsable lines all end up under key 0, which is dropped.
        labels = Dict(Iterators.map(keyrecord, eachline(label_file)))
        delete!(labels, 0)
        list_length = length(labels)
        # NOTE(review): the partition boundaries below are computed from the number of labels but
        # are used as indices into vol_file_list — this assumes one volume file per label; confirm.
        # NOTE(review): the comment on the next lines mentions a 56/12/32 split, but the code uses
        # the fractions 0.70 and 0.85, i.e. 70/15/15 — the comment looks outdated; confirm.
        if partition == :train
            start_p = Int32(1)                          # NB CHANGE BACK FOR "PRODUCTION RUNS" (I think...)
            stop_p = round(Int32, list_length * 0.70f0) # 220908 Changing the split from 70/15/15 for
            no_of_subs = stop_p                         # faster prototyping. New split  56/12/32
            # The directory is only listed here; the validation and test calls reuse this list.
            vol_file_list = collect(Iterators.filter(f -> contains(f, r".nii.gz$"), readdir(data_dir)))
        end
        if partition == :validation
            start_p = round(Int32, list_length * 0.70f0) + Int32(1)
            stop_p = round(Int32, list_length * 0.85f0)
            no_of_subs = stop_p - start_p + Int32(1)
        end
        if partition == :test
            start_p = round(Int32, list_length * 0.85f0) + Int32(1)
            stop_p = round(Int32 ,list_length)
            no_of_subs = stop_p - start_p + Int32(1)
        end
        # Output buffers: (rows, columns, channel, subject). The fixed sizes presumably match the
        # dimensions of the volumes in the data set — TODO confirm.
        X_trans = zeros(Float32, 256, 256, 2, no_of_subs)
        X_coron = zeros(Float32, 208, 256, 2, no_of_subs)
        X_sagit = zeros(Float32, 208, 256, 2, no_of_subs)
        y = zeros(Float32, 2, no_of_subs)

        # Each iteration only writes to the slices belonging to its own vol_no.
        # NOTE(review): tprint is one shared throttled closure called from several threads.
        Threads.@threads for (vol_no, fn) ∈ collect(enumerate(vol_file_list[start_p : stop_p]))
            tprint("\r$vol_no       ")
            # The subject id is the run of digits following "T1_" in the file name.
            sub_id = parse( Int32, (match(r"T1_(\d+)_", fn)).captures[1] )
            @inbounds y[:, vol_no] = labels[sub_id]
            # The volume is read lazily: only when at least one cached projection is missing.
            vol = nothing
            flush(stdout)
            
            # Projection along dimension 1 (stored in X_trans): use the cached file if present,
            # otherwise compute mean and std along that dimension and write the cache file.
            if isfile(joinpath(data_dir, ".projections/$fn.tr.npy"))
                xt = NPZ.npzread(joinpath(data_dir, ".projections/$fn.tr.npy"))
                @inbounds X_trans[:, :, :, vol_no] = xt
            else
                vol = NIfTI.niread(joinpath(data_dir, fn)).raw
                xt = dropdims(cat(Statistics.mean(vol, dims=1), Statistics.std(vol, dims=1), dims=4), dims = 1)
                @inbounds X_trans[:, :, :, vol_no] = xt
                NPZ.npzwrite(joinpath(data_dir, ".projections/$fn.tr.npy"), X_trans[:, :, :, vol_no])
            end

            # Same for the projection along dimension 2 (stored in X_coron).
            if isfile(joinpath(data_dir, ".projections/$fn.co.npy"))
                xc = NPZ.npzread(joinpath(data_dir, ".projections/$fn.co.npy"))
                @inbounds X_coron[:, :, :, vol_no] = xc
            else
                if isnothing(vol)
                    vol = NIfTI.niread(joinpath(data_dir, fn)).raw
                end
                xc = dropdims(cat(Statistics.mean(vol, dims=2), Statistics.std(vol, dims=2), dims=4), dims = 2)
                @inbounds X_coron[:, :, :, vol_no] = xc
                NPZ.npzwrite(joinpath(data_dir, ".projections/$fn.co.npy"), X_coron[:, :, :, vol_no])
            end

            # Same for the projection along dimension 3 (stored in X_sagit).
            if isfile(joinpath(data_dir, ".projections/$fn.sa.npy"))
                xs = NPZ.npzread(joinpath(data_dir, ".projections/$fn.sa.npy"))
                @inbounds X_sagit[:, :, :, vol_no] = xs
            else
                if isnothing(vol)
                    vol = NIfTI.niread(joinpath(data_dir, fn)).raw
                end
                xs = dropdims(cat(Statistics.mean(vol, dims=3), Statistics.std(vol, dims=3), dims=4), dims = 3)
                @inbounds X_sagit[:, :, :, vol_no] = xs
                NPZ.npzwrite(joinpath(data_dir, ".projections/$fn.sa.npy"), X_sagit[:, :, :, vol_no])
            end
            #= This check is obviously inappropriate for a multithreaded solution
            if vol_no == no_of_subs
                break
            end
            =#
        end #for statement
        # Advance the state so that the next call loads the next partition.
        if partition == :train
            partition = :validation
        elseif partition == :validation
            partition = :test
        else
            partition = :train
        end
        return (X_trans, X_coron, X_sagit, y)
    end #function load_data
end #let statement

load_data (generic function with 1 method)

# Model loading functions

In [15]:
# canread(path) -> Bool
#
# True when the current process has read permission for `path`, as reported by the
# C library function access(2) with mode R_OK (the value 4).
# The symbol is looked up without naming a library, so the check does not depend on
# an unrelated shared library being installed. POSIX only.
function canread(path::String)::Bool
    return ccall(:access, Cint, (Cstring, Cint), path, 4) == 0
end

canread (generic function with 1 method)

In [16]:
# load_model(fn)
#
# Load a model definition from `fn`, selected by the (case-insensitive) file suffix:
#   .jl   : the file is included into this module and the global `make_my_model`
#           (expected to be defined by the included file) is returned
#   .bson : `BSON.@load fn model` is run; with the `global model` declaration this is
#           presumably meant to fill the global variable `model` — verify
# NB! No longer returns the model
# Throws an ArgumentError for any other file name.
function load_model(fn)
    global model
    mo = match(r"\.(bson|jl)$"i, fn)
    # Without this guard a non-matching name would fail on `mo[1]` with an
    # unrelated error before any meaningful message could be produced.
    if isnothing(mo)
        throw(ArgumentError("Cannot load a model from '$fn': expected a file name ending in .jl or .bson"))
    end
    sfx = lowercase(mo[1])
    if sfx == "jl"
        Base.include(@__MODULE__, fn)
        return make_my_model
    else
        # The regular expression only admits "jl" and "bson", so this is the bson case.
        BSON.@load fn model
    end
end

load_model (generic function with 1 method)

In [17]:
# save_weights(x, mpath)
#
# Store weights in `model_dir` as "model_<basename(mpath)>_<timestamp>_weights.bson",
# together with `mpath` and the timestamp, and report the written path with fw_msg.
# `x` may be a vector of weights, a Zygote.Params collection, or anything else, which is
# then treated as a model.

# Worker method: moves the weights to the CPU and writes the file.
function save_weights(weights::Vector{Any}, mpath)
    weights = Flux.cpu(weights)
    ts = Dates.format(Dates.now(), "yyyy-mm-dd-HH-MM-SS")
    mn = basename(mpath)
    target = model_dir * "/model_$(mn)_$(ts)_weights.bson"
    # The variable names are used as the keys in the BSON file.
    BSON.@save target weights mpath ts
    fw_msg(target)
end

# Parameter collections are turned into a plain vector first.
save_weights(weights::Zygote.Params, mpath) = save_weights(collect(weights), mpath)

# A model has a very complicated type signature, if we don't know what it is we assume it's a model.
save_weights(model, mpath) = save_weights(Flux.params(model), mpath)

# Read a weights file as written by save_weights and return only the weights;
# the stored model path and timestamp are loaded but not used.
function load_weights(fn)
    BSON.@load fn weights mpath ts
    return weights
end

# Read a weights file, load the model it refers to, copy the weights into that model and
# return the model after moving it to the GPU.
# NOTE(review): load_model returns `make_my_model` for .jl files (and states that it no longer
# returns the model), so `model` below may be a constructor function rather than a model
# instance — verify that this function still works with the current load_model.
function load_model_and_weights(fn)
    # This function takes the file name of a file with weights. It is assumed that this file also correctly
    # can identify which model it uses and reference the file from which to load it.
    BSON.@load fn weights mpath ts
    model = load_model(mpath)
    Flux.loadparams!(model, weights)
    model = model |> Flux.gpu
    return model
end

# save_model(mpath, m = nothing, rdir = nothing)
#
# Save a complete model in the new ".mdl.bson" format.
#   mpath : path of the model definition; only its basename is used, as part of the file name
#   m     : the model to save; when `nothing`, the global `model` is used
#   rdir  : directory to write to; when `nothing`, nothing is written
#           (saving to `model_dir` is currently disabled)
# The file is named "model_<basename(mpath)>_<timestamp>.mdl.bson" and stores the model,
# moved to the CPU, under the key :model. The written path is reported with fw_msg.
# Throws UndefVarError(:model) when `m` is `nothing` and no global `model` exists.
function save_model(mpath, m = nothing, rdir = nothing)
    if isnothing(m)
        if @isdefined model
            m = model
        else
            # UndefVarError takes the variable name as a Symbol, not a message string.
            throw(UndefVarError(:model))
        end
    end
    # Without a target directory there is nothing to write, so skip the copy to the CPU.
    isnothing(rdir) && return nothing
    m = m |> Flux.cpu
    ts = Dates.format(Dates.now(), "yyyy-mm-dd-HH-MM-SS")
    mn = basename(mpath)
    target = rdir * "/model_$(mn)_$(ts).mdl.bson"
    BSON.bson(target, Dict(:model => m))
    fw_msg(target)
end

save_model (generic function with 3 methods)

# Training functions

In [18]:
# Summing a Flux.Zeros placeholder gives a Float32 zero, with or without a mapping function.
# NOTE(review): these methods extend a Base function for a type owned by Flux.
Base.sum(::Flux.Zeros) = 0.0f0

Base.sum(::Function, ::Flux.Zeros) = 0.0f0

# Mean squared error between the model output for the three projections of `batch` and the
# first label row, plus the model penalty.
# Not used right now because it is probably more convenient to keep the penalty separate
function my_loss(m, batch; kwargs...)
    prediction = m((batch.d₁, batch.d₂, batch.d₃))
    target = reshape(batch.l[1, :], 1, :)
    return Flux.mse(prediction, target) + penalty(m)
end

# Mean squared error between the model output and the first label row (reshaped to 1 × batch);
# each projection is passed through the global `aug` before it is fed to the model.
# Keyword arguments are accepted but not used.
function age_loss(m, batch; kwargs...)
    inputs = (aug(batch.d₁), aug(batch.d₂), aug(batch.d₃))
    target = reshape(batch.l[1, :], 1, :)
    return Flux.mse(m(inputs), target)
end

age_loss (generic function with 1 method)

In [19]:
# get_counter_funcs() -> (count, reset_count)
#
# Create a pair of closures sharing one private counter that starts at 1:
#   count()       prints the current value through a print throttled with a timeout of 5
#                 (overwriting the current terminal line), then increments the counter
#   reset_count() sets the counter back to 1
# Every call of get_counter_funcs creates an independent counter.
function get_counter_funcs()
    c = 1
    tprint = Flux.throttle(print, 5)
    count = function ()
        tprint("\r$c                                                                                 ")
        c += 1
    end
    reset_count = () -> (c = 1)
    return (count, reset_count)
end

get_counter_funcs (generic function with 1 method)

In [20]:
# mclean() and mdeepclean() are meant to be used as training callbacks that trigger a garbage
# collection now and then: mclean at most once per 37 and mdeepclean (explicitly full
# collection) at most once per 1031 time units of Flux.throttle (documented as seconds).
#
# The throttled functions are created once and kept in the `let` block. Creating them inside
# mclean/mdeepclean would only build a new throttled closure on every call without ever
# invoking it, so no collection would take place.
let throttled_gc = Flux.throttle(GC.gc, 37),
    throttled_full_gc = Flux.throttle(() -> GC.gc(true), 1031)

    global mclean, mdeepclean

    function mclean()
        return throttled_gc()
    end

    function mdeepclean()
        return throttled_full_gc()
    end
end

mdeepclean (generic function with 1 method)

In [21]:

# Fallback method: anything for which Base defines no isinteger method (e.g. `nothing`)
# counts as "not an integer" instead of raising a MethodError.
# NOTE(review): this adds a catch-all method to a Base function for every type.
import Base: isinteger
isinteger(x) = false

isinteger (generic function with 10 methods)

In [22]:
# my_train!(model, loss, penalty, ps, data, opt; cb = () -> (), nob = nothing, kwargs...)
#
# Run one pass over `data`, updating the parameters `ps` with the optimiser `opt`.
#   model   : the model being trained
#   loss    : called as loss(model, batch...; kwargs...)
#   penalty : called as penalty(model); its value is added to the loss before differentiation
#   ps      : the trainable parameters (anything Zygote.Params accepts)
#   data    : iterable of batches
#   opt     : optimiser passed to Flux.update!
#   cb      : callback or collection of callbacks, run after every batch
#   nob     : expected number of batches (optional); only used as a capacity hint
#   kwargs  : passed on to the loss function unchanged
# A callback may throw Flux.Optimise.StopException to end the pass or
# Flux.Optimise.SkipException to abandon the current batch.
#
# Returns Dict(:loss => losses, :metrics => Dict(:penalty => penalties)), where both
# collections have one entry per batch that completed its gradient step. Batches that
# were skipped or never reached leave no entry behind.
function my_train!(model, loss, penalty, ps, data, opt; cb = () -> (), nob = nothing, kwargs...)
    # Note that the kwargs are only there to pass directly to the loss function
    ps = Zygote.Params(ps)
    cb = Flux.Optimise.runall(cb)
    losses = Float32[]
    nob isa Integer && sizehint!(losses, nob)
    penalties = []
    history = Dict(:loss => losses, :metrics => Dict(:penalty => penalties))
    for d in data
        try
            # The loss value is carried out of the differentiated closure through a
            # dedicated local, so it cannot be confused with the history vector.
            local batch_loss
            # Below each batch is loaded separately onto the gpu as part of a hard-coded solution for augmentation
            gs = Zygote.gradient(ps) do
                batch_loss = loss(model, Flux.Optimise.batchmemaybe(d |> Flux.gpu)...; kwargs...) + penalty(model)
            end
            push!(losses, batch_loss)
            push!(penalties, penalty(model))
            Flux.update!(opt, ps, gs)
            cb()
        catch ex
            if ex isa Flux.Optimise.StopException
                break
            elseif ex isa Flux.Optimise.SkipException
                continue
            else
                rethrow()
            end
        end
    end
    return history
end

my_train! (generic function with 1 method)

# Main procedure

In [23]:
# Parse the emulated command line; the result is a Dict keyed by the long option names
# (e.g. "training-gpu", "model", "data-dir").
myargs = ArgParse.parse_args(cli_args, settings)

Dict{String, Any} with 5 entries:
  "label-file"    => "/flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan_csv…
  "data-dir"      => "/flush/common/UKbiobank/T1_dset/struc_brain_HCC_Johan"
  "secondary-gpu" => nothing
  "model"         => "model_isotropic_02.jl"
  "training-gpu"  => 0

In [24]:
# Make the GPU chosen with -g/--training-gpu the active CUDA device.
CUDA.device!(myargs["training-gpu"])

CuDevice(0): Quadro RTX 8000

In [25]:
# load_data is stateful: the first call returns the training partition and the second call
# the validation partition, so the order of these two lines matters. (A third call would
# return the test partition.)
X_trans, X_coron, X_sagit, y = load_data(data_dir, label_file)
X_trans_valid, X_coron_valid, X_sagit_valid, y_valid = load_data(data_dir, label_file);

18633       2261       10171       3391       1131       6216       5086       19761       14121       15813       

In [26]:
include("augment.jl")

get_augment_loader (generic function with 1 method)

In [34]:
pl1, pl2 = getpipeline("S X Y R E")
# So far, on-the-fly augmentation has turned out to be impractical.
# Instead I use the above to acquire an augmentation pipeline (using Scale, XShear, YShear, Rotation and Elastic deformation).
# No numerical parameters means that I get (my) default values.
# Below I make a train_loader using said pipeline (really two pipelines, to accommodate the different dimensions of the projections).
# NOTE(review): the meaning of the argument 4 is defined by get_augment_loader in augment.jl — see there.
train_loader = get_augment_loader((d₁ = X_trans, d₂ = X_coron, d₃ = X_sagit, l = y), 4, pl1, pl2)
# The validation data is served unshuffled in batches of 32.
valid_loader = Flux.DataLoader((d₁ = X_trans_valid, d₂ = X_coron_valid, d₃ = X_sagit_valid, l = y_valid) , batchsize = 32, shuffle = false);


Found parameter group 'S'
Exiting inner augmentation parsing loop, rest of group configuration string is ''
Found parameter group 'X'
Exiting inner augmentation parsing loop, rest of group configuration string is ''
Found parameter group 'Y'
Exiting inner augmentation parsing loop, rest of group configuration string is ''
Found parameter group 'R'
Exiting inner augmentation parsing loop, rest of group configuration string is ''
Found parameter group 'E'
Exiting inner augmentation parsing loop, rest of group configuration string is ''
Exiting outer augmentation parsing loop, rest of configuration string is ''
Scale parameters not understood ('Float32[]'), using default range 1.01:0.05:1.2
Parameters for shearing along X-axis not understood ('Float32[]'), using default range -5.0:5.0
Parameters for shearing along Y-axis not understood ('Float32[]'), using default range -5.0:5.0
Rotation parameters not understood ('Float32[]'), using default range -5.0:5.0
Parameters for elastic distortio

In [27]:
# Two independent progress counters. bcount/reset_bcount are used in train_and_evaluate
# (bcount as a per-batch callback, reset_bcount after each epoch).
# NOTE(review): tcount/reset_tcount are not used in the visible part of the notebook.
bcount, reset_bcount = get_counter_funcs()
tcount, reset_tcount = get_counter_funcs()

(var"#count#17"{Flux.var"#throttled#122"{Flux.var"#throttled#118#123"{Bool, Bool, typeof(print), Int64}}}(Core.Box(1), Flux.var"#throttled#122"{Flux.var"#throttled#118#123"{Bool, Bool, typeof(print), Int64}}(Flux.var"#throttled#118#123"{Bool, Bool, typeof(print), Int64}(true, false, print, 5, Core.Box(nothing), Core.Box(nothing), Core.Box(true)))), var"#reset_count#18"(Core.Box(1)))

In [28]:
# Training length, read as globals by train_and_evaluate: training runs in `no_of_k_epochs`
# rounds of `no_of_epochs` epochs each; after every round the loss curves are plotted and the
# model is saved.
no_of_epochs = 100
no_of_k_epochs = 4

4

In [29]:
# Some default behaviors in the absence of certain features
penalty(l) = 0 # When not using my custom convolution layer with advanced regularization
aug = identity # When not using augmentation
# include("models/model_01_reg_conv.jl")

identity (generic function with 1 method)

In [30]:
include("changelayers.jl")

true

In [35]:
# train_and_evaluate(mpath, arguments, modifications, hyperparameters; identifier=nothing)
#
# Build a model, train it and record the results.
#   mpath           : model file passed to load_model; the value returned by load_model is
#                     called as a constructor, which matches what load_model returns for .jl files
#   arguments       : positional arguments, splatted into the model constructor
#   modifications   : functions applied to the freshly built model (each is called with the model)
#   hyperparameters : dictionary with string or symbol keys; "lr" (learning rate, default 0.003)
#                     is read here, and all entries become part of the default run name
#   identifier      : optional run name used in directory and file names instead of the default
#
# Uses the globals train_loader, valid_loader, no_of_epochs, no_of_k_epochs, result_dir,
# penalty, bcount, reset_bcount, mclean and mdeepclean.
#
# A directory "results_<name>_<timestamp>_<lr>" is created in result_dir. It receives the
# training log, a loss plot and a saved model per round of epochs, a saved model whenever the
# validation loss improves on the best value so far, a plot of the complete loss curves and a
# BSON file holding those curves.
#
# Exceptions raised while building or training the model are reported together with a stack
# trace and are not propagated, except for InterruptException, which is rethrown. A summary
# line ("Works" or "Crashes (...)") is always printed at the end. The function has no
# meaningful return value.
function train_and_evaluate(mpath, arguments, modifications, hyperparameters; identifier=nothing)
    best_val_loss = 30 # arbitrary cutoff for saving models based on val loss
    # IJulia only exists inside a notebook kernel; the guard lets the function run from a
    # plain Julia process as well.
    isdefined(Main, :IJulia) && Main.IJulia.set_max_stdio(1 << 25)
    aecs = ("\e[7m", "\e[27m")
    make_my_model = load_model(mpath)
    lr = assign(hyperparameters, "lr", 0.003)
    # drate = assign(hyperparameters, "drate", 0.25) this is best done with modifications
    result="\e[38;2;0;255;0mWorks\e[39m"
    try
        model = make_my_model(arguments...;) |> Flux.gpu
        for m ∈ modifications
            println("Attempting to modify the model on the fly with $m $(supertypes(typeof(m)))")
            Base.invokelatest(m, model) # these could use changelayers from changelayers.jl but also do more drastic changes
        end
        if isnothing(identifier)
            mn = basename(mpath) * "_" * join(map(k -> "$(k)=$(hyperparameters[k])", sort(collect(keys(hyperparameters)))), '_')
        else
            mn = identifier
        end

        # Flux.loadparams!(model, ps_init |> Flux.gpu)
        ps = Flux.params(model)
        opt = Flux.ADAM(lr, (0.9, 0.999))
        global_val_loss_curve = Vector{Float32}()
        global_train_loss_curve = Vector{Float32}()
        ts = Dates.format(Dates.now(), "yyyy-mm-dd-HH-MM-SS")
        rdir = joinpath(result_dir, "results_$(mn)_$(ts)_$(lr)")
        if !isdir(rdir)
            mkdir(rdir)
        end
        withmultipleoutput(joinpath(rdir, "train_log_$ts")) do
            mprintln(repr("text/plain", model))
            for k_epoch ∈ 1:no_of_k_epochs
                # One slot per validation batch. Taking the count from the loader itself keeps
                # it correct for every data set size and batch size, so the mean below is never
                # diluted by a surplus zero entry.
                val_losses = zeros(length(valid_loader))
                val_loss_curve = Array{Float32,1}(undef, no_of_epochs)
                train_loss_curve = Array{Float32,1}(undef, no_of_epochs)
                for ep ∈ 1:no_of_epochs
                    Flux.trainmode!(model, true)
                    h = my_train!(model, age_loss, penalty, ps, train_loader, opt; cb = [bcount, mclean, mdeepclean ])
                    reset_bcount()
                    train_loss_curve[ep] = Statistics.mean(h[:loss])
                    mprint("\rEpoch$(k_epoch).$(ep): running against validation set")
                    Flux.testmode!(model, true)
                    for (i,b) in enumerate(valid_loader)
                        val_losses[i] = age_loss(model, b |> Flux.gpu) # This bit ought to be harmless but isn't needed any more
                    end
                    mprint("\r                                                     ")
                    val_loss_curve[ep] = Statistics.mean(val_losses)
                    # Keep a copy of every model that improves on the best validation loss so far.
                    if val_loss_curve[ep] < best_val_loss
                        best_val_loss = val_loss_curve[ep]
                        save_model(mpath * "_val_loss=$(best_val_loss)@epoch_$(ep)", model, rdir)
                    end
                    ts = Dates.format(Dates.now(),"yyyy-mm-dd@HH:MM:SS")
                    # NOTE(review): with no_of_epochs = 100 this condition is never true, so no
                    # per-epoch summary is printed — confirm that this is intended.
                    if ep % 200 == 0
                        mprintln("\r$ts: This is epoch $((k_epoch - 1) * no_of_epochs + ep) and validation loss was $(val_loss_curve[ep]) while training loss was $(train_loss_curve[ep])")
                    end
                end
                Plots.plot(train_loss_curve)
                Plots.plot!(val_loss_curve)
                ts = Dates.format(Dates.now(), "yyyy-mm-dd-HH-MM-SS")
                Plots.savefig(rdir * "/k_epoch_plot_$(k_epoch)_$(mn)_$(ts)_$(lr).png")
                fw_msg(rdir * "/k_epoch_plot_$(k_epoch)_$(mn)_$(ts)_$(lr).png")
                save_model(mpath, model, rdir)
                append!(global_train_loss_curve, train_loss_curve)
                append!(global_val_loss_curve, val_loss_curve)
            end
            Plots.plot(global_train_loss_curve)
            Plots.plot!(global_val_loss_curve)
            ts = Dates.format(Dates.now(), "yyyy-mm-dd-HH-MM-SS")
            Plots.savefig(rdir * "/global_plot_$(mn)_$(ts)_$(lr).png")
            fw_msg(rdir * "/global_plot_$(mn)_$(ts)_$(lr).png")
            mprintln("\nPreparing to save accumulated learning data")
            gt_min = minimum(global_train_loss_curve)
            gt_end = global_train_loss_curve[end]
            gv_min = minimum(global_val_loss_curve)
            gv_end = global_val_loss_curve[end]
            mprintln("Saving accumulated learning data")
            BSON.@save (rdir * "/global_loss_$(mn)_$(ts)_$(lr)___$(gv_min)_$(gv_end)_$(gt_min)_$(gt_end).bson") global_train_loss_curve global_val_loss_curve
            fw_msg(rdir * "/global_loss_$(mn)_$(ts)_$(lr)___$(gv_min)_$(gv_end)_$(gt_min)_$(gt_end).bson")
            mprintln("Accumulated learning data saved")
        end

    catch e
        result="\e[38;2;255;0;0mCrashes ($(typeof(e)))\e[39m "*repr(MIME("text/plain"), e)
        mprintln("\n",e,"\n")
        flush(stdout)
        for sf in stacktrace(catch_backtrace())
            display(sf)
            mprintln()
            flush(stdout)
        end
        # An interrupt must stop the surrounding job loop as well.
        if e isa InterruptException
            rethrow()
        end
    finally
        mprintln("\r", "", "\t", result)
        mprint("\n\e[38;2;0;0;0;48;2;255;255;0m")
        for i ∈ 0:11
            mprint(" ▁▂▃▄▅▆▇█" * aecs[1 + i % 2])
        end
        mprintln("\e[0m\n")
    end

end # end train_and_evaluate


train_and_evaluate (generic function with 1 method)

In [36]:
# include("hpiterator.jl")

In [37]:
# Locate the character position at which `line` stops following the run file syntax.
# Prints the same diagnostics as before ("mo = ...", "egr = ...") and returns 1 when no
# position can be determined.
function _syntax_error_position(line)
    mo = match(r"^(\d+)?(\s+)?(args)?(\s*)(=)?(\s*)(.+?\))?(\s*)(kwargs)?(\s*)(=)?(\s*)(.*)(\s*)($)?", line)
    isnothing(mo) && return 1
    println("mo = $mo")
    # The first group that did not take part in the match marks the first missing element.
    egr = findfirst(gr -> isnothing(mo[gr]), 1:13)
    isnothing(egr) || println("egr = $egr")
    if isnothing(egr) || egr == 1
        return 1
    end
    return mo.offsets[egr - 1] + length(mo[egr - 1])
end

# runfromfile(fn) -> Bool
#
# This function takes the name of a file
# It runs ONE training from the file if it finds one suitable
# and deletes the description of that training from the file
# The trainings are described as a line containing:
# <gpu> args=<arguments> kwargs=<keyword arguments>
# where <gpu> is the device number (typically 0, 1 etc)
# args and kwargs are passed to the train_and_evaluate function
#
# Returns `false` when the file holds no line starting with a number, `true` after a
# training has been run. Throws an ErrorException, pointing out the line number and the
# offending position, when the selected line does not follow the syntax; the line has
# already been removed from the file at that point.
#
# SECURITY NOTE: the args and kwargs parts of the line are executed as Julia code, so the
# run file must come from a trusted source.
function runfromfile(fn)
    lines = open(readlines, fn, "r")
    # the file is read and closed but a race condition can exist if a processes read the file
    # before another one has closed it. This can lead to repeating runs or perhaps dropping some.
    # This simply has to be monitored afterwards.
    forbiddenkeywords = [:aug, :augmentation, "aug", "augmentation", ] # things that might show up in the run file but should be dealt with elsewhere

    # The first line starting with a number is the job to run (any job, whatever its gpu number).
    cur = findfirst(l -> occursin(r"^(\d+)\D+", l), lines)
    isnothing(cur) && return false

    remaining = vcat(lines[1:cur-1], lines[cur+1:end])
    open(f -> println(f, join(remaining, "\n")), fn, "w")
    # now the file is written and closed and the race condition over.
    line = lines[cur]
    mo = match(r"^(\d+)\s+args\s*=\s*(.*?)\s*kwargs\s*=\s*(.*)\s*$", line)
    println("mo = $mo")
    if isnothing(mo)
        ei = _syntax_error_position(line)
        markerline = "\e[1;38;5;255;255;0m" * "─" ^ (ei - 1) * "\e[5m⬏\e[0m"
        # `cur` is the line number of the offending line within the file as it was read.
        throw(ErrorException("Line $cur in $fn does not conform to the syntax.\n\n$line\n$markerline"))
    end
    # See the security note above: the text is evaluated as code.
    args = eval(Meta.parse(mo[2]))
    kwargs = eval(Meta.parse(mo[3]))
    for kw in forbiddenkeywords
        delete!(kwargs, kw)
    end
    println("Attempting to run train_and_evaluate(args...; kwargs...) with\n    args   = $args\n    kwargs = $kwargs")
    train_and_evaluate(args...; kwargs...)
    return true
end

runfromfile (generic function with 1 method)

In [None]:
# Job loop: runs trainings described in the file "runs221108", one per iteration.
# `toc` holds the duration of the previous iteration in nanoseconds (time_ns); the initial
# Inf64 guarantees the first iteration. The loop continues only while the previous iteration
# took longer than 6000000000000 ns (= 6000 s, i.e. 100 minutes). An iteration in which
# runfromfile finds no job therefore ends the loop.
# NOTE(review): a training that completes in less than 6000 s also ends the loop — confirm
# that this is intended.
toc = Inf64
while toc > 6000000000000
    tic = time_ns()
    # below is an attempt at a workaround for some hard to understand world age / CUDA interactions
    # The first line of the run file is searched for a double-quoted string, which is taken to be
    # the model file name and loaded here, at top level, before runfromfile loads it again.
    nextline = readline("runs221108")
    mo = match(r"\"([^\"]+)\"", nextline)
    if ! isnothing(mo)
        mfn = mo[1]
        load_model(mfn)
    end
    # End of workaround
    runfromfile("runs221108")
    toc = time_ns() - tic
    println("toc = $toc ($(round(Int32, toc / 10 ^ 9)) s)\n\n")
end

In [39]:
# Replace the augmenting train_loader by a plain Flux.DataLoader over the same arrays
# (taken from the `data` field of the current loader), shuffled, in batches of 32.
# train_and_evaluate reads the global train_loader, so subsequent runs train without
# the augmentation pipeline.
tmp = Flux.DataLoader((d₁ = train_loader.data[:d₁], d₂ = train_loader.data[:d₂], d₃ = train_loader.data[:d₃], l = train_loader.data[:l]) , batchsize = 32, shuffle = true);
train_loader = tmp

MLUtils.DataLoader{NamedTuple{(:d₁, :d₂, :d₃, :l), Tuple{Array{Float32, 4}, Array{Float32, 4}, Array{Float32, 4}, Matrix{Float32}}}, Random._GLOBAL_RNG, Val{nothing}}((d₁ = Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

...

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]

Float32[0.0 0.0 … 0.0

In [None]:
# A single training started by hand, with the arguments in the same form as runfromfile
# would produce them: args = (model file, constructor arguments, modifications,
# hyperparameters) and kwargs holding the run identifier.
let args   = ("models/model_channel_toggle.jl", [2, 4, 6], (), Dict{Any, Any}()), kwargs = Dict(:identifier => "model_channel_toggle.jl_channels=2.4.6")
    train_and_evaluate(args...; kwargs...)
end

Has positional arguments
Looks like a collection
Is a set of channels
Channel model called with:
args:
  1: 2
  2: 4
  3: 6

kwargs:
Building model with channels 2, 4 and 6
Received request to open /home/johjo50/3Dto2D/v1/results/results_model_channel_toggle.jl_channels=2.4.6_2022-11-09-10-04-51_0.003/train_log_2022-11-09-10-04-51
File opened, returning filehandle
Entering "with-multiple-output"-context.
Chain(
  Parallel(
    var"#99#115"(),
    Chain(
      var"#89#104"(),
      Conv((3, 3), 1 => 4, σ, pad=1),   # 40 parameters
      Conv((3, 3), 4 => 4, pad=1, stride=2, bias=false),  # 144 parameters
      BatchNorm(4, σ),                  # 8 parameters, plus 8
      Dropout(0.2),
      Conv((3, 3), 4 => 8, σ, pad=1),   # 296 parameters
      Conv((3, 3), 8 => 8, pad=1, stride=2, bias=false),  # 576 parameters
      BatchNorm(8, σ),                  # 16 parameters, plus 16
      Dropout(0.2),
      Conv((3, 3), 8 => 16, σ, pad=1),  # 1_168 parameters
      Conv((3, 3), 16 => 16, p

  '/home/johjo50/3Dto2D/v1/results/results_model_channel_toggle.jl_channels=2.4.6_2022-11-09-10-04-51_0.003/k_epoch_plot_1_model_channel_toggle.jl_channels=2.4.6_2022-11-10-09-25-27_0.003.png'
File written to path:
  '/home/johjo50/3Dto2D/v1/results/results_model_channel_toggle.jl_channels=2.4.6_2022-11-09-10-04-51_0.003/model_model_channel_toggle.jl_2022-11-10-09-25-43.mdl.bson'
742                                                                                  

In [None]:
results_model_channel_toggle.jl_channels=2.4.6_2022-11-09-10-04-51_0.003/k_epoch_plot_1_model_channel_toggle.jl_channels=2.4.6_2022-11-10-09-25-27_0.003.png