# Generate ROI matches file

This notebook converts ANTSUN output `jld2` data dictionaries into the `roi_match.h5` file used in the `AutoCellLabeler_freely_moving` notebook. The `roi_match.h5` file contains information matching the freely-moving ROIs to the immobilized ROIs for the corresponding datasets.

In [1]:
using JLD2, FlavellBase, ImageDataIO, Glob, ProgressMeter, HDF5, Plots

## Define datasets and file paths

In [8]:
# Dataset identifiers grouped by project. Each identifier is later used to build the
# `<dataset>_output` directory name under that project's processed-data root.
# (Identifiers look like `YYYY-MM-DD-NN` recording IDs -- TODO confirm the convention.)

# NeuroPAL project datasets
datasets_prj_neuropal = ["2022-07-15-06", "2022-07-15-12", "2022-07-20-01", "2022-07-26-01", "2022-08-02-01", "2023-01-23-08", "2023-01-23-15", "2023-01-23-21", "2023-01-19-08", "2023-01-19-22", "2023-01-09-28", "2023-01-17-01", "2023-01-19-15", "2023-01-23-01", "2023-03-07-01", "2022-12-21-06", "2023-01-05-18", "2023-01-06-01", "2023-01-06-08", "2023-01-09-08", "2023-01-09-15", "2023-01-09-22", "2023-01-10-07", "2023-01-10-14", "2023-01-13-07", "2023-01-16-01", "2023-01-16-08", "2023-01-16-15", "2023-01-16-22", "2023-01-17-07", "2023-01-17-14", "2023-01-18-01"]
# RIM project datasets
datasets_prj_rim = ["2023-06-09-01", "2023-07-28-04", "2023-06-24-02", "2023-07-07-11", "2023-08-07-01", "2023-06-24-11", "2023-07-07-18", "2023-08-18-11", "2023-06-24-28", "2023-07-11-02", "2023-08-22-08", "2023-07-12-01", "2023-07-01-09", "2023-07-13-01", "2023-06-09-10", "2023-07-07-01", "2023-08-07-16", "2023-08-22-01", "2023-08-23-23", "2023-08-25-02", "2023-09-15-01", "2023-09-15-08", "2023-08-18-18", "2023-08-19-01", "2023-08-23-09", "2023-08-25-09", "2023-09-01-01", "2023-08-31-03", "2023-07-01-01", "2023-07-01-23"]
# Aversion project datasets
datasets_prj_aversion = ["2023-03-30-01", "2023-06-29-01", "2023-06-29-13", "2023-07-14-08", "2023-07-14-14", "2023-07-27-01", "2023-08-08-07", "2023-08-14-01", "2023-08-16-01", "2023-08-21-01", "2023-09-07-01", "2023-09-14-01", "2023-08-15-01", "2023-10-05-01", "2023-06-23-08", "2023-12-11-01", "2023-06-21-01"]
# 5-HT project datasets
datasets_prj_5ht = ["2022-07-26-31", "2022-07-26-38", "2022-07-27-31", "2022-07-27-38", "2022-07-27-45", "2022-08-02-31", "2022-08-02-38", "2022-08-03-31"]
# Starvation project datasets
datasets_prj_starvation = ["2023-05-25-08", "2023-05-26-08", "2023-06-05-10", "2023-06-05-17", "2023-07-24-27", "2023-09-27-14", "2023-05-25-01", "2023-05-26-01", "2023-07-24-12", "2023-07-24-20", "2023-09-12-01", "2023-09-19-01", "2023-09-29-19", "2023-10-09-01", "2023-09-13-02"]

# All datasets across every project, concatenated in a fixed project order.
datasets = [
    datasets_prj_neuropal
    datasets_prj_rim
    datasets_prj_aversion
    datasets_prj_5ht
    datasets_prj_starvation
]

# Validation split.
datasets_val = [
    # RIM datasets
    "2023-06-24-02", "2023-08-07-01", "2023-08-19-01",
    # NeuroPAL datasets
    "2022-07-26-01", "2023-01-23-21", "2023-01-23-01",
    # Aversion datasets
    "2023-07-14-08",
    # 5-HT datasets
    "2022-08-02-31",
    # Starvation datasets
    "2023-07-24-27", "2023-07-24-20",
]

# Test split.
datasets_test = [
    # RIM datasets
    "2023-08-22-01", "2023-07-07-18", "2023-07-01-23",
    # NeuroPAL datasets
    "2023-01-06-01", "2023-01-10-07", "2023-01-17-07",
    # Aversion datasets
    "2023-08-21-01", "2023-06-23-08",
    # 5-HT datasets
    "2022-07-27-38",
    # Starvation datasets
    "2023-10-09-01", "2023-09-13-02",
]

# Training split: everything that is in neither the validation nor the test split.
datasets_train = setdiff(datasets, datasets_val, datasets_test);

In [10]:
# Processed-data root directory for each project, keyed by project name.
# A dataset's ANTSUN output is looked up as `<root>/<dataset>_output`.
input_paths = Dict{String,String}(
    "prj_rim"        => "/store1/prj_rim/data_processed",
    "prj_neuropal"   => "/store1/prj_neuropal/data_processed",
    "prj_starvation" => "/data1/prj_starvation/data_processed",
    "prj_5ht"        => "/data3/prj_5ht/published_data/data_processed_neuropal",
    "prj_aversion"   => "/data1/prj_aversion/data_processed",
);

## Load ANTSUN output data

In [11]:
# Per-dataset ANTSUN outputs, each keyed by dataset name:
#   data_dicts  -- the `data_dict` object stored in the dataset's data-dict file
#                  (an empty Dict when that file does not exist)
#   params_dict -- the `param` object stored in the dataset's param file
#   param_paths -- the `param_path` object stored in `param_path.jld2`
data_dicts = Dict{String,Any}()
params_dict = Dict{String,Any}()
param_paths = Dict{String,Any}()

for dataset in datasets_test
    # Resolve the project root. Membership is tested in a fixed order and the
    # first project that lists the dataset wins.
    local prj_dir = if dataset in datasets_prj_rim
        input_paths["prj_rim"]
    elseif dataset in datasets_prj_neuropal
        input_paths["prj_neuropal"]
    elseif dataset in datasets_prj_starvation
        input_paths["prj_starvation"]
    elseif dataset in datasets_prj_5ht
        input_paths["prj_5ht"]
    elseif dataset in datasets_prj_aversion
        input_paths["prj_aversion"]
    else
        nothing
    end
    if prj_dir === nothing
        @warn("Dataset $dataset not found in any project")
        continue
    end

    local path_root_process = joinpath(prj_dir, "$(dataset)_output")
    local path_param_path = joinpath(path_root_process, "param_path.jld2")

    # Nothing else can be loaded without param_path, so skip the dataset after
    # warning (the same policy as the "not found in any project" case) instead
    # of failing with a KeyError on the lookup that used to follow the warning.
    if !isfile(path_param_path)
        @warn("No param_path.jld2 file found for dataset: $dataset")
        continue
    end

    # do-block form closes the file even if reading the key throws.
    local param_path = JLD2.jldopen(path_param_path) do f
        f["param_path"]
    end
    param_paths[dataset] = param_path

    # NOTE(review): presumably rewrites the stored paths so they are rooted at
    # `path_root_process` -- confirm against FlavellBase.
    change_rootpath!(param_path, path_root_process)

    if isfile(param_path["path_param"])
        params_dict[dataset] = JLD2.jldopen(param_path["path_param"]) do f
            f["param"]
        end
    else
        @warn("No param file found for dataset: $dataset")
    end

    # NOTE(review): helper defined outside this notebook; it is called here
    # before `path_data_dict` is read, as in the original cell.
    add_get_basename!(param_path)

    data_dicts[dataset] = if isfile(param_path["path_data_dict"])
        JLD2.jldopen(param_path["path_data_dict"]) do f
            f["data_dict"]
        end
    else
        Dict()
    end
end

[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ JLD2 ~/.julia/packages/JLD2/twZ5D/src/data/reconstructing_datatypes.jl:621[39m
[33m[1m└ [22m[39m[90m@ 

In [12]:
# `data_dict` object stored in each dataset's `all_red_registered_data_dict.jld2`,
# keyed by dataset name. A dataset whose file is missing gets an empty Dict.
all_red_reg_data_dicts = Dict{String,Any}()

for dataset in datasets_test
    # A dataset without a loaded param_path has no known output directory.
    if !haskey(param_paths, dataset)
        @warn("No param_path loaded for dataset: $dataset")
        continue
    end

    local path_all_red = joinpath(
        param_paths[dataset]["path_root_process"],
        "all_red_registered_data_dict.jld2",
    )

    all_red_reg_data_dicts[dataset] = if isfile(path_all_red)
        # do-block form closes the file even if reading the key throws.
        JLD2.jldopen(path_all_red) do f
            f["data_dict"]
        end
    else
        Dict()
    end
end

## Define ROI matching functions

In [13]:
"""
    map_t_roi(dataset, t, roi)

Resolve ROI `roi` at time point `t` of `dataset` through the globally loaded
`data_dicts` and `all_red_reg_data_dicts`, in three steps:

1. Look up `data_dicts[dataset]["new_label_map"][t][roi]`.
2. Find the 1-based position of that value in `data_dicts[dataset]["valid_rois"]`.
3. Find the 1-based position of the step-2 result in
   `all_red_reg_data_dicts[dataset]["roi_matches"]`.

Return the position found in step 3, or `-1` when any step finds nothing.
(Presumably the result is the immobilized ROI matched to the freely-moving ROI;
verify against the ANTSUN data-dict layout.)
"""
function map_t_roi(dataset, t, roi)
    # Fetch both lookup tables first so missing data still raises a KeyError.
    valid_rois = data_dicts[dataset]["valid_rois"]
    roi_matches = all_red_reg_data_dicts[dataset]["roi_matches"]

    # NaN never compares equal to anything, so a missing (t, roi) entry cannot
    # match any element of `valid_rois`.
    neuron_orig = get(get(data_dicts[dataset]["new_label_map"], t, Dict()), roi, NaN)

    fm_neuron = -1
    for (i, n) in enumerate(valid_rois)
        if n == neuron_orig
            fm_neuron = i
            break
        end
    end

    # Do not search for the -1 "not found" sentinel itself: it would otherwise
    # match any -1 entry in `roi_matches` and report a spurious match.
    fm_neuron == -1 && return -1

    for (i, n) in enumerate(roi_matches)
        if n == fm_neuron
            return i
        end
    end
    return -1
end

"""
    extract_dataset_info(directory::String)

Scan `directory` for CSV files named `<DATASET>_<T>.csv`, where `<T>` is a run of
digits, and return a `Dict{String, Vector{Int}}` mapping each `<DATASET>` to the
`<T>` values found for it, in the order the files are listed. CSV files whose
names do not follow that form are ignored.
"""
function extract_dataset_info(directory::String)
    # Captures 'DATASET' (lazy, so T is the final digit run) and 'T'.
    pattern = r"^(.*?)_(\d+)\.csv$"

    dataset_dict = Dict{String, Vector{Int}}()

    for filepath in glob("*.csv", directory)
        # The locals must not be called `match` or `basename`: assigning to
        # those names anywhere in the function makes them local variables, so
        # the call `match(...)` would raise UndefVarError.
        m = match(pattern, basename(filepath))
        m === nothing && continue

        dataset_name = String(m.captures[1])
        t_value = parse(Int, m.captures[2])

        # Create the dataset's vector on first sight, then append.
        push!(get!(() -> Int[], dataset_dict, dataset_name), t_value)
    end

    return dataset_dict
end

"""
    save_dict_to_h5(filename::String, data)

Open `filename` as an HDF5 file in `"w"` mode and write every matrix in `data` to
it as a dataset named by its key.

`data` may be any dictionary with string keys and `Matrix` values. This includes
both `Dict{String,Matrix}` and concretely typed dictionaries such as
`Dict{String,Matrix{Float64}}`.
"""
function save_dict_to_h5(filename::String, data::AbstractDict{<:AbstractString,<:Matrix})
    h5open(filename, "w") do file
        for (key, value) in data
            # Write each matrix as a dataset named by the key
            write(file, key, value)
        end
    end
end

save_dict_to_h5 (generic function with 1 method)

## Generate ROI matches

In [14]:
# Largest ROI key found in any time point's label map across the test datasets.
# Starts from -1 and keeps a key only when it compares strictly greater, so the
# result stays -1 when there are no keys at all.
max_roi = foldl(
    (best, roi) -> roi > best ? roi : best,
    (
        roi
        for dataset in datasets_test
        for t in keys(data_dicts[dataset]["new_label_map"])
        for roi in keys(data_dicts[dataset]["new_label_map"][t])
    );
    init = -1,
)

In [15]:
# One matrix per dataset, indexed as [t, roi]. Entries start at zero and are
# overwritten with the result of `map_t_roi` for every (t, roi) pair present in
# the dataset's label map.
roi_dict = Dict{String, Matrix}()

@showprogress for dataset in datasets_test
    # Register the matrix first, then fill it through a loop-local alias.
    local roi_matrix = (roi_dict[dataset] = zeros(1600, max_roi))
    local label_map = data_dicts[dataset]["new_label_map"]

    for t in keys(label_map)
        # Time points beyond the matrix's row count are dropped.
        t > size(roi_matrix, 1) && continue

        for roi in keys(label_map[t])
            roi_matrix[t, roi] = map_t_roi(dataset, t, roi)
        end
    end
end

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:01:43[39m


## Save ROI matches

Replace the path below with the location where you'd like to save the matches file. You will load this file in the `AutoCellLabeler_freely_moving` notebook.

In [18]:
# Write every dataset's matrix in `roi_dict` to one HDF5 file, one entry per dataset name.
save_dict_to_h5("/store1/adam/test/roi_match.h5", roi_dict)