In [None]:
using Pkg
# Activate the project environment located under the user's home directory
# ("~" is expanded first) before any of the packages below are loaded.
envpath = expanduser("~/envs/d8reg/")
Pkg.activate(envpath)

# libs
using Images
using DataFrames
using CSV
using Random
using MLUtils: splitobs, randobs

In [None]:
# input folders (paths are relative to the current working directory)
datasetPath  = "../pascalvoc/VOCdevkit/VOC2012/"
imagesPath   = datasetPath * "JPEGImages/"
masksPath    = datasetPath * "SegmentationClass/"
cowslist     = datasetPath * "ImageSets/Main/cow_trainval.txt"   # choice of the class of interest

# output folders
# The working directory (with a trailing "/") is stored with the home directory
# abbreviated as "~"; every consumer calls `expanduser(workpath)` to get the real path.
workpath = pwd() * "/"
# Abbreviate only a genuine leading home prefix. `expanduser` expands nothing but a
# leading "~", so replacing the home path anywhere else in the string (or replacing
# every "/" when the home directory is "/") would produce a path that can no longer
# be expanded back to the working directory.
workpath = startswith(workpath, homedir() * "/") ?
    replace(workpath, homedir() => "~"; count = 1) :
    workpath

In [None]:
# Load the class list as a header-less table; only its first column is used below.
df = CSV.read(expanduser(cowslist), DataFrame; header = false)

# Each entry is split on spaces: the first token becomes the file name (as a String)
# and the last token becomes the label, where anything other than "-1" is `true`.
df = DataFrame(
    filename = [String(first(split(entry, ' '))) for entry in df[:, 1]],
    label    = [last(split(entry, ' ')) != "-1" for entry in df[:, 1]],
)

# Placeholder "segmented" column, all `false` until the mask folder is inspected.
df.segmented = fill(false, size(df, 1))
first(df, 5)

In [None]:
# populate segmented column
# List the mask folder once. `segfiles` stays a plain listing; the Set built from it
# makes each membership test below a hash lookup instead of a scan of the listing.
segfiles = readdir(expanduser(masksPath))
maskset  = Set(segfiles)

# A row is segmented only when its label is true AND "<filename>.png" is present
# among the mask files.
df.segmented =
    [row.label && (row.filename * ".png") in maskset for row in eachrow(df)]

# number of segmented rows
sum(df.segmented)

In [None]:
# split segmented dataset in train, valid, test
# Keep only the rows flagged as segmented; N is their count.
dfaux = df[df.segmented, :]
N = size(dfaux, 1)
# Fixed seed so the shuffled split below is reproducible. Presumably `splitobs`
# draws from the global RNG seeded here (TODO confirm), so adding or reordering
# RNG-consuming calls after this line would change the split.
Random.seed!(1234)   # MUST NOT CHANGE
# at = (0.70, 0.20): first 70% -> train, next 20% -> validation, remainder -> test,
# taken over the shuffled row indices 1:N.
indtrain, indval, indtest = splitobs(1:N, at = (0.70, 0.20), shuffle = true)
dftrain_seg = dfaux[indtrain, :]
dfvalid_seg = dfaux[indval, :]
dftest_seg  = dfaux[indtest, :]

# sanity check: (rows, columns) of each split
size(dftrain_seg), size(dfvalid_seg), size(dftest_seg)

In [None]:
# make non segmented dataset with same size as segmented
# then, split in train, valid, test
# `dfaux` is reused here: it now holds every row NOT flagged as segmented
# (regardless of its label). N still holds the value assigned before this point.
dfaux = df[df.segmented .== 0, :]
# Re-seed so the sampling and the shuffled split below are reproducible; the order of
# the two RNG-consuming calls (randobs, then splitobs) must be kept as is.
Random.seed!(1234)   # MUST NOT CHANGE
# Draw N rows from the non-segmented pool.
# NOTE(review): `randobs(data, n)` appears to sample WITH replacement — confirm against
# the MLUtils docs. If so, the same image can be drawn more than once and may end up in
# more than one of train/valid/test. Sampling without replacement would fix that but
# would also change the frozen split, so it is only flagged here.
sampled_df = randobs(dfaux, N)
# Same 70/20/remainder proportions as the segmented split; overwrites the index
# variables of that split.
indtrain, indval, indtest = splitobs(1:N, at = (0.70, 0.20), shuffle = true)
dftrain_noseg = sampled_df[indtrain, :]
dfvalid_noseg = sampled_df[indval, :]
dftest_noseg  = sampled_df[indtest, :]

# sanity check: (rows, columns) of each split
size(dftrain_noseg), size(dfvalid_noseg), size(dftest_noseg)

In [None]:
# Stack the segmented and non-segmented parts of each split, segmented rows first.
dftrain = [dftrain_seg; dftrain_noseg]
dfvalid = [dfvalid_seg; dfvalid_noseg]
dftest  = [dftest_seg; dftest_noseg]

# (rows, columns) of each merged split
map(size, (dftrain, dfvalid, dftest))

In [None]:
# Add the image path (X) and mask path (y) columns to every split, then keep only
# the columns X, y, label and segmented (this drops "filename").
for part in (dftrain, dfvalid, dftest)
    # X: "<imagesPath><filename>.jpg" for every row
    part.X = [imagesPath * name * ".jpg" for name in part.filename]

    # y: "<masksPath><filename>.png" for segmented rows, empty string otherwise
    part.y = [
        hasmask ? masksPath * name * ".png" : ""
        for (name, hasmask) in zip(part.filename, part.segmented)
    ]

    # in-place column selection, so the existing variables see the change
    select!(part, [:X, :y, :label, :segmented])
end

In [None]:
# preview: first three rows of the training table
first(dftrain, 3)

In [None]:
# preview: last three rows of the training table
last(dftrain, 3)

In [None]:
# Write each split to its own CSV file inside the working directory
# (workpath already ends with "/", so plain concatenation yields the file path).
dftrain |> CSV.write(string(expanduser(workpath), "dftrain.csv"))
dfvalid |> CSV.write(string(expanduser(workpath), "dfvalid.csv"))
dftest  |> CSV.write(string(expanduser(workpath), "dftest.csv"))

In [None]:
# checkpoint: reload the saved training table and display one random
# image/mask pair side by side
dftrain = CSV.read(string(expanduser(workpath), "dftrain.csv"), DataFrame)
df = filter(:segmented => identity, dftrain)   # only rows flagged as segmented

k = rand(1:nrow(df))                           # random row index
img = Images.load(expanduser(df.X[k]))
msk = Images.load(expanduser(df.y[k]))
mosaicview([img, msk]; nrow=1)