In [None]:
# Activate the project environment before any package is loaded, so that the
# `using` statements that follow resolve against it.
using Pkg
envpath = expanduser("~/envs/dev/")   # expand the leading "~" to the home directory
Pkg.activate(envpath)

# libs
using Images
using DataFrames
using CSV
using Random
using MLUtils: splitobs, randobs

In [None]:
# input folders
# (all paths keep a leading "~"; they go through `expanduser` at the point of use)
datasetPath  = "~/datasets/pascalvoc/VOCdevkit/VOC2012/"
imagesPath   = string(datasetPath, "JPEGImages/")
masksPath    = string(datasetPath, "SegmentationClass/")
cowslist     = string(datasetPath, "ImageSets/Main/cow_trainval.txt")

# output folder: the current working directory with a trailing "/", where every
# occurrence of the home directory is replaced by "~"
workpath = replace(string(pwd(), "/"), homedir() => "~")

In [None]:
# dfcow
# Read the image-set list line by line and split every line on whitespace.
# The first token is the file name (without extension) and the last token is
# the label. Splitting here, instead of relying on a CSV reader, keeps the
# result independent of delimiter auto-detection and of how many spaces
# separate the two fields.
entries = [split(line) for line in eachline(expanduser(cowslist)) if !isempty(strip(line))]

# split columns
# NOTE(review): every label other than "-1" (including "0", if the list contains
# it) is mapped to 1 — confirm this is the intended encoding.
dfcow = DataFrame(
    filename = String.(first.(entries)),
    label = [last(entry) == "-1" ? 0 : 1 for entry in entries],
)

# add new column "segmented" to data frame (filled in by a later step)
dfcow.segmented = zeros(Int, size(dfcow, 1))
first(dfcow, 5)

In [None]:
# populate segmented column
# File names present in the segmentation-mask folder.
segfiles = readdir(expanduser(masksPath))

# Membership is tested once per positive row; a Set makes each test O(1)
# instead of a linear scan over the whole directory listing.
segset = Set(segfiles)

# A row is "segmented" (1) when it is labelled 1 AND a mask "<filename>.png"
# exists in the mask folder; otherwise 0.
dfcow.segmented = [
    Int(row.label == 1 && (row.filename * ".png") in segset) for row in eachrow(dfcow)
]

# number of segmented rows
sum(dfcow.segmented)

In [None]:
# split segmented dataset in train, valid, test
# keep only the rows flagged as segmented
df = filter(:segmented => ==(1), dfcow)
N = nrow(df)

Random.seed!(1234)   # MUST NOT CHANGE
# shuffled index split: 70% / 20% / remainder
indtrain, indval, indtest = splitobs(1:N; at=(0.70, 0.20), shuffle=true)
dftrain_seg, dfvalid_seg, dftest_seg = df[indtrain, :], df[indval, :], df[indtest, :]

# (rows, columns) of each split
size.((dftrain_seg, dfvalid_seg, dftest_seg))

In [None]:
# make non segmented dataset with same size as segmented
# then, split in train, valid, test
df = dfcow[dfcow.segmented .== 0, :]
Random.seed!(1234)   # MUST NOT CHANGE

# Draw the rows WITHOUT replacement (seeded random permutation, first N entries).
# Sampling with replacement can pick the same image several times, and the
# duplicates can then land in different splits, leaking training images into
# the validation/test sets.
nsample = min(N, nrow(df))
nsample < N &&
    @warn "only $(nrow(df)) non-segmented rows available; requested $N, using $nsample"
sampled_df = df[randperm(nrow(df))[1:nsample], :]

# shuffled index split: 70% / 20% / remainder
indtrain, indval, indtest = splitobs(1:nsample, at = (0.70, 0.20), shuffle = true)
dftrain_noseg = sampled_df[indtrain, :]
dfvalid_noseg = sampled_df[indval, :]
dftest_noseg  = sampled_df[indtest, :]

size(dftrain_noseg), size(dfvalid_noseg), size(dftest_noseg)

In [None]:
# merge datasets
# stack the segmented rows on top of the non-segmented rows, split by split
dftrain = [dftrain_seg; dftrain_noseg]
dfvalid = [dfvalid_seg; dfvalid_noseg]
dftest  = [dftest_seg; dftest_noseg]

# (rows, columns) of each merged split
size.((dftrain, dfvalid, dftest))

In [None]:
# Build the address columns for every split, then drop "filename".
# The data frames are modified in place.
for part in (dftrain, dfvalid, dftest)
    # make X addresses: path of the JPEG image
    part.X = [imagesPath * name * ".jpg" for name in part.filename]

    # make y addresses: path of the PNG mask for segmented rows, "" otherwise
    part.y = [
        row.segmented == 1 ? masksPath * row.filename * ".png" : "" for
        row in eachrow(part)
    ]

    # remove "filename" column and fix the column order
    select!(part, [:X, :y, :label, :segmented])
end

In [None]:
# preview the first 3 rows of the training set
first(dftrain, 3)

In [None]:
# preview the last 3 rows of the training set
last(dftrain, 3)

In [None]:
# save datasets
# resolve the output folder once; the file names are appended directly to it
outdir = expanduser(workpath)
CSV.write(string(outdir, "dftrain-coi.csv"), dftrain)
CSV.write(string(outdir, "dfvalid-coi.csv"), dfvalid)
CSV.write(string(outdir, "dftest-coi.csv"), dftest)

In [None]:
# checkpoint
# reload the saved training set and keep only the rows that have a mask
trainfile = string(expanduser(workpath), "dftrain-coi.csv")
dftrain = CSV.read(trainfile, DataFrame)
df = filter(:segmented => ==(1), dftrain)

# pick one segmented row at random and show its image next to its mask
k = rand(1:nrow(df))
imgfile = expanduser(df.X[k])
mskfile = expanduser(df.y[k])
img = Images.load(imgfile)
msk = Images.load(mskfile)
mosaicview([img, msk]; nrow=1)