In [None]:
# activate the package environment located at ~/envs/dev/
using Pkg
envpath = expanduser("~/envs/dev/")   # "~" must be expanded before Pkg sees the path
Pkg.activate(envpath)

# libs
using Images
using DataFrames
using CSV
using Random
using MLUtils: splitobs

In [None]:
# input folders
# All paths are kept "~"-relative; pass them through expanduser() before
# touching the filesystem.
datasetPath = "~/datasets/pascalvoc/VOCdevkit/VOC2012/"         # path to downloaded/extracted dataset
imagesPath  = datasetPath * "JPEGImages/"
masksPath   = datasetPath * "SegmentationClass/"
classlist   = datasetPath * "ImageSets/Main/cow_trainval.txt"   # class of interest

# output folders
# The current directory (with a trailing "/") is stored "~"-relative too.
# Only a *leading* home directory followed by a separator is abbreviated:
# a plain replace() would also rewrite later occurrences of the home path
# and paths that merely share its prefix (e.g. /home/u vs /home/user2),
# which would make expanduser(workpath) point somewhere else.
workpath = pwd() * "/"
homeprefix = homedir() * "/"
if startswith(workpath, homeprefix)
    workpath = "~/" * workpath[(ncodeunits(homeprefix) + 1):end]
end

In [None]:
# read the class list without a header row; only its first column is used
raw = CSV.read(expanduser(classlist), DataFrame; header=false)

# each entry is split on single spaces: the first token is the file name,
# the last token is the label ("-1" becomes 0, anything else becomes 1)
parts = split.(raw[:, 1], ' ')
df = DataFrame(
    filename = String.(first.(parts)),
    label = [token == "-1" ? 0 : 1 for token in last.(parts)],
)

# placeholder "segmented" column, all zeros for now
df.segmented = fill(0, nrow(df))
first(df, 3)

In [None]:
# populate segmented column:
# 1 when the row has label 1 AND a "<filename>.png" mask exists in masksPath,
# 0 otherwise
segfiles = readdir(expanduser(masksPath))
masknames = Set(segfiles)   # hashed lookup instead of scanning the vector per row

df.segmented = [
    Int(row.label == 1 && (row.filename * ".png") in masknames)
    for row in eachrow(df)
]

# report the count explicitly: a bare `sum(...)` in the middle of a cell is
# evaluated but never displayed
@info "rows with label 1 and an existing mask" count = sum(df.segmented)
first(df, 3)

In [None]:
# keep only the segmented rows, then split them 70% / 20% / remainder
# into train, valid and test
df = filter(:segmented => ==(1), df)
N = nrow(df)

# fixed seed so the shuffled split is reproducible
Random.seed!(1234)
indtrain, indval, indtest = splitobs(1:N; at=(0.70, 0.20), shuffle=true)

dftrain, dfvalid, dftest = map(rows -> df[rows, :], (indtrain, indval, indtest))

map(size, (dftrain, dfvalid, dftest))

In [None]:
# for every split: build the image (X) and mask (y) addresses from the file
# name, then keep only those two columns
for part in (dftrain, dfvalid, dftest)
    part.X = imagesPath .* part.filename .* ".jpg"
    part.y = masksPath .* part.filename .* ".png"
    select!(part, [:X, :y])   # in place, so the global bindings see the change
end

In [None]:
# save datasets as CSV files in the working directory
outdir = expanduser(workpath)
dftrain |> CSV.write(outdir * "dftrain.csv")
dfvalid |> CSV.write(outdir * "dfvalid.csv")
dftest |> CSV.write(outdir * "dftest.csv")

In [None]:
# checkpoint: reload the saved training table and show one randomly chosen
# image next to its mask in a single-row mosaic
df = CSV.read(expanduser(workpath) * "dftrain.csv", DataFrame)

k = rand(1:nrow(df))
img, msk = map(Images.load ∘ expanduser, (df.X[k], df.y[k]))
mosaicview([img, msk]; nrow=1)