### Linear regression with MNIST


#### Libraries

In [None]:
using MLDatasets           # mnist
using Images
using PreprocessingImages; pim = PreprocessingImages
using PreprocessingArrays; pa  = PreprocessingArrays

using MLJ                  # make_blobs, rmse, confmat, categorical
using MLDataUtils          # label, nlabel, labelfreq
using GLM                  # (lm works as regression; GLM not OK for categorical outcomes)
using Metrics              # r2-score
using Random
using Plots; gr()
using StatsPlots
using DataFrames

#### Functions

In [None]:
# plotting function
"""
    categoryStratificationChart(trainY, testY)

Plot a grouped bar chart comparing how often each class label occurs in
`trainY` and in `testY`, and return the plot.

Per-class counts come from `labelfreq`. The two count tables are combined with
an outer join, so a class that occurs in only one of the two sets still gets a
bar group (its count in the other set is drawn as 0). The x ticks are taken
from the joined table, so tick labels always line up with the bars.
"""
function categoryStratificationChart(trainY, testY)
    # per-class counts of each set, as two-column tables keyed by :class
    # (keys and values of the same unmodified Dict iterate in the same order)
    trainFreq = labelfreq(trainY)
    trainCounts = DataFrame(class = collect(keys(trainFreq)),
                            trainset = collect(values(trainFreq)))

    testFreq = labelfreq(testY)
    testCounts = DataFrame(class = collect(keys(testFreq)),
                           testset = collect(values(testFreq)))

    # combine on the class label; sort explicitly rather than relying on join row order
    df = outerjoin(trainCounts, testCounts, on = :class)
    sort!(df, :class)

    # a class absent from one set has no count there: plot it as 0
    classes = identity.(df.class)   # narrow the eltype in case the join widened it
    counts = hcat(coalesce.(df.trainset, 0), coalesce.(df.testset, 0))

    p1 = groupedbar(counts,
        bar_position = :dodge,
        size = (500, 300),
        xtick = (1:length(classes), classes),   # one tick per bar group, labelled with its class
        legend = :outerright,
        label = ["trainset" "testset"])

    # address p1 explicitly instead of the implicit "current plot"
    title!(p1, "Dataset stratification", xlabel="categories", ylabel="count")
    return p1
end


In [None]:
# functions for feature extraction
"""
    meanIntensity(img)

Return the mean of all elements of `img` after converting them to `Float32`.
"""
function meanIntensity(img)
    pixels = map(Float32, img)
    return mean(pixels)
end

"""
    hSymmetry(img)

Return the negated mean absolute difference between `img` (converted to
`Float32`) and a copy of it reversed along its first dimension.

The result is 0 when `img` equals its reversed copy and becomes more negative
the more the two differ.

NOTE(review): for a 1-D input, reversing along dimension 1 reverses the whole
element order, which is not a mirror flip of the 2-D image the vector may have
been flattened from — confirm which input shape callers are meant to pass.
"""
function hSymmetry(img)
    pixels = map(Float32, img)
    flipped = reverse(pixels; dims=1)
    asymmetry = mean(abs.(pixels .- flipped))
    return -asymmetry
end


#### MNIST

In [None]:
# Load MNIST: the :train split becomes the working dataset, the :test split is kept as validation data.
(datasetX, datasetY)       = MNIST(:train)[:]
(validationX, validationY) = MNIST(:test)[:]

datasetX |> size |> display

# Take the first five images and swap their first two axes before viewing them.
img  = datasetX[:, :, 1:5]
img2 = permutedims(img, (2, 1, 3))

# Show the five labels as a row, then the five images side by side in a single row.
datasetY[1:5]' |> display
mosaicview(map(Gray, img2); nrow=1)

In [None]:
# Split the dataset into a trainset (70%) and a testset with a stratified split.
# The global RNG is seeded first, presumably so the split is reproducible.
Random.seed!(1)
(trainX, trainY), (testX, testY) = stratifiedobs((datasetX, datasetY); p=0.7)
map(size, (trainX, testX, validationX))

In [None]:
# Chart the per-class counts of trainY and testY side by side to check the stratification of the split.
categoryStratificationChart(trainY, testY)

#### Linear regression (two classes - two predictors)

In [None]:
# Inspect the container type of the training images before they are converted to vectors.
typeof(trainX)

In [None]:
# Convert the images to vectors and the labels to Float32 (needed for lm).
# Both variables are overwritten in place, so re-running this cell converts the already converted data again.
trainX = pim.batchImage2Vector(trainX)
trainY = Float32.(trainY)
map(typeof, (trainX, trainY))

In [None]:
# The two classes kept for the prediction task.
c = (1, 5)

# Keep only the observations of the chosen classes: all of c[1] first, then all of c[2].
# trainX must be filtered before trainY is overwritten, because the masks are built from trainY.
trainX = mapreduce(k -> trainX[trainY .== k], vcat, c)
trainY = mapreduce(k -> trainY[trainY .== k], vcat, c)
trainY |> levels |> display
map(size, (trainX, trainY))

In [None]:
# Build the predictors: one row per observation, columns = (mean intensity, symmetry score).
N = size(trainX, 1)

a = map(i -> meanIntensity(trainX[i]), 1:N)
b = map(i -> hSymmetry(trainX[i]), 1:N)
trainXLinear = pa.rescaleByColumns([a b])
trainXLinear |> size |> display

# The outcome is an independent copy of the labels.
trainYLinear = copy(trainY);

In [None]:
# Fit a linear model of the labels on the two predictors.
# NOTE(review): trainXLinear holds only the two predictor columns; GLM's matrix method of `lm`
# appears to fit exactly the columns it is given, with no intercept added — confirm whether a
# column of ones is wanted here.
lmFit = lm(trainXLinear, trainYLinear)

# Predict on the trainset itself and round each prediction to the nearest integer label.
ŷ = round.(Int32, GLM.predict(lmFit, trainXLinear))
ŷ |> levels |> adjoint

In [None]:
# Root-mean-square error between the rounded predictions and the true labels.
MLJ.rmse(ŷ, trainYLinear) |> display


In [None]:
# Demonstration of prediction accuracy on 8 randomly drawn training observations
# (indices are drawn with replacement, so repeats are possible).
v = rand(1:N, 8)

# Rebuild 28x28 images from the selected vectors and swap the first two axes for viewing.
# (The bare `trainX[v]` expression that used to sit here was removed: mid-cell, its value
# was neither bound nor displayed.)
imgs = pim.batchVector2Image( trainX[v], 28, 28 )
img2 = permutedims(imgs, (2, 1, 3))

# Show the images in one row, then the true labels, then the predicted labels.
display( mosaicview( Gray.(img2) ; nrow=1) )
display(trainY[v]' .|> Int32)
display(ŷ[v]')