### Demonstration of GLM with MNIST

Superseded by MLJ version. Solved problems:
* Factorization of categorical outcome
* Flexibility for tuning/optimization

#### Get the data

In [None]:
# libraries
using Flux             # the julia ml library
using Images           # image processing and machine vision for julia
using MLJ              # make_blobs, rmse, ConfusionMatrix
using MLDataUtils      # label, nlabel, labelfreq
using MLDatasets       # mnist

using GLM              # lm, glm, predict

using LinearAlgebra    # pinv pseudo-inverse matrix
using Metrics          # r2-score
using Random
using StatsBase        # standardize (normalization)
using Distributions

using Plots; gr()
using StatsPlots
using Printf

using CSV
using DataFrames
using CategoricalArrays   # advised by GLM docs


In [None]:
# Load MNIST through MLDatasets. The dataset's own training split is used for
# model building; its test split is kept aside under the "validation" names.
trainX_original, trainY_original = MNIST.traindata()
validationX_original, validationY_original = MNIST.testdata();

# Preview: render the first five training digits as images, then show their
# labels as a row vector (the value displayed by the cell).
display(map(i -> MNIST.convert2image(MNIST.traintensor(i)), 1:5))
trainY_original[1:5]'

In [None]:
# Build the three data sets.
# The RNG is seeded first so that the shuffled split below is reproducible.
Random.seed!(1)

# Stratified split of the original training data into a train part and a
# test part (p = 0.7 is the share requested for the first part).
(trainX, trainY), (testX, testY) =
    stratifiedobs((trainX_original, trainY_original), p = 0.7)

# The validation set is a copy of the original MNIST test split, so later
# reassignments cannot touch the originals.
validationX = copy(validationX_original)
validationY = copy(validationY_original)

# Cell output: sizes of the three feature arrays.
map(size, (trainX, testX, validationX))

In [None]:
# functions for feature extraction
"""
    meanIntensity(img)

Return the arithmetic mean of all elements of `img`, after converting every
element to `Float64`.
"""
function meanIntensity(img)
    pixels = Float64.(img)
    return mean(pixels)
end

"""
    hSymmetry(img)

Symmetry score of `img` with respect to a flip along its first dimension.

The image is converted to `Float64`, mirrored with `reverse(...; dims=1)`, and
the mean absolute element-wise difference between the image and its mirror is
negated. A perfectly symmetric image therefore scores `0`; the score becomes
more negative as the image gets less symmetric.
"""
function hSymmetry(img)
    pixels = Float64.(img)
    mirrored = reverse(pixels; dims=1)
    asymmetry = mean(abs.(pixels .- mirrored))
    return -asymmetry
end

In [None]:
# Replace every image of the training tensor by two scalar features.
# trainX is still 3-dimensional here: the first two dimensions index the
# pixels of one image, the third indexes the N observations.
h, v, N = size(trainX)

# Feature 1: mean intensity of each image; feature 2: its symmetry score.
a = map(i -> meanIntensity(trainX[:, :, i]), 1:N)
b = map(i -> hSymmetry(trainX[:, :, i]), 1:N)

# From here on trainX is an N×2 matrix: one row per image, one column per feature.
trainX = hcat(a, b)
display(size(trainX))


In [None]:
# rescale predictors
"""
    rescaleByColumns(X)

Return a `Float64` copy of `X` standardised with a StatsBase `ZScoreTransform`
fitted on `X` itself (`dims=1`, centring and scaling both enabled).
The argument is not modified.
"""
function rescaleByColumns(X)
    # requires StatsBase (fit, ZScoreTransform, transform)
    features = Float64.(X)
    zscore = fit(ZScoreTransform, features; dims=1, center=true, scale=true)
    return StatsBase.transform(zscore, features)
end

# Standardise the extracted features; trainX is rebound to the rescaled matrix.
trainX = rescaleByColumns(trainX)
# Cell output: per-column means of the rescaled matrix, a quick check that
# the centring took effect.
mean(trainX, dims=1)

In [None]:
# Restrict the problem to two digit classes.
# NOTE(review): `N` was the number of observations in an earlier cell and is
# rebound here to the negative class label.
P = 5   # positive class
N = 1   # negative class

# Row indices of the kept observations: all of class P first, then all of class N.
selectedRows = vcat(findall(==(P), trainY), findall(==(N), trainY))

# Apply the same selection, in the same order, to predictors and labels.
trainX = trainX[selectedRows, :]
trainY = trainY[selectedRows]
levels(trainY)

In [None]:
# Recode the labels as a binary outcome: the negative class N becomes 0,
# every other label (only the positive class is left at this point) becomes 1.
trainY = ifelse.(trainY .== N, 0, 1)
levels(trainY)

### Fit the model

In [None]:
# Fit a binomial GLM of the 0/1 outcome on the two standardised features.
# NOTE(review): the matrix form glm(X, y, ...) uses X exactly as given and does
# not add an intercept column. If an intercept is wanted, prepend a column of
# ones to trainX (and to any matrix later passed to predict).
glmFit = glm(trainX, trainY, Binomial())

# Fitted values on the training data itself (predict without new data).
p = GLM.predict(glmFit)

# Hard class labels: 1 when the fitted value exceeds 0.5, otherwise 0.
ŷ = Int.(p .> 0.5)

# Metrics on the training set: RMSE between predicted and true 0/1 labels,
# then the confusion matrix as the cell output.
# NOTE(review): ŷ and trainY are plain integer vectors here; confirm that the
# installed MLJ version accepts them in confmat without categorical conversion.
display(MLJ.rmse(ŷ, trainY))
MLJ.confmat(ŷ, trainY)