# Algorithme de région de confiance basique

In [None]:
using BenchmarkTools
using LinearAlgebra
using ForwardDiff

# Global flag read by `stopCG`: when true, a message is printed each time CG stops.
verbose = false

Nous allons regrouper les constantes dans la structure `BasicTrustRegion`.

In [None]:
"""
    BasicTrustRegion{T<:Real}

Constants of the basic trust-region algorithm.

- `η1`: a candidate is accepted when the ratio ρ is at least `η1`.
- `η2`: the radius may be enlarged when ρ is at least `η2`.
- `γ1`: factor applied to the radius when ρ is below `η1`.
- `γ2`: factor applied to the radius when ρ lies between `η1` and `η2`.
"""
struct BasicTrustRegion{T<:Real}
    η1::T
    η2::T
    γ1::T
    γ2::T
end

"""
    BTRDefaults()

Return the default trust-region constants: η1 = 0.01, η2 = 0.9, γ1 = γ2 = 0.5.
"""
BTRDefaults() = BasicTrustRegion(0.01, 0.9, 0.5, 0.5)

"""
    BTRState()

Mutable state of the trust-region iteration.

# Fields
- `iter`: iteration counter.
- `x`, `xcand`: current iterate and candidate iterate.
- `g`: gradient at the current iterate.
- `step`: last computed step.
- `Δ`: trust-region radius.
- `ρ`: ratio of the actual reduction to the reduction predicted by the model.
- `tol`: tolerance on the gradient norm (default `1e-6`).
- `trace`: history of the iterates, filled by `btr` only when `keepTrace` is true.
- `keepTrace`: whether `btr` records the iterates (default `false`).
"""
mutable struct BTRState
    iter::Int
    x::Vector
    xcand::Vector
    g::Vector
    step::Vector
    Δ::Float64
    ρ::Float64

    tol::Float64

    trace
    keepTrace::Bool

    function BTRState()
        # Initialise every field: a bare `new()` leaves the numeric fields with
        # arbitrary bits and the vector fields undefined, so inspecting a fresh
        # state would show garbage or throw `UndefRefError`.
        return new(0, Float64[], Float64[], Float64[], Float64[], 0.0, 0.0,
                   1e-6, nothing, false)
    end
end

In [None]:
"""
    acceptCandidate!(state, b)

Return `true` when the iteration is successful, i.e. when the ratio `state.ρ`
is at least `b.η1`, and `false` otherwise. The state is not modified here.
"""
function acceptCandidate!(state::BTRState, b::BasicTrustRegion)
    return state.ρ >= b.η1
end

In [None]:
"""
    updateRadius!(state, b)

Update the trust-region radius `state.Δ` from the ratio `state.ρ` and return
the new radius.

- ρ ≥ η2: the radius becomes `max(4‖step‖, Δ)`, capped at `1e20`.
- η1 ≤ ρ < η2: the radius is multiplied by `γ2`.
- otherwise: the radius is multiplied by `γ1`.
"""
function updateRadius!(state::BTRState, b::BasicTrustRegion)
    ratio = state.ρ
    if ratio >= b.η2
        enlarged = max(4 * norm(state.step), state.Δ)
        return (state.Δ = min(1e20, enlarged))
    end
    factor = ratio >= b.η1 ? b.γ2 : b.γ1
    return (state.Δ *= factor)
end

## Algorithme de région de confiance de base

In [None]:
"""
    btr(f, ∇f, ∇2f, Step, x0, state = BTRState(), ApproxH = false, verbose = false;
        nmax = 1000)

Basic trust-region algorithm. Return the final `state`.

# Arguments
- `f`: objective function, called as `f(x)`.
- `∇f`: gradient, called as `∇f(x)`.
- `∇2f`: when `ApproxH` is false, the Hessian, called as `∇2f(x)`; when
  `ApproxH` is true, a quasi-Newton update rule, called as `∇2f(H, y, s)`.
- `Step`: subproblem solver, called as `Step(g, H, Δ)`.
- `x0`: starting point.
- `state`: state object; its `tol` and `keepTrace` settings are read, the
  other fields are overwritten.
- `ApproxH`: use a quasi-Newton approximation started from the identity.
- `verbose`: accepted for compatibility; not used by this function.

# Keywords
- `nmax`: maximum number of iterations.

The iteration stops when the squared gradient norm is at most `state.tol^2`
or after `nmax` iterations.
"""
function btr(f::Function, ∇f::Function, ∇2f::Function, Step::Function,
             x0::Vector, state::BTRState = BTRState(), ApproxH::Bool = false,
             verbose::Bool = false; nmax::Int = 1000)

    b = BTRDefaults()
    state.iter = 0
    state.x = x0
    n = length(x0)

    tol2 = state.tol * state.tol

    fx = f(x0)
    state.g = ∇f(x0)
    state.Δ = 0.1 * norm(state.g)

    if ApproxH
        # A better initialization procedure should be used with quasi-Newton
        # approximations; we could rely on some preconditioner.
        H = Matrix{Float64}(I, n, n)
    else
        H = ∇2f(x0)
    end

    if state.keepTrace
        state.trace = x0'
    end

    # Model decrease m(s) - m(0) = gᵀs + ½ sᵀBs.
    model(s, grad, B) = dot(s, grad) + 0.5 * dot(s, B * s)

    while dot(state.g, state.g) > tol2 && state.iter < nmax
        # Compute the step by approximately minimizing the model
        state.step = Step(state.g, H, state.Δ)
        state.xcand = state.x + state.step

        # Ratio of the actual reduction over the predicted reduction
        fcand = f(state.xcand)
        state.ρ = (fcand - fx) / model(state.step, state.g, H)

        if ApproxH
            # The approximation is updated whether or not the step is accepted.
            gcand = ∇f(state.xcand)
            H = ∇2f(H, gcand - state.g, state.step)
        end

        if acceptCandidate!(state, b)
            state.x = copy(state.xcand)
            if ApproxH
                state.g = copy(gcand)
            else
                state.g = ∇f(state.x)
                H = ∇2f(state.x)
            end
            fx = fcand
        end

        if state.keepTrace
            state.trace = [state.trace; state.x']
        end

        updateRadius!(state, b)
        state.iter += 1
    end

    return state
end

In [None]:
"""
    CauchyStep(g, H, Δ)

Return the Cauchy step: the minimizer of the quadratic model
`gᵀs + ½ sᵀHs` along the steepest-descent direction `-g`, restricted to the
trust region of radius `Δ`.

# Arguments
- `g`: gradient vector.
- `H`: Hessian or approximation; any matrix type supporting `H * g`.
- `Δ`: trust-region radius.

When `g` is zero there is no descent direction and the null step is returned.
"""
function CauchyStep(g::AbstractVector, H::AbstractMatrix, Δ::Real)
    normg = norm(g)
    # Guard against 0/0: without it a zero gradient produces a NaN step.
    if iszero(normg)
        return zeros(typeof(normg), length(g))
    end

    q = dot(g, H * g)
    # Non-positive curvature along -g: go all the way to the boundary.
    τ = q <= 0 ? 1.0 : min((normg * normg * normg) / (q * Δ), 1.0)
    return -τ * g * Δ / normg
end

In [None]:
"""
    BFGSUpdate(B, y, s)

Apply the BFGS update to `B` in place and return `B`:
`B ← B - (Bs)(Bs)ᵀ / (sᵀBs) + y yᵀ / (sᵀy)`.

The update is skipped (and `B` returned unchanged) unless `sᵀy > 1e-12`.
"""
function BFGSUpdate(B::Matrix, y::Vector, s::Vector)
    curvature = dot(s, y)
    curvature > 1e-12 || return B

    Bs = B * s
    B .= B .- (Bs * Bs') ./ dot(s, Bs) .+ (y * y') ./ curvature
    return B
end

In [None]:
"""
    SR1Update(B, y, s)

Apply the symmetric rank-one update to `B` in place and return `B`:
`B ← B + z zᵀ / (zᵀs)` with `z = y - Bs`.

The update is skipped (and `B` returned unchanged) unless `|zᵀs| > 1e-8`.
"""
function SR1Update(B::Matrix, y::Vector, s::Vector)
    residual = y - B * s
    denom = dot(residual, s)
    if abs(denom) > 1e-8
        B .+= (residual * residual') ./ denom
    end
    return B
end

## Gradient conjugué tronqué

In [None]:
"""
    stopCG(normg, normg0, k, kmax, χ = 0.1, θ = 0.5)

Stopping test of the conjugate gradient iteration. Return `true` when `k`
equals `kmax` or when `normg <= normg0 * min(χ, normg0^θ)`.

When the test succeeds and the global `verbose` is true, a message reporting
the iteration count is printed.
"""
function stopCG(normg::Float64, normg0::Float64, k::Int, kmax::Int, χ::Float64 = 0.1, θ::Float64 = 0.5)
    finished = (k == kmax) || (normg <= normg0 * min(χ, normg0^θ))
    if finished && verbose
        println("CG stops after $k iterations")
    end
    return finished
end

In [None]:
# Move from `s` along `d` to the trust-region boundary ‖s‖ = Δ (Δ2 = Δ²).
# At the first CG iteration (k == 0) `s` is zero, so the step is `d` scaled to
# length Δ; afterwards σ is the positive root of ‖s + σd‖² = Δ².
function _truncatedcg_boundary(s, d, sMd, norm2d, Δ, Δ2, k)
    if k == 0
        return d / norm(d) * Δ
    end
    σ = (-sMd + sqrt(sMd * sMd + norm2d * (Δ2 - dot(s, s)))) / norm2d
    return s + σ * d
end

"""
    TruncatedCG(g, H, Δ)

Approximately minimize the quadratic model `gᵀs + ½ sᵀHs` subject to
`‖s‖ ≤ Δ` with the truncated conjugate gradient method, and return the step.

The iteration stops when `stopCG` is satisfied (at most `length(g)`
iterations), when a direction of non-positive curvature is met, or when the
next iterate would leave the trust region; in the last two cases the step is
extended to the boundary.

# Arguments
- `g`: gradient at the current iterate (not modified).
- `H`: Hessian or approximation; only products `H * d` are used.
- `Δ`: trust-region radius.
"""
function TruncatedCG(g::AbstractVector{T}, H::AbstractMatrix{T}, Δ::Real) where T
    n = length(g)
    s = zeros(n)

    normg0 = norm(g)
    v = g               # no preconditioner is applied, so v coincides with g
    d = -v
    gv = dot(g, v)
    norm2d = gv
    # Same type as the other accumulators so they never change type in the loop.
    norm2s = zero(gv)
    sMd = zero(gv)
    k = 0
    Δ2 = Δ * Δ

    while !stopCG(norm(g), normg0, k, n)
        Hd = H * d  # we can redefine the multiplication operator
        κ = dot(d, Hd)

        # Is the curvature non-positive in the direction d?
        if κ <= 0
            s = _truncatedcg_boundary(s, d, sMd, norm2d, Δ, Δ2, k)
            break
        end

        α = gv / κ

        # Check if the model minimizer along d is outside the trust region
        norm2s += α * (2 * sMd + α * norm2d)
        if norm2s >= Δ2
            s = _truncatedcg_boundary(s, d, sMd, norm2d, Δ, Δ2, k)
            break
        end

        # The model minimizer is inside the trust region
        s += α * d
        g += α * Hd     # rebinds the local g; the caller's vector is untouched
        v = g
        newgv = dot(g, v)
        β = newgv / gv
        gv = newgv
        d = -v + β * d

        # Recurrences for sᵀd and ‖d‖², avoiding extra inner products
        sMd = β * (sMd + α * norm2d)
        norm2d = gv + β * β * norm2d

        k += 1
    end

    return s
end

## Exemple

In [None]:
"""
    rosenbrock(x)

Two-dimensional Rosenbrock function `(1 - x₁)² + 100 (x₂ - x₁²)²`.
Only the first two entries of `x` are read.
"""
function rosenbrock(x::Vector)
    a, b = x[1], x[2]
    return (1.0 - a)^2 + 100.0 * (b - a^2)^2
end

"""
    rosenbrock_gradient(x)

Gradient of the two-dimensional Rosenbrock function, returned as a new
two-element vector.
"""
function rosenbrock_gradient(x::Vector)
    d1 = -2.0 * (1.0 - x[1]) - 400.0 * (x[2] - x[1]^2) * x[1]
    d2 = 200.0 * (x[2] - x[1]^2)
    return [d1, d2]
end

"""
    rosenbrock_hessian(x)

Hessian of the two-dimensional Rosenbrock function, returned as a new 2×2
matrix.
"""
function rosenbrock_hessian(x::Vector)
    h11 = 2.0 - 400.0 * x[2] + 1200.0 * x[1]^2
    h12 = -400.0 * x[1]
    h21 = -400.0 * x[1]
    return [h11 h12; h21 200.0]
end

"""
    rosenbrock_gradient!(x, storage)

Write the gradient of the two-dimensional Rosenbrock function at `x` into
`storage[1]` and `storage[2]`. The value returned is the second component.
"""
function rosenbrock_gradient!(x::Vector, storage::Vector)
    x1, x2 = x[1], x[2]
    storage[1] = -2.0 * (1.0 - x1) - 400.0 * (x2 - x1^2) * x1
    return (storage[2] = 200.0 * (x2 - x1^2))
end

"""
    rosenbrock_hessian!(x, storage)

Write the Hessian of the two-dimensional Rosenbrock function at `x` into the
leading 2×2 entries of `storage`. The value returned is the (2, 2) entry.
"""
function rosenbrock_hessian!(x::Vector, storage::Matrix)
    storage[1, 1] = 2.0 - 400.0 * x[2] + 1200.0 * x[1]^2
    for (row, col) in ((1, 2), (2, 1))
        storage[row, col] = -400.0 * x[1]
    end
    return (storage[2, 2] = 200.0)
end

In [None]:
# Solver state with the default settings (tol = 1e-6, keepTrace = false).
defaultState = BTRState()

In [None]:
defaultState.tol

In [None]:
# Two-dimensional Rosenbrock, exact Hessian, Cauchy step; a fresh default state is used.
state = btr(rosenbrock, rosenbrock_gradient, rosenbrock_hessian, CauchyStep, [0,0])

In [None]:
# Same problem with the truncated conjugate gradient step.
state = btr(rosenbrock, rosenbrock_gradient, rosenbrock_hessian, TruncatedCG, [0,0])

In [None]:
# Loosen the gradient-norm tolerance for the runs below that receive `defaultState`.
defaultState.tol = 1e-4

In [None]:
state = btr(rosenbrock, rosenbrock_gradient, rosenbrock_hessian, CauchyStep, [0,0], defaultState)

In [None]:
state = btr(rosenbrock, rosenbrock_gradient, rosenbrock_hessian, TruncatedCG, [0,0], defaultState)

In [None]:
# Timing of the two step computations on the same problem.
@benchmark btr(rosenbrock, rosenbrock_gradient, rosenbrock_hessian, CauchyStep, [0,0], defaultState)

In [None]:
@benchmark btr(rosenbrock, rosenbrock_gradient, rosenbrock_hessian, TruncatedCG, [0,0], defaultState)

In [None]:
# Quasi-Newton runs (ApproxH = true): the third argument is the Hessian update rule.
state = btr(rosenbrock, rosenbrock_gradient, BFGSUpdate, TruncatedCG, [0,0], defaultState, true)

In [None]:
state = btr(rosenbrock, rosenbrock_gradient, SR1Update, TruncatedCG, [0,0], defaultState, true)

In [None]:
# @benchmark btr(rosenbrock, rosenbrock_gradient!, BFGSUpdate, TruncatedCG, [0,0], defaultState, true)

In [None]:
# state = btr(rosenbrock, rosenbrock_gradient!, BFGSUpdate, CauchyStep, [0,0], defaultState, true)

Exemple tiré de l'introduction de « Trust-Region Methods » :
$$
f(x,y) = -10x^2+10y^2+4\sin(xy)-2x+x^4
$$

In [None]:
using ForwardDiff
# Set the tolerance of `defaultState` back to 1e-6 (it was changed to 1e-4 earlier).
defaultState.tol = 1e-6

In [None]:
"""
    f(x)

Example objective `-10x₁² + 10x₂² + 4 sin(x₁x₂) - 2x₁ + x₁⁴`.
Only the first two entries of `x` are read.
"""
function f(x::Vector)
    u, w = x[1], x[2]
    return -10*u^2 + 10*w^2 + 4*sin(u*w) - 2*u + u^4
end

In [None]:
# Gradient and Hessian of `f`, obtained with ForwardDiff.
g = x -> ForwardDiff.gradient(f, x);
H = x -> ForwardDiff.hessian(f, x)

"""
    g!(x, storage)

Evaluate the global gradient function `g` at `x` and copy the result into the
leading entries of `storage`. The value returned is the copied gradient.
"""
function g!(x::Vector, storage::Vector)
    grad = g(x)
    len = length(grad)
    return (storage[1:len] = grad[1:len])
end

In [None]:
"""
    H!(x, storage)

Evaluate the global Hessian function `H` at `x` and copy the result into the
leading block of `storage`. The value returned is the Hessian flattened into
a vector, which is also the form assigned to the block.
"""
function H!(x::Vector, storage::Matrix)
    hess = H(x)
    nrows, ncols = size(hess)
    flat = hess[1:length(hess)]
    storage[1:nrows, 1:ncols] = flat
    return flat
end

In [None]:
# Minimize `f` from the origin with the Cauchy step and the ForwardDiff derivatives.
state = btr(f, g, H, CauchyStep, [0,0])

In [None]:
# Same run with the truncated conjugate gradient step.
state = btr(f, g, H, TruncatedCG, [0,0])

In [None]:
#state = btr(f, g!, BFGSUpdate, CauchyStep, [0,0], BTRState(), true)

In [None]:
#state = btr(f, g!, BFGSUpdate, TruncatedCG, [0,0], BTRState(), true)

In [None]:
# Objective value at a hard-coded point (presumably a solution reported by a run above — TODO confirm).
f([2.30663, -0.332309])

In [None]:
# Same problem from another starting point.
state = btr(f, g, H, TruncatedCG, [-2.0,-2.0], BTRState())

In [None]:
# f([-2.21022, 0.329748])

In [None]:
# st = BTRState()
# st.keepTrace = true

In [None]:
# state = btr(f, g!, BFGSUpdate, TruncatedCG, [-2.0,-2.0], st, true)

In [None]:
# st.trace

## Rosenbrock généralisé

Les exemples précédents portent sur des fonctions de deux variables, ce qui limite la possibilité d'évaluer les gains du gradient conjugué tronqué. Reprenons la fonction de Rosenbrock généralisée
$$
f(x) = \sum_{i = 1}^{n-1} \left( 100(x_{i+1}^2-x_i)^2 + (x_i-1)^2 \right),
$$
avec l'implémentation du gradient et de la matrice hessienne, en stockage dense ou creux.

In [None]:
"""
    rosenbrock(x)

Generalized Rosenbrock function
`sum(100 (x[i+1]^2 - x[i])^2 + (x[i] - 1)^2 for i in 1:n-1)` with `n = length(x)`.

For `n < 2` the sum has no term and zero is returned.
"""
function rosenbrock(x::Vector)
    n = length(x)
    # `sum` over an empty generator throws, so handle the term-free case here.
    n < 2 && return zero(eltype(x))
    return sum(100 * (x[i + 1]^2 - x[i])^2 + (x[i] - 1)^2 for i in 1:(n - 1))
end

"""
    ∇f(x)

Gradient of the generalized Rosenbrock function, returned as a new
`Float64` vector of the same length as `x`.
"""
function ∇f(x::Vector)
    n = length(x)
    grad = zeros(n)
    for i in 1:n
        # Contribution of the i-th term of the sum (absent for the last variable)
        if i < n
            grad[i] = -200*(x[i+1]^2-x[i])+2*(x[i]-1)
        end
        # Contribution of the (i-1)-th term (absent for the first variable)
        if i > 1
            grad[i] += 400*x[i]*(x[i]^2-x[i-1])
        end
    end
    return grad
end

"""
    Hess(x)

Hessian of the generalized Rosenbrock function, returned as a new dense
`Float64` matrix. Only the three central diagonals are non-zero.
"""
function Hess(x::Vector)
    n = length(x)
    H = zeros(n, n)
    H[1, 1] = 202
    for i in 2:n
        offdiag = -400*x[i]
        H[i-1, i] = offdiag
        H[i, i-1] = offdiag
        H[i, i] = 400*(x[i]^2-x[i-1])+800*x[i]^2 + 202
    end
    # The last variable appears in no i-th term, so it has no 202 contribution.
    H[n, n] -= 202
    return H
end

"""
    TriHess(x)

Hessian of the generalized Rosenbrock function stored as a
`SymTridiagonal` matrix (main diagonal and one off-diagonal).
"""
function TriHess(x)
    n = length(x)
    diagonal = zeros(n)
    diagonal[1] = 202
    for i in 2:n
        diagonal[i] = 400*(x[i]^2-x[i-1])+800*x[i]^2 + 202
    end
    # The last variable appears in no i-th term, so it has no 202 contribution.
    diagonal[n] -= 202
    offdiagonal = [-400*x[i] for i = 2:n]
    return SymTridiagonal(diagonal, offdiagonal)
end

In [None]:
"""
    ∇f!(x, g)

Overwrite `g` with the gradient `∇f(x)` and return `g`.
"""
function ∇f!(x::Vector, g::Vector)
    grad = ∇f(x)
    g[:] = grad
    return g
end

"""
    Hess!(x, H)

Overwrite `H` with the dense Hessian `Hess(x)` and return `H`.
"""
function Hess!(x::Vector, H::Matrix)
    dense = Hess(x)
    H[:, :] = dense
    return H
end

"""
    TriHess!(x, H)

Return the Hessian of the generalized Rosenbrock function at `x` as a new
`SymTridiagonal` matrix.

Despite the `!`, the argument `H` is not written to: it is only used for
dispatch and may have any size.
"""
function TriHess!(x::AbstractVector{T}, H::AbstractMatrix{T}) where T
    n = length(x)
    maindiag = zeros(n)
    maindiag[1] = 202
    maindiag[2:n] = map(i -> 400*(x[i]^2-x[i-1])+800*x[i]^2 + 202, 2:n)
    # The last variable appears in no i-th term, so it has no 202 contribution.
    maindiag[n] -= 202
    subdiag = [-400*x[i] for i = 2:n]
    return SymTridiagonal(maindiag, subdiag)
end

In [None]:
# Gradient and Hessian of `rosenbrock`, obtained with ForwardDiff.
gr = x -> ForwardDiff.gradient(rosenbrock, x);
Hr = x -> ForwardDiff.hessian(rosenbrock, x)

In [None]:
# Small instance used to compare the dense and tridiagonal Hessians.
n = 5

x = 10*ones(n)
x[1] = 2
x[5] = 5
# NOTE(review): these assignments rebind the globals `g` and `H`, which earlier
# held the ForwardDiff closures that `g!` and `H!` call — confirm this is intended.
g = zeros(n)
H = zeros(n,n)

∇f!(x, g)
H1 = Hess!(x, H)

# `TriHess!` does not write into its second argument, so the 2×2 size is harmless.
H = zeros(2,2)
H2 = TriHess!(x, H)

# Difference between the two Hessians; expected to be zero if they agree.
norm(H1-H2)

In [None]:
# Generalized Rosenbrock with analytic derivatives: tridiagonal, then dense Hessian.
state = btr(rosenbrock, ∇f, TriHess, TruncatedCG, x, BTRState())

In [None]:
state = btr(rosenbrock, ∇f, Hess, TruncatedCG, x, BTRState())

In [None]:
state.x

In [None]:
state.g

In [None]:
# Same problem with the ForwardDiff gradient and Hessian.
state = btr(rosenbrock, gr, Hr, TruncatedCG, x, BTRState())

In [None]:
# Larger instance for the timings below.
n = 1000

x = 10*ones(n)

In [None]:
@benchmark btr(rosenbrock, ∇f, Hess, TruncatedCG, x, BTRState())

In [None]:
@benchmark btr(rosenbrock, ∇f, TriHess, TruncatedCG, x, BTRState())

In [None]:
@benchmark btr(rosenbrock, gr, Hr, TruncatedCG, x, BTRState())

In [None]:
# With the global flag on, `stopCG` prints the CG iteration count at each stop.
verbose = true
btr(rosenbrock, ∇f, Hess, TruncatedCG, x, BTRState())

In [None]:
verbose = false
btr(rosenbrock, ∇f, TriHess, TruncatedCG, x, BTRState())

In [None]:
# Medium-size instance for the quasi-Newton runs.
n = 50

x = 5*ones(n)

# `st` is reused by the following calls; `btr` resets its iteration-dependent fields.
st = BTRState()
state = btr(rosenbrock, ∇f, BFGSUpdate, TruncatedCG, x, st, true)

In [None]:
norm(state.g)

In [None]:
state = btr(rosenbrock, ∇f, SR1Update, TruncatedCG, x, st, true)

In [None]:
@benchmark btr(rosenbrock, ∇f, SR1Update, TruncatedCG, x, st, true)

In [None]:
# Timings for the different gradient/Hessian combinations on the same instance.
@benchmark btr(rosenbrock, gr, Hr, TruncatedCG, x, st)

In [None]:
@benchmark btr(rosenbrock, ∇f, Hr, TruncatedCG, x, st)

In [None]:
@benchmark btr(rosenbrock, ∇f, Hess, TruncatedCG, x, st)

In [None]:
@benchmark btr(rosenbrock, ∇f, TriHess, TruncatedCG, x, st)

In [None]:
# Directional derivative ∇f(x)·v of the generalized Rosenbrock function, where
# `v` is read from the global scope. Differentiating this scalar with respect
# to `x` gives the Hessian-vector product in the direction `v`.
# Reads x[1], x[2], x[n-1] and x[n], so `x` needs at least two entries.
function gvr(x::AbstractVector{T}) where T
    n = length(x)
    total = (-200*(x[2]^2-x[1])+2*(x[1]-1))*v[1]
    for i in 2:(n-1)
        fromcurrent = -200*(x[i+1]^2-x[i])+2*(x[i]-1)
        fromprevious = 400*x[i]*(x[i]^2-x[i-1])
        total += (fromcurrent+fromprevious)*v[i]
    end
    total += 400*x[n]*(x[n]^2-x[n-1])*v[n]
    return total
end

# Direction read by `gvr` through the global scope.
v = ones(n)

# Hessian-vector product obtained by differentiating `gvr` with ForwardDiff.
Hxv = x -> ForwardDiff.gradient(gvr, x)
# Difference with the explicit product; expected to be close to zero if both agree.
norm(Hxv(x)-TriHess(x)*v)

In [None]:
"""
    NotYetComputedMatrix(f, T = Float64)

Matrix-like object that stores a function `f` and a point `x` instead of
entries. The constructor sets `f` only; `x` is left unset until assigned.
"""
mutable struct NotYetComputedMatrix{T} <: AbstractMatrix{T}
    x::Vector{T}
    f::Function

    function NotYetComputedMatrix(f::Function, T::Type = Float64)
        lazy = new{T}()
        lazy.f = f
        return lazy
    end
end

"""
    H!(x, stack::NotYetComputedMatrix)

Record the point `x` in `stack` and return `stack`; nothing is computed here.
"""
function H!(x::Vector{T}, stack::NotYetComputedMatrix{T}) where T
    stack.x = x
    return stack
end

import Base.*
# Product of a lazy matrix with a vector: delegate to the stored function,
# called with the stored point and the vector.
function Base.:*(h::NotYetComputedMatrix{T}, v::AbstractVector{T}) where T
    apply = h.f
    return apply(h.x, v)
end

import Base.show
# Plain-text display of a lazy matrix: show a short tag instead of entries.
function Base.show(io::IO, mine::MIME"text/plain", x::NotYetComputedMatrix{T}) where T
    return show(io, "NYCM{$T}")
end

In [None]:
# Wrap `Hxv` in a lazy matrix.
# NOTE(review): `*` calls the stored function as `f(x, v)`, while `Hxv` takes only `x` — confirm the intended operator.
NotYetComputedMatrix(Hxv)