From b5ec52be3bea569a6c9448e53d506053a9e8d79e Mon Sep 17 00:00:00 2001 From: Holger Date: Thu, 1 Nov 2018 11:03:01 +0100 Subject: [PATCH 1/2] Implement SMO solver. --- .gitignore | 1 + Project.toml | 1 + REQUIRE | 1 + src/SVDD.jl | 7 +- src/solvers/smo_svdd.jl | 169 ++++++++++++++++++++++++++++++++++ src/solvers/solver_base.jl | 1 + test/runtests.jl | 4 +- test/solvers/smo_svdd_test.jl | 17 ++++ 8 files changed, 199 insertions(+), 2 deletions(-) create mode 100644 src/solvers/smo_svdd.jl create mode 100644 src/solvers/solver_base.jl create mode 100644 test/solvers/smo_svdd_test.jl diff --git a/.gitignore b/.gitignore index a6c100d..2bee9cf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ debug/ examples/\.ipynb_checkpoints/ +dev/.ipynb_checkpoints/ diff --git a/Project.toml b/Project.toml index f7f942f..691d56b 100644 --- a/Project.toml +++ b/Project.toml @@ -12,5 +12,6 @@ MLKernels = "6899632a-1081-549c-8d71-752c8a25a7ba" MLLabelUtils = "66a33bbf-0c2b-5fc8-a008-9da813334f0a" Memento = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/REQUIRE b/REQUIRE index 1674b27..31bf815 100644 --- a/REQUIRE +++ b/REQUIRE @@ -6,3 +6,4 @@ Ipopt Distributions StatsBase Memento +Compat diff --git a/src/SVDD.jl b/src/SVDD.jl index bd3cf5d..8c60fd3 100644 --- a/src/SVDD.jl +++ b/src/SVDD.jl @@ -14,8 +14,11 @@ include("init_strategies/strategies_C.jl") include("init_strategies/strategies_gamma.jl") include("init_strategies/strategies_combined.jl") +include("solvers/solver_base.jl") +include("solvers/smo_svdd.jl") + using Memento -using LinearAlgebra, Random +using LinearAlgebra, Random, Statistics const LOGGER = getlogger(@__MODULE__) @@ -29,6 +32,8 @@ export SVDDneg, SSAD, + SMOSolver, + ModelSolverException, ModelStateException, FixedParameterInitialization, diff --git a/src/solvers/smo_svdd.jl b/src/solvers/smo_svdd.jl new file mode 100644 index 0000000..1e5ea1c --- /dev/null +++ b/src/solvers/smo_svdd.jl @@ -0,0 +1,169 @@ +struct SMOSolver <: SVDDSolver + opt_precision::Float64 + max_iterations::Int +end + +function takeStep!(α, i1, i2, K, C, opt_precision) + i1 == i2 && return false + + L = max(0, α[i1] + α[i2] - C) + H = min(C, α[i1] + α[i2]) + (abs(L - H) < opt_precision) && return false + + Δ = α[i1] + α[i2] + c(i) = sum(α[j]*K[i,j] for j in eachindex(α) if !(j in [i1, i2])) + + alpha2 = ((2*Δ*(K[i1, i1] - K[i1,i2]) + c(i1) - c(i2) - K[i1, i1] + K[i2, i2]) / + (2*K[i1,i1] - 4*K[i1,i2] + 2*K[i2,i2])) + + if alpha2 > H + alpha2 = H + elseif alpha2 < L + alpha2 = L + end + + # see page 10 + # J. Platt, "Sequential minimal optimization: A fast algorithm for training support vector machines," 1998. 
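+    # Only accept the step if the change in α[i2] is significant relative to its
+    # magnitude; otherwise report no progress so that the caller can try a
+    # different pair of indices.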
+ if abs(α[i2] - alpha2) < opt_precision * (alpha2 + α[i2] + opt_precision) + return false + else + alpha1 = Δ - alpha2 + α[i1] = alpha1 + α[i2] = alpha2 + return true + end +end + +# TODO: this is a faster version of predict from classifier_svdd for in-sample predictions +# this should be the default for in-sample predictions +function calculate_predictions(α, K, C, opt_precision) + sv_larger_than_zero = α .> opt_precision + sv_smaller_than_C = α .< (C - opt_precision) + + sv_larger_than_zero_idx = findall(sv_larger_than_zero) + const_term = sum(α[i] * α[j] * K[i,j] for i in sv_larger_than_zero_idx for j in sv_larger_than_zero_idx) + distances_to_center = [K[z, z] - 2 * sum(α[i] * K[i, z] for i in sv_larger_than_zero_idx) + const_term for z in eachindex(α)] + + ## see W.-C. Chang, C.-P. Lee, and C.-J. Lin, "A revisit to support vector data description," 2013 + if any(sv_larger_than_zero .& sv_smaller_than_C) + R = mean(distances_to_center[sv_larger_than_zero .& sv_smaller_than_C]) + else + R = (minimum(distances_to_center[sv_larger_than_zero]) + maximum(distances_to_center[sv_larger_than_zero])) / 2 + end + + distances_to_decision_boundary = distances_to_center .- R + return (distances_to_center, distances_to_decision_boundary, R) +end + +function violates_KKT_condition(i2, distances_to_decision_boundary, α, C, opt_precision) + p1 = (α[i2] > opt_precision) && (distances_to_decision_boundary[i2] < -opt_precision) # inlier, but alpha > 0 + p2 = (α[i2] < C - opt_precision) && (distances_to_decision_boundary[i2] > opt_precision) # outlier, but alpha != C + return p1 || p2 +end + +# See Equation (4.8) in +# B. Schölkopf, J. C. Platt, J. Shawe-Taylor, A. J. Smola, and R. C. Williamson, +# "Estimating the support of a high-dimensional distribution," 2001 +function second_choice_heuristic(i2, α, distances_to_center, C, opt_precision) + SV_nb = (α .> opt_precision) .& (α .< C - opt_precision) + if !any(SV_nb) + SV_nb = α .> opt_precision + end + findall(SV_nb)[findmax(abs.(distances_to_center[i2] .- distances_to_center[SV_nb]))[2]] +end + +# The fallback strategies if second choice heuristic returns false follow recommendations in +# J. Platt, "Sequential minimal optimization: A fast algorithm for training support vector machines," 1998. 
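+# Order of attempts: (1) the index from the second choice heuristic,
+# (2) all non-bound support vectors in random order, (3) all indices in random order.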
+function examineExample!(α, i2, distances_to_center, K, C, opt_precision) + # use the second choice heuristic + i1 = second_choice_heuristic(i2, α, distances_to_center, C, opt_precision) + takeStep!(α, i1, i2, K, C, opt_precision) && return true + + # loop over all non-zero and non-C alpha, starting at random position + candidates = findall((α .> opt_precision) .& (α .< C - opt_precision)) + if !isempty(candidates) + for i1 in shuffle(candidates) + takeStep!(α, i1, i2, K, C, opt_precision) && return true + end + end + + # loop over all + for i1 in shuffle(eachindex(α)) + takeStep!(α, i1, i2, K, C, opt_precision) && return true + end + return false +end + +function initialize_alpha(data, C) + n_init = trunc(Int, 1 / (C)) + 1 + α = fill(0.0, size(data, 2)) + α[sample(1:size(data, 2), n_init, replace=false)] .= 1 / n_init + @assert sum(α) ≈ 1 && all(α .<= C) + return α +end + +function examine_and_update_predictions!(α, distances_to_center, distances_to_decision_boundary, R, + KKT_violations, black_list, K, C, opt_precision) + i2 = sample(KKT_violations) + if examineExample!(α, i2, distances_to_center, K, C, opt_precision) + distances_to_center, distances_to_decision_boundary, R = calculate_predictions(α, K, C, opt_precision) + else + push!(black_list, i2) + end + return distances_to_center, distances_to_decision_boundary, R +end + +function smo(α, K, C, opt_precision, max_iterations) + distances_to_center, distances_to_decision_boundary, R = calculate_predictions(α, K, C, opt_precision) + + iter = 0 + while iter < max_iterations + # Fall back strategy: add indices to black list if examine can not make positive step. + # See page 9, J. Platt, "Sequential minimal optimization: A fast algorithm for training support vector machines," 1998. + black_list = Set{Int}() + iter += 1 + + # scan over all data + KKT_violation_all_idx = filter(i -> violates_KKT_condition(i, distances_to_decision_boundary, α, C, opt_precision) && i ∉ black_list, eachindex(α)) + if isempty(KKT_violation_all_idx) + return build_result(α, distances_to_decision_boundary, R, K, C, opt_precision, :Optimal, "No more KKT_violations.") + else + distances_to_center, distances_to_decision_boundary, R = examine_and_update_predictions!(α, distances_to_center, distances_to_decision_boundary, R, KKT_violation_all_idx, black_list, K, C, opt_precision) + end + + # scan over SV_nb + SV_nb = (α .> opt_precision) .& (α .< C - opt_precision) + KKT_violations_in_SV_nb = filter(i -> violates_KKT_condition(i, distances_to_decision_boundary, α, C, opt_precision) && i ∉ black_list, findall(SV_nb)) + while length(KKT_violations_in_SV_nb) > 0 && iter < max_iterations + iter += 1 + distances_to_center, distances_to_decision_boundary, R = examine_and_update_predictions!(α, distances_to_center, distances_to_decision_boundary, R, KKT_violations_in_SV_nb, black_list, K, C, opt_precision) + KKT_violations_in_SV_nb = filter(i -> violates_KKT_condition(i, distances_to_decision_boundary, α, C, opt_precision) && i ∉ black_list, findall(SV_nb)) + end + end + return build_result(α, distances_to_decision_boundary, R, K, C, opt_precision, :UserLimit, "Reached max number of iterations.") +end + +function calculate_duality_gap(α, distances_to_decision_boundary, R, K, C, opt_precision) + sv_larger_than_zero_idx = findall(α .> opt_precision) + primal_obj = R + sum(distances_to_decision_boundary[distances_to_decision_boundary .> opt_precision] * C) + dual_obj = sum(α[i] * K[i,i] for i in sv_larger_than_zero_idx) - sum(α[i] * α[j] * K[i,j] for i in 
sv_larger_than_zero_idx for j in sv_larger_than_zero_idx) + duality_gap = primal_obj - dual_obj + return primal_obj, dual_obj, duality_gap +end + +function build_result(α, distances_to_decision_boundary, R, K, C, opt_precision, status, msg) + if status == :Optimal + info(LOGGER, "Exit with status: $status. (" * msg * ")") + else + warn(LOGGER, "Exit with status: $status. (" * msg * ")") + end + primal_obj, dual_obj, duality_gap = calculate_duality_gap(α, distances_to_decision_boundary, R, K, C, opt_precision) + info(LOGGER, "duality gap: $duality_gap, primal objective: $primal_obj, dual objective: $dual_obj") + return α, primal_obj, duality_gap, duality_gap, status +end + +function solve!(model::VanillaSVDD, solver::SMOSolver) + α = initialize_alpha(model.data, model.C) + model.alpha_values, primal_obj, dual_obj, duality_gap, status = smo(α, model.K, model.C, solver.opt_precision, solver.max_iterations) + return status +end diff --git a/src/solvers/solver_base.jl b/src/solvers/solver_base.jl new file mode 100644 index 0000000..cf73060 --- /dev/null +++ b/src/solvers/solver_base.jl @@ -0,0 +1 @@ +abstract type SVDDSolver end diff --git a/test/runtests.jl b/test/runtests.jl index 68382d7..c15ef10 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,7 @@ using JuMP, Ipopt using StatsBase, Distributions using MLKernels, MLLabelUtils using Test -using LinearAlgebra, Random +using LinearAlgebra, Random, Statistics TEST_SOLVER = with_optimizer(Ipopt.Optimizer, print_level=0) @@ -19,4 +19,6 @@ include("test_utils.jl") include("classifiers/classifier_svdd_vanilla_test.jl") include("init_strategies/init_strategies_test.jl") + + include("solvers/smo_svdd_test.jl") end diff --git a/test/solvers/smo_svdd_test.jl b/test/solvers/smo_svdd_test.jl new file mode 100644 index 0000000..b79b8fb --- /dev/null +++ b/test/solvers/smo_svdd_test.jl @@ -0,0 +1,17 @@ +@testset "smo svdd" begin + dummy_data, labels = generate_mvn_with_outliers(2, 1000, 123, true, true) + pools = fill(:U, size(dummy_data, 2)) + + init_strategy = SVDD.FixedParameterInitialization(MLKernels.GaussianKernel(0.5), 0.5) + + model = SVDD.VanillaSVDD(dummy_data) + SVDD.initialize!(model, init_strategy) + solver = SMOSolver(1e-4, 10) + status = SVDD.solve!(model, solver) + @test status == :UserLimit + + solver = SMOSolver(1e-2, 1000) + status = SVDD.fit!(model, solver) + @test status == :Optimal + @test model.R > 0 +end From 00cc530e4e9e8fbba957d9ec2a18ee9c93f77849 Mon Sep 17 00:00:00 2001 From: Holger Date: Sat, 10 Nov 2018 14:27:39 +0100 Subject: [PATCH 2/2] Add docs. --- .travis.yml | 10 ++ Manifest.toml | 12 ++ Project.toml | 1 + README.md | 5 + docs/.gitignore | 2 + docs/Project.toml | 2 + docs/make.jl | 18 +++ docs/src/index.md | 3 + docs/src/smo.md | 246 ++++++++++++++++++++++++++++++++++++++++ docs/src/start.md | 9 ++ src/solvers/smo_svdd.jl | 41 ++++++- 11 files changed, 345 insertions(+), 4 deletions(-) create mode 100644 docs/.gitignore create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/index.md create mode 100644 docs/src/smo.md create mode 100644 docs/src/start.md diff --git a/.travis.yml b/.travis.yml index 67b9d0c..cf9a7bc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,3 +20,13 @@ cache: after_success: - julia -e 'import Pkg; cd(Pkg.dir("SVDD")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'; + +jobs: + include: + - stage: "Documentation" + julia: 1.0 + os: linux + script: + - julia --project="." 
-e 'using Pkg; Pkg.instantiate();' + - julia --project="." docs/make.jl + after_success: skip diff --git a/Manifest.toml b/Manifest.toml index 697edad..fe36e1f 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -73,6 +73,18 @@ git-tree-sha1 = "c24e9b6500c037673f0241a2783472b8c3d080c7" uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" version = "0.16.4" +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "a016e0bfe98a748c4488e2248c2ef4c67d6fdd35" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.5.0" + +[[Documenter]] +deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "LibGit2", "Logging", "Markdown", "Pkg", "REPL", "Random", "Test", "Unicode"] +git-tree-sha1 = "9f2135e0e7ecb63f9c3ef73ea15a31d8cdb79bb7" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.20.0" + [[EzXML]] deps = ["BinaryProvider", "Libdl", "Printf", "Test"] git-tree-sha1 = "5623d1486bfaadd815f5c4ca501adda02b5337f1" diff --git a/Project.toml b/Project.toml index 691d56b..01e8a79 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.2.0" [deps] Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9" JuMP = "4076af6c-e467-56ae-b986-b466b2749572" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" diff --git a/README.md b/README.md index 85ae81d..216bcc0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # SVDD.jl _A Julia package for Support Vector Data Description._ +[![][docs-master-img]][docs-master-url] [![Build Status](https://travis-ci.com/englhardt/SVDD.jl.svg?branch=master)](https://travis-ci.com/englhardt/SVDD.jl) [![Coverage Status](https://coveralls.io/repos/github/englhardt/SVDD.jl/badge.svg?branch=master)](https://coveralls.io/github/englhardt/SVDD.jl?branch=master) @@ -72,3 +73,7 @@ This package is developed and maintained by [Holger Trittenbach](https://github. [3] Scott, David W. Multivariate density estimation: theory, practice, and visualization. John Wiley & Sons, 2015. [4] Silverman, Bernard W. Density estimation for statistics and data analysis. Routledge, 2018. 
+
+
+[docs-master-img]: https://img.shields.io/badge/docs-master-blue.svg
+[docs-master-url]: https://englhardt.github.io/SVDD.jl/latest
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..a303fff
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,2 @@
+build/
+site/
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 0000000..dfa65cd
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,2 @@
+[deps]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
diff --git a/docs/make.jl b/docs/make.jl
new file mode 100644
index 0000000..26cf7d6
--- /dev/null
+++ b/docs/make.jl
@@ -0,0 +1,18 @@
+using Documenter
+using SVDD
+
+makedocs(
+    sitename = "SVDD Documentation",
+    pages = [
+        "Home" => "index.md",
+        "Getting Started" => "start.md",
+        "Custom Solvers" => [
+            "SMO" => "smo.md"
+        ]
+    ],
+    format = :html,
+    modules = [SVDD]
+)
+deploydocs(
+    repo = "github.com/englhardt/SVDD.jl.git"
+)
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 0000000..50c3bdb
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,3 @@
+# SVDD.jl
+
+Documentation for SVDD.jl
diff --git a/docs/src/smo.md b/docs/src/smo.md
new file mode 100644
index 0000000..4144663
--- /dev/null
+++ b/docs/src/smo.md
@@ -0,0 +1,246 @@
+# SMO
+
+Sequential Minimal Optimization (SMO) is a decomposition method to solve quadratic optimization problems with a specific structure. The original SMO algorithm by John C. Platt was proposed for Support Vector Machines (SVM) [1]. There are several modifications for other types of support vector machines. This section describes the implementation of SMO for Support Vector Data Description (SVDD) [2].
+
+The implementation of SMO for SVDD is based on an adaptation of SMO to one-class classification [3]. Therefore, this documentation focuses on the specific adaptations required for SVDD. The following descriptions assume familiarity with the basics of SMO [1], its adaptation to the one-class SVM [3], and SVDD [2].
+
+## SVDD Overview
+
+SVDD is an optimization problem of the following form.
+
+```math
+    \begin{aligned}
+    P: \ & \underset{R, a, \xi}{\text{minimize}}
+    & & R^2 + C \sum_i \xi_i \\
+    & \text{subject to}
+    & & \left\Vert \Phi(x_{i}) - a \right\Vert^2 \leq R^2 + \xi_i, \; ∀ i \\
+    & & & \xi_i \geq 0, \; ∀ i
+    \end{aligned}
+```
+with radius $R$ and center of the hypersphere $a$, slack variables $\xi$, a trade-off parameter $C$, and a mapping $\Phi$ into an implicit feature space.
+
+The Lagrangian Dual is:
+
+```math
+\begin{aligned}
+D: \ & \underset{\alpha}{\text{maximize}}
+& & \sum_i \alpha_i K_{i,i} - \sum_{i,j}\alpha_i\alpha_j K_{i,j} \\
+& \text{subject to}
+& & \sum_i \alpha_i = 1 \\
+& & & 0 \leq \alpha_i \leq C, \; ∀ i \\
+\end{aligned}
+```
+where $\alpha$ are the Lagrange multipliers, and $K_{i,j} = \langle \Phi(x_i), \Phi(x_j) \rangle$ is the inner product in the implicit feature space.
+Solving the Lagrangian Dual gives an optimal $\alpha$.
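+
+For illustration, the dual objective and the constraints of $D$ can be evaluated directly for a candidate $\alpha$ and a precomputed kernel matrix. The following is a minimal sketch, not part of the package API; the function names are made up for this example.
+
+```julia
+# Dual objective of D for a candidate α and kernel matrix K.
+dual_objective(α, K) = sum(α[i] * K[i, i] for i in eachindex(α)) -
+    sum(α[i] * α[j] * K[i, j] for i in eachindex(α), j in eachindex(α))
+
+# Feasibility with respect to the constraints of D.
+is_feasible(α, C) = isapprox(sum(α), 1.0; atol=1e-8) && all(0 .<= α .<= C)
+```
+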
+## SMO for SVDD
+
+The basic idea of SMO is to solve reduced versions of the Lagrangian iteratively.
+In each iteration, the reduced version of the Lagrangian consists of only two decision variables, i.e., $\alpha_{i1}$ and $\alpha_{i2}$, while $\alpha_j, j∉\{i1, i2\}$ are fixed.
+An iteration of SMO consists of two steps:
+
+**Selection Step:** Select $i1$ and $i2$.
+  * The search for a good $i2$ is implemented in [`SVDD.smo`](@ref)
+  * There are several heuristics to select $i1$ based on the choice for $i2$. These heuristics are implemented in [`SVDD.examineExample!`](@ref)
+
+**Optimization Step:** Solving the reduced Lagrangian for $\alpha_{i1}$ and $\alpha_{i2}$.
+  * Implemented in [`SVDD.takeStep!`](@ref)
+
+The iterative procedure converges to the global optimum.
+The following sections give details on both steps.
+
+### Optimization Step: Solving the reduced Lagrangian
+
+The following describes how to infer the optimal solution for a given $\alpha_{i1}$ and $\alpha_{i2}$ analytically.
+
+First, $\alpha_{i1}$ and $\alpha_{i2}$ can only be changed in a limited range.
+The reason is that after the optimization step, they still have to obey the constraints of the Lagrangian.
+From $\sum_i\alpha_i = 1$, one can infer that $Δ = \alpha_{i1} + \alpha_{i2}$ remains constant for one optimization step.
+That is, if we add some value to $\alpha_{i2}$, we must remove the same value from $\alpha_{i1}$.
+We also know that $\alpha_{i} \geq 0$ and $\alpha_{i} \leq C$.
+From this, one can infer the maximum and minimum value that one can add to or subtract from $\alpha_{i2}$, i.e., one can calculate the lower and the upper bound:
+
+```math
+\begin{aligned}
+  L &= max(0, \alpha_{i1} + \alpha_{i2} - C)\\
+  H &= min(C, \alpha_{i1} + \alpha_{i2})
+\end{aligned}
+```
+
+(Note: This is slightly different from the original SMO, as one does not need to discern between different labels $y_i \in \{1,-1\}$.)
+
+Second, the optimal value ``\alpha^*_{i2}`` can be derived analytically by setting the partial derivative of the Lagrangian objective function to 0.
+
+```math
+f_{D} = \sum_{i}\alpha_{i} K_{i,i} - \sum_{i,j} \alpha_i \alpha_j K_{i,j} \\
+\frac{\partial f_{D}}{\partial \alpha_{i2}} = 0
+\iff \alpha^*_{i2} = \frac{2\Delta(K_{i1,i1} - K_{i1,i2}) + C_1 - C_2 - K_{i1,i1} + K_{i2,i2}}{2K_{i1,i1}+2K_{i2, i2}-4K_{i1, i2}}, \\
+\text{where} \ C_k = \sum_{j=3}^{N}\alpha_j K_{k,j}
+```
+
+The resulting value is _clipped_ to the feasible interval.
+
+```
+if α*_i2 > H
+    α'_i2 = H
+elseif α*_i2 < L
+    α'_i2 = L
+end
+```
+where `α'_i2` is the updated value of `α_i2` after the optimization step.
+It follows that
+
+```
+ α'_i1 = Δ - α'_i2
+```
+
+To allow the algorithm to converge, one has to decide, based on a threshold, whether an update to the alpha values has been significant, i.e., whether the difference between the old and the new value is above a specified precision.
+The implementation uses the decision rule from the original SMO [1, p.10], i.e., update alpha values only if
+
+```math
+\lvert\alpha_{i2} - \alpha'_{i2} \rvert > \text{opt_precision} * (\alpha_{i2} + \alpha'_{i2} + \text{opt_precision})
+```
+
+where `opt_precision` is a parameter of the optimization algorithm.
+This optimization step is implemented in
+
+```@docs
+SVDD.takeStep!
+```
+### Selection Step: Finding a pair (i1, i2)
+
+To take an optimization step, one has to select $i1$ and $i2$ first.
+The rationale of SMO is to select indices that are likely to make a large optimization step.
+SMO uses heuristics to first select $i2$, and then select $i1$ based on it.
+
+##### Selection of i2
+
+A minimum of $P$ has to obey the [KKT conditions](https://en.wikipedia.org/wiki/Karush%E2%80%93Kuhn%E2%80%93Tucker_conditions).
+The relevant KKT condition here is complementary slackness, i.e.,
+
+```math
+  \mu_i g_i(x^*) = 0, \, \forall i
+```
+
+with dual variables $\mu$ and inequality constraints $g$.
+In other words, either the inequality constraint is fulfilled with equality, i.e., $g_i = 0$, or the Lagrange multiplier is zero, i.e., $\mu_i=0$.
+Here, the inequality constraints are $\left\lVert \Phi(x_i) - a \right\rVert^2 - R^2 - \xi_i \leq 0$ with multipliers $\alpha_i$, and $-\xi_i \leq 0$ with multipliers $C - \alpha_i$.
+For SVDD, this translates to
+
+```math
+  \begin{aligned}
+  &\left\lVert a - \phi(x_i) \right\rVert^2 < R^2 \rightarrow \alpha_i = 0 \\
+  &\left\lVert a - \phi(x_i) \right\rVert^2 = R^2 \rightarrow 0 < \alpha_i < C \\
+  &\left\lVert a - \phi(x_i) \right\rVert^2 > R^2 \rightarrow \alpha_i = C
+  \end{aligned}
+```
+See [2] for details.
+The distance to the decision boundary is $\left\lVert a - \phi(x_i) \right\rVert^2 - R^2$, which is negative for observations that lie inside the hypersphere.
+
+So to check for KKT violations, one has to calculate the distance of $\phi(x_i)$ from the decision boundary, i.e., the left-hand side of the implications above, and compare it with the respective $\alpha$ value.
+The check for KKT violations is implemented in
+
+```@docs
+  SVDD.violates_KKT_condition
+```
+
+[`SVDD.smo`](@ref) selects $i2$ by searching for indices that violate the KKT conditions.
+
+```@docs
+  SVDD.smo
+```
+
+This function conducts two types of search.
+
+_First type:_ a search over the full data set that randomly selects one of the violating indices.
+
+_Second type:_ a restricted search for violations over the subset where $0 < \alpha_i < C$.
+These variables are the non-bounded support vectors $SV_{nb}$.
+
+There is one search of the first type, then multiple searches of the second type.
+After each search, $i2$ is selected randomly from one of the violating indices, see
+
+```@docs
+SVDD.examine_and_update_predictions!
+```
+
+##### Selection of i1
+
+SMO selects $i1$ such that the optimization step is as large as possible.
+The idea for selecting $i1$ is as follows.
+For $\alpha_{i2} > 0$ and a negative distance to the decision boundary, $\alpha_{i2}$ may decrease.
+So a good $\alpha_{i1}$ is one that is likely to increase in the optimization step, i.e., an index where the distance to the decision boundary is positive, and $\alpha_{i1} = 0$.
+The heuristic selects the $i1 \in SV_{nb}$ with the maximum absolute difference between the distance to the center of $i2$ and the distance to the center of $i1$.
+(Note that using the distance to the decision boundary is equivalent to using the distance to the center in this step.)
+This selection heuristic is implemented in
+
+```@docs
+  SVDD.second_choice_heuristic
+```
+
+In some cases, the selected $i1$ does not lead to a positive optimization step.
+In this case, there are two fallback strategies.
+First, all other indices in $SV_{nb}$ are tried in random order until one results in a positive optimization step.
+Second, if there still is no $i1$ that results in a positive optimization step, all remaining indices are tried.
+If none of the fallback strategies works, $i2$ is skipped and added to a blacklist.
+The fallback strategies are implemented in
+
+```@docs
+  SVDD.examineExample!
+```
+
+### Termination
+
+If there are no more KKT violations, the algorithm terminates.
+
+### Further implementation details
+
+This section describes some further implementation details.
+
+##### Initialize alpha
+
+The vector $\alpha$ must be initialized such that it fulfills the constraints of $D$.
+The implementation uses the initialization strategy proposed in [3], i.e., randomly selecting $\lfloor \frac{1}{C} \rfloor + 1$ observations and assigning each of them the same weight such that $\sum_i \alpha_i = 1$; see the example below.
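+
+For example (a sketch with assumed values that mirrors the implementation): for $C = 0.3$, there are $\lfloor \frac{1}{C} \rfloor + 1 = 4$ randomly chosen observations, each with weight $\frac{1}{4}$:
+
+```julia
+using StatsBase
+
+C = 0.3
+n = 10                                  # number of observations, assumed for this example
+n_init = trunc(Int, 1 / C) + 1          # 4 indices receive a non-zero weight
+α = zeros(n)
+α[sample(1:n, n_init, replace=false)] .= 1 / n_init
+@assert sum(α) ≈ 1 && all(α .<= C)      # feasible with respect to the constraints of D
+```
+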
+This is implemented in + +```@docs + SVDD.initialize_alpha +``` + +##### Calculating Distances to Decision Boundary + +The distances to the decision boundary are calculated in + +```@docs + SVDD.calculate_predictions +``` + +In general, to calculate $R$, one can calculate the distance to any non-bounded support vector, i.e., $0 < \alpha_i < C$, as they all lie on the hypershpere. +However, this may not always hold. +There may be cases where the solution for R is not unique, and different support vectors result in different $R$, in particular in intermediate optimization steps where some $\alpha$ values may be non-bounded but violate the KKT conditions. +Therefore, $R$ is averaged over all non-bounded support vectors. +See also [4] for details on non-unique $R$ values. + +##### SMO parameters + +There are two parameters for SMO: `opt_precision` and `max_iterations`. + +`opt_precision` influences the convergence. +Small `opt_precision` values require a larger number of iterations until termination. + +`max_iterations` controls the number of times a new $i2$ is selected to attempt an optimization step. + +## External API + +```@docs + SVDD.solve!(model::VanillaSVDD, solver::SMOSolver) +``` + +## References +[1] J. Platt, "Sequential minimal optimization: A fast algorithm for training support vector machines," 1998. + +[2] D. M. J. Tax and R. P. W. Duin, "Support Vector Data Description,"" Mach. Learn., 2004. + +[3] B. Schölkopf, J. C. Platt, J. Shawe-Taylor, A. J. Smola, and R. C. Williamson, "Estimating the support of a high-dimensional distribution,"" Neural Comput., 2001. + +[4] W.-C. Chang, C.-P. Lee, and C.-J. Lin, "A revisit to support vector data description,"Nat. Taiwan Univ., Tech. Rep, 2013. diff --git a/docs/src/start.md b/docs/src/start.md new file mode 100644 index 0000000..918cd27 --- /dev/null +++ b/docs/src/start.md @@ -0,0 +1,9 @@ +# Getting Started + +## Installation + +SVDD.jl is not yet registered. To install, run + +``` +(v1.0) pkg> add https://github.com/englhardt/SVDD.jl +``` diff --git a/src/solvers/smo_svdd.jl b/src/solvers/smo_svdd.jl index 1e5ea1c..76d7542 100644 --- a/src/solvers/smo_svdd.jl +++ b/src/solvers/smo_svdd.jl @@ -3,6 +3,11 @@ struct SMOSolver <: SVDDSolver max_iterations::Int end +""" + takeStep!(α, i1, i2, K, C, opt_precision) + + Take optimization step for i1 and i2 and update α. +""" function takeStep!(α, i1, i2, K, C, opt_precision) i1 == i2 && return false @@ -36,6 +41,9 @@ end # TODO: this is a faster version of predict from classifier_svdd for in-sample predictions # this should be the default for in-sample predictions +""" + calculate_predictions(α, K, C, opt_precision) +""" function calculate_predictions(α, K, C, opt_precision) sv_larger_than_zero = α .> opt_precision sv_smaller_than_C = α .< (C - opt_precision) @@ -54,7 +62,10 @@ function calculate_predictions(α, K, C, opt_precision) distances_to_decision_boundary = distances_to_center .- R return (distances_to_center, distances_to_decision_boundary, R) end +""" + violates_KKT_condition(i2, distances_to_decision_boundary, α, C, opt_precision) +""" function violates_KKT_condition(i2, distances_to_decision_boundary, α, C, opt_precision) p1 = (α[i2] > opt_precision) && (distances_to_decision_boundary[i2] < -opt_precision) # inlier, but alpha > 0 p2 = (α[i2] < C - opt_precision) && (distances_to_decision_boundary[i2] > opt_precision) # outlier, but alpha != C @@ -64,6 +75,9 @@ end # See Equation (4.8) in # B. Schölkopf, J. C. Platt, J. Shawe-Taylor, A. J. Smola, and R. C. 
Williamson, # "Estimating the support of a high-dimensional distribution," 2001 +""" + second_choice_heuristic(i2, α, distances_to_center, C, opt_precision) +""" function second_choice_heuristic(i2, α, distances_to_center, C, opt_precision) SV_nb = (α .> opt_precision) .& (α .< C - opt_precision) if !any(SV_nb) @@ -72,8 +86,13 @@ function second_choice_heuristic(i2, α, distances_to_center, C, opt_precision) findall(SV_nb)[findmax(abs.(distances_to_center[i2] .- distances_to_center[SV_nb]))[2]] end -# The fallback strategies if second choice heuristic returns false follow recommendations in -# J. Platt, "Sequential minimal optimization: A fast algorithm for training support vector machines," 1998. + +""" + examineExample!(α, i2, distances_to_center, K, C, opt_precision) + + The fallback strategies if second choice heuristic returns false follow recommendations in + J. Platt, "Sequential minimal optimization: A fast algorithm for training support vector machines," 1998. +""" function examineExample!(α, i2, distances_to_center, K, C, opt_precision) # use the second choice heuristic i1 = second_choice_heuristic(i2, α, distances_to_center, C, opt_precision) @@ -94,6 +113,9 @@ function examineExample!(α, i2, distances_to_center, K, C, opt_precision) return false end +""" + initialize_alpha(data, C) +""" function initialize_alpha(data, C) n_init = trunc(Int, 1 / (C)) + 1 α = fill(0.0, size(data, 2)) @@ -102,6 +124,10 @@ function initialize_alpha(data, C) return α end +""" + examine_and_update_predictions!(α, distances_to_center, distances_to_decision_boundary, R, + KKT_violations, black_list, K, C, opt_precision) +""" function examine_and_update_predictions!(α, distances_to_center, distances_to_decision_boundary, R, KKT_violations, black_list, K, C, opt_precision) i2 = sample(KKT_violations) @@ -113,6 +139,10 @@ function examine_and_update_predictions!(α, distances_to_center, distances_to_d return distances_to_center, distances_to_decision_boundary, R end +""" + smo(α, K, C, opt_precision, max_iterations) + +""" function smo(α, K, C, opt_precision, max_iterations) distances_to_center, distances_to_decision_boundary, R = calculate_predictions(α, K, C, opt_precision) @@ -153,15 +183,18 @@ end function build_result(α, distances_to_decision_boundary, R, K, C, opt_precision, status, msg) if status == :Optimal - info(LOGGER, "Exit with status: $status. (" * msg * ")") + info(LOGGER, "Exit with status: $status. ($msg)") else - warn(LOGGER, "Exit with status: $status. (" * msg * ")") + warn(LOGGER, "Exit with status: $status. ($msg)") end primal_obj, dual_obj, duality_gap = calculate_duality_gap(α, distances_to_decision_boundary, R, K, C, opt_precision) info(LOGGER, "duality gap: $duality_gap, primal objective: $primal_obj, dual objective: $dual_obj") return α, primal_obj, duality_gap, duality_gap, status end +""" + solve!(model::VanillaSVDD, solver::SMOSolver) +""" function solve!(model::VanillaSVDD, solver::SMOSolver) α = initialize_alpha(model.data, model.C) model.alpha_values, primal_obj, dual_obj, duality_gap, status = smo(α, model.K, model.C, solver.opt_precision, solver.max_iterations)