Skip to content

Commit

Permalink
Implement wang gamma init strategy.
Browse files Browse the repository at this point in the history
  • Loading branch information
englhardt committed Nov 6, 2018
1 parent 7fc6209 commit d559997
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 0 deletions.
30 changes: 30 additions & 0 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ git-tree-sha1 = "c49ec69428ffea0c1d1bbdc63d1a70f5df5860ad"
uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
version = "0.0.7"

[[Distances]]
deps = ["LinearAlgebra", "Printf", "Random", "Statistics", "Test"]
git-tree-sha1 = "2f38605722542f1c0a32dd2856fb529d8c226c69"
uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
version = "0.7.3"

[[Distributed]]
deps = ["LinearAlgebra", "Random", "Serialization", "Sockets"]
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Expand Down Expand Up @@ -95,6 +101,12 @@ git-tree-sha1 = "b94462673702e3701852bbff61ff8bbf96feb351"
uuid = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
version = "0.4.3"

[[IterTools]]
deps = ["SparseArrays", "Test"]
git-tree-sha1 = "79246285c43602384e6f1943b3554042a3712056"
uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
version = "1.1.1"

[[JSON]]
deps = ["Dates", "Distributed", "Mmap", "Sockets", "Test", "Unicode"]
git-tree-sha1 = "fec8e4d433072731466d37ed0061b3ba7f70eeb9"
Expand Down Expand Up @@ -128,6 +140,12 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
[[Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[MLBase]]
deps = ["IterTools", "Random", "Reexport", "StatsBase", "Test"]
git-tree-sha1 = "f63a8d37429568b8c4384d76c4a96fc2897d6ddf"
uuid = "f0e99cf1-93fa-52ec-9ecc-5026115318e0"
version = "0.8.0"

[[MLKernels]]
deps = ["LinearAlgebra", "SpecialFunctions", "Statistics", "Test"]
git-tree-sha1 = "456c6aa2928ae85064ef1d28b80d04fafed08986"
Expand Down Expand Up @@ -191,6 +209,12 @@ git-tree-sha1 = "ce3b85e484a5d4c71dd5316215069311135fa9f2"
uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
version = "0.3.2"

[[NearestNeighbors]]
deps = ["Distances", "LinearAlgebra", "Mmap", "StaticArrays", "Test"]
git-tree-sha1 = "aab46b96ae5c2a9c08146188016d6312276094e5"
uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
version = "0.4.2"

[[Nullables]]
deps = ["Compat"]
git-tree-sha1 = "ae1a63457e14554df2159b0b028f48536125092d"
Expand Down Expand Up @@ -231,6 +255,12 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[Reexport]]
deps = ["Pkg"]
git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0"
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
version = "0.2.0"

[[Rmath]]
deps = ["BinaryProvider", "Libdl", "Random", "Statistics", "Test"]
git-tree-sha1 = "9a6c758cdf73036c3239b0afbea790def1dabff9"
Expand Down
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLBase = "f0e99cf1-93fa-52ec-9ecc-5026115318e0"
MLKernels = "48eadcf2-8ff1-11e8-289d-f103432a0bb3"
MLLabelUtils = "66a33bbf-0c2b-5fc8-a008-9da813334f0a"
Memento = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
97 changes: 97 additions & 0 deletions src/init_strategies/strategies_gamma.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,100 @@ struct RuleOfThumbScott <: InitializationStrategyGamma end
"""
    calculate_gamma(model, strategy::RuleOfThumbScott)

Calculate gamma via Scott's rule of thumb, `n^(-1 / (d + 4))`, where `n` is
the number of observations (columns of `model.data`) and `d` the number of
dimensions (rows of `model.data`).
"""
function calculate_gamma(model, strategy::RuleOfThumbScott)
    n_observations = size(model.data, 2)
    n_dimensions = size(model.data, 1)
    return n_observations^(-1.0 / (n_dimensions + 4))
end

"""
    WangGammaStrategy(solver[, gamma_search_range, C[, scoring_function]])

Gamma initialization strategy that selects gamma by grid search on a binary
tuning data set generated from unlabeled data, according to the following paper:
Wang, S. et al. 2018. Hyperparameter selection of one-class support vector machine by self-adaptive data
shifting. Pattern Recognition. 74, 2018.
"""
struct WangGammaStrategy <: InitializationStrategyGamma
    solver                # solver used to fit a candidate model per gamma
    gamma_search_range    # iterable of candidate gamma values
    C                     # fixed cost parameter used while searching gamma
    # Binary classification score; higher is better.
    # NOTE(review): declared as f1_scoring(predictions, ground_truth) but
    # invoked with (ground_truth, predictions) in calculate_gamma — the two
    # orders coincide for F1 (symmetric in FP/FN); confirm before passing a
    # custom, asymmetric scoring function.
    scoring_function
end

# Convenience constructors.
# Defaults: 50 logarithmically spaced gammas in [1e-2, 1e2], C = 1, and F1 as
# the model selection score.
function WangGammaStrategy(solver)
    gamma_grid = 10.0 .^ range(-2, stop=2, length=50)
    return WangGammaStrategy(solver, gamma_grid, 1, f1_scoring)
end

function WangGammaStrategy(solver, gamma_search_range, C)
    return WangGammaStrategy(solver, gamma_search_range, C, f1_scoring)
end

"""
    generate_binary_data_for_tuning(data, k=nothing, threshold=0.1)

Generate a binary data set for hyperparameter tuning of a one-class classifier
by self-adaptive data shifting (Wang et al. 2018).

Observations are expected in the columns of `data`. Returns a tuple
`(data_target, data_outliers)`: pseudo targets are created by shifting each
observation along the positive direction of the estimated data density
gradient, pseudo outliers by shifting detected edge points along the negative
direction.

# Arguments
- `data`: data matrix with one observation per column.
- `k`: number of nearest neighbors; defaults to `ceil(5 * log10(n))` for `n`
  observations when `nothing`.
- `threshold`: tolerance for the edge-point detection criterion.
"""
function generate_binary_data_for_tuning(data, k=nothing, threshold=0.1)
    # Bug fix: a caller-supplied k was previously overwritten unconditionally.
    if k === nothing
        k = round(Int, ceil(5 * log10(size(data, 2))))
    end
    tree = KDTree(data)
    edge_idx = Int[]
    norm_vec = []
    data_target = []
    # Float accumulator (was integer 0, which changed type on first addition).
    l_ns = 0.0

    for i in 1:size(data, 2)
        # k + 1 neighbors because the query point is its own nearest neighbor;
        # idx[2:end] drops it.
        idx, dist = knn(tree, data[:, i], k + 1, true)
        # Unit vectors pointing from each neighbor towards x_i.
        v_ij = mapslices(normalize, data[:, i] .- data[:, idx[2:end]], dims=1)
        # Estimated (unnormalized) direction of the data density gradient at x_i.
        n_i = sum(v_ij, dims=2)
        θ_ij = sum(v_ij .* n_i, dims=1)
        # Fraction of neighbors on the inner side of x_i.
        l_i = 1 / k * sum(θ_ij .>= 0)
        if l_i >= 1 - threshold
            # x_i lies on the data boundary; remember it for outlier generation.
            push!(edge_idx, i)
            push!(norm_vec, n_i)
        end

        # Generate pseudo target by shifting x_i along the positive direction
        # of the density gradient to its closest neighbor projection.
        n_i = normalize(vec(n_i))
        Λ_i_positive = sum(n_i .* (data[:, idx[2:end]] .- data[:, i]), dims=1)
        if length(Λ_i_positive[Λ_i_positive .> 0]) > 0
            x_ij_min_positive = minimum(Λ_i_positive[Λ_i_positive .> 0])
            push!(data_target, data[:, i] + x_ij_min_positive * n_i)
        end
        l_ns += 1 / k * sum(dist)
    end

    # Compute negative shift amount.
    l_ns *= 1 / length(edge_idx)
    # Generate pseudo outliers by shifting edge points along the negative
    # direction of the density gradient.
    data_outliers = data[:, edge_idx] + mapslices(normalize, hcat(norm_vec...), dims=1) * l_ns

    data_target = hcat(data_target...)

    return data_target, data_outliers
end

"""
    f1_scoring(predictions, ground_truth)

Compute the F1 score of `predictions` against `ground_truth`, treating
`:outlier` as the positive class.
"""
function f1_scoring(predictions, ground_truth)
    actual_positive = ground_truth .== :outlier
    predicted_positive = predictions .== :outlier
    return MLBase.f1score(MLBase.roc(actual_positive, predicted_positive))
end

"""
    calculate_gamma(model, strategy::WangGammaStrategy)

Search `strategy.gamma_search_range` for the gamma that maximizes
`strategy.scoring_function` on self-labeled tuning data generated by
[`generate_binary_data_for_tuning`](@ref). Returns the best gamma found
(1.0 if fitting fails for every candidate).
"""
function calculate_gamma(model, strategy::WangGammaStrategy)
    # Work on a copy so the search does not mutate the caller's model.
    m = deepcopy(model)
    debug(LOGGER, "[Gamma Search] Generating binary tuning data.")
    data_target, data_outliers = generate_binary_data_for_tuning(m.data)
    # Original observations and pseudo targets are inliers, shifted edge
    # points are outliers.
    ground_truth = vcat(fill(:inlier, size(m.data, 2) + size(data_target, 2)),
                        fill(:outlier, size(data_outliers, 2)))

    # Fix: log message previously said "parameter C" although gamma is searched.
    debug(LOGGER, "[Gamma Search] Searching for parameter gamma.")
    best_gamma = 1.0
    best_score = -Inf
    for gamma in strategy.gamma_search_range
        debug(LOGGER, "[Gamma Search] Trying gamma = $gamma.")
        init_strategy = FixedParameterInitialization(GaussianKernel(gamma), strategy.C)
        initialize!(m, init_strategy)
        set_adjust_K!(m, true)
        try
            fit!(m, strategy.solver)
        catch e
            # Log and skip infeasible candidates instead of printing to stdout.
            debug(LOGGER, "[Gamma Search] Fitting failed for gamma $gamma with error $e.")
            continue
        end
        predictions = classify.(predict(m, hcat(m.data, data_target, data_outliers)))
        # Fix: pass arguments in the order the scoring function declares,
        # (predictions, ground_truth); the previous (ground_truth, predictions)
        # order only worked because F1 is symmetric under swapping FP and FN.
        score = strategy.scoring_function(predictions, ground_truth)
        if score > best_score
            best_gamma = gamma
            best_score = score
        end
    end
    debug(LOGGER, "[Gamma Search] Best gamma = $best_gamma (score = $best_score).")
    return best_gamma
end
2 changes: 2 additions & 0 deletions src/svdd_base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ using MLKernels
using JuMP
using MLLabelUtils
using StatsBase
using NearestNeighbors
import MLBase

# Label encoding for the learning pool: unlabeled, labeled inlier, labeled outlier.
const learning_pool_enc = LabelEnc.NativeLabels([:U, :Lin, :Lout])
# Label encoding for predicted class labels.
const class_label_enc = LabelEnc.NativeLabels([:inlier, :outlier])
Expand Down
13 changes: 13 additions & 0 deletions test/init_strategies/init_strategies_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,17 @@
actual_outlier_percentage = countmap(SVDD.classify.(SVDD.predict(model, model.data)))[:outlier] / size(model.data, 2)
@test abs.(actual_outlier_percentage - target_outlier_percentage) < 0.04
end

dummy_data, labels = generate_mvn_with_outliers(2, 50, 42, true, true)
pools = fill(:Lin, size(dummy_data, 2))

@testset "WangGammaStrategy" begin
    # Train on the inliers only; a well-chosen gamma should then separate
    # inliers from outliers on the full data set.
    inlier_data = dummy_data[:, labels .== :inlier]
    model = SVDD.VanillaSVDD(inlier_data)
    gamma_strategy = SVDD.WangGammaStrategy(TEST_SOLVER, [0.1, 0.5], 1)
    init_strategy = SVDD.SimpleCombinedStrategy(gamma_strategy, SVDD.FixedCStrategy(1))

    SVDD.initialize!(model, init_strategy)
    SVDD.fit!(model, TEST_SOLVER)
    predictions = SVDD.classify.(SVDD.predict(model, dummy_data))
    @test all(predictions .== labels)
end
end

0 comments on commit d559997

Please sign in to comment.