In [None]:
# Add the packages used by this notebook and by the included source files to the
# active environment. The grouping below is only for readability.
using Pkg
Pkg.add([
    # MLJ core and interfaces
    "MLJ",
    "MLJBase",
    "MLJModels",
    "MLJModelInterface",
    "MLJEnsembles",
    # Model packages
    "MLJLinearModels",
    "DecisionTree",
    "MLJDecisionTreeInterface",
    "NaiveBayes",
    "EvoTrees",
    "LIBSVM",
    "MLJFlux",
    "XGBoost",
    "NearestNeighborModels",
    # Data handling and utilities
    "CategoricalArrays",
    "Random",
    "CSV",
    "DataFrames",
    "Tables",
    "UrlDownload",
    # Plotting
    "Plots",
])

# Load the project sources. The order is kept exactly as written originally, in case
# later files depend on definitions from earlier ones.
for source_file in ("Utils.jl", "Approaches.jl", "preprocessings.jl", "models.jl")
    include(source_file)
end

using .Utils
using .Approaches
using CSV, DataFrames, Random

In [None]:
using CSV

# Load the data; the file has no header row, so column names are assigned below.
df = CSV.read("wdbc.data", DataFrame, header=false)

# Column layout: ID, diagnosis label, then ten measurements repeated for each of the
# three summary statistics (all "_mean" columns first, then "_se", then "_worst").
new_names = vcat(
    ["ID", "Diagnosis"],
    [
        string(measure, "_", stat) for stat in ("mean", "se", "worst") for
        measure in (
            "radius", "texture", "perimeter", "area", "smoothness",
            "compactness", "concavity", "concave_points", "symmetry",
            "fractal_dimension",
        )
    ],
)

rename!(df, new_names)

y_raw = df[!, :Diagnosis]       # 'M' or 'B'
X_df = df[:, 3:end]             # everything after ID and Diagnosis (30 features)
X = Matrix(X_df)

# Boolean label: true = malignant (M)
y_vec = y_raw .== "M"

println("N = ", size(X, 1), "  d = ", size(X, 2))
println("Malignos: ", count(y_vec), "  Benignos: ", count(!, y_vec))


N = 569  d = 30
Malignos: 212  Benignos: 357


In [None]:
# Seed the global RNG before splitting.
# NOTE(review): this only makes the split reproducible if Utils.holdOut and
# Utils.crossvalidation draw from the global RNG — confirm in Utils.jl.
Random.seed!(42)

# Hold-out split over the sample indices; 0.2 is passed as the hold-out ratio.
N = first(size(X))
train_idx, test_idx = Utils.holdOut(N, 0.2)

X_train, X_test = X[train_idx, :], X[test_idx, :]
y_train_vec, y_test_vec = y_vec[train_idx], y_vec[test_idx]

# Fold assignment for cross-validation, computed on the training labels only.
k_folds = 10
cv_indices = Utils.crossvalidation(y_train_vec, k_folds)


456-element Vector{Int64}:
  7
  9
 10
  4
  6
  2
  5
  9
 10
  3
  1
  8
  8
  ⋮
  7
 10
  5
  9
  7
  7
 10
 10
  6
  4
  1
 10

In [None]:
# Approach 1: original feature space + MinMax, evaluated with every model spec.
# NOTE(review): the original comment also mentioned an ensemble, but no ensemble
# spec appears in this list — confirm whether one was meant to be included.
approach1 = Approach(
    "Original space + MinMax",
    preprocessing_1,
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
)

# Approach 2: PCA keeping 95% of the variance + the same models.
approach2 = Approach(
    "PCA 95% variance",
    preprocessing_2,
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
)

# Approach 3: ICA.
# preprocessing_ica takes an `outdim` keyword, so it is fixed here through a closure.
# The number of components is bound once in the `let` block and used for both the
# display name and the keyword, so the label cannot drift from the value actually
# passed (the previous comment said 8 components while the code used 1).
approach_ica = let outdim = 1
    Approach(
        "ICA ($outdim components)",
        (Xtr, Xte, ytr) -> preprocessing_ica(Xtr, Xte, ytr; outdim = outdim),
        [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
    )
end

# Approach 4: LDA (output dimension <= number of classes - 1, so 1 for this
# two-class problem).
approach_lda = Approach(
    "LDA",
    preprocessing_lda,
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
)

approaches = [approach1, approach2, approach_ica, approach_lda]

4-element Vector{Approach}:
 Approach("Original space + MinMax", Main.preprocessing_1, ModelSpec[ModelSpec(:ANN, Dict{Symbol, Any}[Dict(:maxEpochs => 200, :topology => [5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [15], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10, 5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20, 10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)]), ModelSpec(:SVM, Dict{Symbol, Any}[Dict(:kernel => :linear, :C => 0.1), Dict(:kernel => :linear, :C => 1.0), Dict(:kernel => :linear, :C => 10.0), Dict(:gamma => 0.01, :kernel => :rbf, :C => 0.1), Dict(:gamma => 0.01, :kernel => :rbf, :C => 1.0), Dict(:gamma => 0.1, :kernel => :rbf, :C => 0.5), Dict(:degree => 3, :kernel

In [None]:
# Cross-validation results, keyed by approach name.
results = Dict{String,ApproachResult}()

for approach in approaches
    banner = "=============================================="
    println(banner)
    println(" Running approach: ", approach.name)
    println(banner)

    # Run the approach on the training split only, using the precomputed folds.
    outcome = run_approach(approach, X_train, y_train_vec, cv_indices)
    results[approach.name] = outcome

    println("  -> winner model: ", outcome.winner_name)
    println("  -> best metrics per model:")
    foreach(approach.model_specs) do spec
        best = outcome.model_results[spec.name].best_metric
        println("     ", spec.name, " | best f2 score = ", best)
    end
end


 Running approach: Original space + MinMax


[33m[1m│ [22m[39m  The input will be converted, but any earlier layers may be very slow.
[33m[1m│ [22m[39m  layer = Dense(30 => 5, σ)   [90m# 155 parameters[39m
[33m[1m│ [22m[39m  summary(x) = "30×328 adjoint(::Matrix{Float64}) with eltype Float64"
[33m[1m└ [22m[39m[90m@ Flux ~/.julia/packages/Flux/uRn8o/src/layers/stateless.jl:60[39m


  -> winner model: ANN
  -> best metrics per model:
     ANN | best f2 score = 0.953269483608788
     SVM | best f2 score = 0.942025575327239
     DecisionTree | best f2 score = 0.9415741486999252
     kNN | best f2 score = 0.9300271820821949
 Running approach: PCA 95% variance


[33m[1m│ [22m[39msupports. Suppress this type check by specifying `scitype_check_level=0`.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mRun `@doc MultivariateStats.PCA` to learn more about your model's requirements.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mCommonly, but non exclusively, supervised models are constructed using the syntax
[33m[1m│ [22m[39m`machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
[33m[1m│ [22m[39mconstructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
[33m[1m│ [22m[39msample or class weights.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mIn general, data in `machine(model, data...)` is expected to satisfy
[33m[1m│ [22m[39m
[33m[1m│ [22m[39m    scitype(data) <: MLJ.fit_data_scitype(model)
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mIn the present case:
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mscitype(data) = Tuple{AbstractMatrix{Continuous}}
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mf

  -> winner model: ANN
  -> best metrics per model:
     ANN | best f2 score = 0.9550788129961607
     SVM | best f2 score = 0.8836603949560932
     DecisionTree | best f2 score = 0.9351398425600115
     kNN | best f2 score = 0.940327330849836
 Running approach: ICA (1 components)


[33m[1m│ [22m[39msupports. Suppress this type check by specifying `scitype_check_level=0`.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mRun `@doc MultivariateStats.PCA` to learn more about your model's requirements.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mCommonly, but non exclusively, supervised models are constructed using the syntax
[33m[1m│ [22m[39m`machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
[33m[1m│ [22m[39mconstructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
[33m[1m│ [22m[39msample or class weights.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mIn general, data in `machine(model, data...)` is expected to satisfy
[33m[1m│ [22m[39m
[33m[1m│ [22m[39m    scitype(data) <: MLJ.fit_data_scitype(model)
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mIn the present case:
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mscitype(data) = Tuple{AbstractMatrix{Continuous}}
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mf

In [None]:
# Dump the whole results dictionary (approach name => ApproachResult) to standard output.
print(stdout, results)

Dict{String, ApproachResult}("Original space + MinMax" => ApproachResult(Approach("Original space + MinMax", Main.preprocessing_1, ModelSpec[ModelSpec(:ANN, Dict{Symbol, Any}[Dict(:maxEpochs => 200, :topology => [5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [15], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10, 5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20, 10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)]), ModelSpec(:SVM, Dict{Symbol, Any}[Dict(:kernel => :linear, :C => 0.1), Dict(:kernel => :linear, :C => 1.0), Dict(:kernel => :linear, :C => 10.0), Dict(:gamma => 0.01, :kernel => :rbf, :C => 0.1), Dict(:gamma => 0.01, :kernel => :rbf, :C => 1.0), Dict(:gamma => 0.1, :kernel => 

:learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [15], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10, 5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20, 10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)]), ModelSpec(:SVM, Dict{Symbol, Any}[Dict(:kernel => :linear, :C => 0.1), Dict(:kernel => :linear, :C => 1.0), Dict(:kernel => :linear, :C => 10.0), Dict(:gamma => 0.01, :kernel => :rbf, :C => 0.1), Dict(:gamma => 0.01, :kernel => :rbf, :C => 1.0), Dict(:gamma => 0.1, :kernel => :rbf, :C => 0.5), Dict(:degree => 3, :kernel => :poly, :C => 1.0), Dict(:gamma => 0.01, :kernel => :sigmoid, :C => 1.0)]), ModelSpec(:DecisionTree, Dict{Symbol, Any}[Dict(:max_depth => 2), Dict(:max_depth => 3), Dict(

In [None]:
# Final evaluation of each approach's winning configuration on the held-out test set.
#
# `results` is a Dict, whose iteration order is unrelated to insertion order, so
# iterating it directly printed the reports in an arbitrary order. The keys are
# instead visited in the order their approaches appear in `approaches`. Any key that
# is not found there sorts last, so exactly the same entries are evaluated as before.
for name in sort!(
    collect(keys(results));
    by = key -> something(findfirst(appr -> appr.name == key, approaches), typemax(Int)),
)
    res = results[name]
    println("\n======================================================================")
    println(" Final evaluation on test for approach: ", name)
    println(" Model: ", res.winner_name, " || Params: ", res.winner_params)
    println("\n======================================================================")
    train_and_evaluate_winner(res, X_train, y_train_vec, X_test, y_test_vec)
end


 Final evaluation on test for approach: Original space + MinMax
 Model: ANN || Params: Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)

Confusion Matrix:
[70 0; 2 41]

Accuracy: 0.9823008849557522
Error rate: 0.017699115044247787
Sensitivity: 0.9534883720930233
Specificity: 1.0
PPV: 1.0
NPV: 0.9722222222222222
F1-score: 0.9761904761904763
F2-score: 0.9624413145539907

 Final evaluation on test for approach: ICA (1 components)
 Model: ANN || Params: Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)

Confusion Matrix:
[65 5; 10 33]

Accuracy: 0.8672566371681416
Error rate: 0.13274336283185842
Sensitivity: 0.7674418604651163
Specificity: 0.9285714285714286
PPV: 0.868421052631579
NPV: 0.8666666666666667
F1-score: 0.8148148148148148
F2-score: 0.7857142857142857

 Final evaluation on test for approach: PCA 95% variance
 Model: ANN || Params: Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)


[33m[1m│ [22m[39msupports. Suppress this type check by specifying `scitype_check_level=0`.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mRun `@doc MultivariateStats.ICA` to learn more about your model's requirements.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mCommonly, but non exclusively, supervised models are constructed using the syntax
[33m[1m│ [22m[39m`machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
[33m[1m│ [22m[39mconstructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
[33m[1m│ [22m[39msample or class weights.
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mIn general, data in `machine(model, data...)` is expected to satisfy
[33m[1m│ [22m[39m
[33m[1m│ [22m[39m    scitype(data) <: MLJ.fit_data_scitype(model)
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mIn the present case:
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mscitype(data) = Tuple{AbstractMatrix{Continuous}}
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mf

Accuracy: 0.9734513274336283
Error rate: 0.02654867256637168
Sensitivity: 0.9302325581395349
Specificity: 1.0
PPV: 1.0
NPV: 0.958904109589041
F1-score: 0.963855421686747
F2-score: 0.9433962264150945

 Final evaluation on test for approach: LDA
 Model: ANN || Params: Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)

Confusion Matrix:
[70 0; 3 40]

Accuracy: 0.9734513274336283
Error rate: 0.02654867256637168
Sensitivity: 0.9302325581395349
Specificity: 1.0
PPV: 1.0
NPV: 0.958904109589041
F1-score: 0.963855421686747
F2-score: 0.9433962264150945
