In [1]:
using Pkg

# Packages handed to `Pkg.add` in a single call.
required_packages = String[
    "MLJ",
    "MLJBase",
    "MLJModels",
    "MLJEnsembles",
    "MLJLinearModels",
    "DecisionTree",
    "MLJDecisionTreeInterface",
    "NaiveBayes",
    "EvoTrees",
    "CategoricalArrays",
    "Random",
    "LIBSVM",
    "Plots",
    "MLJModelInterface",
    "CSV",
    "DataFrames",
    "MLJFlux",
    "UrlDownload",
    "XGBoost",
    "NearestNeighborModels",
    "Tables",
]
Pkg.add(required_packages)

# Project-local source files, included in this exact order
# (later files may rely on definitions from earlier ones — confirm before reordering).
for source_file in ("Utils.jl", "Approaches.jl", "preprocessings.jl", "models.jl")
    include(source_file)
end

# Bring the project modules defined by the included files into scope,
# followed by the external packages used directly in the notebook.
using .Utils, .Approaches
using CSV, DataFrames, Random

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Manifest.toml`
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Utils /home/adrian/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159


import MLJFlux ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Approaches /home/adrian/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159


import MLJLIBSVMInterface ✔
import MLJDecisionTreeInterface ✔
import NearestNeighborModels ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Approaches /home/adrian/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Approaches /home/adrian/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159


In [2]:
using CSV

# Load the data file. It is read with `header=false`, so the column names
# are assigned explicitly below.
df = CSV.read("wdbc.data", DataFrame, header=false)

# Column names: an ID and a diagnosis column, followed by ten measurements
# repeated once per suffix group ("mean", "se", "worst"), in that order.
measurements = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry", "fractal_dimension",
]
suffixes = ["mean", "se", "worst"]
new_names = vcat(
    ["ID", "Diagnosis"],
    [string(measurement, "_", suffix) for suffix in suffixes for measurement in measurements],
)

rename!(df, new_names)

# Raw diagnosis labels ("M" or "B") and the feature columns (column 3 onwards).
y_raw = df[!, :Diagnosis]
X_df = df[:, 3:end]
X = Matrix(X_df)

# Boolean target: true where the diagnosis label equals "M" (malignant).
y_vec = broadcast(==, y_raw, "M")

# Summary of the dataset size and class balance.
n_samples, n_features = size(X)
println("N = ", n_samples, "  d = ", n_features)
println("Malignos: ", count(y_vec), "  Benignos: ", count(!, y_vec))


N = 569  d = 30
Malignos: 212  Benignos: 357


In [3]:
# Seed the global RNG before any splitting is done.
Random.seed!(42)

# Hold-out split over the row indices of X.
# NOTE(review): 0.2 is presumably the fraction reserved for the test set —
# confirm against Utils.holdOut.
N = first(size(X))
train_idx, test_idx = Utils.holdOut(N, 0.2)

# Slice features and labels with the same index sets so rows stay aligned.
X_train, X_test = X[train_idx, :], X[test_idx, :]
y_train_vec, y_test_vec = y_vec[train_idx], y_vec[test_idx]

# Fold assignment for cross-validation, computed from the training labels only.
k_folds = 10
cv_indices = Utils.crossvalidation(y_train_vec, k_folds)


456-element Vector{Int64}:
  7
  9
 10
  4
  6
  2
  5
  9
 10
  3
  ⋮
  9
  7
  7
 10
 10
  6
  4
  1
 10

In [4]:
# Build an Approach that evaluates the four base model families with the given
# label and preprocessing function. Each call creates its own model list.
# NOTE(review): the trailing `false` is passed positionally; it appears to be the
# `use_ensemble` flag, i.e. no ensemble is run — confirm against the Approach definition.
make_approach(label, preprocessing) = Approach(
    label,
    preprocessing,
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN],
    false,
)

# Approach 1: original feature space (labelled "MinMax"), all four base models.
approach1 = make_approach("Original space + MinMax", preprocessing_1)

# Approach 2: labelled "PCA 95% variance", same four base models.
approach2 = make_approach("PCA 95% variance", preprocessing_2)

approaches = [approach1, approach2]


2-element Vector{Approach}:
 Approach("Original space + MinMax", Main.preprocessing_1, ModelSpec[ModelSpec(:ANN, Dict{Symbol, Any}[Dict(:maxEpochs => 200, :topology => [5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10, 10], :learningRate => 0.01)]), ModelSpec(:SVM, Dict{Symbol, Any}[Dict(:kernel => :rbf, :C => 1.0), Dict(:kernel => :rbf, :C => 10.0), Dict(:kernel => :linear, :C => 1.0)]), ModelSpec(:DecisionTree, Dict{Symbol, Any}[Dict(:max_depth => 3), Dict(:max_depth => 5), Dict(:max_depth => 10)]), ModelSpec(:kNN, Dict{Symbol, Any}[Dict(:K => 3), Dict(:K => 5), Dict(:K => 7)])], false)
 Approach("PCA 95% variance", Main.preprocessing_2, ModelSpec[ModelSpec(:ANN, Dict{Symbol, Any}[Dict(:maxEpochs => 200, :topology => [5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10], :learningRate => 0.01), Dict(:maxEpochs => 20

In [5]:
# Cross-validation results, keyed by approach name.
results = Dict{String,ApproachResult}()

banner_line = "=============================================="

for approach in approaches
    println(banner_line)
    println(" Running approach: ", approach.name)
    println(banner_line)

    # Run the approach on the training split with the precomputed fold indices
    # and store the outcome for the final evaluation step.
    outcome = run_approach(approach, X_train, y_train_vec, cv_indices)
    results[approach.name] = outcome

    println("  -> winner model: ", outcome.winner_name)
    println("  -> best metrics per model:")
    for model_spec in approach.model_specs
        per_model = outcome.model_results[model_spec.name]
        println("     ", model_spec.name, " | best acc = ", per_model.best_metric)
    end

    # The ensemble line is printed only when the approach requested an ensemble
    # and a metric was actually produced.
    if approach.use_ensemble && !isnothing(outcome.ensemble_metric)
        println("     Ensemble | acc = ", outcome.ensemble_metric)
    end
end


 Running approach: Original space + MinMax


│   The input will be converted, but any earlier layers may be very slow.
│   layer = Dense(30 => 5, σ)
│   summary(x) = 30×328 adjoint(::Matrix{Float64}) with eltype Float64
└ @ Flux /home/adrian/.julia/packages/Flux/uRn8o/src/layers/stateless.jl:60


  -> winner model: SVM
  -> best metrics per model:
     ANN | best acc = 0.9683432586736934
     SVM | best acc = 0.9691216512955643
     DecisionTree | best acc = 0.9538581466842336
     kNN | best acc = 0.9603293807641633
 Running approach: PCA 95% variance


│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc MultivariateStats.PCA` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{AbstractMatrix{Continuous}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuous}}}
└ @ MLJBase /home/adrian/.julia/packages/MLJBase/QOkBT/src/machines.jl:237


  -> winner model: SVM
  -> best metrics per model:
     ANN | best acc = 0.9224568730786121
     SVM | best acc = 0.973717610891524
     DecisionTree | best acc = 0.9381488801054019
     kNN | best acc = 0.9449648660518226


In [6]:
# Final evaluation: for each approach, pass its stored result together with the
# train and test splits to `train_and_evaluate_winner`.
#
# Iterate over `approaches` rather than over the `results` Dict: Dict iteration
# order is unspecified in Julia, so looping over the Dict directly does not
# guarantee that the report appears in the order the approaches were defined.
for appr in approaches
    res = get(results, appr.name, nothing)
    if res === nothing
        # No stored result for this approach (e.g. the cross-validation cell was
        # not run for it); skip instead of failing with a KeyError.
        @warn "No cross-validation result found for approach; skipping" approach = appr.name
        continue
    end

    println("\n==============================================")
    println(" Final evaluation on test for approach: ", appr.name)
    println("==============================================")
    train_and_evaluate_winner(res, X_train, y_train_vec, X_test, y_test_vec)
end


 Final evaluation on test for approach: Original space + MinMax
Confusion Matrix:
[70 0; 3 40]

Accuracy: 0.9734513274336283
Error rate: 0.02654867256637168
Sensitivity: 0.9302325581395349
Specificity: 1.0
PPV: 1.0
NPV: 0.958904109589041
F1-score: 0.963855421686747

 Final evaluation on test for approach: PCA 95% variance
Confusion Matrix:
[70 0; 2 41]

Accuracy: 0.9823008849557522
Error rate: 0.017699115044247787
Sensitivity: 0.9534883720930233
Specificity: 1.0
PPV: 1.0
NPV: 0.9722222222222222
F1-score: 0.9761904761904763


│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc MultivariateStats.PCA` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{AbstractMatrix{Continuous}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:AbstractVector{<:Continuous}}}
└ @ MLJBase /home/adrian/.julia/packages/MLJBase/QOkBT/src/machines.jl:237
