In [1]:
using Pkg
Pkg.add([
     "MLJ", 
     "MLJBase", 
     "MLJModels", 
     "MLJEnsembles", 
     "MLJLinearModels", 
     "DecisionTree", 
     "MLJDecisionTreeInterface", 
     "NaiveBayes", 
     "EvoTrees", 
     "CategoricalArrays", 
     "Random",
     "LIBSVM",           
     "Plots",            
     "MLJModelInterface", 
     "CSV",              
     "DataFrames",      
     "MLJFlux", 
     "UrlDownload",      
     "XGBoost",
     "NearestNeighborModels",
     "Tables"
])

using Random

const SEED = 12      
Random.seed!(SEED)      

include("Utils.jl")
include("Approaches.jl")
include("preprocessings.jl")
include("models.jl")

using .Utils
using .Approaches
using CSV, DataFrames

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Manifest.toml`
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Utils /Users/ana/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159


import MLJFlux ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Approaches /Users/ana/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159


import MLJLIBSVMInterface ✔
import MLJDecisionTreeInterface ✔
import NearestNeighborModels ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Approaches /Users/ana/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main.Approaches /Users/ana/.julia/packages/MLJModels/wEnSQ/src/loading.jl:159


In [2]:
using CSV

# Cargar datos
df = CSV.read("wdbc.data", DataFrame, header=false)

new_names = [
    "ID", "Diagnosis",
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
    "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
    "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
    "compactness_se", "concavity_se", "concave_points_se", "symmetry_se", "fractal_dimension_se",
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
    "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst", "fractal_dimension_worst"
]

rename!(df, new_names)

y_raw = df.Diagnosis            # 'M' o 'B'
X_df = df[:, 3:end]             # 30 features
X = Matrix(X_df)

# Etiqueta booleana: true = maligno (M)
y_vec = y_raw .== "M"

println("N = ", size(X,1), "  d = ", size(X,2))
println("Malignos: ", sum(y_vec), "  Benignos: ", sum(.!y_vec))


N = 569  d = 30
Malignos: 212  Benignos: 357


In [3]:
N = size(X, 1)
train_idx, test_idx = Utils.holdOut(N, 0.2)

X_train = X[train_idx, :]
X_test  = X[test_idx, :]
y_train_vec = y_vec[train_idx]
y_test_vec  = y_vec[test_idx]

k_folds = 10
cv_indices = Utils.crossvalidation(y_train_vec, k_folds)


456-element Vector{Int64}:
 8
 3
 7
 1
 3
 8
 3
 8
 6
 6
 ⋮
 4
 2
 3
 5
 1
 6
 3
 7
 1

In [4]:
# Approach 1: espacio original + MinMax, con todos los modelos + ensemble
approach1 = Approach(
    "Original space + MinMax",
    preprocessing_1,
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
)

# Approach 2: PCA 95% varianza + mismos modelos
approach2 = Approach(
    "PCA 95% variance",
    preprocessing_2,
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
)
# Approach 3: ICA (8 componentes)
# Como preprocessing_ica tiene keyword outdim, lo fijamos aquí:
approach_ica = Approach(
    "ICA (1 components)",
    (Xtr, Xte, ytr) -> preprocessing_ica(Xtr, Xte, ytr; outdim = 1),
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
)

# Approach 4: LDA (dimensión <= nº_clases - 1, aquí será 1)
approach_lda = Approach(
    "LDA",
    preprocessing_lda,
    [MODEL_ANN, MODEL_SVM, MODEL_TREE, MODEL_KNN]
)

approaches = [approach1, approach2, approach_ica, approach_lda]

4-element Vector{Approach}:
 Approach("Original space + MinMax", Main.preprocessing_1, ModelSpec[ModelSpec(:ANN, Dict{Symbol, Any}[Dict(:maxEpochs => 200, :topology => [5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [15], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [10, 5], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [20, 10], :learningRate => 0.01), Dict(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)]), ModelSpec(:SVM, Dict{Symbol, Any}[Dict(:kernel => :linear, :C => 0.1), Dict(:kernel => :linear, :C => 1.0), Dict(:kernel => :linear, :C => 10.0), Dict(:gamma => 0.01, :kernel => :rbf, :C => 0.1), Dict(:gamma => 0.01, :kernel => :rbf, :C => 1.0), Dict(:gamma => 0.1, :kernel => :rbf, :C => 0.5), Dict(:degree => 3, :kernel

In [5]:
results = Dict{String,ApproachResult}()

for appr in approaches
    println("==============================================")
    println(" Running approach: ", appr.name)
    println("==============================================")
    res = run_approach(appr, X_train, y_train_vec, cv_indices)
    results[appr.name] = res

    println("  -> winner model: ", res.winner_name)
    println("  -> best metrics per model:")
    for spec in appr.model_specs
        mr = res.model_results[spec.name]
        println("     ", spec.name, " | best f2 score = ", mr.best_metric)
    end
end


 Running approach: Original space + MinMax
  -> winner model: ANN
  -> best metrics per model:
     ANN | best f2 score = 0.9555143950526421
     SVM | best f2 score = 0.9257382919852887
     DecisionTree | best f2 score = 0.9336886606017313
     kNN | best f2 score = 0.952452400398597
 Running approach: PCA 95% variance
  -> winner model: ANN
  -> best metrics per model:
     ANN | best f2 score = 0.959303186140301
     SVM | best f2 score = 0.8748486673956201
     DecisionTree | best f2 score = 0.932624220860701
     kNN | best f2 score = 0.9482448432498634
 Running approach: ICA (1 components)
  -> winner model: ANN
  -> best metrics per model:
     ANN | best f2 score = 0.9550410731572944
     SVM | best f2 score = 0.7780649913170239
     DecisionTree | best f2 score = 0.8810683593701871
     kNN | best f2 score = 0.8561939340528134
 Running approach: LDA
  -> winner model: SVM
  -> best metrics per model:
     ANN | best f2 score = 0.9574189780482796
     SVM | best f2 score = 0.9

In [6]:
############################################
#  Ensemble with the 4 models of the BEST approach
#  (train on full train set, evaluate on test)
############################################

# 1) Select the best approach according to the F2 of its winning model
best_scores = Float64[]
for appr in approaches
    res = results[appr.name]
    # F2 of the best model of this approach
    push!(best_scores, res.model_results[res.winner_name].best_metric)
end

best_idx      = argmax(best_scores)
best_approach = approaches[best_idx]
best_result   = results[best_approach.name]
best_preproc = best_approach.preprocessor 

println("Best approach used for the ensemble: ", best_approach.name)

# 2) Get the 4 best models (ANN, SVM, tree, kNN) from that approach

estimators = Symbol[]
modelsHyperParameters = Dict{Symbol,Any}[]

for spec in best_approach.model_specs
    push!(estimators, spec.name)
    push!(modelsHyperParameters, best_result.model_results[spec.name].best_params)
end

println("\nModels included in the ensemble (all from approach '", best_approach.name, "'):")
for (est, hp) in zip(estimators, modelsHyperParameters)
    println("  - ", est, " with hyperparameters ", hp)
end

# 3) Ensemble hyperparameters
#    (train_and_test_ensemble only uses :voting; :metric is ignored here)
ensembleHyperParameters = Dict(
    :voting => :hard   # majority voting
)

# Train ensemble
mach_ens = train_ensemble(
    best_preproc,
    estimators,
    modelsHyperParameters,
    ensembleHyperParameters,
    X_train,
    y_train_vec
)

Best approach used for the ensemble: LDA

Models included in the ensemble (all from approach 'LDA'):
  - ANN with hyperparameters Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)
  - SVM with hyperparameters Dict{Symbol, Any}(:gamma => 0.01, :kernel => :rbf, :C => 1.0)
  - DecisionTree with hyperparameters Dict{Symbol, Any}(:max_depth => 2)
  - kNN with hyperparameters Dict{Symbol, Any}(:K => 1)

Training Ensemble on full training set (preprocessed)...
  - Voting strategy: hard
  - Number of models: 4


trained Machine; caches model-specific representations of data
  model: VotingClassifier(models = Probabilistic[NeuralNetworkClassifier(builder = Short(n_hidden = 30, …), …), ProbabilisticSVC(kernel = RadialBasis, …), DecisionTreeClassifier(max_depth = 2, …), KNNClassifier(K = 1, …)], …)
  args: 
    1:	Source @842 ⏎ Table{AbstractVector{Continuous}}
    2:	Source @524 ⏎ AbstractVector{Multiclass{2}}


In [7]:
# Test approaches
for (name, res) in results
    println("\n======================================================================")
    println(" Final evaluation on test for approach: ", name)
    println(" Model: ", res.winner_name, " || Params: ", res.winner_params)
    println("\n======================================================================")
    train_and_evaluate_winner(res, X_train, y_train_vec, X_test, y_test_vec)
end

# Test ensemble
y_pred_ensemble = test_ensemble(
    best_preproc,
    mach_ens,
    X_train,
    y_train_vec,
    X_test,
    y_test_vec
)


 Final evaluation on test for approach: Original space + MinMax
 Model: ANN || Params: Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)

Confusion Matrix:
[66 2; 4 41]

Accuracy: 0.9469026548672567
Error rate: 0.05309734513274336
Sensitivity: 0.9111111111111111
Specificity: 0.9705882352941176
PPV: 0.9534883720930233
NPV: 0.9428571428571428
F1-score: 0.9318181818181819
F2-score: 0.9192825112107623

 Final evaluation on test for approach: ICA (1 components)
 Model: ANN || Params: Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learningRate => 0.01)

Confusion Matrix:
[66 2; 7 38]

Accuracy: 0.9203539823008849
Error rate: 0.07964601769911504
Sensitivity: 0.8444444444444444
Specificity: 0.9705882352941176
PPV: 0.95
NPV: 0.9041095890410958
F1-score: 0.8941176470588236
F2-score: 0.8636363636363638

 Final evaluation on test for approach: PCA 95% variance
 Model: ANN || Params: Dict{Symbol, Any}(:maxEpochs => 200, :topology => [30, 15], :learnin

113-element BitVector:
 0
 1
 0
 1
 0
 1
 0
 0
 0
 0
 ⋮
 0
 1
 0
 1
 1
 0
 0
 0
 0