In [8]:
using Pkg

# Install only the packages that are not yet direct dependencies of the active
# environment. Calling `Pkg.add` unconditionally re-resolves the environment on
# every run of this cell even when nothing has to change.
let required = [
        "MLJ",
        "MLJBase",
        "MLJModels",
        "MLJEnsembles",
        "MLJLinearModels",
        "DecisionTree",
        "MLJDecisionTreeInterface",
        "NaiveBayes",
        "EvoTrees",
        "CategoricalArrays",
        "Random",
        "LIBSVM",
        "Plots",
        "MLJModelInterface",
        "CSV",
        "DataFrames",
        "MLJFlux",
        "UrlDownload",
        "XGBoost",
    ]
    installed = Set(
        info.name for info in values(Pkg.dependencies()) if info.is_direct_dep
    )
    missing_packages = setdiff(required, installed)
    isempty(missing_packages) || Pkg.add(missing_packages)
end

# Local helper module; later cells call its functions through the `Utils.` prefix.
include("Utils.jl")
using .Utils
using CSV
using DataFrames
using Statistics


[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.11/Manifest.toml`


In [9]:
# 1. Load data from the wdbc.data file. It is read without a header row, so the
#    column names are assigned explicitly below.
df = CSV.read("wdbc.data", DataFrame; header=false)

new_names = [
    "ID", "Diagnosis",
    # The Mean (first 10 features)
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
    "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean",
    "fractal_dimension_mean",
    # The Standard Error (next 10 features)
    "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
    "compactness_se", "concavity_se", "concave_points_se", "symmetry_se",
    "fractal_dimension_se",
    # The "Worst" or Largest (last 10 features)
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
    "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst",
    "fractal_dimension_worst",
]
rename!(df, new_names)

# 2. Verify the renaming: show the first 3 rows under the new header.
#    `display` is needed because a bare expression that is not the last statement
#    of a cell produces no output.
println("--- Updated DataFrame Headers ---")
display(first(df, 3))

# 3. Data separation (mimicking the Python script): column 2 is the target,
#    columns 3 onwards are the features. Both are copies of the data in `df`.
y = df[:, :Diagnosis]
x = df[:, 3:end]

# 4. Metadata equivalent
println("--- Dataset Summary ---")
println("Total Rows: ", nrow(df))
println("Total Columns: ", ncol(df))
println("Target variable (y) shape: ", size(y))
println("Features matrix (X) shape: ", size(x))

# `describe(x)` summarises every feature column, so the heading says so.
println("\n--- Variable Information (All Features) ---")
display(describe(x))
println("\n--- Target Distribution ---")
display(combine(groupby(df, :Diagnosis), nrow))

--- Updated DataFrame Headers ---
--- Dataset Summary ---
Total Rows: 569
Total Columns: 32
Target variable (y) shape: (569,)
Features matrix (X) shape: (569, 30)

--- Variable Information (First 5 Features) ---


Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Float64,Float64,Float64,Float64,Int64,DataType
1,radius_mean,14.1273,6.981,13.37,28.11,0,Float64
2,texture_mean,19.2896,9.71,18.84,39.28,0,Float64
3,perimeter_mean,91.969,43.79,86.24,188.5,0,Float64
4,area_mean,654.889,143.5,551.1,2501.0,0,Float64
5,smoothness_mean,0.0963603,0.05263,0.09587,0.1634,0,Float64
6,compactness_mean,0.104341,0.01938,0.09263,0.3454,0,Float64
7,concavity_mean,0.0887993,0.0,0.06154,0.4268,0,Float64
8,concave_points_mean,0.0489191,0.0,0.0335,0.2012,0,Float64
9,symmetry_mean,0.181162,0.106,0.1792,0.304,0,Float64
10,fractal_dimension_mean,0.0627976,0.04996,0.06154,0.09744,0,Float64



--- Target Distribution ---


Row,Diagnosis,nrow
Unnamed: 0_level_1,String1,Int64
1,M,212
2,B,357


In [10]:
# Convert the feature table to a plain matrix and rescale it in place with the
# min-max helper from Utils.
# NOTE(review): this runs on the whole dataset, before the hold-out split done in
# a later cell, so validation/test rows presumably contribute to the scaling
# parameters. Confirm whether that is intended or whether the parameters should
# come from the training rows only.
x = Matrix(x)
Utils.normalizeMinMax!(x)

# Fix the class order explicitly instead of inheriting it from the row order of
# the file. Reverse-sorted labels put "M" before "B", which is the order the
# target distribution table shows for this data, so the encoding is unchanged.
# Presumably the first class is the one encoded as `true` in the two-class case
# (TODO confirm against Utils.oneHotEncoding).
classes = sort(unique(y); rev=true)
y_encoded = Utils.oneHotEncoding(y, classes)

println("--- Data Summary (after processing with Utils) ---")
println("Features (X) dimension: ", size(x))
println("Targets (y_encoded) dimension: ", size(y_encoded))
# Views avoid copying the column just to compute a statistic.
println("Mean of first feature column: ", mean(@view x[:, 1]))
println("Max of first feature column: ", maximum(@view x[:, 1]))
println("Targets (first 5 encoded): \n", first(y_encoded, 5))

--- Data Summary (after processing with Utils) ---
Features (X) dimension: (569, 30)
Targets (y_encoded) dimension: (569, 1)
Mean of first feature column: 0.3382219574941812
Max of first feature column: 1.0
Targets (first 5 encoded): 
Bool[1, 1, 1, 1, 1]


In [11]:
using Random

# Seed the default RNG so the train/validation/test partition is the same on
# every run. This assumes Utils.holdOut draws its permutation from the default
# RNG (TODO confirm in Utils.jl); if it does not, the seed has no effect on it.
Random.seed!(1234)

# Fractions of the instances reserved for validation and test; the remainder is
# used for training.
num_instances = size(x, 1)
validation_ratio = 0.1
test_ratio = 0.1

train_idx, val_idx, test_idx = Utils.holdOut(num_instances, validation_ratio, test_ratio)

# Row-index the feature matrix and the encoded targets with the same indices so
# that features and labels stay aligned within each subset.
x_train = x[train_idx, :]
y_train = y_encoded[train_idx, :]

x_val = x[val_idx, :]
y_val = y_encoded[val_idx, :]

x_test = x[test_idx, :]
y_test = y_encoded[test_idx, :]

println("Total instances: $num_instances")
println("Training set size: $(size(x_train, 1))")
println("Validation set size: $(size(x_val, 1))")
println("Test set size: $(size(x_test, 1))")

Total instances: 569
Training set size: 456
Validation set size: 57
Test set size: 56
