# Tutorial on Machine Learning with MLJ

In [21]:
using Pkg
Pkg.activate()

[32m[1m  Activating[22m[39m project at `~/.julia/environments/v1.7`


In [22]:
using MLJ
using VegaLite

## 1. O Básico

In [3]:
# Bind the feature table and the target produced by `@load_iris`, then list the
# name, scientific type and machine type of every feature column.
const X_iris, y_iris = @load_iris;
schema(X_iris)

┌──────────────┬────────────┬─────────┐
│ names        │ scitypes   │ types   │
├──────────────┼────────────┼─────────┤
│ sepal_length │ Continuous │ Float64 │
│ sepal_width  │ Continuous │ Float64 │
│ petal_length │ Continuous │ Float64 │
│ petal_width  │ Continuous │ Float64 │
└──────────────┴────────────┴─────────┘


In [4]:
# Show the levels (class labels) of the target.
levels(y_iris)

3-element Vector{String}:
 "setosa"
 "versicolor"
 "virginica"

In [5]:
# Load the model code from BetaML; `@load` evaluates to the model *type*, which
# is bound here to a shorter name.
DecisionTree = @load DecisionTreeClassifier pkg=BetaML # model type

import BetaML ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /home/davibarreira/.julia/packages/MLJModels/lDzCR/src/loading.jl:168


BetaML.Trees.DecisionTreeClassifier

In [6]:
# Instantiate the model, setting only `minRecords`; the remaining
# hyperparameters are left at whatever the constructor chooses.
model = DecisionTree(minRecords=5)                    # model instance

DecisionTreeClassifier(
  maxDepth = 0, 
  minGain = 0.0, 
  minRecords = 5, 
  maxFeatures = 0, 
  splittingCriterion = BetaML.Utils.gini, 
  rng = Random._GLOBAL_RNG())

In [7]:
# Bind the model to the data in a machine; constructing it does not train anything.
mach = machine(model, X_iris, y_iris)

Machine trained 0 times; caches data
  model: DecisionTreeClassifier(maxDepth = 0, …)
  args: 
    1:	Source @016 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @711 ⏎ `AbstractVector{Multiclass{3}}`


In [8]:
# Fit on rows 1:60 and 91:150 only (observations are rows, not columns), which
# leaves rows 61:90 out of training, then inspect the learned parameters.
train_rows = [1:60; 91:150];
fit!(mach; rows=train_rows)
fitted_params(mach)

┌ Info: Training machine(DecisionTreeClassifier(maxDepth = 0, …), …).
└ @ MLJBase /home/davibarreira/.julia/packages/MLJBase/jcOVb/src/machines.jl:487


(fitresult = (BetaML.Trees.DecisionNode{Float64}(BetaML.Trees.Question{Float64}(4, 1.0), BetaML.Trees.DecisionNode{Float64}(BetaML.Trees.Question{Float64}(3, 4.8), BetaML.Trees.DecisionNode{Float64}(BetaML.Trees.Question{Float64}(4, 1.6), BetaML.Trees.Leaf{String}(Dict("virginica" => 1.0), 4), BetaML.Trees.Leaf{String}(Dict("virginica" => 0.75, "versicolor" => 0.25), 4), 3, 0.92), BetaML.Trees.DecisionNode{Float64}(BetaML.Trees.Question{Float64}(4, 1.7), BetaML.Trees.Leaf{String}(Dict("virginica" => 1.0), 4), BetaML.Trees.Leaf{String}(Dict("versicolor" => 1.0), 4), 3, 0.05), 2, 0.7142857142857143), BetaML.Trees.Leaf{String}(Dict("setosa" => 1.0), 2), 1, 0.5833333333333334), CategoricalArrays.CategoricalValue{String, UInt32} "setosa"),)

In [9]:
# Predict for rows 71:73 of the data bound to the machine; each prediction is a
# distribution over the three classes rather than a single label.
predict(mach, rows=71:73)

3-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, String, UInt32, Float64}:
 UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>0.0, virginica=>1.0)
 UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>1.0, virginica=>0.0)
 UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>0.25, virginica=>0.75)

In [10]:
# Predict on two new observations, supplied as a named tuple of columns whose
# names match the feature columns of the training table.
Xnew = (sepal_length = [5.1, 6.3],
        sepal_width = [3.0, 2.5],
        petal_length = [1.4, 4.9],
        petal_width = [0.3, 1.5])
yhat = predict(mach, Xnew)

2-element CategoricalDistributions.UnivariateFiniteVector{Multiclass{3}, String, UInt32, Float64}:
 UnivariateFinite{Multiclass{3}}(setosa=>1.0, versicolor=>0.0, virginica=>0.0)
 UnivariateFinite{Multiclass{3}}(setosa=>0.0, versicolor=>0.25, virginica=>0.75)

In [11]:
# Broadcast `pdf` over the predicted distributions to extract, for each
# observation, the probability assigned to the class "virginica".
pdf.(yhat, "virginica")

2-element Vector{Float64}:
 0.0
 0.75

## 2. Indo mais a fundo

In [12]:
using DataFrames
# Load data set 42178 from OpenML, wrap it in a DataFrame and preview the
# first four rows.
data = OpenML.load(42178) # data set from OpenML.org
df0 = DataFrame(data)
first(df0, 4)

Unnamed: 0_level_0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines
Unnamed: 0_level_1,String,String,Float64,String,String,Float64,String,String
1,7590-VHVEG,Female,0.0,Yes,No,1.0,No,No phone service
2,5575-GNVDE,Male,0.0,No,No,34.0,Yes,No
3,3668-QPYBK,Male,0.0,No,No,2.0,Yes,No
4,7795-CFOCW,Male,0.0,No,No,45.0,No,No phone service


In [13]:
"""
    fix_blanks(v)

Return a new collection built from `v` in which every entry equal to `" "`
(a single space) is replaced by the string `"0.0"`; all other entries are
passed through unchanged.
"""
fix_blanks(v) = map(x -> x == " " ? "0.0" : x, v)

# Replace the column with its cleaned version: entries equal to " " become "0.0".
df0.TotalCharges = fix_blanks(df0.TotalCharges);

In [14]:
# Fix the scientific types in place, in this order: coerce `TotalCharges` to
# Continuous, coerce every column that is still Textual to Multiclass, and
# finally make `Churn` an ordered factor.
coerce!(df0, :TotalCharges => Continuous);
coerce!(df0, Textual => Multiclass);
coerce!(df0, :Churn => OrderedFactor)
levels(df0.Churn) # to check order

2-element Vector{String}:
 "No"
 "Yes"

In [15]:
# Convert the schema to a DataFrame to inspect the column names, scientific
# types and machine types after coercion.
schema(df0) |> DataFrames.DataFrame

Unnamed: 0_level_0,names,scitypes,types
Unnamed: 0_level_1,Symbol,DataType,DataType
1,customerID,Multiclass{7043},"CategoricalValue{String, UInt32}"
2,gender,Multiclass{2},"CategoricalValue{String, UInt32}"
3,SeniorCitizen,Continuous,Float64
4,Partner,Multiclass{2},"CategoricalValue{String, UInt32}"
5,Dependents,Multiclass{2},"CategoricalValue{String, UInt32}"
6,tenure,Continuous,Float64
7,PhoneService,Multiclass{2},"CategoricalValue{String, UInt32}"
8,MultipleLines,Multiclass{3},"CategoricalValue{String, UInt32}"
9,InternetService,Multiclass{3},"CategoricalValue{String, UInt32}"
10,OnlineSecurity,Multiclass{3},"CategoricalValue{String, UInt32}"


In [16]:
# Split the rows into three tables: 7% in `df`, 3% in `df_test` and the
# remaining 90% in `df_dumped`. The split is stratified on `Churn` and uses a
# fixed `rng` value so it can be repeated.
df, df_test, df_dumped = partition(df0, 0.07, 0.03, # in ratios 7:3:90
                                   stratify=df0.Churn,
                                   rng=123);

In [17]:
# Split each table into the target (the `Churn` column) and the features
# (every remaining column other than `customerID`).
const y, X = unpack(df, ==(:Churn), !=(:customerID));
const ytest, Xtest = unpack(df_test, ==(:Churn), !=(:customerID));

In [25]:
# Load the `EvoTreeClassifier` model type from the EvoTrees package.
Booster = @load EvoTreeClassifier pkg=EvoTrees

import EvoTrees ✔


┌ Info: Precompiling EvoTrees [f6006082-12f8-11e9-0c9c-0d5d367ab1e5]
└ @ Base loading.jl:1423
┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /home/davibarreira/.julia/packages/MLJModels/lDzCR/src/loading.jl:168


EvoTreeClassifier

In [26]:
# Instantiate the classifier without overriding any hyperparameter.
booster = Booster()

EvoTreeClassifier(
  loss = EvoTrees.Softmax(), 
  nrounds = 10, 
  λ = 0.0, 
  γ = 0.0, 
  η = 0.1, 
  max_depth = 5, 
  min_weight = 1.0, 
  rowsample = 1.0, 
  colsample = 1.0, 
  nbins = 64, 
  α = 0.5, 
  metric = :mlogloss, 
  rng = Random.MersenneTwister(123), 
  device = "cpu")

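Nosso modelo não pode ser utilizado diretamente: o tipo científico de `X` não é um subtipo do tipo de entrada aceito pelo `booster`, como mostra a verificação abaixo.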

In [28]:
# Check whether the scientific type of `X` is a subtype of the input type the
# model declares it accepts.
scitype(X) <: input_scitype(booster)

false

In [29]:
# Chain a `ContinuousEncoder` in front of the booster to form one composite model.
# NOTE(review): presumably the encoder converts the Multiclass features to
# Continuous so that the booster accepts them — confirm against the MLJ docs.
pipe = ContinuousEncoder() |> booster

ProbabilisticPipeline(
  continuous_encoder = ContinuousEncoder(
        drop_last = false, 
        one_hot_ordered_factors = false), 
  evo_tree_classifier = EvoTreeClassifier(
        loss = EvoTrees.Softmax(), 
        nrounds = 10, 
        λ = 0.0, 
        γ = 0.0, 
        η = 0.1, 
        max_depth = 5, 
        min_weight = 1.0, 
        rowsample = 1.0, 
        colsample = 1.0, 
        nbins = 64, 
        α = 0.5, 
        metric = :mlogloss, 
        rng = Random.MersenneTwister(123), 
        device = "cpu"), 
  cache = true)

In [31]:
# Hyperparameters of a pipeline component are reached through the component's
# field name on the pipeline.
pipe.evo_tree_classifier.max_depth

5

In [32]:
# Bind the pipeline to the features and target in a machine (not trained yet).
mach_pipe = machine(pipe, X, y)

Machine trained 0 times; caches data
  model: ProbabilisticPipeline(continuous_encoder = ContinuousEncoder(drop_last = false, …), …)
  args: 
    1:	Source @851 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{3}}, AbstractVector{Multiclass{2}}, AbstractVector{Multiclass{4}}}}`
    2:	Source @689 ⏎ `AbstractVector{OrderedFactor{2}}`


In [33]:
# Split the row indices of `y` into a leading 70% for training and the
# remaining 30% for validation. `eachindex(y)` is used instead of
# `1:length(y)` so the indices are always the valid indices of `y`.
train, validation = partition(eachindex(y), 0.7)

([1, 2, 3, 4, 5, 6, 7, 8, 9, 10  …  336, 337, 338, 339, 340, 341, 342, 343, 344, 345], [346, 347, 348, 349, 350, 351, 352, 353, 354, 355  …  484, 485, 486, 487, 488, 489, 490, 491, 492, 493])

In [34]:
# Train the pipeline on the training rows only; the validation rows are left
# untouched.
fit!(mach_pipe, rows=train)

┌ Info: Training machine(ProbabilisticPipeline(continuous_encoder = ContinuousEncoder(drop_last = false, …), …), …).
└ @ MLJBase /home/davibarreira/.julia/packages/MLJBase/jcOVb/src/machines.jl:487
┌ Info: Training machine(ContinuousEncoder(drop_last = false, …), …).
└ @ MLJBase /home/davibarreira/.julia/packages/MLJBase/jcOVb/src/machines.jl:487
┌ Info: Training machine(EvoTreeClassifier(loss = EvoTrees.Softmax(), …), …).
└ @ MLJBase /home/davibarreira/.julia/packages/MLJBase/jcOVb/src/machines.jl:487


Machine trained 1 time; caches data
  model: ProbabilisticPipeline(continuous_encoder = ContinuousEncoder(drop_last = false, …), …)
  args: 
    1:	Source @851 ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Multiclass{3}}, AbstractVector{Multiclass{2}}, AbstractVector{Multiclass{4}}}}`
    2:	Source @689 ⏎ `AbstractVector{OrderedFactor{2}}`
