In [2]:
# using MetadataTools,  DocStringExtensions
using Random: shuffle, MersenneTwister

# export MulticlassPerceptronClassifier, fit!, predict
using LinearAlgebra: mul!
using SparseArrays

using MLJBase
using MLJ
using Revise

In [3]:
#> needed for classifiers:
using CategoricalArrays

### Implementing the model struct

**A model is an object storing hyperparameters associated with some machine learning algorithm. In MLJ, hyperparameters include configuration parameters, like the number of threads, and special instructions, such as "compute feature rankings", which may or may not affect the final learning outcome. However, the logging level (verbosity below) is excluded.
**


In MLJ I would do

```
mutable struct MulticlassPerceptronClassifier <: MLJBase.Deterministic
    n_epochs::Int
    epoch_patience::Int
    pocket::Bool
    average_weights::Bool
    element_type::DataType
end
```






In [4]:
mutable struct MulticlassPerceptronClassifier <: MLJBase.Deterministic
    n_epochs::Int
    epoch_patience::Int
    pocket::Bool
    average_weights::Bool
    element_type::DataType
end

**
Models (which are mutable) should not be given internal constructors.
**

**
It is recommended that they be given an external lazy keyword constructor of the same name. This constructor defines default values for every field, and optionally corrects invalid field values by calling a clean! method (whose fallback returns an empty message string):
**

In [5]:
# keyword constructor
function MulticlassPerceptronClassifier( ; 
                                        n_epochs=100,
                                        epoch_patience=5,
                                        pocket=true,
                                        average_weights=true,
                                        element_type=Float32)

    model = MulticlassPerceptronClassifier(n_epochs,
                                           epoch_patience,
                                           pocket,
                                           average_weights,
                                           element_type)
    
    message = MLJBase.clean!(model)
    isempty(message) || @warn message
    return model
end

MulticlassPerceptronClassifier

The function MLJBase.clean is used to change the model hyperparameters in case they are set in an invalid way.

In [6]:
function MLJ.clean!(model::MulticlassPerceptronClassifier)
    warning = ""
    if model.n_epochs < 1
        warning *= "Need n_epochs ≥ 1. Resetting n_epochs=100 "
        model.n_epochs = 50
    end
    
    if model.epoch_patience <1
        warning *= "Need epoch_patience ≥ 1. Resetting epoch_patience=5 "
        model.epoch_patience = 5
    end
    return warning
end

### Implementing a `fit` method

In [7]:
mutable struct MulticlassPerceptronClassifierParameters{T}
    W::AbstractMatrix{T}
    b::AbstractVector{T}
    n_classes::Int
    n_features::Int
    is_sparse::Bool
end

#MulticlassPerceptronClassifierParameters(T::Type, n_classes::Int, n_features::Int) = MulticlassPerceptronClassifierParameters{T}(rand(T, n_features, n_classes),
#                                                                                       zeros(T, n_classes),
#                                                                                       n_classes,
#                                                                                       n_features,
#                                                                                       is_sparse)

In [8]:

function MulticlassPerceptronClassifierParameters(T::Type, n_classes::Int, n_features::Int, is_sparse::Bool) 
    
    if is_sparse==false
        return MulticlassPerceptronClassifierParameters{T}(rand(T, n_features, n_classes),
                                                                                       zeros(T, n_classes),
                                                                                       n_classes,
                                                                                       n_features,
                                                                                       is_sparse)
    else
        return  MulticlassPerceptronClassifierParameters{T}(sparse(rand(T, n_features, n_classes)),
        spzeros(T, n_classes),
                                                                                        n_classes,
                                                                                        n_features,
                                                                                        is_sparse)     
    end
end



MulticlassPerceptronClassifierParameters

In [9]:
"""
Predicts the class for a given input in a `MulticlassPerceptronClassifier`.
The placeholder is used to avoid allocating memory for each matrix-vector multiplication.

- Returns the predicted class.
"""
function predict_with_placeholder(h::MulticlassPerceptronClassifierParameters, x::AbstractVector, class_placeholder::AbstractVector)
    #@fastmath class_placeholder .= At_mul_B!(class_placeholder, h.W, x) .+ h.b
    class_placeholder .= mul!(class_placeholder, transpose(h.W), x)  .+ h.b
    return argmax(class_placeholder)
end


predict_with_placeholder

In [10]:

"""
Compute the accuracy betwwen `y` and `y_hat`.
"""
function accuracy(y::AbstractVector, y_hat::AbstractVector)
    acc = 0.
    @fastmath for k = 1:length(y)
            @inbounds  acc += y[k] == y_hat[k]
    end
    return acc/length(y_hat)
end


accuracy

In [11]:
function MLJBase.fit(model::MulticlassPerceptronClassifier,
                     verbosity::Int,   
                     X,
                     y)
    
    #Xmatrix = MLJBase.matrix(X)
    n_classes    = length(unique(y))
    classes_seen = unique(y)
    n_features   = size(train_x,1)  # this assumes data comes in cols
    
    #decode  = MLJBase.decoder(y[1]) # for predict method
    decode =  false

    # Defining the fitpredict object
    is_sparse = issparse(X)
    perceptron = MulticlassPerceptronClassifierParameters(model.element_type, n_classes, n_features, is_sparse);
    
    
    ### Fitting code starts
    fit!(perceptron, X, y; 
         print_flag=verbosity, 
         n_epochs=model.n_epochs);
    
    ### Fitting code ends
    cache = nothing
    fitresult = (perceptron, decode)
    report = NamedTuple{}()
    
    #> return package-specific statistics (eg, feature rankings,
    #> internal estimates of generalization error) in `report`, which
    #> should be a named tuple with the same type every call (can have
    #> empty values)
    
    return fitresult, cache, report

end

In [12]:

function fit!(h::MulticlassPerceptronClassifierParameters, X::AbstractArray, y::AbstractVector;
              n_epochs=50, 
              learning_rate=1., 
              print_flag=0,
              compute_accuracy=true, 
              seed=MersenneTwister(1234),
              pocket=false,
              shuffle_data=false)
    
    n_features, n_samples = size(X)
    @assert length(y) == n_samples
    scores = []
    
    T = eltype(X)
    learning_rate     = T(learning_rate)
    class_placeholder = zeros(T, h.n_classes)
    y_preds           = zeros(Int64, n_samples)
    data_indices      = Array(1:n_samples)
    max_acc           = zero(T)

    if pocket
        W_hat = zeros(T, h.n_features, h.n_classes)
        b_hat = zeros(T, h.n_classes)
    end

    @fastmath for epoch in 1:n_epochs

        n_mistakes = 0
        if shuffle_data
            shuffle!(seed, data_indices)
        end
        #println("\nepoch ",epoch,"\n")
        @inbounds for m in data_indices
            #println("sample seen ", m ,"\n")
            x = view(X, :, m);
            #y_hat = predict_with_placeholder(h, x, class_placeholder)
            y_hat = argmax(h.W'* x .+ h.b)
            
            if y[m] != y_hat
                n_mistakes += 1
                ####  wij ← wij − η (yj −tj) · xi
                h.W[:, y[m]]  .= h.W[:, y[m]]  .+ learning_rate .* x
                h.b[y[m]]      = h.b[y[m]]     + learning_rate
                h.W[:, y_hat] .= h.W[:, y_hat] .- learning_rate .* x
                h.b[y_hat]     = h.b[y_hat]    - learning_rate
            end
        end

        #println("FINISHED")

        if compute_accuracy
             @inbounds for m in  data_indices
                 y_preds[m] = predict_with_placeholder(h, view(X, :, m), class_placeholder)
            end
            acc = accuracy(y, y_preds)
            push!(scores, acc)
        else
            acc = (n_samples - n_mistakes)/n_samples
            push!(scores, acc)
        end

        if pocket
            if acc > max_acc
                max_acc = acc
                copy!(W_hat, h.W)
                copy!(b_hat, h.b)
            end
        end

        if print_flag ==1
            print("\r\u1b[K")
            print("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
        
        if print_flag ==2
            println("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
    end
end

fit! (generic function with 1 method)

In [18]:
using MLDatasets

train_x, train_y = MLDatasets.MNIST.traindata();
test_x, test_y   = MLDatasets.MNIST.testdata();
train_x          = Float32.(train_x);
test_x           = Float32.(test_x);
train_y          = train_y .+ 1;
test_y           = test_y .+ 1;
train_y          = Int64.(train_y);
test_y           = Int64.(test_y);
train_x          = reshape(train_x, 784, 60000);
test_x           = reshape(test_x,  784, 10000);

In [19]:
model = MulticlassPerceptronClassifier(n_epochs=20)

MulticlassPerceptronClassifier(n_epochs = 20,
                               epoch_patience = 5,
                               pocket = true,
                               average_weights = true,
                               element_type = Float32,)[34m @ 6…82[39m

In [20]:
model

MulticlassPerceptronClassifier(n_epochs = 20,
                               epoch_patience = 5,
                               pocket = true,
                               average_weights = true,
                               element_type = Float32,)[34m @ 6…82[39m

In [21]:
fitresult, _ , _  = MLJBase.fit(model, 2, train_x, train_y);

Epoch: 1 	 Accuracy: 0.872
Epoch: 2 	 Accuracy: 0.872
Epoch: 3 	 Accuracy: 0.855
Epoch: 4 	 Accuracy: 0.882
Epoch: 5 	 Accuracy: 0.88
Epoch: 6 	 Accuracy: 0.871
Epoch: 7 	 Accuracy: 0.893
Epoch: 8 	 Accuracy: 0.889
Epoch: 9 	 Accuracy: 0.88
Epoch: 10 	 Accuracy: 0.875
Epoch: 11 	 Accuracy: 0.884
Epoch: 12 	 Accuracy: 0.871
Epoch: 13 	 Accuracy: 0.867
Epoch: 14 	 Accuracy: 0.887
Epoch: 15 	 Accuracy: 0.879
Epoch: 16 	 Accuracy: 0.886
Epoch: 17 	 Accuracy: 0.884
Epoch: 18 	 Accuracy: 0.894
Epoch: 19 	 Accuracy: 0.869
Epoch: 20 	 Accuracy: 0.874


### Creating a predict

In order to predict in MLJ we need 3 things.

- The model (abstract model definition with hyperparameters)
- The fitresult of the model (containing the learned parameters of the model)
- Data


#### Example of predict for an SVMC

The following code is for predicting with a sklearn SVMC

```
function MLJBase.predict(model::SVMC
                         , fitresult
                         , Xnew)

    xnew = MLJBase.matrix(Xnew)
    result, decode = fitresult
    prediction = ScikitLearn.predict(result, xnew)
    return decode(prediction)
end
```



In [22]:
function predict(h::MulticlassPerceptronClassifierParameters, x::AbstractVector, class_placeholder::AbstractVector)
    class_placeholder .= mul!(class_placeholder, transpose(h.W), x)  .+ h.b
    return argmax(class_placeholder)
end

predict (generic function with 1 method)

In [23]:
"""
Function to predict the class for a given input batch.
- Returns the predicted class.
"""
function predict(h::MulticlassPerceptronClassifierParameters, X::AbstractMatrix)
    predictions = zeros(Int64, size(X, 2))
    class_placeholder = zeros(eltype(h.W), h.n_classes)

    @inbounds for m in 1:length(predictions)
        predictions[m] = predict(h, view(X,:,m), class_placeholder)
    end
    
    return predictions
end

predict

In [24]:
function MLJBase.predict(model::MulticlassPerceptronClassifier, fitresult, Xnew)
    xnew = MLJBase.matrix(Xnew)
    result, decode = fitresult
    prediction = predict(result, xnew)
    return prediction #decode(prediction)
end

In [25]:
MLJBase.predict(model,fitresult,train_x[:,1:15])

15-element Array{Int64,1}:
  6
  1
  5
  2
 10
  3
  2
  4
  2
  5
  4
  6
  4
  7
  2

### Allowing fit(model,X,y) with categorical arrays


Assume that the data is given with a categorical label

In [26]:
catname = Dict(1 => "one",
               2 => "two",
               3 => "three",
               4 => "four",
               5 => "five",
               6 => "six",
               7 => "seven",
               8 => "eight",
               9 => "nine",
               10 => "ten");

In [27]:
train_y_labels = [catname[i] for i in train_y];

Let us imagine that we are given the data in the following format

In [28]:
train_y_labels[1:3]

3-element Array{String,1}:
 "six" 
 "one" 
 "five"

we should, before we fit any model, take care of rewritting classes as numbers from 1 to N.

We can make our MLJ model do this automatically inside the `fit` method as long as we provide `train_y_labels`  as a CategoricalArray 

In [29]:
train_y_cat = CategoricalArray(train_y_labels);

In [30]:
train_y_cat[1:10]

10-element CategoricalArray{String,1,UInt32}:
 "six"  
 "one"  
 "five" 
 "two"  
 "ten"  
 "three"
 "two"  
 "four" 
 "two"  
 "five" 

A categorical array will contain

- `x.pool` all possible categories found.
- `x.refs` each value of the array encoded as a refenrence to an element in the pool.
- `levels(x)` returns the possible levels (categories) of x.

In [31]:
length(train_y_cat.pool)

10

In [32]:
train_y_cat.pool

CategoricalPool{String,UInt32}(["eight","five","four","nine","one","seven","six","ten","three","two"])

In [33]:
train_y_cat[1:5]

5-element CategoricalArray{String,1,UInt32}:
 "six" 
 "one" 
 "five"
 "two" 
 "ten" 

In [34]:
train_y_cat.refs[1:5]

5-element Array{UInt32,1}:
 0x00000001
 0x00000002
 0x00000003
 0x00000004
 0x00000005

In [35]:
int(train_y_cat[1:5])

5-element Array{UInt32,1}:
 0x00000007
 0x00000005
 0x00000002
 0x0000000a
 0x00000008

If we want to map from numbers back to strings we need a decoder 

In [36]:
dec = decoder(train_y_cat[1])

MLJBase.CategoricalDecoder{String,UInt32}(CategoricalPool{String,UInt32}(["eight","five","four","nine","one","seven","six","ten","three","two"]), [9, 3, 7, 10, 2, 8, 1, 5, 6, 4])

In [37]:
dec(int(train_y_cat[1:5]))

5-element CategoricalArray{String,1,UInt32}:
 "six" 
 "one" 
 "five"
 "two" 
 "ten" 

In [38]:
dec(10)

CategoricalString{UInt32} "two"

Now we have all ingrediets to create

`MLJBase.fit(model::MulticlassPerceptronClassifier, verbosity,X,y)`

Which takes as input `y` a Categorical array and does the following:

- Takes a single element from the categorical array (which stores all possible class labels) and from this element it creates  a decoding function that, given an integer it returns back a category. 

```julia
    # decoder maps Integer->Category, used in the predict method
    decode  = MLJBase.decoder(y[1]) 
```


- It maps the categorical array to integers:
```julia
   # Encodes CategoricalArray to an Array of integers
   y = Int.(int(y))  
```

In [39]:

function MLJBase.fit(model::MulticlassPerceptronClassifier,
                     verbosity::Int,   
                     X,
                     y)
    
    #Xmatrix = MLJBase.matrix(X)
    n_classes    = length(unique(y))
    classes_seen = unique(y)
    n_features   = size(X,1)  # this assumes data comes in cols

    decode  = MLJBase.decoder(y[1]) # for predict method
    
    y = Int.(int(y))  # Encoding my categorical array to an array of integers

    is_sparse = issparse(X)
    perceptron = MulticlassPerceptronClassifierParameters(model.element_type, n_classes, n_features, is_sparse);

    ### Fitting code starts
    fit!(perceptron, X, y; 
         print_flag=verbosity, 
         n_epochs=model.n_epochs);
    
    ### Fitting code ends
    cache = nothing
    fitresult = (perceptron, decode)
    report = NamedTuple{}()
    
    #> return package-specific statistics (eg, feature rankings,
    #> internal estimates of generalization error) in `report`, which
    #> should be a named tuple with the same type every call (can have
    #> empty values)
    return fitresult, cache, report
end


In [40]:
model = MulticlassPerceptronClassifier(n_epochs=5)

MulticlassPerceptronClassifier(n_epochs = 5,
                               epoch_patience = 5,
                               pocket = true,
                               average_weights = true,
                               element_type = Float32,)[34m @ 1…83[39m

In [41]:
@time fitresult, _ , _  = MLJBase.fit(model, 2, train_x, train_y_cat)

Epoch: 1 	 Accuracy: 0.872
Epoch: 2 	 Accuracy: 0.88
Epoch: 3 	 Accuracy: 0.884
Epoch: 4 	 Accuracy: 0.884
Epoch: 5 	 Accuracy: 0.88
  1.741249 seconds (4.26 M allocations: 405.762 MiB, 5.18% gc time)


((MulticlassPerceptronClassifierParameters{Float32}(Float32[0.73096 0.0342447 … 0.669984 0.208333; 0.612626 0.413586 … 0.994569 0.0856211; … ; 0.0386736 0.179588 … 0.637083 0.953892; 0.912965 0.834118 … 0.531651 0.308105], Float32[49.0, -4.0, -20.0, -88.0, -45.0, -26.0, 100.0, -14.0, 18.0, 30.0], 10, 784, false), MLJBase.CategoricalDecoder{String,UInt32}(CategoricalPool{String,UInt32}(["eight","five","four","nine","one","seven","six","ten","three","two"]), [9, 3, 7, 10, 2, 8, 1, 5, 6, 4])), nothing, NamedTuple())

In [42]:
dec = fitresult[2]

MLJBase.CategoricalDecoder{String,UInt32}(CategoricalPool{String,UInt32}(["eight","five","four","nine","one","seven","six","ten","three","two"]), [9, 3, 7, 10, 2, 8, 1, 5, 6, 4])

In [43]:
dec(2)

CategoricalString{UInt32} "five"

In [44]:
function MLJBase.predict(model::MulticlassPerceptronClassifier, fitresult, Xnew)
    xnew = MLJBase.matrix(Xnew)
    result, decode = fitresult
    prediction = predict(result, xnew)
    return decode(prediction)
end

In [45]:
MLJBase.predict(model, fitresult, train_x[:,1:4])

4-element CategoricalArray{String,1,UInt32}:
 "six" 
 "one" 
 "five"
 "two" 

Prediction types for deterministic responses.

In the case of Deterministic models, yhat should be an AbstractVector (commonly a plain Vector) with the same element type as the target y passed to the fit method (see above). Any CategoricalValue or CategoricalString appearing in yhat must have the same levels in its pool as was present in the elements of the target y presented in training, even if not all levels appear in the training data or prediction itself. For example, in the univariate target case, this means MLJ.classes(yhat[i]) = MLJ.classes(y[j]) for all admissible i and j. (The method classes is described under Convenience methods below).

Unfortunately, code not written with the preservation of categorical levels in mind poses special problems. To help with this, MLJBase provides three utility methods: int (for converting a CategoricalValue or CategoricalString into an integer, the ordering of these integers being consistent with that of the pool), decoder (for constructing a callable object that decodes the integers back into CategoricalValue/CategoricalString objects), and classes, for extracting the complete pool from a single value. Refer to Convenience methods below for important details.

Note that a decoder created during fit may need to be bundled with fitresult to make it available to predict during re-encoding. So, for example, if the core algorithm being wrapped by fit expects a nominal target yint of type Vector{<:Integer} then a fit method may look something like this:

### Sparse data

Let us load a dataset with text data and let's build a classifier

In [46]:
push!(LOAD_PATH, "./")

4-element Array{String,1}:
 "@"      
 "@v#.#"  
 "@stdlib"
 "./"     

In [47]:
using AmazonBookReviews

┌ Info: Precompiling AmazonBookReviews [top-level]
└ @ Base loading.jl:1186


In [48]:
#;open AmazonBookReviews.jl

In [49]:
word_to_pos, pos_to_word, supported_word_counts, (X,y) =   AmazonBookReviews.load_data();

min support:5

In [50]:
n_features, n_samples = size(X)

(13195, 2000)

In [51]:
function get_sparsity(X)
    n_features, n_samples = size(X)
    m = 0
    for i in 1:n_samples
        m += sum(view(X,:,i))
    end
    mean_words_found = m/n_samples
    return mean_words_found, 100 * mean_words_found/n_features
end

get_sparsity (generic function with 1 method)

In [52]:
get_sparsity(X)

(156.7f0, 1.187571f0)

In [56]:
using MLDataUtils

┌ Info: Precompiling MLDataUtils [cc2ba9b6-d476-5e6d-8eaf-a92d5412d41d]
└ @ Base loading.jl:1186


In [57]:
(X_tr, y_tr), (X_te, y_te) = stratifiedobs((X, y), p = 0.7);

X_tr = copy(X_tr)
y_tr = copy(y_tr)
X_te = copy(X_te)
y_te = copy(y_te);


In [58]:
size(X_tr), size(X_te)

((13195, 1400), (13195, 600))

Notice that the matrices are not sparse.

We can use `SparseArrays` to encode them as sparse matrices

In [59]:
X_tr_sp = sparse(X_tr)
X_te_sp = sparse(X_te);

In [60]:
@time X_tr_sp * X_tr_sp';

  0.967056 seconds (290.88 k allocations: 287.597 MiB, 11.40% gc time)


In [61]:
@time X_tr * X_tr';

  2.953984 seconds (925.79 k allocations: 708.424 MiB, 6.57% gc time)


#### Training the MulticlassPerceptron with sparse data

In [62]:
y_tr_cat = CategoricalArray(y_tr);

In [63]:
size(X_tr_sp), size(y_tr_cat)

((13195, 1400), (1400,))

In [64]:
model = MulticlassPerceptronClassifier(n_epochs=50, element_type=Float32)

MulticlassPerceptronClassifier(n_epochs = 50,
                               epoch_patience = 5,
                               pocket = true,
                               average_weights = true,
                               element_type = Float32,)[34m @ 1…34[39m

In [65]:
@time fitresult, _ , _  = MLJBase.fit(model, 1, X_tr, y_tr_cat)

[KEpoch: 50 	 Accuracy: 1.0  1.115667 seconds (1.03 M allocations: 140.465 MiB, 2.70% gc time)


((MulticlassPerceptronClassifierParameters{Float32}(Float32[0.0259023 0.581819; 0.646623 0.49239; … ; 2.35888 -1.26332; 2.13371 -1.57738], Float32[-3.0, 3.0], 2, 13195, false), MLJBase.CategoricalDecoder{Float64,UInt32}(CategoricalPool{Float64,UInt32}([-1.0,1.0]), [1, 2])), nothing, NamedTuple())

### sparse data

In [70]:
function affine_dense_input_sparse(W_dense, b_dense, x_sp)
    n_rows_W, n_cols_W = size(W_dense)
    result = zeros(eltype(W_dense), n_rows_W)
    
    
    @inbounds for j in 1:n_rows_W
        for i in x_sp.nzind
            result[j] += W_dense[j,i] * x_sp[i] 
        end
        result[j] +=  b_dense[j]
    end
    return result
end

affine_dense_input_sparse (generic function with 1 method)

In [71]:

function fit!(h::MulticlassPerceptronClassifierParameters, X::AbstractArray, y::AbstractVector;
              n_epochs=50, 
              learning_rate=1., 
              print_flag=0,
              compute_accuracy=false, 
              seed=MersenneTwister(1234),
              pocket=false,
              shuffle_data=false)
    
    n_features, n_samples = size(X)
    @assert length(y) == n_samples
    scores = []
    
    T = eltype(X)
    learning_rate     = T(learning_rate)
    class_placeholder = zeros(T, h.n_classes)
    y_preds           = zeros(Int64, n_samples)
    data_indices      = Array(1:n_samples)
    max_acc           = zero(T)

    if pocket
        W_hat = zeros(T, h.n_features, h.n_classes)
        b_hat = zeros(T, h.n_classes)
    end

    for epoch in 1:n_epochs

        n_mistakes = 0
        if shuffle_data
            shuffle!(seed, data_indices)
        end
        #println("\nepoch ",epoch,"\n")
        @inbounds for m in data_indices
            #println("sample seen ", m ,"\n")
            #@btime x = view($X, :, $m);
            #x = view(X, :, m);
            x = X[:, m]
            #x = view(X_tr_sp.rowval, X_tr_sp.colptr[2]:(X_tr_sp.colptr[3]-1 ))
            #x = view(X_tr_sp.rowval, X_tr_sp.colptr[m]:(X_tr_sp.colptr[m]-1 ))


            #y_hat = predict_with_placeholder(h, x, class_placeholder)
            #@btime y_hat = argmax($h.W'* $x .+ $h.b)W
            #y_hat = argmax(h.W'* x + h.b)
            y_hat = argmax(affine_dense_input_sparse(h.W', h.b, x))
            y_m = y[m]
            #break ############# REMOVE
            if y_m != y_hat
                n_mistakes += 1
                ####  wij ← wij − η (yj −tj) · xi
                h.W[:, y_m]  .= h.W[:, y_m]  + learning_rate .* x
                h.b[y_m]      = h.b[y_m]     + learning_rate
                h.W[:, y_hat] .= h.W[:, y_hat] - learning_rate .* x
                h.b[y_hat]     = h.b[y_hat]    - learning_rate
            end
        end
        #break ############# REMOVE
        #println("FINISHED")

        if compute_accuracy
             @inbounds for m in  data_indices
                 #y_preds[m] = predict_with_placeholder(h, view(X, :, m), class_placeholder)
                y_preds[m] = predict_with_placeholder(h, X[:,m], class_placeholder)
            end
            acc = accuracy(y, y_preds)
            push!(scores, acc)
        else
            acc = (n_samples - n_mistakes)/n_samples
            push!(scores, acc)
        end

        if pocket
            if acc > max_acc
                max_acc = acc
                copy!(W_hat, h.W)
                copy!(b_hat, h.b)
            end
        end

        if print_flag ==1
            print("\r\u1b[K")
            print("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
        
        if print_flag ==2
            println("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
    end
end

fit! (generic function with 1 method)

In [72]:
model = MulticlassPerceptronClassifier(n_epochs=50, element_type=Float32)

MulticlassPerceptronClassifier(n_epochs = 50,
                               epoch_patience = 5,
                               pocket = true,
                               average_weights = true,
                               element_type = Float32,)[34m @ 1…41[39m

In [73]:
@time fitresult, _ , _  = MLJBase.fit(model, 1, X_tr_sp, y_tr_cat);

[KEpoch: 50 	 Accuracy: 1.0  4.902879 seconds (1.34 M allocations: 785.255 MiB, 2.26% gc time)


# Try to avoid creating the col vector

In [75]:
using BenchmarkTools

In [76]:
@btime view(X_tr_sp.rowval, X_tr_sp.colptr[2]:(X_tr_sp.colptr[3]-1 ));            

  230.146 ns (2 allocations: 80 bytes)


In [77]:
@btime  view(X_tr_sp, :, 2);   

  482.732 ns (2 allocations: 64 bytes)


In [78]:
sum(view(X_tr_sp, :, 2))

107.0f0

In [79]:
x_m_beg = X_tr_sp.colptr[m]
x_m_end = X_tr_sp.colptr[m+1] -1

UndefVarError: UndefVarError: m not defined

In [80]:
X_tr_sp.rowval[x_m_beg:x_m_end];

UndefVarError: UndefVarError: x_m_beg not defined

In [81]:
function affine_dense_input_sparse(W_dense, b_dense, X, x_m_beg, x_m_end)
    n_rows_W, n_cols_W = size(W_dense)
    result = zeros(eltype(W_dense), n_rows_W)
    
    @inbounds for j in 1:n_rows_W
        for i in X.rowval[x_m_beg:x_m_end]
            result[j] += W_dense[j,i] * X.nzval[i] 
        end
        result[j] +=  b_dense[j]
    end
    return result
end

affine_dense_input_sparse (generic function with 2 methods)

In [82]:

function fit!(h::MulticlassPerceptronClassifierParameters, X::AbstractArray, y::AbstractVector;
              n_epochs=50, 
              learning_rate=1., 
              print_flag=0,
              compute_accuracy=false, 
              seed=MersenneTwister(1234),
              pocket=false,
              shuffle_data=false)
    
    n_features, n_samples = size(X)
    @assert length(y) == n_samples
    scores = []
    
    T = eltype(X)
    learning_rate     = T(learning_rate)
    class_placeholder = zeros(T, h.n_classes)
    y_preds           = zeros(Int64, n_samples)
    data_indices      = Array(1:n_samples)
    max_acc           = zero(T)

    if pocket
        W_hat = zeros(T, h.n_features, h.n_classes)
        b_hat = zeros(T, h.n_classes)
    end

    for epoch in 1:n_epochs

        n_mistakes = 0
        if shuffle_data
            shuffle!(seed, data_indices)
        end
        #println("\nepoch ",epoch,"\n")
        @inbounds for m in data_indices
            #x = X[:, m]
        
            x_m_beg = X.colptr[m]
            x_m_end = X.colptr[m+1] -1
            y_hat = argmax(affine_dense_input_sparse(h.W', h.b, X, x_m_beg, x_m_end))
            y_m = y[m]

            #break ############# REMOVE
            if y_m != y_hat
                n_mistakes += 1
                ####  wij ← wij − η (yj −tj) · xi
                for i in X.rowval[x_m_beg:x_m_end]
                    
                    #h.W[:, y_m]  .= h.W[:, y_m]  + learning_rate .* x
                    h.W[i, y_m]   +=  learning_rate * X.nzval[i]
                    
                    #h.W[:, y_hat] .= h.W[:, y_hat] - learning_rate .* x
                    h.W[i, y_hat] -=  learning_rate * X.nzval[i]
                end
                h.b[y_m]      = h.b[y_m]     + learning_rate
                h.b[y_hat]    = h.b[y_hat]    - learning_rate
            end
        end
        #break ############# REMOVE
        #println("FINISHED")

        if compute_accuracy
             @inbounds for m in  data_indices
                 #y_preds[m] = predict_with_placeholder(h, view(X, :, m), class_placeholder)
                y_preds[m] = predict_with_placeholder(h, X[:,m], class_placeholder)
            end
            acc = accuracy(y, y_preds)
            push!(scores, acc)
        else
            acc = (n_samples - n_mistakes)/n_samples
            push!(scores, acc)
        end

        if pocket
            if acc > max_acc
                max_acc = acc
                copy!(W_hat, h.W)
                copy!(b_hat, h.b)
            end
        end

        if print_flag ==1
            print("\r\u1b[K")
            print("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
        
        if print_flag ==2
            println("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
    end
end

fit! (generic function with 1 method)

In [83]:
model = MulticlassPerceptronClassifier(n_epochs=50, element_type=Float32)

MulticlassPerceptronClassifier(n_epochs = 50,
                               epoch_patience = 5,
                               pocket = true,
                               average_weights = true,
                               element_type = Float32,)[34m @ 1…02[39m

In [84]:
@btime fitresult, _ , _  = MLJBase.fit(model, 1, X_tr_sp, y_tr_cat);

[KEpoch: 50 	 Accuracy: 1.0  1.345 s (1415513 allocations: 179.10 MiB)


In [None]:
@btime fitresult, _ , _  = MLJBase.fit(model, 1, X_tr_sp, y_tr_cat);

In [None]:
#@time fitresult[1].W' * X_tr_sp[:,1]

### Making matvecprod

In [None]:

function fit!(h::MulticlassPerceptronClassifierParameters, X::AbstractArray, y::AbstractVector;
              n_epochs=50, 
              learning_rate=1., 
              print_flag=0,
              compute_accuracy=true, 
              seed=MersenneTwister(1234),
              pocket=false,
              shuffle_data=false)
    
    n_features, n_samples = size(X)
    @assert length(y) == n_samples
    scores = []
    
    T = eltype(X)
    learning_rate     = T(learning_rate)
    class_placeholder = zeros(T, h.n_classes)
    y_preds           = zeros(Int64, n_samples)
    data_indices      = Array(1:n_samples)
    max_acc           = zero(T)

    if pocket
        W_hat = zeros(T, h.n_features, h.n_classes)
        b_hat = zeros(T, h.n_classes)
    end

    for epoch in 1:n_epochs

        n_mistakes = 0
        if shuffle_data
            shuffle!(seed, data_indices)
        end
        #println("\nepoch ",epoch,"\n")
        @inbounds for m in data_indices
            #println("sample seen ", m ,"\n")
            #@btime x = view($X, :, $m);
            #x = view(X, :, m);
            x = X[:, m]
            #y_hat = predict_with_placeholder(h, x, class_placeholder)
            #@btime y_hat = argmax($h.W'* $x .+ $h.b)
            y_hat = argmax(h.W'* x + h.b)
            y_m = y[m]
            #break ############# REMOVE
            if y_m != y_hat
                n_mistakes += 1
                ####  wij ← wij − η (yj −tj) · xi
                h.W[:, y_m]  .= h.W[:, y_m]  + learning_rate .* x
                h.b[y_m]      = h.b[y_m]     + learning_rate
                h.W[:, y_hat] .= h.W[:, y_hat] - learning_rate .* x
                h.b[y_hat]     = h.b[y_hat]    - learning_rate
            end
        end
        #break ############# REMOVE
        #println("FINISHED")

        if compute_accuracy
             @inbounds for m in  data_indices
                 #y_preds[m] = predict_with_placeholder(h, view(X, :, m), class_placeholder)
                y_preds[m] = predict_with_placeholder(h, X[:,m], class_placeholder)
            end
            acc = accuracy(y, y_preds)
            push!(scores, acc)
        else
            acc = (n_samples - n_mistakes)/n_samples
            push!(scores, acc)
        end

        if pocket
            if acc > max_acc
                max_acc = acc
                copy!(W_hat, h.W)
                copy!(b_hat, h.b)
            end
        end

        if print_flag ==1
            print("\r\u1b[K")
            print("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
        
        if print_flag ==2
            println("Epoch: $(epoch) \t Accuracy: $(round(acc; digits=3))")
        end
    end
end

### Profiling fit

In [None]:
model = MulticlassPerceptronClassifier(n_epochs=2, element_type=Float32)

In [None]:
@profile fitresult, _ , _  = MLJBase.fit(model, 2, X_tr_sp, y_tr_cat);

In [None]:
using TimerOutputs

### Inspecting the speed of fitting: Why is it so slow with sparse data?

In [None]:

Wsp = sparse(fitresult[1].W);
b = fitresult[1].b
x_sp = X_tr_sp[:,1];

In [None]:
using BenchmarkTools

In [None]:
@btime $Wsp'* $x_sp + $b

In [None]:
@btime dense_weights_vector_sparse($Wsp',$b, $x_sp)

In [None]:
@btime fitresult[1].W' * X_tr_sp[:,1] + b

In [None]:
@btime fitresult[1].W' * X_tr_sp[:,1] + b

### Making our own matrix vector product

In [None]:
size(fitresult[1].W)

In [None]:
using BenchmarkTools

In [None]:
W = fitresult[1].W';
b = fitresult[1].b
x_sp = X_tr_sp[:,1];

In [None]:
function dense_weights_vector_sparse(W_dense,b_dense, x_sp)
    n_rows_W, n_cols_W = size(W_dense)
    result = zeros(eltype(W_dense), n_rows_W)
    
    @inbounds for j in 1:n_rows_W
        for i in x_sp.nzind
            result[j] += W_dense[j,i] * x_sp[i] 
        end
        result[j] +=  b[j]
    end
    return result
end

In [None]:
dense_weights_vector_sparse(W,b, x_sp)

In [None]:
W * x_sp + b

In [None]:
@btime W*x_sp + b

In [None]:
@btime dense_weights_vector_sparse(W,b, x_sp)

In [None]:
x = Array(x)
W = Array(W)
W_sp = sparse(W);
b_sp = fitresult[1].b;
b    = Array(fitresult[1].b)
aux  = zeros(2)


In [None]:
@btime W_sp * x_sp + b_sp;

In [None]:
@btime mul!(aux, W_sp, x_sp) + b_sp

In [None]:
@btime mul!(aux, W_sp, x_sp) + b

In [None]:
@btime W_sp*x_sp + b_sp

In [None]:
@btime W *x + b

In [None]:
@btime W *x_sp + b

In [None]:
@btime mul!(aux, W_sp, x_sp)

In [None]:
@btime mul!(aux, W_sp, x_sp) + b

### Matrix vector mutliply faster

In [None]:
matrix_dense_vector_sparse(fitresult[1].W',aux)

In [None]:
@btime fitresult[1].W' * aux

In [None]:
@btime matrix_dense_vector_sparse(fitresult[1].W',aux)

In [None]:
issparse(X_tr_sp)

In [None]:
type_X = typeof(X_tr_sp)