# Utils namespace methods

In [7]:
# assert_all_finite passes: the array holds no NaN, so the call
# completes without raising and without producing any output
import numpy as np
import sklearn.utils as util
X_test = np.array([[0], [3.4], [1]])
util.assert_all_finite(X_test)

In [8]:
# assert_all_finite fails: the np.nan entry triggers the ValueError shown below
import numpy as np
import sklearn.utils as util
X_test = np.array([[0], [np.nan], [1]])
util.assert_all_finite(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [13]:
# as_float_array with copy=True: the integer input is left as it was,
# and a separate float array is returned (compare the two printouts below)
import numpy as np
import sklearn.utils as util
X_test = np.array([[0], [1], [2], [3]])
X_test_out = util.as_float_array(X_test, copy=True)
print(X_test)
print(X_test_out)

[[0]
 [1]
 [2]
 [3]]
[[ 0.]
 [ 1.]
 [ 2.]
 [ 3.]]


In [16]:
# check_X_y passes: X has 4 samples x 3 features and y has 4 labels,
# so the (X, y) pair is handed back as shown in the output below
import numpy as np
import sklearn.utils as util
X_test = np.array([[0,0,0], [1,1,1], [2,2,2], [3,3,3]])
Y_test = np.array([0,1,2,3])
util.check_X_y(X_test, Y_test)

(array([[0, 0, 0],
        [1, 1, 1],
        [2, 2, 2],
        [3, 3, 3]]), array([0, 1, 2, 3]))

In [26]:
# check_X_y fails: the lengths agree (4 samples, 4 labels), but X contains
# a NaN, which raises the "Input contains NaN ..." ValueError shown below
import numpy as np
import sklearn.utils as util
X_test = np.array([[0,0,0], [1,1,1], [2,2,np.nan], [3,3,3]])
Y_test = np.array([0,1,2,3])
util.check_X_y(X_test, Y_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [44]:
# check_X_y with a non-numeric entry in X -- DOES NOT FAIL!
# Mixing ints with 'hello' makes np.array build a string array (dtype '<U11' in
# the output below), and check_X_y hands it back even though dtype="numeric".
# NOTE(review): presumably dtype="numeric" only converts object-dtype input;
# passing dtype=np.float64 should force a conversion error -- confirm in the docs.
import numpy as np
import sklearn.utils as util
X_test = np.array([[0,0,0], [1,1,1], [2,'hello',2], [3,3,3]])
Y_test = np.array([0,1,2,3])
util.check_X_y(X_test, Y_test, dtype="numeric", warn_on_dtype=True)

(array([['0', '0', '0'],
        ['1', '1', '1'],
        ['2', 'hello', '2'],
        ['3', '3', '3']],
       dtype='<U11'), array([0, 1, 2, 3]))

In [59]:
# check_consistent_length passes: X and y both have 4 entries along the
# first dimension, so nothing is raised and nothing is displayed
import numpy as np
import sklearn.utils as util
X_test = np.array([[0,0,0], [1,1,1], [2,2,2], [3,3,3]])
Y_test = np.array([0,1,2,3])
util.check_consistent_length(X_test, Y_test)

In [61]:
# check_consistent_length fails: X has 4 rows but y has only 3 labels,
# which raises the "inconsistent numbers of samples: [4, 3]" ValueError below
import numpy as np
import sklearn.utils as util
X_test = np.array([[0,0,0], [1,1,1], [2,2,2], [3,3,3]])
Y_test = np.array([0,1,2])
util.check_consistent_length(X_test, Y_test)

ValueError: Found input variables with inconsistent numbers of samples: [4, 3]

In [69]:
# check_random_state called with None and with an int seed.
# Only the last expression of a cell is displayed, so the RandomState shown
# below is the result of the seed-17 call; the None call's result is discarded.
# NOTE(review): the author could not get this to fail. Presumably a seed that is
# not None, an int or a RandomState (e.g. a string) raises ValueError -- confirm.
import numpy as np
import sklearn.utils as util
util.check_random_state(None)
util.check_random_state(17)

<mtrand.RandomState at 0x111023f0>

In [78]:
# compute_class_weight: `classes` must be the 1-D array of unique labels that
# occur in y, not the feature matrix. Passing the 2-D X_test here is what
# raised "TypeError: unhashable type: 'numpy.ndarray'".
# 'balanced' weights are n_samples / (n_classes * count_per_class) = 2 / (2 * 1) = 1.
import numpy as np
import sklearn.utils as util
# features are kept for context only; compute_class_weight looks at the labels
X_test = np.array([[1, True, True, True, True, False],[2, True,True,True,True,False]])
Y_test = np.array([1,2])
util.compute_class_weight(class_weight='balanced', classes=np.unique(Y_test), y=Y_test)
# TODO: try compute_sample_weight as well

array([ 1.,  1.])

In [93]:
# check_estimator applied to DummyClassifier: this does not merely warn, it
# fails with the AssertionError shown below ("Estimator doesn't check for NaN
# and inf in fit.").
# NOTE(review): the class itself is passed here, not an instance; newer
# scikit-learn releases may require an instance -- confirm before upgrading.
import numpy as np
import sklearn.utils as util
import sklearn.utils.estimator_checks as chk
import sklearn.dummy as sk
# earlier experiments, kept for reference:
#chk.check_estimator('None')
#sk.DummyClassifier(strategy='stratified', random_state=None, constant=None)
chk.check_estimator(sk.DummyClassifier)

AssertionError: ("Estimator doesn't check for NaN and inf in fit.", DummyClassifier(constant=None, random_state=1, strategy='stratified'))

In [105]:
# safe_sparse_dot on two dense 1-D vectors: 3.0*1 + 4.0*2 = 11.0
import numpy as np
import sklearn.utils as util
import sklearn.utils.extmath as ext
A_test = np.array([3.0, 4.0])
B_test = np.array([1, 2])
ext.safe_sparse_dot(A_test, B_test)

11.0

In [115]:
# create a sparse matrix and make it indexable
# I / J are the row / column coordinates of the 4 stored values in V.
# The output below shows the COO input coming back inside a list, converted
# to Compressed Sparse Row format.
import scipy.sparse as sparse
import sklearn.utils as util
from numpy import array
I = array([0,3,1,0])
J = array([0,3,1,2])
V = array([4,5,7,9])
A = sparse.coo_matrix((V,(I,J)),shape=(4,4))
util.indexable(A)

[<4x4 sparse matrix of type '<class 'numpy.int32'>'
 	with 4 stored elements in Compressed Sparse Row format>]

In [121]:
# TensorFlow aside (unrelated to sklearn.utils): printing f shows a Tensor
# description (name, shape, dtype) as in the output below, not the product 9
import tensorflow as tf
x = tf.Variable(3, name="x")
f = x*x
print(f)

Tensor("mul_2:0", shape=(), dtype=int32)


In [149]:
# resample - X, X_sparse and y are resampled together with the same row picks.
# In the output below row 3 changed from [0., 0.] to [1., 0.], i.e. it is now
# a second copy of row 1 (its first column changed, the second stayed 0).
# NOTE: this cell uses `np` without importing it, so it relies on an earlier cell.
from scipy.sparse import coo_matrix
from sklearn.utils import resample
X = np.array([[1., 0.], [2., 1.], [0., 0.]])
y = np.array([0, 1, 2])
X_sparse = coo_matrix(X)
X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
X

array([[ 1.,  0.],
       [ 2.,  1.],
       [ 1.,  0.]])

In [140]:
# labels after the resample call: the repeated 0 shows one sample was drawn twice
y

array([0, 1, 0])

In [147]:
# n_samples may exceed len(y): 20 draws are taken from the 3 labels currently in y
resample(y, n_samples=20, random_state=0)

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1])

In [150]:
# A Python function can hand back several values at once: they are packed into a tuple
def myfn(in_a, in_b):
    """Return both arguments incremented by one, as the tuple (in_a + 1, in_b + 1)."""
    bumped_a = in_a + 1
    bumped_b = in_b + 1
    return bumped_a, bumped_b
myfn(2, 3)

(3, 4)

In [164]:
# safe indexing: pick rows 0 and 1 of X by index list; X only has those
# two rows, so the output below equals X
import sklearn.utils as util
from numpy import array
X = array([[3,3,1,0],[3,6,4,2]])
util.safe_indexing(X,[0,1])

array([[3, 3, 1, 0],
       [3, 6, 4, 2]])

In [170]:
# shuffle - several collections go in and come out in the same new order:
# X here, X_sparse.toarray() and y in the next cells all show row order 2, 1, 0.
# NOTE: this cell uses `np` without importing it, so it relies on an earlier cell.
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
X = np.array([[1., 0.], [2., 1.], [0., 0.]])
y = np.array([0, 1, 2])
X_sparse = coo_matrix(X)
X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0, n_samples=3)
X

array([[ 0.,  0.],
       [ 2.,  1.],
       [ 1.,  0.]])

In [171]:
# the sparse copy was shuffled into the same row order as the dense X
X_sparse.toarray()

array([[ 0.,  0.],
       [ 2.,  1.],
       [ 1.,  0.]])

In [172]:
# the labels follow the same permutation as the rows (2, 1, 0)
y

array([2, 1, 0])

In [238]:
# Incremental mean / variance along the SPECIFIED axis (axis=1 -> one value per row).
# last_mean / last_var carry the running statistics of the batches seen so far and
# are dense 1-D float arrays with one entry per row here, not 2x2 sparse matrices.
# NOTE(review): the sparse accumulators used before presumably only went unnoticed
# because last_n == 0 marks the first batch -- confirm against the installed version.
# Returns (mean, variance, n) as displayed below.
import sklearn.utils.sparsefuncs as util
from scipy.sparse import csc_matrix
X = csc_matrix(np.array([[ 1, 2],
                         [10,20]], dtype=int))
lst_mean = np.zeros(X.shape[0], dtype=float)
lst_var = np.zeros(X.shape[0], dtype=float)
n = 0
util.incr_mean_variance_axis(X, axis=1, last_mean=lst_mean, last_var=lst_var, last_n=n)


(array([  1.5,  15. ]), array([  0.25,  25.  ]), 2)

In [239]:
# inplace column scaling: X is modified in place, column 0 is multiplied by 1.0
# and column 1 by 2.0 (see the doubled second column in the output below)
import sklearn.utils.sparsefuncs as util
from scipy.sparse import csc_matrix
X = csc_matrix(np.array([ [1 , 2], 
                          [10,20]], dtype=float))
util.inplace_column_scale(X, np.array([1.0, 2.0]))
X.toarray()

array([[  1.,   4.],
       [ 10.,  40.]])

In [240]:
# inplace row scaling: X is modified in place, row 0 is multiplied by 1.0
# and row 1 by 2.0 (see the doubled second row in the output below)
import sklearn.utils.sparsefuncs as util
from scipy.sparse import csc_matrix
X = csc_matrix(np.array([ [1 , 2], 
                          [10,20]], dtype=float))
util.inplace_row_scale(X, np.array([1.0, 2.0]))
X.toarray()

array([[  1.,   2.],
       [ 20.,  40.]])

In [244]:
# inplace row swap: rows 0 and 1 of X trade places (X itself is modified)
import sklearn.utils.sparsefuncs as util
from scipy.sparse import csc_matrix
X = csc_matrix(np.array([ [1 , 2, 3], 
                          [10,20,30]], dtype=float))
util.inplace_swap_row(X, 0, 1)
X.toarray()

array([[ 10.,  20.,  30.],
       [  1.,   2.,   3.]])

In [246]:
# inplace column swap: columns 0 and 2 of X trade places (X itself is modified)
import sklearn.utils.sparsefuncs as util
from scipy.sparse import csc_matrix
X = csc_matrix(np.array([ [1 , 2, 3], 
                          [10,20,30]], dtype=float))
util.inplace_swap_column(X, 0, 2)
X.toarray()

array([[  3.,   2.,   1.],
       [ 30.,  20.,  10.]])

In [252]:
# mean_variance_axis along axis 0: one mean and one variance per column.
# Despite its name, out_mean holds the (means, variances) pair shown below.
import sklearn.utils.sparsefuncs as util
from scipy.sparse import csc_matrix
X = csc_matrix(np.array([[1, 2, 3], [10, 20, 30]], dtype=float))
out_mean = util.mean_variance_axis(X, axis=0)
out_mean

(array([  5.5,  11. ,  16.5]), array([  20.25,   81.  ,  182.25]))

In [254]:
# check_is_fitted - WTF?
import sklearn.utils.validation as util
import sklearn.dummy as sk
util.check_is_fitted(sk.DummyClassifier, ["coef_"])

NotFittedError: This type instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [255]:
# check memory... not available here: the AttributeError below shows that the
# installed sklearn.utils.validation module has no check_memory.
# NOTE(review): presumably check_memory was added in a later scikit-learn
# release than the one installed -- confirm and upgrade before retrying.
import sklearn.utils.validation as util
util.check_memory(None)

AttributeError: module 'sklearn.utils.validation' has no attribute 'check_memory'

In [259]:
# check symmetric: X is not symmetric, and the matrix returned below is the
# average of X and its transpose, e.g. entry (0, 1) is (0.2 + 1) / 2 = 0.6
# NOTE: this cell uses `np` without importing it, so it relies on an earlier cell.
import sklearn.utils.validation as util
X = np.array([ [.1 , .2, .3, .4],
               [ 1,  2,  3, 4], 
               [10, 20, 30, 40],
               [40, 50, 60, 70]]);
util.check_symmetric(X)



array([[  0.1 ,   0.6 ,   5.15,  20.2 ],
       [  0.6 ,   2.  ,  11.5 ,  27.  ],
       [  5.15,  11.5 ,  30.  ,  50.  ],
       [ 20.2 ,  27.  ,  50.  ,  70.  ]])

In [263]:
# check symmetric, minimal case: the off-diagonal entries 2 and 3 are averaged
# to 2.5 in the matrix returned below; the diagonal is unchanged
import sklearn.utils.validation as util
X = np.array([ [1,2],[3,4]]);
util.check_symmetric(X)



array([[ 1. ,  2.5],
       [ 2.5,  4. ]])

In [266]:
# column_or_1d passes: a flat list comes back as the 1-D array shown below
import sklearn.utils.validation as util
util.column_or_1d([1,2,3,4])

array([1, 2, 3, 4])

In [268]:
# column_or_1d fails: a 2 x 4 input has more than one column, which raises
# the "bad input shape (2, 4)" ValueError shown below
import sklearn.utils.validation as util
util.column_or_1d([[1,2,3,4],[1,2,3,4]])

ValueError: bad input shape (2, 4)

In [273]:
# has_fit_parameter: check whether the estimator's fit supports the named
# parameter; for SVC and "sample_weight" the answer below is True
import sklearn.utils.validation as util
from sklearn.svm import SVC
util.has_fit_parameter(SVC(), "sample_weight")

True