In [1]:
import os
import os.path

import cudf
import cupy as cp
import cuml
from cuml.linear_model import LogisticRegression, MBSGDClassifier, MBSGDRegressor
from cuml.multiclass import MulticlassClassifier
from cuml.naive_bayes import MultinomialNB
from cuml.ensemble import RandomForestClassifier
from cuml.svm import SVC
from cuml.metrics.regression import mean_squared_error as mnsq

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mnsq_cpu

%run ../utils/f_northing.py
%run ../utils/f_northing_numpy.py
%run ../utils/f_price_range.py
%run ../utils/f_static_data.py
%run ../utils/f_utils.py

cuml.set_global_output_type('cudf')

Parece ser que la operación sort de cupy (basada en numpy.sort) tiene un límite estricto de elementos, ya sea por tamaño o por memoria disponible.

Creamos un par (X,y) con exactamente 5405 filas y ejecutamos un clasificador RandomForest (en este caso binario):

In [2]:
X = cp.random.normal(size=(5405,4)).astype(np.float32)
y = cp.random.randint(2,size=(5405,1)).astype(np.int32)

cuml_model = RandomForestClassifier(max_features=1.0,
                   n_bins=8,
                   n_estimators=40)

x_train, x_test, y_train, y_test  = cuml.train_test_split(X, y, train_size=0.9)

cuml_model.fit(x_train,y_train)
cuml_predict = cuml_model.predict(x_test)

print("Predicted labels : ", cuml_predict[0:10])
print("Real : ", y_test[0:10])

Predicted labels :  0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    1.0
6    1.0
7    0.0
8    0.0
9    1.0
dtype: float32
Real :  [[0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]]


Ejecutamos el mismo clasificador con una fila más:

In [3]:
X = cp.random.normal(size=(5406,4)).astype(np.float32)
y = cp.random.randint(2,size=(5406,1)).astype(np.int32)

cuml_model = RandomForestClassifier(max_features=1.0,
                   n_bins=8,
                   n_estimators=40)

x_train, x_test, y_train, y_test  = cuml.train_test_split(X, y, train_size=0.9)

cuml_model.fit(x_train,y_train)
cuml_predict = cuml_model.predict(x_test)

print("Predicted labels : ", cuml_predict[0:10])
print("Real : ", y_test[0:10])

RuntimeError: CUDA error encountered at: file=/opt/conda/envs/rapids/conda-bld/libcuml_1614210250760/work/cpp/src/decisiontree/quantile/quantile.cuh line=150: call='cub::DeviceRadixSort::SortKeys( (void *)d_temp_storage->data(), temp_storage_bytes, &d_keys_in[batch_offset], d_keys_out->data(), n_sampled_rows, 0, 8 * sizeof(T), tempmem->stream)', Reason=cudaErrorInvalidValue:invalid argument
Obtained 64 stack frames
#0 in /home/gondelcha/anaconda3/envs/rapids-0.18/lib/python3.8/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft9exception18collect_call_stackEv+0x46) [0x7f7e7d6fff36]
#1 in /home/gondelcha/anaconda3/envs/rapids-0.18/lib/python3.8/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft10cuda_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE+0x69) [0x7f7e7d700699]
#2 in /home/gondelcha/anaconda3/envs/rapids-0.18/lib/python3.8/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML12DecisionTree19preprocess_quantileIfiEEvPKT_PKjiiiiSt10shared_ptrI15TemporaryMemoryIS2_T0_EE+0xaaf) [0x7f7e7d80fa7f]
#3 in /home/gondelcha/anaconda3/envs/rapids-0.18/lib/python3.8/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML12rfClassifierIfE3fitERKN4raft8handle_tEPKfiiPiiRPNS_20RandomForestMetaDataIfiEE+0xde3) [0x7f7e7da45b63]
#4 in /home/gondelcha/anaconda3/envs/rapids-0.18/lib/python3.8/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3fitERKN4raft8handle_tERPNS_20RandomForestMetaDataIfiEEPfiiPiiNS_9RF_paramsEi+0x1fd) [0x7f7e7da4054d]
#5 in /home/gondelcha/anaconda3/envs/rapids-0.18/lib/python3.8/site-packages/cuml/ensemble/randomforestclassifier.cpython-38-x86_64-linux-gnu.so(+0x3c7e5) [0x7f7e548527e5]
#6 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(PyObject_Call+0x255) [0x557f50dca2b5]
#7 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x21c1) [0x557f50e76de1]
#8 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalCodeWithName+0x2c3) [0x557f50e55503]
#9 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1b2007) [0x557f50e57007]
#10 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x4ca3) [0x557f50e798c3]
#11 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalCodeWithName+0x2c3) [0x557f50e55503]
#12 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(PyEval_EvalCodeEx+0x39) [0x557f50e56559]
#13 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(PyEval_EvalCode+0x1b) [0x557f50ef99ab]
#14 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x2731de) [0x557f50f181de]
#15 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x128d4b) [0x557f50dcdd4b]
#16 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x92f) [0x557f50e7554f]
#17 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x182ea3) [0x557f50e27ea3]
#18 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x1d37) [0x557f50e76957]
#19 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x182ea3) [0x557f50e27ea3]
#20 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x1d37) [0x557f50e76957]
#21 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x182ea3) [0x557f50e27ea3]
#22 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1958c9) [0x557f50e3a8c9]
#23 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0xa4b) [0x557f50e7566b]
#24 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyFunction_Vectorcall+0x1a6) [0x557f50e56706]
#25 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x92f) [0x557f50e7554f]
#26 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyFunction_Vectorcall+0x1a6) [0x557f50e56706]
#27 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0xa4b) [0x557f50e7566b]
#28 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalCodeWithName+0x2c3) [0x557f50e55503]
#29 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyFunction_Vectorcall+0x378) [0x557f50e568d8]
#30 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1b1f91) [0x557f50e56f91]
#31 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(PyObject_Call+0x5e) [0x557f50dca0be]
#32 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x21c1) [0x557f50e76de1]
#33 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalCodeWithName+0x2c3) [0x557f50e55503]
#34 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1b2007) [0x557f50e57007]
#35 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x1782) [0x557f50e763a2]
#36 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1925da) [0x557f50e375da]
#37 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x128d4b) [0x557f50dcdd4b]
#38 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x13b3ea) [0x557f50de03ea]
#39 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x21da4f) [0x557f50ec2a4f]
#40 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x128fc2) [0x557f50dcdfc2]
#41 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x92f) [0x557f50e7554f]
#42 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalCodeWithName+0x2c3) [0x557f50e55503]
#43 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyFunction_Vectorcall+0x378) [0x557f50e568d8]
#44 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0xa4b) [0x557f50e7566b]
#45 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1925da) [0x557f50e375da]
#46 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x128d4b) [0x557f50dcdd4b]
#47 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x13b3ea) [0x557f50de03ea]
#48 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x21da4f) [0x557f50ec2a4f]
#49 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x128fc2) [0x557f50dcdfc2]
#50 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x92f) [0x557f50e7554f]
#51 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalCodeWithName+0x2c3) [0x557f50e55503]
#52 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1b2007) [0x557f50e57007]
#53 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x92f) [0x557f50e7554f]
#54 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1925da) [0x557f50e375da]
#55 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x128d4b) [0x557f50dcdd4b]
#56 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x13b3ea) [0x557f50de03ea]
#57 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x21da4f) [0x557f50ec2a4f]
#58 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x128fc2) [0x557f50dcdfc2]
#59 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalFrameDefault+0x92f) [0x557f50e7554f]
#60 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyEval_EvalCodeWithName+0x2c3) [0x557f50e55503]
#61 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(_PyFunction_Vectorcall+0x378) [0x557f50e568d8]
#62 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(+0x1b1f91) [0x557f50e56f91]
#63 in /home/gondelcha/anaconda3/envs/rapids-0.18/bin/python(PyObject_Call+0x5e) [0x557f50dca0be]


# Otras técnicas soportadas por RAPIDS (en progreso)

Clasificación y regresión mediante SGD/MiniBatch SGD (Stochastic Gradient Descent)

- Entrenan aparentemente de manera correcta, pero las predicciones siempre llegan vacías.

Clasificador multiclase

- Devuelve errores 'L-BFGS line search failed' - posible error de memoria o de entrada de datos

Naive Bayes multinomial

Clasificación multiclase mediante conjunto Random Forest

In [None]:
model = RandomForestClassifier(max_features=1.0, n_bins=8, n_estimators=40) #categorías deben ser consecutivas

Regresión mediante algoritmos Quasi-Newton

In [None]:
model = cuml.QN(loss='softmax') #OK pero no ideal

Clasificación mediante soporte de vectores SVC (C-Support Vector Classification)

In [None]:
model = SVC(kernel='poly', degree=2, gamma='auto', C=1) #error de memoria