# cuML Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

## Imports

In [1]:
import cudf
import cuml
import numpy as np
import cupy as cp

## Create classification dataset

In [2]:
X, y = cuml.make_classification(
    n_samples=10000
    , n_classes=2
    , n_features=4
    , n_informative=2
    , flip_y=0.05
    , shift=cp.random.rand(4)
    , scale=cp.random.rand(4)
    , random_state=np.random.randint(1e9)
)

df_class = cudf.DataFrame(X, columns=[f'feat_{i}' for i in range(4)])
df_class['label'] = cudf.Series(y)
df_class.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,label
0,0.706547,1.327542,-0.01708,-0.060723,1
1,-0.329612,-0.15818,0.030866,0.069758,0
2,0.224212,-0.455626,0.353308,0.723769,0
3,1.010159,2.089042,0.099062,-0.3243,1
4,0.822657,1.395624,0.041967,-0.01138,1


In [3]:
X_train, X_test, y_train, y_test = cuml.preprocessing.train_test_split(df_class, 'label', train_size=.8)

---

# Classification models

---

#### LogisticRegression()

In [4]:
log_reg = cuml.linear_model.LogisticRegression()

In [7]:
log_reg = cuml.linear_model.LogisticRegression(
    fit_intercept=True
    , max_iter=1000
    , tol=1e-5
)

In [8]:
log_reg.fit(X_train, y_train)

LogisticRegression(penalty='l2', tol=1e-05, C=1.0, fit_intercept=True, max_iter=1000, linesearch_max_iter=50, verbose=4, l1_ratio=None, solver='qn', handle=<cuml.raft.common.handle.Handle object at 0x7f7db06aced0>, output_type='cudf')

In [9]:
print(f'Coefficients: \n{log_reg.coef_}')

Coefficients: 
0   -0.045268
1    2.865233
2   -0.164525
3   -1.272737
dtype: float32


In [10]:
print(f'Intercept: \n{log_reg.intercept_}')

Intercept: 
0   -1.44792
dtype: float32


In [11]:
log_reg.predict(X_test)

0     0
1     0
2     1
3     1
4     1
5     0
6     1
7     1
8     1
9     0
10    0
11    1
12    0
13    0
14    1
15    0
16    1
17    0
18    0
19    0
dtype: int64

In [12]:
log_reg.predict_proba(X_test)

Unnamed: 0,0,1
0,0.586687,0.413313
1,0.995453,0.004547
2,0.204978,0.795022
3,0.211756,0.788244
4,0.008126,0.991874
5,0.8305,0.1695
6,0.190383,0.809617
7,0.069539,0.930461
8,0.087073,0.912927
9,0.79787,0.20213


#### MBSGDClassifier()

In [13]:
mbsgd_c = cuml.MBSGDClassifier()

In [14]:
mbsgd_c = cuml.MBSGDClassifier(
    penalty='elasticnet'
    , alpha=0.001
    , batch_size=64
    , fit_intercept=True
    , eta0=0.002
    , learning_rate='adaptive'
)

In [15]:
mbsgd_c.fit(X_train, y_train)

MBSGDClassifier(loss='hinge', penalty='elasticnet', alpha=0.001, l1_ratio=0.15, fit_intercept=True, epochs=1000, tol=0.001, shuffle=True, learning_rate='adaptive', eta0=0.002, power_t=0.5, batch_size=64, n_iter_no_change=5, handle=<cuml.raft.common.handle.Handle object at 0x7f7dae95ad70>, verbose=4, output_type='input')

In [16]:
mbsgd_c.predict(X_test)

0     1
1     0
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     0
10    0
11    1
12    0
13    1
14    1
15    0
16    1
17    0
18    0
19    1
dtype: int64

#### MultinomialNB()

In [17]:
df_nb = cudf.DataFrame([
      ('a', 1)
    , ('b', 1)
    , ('a', 1)
    , ('b', 1)
    , ('c', 0)
    , ('b', 1)
    , ('b', 1)
    , ('c', 0)
    , ('a', 1)
    , ('b', 1)
], columns=['f_0', 'label'])

In [18]:
from cuml.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer()

X = hv.fit_transform(df_nb['f_0'])
y = df_nb['label']

nb = cuml.MultinomialNB()
nb.fit(X, y)

MultinomialNB()

In [19]:
nb.predict(X)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [20]:
nb.predict_proba(X)

array([[0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8],
       [0.2, 0.8]], dtype=float32)

In [21]:
nb.score(X, y)

0.800000011920929

#### ensemble.RandomForestClassifier()

In [22]:
rf = cuml.ensemble.RandomForestClassifier()

In [9]:
rf = cuml.ensemble.RandomForestClassifier(
    n_estimators=40
    , n_bins=8
    , max_depth=10
    , max_features=1.0
    , min_rows_per_node=10
    , split_criterion=1
)

In [10]:
rf.fit(X_train, y_train)

RandomForestClassifier(split_criterion=1)

In [11]:
rf.predict(X_test)

0       1.0
1       1.0
2       0.0
3       1.0
4       0.0
       ... 
1995    0.0
1996    1.0
1997    1.0
1998    1.0
1999    1.0
Length: 2000, dtype: float32

In [12]:
rf.predict_proba(X_test)

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
...,...,...
1995,1.0,0.0
1996,0.0,1.0
1997,0.0,1.0
1998,0.0,1.0


In [13]:
rf.score(X_test, y_test)

0.965499997138977

#### ForestInference()

In [None]:
from cuml import ForestInference

model_path = 'xgb.model'
fm = ForestInference.load(model_path, output_class=True)
fm.predict(X_test)

#### svm.SVC()

In [21]:
svc = cuml.svm.SVC(
    kernel='poly'
    , degree=2
    , gamma='scale'
    , probability=True
)

In [22]:
svc.fit(X_train, y_train)

SVC(handle=<cuml.raft.common.handle.Handle object at 0x7f9a35132c30>, C=1, kernel='poly', degree=2, gamma='scale', coef0=0.0, tol=0.001, cache_size=200.0, max_iter=-1, nochange_steps=1000, verbose=4, output_type='cudf', probability=True, random_state=None, class_weight=None)

In [23]:
svc.predict(X_test)

0       1
1       1
2       0
3       1
4       0
       ..
1995    0
1996    1
1997    1
1998    1
1999    1
Length: 2000, dtype: int64

In [24]:
svc.predict_proba(X_test)

Unnamed: 0,0,1
0,0.014909,0.985091
1,0.016166,0.983834
2,0.976508,0.023492
3,0.018885,0.981115
4,0.960479,0.039521
...,...,...
1995,0.964220,0.035780
1996,0.016556,0.983444
1997,0.011475,0.988525
1998,0.025032,0.974968


#### neighbors.KNeighborsClassifier()

In [25]:
knn_c = cuml.neighbors.KNeighborsClassifier()

In [26]:
knn_c = cuml.neighbors.KNeighborsClassifier(
    n_neighbors = 5
)

In [27]:
knn_c.fit(X_train, y_train)

KNeighborsClassifier(weights='uniform')

In [28]:
knn_c.predict(X_test)

0       1
1       1
2       0
3       1
4       0
       ..
1995    0
1996    1
1997    1
1998    1
1999    1
Length: 2000, dtype: int64

In [29]:
knn_c.predict_proba(X_test)

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
...,...,...
1995,0.6,0.4
1996,0.0,1.0
1997,0.0,1.0
1998,0.0,1.0


---

# Classification metrics

---

#### metrics.accuracy.accuracy_score()

In [32]:
cuml.metrics.accuracy.accuracy_score(y_test, knn_c.predict(X_test))

0.9710000157356262

#### metrics.confusion_matrix()

In [36]:
cuml.metrics.confusion_matrix(y_test, knn_c.predict(X_test))

array([[953.,  26.],
       [ 32., 989.]])

#### metrics.roc_auc_score()

In [37]:
cuml.metrics.roc_auc_score(y_test, knn_c.predict(X_test))

0.9710502624511719

#### metrics.precision_recall_curve()

In [38]:
cuml.metrics.precision_recall_curve(y_test, knn_c.predict(X_test))

(array([0.51050001, 0.97438425, 1.        ]),
 array([1.        , 0.96865815, 0.        ]),
 array([0, 1]))

#### metrics.pairwise_distances.pairwise_distances()

In [41]:
cuml.metrics.pairwise_distances(y_test.astype('float32'), knn_c.predict(X_test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
1996,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
