# cuML Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

## Imports

In [2]:
import cudf
import cuml
import numpy as np
import cupy as cp

## Create regression dataset

In [8]:
X, y, c = cuml.make_regression(
    n_samples=10000
    , n_targets=1
    , n_features=4
    , n_informative=2
    , bias=-3.4
    , noise=0.2
    , coef=True
    , random_state=np.random.randint(1e9)
)

print(f'coefficients:\n {cp.array(c)}')

df_reg = cudf.DataFrame(X, columns=[f'feat_{i}' for i in range(4)])
df_reg['target'] = cudf.Series(y)
df_reg.head()

coefficients:
 [[ 9.516431]
 [45.778072]
 [ 0.      ]
 [ 0.      ]]


Unnamed: 0,feat_0,feat_1,feat_2,feat_3,target
0,1.811694,0.85595,-1.157179,1.175487,53.066772
1,-0.557946,1.769637,-0.482995,0.452414,72.292953
2,1.250792,-0.781601,-0.100964,-1.223461,-27.092638
3,-0.000758,-0.036418,-0.853497,0.504453,-5.076026
4,0.281274,-1.817569,-0.082699,0.898847,-84.085297


In [9]:
X_train, X_test, y_train, y_test = cuml.preprocessing.train_test_split(df_reg, 'target', train_size=.8)

---

# Regression models

---

#### LinearRegression()

In [6]:
lr = cuml.LinearRegression(
    algorithm='svd'
    , fit_intercept=True
    , normalize=True
)

In [10]:
lr.fit(X_train, y_train)

LinearRegression(algorithm='svd', fit_intercept=True, normalize=True, handle=<cuml.raft.common.handle.Handle object at 0x7f7110b137f0>, verbose=4, output_type='cudf')

In [11]:
lr.predict(X_test)

0       52.897781
1       22.874268
2      -35.518852
3       -3.778944
4        8.095516
          ...    
1995     8.623056
1996    38.878197
1997    21.281900
1998    57.972565
1999    -4.635669
Length: 2000, dtype: float32

#### Ridge()

In [51]:
ridge = cuml.Ridge(
    alpha=1.0
    , solver='svd'
    , fit_intercept=False
    , normalize=True
)

In [52]:
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, solver='svd', fit_intercept=False, normalize=True, handle=<cuml.raft.common.handle.Handle object at 0x7f711012c350>, output_type='cudf')

In [53]:
ridge.predict(X_test)

0       56.182236
1       26.206604
2      -32.065166
3       -0.421767
4       11.449984
          ...    
1995    12.046181
1996    42.205769
1997    24.712076
1998    61.343567
1999    -1.181158
Length: 2000, dtype: float32

#### Lasso()

In [48]:
lasso = cuml.Lasso(
    alpha=1.0
    , fit_intercept=False
    , normalize=True
)

In [49]:
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, fit_intercept=False, normalize=True, max_iter=1000, tol=0.001, selection='cyclic', handle=<cuml.raft.common.handle.Handle object at 0x7f71101c6850>, output_type='cudf')

In [50]:
lasso.predict(X_test)

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1995    0.0
1996    0.0
1997    0.0
1998    0.0
1999    0.0
Length: 2000, dtype: float32

#### ElasticNet()

In [28]:
elastic = cuml.ElasticNet()

In [42]:
elastic = cuml.ElasticNet(
    alpha=1.0
    , l1_ratio=0.05
    , fit_intercept=False
    , normalize=True
)

In [43]:
elastic.fit(X_train, y_train)

ElasticNet(alpha=1.0, l1_ratio=0.05, fit_intercept=False, normalize=True, max_iter=1000, tol=0.001, selection='cyclic', handle=<cuml.raft.common.handle.Handle object at 0x7f71101c6750>, output_type='cudf')

In [44]:
elastic.predict(X_test)

0       58.688072
1       27.395233
2      -33.479565
3       -0.413670
4       12.016131
          ...    
1995    12.477379
1996    44.053917
1997    25.695389
1998    64.006500
1999    -1.219085
Length: 2000, dtype: float32

#### MBSGDRegressor()

#### ensemble.RandomForestRegressor()

In [56]:
rf_reg = cuml.ensemble.RandomForestRegressor(
    n_estimators=40
    , n_bins=8
    , max_depth=10
    , max_features=1.0
    , min_rows_per_node=10
    , split_criterion=2
)

In [57]:
rf_reg.fit(X_train, y_train)

RandomForestRegressor(split_criterion=2, accuracy_metric='r2')

In [58]:
rf_reg.predict(X_test)

0       48.877590
1       23.289175
2      -29.943689
3       -8.524265
4       14.018460
          ...    
1995     4.716395
1996    50.056549
1997    19.273937
1998    71.629814
1999   -10.264793
Length: 2000, dtype: float32

#### svm.SVR()

In [63]:
svr = cuml.svm.SVR(
    kernel='linear'
)

In [64]:
svr.fit(X_train, y_train)

SVR(handle=<cuml.raft.common.handle.Handle object at 0x7f711012cc30>, C=1, kernel='linear', degree=3, gamma='scale', coef0=0.0, tol=0.001, epsilon=0.1, cache_size=200.0, max_iter=-1, nochange_steps=1000, verbose=4)

In [65]:
svr.predict(X_test)

0       52.895931
1       22.873196
2      -35.514950
3       -3.775647
4        8.096189
          ...    
1995     8.623354
1996    38.877102
1997    21.280092
1998    57.966404
1999    -4.637165
Length: 2000, dtype: float32

#### neighbors.KNeighborsRegressor()

In [66]:
knn_r = cuml.neighbors.KNeighborsRegressor(
    n_neighbors = 5
)

In [67]:
knn_r.fit(X_train, y_train)

KNeighborsRegressor(weights='uniform')

In [68]:
knn_r.predict(X_test)

0       51.069206
1       22.347643
2      -34.483269
3        1.861916
4        8.214670
          ...    
1995    14.336409
1996    38.786972
1997    14.941347
1998    55.171669
1999    -3.787619
Length: 2000, dtype: float32

---

# Regression metrics

---

#### metrics.regression.mean_absolute_error()

In [72]:
cuml.metrics.regression.mean_absolute_error(y_test, lr.predict(X_test))

array(0.16493095, dtype=float32)

#### metrics.regression.mean_squared_error()

In [73]:
cuml.metrics.regression.mean_squared_error(y_test, lr.predict(X_test), squared=False)

array(0.20599504, dtype=float32)

#### metrics.regression.mean_squared_log_error()

In [74]:
cuml.metrics.regression.mean_squared_log_error(y_test, lr.predict(X_test), squared=False)

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

#### metrics.regression.r2_score()

In [77]:
cuml.metrics.regression.r2_score(y_test, rf_reg.predict(X_test))

0.9373433589935303