# cuML Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

## Imports

In [1]:
import cudf
import cuml
import numpy as np
import cupy as cp

## Sample dataset

In [59]:
# Ten-row sample frame used by the examples in this notebook: numeric
# features f_0..f_3, a string category f_4, a 0/1 label and a float target.
df = cudf.DataFrame(
    [
        (1.44, -0.17, -0.05,  1, 'a', 1,  0.02),
        (1.93,  0.93,  0.59,  7, 'b', 1, -2.07),
        (5.80, -1.02, -0.65, -4, 'a', 1,  2.08),
        (1.94, -0.55, -0.25,  0, 'b', 1,  0.37),
        (3.93,  2.59,  0.07,  2, 'c', 0, -0.01),
        (1.63, -0.35, -0.13,  1, 'b', 1,  0.34),
        (4.26, -0.09, -0.11,  1, 'b', 1, -0.16),
        (2.29,  2.25, -0.15,  0, 'c', 0,  1.64),
        (5.85, -1.64, -0.82, -5, 'a', 1,  2.11),
        (1.49, -0.55, -0.16,  0, 'b', 1,  0.25),
    ],
    columns=['f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'label', 'target'],
)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,label,target
0,1.44,-0.17,-0.05,1,a,1,0.02
1,1.93,0.93,0.59,7,b,1,-2.07
2,5.8,-1.02,-0.65,-4,a,1,2.08
3,1.94,-0.55,-0.25,0,b,1,0.37
4,3.93,2.59,0.07,2,c,0,-0.01
5,1.63,-0.35,-0.13,1,b,1,0.34
6,4.26,-0.09,-0.11,1,b,1,-0.16
7,2.29,2.25,-0.15,0,c,0,1.64
8,5.85,-1.64,-0.82,-5,a,1,2.11
9,1.49,-0.55,-0.16,0,b,1,0.25


---

# Data preprocessing

---

#### model_selection.train_test_split()

In [31]:
# Split the whole frame, identifying the label column by name ('label');
# train_size=0.75 requests 75% of the rows for the training part.
# random_state fixes the seed of the split so a fresh kernel run
# (Restart & Run All) reproduces the same train/test partition.
# NOTE(review): recent cuML releases document this function under
# cuml.model_selection -- confirm the path against the installed version.
X_train, X_test, y_train, y_test = cuml.preprocessing.model_selection.train_test_split(
    df
    , 'label'
    , train_size=0.75
    , random_state=42
)

In [30]:
# Split explicit features (f_1, f_2) and labels passed as separate objects,
# with stratification enabled and 75% of the rows requested for training.
# random_state fixes the seed of the split so a fresh kernel run
# (Restart & Run All) reproduces the same train/test partition.
# NOTE(review): recent cuML releases document this function under
# cuml.model_selection -- confirm the path against the installed version.
X_train, X_test, y_train, y_test = cuml.preprocessing.model_selection.train_test_split(
    df[['f_1', 'f_2']]
    , df['label']
    , train_size=0.75
    , stratify=True
    , random_state=42
)

#### preprocessing.LabelEncoder()

In [36]:
# Encode the string column f_4 as integer codes in a single fit+transform call.
# In the displayed result a/b/c become 0/1/2 with dtype uint8.
le = cuml.preprocessing.LabelEncoder()
le.fit_transform(df['f_4'])

0    0
1    1
2    0
3    1
4    2
5    1
6    1
7    2
8    0
9    1
dtype: uint8

In [38]:
# Two-step form: fit first, keep the returned handle, then transform.
# The displayed codes match the fit_transform result shown above.
encoder = le.fit(df['f_4'])
encoder.transform(df['f_4'])

0    0
1    1
2    0
3    1
4    2
5    1
6    1
7    2
8    0
9    1
dtype: uint8

In [39]:
# Round trip: decoding the integer codes yields the original a/b/c strings.
le.inverse_transform(encoder.transform(df['f_4']))

0    a
1    b
2    a
3    b
4    c
5    b
6    b
7    c
8    a
9    b
dtype: object

#### preprocessing.LabelBinarizer()

In [49]:
# Binarize the 0/1 label column. The displayed matrix has one indicator
# column per label value (label 0 -> [1, 0], label 1 -> [0, 1]).
lb = cuml.preprocessing.LabelBinarizer()
lb.fit_transform(df['label'])

array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1]])

In [43]:
# Two-step form: fit, keep the returned handle, then transform.
# The displayed matrix matches the fit_transform result shown above.
binarizer = lb.fit(df['label'])
binarizer.transform(df['label'])

array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1]])

In [44]:
# Round trip: the indicator matrix decodes back to the original 0/1 labels.
binarizer.inverse_transform(binarizer.transform(df['label']))

array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1])

#### preprocessing.OneHotEncoder()

In [63]:
# Basic one-hot encoder constructed with sparse=False (the results displayed
# later in this section are dense arrays).
ohe = cuml.preprocessing.OneHotEncoder(sparse=False)

In [76]:
# Variant with drop='first'; this rebinds `ohe`.
# In the outputs of this section the three f_4 categories (a/b/c) produce
# two columns, and 'a' is encoded as an all-zero row.
ohe = cuml.preprocessing.OneHotEncoder(sparse=False, drop='first')

In [77]:
# Fit and encode in one call. Note the double brackets: the input is a
# one-column frame selection (df[['f_4']]), not the single column df['f_4'].
ohe.fit_transform(df[['f_4']])

array([[0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.]])

In [79]:
# Two-step form: fit, keep the returned handle (rebinding `encoder`), then
# transform. The displayed matrix matches the fit_transform result shown above.
encoder = ohe.fit(df[['f_4']])
encoder.transform(df[['f_4']])

array([[0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.]])

In [80]:
# Round trip: the one-hot matrix decodes back to a frame whose f_4 column
# holds the original a/b/c values, including the dropped first category.
encoder.inverse_transform(encoder.transform(df[['f_4']]))

Unnamed: 0,f_4
0,a
1,b
2,a
3,b
4,c
5,b
6,b
7,c
8,a
9,b


#### preprocessing.TargetEncoder()

In [84]:
# Target encoder constructed with no arguments (library defaults).
te = cuml.preprocessing.TargetEncoder()

In [82]:
# Explicitly configured target encoder; this rebinds `te`.
# Settings shown: n_folds=5, smooth=0.1, split_method='interleaved',
# and output_type='cupy'.
te = cuml.preprocessing.TargetEncoder(
    n_folds=5,
    smooth=0.1,
    split_method='interleaved',
    output_type='cupy',
)

In [88]:
# Encode the category column f_4 using `label` as the target, in one call.
# The displayed result holds one float per input row.
te.fit_transform(df['f_4'], df['label'])

array([1., 1., 1., 1., 0., 1., 1., 0., 1., 1.])

In [91]:
# Two-step form: fit on (f_4, label), then encode f_4 through the fitted
# handle returned by fit(), as the other fit/transform examples in this
# notebook do (previously `encoder` was bound but never used).
encoder = te.fit(df['f_4'], df['label'])
encoder.transform(df['f_4'])

array([1., 1., 1., 1., 0., 1., 1., 0., 1., 1.])

#### feature_extraction.text.CountVectorizer()

In [113]:
# Nine short documents used as the corpus for the text-vectorizer examples.
# NOTE: the two triple-quoted entries span two source lines, so each contains
# an embedded newline followed by the continuation line's indentation.
string_ser = cudf.Series([
              'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.'
            , 'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)'
            , 'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'
            , '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.'''
            , '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF'''
            , 'BlazingSQL provides a high-performance distributed SQL engine in Python'
            , 'BlazingSQL is built on the RAPIDS GPU data science ecosystem'
            , 'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)'
            , 'Dask is a flexible library for parallel computing in Python'
])

In [103]:
# Count vectorizer constructed with no arguments (library defaults).
cv = cuml.feature_extraction.text.CountVectorizer()

In [106]:
# Configured count vectorizer; this rebinds `cv`.
# English stop words are filtered and ngram_range=(1, 2) is set; the
# vocabulary displayed later in this section contains both single words and
# two-word phrases.
cv = cuml.feature_extraction.text.CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)

In [118]:
# Fit and vectorize the corpus in one call; .toarray() converts the result
# for display (float32 in the output shown).
cv.fit_transform(string_ser).toarray()

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [120]:
# Two-step form: fit, keep the returned handle, then transform.
# The displayed (truncated) matrix matches the fit_transform output above.
vectorizer = cv.fit(string_ser)
vectorizer.transform(string_ser).toarray()

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [122]:
# Inspect the learned vocabulary: 142 tokens in the displayed output,
# covering single words and two-word phrases.
vectorizer.vocabulary_

0                aggregating
1      aggregating filtering
2                         ai
3                   ai suite
4                      allow
               ...          
137          want distribute
138                 want use
139                 workflow
140            workflow fast
141        workflow multiple
Name: token, Length: 142, dtype: object

#### feature_extraction.text.HashingVectorizer()

In [124]:
# Hashing vectorizer constructed with no arguments (library defaults).
hv = cuml.feature_extraction.text.HashingVectorizer()

In [129]:
# Configured hashing vectorizer; this rebinds `hv`.
# Same options as the CountVectorizer example: stop_words='english' and
# ngram_range=(1, 2).
hv = cuml.feature_extraction.text.HashingVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)

In [130]:
# Fit and vectorize the corpus in one call; .toarray() converts the result
# for display (float32 in the output shown).
hv.fit_transform(string_ser).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [131]:
# Two-step form: fit, keep the returned handle (rebinding `vectorizer`),
# then transform and convert with .toarray() for display.
vectorizer = hv.fit(string_ser)
vectorizer.transform(string_ser).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

#### feature_extraction.text.TfidfVectorizer()

In [132]:
# TF-IDF vectorizer constructed with no arguments (library defaults).
tv = cuml.feature_extraction.text.TfidfVectorizer()

In [133]:
# Configured TF-IDF vectorizer; this rebinds `tv`.
# Same options as the other vectorizer examples: stop_words='english' and
# ngram_range=(1, 2).
tv = cuml.feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)

In [134]:
# Fit and vectorize the corpus in one call; the displayed result holds
# fractional float32 weights rather than the whole-number counts shown
# for CountVectorizer.
tv.fit_transform(string_ser).toarray()

array([[0.        , 0.        , 0.18530017, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20747681, 0.20747681, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [135]:
# Two-step form: fit, keep the returned handle (rebinding `vectorizer`),
# then transform. The displayed (truncated) matrix matches the fit_transform
# output above.
vectorizer = tv.fit(string_ser)
vectorizer.transform(string_ser).toarray()

array([[0.        , 0.        , 0.18530017, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20747681, 0.20747681, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)