<a href="https://colab.research.google.com/github/dcolinmorgan/grph/blob/main/colab_cucat_bench_run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# cu_cat CPU, GPU benchmark

This notebook examines `cu-cat` automatic feature engineering performance on variously populated datasets, ranging in size and complexity of data. The `dirty_cat` engine is highly optimized and parallelized for CPUs, and the `cu-cat` engine further adds single-GPU acceleration.
* An advantage can be seen even at small scales, but GPU extraction really shines as the data grows, up to the point where GPU memory (VRAM) is nearly, but not completely, full.
* The benchmark does not examine bigger-than-memory or distributed scenarios.
* There is a tradeoff between the compute speedup and the time spent swapping data in and out of GPU memory, so some cases do not favor the GPU.

Both the `GapEncoder()` and `TableVectorizer()` methods are employed independently here, as well as more extensively within the framework of `graphistry`'s `.featurize()` and `.umap()` functions.

The provided results here are from running on a free Google Colab T4 runtime, with a 2.2GHz Intel CPU (12 GB CPU RAM) and T4 Nvidia GPU (16 GB GPU RAM).

In [1]:
# Install the RAPIDS packages (cuML, cuDF) using NVIDIA's package index as an extra index.
# NOTE(review): no version is pinned (the `==23.12.00` pin is commented out), so a
# re-run may resolve different releases than the ones the recorded outputs came from.
!pip install --extra-index-url=https://pypi.nvidia.com cuml-cu12 cudf-cu12 #==23.12.00 #cugraph-cu11 pylibraft_cu11 raft_dask_cu11 dask_cudf_cu11 pylibcugraph_cu11 pylibraft_cu11
# PyGraphistry is installed straight from the `dev/depman_gpufeat` branch on GitHub.
!pip install git+https://github.com/graphistry/pygraphistry.git@dev/depman_gpufeat
# Upgrade cu_cat to the newest published release.
!pip3 install --upgrade cu_cat

In [2]:
# Record the installed cuML version next to the benchmark results.
import cuml
cuml.__version__

'24.02.00'

In [4]:
import cProfile
import os
import time
import warnings
from collections import Counter
from pstats import Stats

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 200)

import cudf,cuml,cupy

In [5]:
# Record the installed cu_cat version next to the benchmark results.
import cu_cat
print(cu_cat.__version__)

v0.9.11


In [6]:
import os
from getpass import getpass

import graphistry

# Credentials must never be committed with the notebook: they are read from the
# GRAPHISTRY_USERNAME / GRAPHISTRY_PASSWORD environment variables, and prompted
# for interactively (password input hidden) when a variable is unset or empty.
# SECURITY: an earlier revision of this cell embedded a real password in plain
# text; that credential should be considered leaked and rotated.
graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    username=os.environ.get('GRAPHISTRY_USERNAME') or input('Graphistry username: '),
    password=os.environ.get('GRAPHISTRY_PASSWORD') or getpass('Graphistry password: '),
)
graphistry.__version__

'0.33.0+367.g03f0fc3'

In [7]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers "UTF-8".

    ``do_setlocale`` is accepted only so the call signature matches the
    function being replaced; it is ignored.
    """
    return "UTF-8"


# Monkey-patch the stdlib so every later lookup of the preferred encoding
# goes through the stub above.
locale.getpreferredencoding = getpreferredencoding

In [8]:
import logging

# Root logger reports errors only, stamped HH:MM:SS. force=True resets any
# previous logging configuration before this one is installed.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.ERROR,
    force=True,
)
logger = logging.getLogger()

# Disabled alternative: verbose logging to a file instead.
# logging.basicConfig(filename='app.log', level=logging.DEBUG, force=True)

In [9]:
import gc

import cupy as cp

# Run the garbage collector first: buffers owned by unreachable Python objects
# are only handed back to CuPy's pool once those objects are collected, so
# flushing the pool before collecting would leave them cached.
n_unreachable = gc.collect()

# Use the public accessor rather than the private `cp._default_memory_pool`.
cp.get_default_memory_pool().free_all_blocks()

# Display the number of unreachable objects found by the collector.
n_unreachable

0

In [10]:
# Optional (disabled): work from a local checkout of the PyGraphistry dev branch.
# !git clone https://github.com/graphistry/pygraphistry.git
# !pip install sentence-transformers umap-learn parameterized dirty_cat
# !cd pygraphistry
# !git checkout dev/depman_gpufeat

# Each line below appends one `PYTHONPATH=...` entry to a `.env` file in the
# current working directory (so re-running the cell appends the entries again).
# NOTE(review): `/path/to/directory` looks like an unreplaced placeholder, and
# writing to `.env` does not by itself change the running kernel's import path —
# confirm whether these lines are still needed.
!echo PYTHONPATH="${PYTHONPATH}:/path/to/directory" >> .env
!echo PYTHONPATH="${PYTHONPATH}:graphistry/tests" >> .env
!echo PYTHONPATH="${PYTHONPATH}:graphistry" >> .env
# export PYTHONPATH="${PYTHONPATH}:graphistry"
# from (del graphistry.)tests.test_feature_utils import
# is_test_cudf = cudf # and os.environ["TEST_CUDF"] != "0"
# !git clone https://github.com/graphistry/cu-cat.git

In [11]:
# !pytest graphistry/tests/test_feature_utils.py
# !pytest graphistry/tests/test_umap_utils.py
# !pytest graphistry/tests/test_umap_utils.py::TestUMAPFitTransform::test_transform_umap

## Introducing `cu-cat`
### with GPU-accelerated `GapEncoder` and `TableVectorizer` (prev. `SuperVectorizer`)

In [None]:
from dirty_cat.datasets._fetching import fetch_midwest_survey
from sklearn.model_selection import train_test_split
from cu_cat import GapEncoder as cuGapEncoder, TableVectorizer as cuTable
from dirty_cat import GapEncoder as dGapEncoder, TableVectorizer as dTable
from numpy.testing import assert_array_equal

In [None]:
# Midwest survey: keep only the free-text "what do you call your region" column.
dataset = fetch_midwest_survey()
region_col = "What_would_you_call_the_part_of_the_country_you_live_in_now"
X_train, X_test = train_test_split(dataset.X[[region_col]], random_state=0)

# GPU encoder (cu_cat)
encA = cuGapEncoder(n_components=2, random_state=2)
encA.fit_transform(X_train)
topics1 = encA.get_feature_names_out()

# CPU encoder (dirty_cat), configured identically
encB = dGapEncoder(n_components=2, random_state=2)
encB.fit_transform(X_train)
topics2 = encB.get_feature_names_out()

# Both engines must produce the same number of topic features.
assert len(topics1) == len(topics2)


In [None]:
# GPU TableVectorizer (cu_cat): fit on the full survey table, then show the
# transformer entry at index 2.
cc_table_vec = cuTable()
aa = cc_table_vec.fit_transform(dataset.X)
C_out = cc_table_vec.transformers_
print(C_out[2]) # print high_card_cat

# CPU TableVectorizer (dirty_cat): same input; here the entry of interest is at index 1.
d_table_vec = dTable()
aa = d_table_vec.fit_transform(dataset.X)
D_out = d_table_vec.transformers_
print(D_out[1]) # print high_card_cat

## Datasets Explored in the notebook:

\begin{array}{cccc}
data&rows&columns&data\ description\\
REDDIT&100&6&text\text{-}rich\\
CTU\text{-}13&10k&16&IP\text{-}address,\ datetime,\ numeric,\ short\ text\ labels\\
redteam&20k&14&messy\ string,\ text,\ other\\
ask\ HN&3000&14&title,\ text,\ datetime,\ numerics\\
20newsgroups&11k&1&paragraphs\\
winlogs&5M&21&windows\ log\ data:\ sparse,\ incomplete,\ hectic\\
\end{array}

## ndf_reddit

100 rows of `title` and `document` text-rich columns

In [None]:
import graphistry
import pandas as pd
from graphistry.features import topic_model

# Reddit sample taken from the PyGraphistry repository's test data.
REDDIT_CSV_URL = 'https://raw.githubusercontent.com/graphistry/pygraphistry/master/graphistry/tests/data/reddit.csv'

ndf_reddit = pd.read_csv(REDDIT_CSV_URL)
print(ndf_reddit.shape)
ndf_reddit.head(5)

In [None]:
# Bind the reddit rows as graph nodes and preview them.
g = graphistry.nodes(ndf_reddit)
# A nodes-only graph has no edge table bound yet (`g._edges` is None), so guard
# the attribute access instead of calling `.shape` on None.
print(g._nodes.shape, None if g._edges is None else g._edges.shape)
g._nodes.head(5)

In [None]:
## run featurize via CPU and GPU and compare speeds, with both results being UMAP-ed on GPU
# The reddit set is benchmarked whole, so n_samples is simply its row count;
# it is used in the profile file names and in the summary line below.
n_samples = len(ndf_reddit)

# --- CPU featurization (dirty_cat) on the pandas frame ---
pr = cProfile.Profile()
pr.enable()
g = graphistry.nodes(ndf_reddit)
start0 = time.time()
g0a = g.umap(**topic_model,engine='cuml',memoize=False,feature_engine='dirty_cat',cardinality_threshold=10,cardinality_threshold_target=10)
end0 = time.time()
pr.disable()
# open() takes a single path; build it with an f-string.
with open(f'reddit_dirty_{n_samples}.txt', 'w') as stream:
    stats = Stats(pr,stream=stream)
    stats.sort_stats('tottime').print_stats(20)

# --- GPU featurization (cu_cat) on the cudf frame ---
g = graphistry.nodes(cudf.from_pandas(ndf_reddit))
pr = cProfile.Profile()
pr.enable()
start1 = time.time()
g0b = g.umap(**topic_model,engine='cuml',memoize=False,feature_engine='cu_cat',cardinality_threshold=10,cardinality_threshold_target=10)
end1 = time.time()
pr.disable()
with open(f'reddit_cucat_{n_samples}.txt', 'w') as stream:
    stats = Stats(pr,stream=stream)
    stats.sort_stats('tottime').print_stats(20)

# Release the inputs; g0a / g0b are kept for the plot cells that follow.
del g, ndf_reddit
T0 = end0-start0
T1 = end1-start1
print('\nn_samples of reddit data:',n_samples,'\nCPU dirty_cat runtime:',np.round(T0,4),'\nGPU cu_cat runtime:',np.round(T1,4),'\nspeedup:', np.round(T0/T1,4))


In [None]:
g0a.plot()

In [None]:
g0b.plot()

## CTU-13 malware dataset
roughly 10k rows of IP-address, date-time, numeric and short text labels

In [None]:
edf = pd.read_csv('https://gist.githubusercontent.com/silkspace/33bde3e69ae24fee1298a66d1e00b467/raw/dc66bd6f1687270be7098f94b3929d6a055b4438/malware_bots.csv', index_col=0)
# Boolean mask: True where the flow label mentions 'Botnet'
T = edf.Label.apply(lambda x: 'Botnet' in x)
bot = edf[T]
nbot = edf[~T]
print(f'Botnet abundance: {100*len(bot)/len(edf):0.2f}%')# so botnet traffic makes up a tiny fraction of total

# let's balance the dataset in a 10-1 ratio, for speed and demonstrative purposes
# (seeded so every run benchmarks the same rows)
negs = nbot.sample(10*len(bot), random_state=0)
edf = pd.concat([bot, negs])  # top part of arrays are bot traffic, then all non-bot traffic
edf = edf.drop_duplicates()

# some useful indicators for later that predict Botnet as Bool and Int
Y = edf.Label.apply(lambda x: 1 if 'Botnet' in x else 0)  # np.array(T)

# Later we will use and exploit any meaning shared between the labels in a latent distribution

# add it to the dataframe
edf['bot'] = Y

# name some columns for edges and features
src = 'SrcAddr'
dst = 'DstAddr'
good_cols_without_edges = ['Dur', 'Proto', 'Sport',
       'Dport', 'State', 'TotPkts', 'TotBytes', 'SrcBytes']

# same feature columns plus the two edge endpoints (derived, so the lists cannot drift apart)
good_cols_with_edges = good_cols_without_edges + [src, dst]

## some encoding parameters
n_topics = 20
n_topics_target = 7

In [None]:
# `shape` is a tuple attribute, not a method — calling it raises TypeError.
print(edf.shape)
edf.head(5)

In [None]:
# Bind the flows as edges (SrcAddr -> DstAddr) and materialize the node table
# from them, so both `_nodes` and `_edges` are available for the shape printout.
g = graphistry.edges(edf, src, dst).materialize_nodes()
print(g._nodes.shape, g._edges.shape)
g._nodes.head(5)

In [None]:
## run featurize via CPU and GPU and compare speeds, with both results being UMAP-ed on GPU
# CTU-13 is benchmarked whole, so n_samples is the row count of the balanced frame;
# it is used in the profile file names and in the summary line below.
n_samples = len(edf)

# Settings shared by both runs; only `feature_engine` differs between them.
umap_kwargs = dict(
    kind='edges',
    X=good_cols_with_edges,
    y=['bot'],
    use_scaler='quantile',
    use_scaler_target=None,
    cardinality_threshold=20,
    cardinality_threshold_target=2,
    n_topics=n_topics,
    engine='cuml',
    memoize=False,
    n_topics_target=n_topics_target,
    n_bins=n_topics_target,
    metric='euclidean',
    n_neighbors=12,
)

# --- CPU featurization (dirty_cat) on the pandas frame ---
pr = cProfile.Profile()
pr.enable()
g = graphistry.edges(edf, src, dst)
# g = graphistry.nodes(edf[['SrcAddr','DstAddr']])
start0 = time.time()
g1a = g.umap(feature_engine='dirty_cat', **umap_kwargs)
end0 = time.time()
pr.disable()
# open() takes a single path; build it with an f-string.
with open(f'ctu13_dirty_{n_samples}.txt', 'w') as stream:
    stats = Stats(pr,stream=stream)
    stats.sort_stats('tottime').print_stats(20)

# --- GPU featurization (cu_cat) on the cudf frame ---
g = graphistry.edges(cudf.from_pandas(edf), src, dst)
# g = graphistry.nodes(edf[['SrcAddr','DstAddr']])
pr = cProfile.Profile()
pr.enable()
start1 = time.time()
# Stored as g1b (not g1a) so the CPU result survives and the g1b.plot() cell works.
g1b = g.umap(feature_engine='cu_cat', **umap_kwargs)
end1 = time.time()
pr.disable()
with open(f'ctu13_cucat_{n_samples}.txt', 'w') as stream:
    stats = Stats(pr,stream=stream)
    stats.sort_stats('tottime').print_stats(20)

# Release the inputs; g1a / g1b are kept for the plot cells that follow.
del g, edf
T0 = end0-start0
T1 = end1-start1
print('\nn_samples of CTU data:',n_samples,'\nCPU dirty_cat runtime:',np.round(T0,4),'\nGPU cu_cat runtime:',np.round(T1,4),'\nspeedup:', np.round(T0/T1,4))


In [None]:
g1a.plot()

In [None]:
g1b.plot()

## redteam

around 20k rows of messy string, text and other columns needing parsing

In [None]:
# Authentication events and the red-team labels, both hosted as gists.
REDTEAM_EVENTS_URL = 'https://gist.githubusercontent.com/silkspace/c7b50d0c03dc59f63c48d68d696958ff/raw/31d918267f86f8252d42d2e9597ba6fc03fcdac2/redteam_50k.csv'
REDTEAM_LABELS_URL = 'https://gist.githubusercontent.com/silkspace/5cf5a94b9ac4b4ffe38904f20d93edb1/raw/888dabd86f88ea747cf9ff5f6c44725e21536465/redteam_labels.csv'

df = pd.read_csv(REDTEAM_EVENTS_URL, index_col=0)
red_team = pd.read_csv(REDTEAM_LABELS_URL, index_col=0)

# Space-joined text features: the full four-field string, and the computer pair only.
df['feats'] = df['src_computer'] + ' ' + df['dst_computer'] + ' ' + df['auth_type'] + ' ' + df['logontype']
df['feats2'] = df['src_computer'] + ' ' + df['dst_computer']

# One row per distinct `feats` string, stacked underneath the labelled red-team rows.
ndf = df.drop_duplicates(subset=['feats'])
tdf = pd.concat([red_team.reset_index(), ndf.reset_index()])

# Sequential integer id used as the node key.
tdf['node'] = range(len(tdf))

In [None]:
tdf.head(5)

In [None]:
# Bind the redteam rows as graph nodes keyed by the `node` column and preview them.
g = graphistry.nodes(tdf, 'node')
# A nodes-only graph has no edge table bound yet (`g._edges` is None), so guard
# the attribute access instead of calling `.shape` on None.
print(g._nodes.shape, None if g._edges is None else g._edges.shape)
g._nodes.head(5)

In [None]:
## run featurize via CPU and GPU and compare speeds, with both results being UMAP-ed on GPU
# Settings shared by both runs; only `feature_engine` differs between them.
umap_kwargs = dict(
    X=['feats'],
    min_words=1000000, # force high so that we don't use Sentence Transformers
    cardinality_threshold=4, # set low so we force topic model
    n_topics=32, # number of topics
    use_scaler=None,
    memoize=False,
    engine='cuml',
    use_scaler_target=None,
)

for n_samples in [5000,10000,15000]:
    # Seeded so that every run benchmarks the same rows.
    tdfA = tdf.sample(n_samples, replace=False, random_state=0)

    # --- CPU featurization (dirty_cat) on the pandas frame ---
    pr = cProfile.Profile()
    pr.enable()
    g = graphistry.nodes(tdfA, 'node')
    start0 = time.time()
    g2a = g.umap(feature_engine='dirty_cat', **umap_kwargs)
    end0 = time.time()
    pr.disable()
    # open() takes a single path; the files are named after this dataset
    # (the original 'ctu13_' prefix was a copy-paste leftover).
    with open(f'redteam_dirty_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    # --- GPU featurization (cu_cat) on the cudf frame ---
    g = graphistry.nodes(cudf.from_pandas(tdfA), 'node')
    pr = cProfile.Profile()
    pr.enable()
    start1 = time.time()
    # Stored as g2b (not g2a) so the CPU result survives and the g2b.plot() cell works.
    g2b = g.umap(feature_engine='cu_cat', **umap_kwargs)
    # Must be end1: writing end0 here corrupted T0 and left end1 undefined.
    end1 = time.time()
    pr.disable()
    with open(f'redteam_cucat_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    del g, tdfA
    T0 = end0-start0
    T1 = end1-start1
    print('\nn_samples of redteam data:',n_samples,'\nCPU dirty_cat runtime:',np.round(T0,4),'\nGPU cu_cat runtime:',np.round(T1,4),'\nspeedup:', np.round(T0/T1,4))


In [None]:
g2a.plot()

In [None]:
g2b.plot()

## ask HN
3000 rows of `title` and `text` columns similar to ndf_reddit dataframe, plus several disparate `time-stamp` columns and various numerics

In [None]:
# Top 3000 "Ask HN" posts from Hacker News
ASKHN_CSV_URL = 'https://storage.googleapis.com/cohere-assets/blog/text-clustering/data/askhn3k_df.csv'

askHNA = pd.read_csv(ASKHN_CSV_URL, index_col=0)
print(askHNA.shape)
askHNA.head(5)

In [None]:
# Bind the Ask HN rows as graph nodes and preview them.
g = graphistry.nodes(askHNA)
# A nodes-only graph has no edge table bound yet (`g._edges` is None), so guard
# the attribute access instead of calling `.shape` on None.
print(g._nodes.shape, None if g._edges is None else g._edges.shape)
g._nodes.head(5)

In [None]:
## run featurize via CPU and GPU and compare speeds, with both results being UMAP-ed on GPU
for n_samples in [500,1000,2000]:
    # Seeded so that every run benchmarks the same rows.
    askHN = askHNA.sample(n_samples, replace=False, random_state=0)

    # --- CPU featurization (dirty_cat) on the pandas frame ---
    pr = cProfile.Profile()
    pr.enable()
    g = graphistry.nodes(askHN)
    start0 = time.time()
    # Results are stored as g3a / g3b, the names the plot cells below use.
    g3a = g.umap(engine='cuml',memoize=False,feature_engine='dirty_cat',cardinality_threshold=10,cardinality_threshold_target=10)
    end0 = time.time()
    pr.disable()
    # open() takes a single path; build it with an f-string.
    with open(f'hn_dirty_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    # --- GPU featurization (cu_cat) on the cudf frame ---
    g = graphistry.nodes(cudf.from_pandas(askHN))
    pr = cProfile.Profile()
    pr.enable()
    start1 = time.time()
    g3b = g.umap(engine='cuml',memoize=False,feature_engine='cu_cat',cardinality_threshold=10,cardinality_threshold_target=10)
    # Must be end1: writing end0 here corrupted T0 and left end1 undefined.
    end1 = time.time()
    pr.disable()
    with open(f'hn_cucat_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    del g, askHN
    T0 = end0-start0
    # GPU time is measured from its own start (start1), not from the CPU run's start.
    T1 = end1-start1
    print('\nn_samples of HN data:',n_samples,'\nCPU dirty_cat runtime:',np.round(T0,4),'\nGPU cu_cat runtime:',np.round(T1,4),'\nspeedup:', np.round(T0/T1,4))


In [None]:
g3a.plot()

In [None]:
g3b.plot()

## 20newsgroups
About 11k documents in a single text column; each entry ranges from a few sentences to several paragraphs

In [None]:
from sklearn.datasets import fetch_20newsgroups

# With return_X_y=True the loader returns a (documents, targets) pair in which
# the documents are a plain Python list, so they are wrapped in a one-column
# DataFrame: `.shape` / `.head()` below, and graphistry.nodes() in later cells,
# all need a frame rather than a list.
news_docs, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)
news = pd.DataFrame({'text': news_docs})
print(news.shape)
news.head(5)

In [None]:
# Bind the newsgroup documents as graph nodes and preview them.
# pd.DataFrame(...) makes this work whether `news` is a list of documents or
# already a DataFrame.
g = graphistry.nodes(pd.DataFrame(news))
# A nodes-only graph has no edge table bound yet (`g._edges` is None), so guard
# the attribute access instead of calling `.shape` on None.
print(g._nodes.shape, None if g._edges is None else g._edges.shape)
g._nodes.head(5)

In [None]:
## run featurize via CPU and GPU and compare speeds, with both results being UMAP-ed on GPU
for n_samples in [500,1000,2000]:
    # First n_samples documents (the loader already shuffled them with a fixed seed).
    newsA = pd.DataFrame(news[:n_samples])

    # --- CPU featurization (dirty_cat) on the pandas frame ---
    pr = cProfile.Profile()
    pr.enable()
    g = graphistry.nodes(newsA)
    start0 = time.time()
    # Results are stored as g4a / g4b, the names the plot cells below use.
    g4a = g.umap(engine='cuml',memoize=False,feature_engine='dirty_cat',cardinality_threshold=10,cardinality_threshold_target=10)
    end0 = time.time()
    pr.disable()
    # open() takes a single path; build it with an f-string.
    with open(f'news_dirty_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    # --- GPU featurization (cu_cat) on the cudf frame ---
    g = graphistry.nodes(cudf.from_pandas(newsA))
    pr = cProfile.Profile()
    pr.enable()
    start1 = time.time()
    g4b = g.umap(engine='cuml',memoize=False,feature_engine='cu_cat',cardinality_threshold=10,cardinality_threshold_target=10)
    # Must be end1: writing end0 here corrupted T0 and left end1 undefined.
    end1 = time.time()
    pr.disable()
    with open(f'news_cucat_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    del g, newsA
    T0 = end0-start0
    # GPU time is measured from its own start (start1), not from the CPU run's start.
    T1 = end1-start1
    print('\nn_samples of news data:',n_samples,'\nCPU dirty_cat runtime:',np.round(T0,4),'\nGPU cu_cat runtime:',np.round(T1,4),'\nspeedup:', np.round(T0/T1,4))


In [None]:
g4a.plot()

In [None]:
g4b.plot()

## winlogs
5M rows of Windows log data, including 21 columns of sparse, incomplete and generally hectically "structured" data. `cu-cat` shines as you scale rows up!

In [None]:
# Fetch one parquet part of the Windows-log data; `-nc` (no-clobber) skips the
# download when the file is already present locally.
!wget -nc https://www.dropbox.com/s/31dx1g6g59exoc3/part.88.parquet
# Load the part into pandas and preview it.
winlogs=pd.read_parquet('part.88.parquet')
print(winlogs.shape)
winlogs.head(5)

In [None]:
# Bind the Windows-log rows as graph nodes and preview them.
# `winlogsA` (the per-iteration sample) only exists once the benchmark loop has
# run, so the full frame `winlogs` is used here.
g = graphistry.nodes(winlogs)
# A nodes-only graph has no edge table bound yet (`g._edges` is None), so guard
# the attribute access instead of calling `.shape` on None.
print(g._nodes.shape, None if g._edges is None else g._edges.shape)
g._nodes.head(5)

In [None]:

## run featurize via CPU and GPU and compare speeds, with both results being UMAP-ed on GPU
for n_samples in [5000,10000,20000]:
    # Seeded so that every run benchmarks the same rows.
    winlogsA = winlogs.sample(n_samples, replace=False, random_state=0)

    # --- CPU featurization (dirty_cat) on the pandas frame ---
    pr = cProfile.Profile()
    pr.enable()
    g = graphistry.nodes(winlogsA)
    start0 = time.time()
    g5a = g.umap(engine='cuml',memoize=False,feature_engine='dirty_cat',cardinality_threshold=10,cardinality_threshold_target=10)
    end0 = time.time()
    pr.disable()
    # open() takes a single path; build it with an f-string.
    with open(f'winlogs_dirty_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    # --- GPU featurization (cu_cat) on the cudf frame ---
    # Use the same sample as the CPU run (winlogsA), not the full `winlogs`
    # frame, otherwise the two timings are not comparable.
    g = graphistry.nodes(cudf.from_pandas(winlogsA))
    pr = cProfile.Profile()
    pr.enable()
    start1 = time.time()
    g5b = g.umap(engine='cuml',memoize=False,feature_engine='cu_cat',cardinality_threshold=10,cardinality_threshold_target=10)
    # Must be end1: writing end0 here corrupted T0 and left end1 undefined.
    end1 = time.time()
    pr.disable()
    with open(f'winlogs_cucat_{n_samples}.txt', 'w') as stream:
        stats = Stats(pr,stream=stream)
        stats.sort_stats('tottime').print_stats(20)

    del g, winlogsA
    T0 = end0-start0
    # GPU time is measured from its own start (start1), not from the CPU run's start.
    T1 = end1-start1
    print('\nn_samples of winlogs data:',n_samples,'\nCPU dirty_cat runtime:',np.round(T0,4),'\nGPU cu_cat runtime:',np.round(T1,4),'\nspeedup:', np.round(T0/T1,4))


In [None]:
g5a.plot()

In [None]:
g5b.plot()