<a href="https://colab.research.google.com/github/dcolinmorgan/grph/blob/main/cu_cat_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analyzing Network Identity Data and Red Team Response with Graphistry AutoML + UMAP
We find a simple model that, when clustered in a 2D plane via UMAP, allows fast identification of anomalous computer-to-computer connections.

In [None]:
!pip install --extra-index-url=https://pypi.nvidia.com cuml-cu11 cudf-cu11 cugraph-cu11 pylibraft_cu11 raft_dask_cu11 dask_cudf_cu11 pylibcugraph_cu11 pylibraft_cu11
import cuml,cudf
print(cuml.__version__)

!pip install -U --force git+https://github.com/graphistry/pygraphistry.git@feat/gpu-featurization
!pip install -U git+https://github.com/graphistry/cu-cat.git@DT4
!pip install dirty_cat
!pip install umap_learn==0.5.2
# !pip install Biopython

!nvidia-smi

In [None]:
import os
from collections import Counter
import cProfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pstats import Stats
import cudf
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Allow up to 200 characters per column before pandas truncates displayed text.
pd.set_option('display.max_colwidth', 200)

In [None]:
import getpass

import dirty_cat

# !pip install -q -U --user git+https://github.com/graphistry/pygraphistry.git@feat/gpu-featurization

# Import graphistry while the `pygraphistry` directory is the working directory
# (presumably so the local checkout is the copy that gets imported — confirm).
# The finally-clause restores the previous directory even if the import fails.
_start_dir = os.getcwd()
os.chdir('pygraphistry')
try:
    import graphistry
finally:
    os.chdir(_start_dir)

# Never keep credentials in the notebook: read them from the environment and fall
# back to an interactive prompt for the password. The username keeps its previous
# value as the default when GRAPHISTRY_USERNAME is unset.
graphistry_username = os.environ.get('GRAPHISTRY_USERNAME', 'dcolinmorgan')
graphistry_password = os.environ.get('GRAPHISTRY_PASSWORD') or getpass.getpass('Graphistry password: ')

graphistry.register(api=3, protocol="https", server="hub.graphistry.com",
                    username=graphistry_username, password=graphistry_password)
graphistry.__version__

'0.29.1+37.g5677bea1'

In [None]:
# import logging
# logging.basicConfig(level=logging.DEBUG)

In [None]:
# !pip install -U git+https://github.com/graphistry/cu-cat.git@DT4
# Import cu_cat while the `cu-cat` directory is the working directory (presumably so
# the local checkout is the copy that gets imported — confirm). The finally-clause
# restores the previous directory even if the import fails.
_start_dir = os.getcwd()
os.chdir('cu-cat')
try:
    import cu_cat
finally:
    os.chdir(_start_dir)

Alert on & visualize anomalous identity events

Demo dataset: 1.6B Windows events over 58 days => logins by 12K users over 14K systems. The approach adapts to any identity system with logins. Here we subsample down to a small set of 50k events to prove out the pipeline.

=> Can we identify accounts & computers acting anomalously? Resources being oddly accessed?
=> Can we spot the red team?
=> Operations: Identity incident alerting + identity data investigations
Community/contact for help handling bigger-than-memory & additional features

Runs on both CPU and multi-GPU. Tools: PyGraphistry[AI], DGL + PyTorch, and NVIDIA RAPIDS / umap-learn

In [None]:
# data source citation
# """A. D. Kent, "Cybersecurity Data Sources for Dynamic Network Research,"
# in Dynamic Networks in Cybersecurity, 2015.

# @InProceedings{akent-2015-enterprise-data,
#    author = {Alexander D. Kent},
#    title = {{Cybersecurity Data Sources for Dynamic Network Research}},
#    year = 2015,
#    booktitle = {Dynamic Networks in Cybersecurity},
#    month =        jun,
#    publisher = {Imperial College Press}
# }"""

### redteam

In [None]:
# Authentication-event sample and the labelled red-team events, both read from public gists.
df = pd.read_csv(
    'https://gist.githubusercontent.com/silkspace/c7b50d0c03dc59f63c48d68d696958ff/raw/31d918267f86f8252d42d2e9597ba6fc03fcdac2/redteam_50k.csv',
    index_col=0,
)
red_team = pd.read_csv(
    'https://gist.githubusercontent.com/silkspace/5cf5a94b9ac4b4ffe38904f20d93edb1/raw/888dabd86f88ea747cf9ff5f6c44725e21536465/redteam_labels.csv',
    index_col=0,
)

# Space-joined text features: the computer pair plus auth/logon type ('feats'),
# and the computer pair on its own ('feats2').
computer_pair = df['src_computer'] + ' ' + df['dst_computer']
df['feats'] = computer_pair + ' ' + df['auth_type'] + ' ' + df['logontype']
df['feats2'] = computer_pair

# Keep one event per distinct 'feats' string, stack the red-team rows on top,
# and give every row of the combined frame a sequential node id.
ndf = df.drop_duplicates(subset=['feats'])
tdf = pd.concat([frame.reset_index() for frame in (red_team, ndf)])
tdf['node'] = range(len(tdf))

# Sum of the RED column over the combined frame.
tdf['RED'].sum()

749.0

In [None]:
# Display the combined frame; the rendered output elides the middle rows.
tdf

Unnamed: 0,index,time,src_domain,src_computer,dst_computer,feats,RED,dst_domain,auth_type,logontype,authentication_orientation,success_or_failure,feats2,node
0,0,150885,U620@DOM1,C17693,C1003,C17693 C1003,1.0,,,,,,,0
1,1,151036,U748@DOM1,C17693,C305,C17693 C305,1.0,,,,,,,1
2,2,151648,U748@DOM1,C17693,C728,C17693 C728,1.0,,,,,,,2
3,3,151993,U6115@DOM1,C17693,C1173,C17693 C1173,1.0,,,,,,,3
4,4,153792,U636@DOM1,C17693,C294,C17693 C294,1.0,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19008,8463107,48263,C11843$@DOM1,C11843,C528,C11843 C528 Kerberos Network,0.0,C11843$@DOM1,Kerberos,Network,LogOn,Success,C11843 C528,19757
19009,14394630,77937,C8470$@DOM1,C8470,C528,C8470 C528 NTLM Network,0.0,C8470$@DOM1,NTLM,Network,LogOn,Success,C8470 C528,19758
19010,33398153,173300,C716$@DOM1,C716,C716,C716 C716 ? ?,0.0,C716$@DOM1,?,?,AuthMap,Success,C716 C716,19759
19011,18353851,102472,U7365@DOM1,C16126,C586,C16126 C586 ? ?,0.0,U7365@DOM1,?,?,TGS,Success,C16126 C586,19760


## A model of Computer-Computer features only
We run 3 versions using different engine combinations --

1. first, a full GPU pipeline via 'engine=cuda', employing 'cu_cat' for featurization of the raw dataframe + 'cuml' for UMAP
2. then 'dirty_cat' on cpu encoding and standard umap_learn umapping
3. then cu_cat gpu encoding back to cpu umapping (for the masochists of the world)

You will note that full GPU (1) is understandably the fastest, followed by partial GPU (3) and finally full CPU (2).

In [None]:
%%time
# process = True  # set to false after it's run for ease of speed
pr = cProfile.Profile()
pr.enable()

g = graphistry.nodes(tdf, 'node')

g5a = g.umap(X=['feats'],
            min_words=1000000, # force high so that we don't use Sentence Transformers
            cardinality_threshold=4, # set low so we force topic model
            n_topics=32, # number of topics
            use_scaler=None,
            # feature_engine='cu_cat',
            memoize=False,
             # remove_node_column=False,
            engine='cuda',
            use_scaler_target=None
           )

pr.disable()
stats = Stats(pr)
stats.sort_stats('tottime').print_stats(20)

with open('redteam_cuda.txt', 'w') as stream:
    stats = Stats(pr,stream=stream)
    stats.sort_stats('tottime').print_stats(20)

* Ignoring target column of shape (19762, 0) in UMAP fit, as it is not one dimensional

         1671337 function calls (1647190 primitive calls) in 47.672 seconds

   Ordered by: internal time
   List reduced from 5214 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        8   20.192    2.524   25.020    3.127 /home/graphistry/notebooks/daniel/cu-cat/cu_cat/_gap_encoder.py:1286(_multiplicative_update_h_smallfast)
        3   12.285    4.095   12.286    4.095 /opt/conda/envs/rapids/lib/python3.8/site-packages/numpy/lib/arraysetops.py:523(in1d)
       20    4.296    0.215    4.298    0.215 /opt/conda/envs/rapids/lib/python3.8/site-packages/cudf/core/column/column.py:301(from_arrow)
     4463    3.621    0.001    3.655    0.001 /opt/conda/envs/rapids/lib/python3.8/site-packages/rmm/rmm.py:216(rmm_cupy_allocator)
        7    1.286    0.184    1.529    0.218 /home/graphistry/notebooks/daniel/cu-cat/cu_cat/_gap_encoder.py:275(_get_H)
      216    0.625    0.003    0.626    0.003 {built-in method cupy._core._routines_lina

In [None]:
%%time
# process = True  # set to false after it's run for ease of speed
pr = cProfile.Profile()
pr.enable()

g = graphistry.nodes((tdf), 'node')
# t=time()
g5b = g.featurize(X=['feats'],
           #  min_words=1000000, # force high so that we don't use Sentence Transformers
           #  cardinality_threshold=4, # set low so we force topic model
           #  n_topics=32, # number of topics
           #  use_scaler=None,
            feature_engine='dirty_cat')#,
           #  memoize=False,
           #   # remove_node_column=False,
            # engine='umap_learn',
           #  use_scaler_target=None
           # )

# pr.disable()
# stats = Stats(pr)
# stats.sort_stats('tottime').print_stats(20)

# j=time()-t
# print("umap: \n"+str(j))

with open('redteam_cpu.txt', 'w') as stream:
    stats = Stats(pr,stream=stream)
    stats.sort_stats('tottime').print_stats(20)

         24274251 function calls (24273175 primitive calls) in 71.590 seconds

   Ordered by: internal time
   List reduced from 1518 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1054   15.874    0.015   43.214    0.041 /opt/conda/envs/rapids/lib/python3.8/site-packages/dirty_cat/gap_encoder.py:815(_multiplicative_update_h)
        3   13.987    4.662   13.987    4.662 /opt/conda/envs/rapids/lib/python3.8/site-packages/numpy/lib/arraysetops.py:523(in1d)
3280085/3279849   10.899    0.000   25.132    0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
      775    4.272    0.006   10.009    0.013 /opt/conda/envs/rapids/lib/python3.8/site-packages/dirty_cat/gap_encoder.py:791(_multiplicative_update_w)
   273080    3.324    0.000    6.809    0.000 /opt/conda/envs/rapids/lib/python3.8/site-packages/scipy/sparse/_sputils.py:147(get_index_dtype)
   134949    2.750    0.000    8.322    0.000 /opt/conda/env

In [None]:
%%time
# process = True  # set to false after it's run for ease of speed
pr = cProfile.Profile()
pr.enable()

g = graphistry.nodes((tdf), 'node')

g5c = g.umap(X=['feats'],
            min_words=1000000, # force high so that we don't use Sentence Transformers
            cardinality_threshold=4, # set low so we force topic model
            n_topics=32, # number of topics
            use_scaler=None,
            feature_engine='cu_cat',
            memoize=False,
             # remove_node_column=False,
            engine='umap_learn',
            use_scaler_target=None
           )

pr.disable()
stats = Stats(pr)
stats.sort_stats('tottime').print_stats(20)

with open('redteam_gpu_cpu.txt', 'w') as stream:
    stats = Stats(pr,stream=stream)
    stats.sort_stats('tottime').print_stats(20)

* Ignoring target column of shape (19762, 0) in UMAP fit, as it is not one dimensional

         5788799 function calls (5451682 primitive calls) in 52.187 seconds

   Ordered by: internal time
   List reduced from 5229 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        8   19.784    2.473   23.914    2.989 /home/graphistry/notebooks/daniel/cu-cat/cu_cat/_gap_encoder.py:1286(_multiplicative_update_h_smallfast)
        3   12.768    4.256   12.768    4.256 /opt/conda/envs/rapids/lib/python3.8/site-packages/numpy/lib/arraysetops.py:523(in1d)
      200    5.370    0.027    5.370    0.027 /opt/conda/envs/rapids/lib/python3.8/site-packages/umap/layouts.py:61(_optimize_layout_euclidean_single_epoch)
     4391    3.597    0.001    3.630    0.001 /opt/conda/envs/rapids/lib/python3.8/site-packages/rmm/rmm.py:216(rmm_cupy_allocator)
        7    1.299    0.186    1.544    0.221 /home/graphistry/notebooks/daniel/cu-cat/cu_cat/_gap_encoder.py:275(_get_H)
      728    0.775    0.001    1.336    0.002 /opt/conda/envs/rapids/li

In [None]:
# Load the top 3000 Hacker News posts; the first CSV column becomes the index.
askhn_url = 'https://storage.googleapis.com/cohere-assets/blog/text-clustering/data/askhn3k_df.csv'
askHN = pd.read_csv(askhn_url, index_col=0)

# df = askHN.sample(1000,replace=True) # set smaller if you want to test a minibatch
# df = df[['text','title']]
# df['title'] = df['title'].sample(frac=1).values

In [None]:
# askHN converted to a cuDF frame, featurized with cu_cat.
g = graphistry.nodes(cudf.from_pandas(askHN))

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside featurize() cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g1 = g.featurize(feature_engine='cu_cat', memoize=False, remove_node_column=False)
    # g1a = g1.umap(engine='cuml')

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('aHN_umap_cudf-cu_cat_3k.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)

In [None]:
# askHN as a pandas frame, featurized with cu_cat.
g = graphistry.nodes(askHN)

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside featurize() cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g2 = g.featurize(feature_engine='cu_cat', memoize=False, remove_node_column=False)
    # g2a = g2.umap(engine='cuml')#,feature_engine='cu_cat',memoize=False)

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('aHN_umap_cu_cat_3k.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)

In [None]:
# askHN as a pandas frame, featurized with dirty_cat.
g = graphistry.nodes(askHN)

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside featurize() cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g3 = g.featurize(feature_engine='dirty_cat', memoize=False, remove_node_column=False)
    # g3a = g3.umap(engine='cuml')#,feature_engine='cu_cat',memoize=False)

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('aHN_umap_dirtycat_3k.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)

In [None]:
# g1a.plot()

In [None]:
# g1b.plot()

In [None]:
# Number of rows to sample; the profiling cell that follows also uses `n` in its output filename.
n = 100000

winlogs = (
    pd.read_parquet('~/notebooks/daniel/part.88 (1).parquet')
    # Fixed seed so every run benchmarks the same subset of rows.
    .sample(n, replace=False, random_state=0)
    # Keep only the first 10 columns.
    .iloc[:, :10]
)

In [None]:
# winlogs converted to a cuDF frame, featurized with cu_cat.
g = graphistry.nodes(cudf.from_pandas(winlogs))

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside featurize() (the recorded run ended in a CUDA out-of-memory
# error) cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g4a = g.featurize(feature_engine='cu_cat', memoize=False, remove_node_column=False)
    # g4b = g.umap(umap_engine='cuml',feature_engine='cu_cat',memoize=False,remove_node_column=False)

# Top 20 entries by internal time: first to the cell output, then to a text file
# whose name carries the sample size `n`.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('winlogs_umap_cudf_cuda_cat_' + str(n) + '.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /opt/conda/envs/rapids/include/rmm/mr/device/cuda_memory_resource.hpp

In [None]:
# Deliberate stop so "Run All" does not continue into the 500k- and 1M-row cells below
# (presumably because the 100k-row run above already ran out of GPU memory).
# Raised explicitly so the halt is not mistaken for a typo.
raise RuntimeError("Intentional stop: halting 'Run All' before the cells below.")

In [None]:
winlogs = (
    pd.read_parquet('~/notebooks/daniel/part.88 (1).parquet')
    # 500k rows without replacement; fixed seed so every run benchmarks the same subset.
    .sample(500000, replace=False, random_state=0)
    # Keep only the first 10 columns.
    .iloc[:, :10]
)

# g = graphistry.nodes(winlogs)
# pr = cProfile.Profile()
# pr.enable()

# g10 = g.featurize(feature_engine='cu_cat',memoize=False,remove_node_column=False)
# g10a = g10.umap(engine='cuml')#,feature_engine='cu_cat',memoize=False)

# pr.disable()
# stats = Stats(pr)
# stats.sort_stats('tottime').print_stats(20)

# with open('winlogs_featurize_cu_cat_500k.txt', 'w') as stream:
#     stats = Stats(pr,stream=stream)
#     stats.sort_stats('tottime').print_stats(20)

# winlogs converted to a cuDF frame; cu_cat featurization followed by UMAP with engine='cuml'.
g = graphistry.nodes(cudf.from_pandas(winlogs))

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside the profiled code cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g11 = g.featurize(feature_engine='cu_cat', memoize=False, remove_node_column=False)
    g11b = g11.umap(engine='cuml')

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('winlogs_featurize_cudf-cu_cat_500k.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)


# g = graphistry.nodes(winlogs)
# pr = cProfile.Profile()
# pr.enable()


# g12 = g.featurize(feature_engine='cu_cat',memoize=False,remove_node_column=False)
# g12c = g12.umap(engine='cuml')#,feature_engine='cu_cat',memoize=False)

# pr.disable()
# stats = Stats(pr)
# stats.sort_stats('tottime').print_stats(20)

# with open('winlogs_featurize_dirtycat_500k.txt', 'w') as stream:
#     stats = Stats(pr,stream=stream)
#     stats.sort_stats('tottime').print_stats(20)

In [None]:
winlogs = (
    pd.read_parquet('~/notebooks/daniel/part.88 (1).parquet')
    # 1M rows without replacement; fixed seed so every run benchmarks the same subset.
    .sample(1000000, replace=False, random_state=0)
    # Keep only the first 10 columns.
    .iloc[:, :10]
)

# g = graphistry.nodes(winlogs)
# pr = cProfile.Profile()
# pr.enable()

# g13 = g.featurize(feature_engine='cu_cat',memoize=False,remove_node_column=False)
# g13a = g13.umap(engine='cuml')#,feature_engine='cu_cat',memoize=False)

# pr.disable()
# stats = Stats(pr)
# stats.sort_stats('tottime').print_stats(20)

# with open('winlogs_featurize_cu_cat_1m.txt', 'w') as stream:
#     stats = Stats(pr,stream=stream)
#     stats.sort_stats('tottime').print_stats(20)

# winlogs converted to a cuDF frame; cu_cat featurization followed by UMAP with engine='cuml'.
g = graphistry.nodes(cudf.from_pandas(winlogs))

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside the profiled code cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g14 = g.featurize(feature_engine='cu_cat', memoize=False, remove_node_column=False)
    g14b = g14.umap(engine='cuml')

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('winlogs_featurize_cudf-cu_cat_1m.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)


# g = graphistry.nodes(winlogs)
# pr = cProfile.Profile()
# pr.enable()

# g15 = g.featurize(feature_engine='cu_cat',memoize=False,remove_node_column=False)
# g15b = g15.umap(engine='cuml')#,feature_engine='cu_cat',memoize=False)


# pr.disable()
# stats = Stats(pr)
# stats.sort_stats('tottime').print_stats(20)

# with open('winlogs_featurize_dirtycat_1m.txt', 'w') as stream:
#     stats = Stats(pr,stream=stream)
#     stats.sort_stats('tottime').print_stats(20)

In [None]:
# Deliberate stop so "Run All" does not continue into the cells below.
# Raised explicitly so the halt is not mistaken for a typo.
raise RuntimeError("Intentional stop: halting 'Run All' before the cells below.")

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Number of documents kept for the benchmark.
n_samples = 1000

# Fetch the 20 newsgroups texts with headers, footers and quotes removed; the labels
# returned alongside them are discarded. shuffle/random_state fix the document order.
documents, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)

# First n_samples documents as a single-column DataFrame.
news = pd.DataFrame(documents[:n_samples])

In [None]:
# news as a pandas frame; cu_cat featurization followed by UMAP with engine='cuml'.
g = graphistry.nodes(news)

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside the profiled code cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g7 = g.featurize(feature_engine='cu_cat', memoize=False, remove_node_column=False)
    g7a = g7.umap(engine='cuml')

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('20news_umap_cu_cat.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)

In [None]:
# news converted to a cuDF frame; cu_cat featurization followed by UMAP with engine='cuml'.
g = graphistry.nodes(cudf.from_pandas(news))

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside the profiled code cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g8 = g.featurize(feature_engine='cu_cat', memoize=False, remove_node_column=False)
    g8b = g8.umap(engine='cuml')

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('20news_umap_cudf-cu_cat.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)

In [None]:
# news as a pandas frame; dirty_cat featurization followed by UMAP with engine='cuml'.
g = graphistry.nodes(news)

# The context manager enables the profiler on entry and always disables it on exit,
# so a failure inside the profiled code cannot leave profiling switched on.
with cProfile.Profile() as pr:
    g9 = g.featurize(feature_engine='dirty_cat', memoize=False, remove_node_column=False)
    g9c = g9.umap(engine='cuml')

# Top 20 entries by internal time: first to the cell output, then to a text file.
Stats(pr).sort_stats('tottime').print_stats(20)

with open('20news_umap_dirtycat.txt', 'w') as stream:
    stats = Stats(pr, stream=stream)
    stats.sort_stats('tottime').print_stats(20)