# Analyzing Network Identity Data and Red Team Response with Graphistry AutoML + UMAP

We find a simple model that, when embedded in a 2D plane via UMAP and clustered, allows fast identification of anomalous 
computer-to-computer connections.

In [None]:
# ! pip install graphistry[ai] 

In [None]:
import pandas as pd
import graphistry

import os
from joblib import load, dump
from collections import Counter

import numpy as np
import matplotlib.pylab as plt

from sklearn.cluster import DBSCAN
from sknetwork.ranking import PageRank


In [None]:
# Authenticate against Graphistry Hub. Credentials are read from the environment
# so they never land in the notebook.
# Prefer GRAPHISTRY_USERNAME (parallel to GRAPHISTRY_PASSWORD); fall back to the
# original USERNAME variable for backward compatibility. Relying on USERNAME alone
# is fragile because operating systems / shells often pre-set it to the OS login name.
graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    username=os.environ.get('GRAPHISTRY_USERNAME') or os.environ['USERNAME'],
    password=os.environ['GRAPHISTRY_PASSWORD'],
)

Alert on & visualize anomalous identity events

Demo dataset: 1.6B Windows events over 58 days => logins by 12K users over 14K systems.
The approach adapts to any identity system with logins. Here we subsample down to a small set of 50k events to prove out the pipeline. 

* => Can we identify accounts & computers acting anomalously? Resources being oddly accessed?
* => Can we spot the red team?
* => Operations: Identity incident alerting + identity data investigations

Community/contact for help handling bigger-than-memory & additional features

Runs on both CPU + multi-GPU
Tools: PyGraphistry[AI], DGL + PyTorch, and NVIDIA RAPIDS / umap-learn

In [None]:
# cite data source
# """A. D. Kent, "Cybersecurity Data Sources for Dynamic Network Research,"
# in Dynamic Networks in Cybersecurity, 2015.

# @InProceedings{akent-2015-enterprise-data,
#    author = {Alexander D. Kent},
#    title = {{Cybersecurity Data Sources for Dynamic Network Research}},
#    year = 2015,
#    booktitle = {Dynamic Networks in Cybersecurity},
#    month =        jun,
#    publisher = {Imperial College Press}
# }"""

# Data Loading and Munging


In [None]:
# Small sample of the auth events: gives almost equivalent results without
# overheating the computer on the 1.6B events of the full dataset.
AUTH_SAMPLE_URL = 'https://gist.githubusercontent.com/silkspace/c7b50d0c03dc59f63c48d68d696958ff/raw/31d918267f86f8252d42d2e9597ba6fc03fcdac2/redteam_50k.csv'
df = pd.read_csv(AUTH_SAMPLE_URL, index_col=0)
df.head(5)

In [None]:
print(df.shape) # -> 50k (sanity check on the size of the loaded sample)

In [None]:
# The red-team events, labelled after the fact.
RED_TEAM_URL = 'https://gist.githubusercontent.com/silkspace/5cf5a94b9ac4b4ffe38904f20d93edb1/raw/888dabd86f88ea747cf9ff5f6c44725e21536465/redteam_labels.csv'
red_team = pd.read_csv(RED_TEAM_URL, index_col=0)

# Modeling

Make sure the `../data` directory exists (e.g. `mkdir ../data`), or change the paths below.


In [None]:
# Build the combined text features we can use for topic modeling.
#
# The feature columns are always computed on `df`, independent of `process`:
# a later cell samples `df` directly and passes it to the model fitted on
# 'feats', so `df` must carry these columns even when the de-duplicated frame
# is loaded from the cache instead of being rebuilt.
# we create two types of models: one over computers + auth_type + logontype ...
df['feats'] = df.src_computer + ' ' + df.dst_computer + ' ' + df.auth_type + ' ' + df.logontype
# ... and one of just computer to computer
df['feats2'] = df.src_computer + ' ' + df.dst_computer

process = True  # set to False to reuse the cached parquet file
if process:
    # keep one row per unique 'feats' value
    ndf = df.drop_duplicates(subset=['feats'])
    # create the cache directory if needed so the write works on a fresh checkout
    os.makedirs('../data', exist_ok=True)
    ndf.to_parquet('../data/auth-50k-feats-one-column.parquet')
else:
    ndf = pd.read_parquet('../data/auth-50k-feats-one-column.parquet')

print(ndf.shape)

## Red Team Data 
Add it to the front of the DataFrame so we can keep track of it

In [None]:
# Stack the red-team rows on top of the de-duplicated auth rows so the known
# anomalies are easy to keep track of. The full `df` is not needed here: only
# the unique 'feats' entries in `ndf` are used for fitting a model
# (a few cells below).
red_rows = red_team.reset_index()
auth_rows = ndf.reset_index()
tdf = pd.concat([red_rows, auth_rows])
tdf

In [None]:
# add a fiducial index (0 .. len(tdf)-1 in row order); used later as the node id
# column when binding `tdf` as graph nodes
tdf['node'] = range(len(tdf))

In [None]:
# total number of red team events: sum of the RED column
# (presumably a 0/1 flag per row -- confirm against the label file)
tdf.RED.sum()

## Enrichment

In [None]:
# some enrichments
def pagerank(g):
    """
        Adds a PageRank score column to the graph's nodes.

        Fits scikit-network's PageRank on `g._weighted_adjacency` and stores
        the resulting scores in `g._nodes['pagerank']` (modified in place).
        Returns the same `g`.
    """
    from sknetwork.ranking import PageRank
    ranker = PageRank()
    g._nodes['pagerank'] = ranker.fit_transform(g._weighted_adjacency)
    return g

def cluster(g):
    """
        Clusters the UMAP embedding with DBSCAN (default parameters).

        The label of each row of `g._node_embedding` is written to
        `g._nodes['cluster']` (in place).

        Returns (g, fitted DBSCAN model, Counter of labels -> occurrences).
    """
    model = DBSCAN()
    assignments = model.fit_predict(g._node_embedding)
    g._nodes['cluster'] = assignments
    return g, model, Counter(assignments)

def get_confidences_per_cluster(g, cnt):
    """
        Measures, per DBSCAN cluster, which share of its members are Red Team
        events and uses that share as the cluster's confidence.

        Args:
            g: object whose `_nodes` frame has 'cluster' and 'RED' columns
            cnt: Counter of cluster label -> size, visited most common first

        Side effects:
            prints a short report for every cluster holding at least one red
            event and writes each node's cluster confidence to
            `g._nodes['confidence']`.

        Returns:
            (g, DataFrame with columns
                ['cluster', 'confidence', 'n_red', 'total_in_cluster'])
    """
    nodes = g._nodes
    rows = []
    confidence_by_cluster = {}
    for label, size in cnt.most_common():
        members = nodes[nodes.cluster == label]
        total = members.shape[0]
        red_hits = members.RED.sum()
        ratio = red_hits / total
        rows.append([label, ratio, red_hits, total])
        confidence_by_cluster[label] = ratio
        if red_hits > 0:
            print('-'*20)
            print(f'cluster: {label}\n   red {100*red_hits/total:.2f}% or {red_hits} out of {size}')
    # broadcast the per-cluster confidence back onto every node
    g._nodes['confidence'] = [confidence_by_cluster[label] for label in nodes.cluster.values]
    summary = pd.DataFrame(rows, columns=['cluster', 'confidence', 'n_red', 'total_in_cluster'])
    return g, summary


def enrich(g):
    """
        Full enrichment pipeline: PageRank scores, DBSCAN clusters, then the
        per-cluster Red Team confidences.

        Returns (g, fitted DBSCAN model, per-cluster confidence DataFrame).
    """
    ranked = pagerank(g)
    clustered, dbscan, cnt = cluster(ranked)
    scored, cluster_confidences = get_confidences_per_cluster(clustered, cnt)
    return scored, dbscan, cluster_confidences
    

# The Full UMAP Pipelines
Fit a model on 'feats' column

In [None]:
%%time
# Unsupervised pipeline: fit UMAP on the 'feats' text column of `tdf`, enrich the
# result, build a search index and save it. With process=False the saved
# instance is loaded from disk instead of being refit.
process = True  # set to false after it's run for ease of speed
if process:
    g = graphistry.nodes(tdf, 'node')
    g5 = g.umap(X=['feats'], 
                min_words=1000000, # force high so that we don't use Sentence Transformers
                cardinality_threshold=4, # set low so we force topic model
                n_topics=32, # number of topics
                use_scaler=None,
                use_scaler_target=None
               )
    
    # adds the pagerank / cluster / confidence node columns (see `enrich`)
    g5, dbscan, cluster_confidences = enrich(g5)

    g5.build_index()
    # written under ../data -- NOTE(review): presumably the directory must already exist
    g5.save_search_instance('../data/auth-feat-topic.search')
else:
    g = graphistry.bind()
    g5 = g.load_search_instance('../data/auth-feat-topic.search')
    # enrichment is recomputed on the loaded instance
    g5, dbscan, cluster_confidences = enrich(g5)


## Plot it
Color by `confidence` and hover over `red` team histogram to see where events occur

In [None]:
# name the visualization and plot it; render=False presumably returns the
# visualization link instead of embedding it inline -- confirm
g5.name('auth 50k topic feats no target').plot(render=False)

In [None]:
# see how the model has organized features: this is the node feature matrix
# stored on the fitted graph object
X = g5._node_features
X

## Put model into Predict Mode

Once a model is fit, it can predict on new batches, as we demonstrate here.

There are two main methods

`g.transform` and `g.transform_umap` 

see help(*) on each to learn more

One may save the model as above, load it, and wrap in a FastAPI endpoint, etc, to serve in production pipelines.

In [None]:
# Put the fitted model in predict mode on two batches.
# First sample a batch from the normal data (auth=df). A fixed random_state
# keeps the sampled batch -- and the scatter plot built from it -- reproducible
# across kernel restarts.
emb_normal, xp_normal, _ = g5.transform_umap(df.sample(200, random_state=42), None, kind='nodes')
# then transform all the red team data
emb_red, xp_red, _ = g5.transform_umap(red_team, None, kind='nodes')

In [None]:
# all emb's have this form (the scatter plot below reads its `x` and `y` columns)
g5._node_embedding

In [None]:
# Scatter the fitted embedding against the two transformed batches to see how
# well the model separates them.
fig, ax = plt.subplots(figsize=(10, 7))
fit_emb = g5._node_embedding
ax.scatter(fit_emb.x, fit_emb.y, c='b')  # the totality of the fit data
ax.scatter(emb_normal.x, emb_normal.y, c='g')  # batch of new data
ax.scatter(emb_red.x, emb_red.y, c='r')  # red labels to show good cluster separation

## 96% Reduction in Alerts

This indicates a huge reduction in the search space needed 

Since we have clear cluster assignments along with (post facto) confidences of known anomalous activity, we can reduce the search space on new events (via Kafka, Splunk, etc)

In [None]:
# Share of red team events among all events that sit in clusters with 10%
# confidence or above. `>=` matches "or above" and makes this selection the
# exact complement of the `< 0.1` selection used for the alert reduction.
flagged = cluster_confidences[cluster_confidences.confidence >= 0.1]
p = flagged.n_red.sum() / flagged.total_in_cluster.sum()
print(f'{100*p:.2f}%')

In [None]:
# number of data points not to consider, i.e. events in clusters below 10%
# confidence (and it's more if we look at df proper!)
cluster_confidences[cluster_confidences.confidence<0.1].total_in_cluster.sum()

In [None]:
# Fraction of all clustered events that fall in low-confidence (< 10%) clusters.
low_confidence = cluster_confidences.confidence < 0.1
ignorable_events = cluster_confidences.loc[low_confidence, 'total_in_cluster'].sum()
all_events = cluster_confidences['total_in_cluster'].sum()
p = ignorable_events / all_events
print(f'Alert Reduction {100*p:.2f}%')

In [None]:
# Cumulative number of red team events over the clusters, in table order.
fig, ax = plt.subplots(figsize=(10, 7))
red_per_cluster = [row[2] for row in cluster_confidences.values]  # position 2 is the n_red column
ax.plot(np.cumsum(red_per_cluster))
ax.set_xlabel('Anomolous Cluster Number')  # shows that we can ignore first clusters (containing most of the alerts)
ax.set_ylabel('Number of Identified Red Team Events')
print()

## Supervised UMAP
Here we use the RED team label to help supervise the UMAP fit. 
This might be useful once teams have actually identified RED team events 
and want to help separate clusters. 
While separation is better, the unsupervised version does well without.

In [None]:
%%time
# Supervised variant: same 'feats' input as the unsupervised fit, but the RED
# column is passed as the target `y`. With process=False the saved instance is
# loaded from disk instead of being refit.
# NOTE(review): min_words and cardinality_threshold differ from the unsupervised
# cell (100000 vs 1000000, 2 vs 4) and use_scaler is not passed -- confirm intended.
process = True
if process:
    g = graphistry.nodes(tdf, 'node')
    g6 = g.umap(X=['feats'], y =['RED'], 
                min_words=100000, 
                cardinality_threshold=2, 
                n_topics=32,
                use_scaler_target=None)
    # adds the pagerank / cluster / confidence node columns (see `enrich`)
    g6, dbscan6, cluster_confidences6  = enrich(g6)
    g6.build_index()
    g6.save_search_instance('../data/auth-feat-supervised-topic.search')
else:
    g = graphistry.bind()
    g6 = g.load_search_instance('../data/auth-feat-supervised-topic.search')
    # enrichment is recomputed on the loaded instance
    g6, dbscan6, cluster_confidences6  = enrich(g6)


### Plot
Color by `confidence` and hover over `red` team histogram to see where events occur

In [None]:
# name and plot the supervised model (same plot call as for the unsupervised one)
g6.name('auth 50k topic with supervised umap').plot(render=False)

## A model of Computer-Computer features only
Here we ignore `auth_type` and `logontype` and just fit on computer to computer, unsupervised.

In [None]:
%%time
# Computer-to-computer model: fit on 'feats2' (src_computer + dst_computer only),
# unsupervised -- the `y` target is left commented out. With process=False the
# saved instance is loaded from disk instead of being refit.
process = True
if process:
    g = graphistry.nodes(tdf, 'node')
    g7 = g.umap(X=['feats2'], #y =['RED'], 
                min_words=100000, 
                cardinality_threshold=2, 
                n_topics=32,
                use_scaler_target=None)
    # adds the pagerank / cluster / confidence node columns (see `enrich`)
    g7, dbscan7, cluster_confidences7  = enrich(g7)
    g7.build_index()
    g7.save_search_instance('../data/auth-feat-just-ip-topic.search')
else:
    g7 = graphistry.bind().load_search_instance('../data/auth-feat-just-ip-topic.search')
    # enrichment is recomputed on the loaded instance
    g7, dbscan7, cluster_confidences7  = enrich(g7)


### Plot
Color by `confidence` and hover over `red` team histogram to see where events occur

In [None]:
# name and plot the computer-to-computer model
g7.name('auth 50k topic only ips no supervision').plot(render=False)

In [None]:
# node features of the computer-to-computer model
# NOTE(review): an earlier cell reads `g5._node_features` directly; presumably
# `_get_feature('nodes')` returns the equivalent matrix -- confirm
X = g7._get_feature('nodes')
X

# Conditional Probability
Let's see if conditional probabilities can give us good histograms to tease out red team nodes. This is to baseline the above UMAP models, and we find in retrospect that UMAP wins.

In [None]:
# bind `tdf` as an edge list from src_computer to dst_computer
# NOTE(review): the conditional-probability graphs below are built from
# `cprob` / `cprob2`, not from this `g` -- it may be unused
g = graphistry.edges(tdf, "src_computer", "dst_computer")

In [None]:
def conditional_probability(x, given, df):
    """Empirical conditional probability over categorical columns:
       p(x|given) = p(x,given)/p(given)

    Args:
        x: name of the column of interest
        given: name of the column whose value is held fixed
        df: dataframe with columns [given, x]
    Returns:
        for every value of `given`, the count of each `x` value divided by
        the number of rows in that group; normally a pd.Series indexed by
        (given, x) pairs
    """
    def _share_within_group(values):
        # occurrences of each x value relative to the size of the group
        return values.value_counts() / len(values)

    per_given = df.groupby([given])[x]
    return per_given.apply(_share_within_group)


In [None]:
# p(dst_computer | src_computer) over all rows of `tdf`
x = 'dst_computer'
given = 'src_computer'
condprobs = conditional_probability(x, given, df=tdf)

# flatten the (given, x) index pairs into two columns and attach the probabilities
cprob = pd.DataFrame(list(condprobs.index), columns=[given, x]).assign(_probs=condprobs.values)

In [None]:
# Now enrich the edges dataframe with the red team data, since cprob lost
# those labels during the function call.
# Match on the exact (src_computer, dst_computer) pair: testing each column on
# its own would also flag edges that merely combine a red team source with an
# unrelated red team destination.
pair_cols = ['src_computer', 'dst_computer']
red_pairs = pd.MultiIndex.from_frame(red_team[pair_cols].drop_duplicates())
indx = pd.MultiIndex.from_frame(cprob[pair_cols]).isin(red_pairs)
cprob.loc[indx, 'red'] = 1
cprob.loc[~indx, 'red'] = 0

In [None]:
# full condprob graph, with the conditional probability bound as the edge weight
# NOTE(review): at this point x is 'dst_computer' and given is 'src_computer';
# if edges() takes (source, destination) in that order, the drawn edges run
# dst -> src -- confirm this direction is intended
cg = graphistry.edges(cprob, x, given).bind(edge_weight='_probs')
cg.plot(render=False)

## Learning
The conditional graph shows that most of the edge probabilities are between 4e-7 and 0.03, whose bucket contains most events. Thus the chances of finding the red team edges are ~ 1e-4 -- slim indeed. UMAP wins.

Likewise the transpose conditional is even worse 
with prob_detection ~ 6e-5

In [None]:
# Repeat with the reverse conditional: p(src_computer | dst_computer)
x = 'src_computer'
given = 'dst_computer'
condprobs2 = conditional_probability(x, given, df=tdf)

# flatten the (given, x) index pairs into two columns and attach the probabilities
cprob2 = pd.DataFrame(list(condprobs2.index), columns=[given, x]).assign(_probs=condprobs2.values)

In [None]:
# Now enrich the edges dataframe with the red team data.
# Match on the exact (src_computer, dst_computer) pair: testing each column on
# its own would also flag edges that merely combine a red team source with an
# unrelated red team destination.
pair_cols = ['src_computer', 'dst_computer']
red_pairs = pd.MultiIndex.from_frame(red_team[pair_cols].drop_duplicates())
indx = pd.MultiIndex.from_frame(cprob2[pair_cols]).isin(red_pairs)
cprob2.loc[indx, 'red'] = 1
cprob2.loc[~indx, 'red'] = 0

In [None]:
# reverse-conditional graph: here x is 'src_computer' and given is 'dst_computer',
# with the conditional probability bound as the edge weight
cg2 = graphistry.edges(cprob2, x, given).bind(edge_weight='_probs')
cg2.plot(render=False)
# same conclusion as above...

In [None]:
# # let's see the probs better:
# for src in red_team.src_computer.unique():
#     for dst in red_team.dst_computer.unique():
#         if dst in condprobs[src]:
#             print('-'*30)
#             print(f'given src {src} -> dst {dst}')
#             print('-'*10)
#             print(f'   {condprobs[src][dst]*100:.2f}%')
#             print()

In [None]:
# for dst in red_team.dst_computer.unique():
#     for src in red_team.src_computer.unique():
#         if src in condprobs2[dst]:
#             print('-'*20)
#             print(f'given dst {dst} -> src {src}')
#             print('-'*10)
#             print(f'  {condprobs2[dst][src]*100:.2f}%')
#             print()