# Threat Hunting Masterclass: Three data science notebooks for finding bad actors in your network logs

More info: https://www.graphistry.com/blog/zeek-masterclass

To get start,load logs.tar into your Splunk and replace the index used below, or call it index `corelight_tutorial`: https://data.world/graphistry/networkforensics . From there, follow the cells below.

## Configure

* If you are using Graphistry Marketplace, leave `GRAPHISTRY` unedited, else, uncomment and fill it in
* Fill in `SPLUNK`. Make sure the user has capabilities for REST API access and reading the index in which you put `logs.tar`

In [0]:
#graphistry
GRAPHISTRY = {
    #'key': 'MY_API_KEY',
    #'protocol': 'https',
    #'server': 'labs.graphistry.com',
    #'api': 2
}    

#splunk
SPLUNK = {
    #'host': 'my.splunk.com',
    'scheme': 'https',
    'port': 8089,
    'username': 'my_user',
    'password': 'my_pwd'   
}

## Imports

In [0]:
!pip install graphistry -q
!pip install splunk-sdk -q

In [0]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

import re

import graphistry
graphistry.register(**GRAPHISTRY)

In [0]:
import splunklib
import splunklib.client as client
import splunklib.results as results

service = client.connect(**SPLUNK)

## Helpers

### Splunk
* Query splunk, with optional args like sampleRate
* Automatically paginate when result split over multiple responses
* Return as a Pandas dataframe (Note: treats all cols as strings)

In [0]:
STEP = 50000;                       
def splunkToPandas(qry, overrides={}):
    kwargs_blockingsearch = {
        "count": 0,
        "earliest_time": "2010-01-24T07:20:38.000-05:00",
        "latest_time": "now",
        "search_mode": "normal",
        "exec_mode": "blocking",
        **overrides}
    job = service.jobs.create(qry, **kwargs_blockingsearch)

    print("Search results:\n")
    resultCount = job["resultCount"]
    offset = 0;                         

    print('results', resultCount)
    out = None
    while (offset < int(resultCount)):
        print("fetching:", offset, '-', offset + STEP)
        kwargs_paginate = {**kwargs_blockingsearch,
                           "count": STEP,
                           "offset": offset}

        # Get the search results and display them
        blocksearch_results = job.results(**kwargs_paginate)
        reader = results.ResultsReader(blocksearch_results)
        lst = [x for x in reader]
        df2 = pd.DataFrame(lst)    
        out = df2 if type(out) == type(None) else pd.concat([out, df2], ignore_index=True)
        offset += STEP
    for c in out.columns:
        out[c] = out[c].astype(str)
    return out

### Bro/Zeek

Useful bindings for hypergraphs

In [0]:
categories = {
    'ip': ['id.orig_h', 'id.resp_h']
}

opts={
    'CATEGORIES': categories 
}

### Graphistry

In [0]:
##Extend graphistry.plotter.Plotter to add chainable method "my+graph.color_points_by('some_column_name')..." (and "color_edges_by")

import graphistry.plotter

def color_col_by_categorical(df, type_col):
  types = list(df[type_col].unique())
  type_to_color = {t: i for (i, t) in enumerate(types)}
  return df[type_col].apply(lambda t: type_to_color[t])

def color_col_by_continuous(df, type_col):
  mn = df[type_col].astype(float).min()
  mx = df[type_col].astype(float).max()
  if mx - mn < 0.000001:
    print('warning: too small values for color_col_by_continuous')
    return color_col_by_categorical(df, type_col)
  return df[type_col].apply(lambda v: 228010 - round(10 * (float(v) - mn)/(mx - mn) ))
  

## g * str * 'categorical' | 'continuous' -> g
def color_points_by(g, type_col, kind='categorical'):
  fn = color_col_by_categorical if kind == 'categorical' else color_col_by_continuous
  colors = fn(g._nodes, type_col)
  return g.nodes( g._nodes.assign(point_color=colors) ).bind(point_color='point_color')

## g * str * 'categorical' | 'continuous' -> g
def color_edges_by(g, type_col, kind='categorical'):
  fn = color_col_by_categorical if kind == 'categorical' else color_col_by_continuous
  colors = fn(g._edges, type_col)
  return g.edges( g._edges.assign(edge_color=colors) ).bind(edge_color='edge_color')

graphistry.plotter.Plotter.color_points_by = color_points_by
graphistry.plotter.Plotter.color_edges_by = color_edges_by

In [0]:
## remove node/edges pointing to "*::nan" values
def safe_not_nan(prog, v):
  try: 
    return not prog.match(v)
  except:
    return True
  
def drop_nan_col(df, col, prog):
  not_nans = df[col].apply(lambda v: safe_not_nan(prog, v))
  return df[ not_nans == True ]
  
def drop_nan(g, edges = ['src', 'dst'], nodes = ['nodeID']):
  prog = re.compile(".*::nan$")
  edges2 = g._edges
  for col_name in g._edges.columns:
    edges2 = drop_nan_col(edges2, col_name, prog)
  nodes2 = g._nodes
  for col_name in g._nodes.columns:
    nodes2 = drop_nan_col(nodes2, col_name, prog)
  return g.nodes(nodes2).edges(edges2)
  
graphistry.plotter.Plotter.drop_hyper_nans = drop_nan  

## Notebook intro:

### Jupyter
* Edit and run a code cell and see it's output: **shift-enter** or via the UI
* You can always edit it and rerun
* Best practice: Write in order as if a full program, so you can always restart and run from th top

### Google Colab
* Hit **Connect** on the top-right to start a running personal session  for this -- it is ready when it says *Connected*. 
* Run each *cell* of the notebook in sequence: either press the **play** button to the left of the cell, or select the cell and hit **shift-enter**.  Feel free to edit the cell, and rerun it (+ the likely . impacted cells below it.)
* Best practice: Write in order as if a full program, so you can always restart and run from the top


### Pandas
Most of the preprocessing code is `pandas`, the most popular Python data science tool (https://pandas.pydata.org ). Graphistry enterprise enables you to replace this kind of manual data wrangling code with shareable point-and-click solutions.

## Graphistry intro:

* Graphistry load below a cell whenever its code says  `...plot()`

* If you see a giant Graphistry logo over a gray background and nothing else, click the logo to start the Graphistry session

* UI Guide: https://labs.graphistry.com/graphistry/ui.html 

* Graphistry notebook examples: https://github.com/graphistry/pygraphistry

* Palettes: https://labs.graphistry.com/graphistry/docs/palette.html

In [0]:
df = splunkToPandas(
    """
    search index=corelight_tutorial 
    | dedup id.orig_h, id.resp_h, name 
    | fields - _* 
    | head 100
    """,
    {'sample_ratio': 10}) # Optional, means "sample 1 in 10"

print('# rows', len(df))
df.sample(3)

Search results:

results 71
fetching: 0 - 10000
# rows 71


Unnamed: 0,host,id.orig_h,id.resp_h,index,linecount,name,source,sourcetype,splunk_server,uid,size
45,splunk.graphistry.com,192.168.0.54,74.125.71.103,corelight_tutorial,1,possible_split_routing,logs.tar:./weird_20180803_16:37:08-16:40:00-07...,weird,splunk.graphistry.com,CnmCKnefJGrBoSkXb,
4,splunk.graphistry.com,192.168.0.53,192.168.0.1,corelight_tutorial,1,dns_unmatched_msg,logs.tar:./weird_20180803_16:37:08-16:40:00-07...,weird,splunk.graphistry.com,CTtsFjRZfz8mEKZOh,
36,splunk.graphistry.com,192.168.0.51,212.227.17.187,corelight_tutorial,1,data_before_established,logs.tar:./weird_20180803_16:37:08-16:40:00-07...,weird,splunk.graphistry.com,Cd8SbX5dSN6tvVLtb,


In [0]:
hg = graphistry.hypergraph(
    df, 
    ["id.orig_h", "id.resp_h", "name", "uid"], 
    direct=True,
    opts={
        'CATEGORIES': {
            'ip': ['id.orig_h', 'id.resp_h'] # combine repeats across columns into the same nodes
        }
    })
hg['graph'].plot()

# links 426
# events 71
# attrib entities 137


## 1. DNS Tunneling

### 1.A. DNS Map for in general

General query for looking at DNS connections with Bro/Zeek, and reducing it a bit ahead of time:
* Nodes are IPs
* Edges summarize all activity per IP<>IP

Demo: Summarize all activity across 10,000 IP<>IP pairs: max bytes, ...

In [0]:
dns_a_df = splunkToPandas("""

    search index="corelight_tutorial" sourcetype="conn" 

    | stats
    count(_time) as count,
    earliest(_time), latest(_time),
    values(answers{}) as answers,
    values(conn_state),
    values(history)
    values(issuer),
    values(ja3),
    values(last_alert),
    values(qtype_name),
    values(subject),
    max(*bytes), avg(*bytes),

    by id.orig_h, id.resp_h

    | eval duration_ms = last_time_ms - first_time_ms

    | head 10000

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_a_df))
dns_a_df.sample(3)

Search results:

results 10000
fetching: 0 - 10000
# rows 10000


Unnamed: 0,id.orig_h,id.resp_h,count,earliest(_time),latest(_time),values(conn_state),max(missed_bytes),max(orig_ip_bytes),max(resp_ip_bytes),avg(missed_bytes),avg(orig_ip_bytes),avg(resp_ip_bytes),values(history),max(orig_bytes),max(resp_bytes),avg(orig_bytes),avg(resp_bytes),answers,values(qtype_name),values(issuer),values(ja3),values(subject),values(last_alert)
2782,192.168.0.51,54.204.24.165,1,1533339487.644064,1533339487.644064,SF,0,1991,718,0,1991,718.0,ShADadfF,1463,242,1463,242.0,,,,,,
2818,192.168.0.51,54.230.97.87,1,1533339485.88874,1533339485.88874,SF,0,2390,1307,0,2390,1307.0,ShADadFf,1446,467,1446,467.0,,,,,,
8654,192.168.0.54,80.239.217.162,4,1533339487.423292,1533339487.427818,SF,0,3596,4185,0,1486,1692.5,"['ShADadFf', 'ShAFf']",2938,3593,1101,1370.5,,,,,,


In [0]:
hg = graphistry.hypergraph(
    dns_a_df, 
    ["id.orig_h", "id.resp_h"], ### "uid", "protocol", ....
    direct=True,
    opts=opts)

hg['graph'].plot()

# links 10000
# events 10000
# attrib entities 8771


### 1.B. DNS Tunnel:

Search for the top 10,000  ip->(unique dns query)->ip summaries matching tunneling heuristics:
1. length(query) > 25: exfil / command request
2. length(answer) > 45: received command 

Visualize:
* Nodes: IPs and queries
* Edges: Summaries along each  orig_h->query->resp_h->answer->orig_h

Results:
* uid: C3ApkJ3TwWW64DtnWb , CaAbvy2ureWe5sifRf
* ip: 10.0.2.30 10.0.2.20  34.215.241.13 192.168.1.128


In [0]:
dns_b_df = splunkToPandas("""

    search index="corelight_tutorial" sourcetype="conn"
    
    | eval query_length = length(query)
    | eval long_answers=mvfilter(length('answers{}') > 45)
    | eval long_answers_length = max(length(long_answers))
    | where query_length > 25 OR long_answers_length > 45


    | stats
    count(_time) as count,
    earliest(_time), latest(_time),
    values(answers{}) as answers,
    max(long_answers_length) as max_long_answers_length,
    values(conn_state),
    values(history)
    values(issuer),
    values(ja3),
    values(last_alert),
    values(subject),
    values(qtype_name),
    first(uid),

    max(*bytes), avg(*bytes),
    
    by id.orig_h, id.resp_h, query, query_length                                

    | eval duration_ms = last_time_ms - first_time_ms
    
    | eval query=substr(query,1,100)
    | eval max_query_or_answer_length = max(query_length, max_long_answers_length)
    | sort max_query_or_answer_length desc                                           

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_b_df))
dns_b_df.sample(3)

Search results:

results 10000
fetching: 0 - 50000
# rows 10000


Unnamed: 0,id.orig_h,id.resp_h,query,query_length,count,earliest(_time),latest(_time),answers,values(qtype_name),first(uid),max_query_or_answer_length,max_long_answers_length
6616,192.168.1.128,34.215.241.13,913b01a21f5c127a1a7b540cd24150bcdef5a4683b327e...,228,1,1533339541.68141,1533339541.68141,e9df01a21f4b40012781d8ffff18fe8a5f.sweetcoldwa...,,CaAbvy2ureWe5sifRf,228,53.0
972,192.168.1.128,34.215.241.13,14c401a21f67d55332d3772907b1fbe1881949605c29a7...,228,1,1533339541.834035,1533339541.834035,TXT 34 950301a21f7c67fa89e1d2ffff18fefdc1,TXT,CaAbvy2ureWe5sifRf,228,
1377,192.168.1.128,34.215.241.13,1e1d01a21f39a5d6a43154028f6be081a67e9ec01fa825...,228,1,1533339541.637895,1533339541.637895,0c2401a21f151e3bc094c7ffff18fe1a8b.sweetcoldwa...,MX,CaAbvy2ureWe5sifRf,228,53.0


In [0]:
hg = graphistry.hypergraph(
    dns_b_df, 
    ["id.orig_h", "id.resp_h", "query", "answers"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            'id.orig_h': ['query'],
            'query': ['id.resp_h'],
            'id.resp_h': ['answers'],
            'answers': ['id.orig_h']
        }})

hg['graph'].bind(edge_title='query').drop_hyper_nans().color_points_by('category').color_edges_by('max_query_or_answer_length', 'continuous').plot()

# links 40000
# events 10000
# attrib entities 19444


In [0]:
dns_b2_df = splunkToPandas("""

    search index="corelight_tutorial" 
    C3ApkJ3TwWW64DtnWb OR CaAbvy2ureWe5sifRf OR 10.0.2.30 OR 10.0.2.20  OR 34.215.241.13 OR 192.168.1.128
    | eval time=ts
    | rename answers{} as answers
    | fields *
    | fields - _*
                                   

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_b2_df))
dns_b2_df.sample(10)

Search results:

results 36352
fetching: 0 - 50000
# rows 36352


Unnamed: 0,date_hour,date_mday,date_minute,date_month,date_second,date_wday,date_year,date_zone,eventtype,host,id.orig_h,id.orig_p,id.resp_h,id.resp_p,index,linecount,name,notice,punct,source,sourcetype,splunk_server,splunk_server_group,time,timeendpos,timestartpos,ts,uid,unix_category,unix_group,conn_state,duration,history,local_orig,local_resp,missed_bytes,orig_bytes,orig_ip_bytes,orig_pkts,proto,resp_bytes,resp_ip_bytes,resp_pkts,AA,RA,RD,TC,Z,qclass,qclass_name,qtype,qtype_name,query,rejected,trans_id,TTLs{},answers,rcode,rcode_name,rtt,resp_cc,service,addl
966,23,3,39,august,2,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,56309,192.168.1.107,1124,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./conn_20180803_16:37:13-16:40:00-070...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.363126Z,34,7,2018-08-03T23:39:02.363126Z,CtkNIN26MXgsH8O8Q9,all_hosts,default,S0,,S,True,True,0.0,,44.0,1.0,tcp,,0.0,0.0,,,,,,,,,,,,,,,,,,,,
16620,23,3,39,august,2,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,56308,192.168.1.180,31337,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./conn_20180803_16:37:13-16:40:00-070...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.196686Z,34,7,2018-08-03T23:39:02.196686Z,C9XzcSPo0JRG14DDl,all_hosts,default,REJ,1.5e-05,Sr,True,True,0.0,0.0,44.0,1.0,tcp,0.0,40.0,1.0,,,,,,,,,,,,,,,,,,,,
16927,23,3,39,august,2,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,56308,192.168.1.139,6788,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./conn_20180803_16:37:13-16:40:00-070...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.193316Z,34,7,2018-08-03T23:39:02.193316Z,CPE0ro1CvEoTAe49g3,all_hosts,default,S0,,S,True,True,0.0,,44.0,1.0,tcp,,0.0,0.0,,,,,,,,,,,,,,,,,,,,
31592,23,3,39,august,1,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,62035,34.215.241.13,53,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./dns_20180803_16:36:44-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:01.695281Z,34,7,2018-08-03T23:39:01.695281Z,CaAbvy2ureWe5sifRf,all_hosts,default,,,,,,,,,,udp,,,,False,True,True,False,0.0,1.0,C_INTERNET,16.0,TXT,1dcc01fae690833eb48f0a041d8391c49e.sweetcoldwa...,False,46798.0,60.0,TXT 34 09af01fae6fe017aff5df5ffff028daf63,0.0,NOERROR,9e-06,,,
25947,23,3,39,august,1,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,62035,34.215.241.13,53,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./dns_20180803_16:36:44-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:01.832631Z,34,7,2018-08-03T23:39:01.832631Z,CaAbvy2ureWe5sifRf,all_hosts,default,,,,,,,,,,udp,,,,False,True,True,False,0.0,1.0,C_INTERNET,16.0,TXT,d6b901fae619bcf7cbfb4b05da0f542bda.sweetcoldwa...,False,55266.0,60.0,TXT 34 7c3801fae6c3d754b132daffff028daf63,0.0,NOERROR,2e-06,,,
12235,23,3,39,august,2,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,56308,192.168.1.148,6580,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./conn_20180803_16:37:13-16:40:00-070...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.241926Z,34,7,2018-08-03T23:39:02.241926Z,C5KAt622d177IwkIhc,all_hosts,default,REJ,4.7e-05,Sr,True,True,0.0,0.0,44.0,1.0,tcp,0.0,40.0,1.0,,,,,,,,,,,,,,,,,,,,
8856,23,3,39,august,2,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,56309,192.168.1.191,7800,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./conn_20180803_16:37:13-16:40:00-070...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.283136Z,34,7,2018-08-03T23:39:02.283136Z,CVE1qu27KX6e9e20N4,all_hosts,default,S0,,S,True,True,0.0,,44.0,1.0,tcp,,0.0,0.0,,,,,,,,,,,,,,,,,,,,
24146,23,3,39,august,1,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,62035,34.215.241.13,53,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./dns_20180803_16:36:44-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:01.856055Z,34,7,2018-08-03T23:39:01.856055Z,CaAbvy2ureWe5sifRf,all_hosts,default,,,,,,,,,,udp,,,,False,True,True,False,0.0,1.0,C_INTERNET,5.0,CNAME,91b501a21f5d165a30929a31a8981c9b99f763a89a93db...,False,25392.0,60.0,6ded01a21fc9181b3deec7ffff18fed7d7.sweetcoldwa...,0.0,NOERROR,3e-06,,,
19800,23,3,39,august,2,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,56309,192.168.1.101,1032,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./conn_20180803_16:37:13-16:40:00-070...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.157896Z,34,7,2018-08-03T23:39:02.157896Z,CCSten4hlE8xhaaID2,all_hosts,default,S0,,S,True,True,0.0,,44.0,1.0,tcp,,0.0,0.0,,,,,,,,,,,,,,,,,,,,
28928,23,3,39,august,1,friday,2018,0,nix-all-logs,splunk.graphistry.com,192.168.1.128,62035,34.215.241.13,53,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",logs.tar:./dns_20180803_16:36:44-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:01.740285Z,34,7,2018-08-03T23:39:01.740285Z,CaAbvy2ureWe5sifRf,all_hosts,default,,,,,,,,,,udp,,,,False,True,True,False,0.0,1.0,C_INTERNET,15.0,MX,5b0d01a21f45a14fe65c211c7a34df336b47fbe51d162f...,False,7632.0,60.0,83f801a21f8549af09fb40ffff18feca1f.sweetcoldwa...,0.0,NOERROR,4e-06,,,


In [0]:
hg = graphistry.hypergraph(
    dns_b2_df, 
    ["id.orig_h", "id.resp_h", "query", "answers", "uid", "history", "conn_state"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            'id.orig_h': ['query', "history"],
            'query': ['id.resp_h'],
            'id.resp_h': ['answers', "history"],
            'answers': ['id.orig_h'],
            #'uid': ['id.orig_h', 'query', 'id.resp_h', 'answers', 'history']
        }})

hg['graph'].bind(edge_title='conn_state').drop_hyper_nans().color_points_by('category').color_edges_by('category').plot()

# links 218112
# events 36352
# attrib entities 48381
Uploading 12914 kB. This may take a while...


In [0]:
hg = graphistry.hypergraph(
    dns_b2_df, 
    ["id.orig_h", "id.resp_h"], ### "uid", "protocol", ....
    direct=True,
    opts=opts)

hg['graph'].bind(edge_title='conn_state').drop_hyper_nans().color_points_by('category').color_edges_by('category').plot()

# links 36352
# events 36352
# attrib entities 32


# B. NTLM/SMB

In [0]:
ntlm_a_df = splunkToPandas("""

    search index="corelight_tutorial" 
        [ search index="corelight_tutorial" ntlm | dedup uid | fields + uid ]
    | fields *
                                   

    | head 1000

    """,
    {'sample_ratio': 1})

print('# rows', len(ntlm_a_df))
ntlm_a_df.sample(3)

NameError: ignored

In [0]:
hg = graphistry.hypergraph(
    ntlm_a_df, 
    ["id.orig_h", "name", "id.resp_h", "share_type", "path", "hostname", "domainname"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            "id.orig_h": ['name', 'id.resp_h', 'share_type', "hostname", "domainname"],
            "share_type": ['id.resp_h'],
            'path': ['name'],
            'hostname': ['id.resp_h'],
            'domainname': ['id.resp_h'],
            "name": ['id.resp_h',],
        }})
        

hg['graph'].bind(edge_title='name').drop_hyper_nans().color_points_by('category').color_edges_by('action').plot()

# links 460
# events 46
# attrib entities 32


## Mimetype Mismatch

* Entity of interest: index=main sourcetype=corelight* filename!=*.exe mime_type=application/x-dosexec

* Ahah: Files that aren't named with the proper extension. Can pivot off md5/Sha1/Sha256. Can track tx_host and rx_host.


# C. Certs

In [0]:
certs_a_df = splunkToPandas("""

    search index="corelight_tutorial" cert_chain_fuids{}=* 
    validation_status="certificate has expired" OR validation_status="self signed certificate" 
    OR validation_status ="self signed certificate in certificate chain"
    
    | fields *
    | fields - _*
                                   

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(certs_a_df))
certs_a_df.sample(10)

Search results:

results 5429
fetching: 0 - 50000
# rows 5429


Unnamed: 0,cert_chain_fuids{},cipher,curve,date_hour,date_mday,date_minute,date_month,date_second,date_wday,date_year,date_zone,established,eventtype,host,id.orig_h,id.orig_p,id.resp_h,id.resp_p,index,issuer,ja3,linecount,punct,resumed,server_name,source,sourcetype,splunk_server,splunk_server_group,subject,timeendpos,timestartpos,ts,uid,unix_category,unix_group,validation_status,version,last_alert,next_protocol
655,"['FAStTQ3W098SRj0bQg', 'FHzczp32yHsbZi0Wii', '...",TLS_RSA_WITH_RC4_128_SHA,,23,3,38,august,51,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.53,2105,98.137.201.232,443,corelight_tutorial,"CN=VeriSign Class 3 Secure Server CA - G3,OU=T...",de350869b8c85de67a350c8d186f11e6,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=yql.yahooapis.com,OU=Information Technology...",34,7,2018-08-03T23:38:51.569733Z,CRtpV31mjHAxO1XAR8,all_hosts,default,certificate has expired,TLSv10,,
1537,"['FitRNj3jUEv9tpsvDj', 'FnLwch2NN7H0REDfR7', '...",TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256,secp256r1,23,3,38,august,46,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.54,56910,94.245.107.146,443,corelight_tutorial,"CN=MSIT Machine Auth CA 2,DC=redmond,DC=corp,D...",2a458dd9c65afbcf591cd8c2a194b804,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,pipe.skype.com,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",CN=*.pipe.skype.com,34,7,2018-08-03T23:38:46.031009Z,CDCbba6875SH5Ohvd,all_hosts,default,certificate has expired,TLSv12,,
2458,"['FifDlaJKlvGZrlXLg', 'FpGKZfTB1YnNO5i0d']",TLS_RSA_WITH_RC4_128_SHA,,23,3,38,august,12,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.54,49582,2.23.148.90,443,corelight_tutorial,"CN=GeoTrust SSL CA - G4,O=GeoTrust Inc.,C=US",2a458dd9c65afbcf591cd8c2a194b804,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,static.skypeassets.com,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=www.skypeassets.com,O=Skype\, Inc.,L=Redmon...",34,7,2018-08-03T23:38:12.455644Z,Chr4c04UYoOImnTLCk,all_hosts,default,certificate has expired,TLSv12,,
3614,"['FPKTXw3a5MPfyb4Ei1', 'FumQSQ2WWmBIf08JW9', '...",TLS_RSA_WITH_3DES_EDE_CBC_SHA,,23,3,37,august,56,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.53,3466,212.227.111.53,443,corelight_tutorial,"CN=thawte SSL CA - G2,O=thawte\, Inc.,C=US",de350869b8c85de67a350c8d186f11e6,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=navigator-bs.gmx.com,O=1&1 Mail & Media Inc...",34,7,2018-08-03T23:37:56.725666Z,Cf8bOA2H0cWEzViqt3,all_hosts,default,certificate has expired,TLSv10,,
1242,"['FfJ3hC4GrOBHpxqDYh', 'FHyTAH19LVSflISlkk']",TLS_RSA_WITH_RC4_128_SHA,,23,3,38,august,49,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.53,3135,134.170.99.245,443,corelight_tutorial,"CN=Microsoft IT SSL SHA2,OU=Microsoft IT,O=Mic...",de350869b8c85de67a350c8d186f11e6,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",CN=urs.microsoft.com,34,7,2018-08-03T23:38:49.235066Z,CNioN93B28VSnUN9m5,all_hosts,default,certificate has expired,TLSv10,,
1673,"['FJfbkk42ziGRG2PBBj', 'Fi3ypFGaeaKepoH1', 'Fd...",TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,secp256r1,23,3,38,august,42,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.54,62978,64.233.163.108,993,corelight_tutorial,"CN=Google Internet Authority G2,O=Google Inc,C=US",1d095e68489d3c535297cd8dffb06cb9,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,imap.gmail.com,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=imap.gmail.com,O=Google Inc,L=Mountain View...",34,7,2018-08-03T23:38:42.772173Z,Cvetvv4tB1N8kRjpHk,all_hosts,default,certificate has expired,TLSv10,,
5301,"['FFfi7R3fdTlVrA8Lbg', 'FFxPZv2OnIyn79AsMf']",TLS_RSA_WITH_RC4_128_SHA,,23,3,37,august,18,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.51,36679,63.140.58.182,443,corelight_tutorial,"CN=COMODO SSL CA,O=COMODO CA Limited,L=Salford...",aa7f5e2ada5d7bb8a7dceed01f5ffd7c,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,www90.intel.com,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=www90.intel.com,OU=COMODO SSL,OU=Issued thr...",34,7,2018-08-03T23:37:18.338638Z,CZdEsx34YWVF0DMh75,all_hosts,default,certificate has expired,TLSv12,,
3093,"['FltAnz2rRn88mpdbre', 'FAuACb3xlzVwXAa8gh', '...",TLS_RSA_WITH_RC4_128_MD5,,23,3,38,august,5,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.51,59757,198.72.114.197,443,corelight_tutorial,"CN=COMODO SSL CA,O=COMODO CA Limited,L=Salford...",aa7f5e2ada5d7bb8a7dceed01f5ffd7c,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,ips-invite.iperceptions.com,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=*.iperceptions.com,OU=COMODO SSL Wildcard,O...",34,7,2018-08-03T23:38:05.667542Z,CkcOAW2utnbI7gpmz4,all_hosts,default,certificate has expired,TLSv12,,
2559,"['FYZVaC3QRz25TT7EDa', 'FpbCsX1IESZinqwxua']",TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,secp256r1,23,3,38,august,12,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.54,49182,173.252.106.17,5222,corelight_tutorial,"CN=DigiCert High Assurance CA-3,OU=www.digicer...",06207a1730b5deeb207b0556e102ded2,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=chat.facebook.com,O=Facebook\, Inc.,L=Menlo...",34,7,2018-08-03T23:38:12.133663Z,CpRYj54TuE0Rfd0QAc,all_hosts,default,certificate has expired,TLSv10,,
375,"['FAmNXC4NA4z9sJkBIh', 'FinYXt6KYY8811E9e', 'F...",TLS_RSA_WITH_RC4_128_MD5,,23,3,38,august,52,friday,2018,0,True,nix-all-logs,splunk.graphistry.com,192.168.0.53,3443,23.78.121.92,443,corelight_tutorial,"CN=Verizon Akamai SureServer CA G14-SHA1,OU=Cy...",de350869b8c85de67a350c8d186f11e6,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,,logs.tar:./ssl_20180803_16:37:08-16:40:00-0700...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=*.adobe.com,OU=IS,O=Adobe Systems Incorpora...",34,7,2018-08-03T23:38:52.329258Z,C6QYbZQ6DtvrwKYM3,all_hosts,default,certificate has expired,TLSv10,,


In [0]:
hg = graphistry.hypergraph(
    certs_a_df, 
    ["id.orig_h", "id.resp_h", "uid", "ja3", "issuer", "subject"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            'id.orig_h': ["id.resp_h", "ja3", "subject"],
            'ja3': ['id.resp_h'],
            "subject": ['id.resp_h'],
            'issuer': ['id.resp_h']
        }})

hg['graph'].bind(edge_title='conn_state').drop_hyper_nans().color_points_by('category').color_edges_by('validation_status').plot()

# links 32574
# events 5429
# attrib entities 6647


