# Threat Hunting Masterclass: Three data science notebooks for finding bad actors in your network logs

More info: https://www.graphistry.com/blog/zeek-masterclass

To get start,load logs.tar into your Splunk and replace the index used below, or call it index `corelight_tutorial`: https://data.world/graphistry/networkforensics . From there, follow the cells below.

## Purpose

These tutorials cover multiple useful topic areas:

* **Hunts**: Sample queries & visualizations for looking at encrypted traffic, DNS tunneling, network shares & logins, & file obfuscation. 
* **Data types**: Network logs around TLS, DNS, NTLM, SMB, and more
* **Methodologies**: Data science notebooks, SIEM queries, visual graph analytics
* **Tools**: Jupyter, Splunk, Bro/Zeek/Corelight, and Graphistry

## Configure

* If you are using Graphistry Marketplace, leave `GRAPHISTRY` unedited, else, uncomment and fill it in
* Fill in `SPLUNK`. Make sure the user has capabilities for REST API access and reading the index in which you put `logs.tar`

In [None]:
!pip install python-dotenv graphistry pandas -q

import os
from dotenv import load_dotenv

# Load environment variables from .env file (if it exists)
load_dotenv()

# Configuration options (in order of precedence):
# 1. Manual configuration (highest priority) - uncomment to override
# 2. Environment variables  
# 3. .env file (lowest priority)

GRAPHISTRY_CONFIG = {
    # Uncomment and modify any values below to override environment variables:
    # 'api': 3,
    # 'username': 'your_username',
    # 'password': 'your_password', 
    # 'protocol': 'https',
    # 'server': 'hub.graphistry.com'
}

# Load configuration from environment with fallbacks
GRAPHISTRY = {
    'api': GRAPHISTRY_CONFIG.get('api', int(os.getenv('GRAPHISTRY_API', '3'))),
    'username': GRAPHISTRY_CONFIG.get('username', os.getenv('GRAPHISTRY_USERNAME')),
    'password': GRAPHISTRY_CONFIG.get('password', os.getenv('GRAPHISTRY_PASSWORD')),
    'protocol': GRAPHISTRY_CONFIG.get('protocol', os.getenv('GRAPHISTRY_PROTOCOL', 'https')),
    'server': GRAPHISTRY_CONFIG.get('server', os.getenv('GRAPHISTRY_SERVER', 'hub.graphistry.com'))
}

# Remove None values
GRAPHISTRY = {k: v for k, v in GRAPHISTRY.items() if v is not None}

# Splunk configuration - fill in your details
SPLUNK = {
    'host': 'SPLUNK.MYSITE.COM',
    'scheme': 'https',
    'port': 8089,
    'username': 'corelight_tutorial',
    'password': 'MY_SPLUNK_PWD'   
}

In [None]:
GRAPHISTRY = {
    "api":3,
    "personal_key_id": "YOUR_KEY_ID",
    "personal_key_secret": "YOUR_SECRET",
    "server": "hub.graphistry.com"
}

## Imports

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import sys
import numpy as np
import math
np.set_printoptions(threshold=sys.maxsize)

import re

import graphistry

if GRAPHISTRY.get('username') and GRAPHISTRY.get('password'):
    graphistry.register(**GRAPHISTRY)
    print("✅ Registered with Graphistry successfully")
    print(f"   Server: {GRAPHISTRY.get('server', 'hub.graphistry.com')}")
    print(f"   Username: {GRAPHISTRY.get('username', 'N/A')}")
else:
    print("⚠️  Graphistry credentials not found.")
    print("   Please configure using one of the methods described above.")
    print("   The notebook will continue but visualizations may not work.")

In [3]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import sys
import numpy as np
import math
np.set_printoptions(threshold=sys.maxsize)

import re

import graphistry
graphistry.register(**GRAPHISTRY)

<graphistry.pygraphistry.GraphistryClient at 0x7801fde3bec0>

In [4]:
import splunklib
import splunklib.client as client
import splunklib.results as results

service = client.connect(**SPLUNK)

## Helpers

def safe_log(v):
  try:
     v2 = float(v)
     return math.log(round(v2) + 1) if not np.isnan(v2) else 0
  except:
    return 0
  
  
# Convert bytes to log of numbers
# Running this twice is safe (idempotent)
# Returns a copy (no mutation of the original)
def log_of_bytes(df):
  df2 = df.copy()
  for c in [c for c in df.columns if re.match('.*bytes.*', c) and not re.match(r'log\(.*', c)]:
    df2['log(' + c + ')'] = df[c].apply(safe_log)      
  return df2

In [5]:
def safe_log(v):
  try:
     v2 = float(v)
     return math.log(round(v2) + 1) if not np.isnan(v2) else 0
  except:
    return 0
  
  
# Convert bytes to log of numbers
# Running this twice is safe (idempotent)
# Returns a copy (no mutation of the original)
def log_of_bytes(df):
  df2 = df.copy()
  for c in [c for c in df.columns if re.match('.*bytes.*', c) and not re.match('log\(.*', c)]:
    df2['log(' + c + ')'] = df[c].apply(safe_log)      
  return df2

  for c in [c for c in df.columns if re.match('.*bytes.*', c) and not re.match('log\(.*', c)]:


### Splunk
* Query splunk, with optional args like sampleRate
* Automatically paginate when result split over multiple responses
* Return as a Pandas dataframe (Note: treats all cols as strings)

In [11]:
STEP=5000
def splunkToPandas(qry, overrides={}):
    kwargs_blockingsearch = {
        "count": 0,
        "earliest_time": "2010-01-24T07:20:38.000-05:00",
        "latest_time": "now",
        "search_mode": "normal",
        "exec_mode": "blocking",
        **overrides
    }

    # Key fix: ensure output_mode=json here
    job = service.jobs.create(qry, output_mode="json", **kwargs_blockingsearch)
    resultCount = int(job["resultCount"])
    offset = 0
    all_data = []

    print(f"Search results: {resultCount}")

    while offset < resultCount:
        print(f"Fetching: {offset} - {offset + STEP}")
        kwargs_paginate = {
            "count": STEP,
            "offset": offset,
            "output_mode": "json"
        }

        blocksearch_results = job.results(**kwargs_paginate)
        reader = results.JSONResultsReader(blocksearch_results)

        batch = [event for event in reader if isinstance(event, dict)]
        all_data.extend(batch)
        offset += STEP

    df = pd.DataFrame(all_data)

    for c in df.columns:
        df[c] = df[c].astype(str)

    return df


### Bro/Zeek

Useful bindings for hypergraphs

In [7]:
categories = {
    'ip': ['id.orig_h', 'id.resp_h']
}

opts={
    'CATEGORIES': categories 
}

### Graphistry

In [8]:
##Extend graphistry.plotter.Plotter to add chainable method "my+graph.color_points_by('some_column_name')..." (and "color_edges_by")

import graphistry.plotter

def color_col_by_categorical(df, type_col):
  types = list(df[type_col].unique())
  type_to_color = {t: i for (i, t) in enumerate(types)}
  return df[type_col].apply(lambda t: type_to_color[t])

def color_col_by_continuous(df, type_col):
  mn = df[type_col].astype(float).min()
  mx = df[type_col].astype(float).max()
  if mx - mn < 0.000001:
    print('warning: too small values for color_col_by_continuous')
    return color_col_by_categorical(df, type_col)
  else:
    print('coloring for range', mn, mx)
  return df[type_col].apply(lambda v: 228010 - round(10 * (float(v) - mn)/(mx - mn) ))
  

## g * str * 'categorical' | 'continuous' -> g
def color_points_by(g, type_col, kind='categorical'):
  fn = color_col_by_categorical if kind == 'categorical' else color_col_by_continuous
  colors = fn(g._nodes, type_col)
  return g.nodes( g._nodes.assign(point_color=colors) ).bind(point_color='point_color')

## g * str * 'categorical' | 'continuous' -> g
def color_edges_by(g, type_col, kind='categorical'):
  fn = color_col_by_categorical if kind == 'categorical' else color_col_by_continuous
  colors = fn(g._edges, type_col)
  return g.edges( g._edges.assign(edge_color=colors) ).bind(edge_color='edge_color')

graphistry.plotter.Plotter.color_points_by = color_points_by
graphistry.plotter.Plotter.color_edges_by = color_edges_by

In [9]:
## remove node/edges pointing to "*::nan" values
def safe_not_nan(prog, v):
  try: 
    return not prog.match(v)
  except:
    return True
  
def drop_nan_col(df, col, prog):
  not_nans = df[col].apply(lambda v: safe_not_nan(prog, v))
  return df[ not_nans == True ]
  
def drop_nan(g, edges = ['src', 'dst'], nodes = ['nodeID']):
  prog = re.compile(".*::nan$")
  edges2 = g._edges
  for col_name in g._edges.columns:
    edges2 = drop_nan_col(edges2, col_name, prog)
  nodes2 = g._nodes
  for col_name in g._nodes.columns:
    nodes2 = drop_nan_col(nodes2, col_name, prog)
  return g.nodes(nodes2).edges(edges2)
  
graphistry.plotter.Plotter.drop_hyper_nans = drop_nan  

## Notebook intro:

## What are notebooks & why

Notebooks and their code ecosystem does a few things at the technical level:
* Web-based UI that exposes a paired Python shell session: a super terminal
* Write code, run it, see results, try again, and save your session
* Quickly connect to databases and wrangle data using the `pydata` Python ecosystem

Top and big teams are adopting notebook environments like Jupyter to solve some key problems:
* Individual advanced individuals use them for accessing the increasingly dominant Python ecosystem
  * Fast: Use at the beginning of a project for rapid analysis & rapid prototyping
  * Smart: Easiest way to use most machine learning tools
* Teams use them as a way to collaborate: 
  * Share executable investigations for one-offs
  * Lightweight automation:  investigation plays & rule/model analyses
  * Training



### Jupyter
* Edit and run a code cell and see it's output: **shift-enter** or via the UI
* You can always edit it and rerun
* Best practice: Write in order as if a full program, so you can always restart and run from th top

### Google Colab
* Hit **Connect** on the top-right to start a running personal session  for this -- it is ready when it says *Connected*. 
* Run each *cell* of the notebook in sequence: either press the **play** button to the left of the cell, or select the cell and hit **shift-enter**.  Feel free to edit the cell, and rerun it (+ the likely . impacted cells below it.)
* Best practice: Write in order as if a full program, so you can always restart and run from the top


### Pandas
Most of the preprocessing code is `pandas`, the most popular Python data science tool (https://pandas.pydata.org ). Graphistry enterprise enables you to replace this kind of manual data wrangling code with shareable point-and-click solutions.

## Graphistry intro:

* Graphistry loads below in every cell that says  `...plot()`

* If you see a giant Graphistry logo over a gray background and nothing else, click the logo to start the Graphistry session

* UI Guide: https://hub.graphistry.com/docs/ui/index/

* Graphistry notebook examples: https://github.com/graphistry/pygraphistry

* [Colors](https://github.com/graphistry/pygraphistry/blob/master/demos/more_examples/graphistry_features/encodings-colors.ipynb)


Try changing "`... | head 100`"  to  "`... | head 10000`"!

In [12]:
df = splunkToPandas(
    """
search index=corelight_tutorial 
| dedup id.orig_h, id.resp_h, name 
| table uid id.orig_h id.resp_h name 
| head 100
""",
    {'sample_ratio': 10}) # Optional, means "sample 1 in 10"

print('# rows', len(df))
df.sample(3)

Search results: 73
Fetching: 0 - 5000
# rows 73


Unnamed: 0,uid,id.orig_h,id.resp_h,name
59,Cg9lHg3DsPYSEp87i6,192.168.0.53,192.168.0.1,dns_unmatched_reply
20,CY24OW2LAYMT2QjBEf,192.168.0.54,108.160.167.35,data_before_established
11,CbSLLe2PMAkU8BUBpi,192.168.0.51,213.155.151.150,data_before_established


In [13]:
# For demo, making all plots public.

graphistry.privacy(mode="public")

In [14]:
hg = graphistry.hypergraph(
    df, 
    ["id.orig_h", "id.resp_h", "name", "uid"], 
    direct=True,
    opts={
        'CATEGORIES': {
            'ip': ['id.orig_h', 'id.resp_h'] # combine repeats across columns into the same nodes
        }
    })
hg['graph'].plot()

# links 438
# events 73
# attrib entities 138


# 1. Hunting through encrypted traffic
* **Motivation**: Internal and perimeter traffic is increasingly encrypted, but we still need to look at it for reasons including auditing encryption hygiene and understanding disguised malicious traffic.
* **Input:** SSL logs
* **Methodology**
  * Search for expired, self-signed, internal CAs, old TLS versions, ...
  * Map out & investigate
    * Work through combos of `version` TLS 1.2 (old) and `validation` search
    * Look for funny issuers, subjects
    * Use JA3 to fingerprint &  whitelist good TLS; then just focus on non-JA3
* **Insights**
  * 1: Clear clusters of TLS version hygiene issues across the various users & applications
  * 2: One cluster is signed... Obama?!?! 
  
* **Generalize**
   * Build whitelist of JA3 and look for violators
   * For unknown certs, characterize nature of activity based on behavior like periodic beaconing, heavy back-and-forth (tunnel), heavy data movement (exfil), ...
   * Map structure of certs in general: services -> certs -> authorities

In [15]:

#optional - add:     OR (version=* AND version != TLSv12)   

certs_a_df = splunkToPandas("""

    search index="corelight_tutorial" cert_chain_fuids{}=* 
    validation_status="certificate has expired" 
    OR validation_status="self signed certificate" 
    OR validation_status ="self signed certificate in certificate chain"
    
    
    | fields *
    | fields - _*
                                   

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(certs_a_df))
certs_a_df.sample(10)

Search results: 5429
Fetching: 0 - 5000
Fetching: 5000 - 10000
# rows 5429


Unnamed: 0,cert_chain_fuids{},cipher,curve,date_hour,date_mday,date_minute,date_month,date_second,date_wday,date_year,date_zone,established,host,id.orig_h,id.orig_p,id.resp_h,id.resp_p,index,issuer,ja3,linecount,punct,resumed,source,sourcetype,splunk_server,splunk_server_group,subject,timeendpos,timestartpos,ts,uid,validation_status,version,server_name,next_protocol,last_alert
4161,"['Fqx5Sk1AYbuGv1toF3', 'FabRJ61BxafTjg1SHa', '...",TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,secp256r1,23,14,38,february,18,friday,2025,0,True,splunk.graphistry.com,192.168.0.54,61529,108.160.167.175,443,corelight_tutorial,"CN=Go Daddy Secure Certificate Authority - G2,...",8d0230b6ce881f161d1875364f4a156b,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=*.dropbox.com,OU=Domain Control Validated",34,18,2018-08-03T23:38:18.796898Z,CG1lF65E0RwG3bv75,certificate has expired,TLSv10,,,
683,"['FwjILy3taZBAPh6sVi', 'FcKu4D1pfpBvNRCbF4', '...",TLS_RSA_WITH_3DES_EDE_CBC_SHA,,23,9,37,july,49,wednesday,2025,0,True,splunk.graphistry.com,192.168.0.53,1678,217.72.201.130,443,corelight_tutorial,"CN=thawte SSL CA - G2,O=thawte\, Inc.,C=US",de350869b8c85de67a350c8d186f11e6,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=3c-bs.gmx.com,O=1&1 Mail & Media Inc.,L=Che...",34,18,2018-08-03T23:37:49.938525Z,CqILfRyd7VPTXQeb9,certificate has expired,TLSv10,,,
3583,"['FqiaqT2iBRe9fDhgD1', 'FrtBBp16lvpDY6aPNh']",TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,secp384r1,23,14,38,february,46,friday,2025,0,True,splunk.graphistry.com,192.168.0.54,57476,64.4.61.94,443,corelight_tutorial,"CN=Microsoft IT SSL SHA2,OU=Microsoft IT,O=Mic...",06207a1730b5deeb207b0556e102ded2,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",CN=*.gateway.messenger.live.com,34,18,2018-08-03T23:38:46.396124Z,CxI5Ni25P8fKKIyDXa,certificate has expired,TLSv10,,,
1963,"['Fmoam8lgG3fJH9hzh', 'FJUhV1aeezOahOpii']",TLS_DHE_RSA_WITH_AES_128_CBC_SHA,,23,9,37,july,18,wednesday,2025,0,True,splunk.graphistry.com,192.168.0.51,34390,63.245.217.20,443,corelight_tutorial,"CN=DigiCert SHA2 Secure Server CA,O=DigiCert I...",ce694315cbb81ce95e6ae4ae8cbafde6,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=static-san.mozilla.org,O=Mozilla Foundation...",34,18,2018-08-03T23:37:18.250476Z,C5WDaE4EbcIYQfkyLb,certificate has expired,TLSv12,live.mozillamessaging.com,,
4490,"['FR35Dk4yZpN9bgx3Pc', 'FfBVQ52tMzrdpurg0i']",TLS_RSA_WITH_RC4_128_SHA,,23,14,38,february,12,friday,2025,0,True,splunk.graphistry.com,192.168.0.54,50186,2.23.132.158,443,corelight_tutorial,"CN=GeoTrust SSL CA - G4,O=GeoTrust Inc.,C=US",2a458dd9c65afbcf591cd8c2a194b804,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=www.skypeassets.com,O=Skype\, Inc.,L=Redmon...",34,18,2018-08-03T23:38:12.609952Z,CcQizhZVrSVEsJMz,certificate has expired,TLSv12,static.skypeassets.com,,
5394,"['F7AKbYgGDYjqMeIIc', 'FFVEPe1eLPbzc57U6', 'Fh...",TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,secp256r1,23,14,38,february,3,friday,2025,0,True,splunk.graphistry.com,192.168.0.54,59940,216.58.209.129,443,corelight_tutorial,"CN=Google Internet Authority G2,O=Google Inc,C=US",e03fdb6b99211ce6d1ed8a21abf4b25b,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=*.googleusercontent.com,O=Google Inc,L=Moun...",34,18,2018-08-03T23:38:03.766636Z,CR0jwp4BfmSkd9jg7l,certificate has expired,TLSv12,s2.googleusercontent.com,,
3031,"['FkURzA3lnvLD9uLIP4', 'FSgiu444Nr8mgrRGI3']",TLS_RSA_WITH_RC4_128_SHA,,23,14,38,february,50,friday,2025,0,True,splunk.graphistry.com,192.168.0.53,4217,157.55.239.247,443,corelight_tutorial,"CN=Microsoft IT SSL SHA2,OU=Microsoft IT,O=Mic...",de350869b8c85de67a350c8d186f11e6,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",CN=urs.microsoft.com,34,18,2018-08-03T23:38:50.098917Z,CrZZXoNGKXlS9OBf1,certificate has expired,TLSv10,,,
902,"['Frc21R1ZGhtC80B7g', 'FABXetnjdX7rnBodk']",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,secp256r1,23,9,37,july,47,wednesday,2025,0,True,splunk.graphistry.com,192.168.0.54,49790,54.72.42.191,443,corelight_tutorial,"CN=RapidSSL CA,O=GeoTrust\, Inc.,C=US",0a7d2a1f4e376ba050fdcc5fd6b59021,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=*.wtp101.com,OU=Domain Control Validated - ...",34,18,2018-08-03T23:37:47.526753Z,CiWfUq3dko8hEkhCC7,certificate has expired,TLSv12,www.wtp101.com,,
1698,"['FjsOQZ2nyUckgewCtg', 'F5lywt3SUFBifAXhid', '...",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,secp256r1,23,9,37,july,33,wednesday,2025,0,True,splunk.graphistry.com,192.168.0.54,52290,216.58.209.129,443,corelight_tutorial,"CN=Google Internet Authority G2,O=Google Inc,C=US",0a7d2a1f4e376ba050fdcc5fd6b59021,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=tpc.googlesyndication.com,O=Google Inc,L=Mo...",34,18,2018-08-03T23:37:33.528232Z,CJG5F3emXLRu6uqvi,certificate has expired,TLSv12,tpc.googlesyndication.com,h2-14,
1286,"['FCuIV61dVOGul24zB9', 'FF9oXa4bIWM5aGWKSe', '...",TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,secp256r1,23,9,37,july,44,wednesday,2025,0,True,splunk.graphistry.com,192.168.0.54,49167,216.58.209.138,443,corelight_tutorial,"CN=Google Internet Authority G2,O=Google Inc,C=US",6062f6c7c72e5cf557cc9698f4f31fce,1,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",False,/datadrive/splunk/var/log/corelight-tutorial/s...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...","CN=*.googleapis.com,O=Google Inc,L=Mountain Vi...",34,18,2018-08-03T23:37:44.636431Z,CI825GxULSxHD4OX4,certificate has expired,TLSv12,www.googleapis.com,h2-14,


### The graph:
* **Nodes**: IPs, ja3 (TLS metadata hashes), cert subject/issuers, colored by category
* **Edges**: Color by TLS version, title by issuer

In [16]:
hg = graphistry.hypergraph(
    certs_a_df, 
    ["id.orig_h", "id.resp_h", "uid", "ja3", "issuer", "subject"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            'id.orig_h': ["id.resp_h", "ja3", "subject"],
            'ja3': ['id.resp_h'],
            "subject": ['id.resp_h'],
            'issuer': ['id.resp_h']
        }})

hg['graph'].bind(edge_title='category').drop_hyper_nans().color_points_by('category').color_edges_by('version').plot()

# links 32574
# events 5429
# attrib entities 6648


# 2. Hunting Insider Threats with NTLM+SMB

* **Motivation**:  NTLM (NT Lan Manager) logins are suspicious, especially on senstive data shares, worth auditing.
* **Input**: NTLM + SMB + other network logs (for any other IPs/activities)
* **Methodology**:
  * Seed search by NTLM activity
  * Get all other logs involving those UIDs
  * Map & audit

* **Insights**:
  * Cluster 1: Sonos smart speakers seem to be opening network-shared data that have nothing to do with listening to music
  * Cluster 2: Second cluster -- same `Workgroup` domain name, yet on a rogue host
  
* **Generalize**:
  * Map & audit NTLM and other remote logins to beginwith
  * From those hits, look at other file shares beyond SMB - dropbox, wiki's, ...
  * Map & audit file shares in general


In [17]:
ntlm_a_df = splunkToPandas("""

    search index="corelight_tutorial" 
        [ search index="corelight_tutorial" ntlm | dedup uid | fields + uid  ]
    | fields * 
                                   

    | head 1000

    """,
    {'sample_ratio': 1})

print('# rows', len(ntlm_a_df))
ntlm_a_df.sample(3)

Search results: 46
Fetching: 0 - 5000
# rows 46


Unnamed: 0,action,date_hour,date_mday,date_minute,date_month,date_second,date_wday,date_year,date_zone,host,id.orig_h,id.orig_p,id.resp_h,id.resp_p,index,linecount,name,punct,size,source,sourcetype,splunk_server,splunk_server_group,timeendpos,times.accessed,times.changed,times.created,times.modified,timestartpos,ts,uid,_bkt,_cd,_indextime,_raw,_serial,_si,_sourcetype,_subsecond,_time,native_file_system,path,service,share_type,domainname,hostname,status,success,username,eventtype,tag,tag::eventtype,_eventtype_color,actions{},dropped,dst,msg,note,p,peer_descr,proto,src,suppress_for,conn_state,duration,history,local_orig,local_resp,missed_bytes,orig_bytes,orig_ip_bytes,orig_pkts,resp_bytes,resp_ip_bytes,resp_pkts,orig_cc
19,SMB::FILE_OPEN,23,9,39,july,2,wednesday,2025,0,splunk.graphistry.com,172.16.1.8,38896,172.16.1.7,445,corelight_tutorial,1,\hack\jpg.string,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",12801.0,/datadrive/splunk/var/log/corelight-tutorial/s...,smb_files-too_small,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",34,2018-07-24T17:56:05.524403Z,2018-07-24T17:56:05.524403Z,2018-07-24T17:56:05.524403Z,2018-07-24T17:56:05.524403Z,18,2018-08-03T23:39:02.812722Z,COGaRD3cM7jP2XFdy8,corelight_tutorial~0~67A851F4-1BFE-4874-B653-8...,0:1460368,1753827598,"{""ts"":""2018-08-03T23:39:02.812722Z"",""uid"":""COG...",19,"['splunk.graphistry.com', 'corelight_tutorial']",smb_files-too_small,0.812722,2025-07-10T01:39:02.812+02:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,SMB::FILE_OPEN,23,9,39,july,2,wednesday,2025,0,splunk.graphistry.com,172.16.1.8,38896,172.16.1.7,445,corelight_tutorial,1,\hack\jpg.jpg,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",61292.0,/datadrive/splunk/var/log/corelight-tutorial/s...,smb_files-too_small,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",34,2018-07-24T17:56:04.824403Z,2018-07-24T17:56:04.832403Z,2018-07-24T17:56:04.824403Z,2018-07-24T17:56:04.832403Z,18,2018-08-03T23:39:02.908827Z,COGaRD3cM7jP2XFdy8,corelight_tutorial~0~67A851F4-1BFE-4874-B653-8...,0:1460514,1753827598,"{""ts"":""2018-08-03T23:39:02.908827Z"",""uid"":""COG...",7,"['splunk.graphistry.com', 'corelight_tutorial']",smb_files-too_small,0.908827,2025-07-10T01:39:02.908+02:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
41,,23,8,39,december,2,wednesday,2021,0,splunk.graphistry.com,172.16.1.8,38891,172.16.1.7,445,corelight_tutorial,1,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",,/datadrive/splunk/var/log/corelight-tutorial/c...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",34,,,,,18,2018-08-03T23:39:02.792963Z,Co7dkb3VZW4JUWlYV5,corelight_tutorial~2~67A851F4-1BFE-4874-B653-8...,2:1187676,1753827610,"{""ts"":""2018-08-03T23:39:02.792963Z"",""uid"":""Co7...",41,"['splunk.graphistry.com', 'corelight_tutorial']",conn,0.792963,2021-12-09T00:39:02.792+01:00,,,"smb,gssapi,ntlm",,,,,,,,,,,,,,,,,,tcp,,,SF,8.5e-05,ShADadFf,False,False,0.0,886.0,1310.0,8.0,506.0,826.0,6.0,


### The graph:

Focus on representing NTLM, SMB, and generic Bro/Zeek logs.

* **Nodes**: IPs, domains/hosts/usernames, files/paths, colored by category
* **Edges**: Zeek events connecting them, colored by username

In [18]:
hg = graphistry.hypergraph(
    ntlm_a_df, 
    ["id.orig_h", "name", "id.resp_h", "path", "hostname", "domainname", "username"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            "username": ['id.orig_h'],
            "id.orig_h": ['name', 'id.resp_h',  "hostname", "domainname"],       
            'path': ['name'],
            'hostname': ['id.resp_h'],
            'domainname': ['id.resp_h'],
            "name": ['id.resp_h'],
            "id.resp_h": ['username']
        }})
        

hg['graph'].bind(edge_title='name').drop_hyper_nans().color_points_by('category').color_edges_by('username').plot()

# links 460
# events 46
# attrib entities 36


# 3. DNS Tunneling

### 3.A. Setup -- General DNS map 

General query for looking at DNS connections with Bro/Zeek. 

Instead of showing each connection, summarize all activities across each unique 10,000 IP<>IP pairs: max bytes, first/last communication, ...

For UI work, compute the `log(..)` of bytes

In [20]:
dns_a_df = splunkToPandas("""

    search index="corelight_tutorial" sourcetype="conn"
    
    | eval total_bytes = orig_ip_bytes + resp_ip_bytes
    | eval log_total_bytes = log(orig_ip_bytes + resp_ip_bytes)

    | stats
    count(_time) as count,
    earliest(_time), latest(_time),
    values(answers{}) as answers,
    values(conn_state),
    values(history)
    values(issuer),
    values(ja3),
    values(last_alert),
    values(qtype_name),
    values(subject),
    max(*bytes), avg(*bytes), sum(*bytes),

    by id.orig_h, id.resp_h

    | eval duration_ms = last_time_ms - first_time_ms

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_a_df))
dns_a_df.sample(3)

Search results: 13399
Fetching: 0 - 5000
Fetching: 5000 - 10000
Fetching: 10000 - 15000
# rows 13399


Unnamed: 0,id.orig_h,id.resp_h,count,earliest(_time),latest(_time),values(conn_state),max(log_total_bytes),max(missed_bytes),max(orig_ip_bytes),max(resp_ip_bytes),max(total_bytes),avg(log_total_bytes),avg(missed_bytes),avg(orig_ip_bytes),avg(resp_ip_bytes),avg(total_bytes),sum(log_total_bytes),sum(missed_bytes),sum(orig_ip_bytes),sum(resp_ip_bytes),sum(total_bytes),values(history),max(orig_bytes),max(resp_bytes),avg(orig_bytes),avg(resp_bytes),sum(orig_bytes),sum(resp_bytes)
8223,192.168.0.54,73.170.185.232,2,1639006703.513396,1752104303.51339,SF,2.838849090737255,0,370,320,690,2.555345348636877,0,236,202.5,438.5,5.110690697273754,0,472,405,877,"['Dd', 'ShADadFf']",78,29,62,28.5,124,57
8849,192.168.0.54,83.149.41.40,1,1752104294.487483,1752104294.487483,S0,2.0,0,100,0,100,2.0,0,100,0.0,100.0,2.0,0,100,0,100,S,0,0,0,0.0,0,0
9697,192.168.0.54,95.42.110.200,2,1639006700.978599,1752104300.978589,S0,2.1818435879447726,0,152,0,152,1.93154241266018,0,100,0.0,100.0,3.86308482532036,0,200,0,200,"['D', 'S']",0,0,0,0.0,0,0


### Graph demo

* Nodes are IPs
* Edges summarize all activity per IP<>IP: first time, ... 
  * Color by total bytes in/out

In [21]:
hg = graphistry.hypergraph(
    dns_a_df, 
    ["id.orig_h", "id.resp_h"], ### "uid", "protocol", ....
    direct=True,
    opts=opts)

hg['graph'].color_points_by('category').color_edges_by('max(log_total_bytes)', 'continuous').bind(edge_title='max(total_bytes)').plot()

# links 13399
# events 13399
# attrib entities 11801
coloring for range 1.591064607026499 8.528258188610675


### 3.B. DNS Tunnel:

* **Motivation**: DNS is a sneaky channel for hiding activity. Need to detect & unravel, whether proactive or post-breach.

* **Input**: DNS connections

* **Methodology**: 

 * Search for the top 10,000  ip->(unique dns query)->ip summaries matching tunneling heuristics:
   *  length(query) > 25: exfil / command request
   *  length(answer) > 45: received command 
 * Inspect & explain all flagged behavior
   * Pay attention to long and artificial looking queries & answers
   * Exfil: big or many queries
   * Command: strange responses
   * Tunneling: heavy back-and-forth

* **Insights**
  * Two clusters of activity
  * One seems to be tunneling: back-and-forth
  * The other seems to not have answers
  
 
 * **Generalize**
 
 The hunt continues on the identified UIDs and IPs for demo purposes, but does not reveal much. What can you find?
 
 On a full SIEM:
  * Check periodicity (timebar) for bot vs human
  * Combine with endpoint logs to correlate proceses, file accesses, and users
  * Combine with alert logs to trace back to initial breach and subsequent behavior

In [22]:
dns_b_df = splunkToPandas("""

    search index="corelight_tutorial" sourcetype="dns-2"
    
    | eval total_bytes = orig_ip_bytes + resp_ip_bytes
    | eval log_total_bytes = log(orig_ip_bytes + resp_ip_bytes)

    | eval query_length = length(query)
    | eval long_answers=mvfilter(length('answers{}') > 45)
    | eval long_answers_length = max(length(long_answers))
    | where query_length > 25 OR long_answers_length > 45


    | stats
    count(_time) as count,
    earliest(_time), latest(_time),
    values(answers{}) as answers,
    max(long_answers_length) as max_long_answers_length,
    values(conn_state),
    values(history)
    values(issuer),
    values(ja3),
    values(last_alert),
    values(subject),
    max(*bytes), avg(*bytes), sum(*bytes),
    values(qtype_name),
    first(uid),

    max(*bytes), avg(*bytes), sum(*bytes),
    
    by id.orig_h, id.resp_h, query, query_length                                

    | eval duration_ms = last_time_ms - first_time_ms
    
    | eval query=substr(query,1,100)
    | eval max_query_or_answer_length = max(query_length, max_long_answers_length)
    | sort max_query_or_answer_length desc                                           

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_b_df))
dns_b_df.sample(3)

Search results: 10000
Fetching: 0 - 5000
Fetching: 5000 - 10000
# rows 10000


Unnamed: 0,id.orig_h,id.resp_h,query,query_length,count,earliest(_time),latest(_time),answers,values(qtype_name),first(uid),max_query_or_answer_length,max_long_answers_length
2695,192.168.1.128,34.215.241.13,3ba601a21f9d75cae406311fef410ee3862363c26f435f...,228,1,1639006741.79844,1639006741.79844,68f501a21f8dc4b0e7423dffff18fee307.sweetcoldwa...,MX,CaAbvy2ureWe5sifRf,228,53
7625,192.168.1.128,34.215.241.13,a79301a21fd2003f4201660c9f407d0ba92aebcc4eeb83...,228,1,1639006741.680896,1639006741.680896,2a8901a21f20aa11843cfaffff18fe653d.sweetcoldwa...,,CaAbvy2ureWe5sifRf,228,53
7493,192.168.1.128,34.215.241.13,a4ca01a21fb84122d311fe19da1f5616ce725bf3935bf9...,228,1,1639006741.727527,1639006741.727527,4ca301a21fa137a2275d72ffff18fed5a5.sweetcoldwa...,CNAME,CaAbvy2ureWe5sifRf,228,53


### The graph:

* **Nodes**: IPs, queries, answers
* **Edges**: Summaries along each  orig_h->query->resp_h->answer->orig_h

**Results**:
* **UIDs**: C3ApkJ3TwWW64DtnWb , CaAbvy2ureWe5sifRf
* **IPs**: 10.0.2.30 10.0.2.20  34.215.241.13 192.168.1.128


In [23]:
hg = graphistry.hypergraph(
    dns_b_df, 
    ["id.orig_h", "id.resp_h", "query", "answers"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            'id.orig_h': ['query'],
            'query': ['id.resp_h'],
            'id.resp_h': ['answers'],
            'answers': ['id.orig_h']
        }})

g = hg['graph'].bind(edge_title='query').drop_hyper_nans().color_points_by('category').color_edges_by('max_query_or_answer_length', 'continuous')

g.plot()

# links 40000
# events 10000
# attrib entities 19445
coloring for range 228.0 252.0


### Dig into interesting UIDs and IPs 1: IP map
* Surface IPs interacted with
* What log types are available 
  * `sourcetype`s: `conn` and `weird`

In [24]:
dns_b2_df = splunkToPandas("""

    search index="corelight_tutorial" 
    C3ApkJ3TwWW64DtnWb OR CaAbvy2ureWe5sifRf OR 10.0.2.30 OR 10.0.2.20  OR 34.215.241.13 OR 192.168.1.128
    | eval time=ts
    | rename answers{} as answers
    | fields *
    | fields - _*
                                   

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_b2_df))
dns_b2_df.sample(3)

Search results: 36352
Fetching: 0 - 5000
Fetching: 5000 - 10000
Fetching: 10000 - 15000
Fetching: 15000 - 20000
Fetching: 20000 - 25000
Fetching: 25000 - 30000
Fetching: 30000 - 35000
Fetching: 35000 - 40000
# rows 36352


Unnamed: 0,date_hour,date_mday,date_minute,date_month,date_second,date_wday,date_year,date_zone,host,id.orig_h,id.orig_p,id.resp_h,id.resp_p,index,linecount,name,notice,punct,source,sourcetype,splunk_server,splunk_server_group,time,timeendpos,timestartpos,ts,uid,addl,conn_state,duration,history,local_orig,local_resp,missed_bytes,orig_bytes,orig_ip_bytes,orig_pkts,proto,resp_bytes,resp_ip_bytes,resp_pkts,AA,RA,RD,TC,Z,qclass,qclass_name,qtype,qtype_name,query,rejected,trans_id,TTLs{},answers,rcode,rcode_name,rtt,resp_cc,service
3439,23,8,39,december,2,wednesday,2021,0,splunk.graphistry.com,192.168.1.128,56308,192.168.1.139,4443,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",/datadrive/splunk/var/log/corelight-tutorial/c...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.339922Z,34,18,2018-08-03T23:39:02.339922Z,C6jDxd1F9k7DRiuRA1,,REJ,5.1e-05,Sr,True,True,0.0,0.0,44.0,1.0,tcp,0.0,40.0,1.0,,,,,,,,,,,,,,,,,,,
7300,23,8,39,december,2,wednesday,2021,0,splunk.graphistry.com,192.168.1.105,1039,192.168.1.128,56308,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",/datadrive/splunk/var/log/corelight-tutorial/c...,conn,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:02.300674Z,34,18,2018-08-03T23:39:02.300674Z,CFQz0L1Y7GPJkRZtBd,,RSTOS0,,R,True,True,0.0,,40.0,1.0,tcp,,0.0,0.0,,,,,,,,,,,,,,,,,,,
33466,23,8,39,december,1,wednesday,2021,0,splunk.graphistry.com,192.168.1.128,62035,34.215.241.13,53,corelight_tutorial,1,,,"{"""":""--::."","""":"""",""."":""..."",""."":,""."":""..."","".""...",/datadrive/splunk/var/log/corelight-tutorial/d...,dns-2,splunk.graphistry.com,"['dmc_group_cluster_master', 'dmc_group_deploy...",2018-08-03T23:39:01.667083Z,34,18,2018-08-03T23:39:01.667083Z,CaAbvy2ureWe5sifRf,,,,,,,,,,,udp,,,,False,True,True,False,0.0,1.0,C_INTERNET,15.0,MX,ba2001a21f51a0795f0fb709a842c0c7423c26071707fc...,False,145.0,60.0,7feb01a21f805a26080da9ffff18fe7a33.sweetcoldwa...,0.0,NOERROR,7e-06,,


In [25]:
hg = graphistry.hypergraph(
    dns_b2_df, 
    ["id.orig_h", "id.resp_h"], ### "uid", "protocol", ....
    direct=True,
    opts=opts)

hg['graph'].bind(edge_title='sourcetype').drop_hyper_nans().color_points_by('category').color_edges_by('sourcetype').plot()

# links 36352
# events 36352
# attrib entities 32


### Dig into interesting UIDs and IPs 2: Mostly just connections, so inspect from that perspective
* Reuse DNS query from before

In [26]:
dns_b3_df = splunkToPandas("""

    search index="corelight_tutorial" sourcetype="dns-2"
    C3ApkJ3TwWW64DtnWb OR CaAbvy2ureWe5sifRf OR 10.0.2.30 OR 10.0.2.20  OR 34.215.241.13 OR 192.168.1.128
    
    | eval total_bytes = orig_ip_bytes + resp_ip_bytes
    | eval log_total_bytes = log(orig_ip_bytes + resp_ip_bytes)

    | eval query_length = length(query)
    | eval long_answers=mvfilter(length('answers{}') > 45)
    | eval long_answers_length = max(length(long_answers))
    | where query_length > 25 OR long_answers_length > 45


    | stats
    count(_time) as count,
    earliest(_time), latest(_time),
    values(answers{}) as answers,
    max(long_answers_length) as max_long_answers_length,
    values(conn_state),
    values(history)
    values(issuer),
    values(ja3),
    values(last_alert),
    values(subject),
    max(*bytes), avg(*bytes), sum(*bytes),
    values(qtype_name),
    first(uid),

    max(*bytes), avg(*bytes), sum(*bytes),
    
    by id.orig_h, id.resp_h, query, query_length                               

    | eval duration_ms = last_time_ms - first_time_ms
    
    | eval query=substr(query,1,100)
    | eval max_query_or_answer_length = max(query_length, max_long_answers_length)
    | sort max_query_or_answer_length desc                                           

    | head 50000

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_b3_df))
dns_b3_df.sample(3)

Search results: 10000
Fetching: 0 - 5000
Fetching: 5000 - 10000
# rows 10000


Unnamed: 0,id.orig_h,id.resp_h,query,query_length,count,earliest(_time),latest(_time),answers,values(qtype_name),first(uid),max_query_or_answer_length,max_long_answers_length
4836,192.168.1.128,34.215.241.13,6ad301a21fdf70af44d345284fe366d976ef399381e80e...,228,1,1639006741.832416,1639006741.832416,4ed101a21f69f323c2a42bffff18feb3f1.sweetcoldwa...,,CaAbvy2ureWe5sifRf,228,53
5211,192.168.1.128,34.215.241.13,731c01a21fab55b70f6ccd0a0ecc50946bd6f605799380...,228,1,1639006741.669395,1639006741.669395,853801a21fe024bd1e9ad9ffff18fe9877.sweetcoldwa...,MX,CaAbvy2ureWe5sifRf,228,53
1841,192.168.1.128,34.215.241.13,28ab01a21f5dc1685be27f0babf011f8830ec58888da9d...,228,1,1639006741.677097,1639006741.677097,ca1301a21fafc6527dd87effff18fe1735.sweetcoldwa...,CNAME,CaAbvy2ureWe5sifRf,228,53


In [27]:
hg = graphistry.hypergraph(
    dns_b3_df, 
    ["id.orig_h", "id.resp_h", "query", "answers", "first(uid)"], ### "uid", "protocol", ....
    direct=True,
    opts={
        **opts,
        'EDGES': {
            'id.orig_h': ['query'],
            'query': ['id.resp_h'],
            'id.resp_h': ['answers'],
            'answers': ['id.orig_h']
        }})

hg['graph'].bind(edge_title='query').drop_hyper_nans().color_points_by('category').color_edges_by('max_query_or_answer_length', 'continuous').plot()

# links 40000
# events 10000
# attrib entities 19447
coloring for range 228.0 252.0


# 4. Mimetype Mismatch

* **Motivation**: When following an incident or doing a sweep, a common case is executable files hiding behind  extensions like ".jpeg", and brings into question the UIDs of all entities involved

* **Data**: Multiple. Ex:

* **Methodology**: 
  * Entity of interest: `index=main sourcetype=corelight* filename!=*.exe mime_type=application/x-dosexec`
  * Files that aren't named with the proper extension. Can pivot off md5/Sha1/Sha256. Can track tx_host and rx_host.

* **Insight**: 


In [28]:
mime_df = splunkToPandas("""

    search index=corelight_tutorial filename!=*.exe mime_type=application/x-dosexec                                         

    | head 200

    """,
    {'sample_ratio': 1})

print('# rows', len(dns_b3_df))
dns_b3_df.sample(3)

Search results: 5
Fetching: 0 - 5000
# rows 10000


Unnamed: 0,id.orig_h,id.resp_h,query,query_length,count,earliest(_time),latest(_time),answers,values(qtype_name),first(uid),max_query_or_answer_length,max_long_answers_length
880,192.168.1.128,34.215.241.13,12ca01a21fb477d69012f9126e3cdc6ffc24a3a83023ac...,228,1,1639006741.694151,1639006741.694151,TXT 34 b26501a21f3ed6a854a517ffff18fe6b6f,TXT,CaAbvy2ureWe5sifRf,228,
4401,192.168.1.128,34.215.241.13,60ce01a21fcc7f5332d67608cb12cbe64f3f521b28fb4d...,228,1,1639006741.66285,1639006741.66285,591801a21ff7605fae8fafffff18fe2cf5.sweetcoldwa...,CNAME,CaAbvy2ureWe5sifRf,228,53.0
8563,192.168.1.128,34.215.241.13,bc9101a21f27d3494a7ffc28e4398fdef38b5632fe0317...,228,1,1639006741.833688,1639006741.833688,aa0201a21fcb82345d685fffff18fee10f.sweetcoldwa...,CNAME,CaAbvy2ureWe5sifRf,228,53.0
