# Simple DNS Anomaly Detection Lab

Hands-on notebook demonstrating:
1. Load DNS dataset
2. Statistical Analysis (Z-score, IQR)
3. ML Anomaly Detection (Isolation Forest, One-Class SVM)
4. Detection Rules (Sigma, IDS/Firewall examples)


## Step 1: Load dataset

In [13]:
import pandas as pd
DATA_PATH = r"./data/zeek_dns_export.csv"
df = pd.read_csv(DATA_PATH)
print("Rows, Cols:", df.shape)
print("Columns:", df.columns.tolist()[:20], "...")
df.head()


Rows, Cols: (7075, 146)
Columns: ['@timestamp', '_id', '_ignored', '_index', '_score', 'agent.ephemeral_id', 'agent.hostname', 'agent.id', 'agent.name', 'agent.type', 'agent.version', 'client.address', 'destination.address', 'destination.bytes', 'destination.ip', 'destination.packets', 'destination.port', 'dns.answers.data', 'dns.answers.ttl', 'dns.header_flags'] ...


Unnamed: 0,@timestamp,_id,_ignored,_index,_score,agent.ephemeral_id,agent.hostname,agent.id,agent.name,agent.type,...,zeek.stats.packets.processed,zeek.stats.packets.received,zeek.stats.peer,zeek.stats.reassembly_size.file,zeek.stats.reassembly_size.frag,zeek.stats.reassembly_size.tcp,zeek.stats.reassembly_size.unknown,zeek.stats.timers.active,zeek.stats.timers.count,zeek.stats.timestamp_lag
0,"Sep 28, 2025 @ 23:10:32.037",pwEqkpkBvNUpndSauFsx,-,.ds-filebeat-8.19.4-2025.09.20-000001,-,45c2e6bb-35a3-4b53-8972-d70c2b483d2e,logmonitor,a7f056b0-7095-48c4-b0bf-641edc9206b7,logmonitor,filebeat,...,-,-,-,-,-,-,-,-,-,-
1,"Sep 28, 2025 @ 23:10:31.339",nwEpkpkBvNUpndSa5lte,-,.ds-filebeat-8.19.4-2025.09.20-000001,-,45c2e6bb-35a3-4b53-8972-d70c2b483d2e,logmonitor,a7f056b0-7095-48c4-b0bf-641edc9206b7,logmonitor,filebeat,...,-,-,-,-,-,-,-,-,-,-
2,"Sep 28, 2025 @ 23:10:31.338",ngEpkpkBvNUpndSa5lte,-,.ds-filebeat-8.19.4-2025.09.20-000001,-,45c2e6bb-35a3-4b53-8972-d70c2b483d2e,logmonitor,a7f056b0-7095-48c4-b0bf-641edc9206b7,logmonitor,filebeat,...,-,-,-,-,-,-,-,-,-,-
3,"Sep 28, 2025 @ 23:10:31.336",nQEpkpkBvNUpndSa5lte,-,.ds-filebeat-8.19.4-2025.09.20-000001,-,45c2e6bb-35a3-4b53-8972-d70c2b483d2e,logmonitor,a7f056b0-7095-48c4-b0bf-641edc9206b7,logmonitor,filebeat,...,-,-,-,-,-,-,-,-,-,-
4,"Sep 28, 2025 @ 23:10:31.335",nAEpkpkBvNUpndSa5lte,-,.ds-filebeat-8.19.4-2025.09.20-000001,-,45c2e6bb-35a3-4b53-8972-d70c2b483d2e,logmonitor,a7f056b0-7095-48c4-b0bf-641edc9206b7,logmonitor,filebeat,...,-,-,-,-,-,-,-,-,-,-


In [20]:
# Normalize Zeek DNS field names if present
rename_map = {
    'source.ip': 'src_ip',
    'destination.ip': 'dst_ip',
    'zeek.dns.query': 'query',
    'zeek.dns.qtype_name': 'qtype',
    'zeek.dns.rcode': 'rcode'
}
for k,v in rename_map.items():
    if k in df.columns and v not in df.columns:
        df = df.rename(columns={k:v})

df['query'] = df.get('query', pd.Series(index=df.index, dtype='object')).fillna('').astype(str)
df['query_len'] = df['query'].astype(str).str.len()

print(df[['src_ip','query','query_len']].head(10))

# Print top entries by query_len in descending order
print("\nTop 10 queries by query_len (descending):")
print(df.sort_values('query_len', ascending=False)[['src_ip','query','query_len']].head(10))


                    src_ip          query  query_len
0  fe80::b18:3d6b:309a:2ca              -          1
1  fe80::b18:3d6b:309a:2ca  ech-rog.local         13
2             192.168.56.1  ech-rog.local         13
3  fe80::b18:3d6b:309a:2ca  ech-rog.local         13
4             192.168.56.1  ech-rog.local         13
5  fe80::b18:3d6b:309a:2ca  ech-rog.local         13
6             192.168.56.1  ech-rog.local         13
7  fe80::b18:3d6b:309a:2ca  ech-rog.local         13
8  fe80::b18:3d6b:309a:2ca              -          1
9             192.168.56.1  ech-rog.local         13

Top 10 queries by query_len (descending):
            src_ip                                              query  \
1856  192.168.56.2  1vfwf3xhfo30xfy7b.fbubneo.wly6jmhkvrejvyl3.gh1...   
4549  192.168.56.2  hftq9wt9ra28lad0v.x5fg1orylpuquhg.8qipopbx2vzn...   
2281  192.168.56.2  ofmiwuv6xtqh3w18h0.6aqswr7ffvj.uor8dvwxejrk4.m...   
1985  192.168.56.2  fvyt1x1m1thw70.65spl8kn.0ppiwfq7oux6b8ez9.u7qb...   
3525  192

## Step 2: Statistical Analysis (Z-score, IQR)

In [15]:
from scipy import stats
import numpy as np

# group per src_ip
agg = df.groupby('src_ip').agg(
    queries_per_ip=('query','count'),
    avg_query_len=('query_len','mean')
).reset_index()

# Z-score on queries_per_ip
agg['zscore'] = stats.zscore(agg['queries_per_ip'])
agg['zscore_flag'] = agg['zscore'].abs() > 3

# IQR
Q1 = agg['queries_per_ip'].quantile(0.25)
Q3 = agg['queries_per_ip'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
agg['iqr_flag'] = (agg['queries_per_ip'] < lower) | (agg['queries_per_ip'] > upper)

agg.sort_values('queries_per_ip', ascending=False).head(10)


Unnamed: 0,src_ip,queries_per_ip,avg_query_len,zscore,zscore_flag,iqr_flag
2,192.168.56.2,6097,18.547482,2.996114,False,True
9,fe80::b18:3d6b:309a:2ca,306,8.254902,-0.223201,False,False
3,192.168.56.3,193,5.580311,-0.286019,False,False
1,192.168.56.1,182,13.197802,-0.292134,False,False
6,fe80::3aca:a7f1:9332:a384,130,7.8,-0.321042,False,False
5,192.168.56.5,69,1.0,-0.354953,False,False
4,192.168.56.4,54,1.0,-0.363292,False,False
0,-,38,1.0,-0.372186,False,False
8,fe80::a064:1e98:9a0a:734a,4,1.0,-0.391088,False,False
7,fe80::7c59:f6c4:36cf:4cd4,2,1.0,-0.392199,False,False


## Step 3: ML Anomaly Detection (Isolation Forest, One-Class SVM)

In [16]:
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

features = ['queries_per_ip','avg_query_len']
X = agg[features].fillna(0).values
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Isolation Forest
if_model = IsolationForest(contamination=0.05, random_state=42)
agg['if_flag'] = if_model.fit_predict(X_scaled) == -1
agg['if_score'] = -if_model.decision_function(X_scaled)

# One-Class SVM
ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.05)
agg['ocsvm_flag'] = ocsvm.fit_predict(X_scaled) == -1
agg['ocsvm_score'] = -ocsvm.decision_function(X_scaled)

agg[['src_ip','queries_per_ip','if_score','if_flag','ocsvm_score','ocsvm_flag']].head(10)


Unnamed: 0,src_ip,queries_per_ip,if_score,if_flag,ocsvm_score,ocsvm_flag
0,-,38,-0.241127,False,-0.000796,False
1,192.168.56.1,182,-0.114946,False,0.000125,True
2,192.168.56.2,6097,0.094047,True,0.000125,True
3,192.168.56.3,193,-0.173074,False,-0.002061,False
4,192.168.56.4,54,-0.252563,False,-0.001044,False
5,192.168.56.5,69,-0.216388,False,-0.001244,False
6,fe80::3aca:a7f1:9332:a384,130,-0.19091,False,-0.001846,False
7,fe80::7c59:f6c4:36cf:4cd4,2,-0.222369,False,-0.000103,False
8,fe80::a064:1e98:9a0a:734a,4,-0.225714,False,-0.000146,False
9,fe80::b18:3d6b:309a:2ca,306,-0.141196,False,-0.000147,False


## Step 4: Detection Rules

Examples of exporting anomaly detection into rules.

In [17]:
import yaml, numpy as np

# Derive threshold (95th percentile of queries_per_ip)
thr = int(np.percentile(agg['queries_per_ip'], 95))

# Sigma rule
sigma = {
  'title': 'Excessive DNS Queries Per Host',
  'id': 'dns-excess-queries-baseline',
  'description': 'Detect hosts with unusually high DNS queries per IP',
  'logsource': {'product':'zeek','service':'dns'},
  'detection': {'selection': {'queries_per_ip': {'min': thr}}, 'condition': 'selection'},
  'level': 'high'
}
with open('./data/sigma_dns_example.yml','w') as f:
    yaml.safe_dump(sigma,f)
print("Sigma rule written to /mnt/data/sigma_dns_example.yml")

# IDS/firewall example (pseudo):
print("\nIDS/Firewall example:")
print(f"alert dns any any -> any any (msg:\"Excessive DNS Queries\"; threshold: type both, track by_src, count {thr}, seconds 3600; sid:1000001;)")


Sigma rule written to /mnt/data/sigma_dns_example.yml

IDS/Firewall example:
alert dns any any -> any any (msg:"Excessive DNS Queries"; threshold: type both, track by_src, count 3491, seconds 3600; sid:1000001;)
