In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from IPython.display import display, Markdown

In [27]:
display(Markdown("### Dictionary with column names for various Zeek log types"))

zeek_cols = {'capture_loss': ['ts', 'ts_delta', 'peer', 'gaps', 'acks', 'percent_lost'],
 'conn': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h',
        'id.resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
        'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history',
        'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
        'tunnel_parents'],
 'dhcp': ['ts', 'uids', 'client_addr', 'server_addr', 'mac',
        'host_name', 'client_fqdn', 'domain', 'requested_addr', 'assigned_addr',
        'lease_time', 'client_message', 'server_message', 'msg_types',
        'duration'],
 'dns': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h',
        'id.resp_p', 'proto', 'trans_id', 'rtt', 'query', 'qclass',
        'qclass_name', 'qtype', 'qtype_name', 'rcode', 'rcode_name', 'AA', 'TC',
        'RD', 'RA', 'Z', 'answers', 'TTLs', 'rejected'],
 'files': ['ts', 'fuid', 'tx_hosts', 'rx_hosts', 'conn_uids', 'source',
        'depth', 'analyzers', 'mime_type', 'filename', 'duration', 'local_orig',
        'is_orig', 'seen_bytes', 'total_bytes', 'missing_bytes',
        'overflow_bytes', 'timedout', 'parent_fuid', 'md5', 'sha1', 'sha256',
        'extracted', 'extracted_cutoff', 'extracted_size'],
 'http': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h',
        'id.resp_p', 'trans_depth', 'method', 'host', 'uri', 'referrer',
        'version', 'user_agent', 'origin', 'request_body_len',
        'response_body_len', 'status_code', 'status_msg', 'info_code',
        'info_msg', 'tags', 'username', 'password', 'proxied', 'orig_fuids',
        'orig_filenames', 'orig_mime_types', 'resp_fuids', 'resp_filenames',
        'resp_mime_types'],
 'notice': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h',
        'id.resp_p', 'fuid', 'file_mime_type', 'file_desc', 'proto', 'note',
        'msg', 'sub', 'src', 'dst', 'p', 'n', 'peer_descr', 'actions',
        'email_dest', 'suppress_for', 'remote_location.country_code',
        'remote_location.region', 'remote_location.city',
        'remote_location.latitude', 'remote_location.longitude'],
 'ntp': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h',
        'id.resp_p', 'version', 'mode', 'stratum', 'poll', 'precision',
        'root_delay', 'root_disp', 'ref_id', 'ref_time', 'org_time', 'rec_time',
        'xmt_time', 'num_exts'],
 'ssl': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h',
        'id.resp_p', 'version', 'cipher', 'curve', 'server_name', 'resumed',
        'last_alert', 'next_protocol', 'established', 'cert_chain_fuids',
        'client_cert_chain_fuids', 'subject', 'issuer', 'client_subject',
        'client_issuer', 'validation_status'],
 'stats': ['ts', 'peer', 'mem', 'pkts_proc', 'bytes_recv',
        'pkts_dropped', 'pkts_link', 'pkt_lag', 'events_proc', 'events_queued',
        'active_tcp_conns', 'active_udp_conns', 'active_icmp_conns',
        'tcp_conns', 'udp_conns', 'icmp_conns', 'timers', 'active_timers',
        'files', 'active_files', 'dns_requests', 'active_dns_requests',
        'reassem_tcp_size', 'reassem_file_size', 'reassem_frag_size',
        'reassem_unknown_size'],
 'weird': ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h',
        'id.resp_p', 'name', 'addl', 'notice', 'peer', 'source'],
 'x509': ['ts', 'id', 'certificate.version', 'certificate.serial',
        'certificate.subject', 'certificate.issuer',
        'certificate.not_valid_before', 'certificate.not_valid_after',
        'certificate.key_alg', 'certificate.sig_alg', 'certificate.key_type',
        'certificate.key_length', 'certificate.exponent', 'certificate.curve',
        'san.dns', 'san.uri', 'san.email', 'san.ip', 'basic_constraints.ca',
        'basic_constraints.path_len']
        }
display(Markdown(f"""**Zeek column types:**  
{list(zeek_cols.keys())}"""))

### Dictionary with column names for various Zeek log types

**Zeek column types:**  
['capture_loss', 'conn', 'dhcp', 'dns', 'files', 'http', 'notice', 'ntp', 'ssl', 'stats', 'weird', 'x509']

In [28]:
df = pd.read_csv("data/conn.log", sep="\t", skiprows=8, names=zeek_cols["conn"], low_memory=False)
display(df.sample(5))

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents
541997,1556823691.453735,CYFF0c12i3gwp4FXJ4,192.168.10.43,50025.0,192.168.49.50,7002.0,tcp,-,0.000501,0,...,REJ,-,-,0.0,Sr,1.0,44.0,1.0,40.0,-
596161,1556827216.418389,CYcC354DgEd9QoSVT7,192.168.10.43,8.0,192.168.61.254,0.0,icmp,-,1.302140,0,...,OTH,-,-,0.0,-,2.0,56.0,0.0,0.0,-
735215,1556838144.503411,CUYgcxMx1W6qHL6ek,192.168.10.50,64044.0,192.168.61.20,800.0,tcp,-,-,-,...,S0,-,-,0.0,S,1.0,44.0,0.0,0.0,-
1279769,1556888983.977802,ChAuu117DayxViufp,10.200.200.80,13.0,172.16.60.185,14.0,icmp,-,0.112221,24,...,OTH,-,-,0.0,-,2.0,80.0,0.0,0.0,-
88638,1556767727.970694,CNdvoh2xjeqGL4wWX7,177.41.86.159,50479.0,192.168.61.21,23.0,tcp,-,20.370510,121,...,RSTO,-,-,0.0,ShAdDafR,33.0,1453.0,27.0,1299.0,-


In [30]:
df = pd.concat([df, pd.get_dummies(df['proto'], prefix='proto')], axis=1)
display(df.sample(5))

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,proto_icmp,proto_tcp,proto_udp,proto_icmp.1,proto_tcp.1,proto_udp.1
181142,1556816837.82753,CgcL6w2CDegBLycmCi,192.168.10.43,20390.0,92.232.245.148,22.0,tcp,-,-,-,...,40.0,0.0,0.0,-,False,True,False,False,True,False
1078326,1556866901.62088,Ck5sYw1Etl4HBMMMt9,192.168.10.50,6099.0,192.168.61.50,31038.0,tcp,-,0.000189,0,...,44.0,1.0,40.0,-,False,True,False,False,True,False
101776,1556780279.354144,CT7OmaL6DzzNnrmqf,192.168.61.21,44781.0,8.8.8.8,53.0,udp,dns,0.011960,44,...,72.0,1.0,132.0,-,False,False,True,False,False,True
436380,1556817679.156135,CxFb9z1TrG2mkgCAk1,192.168.10.43,19622.0,165.13.109.162,23.0,tcp,-,-,-,...,40.0,0.0,0.0,-,False,True,False,False,True,False
1155993,1556874050.447611,CO6QIWSBHQjL1fJvb,192.168.10.43,40926.0,192.168.10.50,787.0,tcp,-,-,-,...,44.0,0.0,0.0,-,False,True,False,False,True,False
