In [1]:
import pandas as pd 
import netutils as net
import utils as ut

In [2]:
# Goal: enumerate every IP address that is present in the environment.
# The logs produced include: conn.log, dhcp.log, dns.log, ssl.log, http.log
# We start with the connection log, which holds one JSON record per line.
conn_log_path = 'unsw/day_logs/conn.log'
conn_df = pd.read_json(conn_log_path, lines=True)
conn_columns = conn_df.columns.values
print(f'The Columns are {conn_columns}')

# Preview the first few records (kept as the last expression so it is displayed).
conn_df.head()

The Columns are ['ts' 'uid' 'id.orig_h' 'id.orig_p' 'id.resp_h' 'id.resp_p' 'proto'
 'service' 'duration' 'orig_bytes' 'resp_bytes' 'conn_state'
 'missed_bytes' 'history' 'orig_pkts' 'orig_ip_bytes' 'resp_pkts'
 'resp_ip_bytes' 'orig_l2_addr' 'resp_l2_addr']


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,orig_l2_addr,resp_l2_addr
0,1474726000.0,COK7PAJLfH51Z8sMj,192.168.1.166,53716,89.30.121.150,80,tcp,http,1.166575,1846.0,290.0,SF,0,ShADdaFf,8,2270,5,558,00:24:e4:11:18:a8,14:cc:20:51:33:ea
1,1474726000.0,Cmq5BD4abGJE9M6Dn6,192.168.1.166,45136,192.168.1.1,53,udp,dns,0.284601,37.0,242.0,SF,0,Dd,1,65,1,270,00:24:e4:11:18:a8,14:cc:20:51:33:ea
2,1474726000.0,CqS2E81hhnbbAo7xH9,192.168.1.120,37616,192.168.1.1,53,udp,dns,0.000366,33.0,49.0,SF,0,Dd,1,61,1,77,18:b7:9e:02:20:44,14:cc:20:51:33:ea
3,1474726000.0,C3mgb9JSnpUFR1cK4,192.168.1.193,3872,192.168.1.249,49152,tcp,http,0.043867,186.0,1170.0,SF,0,ShADadfF,5,454,5,1438,ec:1a:59:83:28:11,00:16:6c:ab:6b:88
4,1474726000.0,CqjW6D4N17JsNNyk4a,192.168.1.193,4024,192.168.1.223,49153,tcp,http,0.044333,183.0,3617.0,SF,0,ShADadfF,7,555,7,3989,ec:1a:59:83:28:11,ec:1a:59:79:f4:89


In [3]:
# Next, explore what the DHCP log tells us about the system.
# Every client that wants a local address contacts the DHCP server,
# although some machines may obtain their address without DHCP.
# Read the DHCP log (one JSON record per line) into memory.

dhcp_log_path = 'unsw/day_logs/dhcp.log'
dhcp_df = pd.read_json(dhcp_log_path, lines=True)
dhcp_columns = dhcp_df.columns.values
print(f'The Columns are {dhcp_columns}')

# Preview the first few records (kept as the last expression so it is displayed).
dhcp_df.head()

The Columns are ['ts' 'uids' 'client_addr' 'server_addr' 'mac' 'assigned_addr'
 'lease_time' 'msg_types' 'duration' 'subnet_mask' 'routers' 'dns_servers'
 'requested_addr' 'domain' 'host_name']


Unnamed: 0,ts,uids,client_addr,server_addr,mac,assigned_addr,lease_time,msg_types,duration,subnet_mask,routers,dns_servers,requested_addr,domain,host_name
0,1474726000.0,"[CIOVRi3LzvT6RirLt5, CkrHcZ1y43zvJOWL4i]",192.168.1.112,192.168.1.1,70:ee:50:03:b8:ac,192.168.1.112,43200.0,"[DISCOVER, DISCOVER, OFFER, OFFER, DISCOVER, O...",5.854226,255.255.255.0,[192.168.1.1],[192.168.1.1],,,
1,1474726000.0,"[CIOVRi3LzvT6RirLt5, CkrHcZ1y43zvJOWL4i]",192.168.1.112,192.168.1.1,70:ee:50:03:b8:ac,192.168.1.112,43200.0,"[DISCOVER, OFFER, DISCOVER, OFFER]",16.016445,255.255.255.0,[192.168.1.1],[192.168.1.1],,,
2,1474726000.0,"[CIOVRi3LzvT6RirLt5, CkrHcZ1y43zvJOWL4i]",192.168.1.112,192.168.1.1,70:ee:50:03:b8:ac,192.168.1.112,43200.0,"[DISCOVER, DISCOVER, OFFER, OFFER, DISCOVER, O...",13.661959,255.255.255.0,[192.168.1.1],[192.168.1.1],,,
3,1474726000.0,"[CIOVRi3LzvT6RirLt5, CkrHcZ1y43zvJOWL4i]",192.168.1.112,192.168.1.1,70:ee:50:03:b8:ac,192.168.1.112,43200.0,"[DISCOVER, OFFER]",0.000812,255.255.255.0,[192.168.1.1],[192.168.1.1],,,
4,1474726000.0,"[CIOVRi3LzvT6RirLt5, CkrHcZ1y43zvJOWL4i]",192.168.1.112,192.168.1.1,70:ee:50:03:b8:ac,192.168.1.112,43200.0,"[DISCOVER, DISCOVER, OFFER, OFFER, DISCOVER, O...",13.961972,255.255.255.0,[192.168.1.1],[192.168.1.1],,,


In [4]:
# Summarise the infrastructure advertised by DHCP: DNS servers, routers, subnet masks, DHCP servers.
# The DNS and router columns hold lists per record (see the preview above), hence is_list=True;
# presumably the helper flattens them before de-duplicating -- confirm in net.get_unique_entries.
dns_servers = net.get_unique_entries(dhcp_df, net.DHCP_DNS, is_list=True)
print(f'Unique DNS Servers: {dns_servers}')
routers = net.get_unique_entries(dhcp_df, net.DHCP_ROUTER, is_list=True)
print(f'Unique Routers: {routers}')
subnet_masks = net.get_unique_entries(dhcp_df, net.DHCP_SUBNET_MASK, is_list=False)
print(f'Unique Subnet Masks: {subnet_masks}')
dhcp_servers = net.get_unique_entries(dhcp_df, net.DHCP_SERVER, is_list=False)
print(f'Unique DHCP Servers: {dhcp_servers}')

Unique DNS Servers: ['192.168.1.1']
Unique Routers: ['192.168.1.1']
Unique Subnet Masks: ['255.255.255.0']
Unique DHCP Servers: ['192.168.1.1']


In [5]:
# In this particular log, the DHCP server hands out only one DNS server and one router. 
# This looks like a single /24 subnet whose gateway (access point) is 192.168.1.1, so each machine is identified by the last octet of its address. 

In [6]:
# Collect the distinct (client address, MAC) pairs that appear in the DHCP log.
dhcp_clients = net.get_unique_subset(dhcp_df, [net.DHCP_CLIENT, net.DHCP_MAC])
mapping_count = len(dhcp_clients)
print(f'Number of unique mappings are: {mapping_count}')
print(dhcp_clients)

Number of unique mappings are: 15
       client_addr                mac
0    192.168.1.112  70:ee:50:03:b8:ac
42   192.168.1.238  00:24:e4:1b:6f:96
62   192.168.1.106  30:8c:fb:2f:e4:b2
122  192.168.1.143  f4:f2:6d:93:51:f1
124  192.168.1.120  18:b7:9e:02:20:44
153  192.168.1.239  08:21:ef:3b:fc:e3
157  192.168.1.240  44:65:0d:56:cc:d3
165  192.168.1.241  70:ee:50:18:34:43
177  192.168.1.236  70:5a:0f:e4:9b:c0
180  192.168.1.227  50:c7:bf:00:56:39
181  192.168.1.166  00:24:e4:11:18:a8
201  192.168.1.177  e0:76:d0:33:bb:85
212  192.168.1.249  00:16:6c:ab:6b:88
253  192.168.1.196  d0:52:a8:00:67:5e
384  192.168.1.168  18:b4:30:25:be:e4


In [7]:
# Cross-check: count the distinct client addresses on their own, ignoring the MAC column.
# If this gives more than the 15 (address, MAC) pairs found above, the extra addresses
# belong to DHCP clients whose MAC is not known.
unique_client_addrs = dhcp_df[net.DHCP_CLIENT].unique()
entries = list(set(unique_client_addrs))
print(len(entries), entries)

15 ['192.168.1.227', '192.168.1.249', '192.168.1.106', '192.168.1.120', '192.168.1.239', '192.168.1.240', '192.168.1.143', '192.168.1.236', '192.168.1.166', '192.168.1.241', '192.168.1.112', '192.168.1.238', '192.168.1.196', '192.168.1.177', '192.168.1.168']


In [8]:
# The count is again exactly 15, matching the number of (address, MAC) mappings above, so all DHCP clients have known MAC addresses. 

In [9]:
# Read the ground truth (IP address -> MAC) to see whether we have missed any IP addresses.
# The author's pandas-only alternative, kept for reference:
#   ground_df = pd.read_csv('unsw/ground_truth.csv')
#   ground_dict = ground_df[['SrcIPAddress','mac']].set_index('SrcIPAddress')['mac'].to_dict()

ground_dict = ut.read_as_dict('unsw/ground_truth.csv','SrcIPAddress','mac')
for key, mac in ground_dict.items():
    print(f'{key} -> {mac}')

192.168.1.106 -> 30:8c:fb:2f:e4:b2
192.168.1.112 -> 70:ee:50:03:b8:ac
192.168.1.120 -> 18:b7:9e:02:20:44
192.168.1.131 -> 74:6a:89:00:2e:25
192.168.1.143 -> f4:f2:6d:93:51:f1
192.168.1.166 -> 00:24:e4:11:18:a8
192.168.1.168 -> 18:b4:30:25:be:e4
192.168.1.177 -> e0:76:d0:33:bb:85
192.168.1.196 -> d0:52:a8:00:67:5e
192.168.1.208 -> 74:2f:68:81:69:42
192.168.1.227 -> 50:c7:bf:00:56:39
192.168.1.228 -> d0:a6:37:df:a1:e1
192.168.1.236 -> 70:5a:0f:e4:9b:c0
192.168.1.238 -> 00:24:e4:1b:6f:96
192.168.1.239 -> 08:21:ef:3b:fc:e3
192.168.1.240 -> 44:65:0d:56:cc:d3
192.168.1.241 -> 70:ee:50:18:34:43
192.168.1.248 -> b4:ce:f6:a7:a3:c2
192.168.1.249 -> 00:16:6c:ab:6b:88


In [10]:
# Turn the unique DHCP (client address, MAC) pairs into a dictionary so it can be compared
# key-by-key with the ground truth (presumably {client address: MAC} -- confirm in ut.df_2_dict).
# The author's pandas-only alternative, kept for reference:
#   dhcp_dict=dhcp_clients.set_index(net.DHCP_CLIENT)[net.DHCP_MAC].to_dict()
dhcp_dict = ut.df_2_dict(dhcp_clients,net.DHCP_CLIENT,net.DHCP_MAC)

In [11]:
# Compare the key sets of the two dictionaries in both directions.
dhcp_only = [x for x in dhcp_dict if x not in ground_dict]
print(f'Present in DHCP - Missing from Ground: {dhcp_only}')
ground_only = [x for x in ground_dict if x not in dhcp_dict]
print(f'Present in Ground - Missing from DHCP: {ground_only}')

Present in DHCP - Missing from Ground: []
Present in Ground - Missing from DHCP: ['192.168.1.131', '192.168.1.208', '192.168.1.228', '192.168.1.248']


In [12]:
# Check whether the MAC recorded by DHCP agrees with the ground truth for every address
# that appears in both dictionaries.
# Addresses that are missing from the ground truth are skipped here: indexing
# ground_dict[x] for them would raise KeyError, and they are already reported by the
# "Present in DHCP - Missing from Ground" comparison.
mismatch = [
    x for x in dhcp_dict
    if x in ground_dict and dhcp_dict[x] != ground_dict[x]
]
print(mismatch)

[]


In [13]:
# Find the local (192.168.1.x) addresses that originate connections in the connection log
# but never appear as a DHCP client. Only the source/originator column is examined here.
client_ip = list(conn_df[net.SRC_IP].unique())
not_found = [
    x
    for x in client_ip
    if x not in dhcp_dict and x.startswith('192.168.1.')
]
print(not_found)

['192.168.1.193', '192.168.1.223', '192.168.1.1']


In [14]:
# So we see that there are three addresses in the connection log that did not make a DHCP Request.
# We know from the head of the dhcp_df that 192.168.1.1 is the router and DHCP and DNS Server
# Let us see if the others are present in the ground truth
for x in ['192.168.1.193', '192.168.1.223']:
    mac_ground = ground_dict.get(x, None)
    mac_dhcp = dhcp_dict.get(x, None)
    if mac_ground is None and mac_dhcp is None:
        print(f'Not found address: {x}')
    elif mac_ground is None:
        # Only the DHCP mapping knows this address.
        print(f'Found in DHCP: {x} -> {mac_dhcp}')
    elif mac_dhcp is None:
        # Only the ground truth knows this address. The label previously said
        # "Found in DHCP" even though the MAC shown comes from the ground truth.
        print(f'Found in Ground Truth: {x} -> {mac_ground}')
    else:
        print(f'Found both {x} -> Ground: {mac_ground} , DHCP: {mac_dhcp}')

# So we have two unknown addresses where we do not know the ground truth -- as to what these machines are

Not found address: 192.168.1.193
Not found address: 192.168.1.223


In [15]:
# Try to recover the MAC addresses of the two mystery devices from the connection log,
# which records the originator's layer-2 address next to its IP address.
mystery_ips = ['192.168.1.193', '192.168.1.223']
from_mystery = conn_df[net.SRC_IP].isin(mystery_ips)
conn_subset = conn_df[from_mystery]
ip_mac_pairs = conn_subset[[net.SRC_IP, net.SRC_MAC]]
unknown_df = ip_mac_pairs.drop_duplicates().dropna()
print(unknown_df)

        id.orig_h       orig_l2_addr
3   192.168.1.193  ec:1a:59:83:28:11
17  192.168.1.223  ec:1a:59:79:f4:89


In [16]:
# From looking at the OUI from IEEE standards database, we can now determine that these 
# devices are from Belkin International Inc.

In [17]:
# Explore which machines act as servers on the local network.
# A server here is the responder of a connection in state "SF"; we keep the ones
# in 192.168.1.x and check which of them the ground truth knows about.
sf_mask = conn_df[net.CONN_STATE] == "SF"
server_ip = list(conn_df.loc[sf_mask, net.DEST_IP].unique())
local_servers = [x for x in server_ip if x.startswith('192.168.1.')]
print('Local Servers:', len(local_servers), local_servers)
in_ground = [x for x in local_servers if x in ground_dict]
print('In Ground Truth:', len(in_ground), in_ground)
print('Not in Ground Truth:', [x for x in local_servers if x not in ground_dict])


Local Servers: 5 ['192.168.1.1', '192.168.1.249', '192.168.1.223', '192.168.1.193', '192.168.1.241']
In Ground Truth: 2 ['192.168.1.249', '192.168.1.241']
Not in Ground Truth: ['192.168.1.1', '192.168.1.223', '192.168.1.193']


In [18]:
# For every local server, list the destination ports on which it is contacted.
# NOTE(review): the intent is to use only established ("SF") connections, but the full
# conn_df is passed in, so that filtering must happen inside net.get_server_port -- confirm.


for x in local_servers:
    ports = net.get_server_port(x, conn_df)
    print(f'Ports {x} is contacted on: {ports}')

Ports 192.168.1.1 is contacted on: [67, 53]
Ports 192.168.1.249 is contacted on: [49152]
Ports 192.168.1.223 is contacted on: [49153]
Ports 192.168.1.193 is contacted on: [49154]
Ports 192.168.1.241 is contacted on: [46194]


In [19]:
# Show which clients contact each local server, and on which port.
for x in local_servers:
    # 192.168.1.1 is the router / DHCP / DNS server that everybody talks to; skip it.
    if x == '192.168.1.1':
        continue
    print(f'Checking Address:{x}')
    print(net.find_clients(x, conn_df))
    print('-------')

Checking Address:192.168.1.249
       id.orig_h  id.resp_p
3  192.168.1.193      49152
-------
Checking Address:192.168.1.223
       id.orig_h  id.resp_p
4  192.168.1.193      49153
-------
Checking Address:192.168.1.193
        id.orig_h  id.resp_p
61  192.168.1.223      49154
-------
Checking Address:192.168.1.241
          id.orig_h  id.resp_p
24974  52.8.186.218      46194
-------


In [20]:
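# Now, we know that 49152 is the start of the dynamic (ephemeral) port range that applications use for their own services. 
# So, apart from the inbound connection from 52.8.186.218 to 192.168.1.241, the two mystery devices (192.168.1.193 and 192.168.1.223) are the ones that are trying to connect to local servers. 
# The mystery thickens. 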

In [21]:
# Count the distinct originator (client) addresses that appear in the connection log.
conn_clients = net.get_unique_entries(conn_df, net.SRC_IP, is_list=False)
print(len(conn_clients))

36


In [22]:
# The connection log contains many more client addresses than the DHCP log or the ground truth.
# Print all of them to see what the additional ones are.
print(conn_clients) 

['192.168.1.166', '192.168.1.120', '192.168.1.193', '62.210.178.168', '192.168.1.196', '192.168.1.240', '192.168.1.223', '192.168.1.249', 'fe80::16cc:20ff:fe51:33ea', 'fe80::725a:fff:fee4:9bc0', 'fe80::72ee:50ff:fe18:3443', 'fe80::ee1a:59ff:fe79:f489', 'fe80::e276:d0ff:fe33:bb85', 'fe80::ee1a:59ff:fe83:2811', 'fe80::216:6cff:feab:6b88', '192.168.1.241', 'fe80::a21:efff:fe3b:fce3', '192.168.1.177', '192.168.1.143', '149.171.37.162', '192.168.1.1', '0.0.0.0', '192.168.1.112', '192.168.1.227', '192.168.1.239', '74.125.203.188', '192.168.1.238', '192.168.1.106', '64.233.189.188', '192.168.1.236', '192.168.1.168', '::', 'fe80::1ab4:30ff:fe25:bee4', '52.8.186.218', '64.233.188.188', '89.30.121.13']


In [23]:
# We see that there are a lot of IPv6 addresses. We also see the unspecified addresses 0.0.0.0 and ::. 
# Let us check on what is going on there by examining the connection log. 


In [24]:
# On examination, we find that the 0.0.0.0 entries are DHCP requests. 
# A client uses 0.0.0.0 as its source address in DHCP requests while it has not yet been assigned an IP address. 
# So, we can ignore this address for the sake of counting clients. 

In [25]:
# Let us see what the IPv6 addresses are doing in this log.
# Checking on the system shows that they are sending ICMP messages to other IPv6 addresses.
# The connection log also records MAC addresses, so we group the distinct
# (source IP, source MAC) pairs by MAC to see which addresses belong to the same device.
# Note that the logged MAC addresses probably only cover the local network.
client_map = net.get_unique_subset(conn_df, [net.SRC_IP, net.SRC_MAC])
grouped = client_map.groupby(net.SRC_MAC)
for name, group in grouped:
    entry_count = len(group)
    print(f"Group: {name} with {entry_count} entries")
    print(group)

Group: 00:16:6c:ab:6b:88 with 2 entries
                   id.orig_h       orig_l2_addr
18             192.168.1.249  00:16:6c:ab:6b:88
44  fe80::216:6cff:feab:6b88  00:16:6c:ab:6b:88
Group: 00:24:e4:11:18:a8 with 1 entries
       id.orig_h       orig_l2_addr
0  192.168.1.166  00:24:e4:11:18:a8
Group: 00:24:e4:1b:6f:96 with 2 entries
          id.orig_h       orig_l2_addr
1340  192.168.1.238  00:24:e4:1b:6f:96
1362        0.0.0.0  00:24:e4:1b:6f:96
Group: 08:21:ef:3b:fc:e3 with 2 entries
                    id.orig_h       orig_l2_addr
107  fe80::a21:efff:fe3b:fce3  08:21:ef:3b:fc:e3
230             192.168.1.239  08:21:ef:3b:fc:e3
Group: 14:cc:20:51:33:ea with 9 entries
                       id.orig_h       orig_l2_addr
13                62.210.178.168  14:cc:20:51:33:ea
21     fe80::16cc:20ff:fe51:33ea  14:cc:20:51:33:ea
163               149.171.37.162  14:cc:20:51:33:ea
192                  192.168.1.1  14:cc:20:51:33:ea
285               74.125.203.188  14:cc:20:51:33:ea
3982    

In [26]:
# Looking at the addresses, we can see that in most cases, the MAC corresponds to one or two client IP addresses.
# When there are two IP addresses, one of them is usually an IPv6 address and the other an IPv4 address (or the placeholder 0.0.0.0 used during DHCP). 
# This also allows us to determine the equivalence among clients -- which interface has the V4, V6 and MAC address. 
# One of the groups has many IP addresses corresponding to the same MAC address. This is the router interface. 
# So, we know that the addresses -- 
#  62.210.178.168, 149.171.37.162,  74.125.203.188 , 64.233.189.188, 52.8.186.218, 64.233.188.188, 89.30.121.13 
#    are all on another subnet and are connecting via the router interface 192.168.1.1 with MAC 14:cc:20:51:33:ea


In [27]:
#Let us also look at the destination MAC and destination for the same. 
# We will exclude 14:cc:20:51:33:ea since it will have many external IP addresses. 

In [28]:
# Group the distinct (destination IP, destination MAC) pairs by MAC and report the
# size of each group. The router MAC is skipped because many external addresses sit behind it.
server_map = net.get_unique_subset(conn_df, [net.DEST_IP, net.DEST_MAC])
grouped = server_map.groupby(net.DEST_MAC)
for name, group in grouped:
    if name == '14:cc:20:51:33:ea':
        continue
    entry_count = len(group)
    print(f"Group: {name} with {entry_count} entries")
    # print(group)  # uncomment to list the addresses in each group

Group: 00:16:6c:ab:6b:88 with 1 entries
Group: 00:24:e4:11:18:a8 with 1 entries
Group: 00:24:e4:1b:6f:96 with 1 entries
Group: 01:00:5e:00:00:fb with 1 entries
Group: 01:00:5e:7f:ff:fa with 1 entries
Group: 08:21:ef:3b:fc:e3 with 2 entries
Group: 18:b4:30:25:be:e4 with 2 entries
Group: 33:33:00:00:00:01 with 1 entries
Group: 33:33:00:00:00:02 with 1 entries
Group: 33:33:00:00:00:0c with 2 entries
Group: 33:33:00:00:00:16 with 1 entries
Group: 33:33:00:00:00:fb with 1 entries
Group: 33:33:00:01:00:02 with 1 entries
Group: 33:33:00:01:00:03 with 2 entries
Group: 33:33:ff:00:00:00 with 1 entries
Group: 33:33:ff:00:00:01 with 1 entries
Group: 33:33:ff:00:0e:e2 with 1 entries
Group: 33:33:ff:18:34:43 with 1 entries
Group: 33:33:ff:25:be:e4 with 1 entries
Group: 33:33:ff:33:bb:85 with 1 entries
Group: 33:33:ff:3b:fc:e3 with 1 entries
Group: 33:33:ff:51:33:ea with 1 entries
Group: 33:33:ff:6a:d5:08 with 1 entries
Group: 33:33:ff:79:f4:89 with 1 entries
Group: 33:33:ff:83:28:11 with 1 entries


In [29]:
# Each MAC usually has just 1 or 2 addresses, so both the source and the destination MAC
# can be used to tie a device's IPv4 and IPv6 addresses together.
# The router MAC is passed to the helper (presumably so it is excluded -- confirm in net.find_client_mapping).
# NOTE(review): the result also contains 192.168.1.255 --> 255.255.255.255, i.e. a pair of
# broadcast addresses rather than a device; check whether that entry is wanted.

mapping_dict = net.find_client_mapping(conn_df, '14:cc:20:51:33:ea')
for x, mapped in mapping_dict.items():
    print(x, '-->', mapped)

fe80::216:6cff:feab:6b88 --> 192.168.1.249
fe80::a21:efff:fe3b:fce3 --> 192.168.1.239
fe80::1ab4:30ff:fe25:bee4 --> 192.168.1.168
fe80::725a:fff:fee4:9bc0 --> 192.168.1.236
fe80::72ee:50ff:fe18:3443 --> 192.168.1.241
fe80::ee1a:59ff:fe79:f489 --> 192.168.1.223
fe80::ee1a:59ff:fe83:2811 --> 192.168.1.193
192.168.1.255 --> 255.255.255.255
fe80::e276:d0ff:fe33:bb85 --> 192.168.1.177


In [30]:
# Persist the address mapping as a two-column CSV (Original, Mapped) without the index.
map_df = pd.DataFrame({
    'Original': list(mapping_dict),
    'Mapped': list(mapping_dict.values()),
})
map_df.to_csv('unsw/mapping.csv', index=False)

In [31]:
# From here on we focus on IPv4 addresses only, since they identify most of the communication.
# mapping_dict is used to rewrite addresses in the connection log to their local IPv4 equivalents
# (presumably in both the originator and responder columns -- confirm in net.conn_map_addresses).
# NOTE(review): this rebinds conn_df, so every later cell (and any earlier cell that is re-run
# afterwards) works on the mapped frame rather than on the raw log.


conn_df = net.conn_map_addresses(conn_df, mapping_dict)

In [32]:
# With the addresses mapped, list the distinct originators of established ("SF") connections.

sf_mask = conn_df[net.CONN_STATE] == "SF"
client_addresses = list(conn_df.loc[sf_mask, net.SRC_IP].unique())
print(len(client_addresses))
print(client_addresses)

18
['192.168.1.166', '192.168.1.120', '192.168.1.193', '192.168.1.196', '192.168.1.240', '192.168.1.249', '192.168.1.223', '192.168.1.239', '192.168.1.177', '192.168.1.143', '192.168.1.241', '192.168.1.227', '192.168.1.112', '192.168.1.238', '192.168.1.106', '192.168.1.236', '192.168.1.168', '52.8.186.218']


In [33]:
# We have 17 local addresses, and an external address which is making an inbound connection. 

In [34]:
# Find the set of local servers: responders of established ("SF") connections
# whose address lies in the 192.168.1.x subnet.
server_addresses = list(conn_df[conn_df[net.CONN_STATE]=="SF"][net.DEST_IP].unique())
# The prefix needs its trailing dot: '192.168.1' would also match addresses such as
# 192.168.10.5 or 192.168.100.7, and the earlier cells already filter on '192.168.1.'.
local_servers = [x for x in server_addresses if x.startswith('192.168.1.')]
print(len(local_servers))
print(local_servers)

5
['192.168.1.1', '192.168.1.249', '192.168.1.223', '192.168.1.193', '192.168.1.241']


In [35]:
#These are the same ones that we had before playing with V6 addresses. 