### CMB Assigment 2 Program ###

In [None]:
# Program, run imports
%matplotlib inline
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import time
import seaborn as sns
import requests
sns.set(style="darkgrid")

# used to skip merging and filtering
new_read = False

Create Pandas Dataframe

In [None]:
if new_read:
    df_fabian = pd.read_csv("dumps/allPorts_fabian.csv", encoding = "latin")
    df_sven = pd.read_csv("dumps/standard-dump-sven.csv", encoding = "latin")
    df = pd.concat([df_fabian, df_sven])
    print(df)

Filter Local Traffic

In [None]:
if new_read:
    # source and destination should not start with 192.168 to filter local network. DNS should not be filtered.
    filtered = df.loc[~df['Source'].str.startswith("192.168", na=False) & df['Destination'].str.startswith("192.168", na=False) |
                    df['Source'].str.startswith("192.168", na=False) & ~df['Destination'].str.startswith("192.168", na=False) |
                    df['Protocol'].isin(['DNS'])]

    filtered = filtered.loc[~filtered['Protocol'].isin(['DHCP', 'ARP', 'MDNS', 'LLDP', 'SSDP', 'IGMP', 'IGMPv2', 'IGMPv3', 'ICMP', 'ICMPv4', 'ICMPv6', 'ieee1905', 'LLMNR'])]
    filtered = filtered.loc[~filtered['Destination'].isin(['255.255.255.255'])]


    df = filtered.copy()
    df.to_csv('df.csv')
    # print (df.loc[df['Destination'].str.contains("255.255.255.255")])
else: 
    df = pd.read_csv('df.csv')
print(df)

In [None]:
# helper functions
def utcEntryToTimestamp(entry):
    if '.' in entry:
        row_entry = entry.split(".")[0]
    else:
        row_entry = entry.split(",")[0]
    TIME_FORMAT='%Y-%m-%d %H:%M:%S'
    ts = int(datetime.strptime(row_entry, TIME_FORMAT).timestamp())
    return ts

def utcRowToTimestamp(row):
    return utcEntryToTimestamp(row.at['Time'])
utcRowToTimestamp(df.iloc[0])
# print(df.loc[0].at['Time'])

### Protocol packet distribution ###



In [None]:
print(df.groupby('Protocol').Source.count())

count_protocol_packets = df.groupby('Protocol').Source.count()

fig, ax = plt.subplots()
plt.title('Protocol Distribution by Amount of Packets')
ax.pie(count_protocol_packets, labels=count_protocol_packets.keys(), autopct='%1.1f%%',)
fig.show()

### Amount of data traffic per protocol ###

In [None]:
count_protocol_traffic = df.groupby('Protocol').Length.sum()
print(count_protocol_traffic)
fig, ax = plt.subplots()
plt.title('Protocol Distribution by Data Traffic')
ax.pie(count_protocol_traffic, labels=count_protocol_traffic.keys(), autopct='%1.1f%%',)
fig.show()

### Average Data length per Protocol Type ###

In [None]:
average_protocol_packet_length = df.groupby('Protocol').Length.sum() / df.groupby('Protocol').Length.count()
print(average_protocol_packet_length)
fig, ax = plt.subplots()
ax.tick_params(axis='x', which='major', labelsize=12)
ax.tick_params(axis='x', which='minor', labelsize=12)
plt.xlabel('Protocol')
plt.ylabel('Data [Byte]')
plt.title('Average Data Length per Protocol Type')
ax.bar(average_protocol_packet_length.keys(), average_protocol_packet_length, align='center',)
plt.xticks(rotation=60, ha="right")
fig.show()

### Source addresses ###

In [None]:
print(df.groupby('Source').Time.count())
print(df.groupby('Destination').Time.count().sort_values())

unique_dests = df['Destination'].unique()

# write ip address destinations to file
file1 = open("Testfile.txt","w")
for row in unique_dests:
    file1.write(row + "\n")
file1.close()

In [None]:
print(df.groupby('Source').Time.count())
print(df.groupby('Destination').Time.count().sort_values())

## DNS 

We would like to investigate further our collected data. Now, we are particular interested in DNS requests. therefore, we filter the data by the protocol "DNS".

In [None]:
df_dns = df[df['Protocol'].isin(['DNS'])]
print(df_dns.head())

### Used DNS Server

Various DNS Resolvers exist on the Internet. We would like to find out which our devices are using during their operation. 
Some devices can be configured to use specif DNS Servers, some a DNS resolver hardcoded in their firmware. In man cases the resolve request is just forwarded to the router, who takes care of this. 
By grouping our DNS Destinations and counting the requests, we see which resolvers were primarily used.

In [None]:
df_dns_server = df_dns.loc[~df_dns['Info'].str.contains("response")]
print(df_dns_server.groupby(['Destination']).size())

We see that most of our DNS requests were send to our router or well known DNS resolvers, such as Google's 8.8.8.8. However, we also found two unknown DNS resolvers. A print out of these requests reveals that the Feinstaubsensor IoT device is talking to changing DNS Servers to resolve its destinations.

In [None]:
print(df_dns_server.loc[df_dns_server['Destination'].str.contains("195.234.128.139")].head(5))
print(df_dns_server.loc[df_dns_server['Destination'].str.contains("217.68.162.126")].head(5))

Another great insight is figuring out what URLs are actually being resolved. This can also be used to track a devices browsing behavior. We count the requests by URL to get a better overview. \
An interesting finding is that our top two resolved URLs are requested by static devices, such as the Feinstaubsensor and the Amazon Fire TV stick. We are not sure why thousands of requests are needed to resolve the same URL ever and ever again. 

In [None]:
df_dns_requests_1 = df_dns[df_dns['Info'].str.contains('Standard query')]
# does not contain response
df_dns_requests_2 = df_dns_requests_1[~df_dns_requests_1['Info'].str.contains('response')]
df_dns_resources = df_dns_requests_2['Info'].apply(lambda x: x.split(' ')[-1]).value_counts()
print(df_dns_resources.head(20))

### IP Wordlwide location

In [None]:
df_destinations =  df['Destination'].unique()

rows = []
for dest in df_destinations[:10]: # 10 entries limit for now
    if dest.startswith("192.168"):
            continue
    headers = { 'User-Agent': "keycdn-tools:https://www.example.com" }
    url = "https://tools.keycdn.com/geo.json?host={}".format(dest)
    json_response = requests.get(url, headers=headers).json()
    geo = json_response['data']['geo']
    #print(json_response)
    #print(geo)
    rows.append([geo['ip'], geo['longitude'], geo['latitude']])
    
# as dataframe
df_coord = pd.DataFrame(rows, columns=["ip", "lng", "lat"])
print(df_coord)

# plot on world
g_world = gpd.GeoDataFrame(df_coord, geometry=gpd.points_from_xy(df_coord.lng, df_coord.lat))

world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
base = world.plot(color='white', edgecolor='black')
g_world.plot(ax=base, marker='o', color='red', markersize=5)
plt.show()

### Data traffic over time ###

In [162]:
def timeMapping(x):
    # include time difference UTC+1
    time = datetime.utcfromtimestamp(x*min_15_duration + 3600)
    if time.minute == 0 and time.hour % 3 == 0:
        return time.strftime("%H:%M")
    else:
        return ""

# find first and last timestamp, then create data structure
min_15_duration = 60 * 15
# df_time_mod = df.copy()
print('Compute intensive task 1/3...')
df['index-time'] = df['Time'] # .apply(lambda x: x)) # map to 15 min window
df['index-time'] = df['index-time'].apply(lambda x: utcEntryToTimestamp(x))

first_entry = df.iloc[0].at['index-time'] // min_15_duration
last_entry = df.iloc[-1].at['index-time'] // min_15_duration

print('from ', df.iloc[0].at['Time'], " to ", df.iloc[-1].at['Time'])

count_packets = np.zeros(last_entry - first_entry + 1)
length_packets = np.zeros(last_entry - first_entry + 1)

x_values_packets = list(range(first_entry, last_entry + 1))
print('Amount of Intervals',len(x_values_packets))

mapping_res = list(map(timeMapping, x_values_packets))

print('Compute intensive task 2/3...')
df['index-time'] = (df['index-time'] // min_15_duration) - first_entry


from  2022-01-16 22:04:04.055476  to  2022-01-15 10:51:12.351281


ValueError: negative dimensions are not allowed

In [None]:
count_packets = np.zeros(last_entry - first_entry + 1)
length_packets = np.zeros(last_entry - first_entry + 1)
print('Compute intensive task 3/3...')
time_start = time.time()
for row in df.to_dict('records'):
    count_packets[row['index-time']] += 1
    length_packets[row['index-time']] += row['Length']

# old solution:
#for index, row in df.iterrows():
#    count_packets[row.at['index-time']] += 1
#    length_packets[row.at['index-time']] += row.at['Length']

time_end = time.time()
print(time_end - time_start)

length_packets = length_packets // 1000
#print(count_packets)
#print(length_packets)

fig, ax = plt.subplots()
plt.xlabel('Time')
plt.ylabel('Packets')
plt.title('Packets per timeframe (15 min interval)')
plt.xticks(x_values_packets, mapping_res)
ax.bar(x_values_packets, count_packets, color='black')
fig.show()

fig, ax = plt.subplots()
plt.xlabel('Time')
plt.ylabel('Data [kB]')
plt.title('Data traffic per timeframe (15 min interval)')
plt.xticks(x_values_packets, mapping_res)
ax.bar(x_values_packets, length_packets, color='black')
fig.show()

End of the notebook