### CMB Assignment 2 Program ###

In [1]:
# Program, run imports
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import seaborn as sns
import requests
sns.set(style="darkgrid")

Create Pandas Dataframe

In [4]:
# Load the exported packet capture into a DataFrame.
# (An earlier revision of this cell read "data/21-12-23_complete.csv" instead.)
# NOTE(review): data_folder is an absolute, machine-specific path; the cell
# only runs where this folder contains 1dayAll.csv.
data_folder = "/Users/Fabian/Desktop/"
capture_csv = data_folder + "1dayAll.csv"
df = pd.read_csv(capture_csv, encoding="latin")
print(df)
# Column names as noted by the author (not all are visible in the printed output):
# ['No.', 'Time', 'Source', 'Destination', 'Protocol', 'Length', 'Info', 'dst port']


             No.                        Time           Source  \
0              1  2022-01-16 22:04:03.993585     192.168.0.88   
1              2  2022-01-16 22:04:03.997303      192.168.0.5   
2              3  2022-01-16 22:04:04.055476  199.232.138.248   
3              4  2022-01-16 22:04:04.057104      192.168.0.9   
4              5  2022-01-16 22:04:04.070783  199.232.138.248   
...          ...                         ...              ...   
1117455  1117456  2022-01-17 17:05:50.550259   188.164.238.26   
1117456  1117457  2022-01-17 17:05:50.550547   188.164.238.26   
1117457  1117458  2022-01-17 17:05:50.595386     192.168.0.22   
1117458  1117459  2022-01-17 17:05:50.605007     192.168.0.22   
1117459  1117460  2022-01-17 17:05:50.607387   188.164.238.26   

             Destination Protocol  Length  \
0            192.168.0.5      SSH     190   
1           192.168.0.88      TCP      66   
2            192.168.0.9      TCP     730   
3        199.232.138.248      TCP      

Filter local traffic

In [None]:
# Keep only traffic that crosses the local-network boundary: rows where
# exactly one of Source / Destination starts with "192.168". Packets with both
# ends local, or with neither end local, are dropped.
# na=False makes a missing address count as "not local", so the XOR below
# cannot fail on NaN entries.
source_is_local = df['Source'].str.startswith("192.168", na=False)
destination_is_local = df['Destination'].str.startswith("192.168", na=False)
# XOR of the two boolean masks: True when exactly one side is local.
filtered = df.loc[source_is_local ^ destination_is_local]
# Work on an independent copy so later column assignments do not touch the
# unfiltered frame. The original row labels are kept (index is not reset).
df = filtered.copy()
print(df)

In [None]:
# helper functions
def utcEntryToTimestamp(entry):
    """Convert a capture 'Time' string into whole epoch seconds.

    The fractional part is discarded: if the entry contains a '.', everything
    from the first '.' onward is dropped; otherwise everything from the first
    ',' onward is dropped.

    Args:
        entry: Time string whose leading part has the form
            'YYYY-MM-DD HH:MM:SS'.

    Returns:
        int: ``datetime.timestamp()`` of the parsed time, truncated to an
        integer.

    Raises:
        ValueError: If the leading part does not match the expected format.

    NOTE(review): the parsed datetime is naive, so ``timestamp()`` interprets
    it in the local timezone of the machine running the notebook, despite the
    "utc" in the function name -- confirm this is intended.
    """
    fraction_separator = "." if "." in entry else ","
    whole_seconds = entry.split(fraction_separator)[0]
    parsed = datetime.strptime(whole_seconds, '%Y-%m-%d %H:%M:%S')
    return int(parsed.timestamp())

def utcRowToTimestamp(row):
    """Return the epoch-second timestamp for the 'Time' value of one row.

    Args:
        row: Object supporting ``row.at['Time']`` (e.g. a single row taken
            from the capture DataFrame); the value is passed unchanged to
            ``utcEntryToTimestamp``.

    Returns:
        Whatever ``utcEntryToTimestamp`` returns for that value.
    """
    time_text = row.at['Time']
    return utcEntryToTimestamp(time_text)
# Smoke-check the helper on the first row of df (positional lookup via iloc).
# As the last expression of the cell, the resulting timestamp is displayed.
utcRowToTimestamp(df.iloc[0])
# NOTE(review): the disabled line below uses the label-based df.loc[0], which
# only works while a row with index label 0 is still present in df.
# print(df.loc[0].at['Time'])

### Protocol packet distribution ###

In [None]:
# Packets per protocol: count() on the 'Source' column gives the number of
# non-null Source entries in each protocol group.
count_protocol_packets = df.groupby('Protocol')['Source'].count()
print(count_protocol_packets)

# Pie chart of each protocol's share of the packet count.
fig, ax = plt.subplots()
ax.set_title('Protocol Distribution by Amount of Packets')
ax.pie(
    count_protocol_packets,
    labels=count_protocol_packets.index,
    autopct='%1.1f%%',
)
fig.show()

### Amount of data traffic per protocol ###

In [None]:
# Sum of the 'Length' column per protocol group.
count_protocol_traffic = df.groupby('Protocol')['Length'].sum()
print(count_protocol_traffic)

# Pie chart of each protocol's share of the summed packet length.
fig, ax = plt.subplots()
ax.set_title('Protocol Distribution by Data Traffic')
ax.pie(
    count_protocol_traffic,
    labels=count_protocol_traffic.index,
    autopct='%1.1f%%',
)
fig.show()

### Average Data length per Protocol Type ###

In [None]:
# Mean packet length per protocol. mean() replaces the hand-rolled
# sum() / count(), so the frame is grouped only once.
average_protocol_packet_length = df.groupby('Protocol')['Length'].mean()
print(average_protocol_packet_length)

# Bar chart with one bar per protocol.
fig, ax = plt.subplots()
# Same label size for major and minor x ticks.
ax.tick_params(axis='x', which='both', labelsize=12)
ax.set_xlabel('Protocol')
ax.set_ylabel('Data [Byte]')
ax.set_title('Average Data Length per Protocol Type')
ax.bar(
    average_protocol_packet_length.index,
    average_protocol_packet_length,
    align='center',
)
# Rotate the protocol names so that neighbouring labels do not overlap.
plt.xticks(rotation=60, ha="right")
fig.show()

### Source addresses ###

In [None]:
# Packets per source address, then packets per destination address in
# ascending order of count.
print(df.groupby('Source').Time.count())
print(df.groupby('Destination').Time.count().sort_values())

unique_dests = df['Destination'].unique()

# write ip address destinations to file, one address per line.
# The context manager closes the file even if a write fails part-way.
with open("Testfile.txt", "w") as file1:
    file1.writelines(row + "\n" for row in unique_dests)

In [None]:
# Packet counts per source address, followed by packet counts per destination
# address sorted in ascending order.
# NOTE(review): this prints the same two summaries as the preceding cell.
packets_per_source = df.groupby('Source')['Time'].count()
packets_per_destination = df.groupby('Destination')['Time'].count()
print(packets_per_source)
print(packets_per_destination.sort_values())

### DNS requests ###

In [None]:
# Restrict the capture to DNS packets.
df_dns = df[df['Protocol'].isin(['DNS'])]

# Keep packets whose Info text contains 'Standard query' ...
# regex=False: the patterns are plain substrings, not regular expressions.
# na=False: a missing Info value counts as "no match" instead of producing a
# NaN in the boolean mask (which boolean indexing rejects).
is_standard_query = df_dns['Info'].str.contains('Standard query', regex=False, na=False)
df_dns_requests_1 = df_dns[is_standard_query]

# ... and drop those that also contain 'response', leaving only the requests.
is_response = df_dns_requests_1['Info'].str.contains('response', regex=False, na=False)
df_dns_requests_2 = df_dns_requests_1[~is_response]

# The last space-separated token of the Info text is taken as the queried
# resource; count how often each one was requested.
df_dns_resources = df_dns_requests_2['Info'].str.split(' ').str[-1].value_counts()
print(df_dns_resources)


### IP Worldwide Location ###

In [None]:
df_destinations = df['Destination'].unique()

# The request headers are identical for every lookup, so build them once.
headers = { 'User-Agent': "keycdn-tools:https://www.example.com" }

# Look up the geolocation of the first destination addresses via the KeyCDN
# geo endpoint and print each JSON answer.
for dest in df_destinations[:10]: # 10 entries limit for now
    # Local addresses are skipped rather than sent to the external service.
    if dest.startswith("192.168"):
        continue
    # params= lets requests build and encode the ?host=... query string.
    # timeout= bounds the wait: without it requests may block indefinitely on
    # an unresponsive server (raises requests.exceptions.Timeout instead).
    r = requests.get(
        "https://tools.keycdn.com/geo.json",
        params={"host": dest},
        headers=headers,
        timeout=10,
    )
    print(r.json())


### Data traffic over time ###

In [None]:
def timeMapping(x):
    """Return the x-axis label for the time window with index ``x``.

    The window start is ``x * min_15_duration`` seconds since the epoch
    (``min_15_duration`` is read from module scope at call time), shifted by
    3600 seconds.

    Args:
        x: Integer window index.

    Returns:
        str: 'HH:MM' of the shifted window start when it falls exactly on an
        hour divisible by three, otherwise the empty string (no tick label).
    """
    # include time difference UTC+1
    shifted_start = datetime.utcfromtimestamp(x*min_15_duration + 3600)
    on_three_hour_mark = shifted_start.minute == 0 and shifted_start.hour % 3 == 0
    return shifted_start.strftime("%H:%M") if on_three_hour_mark else ""

# Bucket every packet into a 15-minute window and accumulate, per window, the
# number of packets and the summed packet length.
min_15_duration = 60 * 15
df_time_mod = df.copy()
# Replace the textual 'Time' column by integer epoch seconds.
df_time_mod['Time'] = df_time_mod['Time'].apply(utcEntryToTimestamp)

# Window index of every packet. The first/last window come from the min/max
# rather than from the first/last row, so rows that are not sorted by time
# cannot produce negative or out-of-range offsets.
window_of_packet = df_time_mod['Time'] // min_15_duration
first_entry = int(window_of_packet.min())
last_entry = int(window_of_packet.max())

print('from ', df_time_mod['Time'].min(), " to ", df_time_mod['Time'].max())

# One x position per window, including windows without any packet.
x_values_packets = list(range(first_entry, last_entry + 1))
print(len(x_values_packets))

# Tick label for every window (empty string for unlabelled windows).
mapping_res = list(map(timeMapping, x_values_packets))

# Vectorised accumulation instead of a Python-level loop over every row:
# bincount counts the packets per window offset and, with weights, sums the
# packet lengths per window offset.
window_offsets = (window_of_packet - first_entry).to_numpy(dtype=np.intp)
window_total = len(x_values_packets)
# astype(float) keeps the float array type the np.zeros-based version had.
count_packets = np.bincount(window_offsets, minlength=window_total).astype(float)
length_packets = np.bincount(
    window_offsets,
    weights=df_time_mod['Length'].to_numpy(),
    minlength=window_total,
)

# Floor-divide by 1000 for the 'Data [kB]' axis (Length is presumably in
# bytes -- confirm against the capture export).
length_packets = length_packets // 1000
print(count_packets)
print(length_packets)

# Two bar charts over the same 15-minute windows: first the packet count per
# window, then the data volume per window. Both share the x positions and the
# tick labels computed above; only the y data, y label and title differ.
for y_label, chart_title, window_values in (
    ('Packets', 'Packets per timeframe (15 min interval)', count_packets),
    ('Data [kB]', 'Data traffic per timeframe (15 min interval)', length_packets),
):
    fig, ax = plt.subplots()
    ax.set_xlabel('Time')
    ax.set_ylabel(y_label)
    ax.set_title(chart_title)
    # One tick per window; most labels are empty strings.
    ax.set_xticks(x_values_packets)
    ax.set_xticklabels(mapping_res)
    ax.bar(x_values_packets, window_values, color='black')
    fig.show()