In [1]:
import numpy as np
import pandas as pd
from pcap_analysis import *

# Crawl output directories analysed by this notebook. They are later collected
# into `crawl_data_dirs` in this order, and the plot legends hard-code
# ('Amazon', 'Roku') in the same order, so keep the two in sync when editing.
# NOTE(review): absolute mount paths; adjust for the local environment.
crawl_data_dir1 = '/mnt/iot-house/crawl-data/amazon-data-20190401-121138'
crawl_data_dir2 = '/mnt/iot-house/crawl-data/roku-data-20190402-222352'



In [2]:
# Create Timestamp DF: one frame per crawl directory, with the original index
# of the transposed timestamp table exposed as the 'Channel Name' column.
crawl_data_dirs = [crawl_data_dir1, crawl_data_dir2]
df_ch_timestamps_list = []
for crawl_data_dir in crawl_data_dirs:
    channel_timestamps = load_timestamp_json(crawl_data_dir)
    # Transpose so the outer keys of the loaded mapping become rows, then
    # promote that index to a regular column that later merges can key on.
    df_ch_timestamps = (
        pd.DataFrame(channel_timestamps)
        .transpose()
        .reset_index()
        .rename(columns={'index': 'Channel Name'})
    )
    df_ch_timestamps_list.append(df_ch_timestamps)

Loading timestamp data from /mnt/iot-house/crawl-data/amazon-data-20190401-121138
Loading timestamp data from /mnt/iot-house/crawl-data/roku-data-20190402-222352


In [3]:
# Generate Global (All SSL/TCP Streams): one de-duplicated frame per crawl.
global_df_ssl_list = []
# Rows that agree on all of these columns are collapsed to their first occurrence.
# The key spellings must match the column names exactly as produced upstream.
dedup_subset = ['Channel Name', 'ip.dst', 'MITM Attemp', 'SSL Failure']
for crawl_data_dir in crawl_data_dirs:
    # `join` is not imported explicitly in this notebook; presumably it is
    # os.path.join made available by the star import -- TODO confirm.
    post_process_dir = join(crawl_data_dir, 'post-process')
    global_df = gen_global_df(post_process_dir)

    # Add SSL artifacts, then drop all duplicates.
    global_df_ssl = add_ssl_features(global_df, post_process_dir).drop_duplicates(
        subset=dedup_subset
    )

    global_df_ssl_list.append(global_df_ssl)

Generating Global DF from /mnt/iot-house/crawl-data/amazon-data-20190401-121138/post-process 
Adding SSL features to DF from /mnt/iot-house/crawl-data/amazon-data-20190401-121138/post-process 
Generating Global DF from /mnt/iot-house/crawl-data/roku-data-20190402-222352/post-process 
Adding SSL features to DF from /mnt/iot-house/crawl-data/roku-data-20190402-222352/post-process 


In [4]:
######Timestamp Analysis######
##############################
# Tag every stream row with the crawl epoch (stage) during which it started.
global_df_merged_list = []
for streams_df, timestamps_df in zip(global_df_ssl_list, df_ch_timestamps_list):
    # Merge global_df with timestamp_df: every stream row gains one column per
    # epoch, holding that channel's timestamp for the epoch.
    global_df_merged = pd.merge(streams_df, timestamps_df, on=['Channel Name'])

    # Every timestamp column other than the merge key names one epoch.
    epoch_list = [col for col in timestamps_df.columns if col != 'Channel Name']

    # Populate epoch column for channel based on the timestamp of the packet
    # (described as the SYN packet in the original notes).
    #
    # A row belongs to the epoch whose timestamp is the latest one that is
    # still strictly earlier than the row's 'frame.time_epoch'. Selecting by
    # timestamp value, rather than by "last matching column wins", keeps the
    # result correct even when the epoch columns are not in chronological
    # order (e.g. 'launch-10' sorting before 'launch-2').
    epoch_starts = global_df_merged[epoch_list].astype(float)
    frame_time = global_df_merged['frame.time_epoch']
    # Blank out (NaN) every epoch that had not started before the frame.
    started = epoch_starts.where(epoch_starts.lt(frame_time, axis=0))
    has_epoch = started.notna().any(axis=1)

    # Rows that precede every epoch keep a real NaN; object dtype lets the
    # column hold the string labels without a dtype coercion.
    global_df_merged['epoch'] = pd.Series(np.nan, index=global_df_merged.index, dtype=object)
    if has_epoch.any():
        global_df_merged.loc[has_epoch, 'epoch'] = started.loc[has_epoch].idxmax(axis=1)

    global_df_merged_list.append(global_df_merged)


In [5]:
#Graph
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt

new_endpoints_count_list = []
for idx, val in enumerate(crawl_data_dirs):
    epoch_list = list(df_ch_timestamps_list[idx])
    epoch_list.remove('Channel Name')

    labels = epoch_list

    #Sort Lanuch tries
    min_iter = None
    max_iter = None
    for idx2, val in enumerate(epoch_list):
        if 'launch' in val:
            if min_iter == None:
                min_iter = idx2
            max_iter = idx2 + 1

    labels[min_iter:max_iter] = sorted(labels[min_iter:max_iter], key=str_to_int_key)
    launch_label = labels[min_iter:max_iter]
    #print(labels)

    global_df_merged = global_df_merged_list[idx]
    new_endpoints_df = global_df_merged.drop_duplicates(subset=['Channel Name', 'ip.dst'])

    new_endpoints_count = []
    for epoch in labels:
        df_epoch = new_endpoints_df.loc[global_df_merged['epoch'] == epoch]
        total_len = len(df_epoch)
        #print( "%s saw %s new end point" % (epoch, str(total_len)) )
        new_endpoints_count.append(total_len)
    
    new_endpoints_count_list.append(new_endpoints_count)

In [6]:
# Figure 1: grouped bars of new endpoints per epoch, one bar series per crawl.
# `labels` and `new_endpoints_count_list` come from the counting cell above;
# `labels` holds the epoch names of the last crawl processed there, so this
# assumes every crawl shares the same epochs -- TODO confirm.
N = len(labels)
ind = np.arange(N)
width = 0.35

plt.figure(1)

p_list = []
for idx, val in enumerate(crawl_data_dirs):
    # Shift each crawl's bars right by one bar width so they sit side by side.
    p = plt.bar(ind + width*idx, new_endpoints_count_list[idx], width)
    p_list.append(p)

plt.xlabel('Epoch')
plt.ylabel('Endpoints')
plt.title('New TLS Endpoints')
plt.subplots_adjust(bottom=0.2)
# Legend names follow the order of `crawl_data_dirs`.
plt.legend(p_list, ('Amazon', 'Roku'));
labels_ref = [l.replace('install_channel', 'install') for l in labels]
# Put each tick under the centre of its bar group instead of under the first
# crawl's bar.
group_center = ind + width * (len(crawl_data_dirs) - 1) / 2
plt.xticks(group_center, labels_ref, rotation=45);



<IPython.core.display.Javascript object>

In [7]:
# Figure 2: same grouped bars as figure 1, restricted to the launch epochs
# (the [min_iter, max_iter) span computed in the counting cell), saved to PDF.
N = len(launch_label)
ind = np.arange(N)
width = 0.35

f2 = plt.figure(2)

p_list = []
for idx, val in enumerate(crawl_data_dirs):
    # Shift each crawl's bars right by one bar width so they sit side by side.
    p = plt.bar(ind + width*idx, new_endpoints_count_list[idx][min_iter:max_iter], width)
    p_list.append(p)

plt.xlabel('Epoch')
plt.ylabel('Endpoints')
plt.title('New TLS Endpoints')
plt.subplots_adjust(bottom=0.2)
# Legend names follow the order of `crawl_data_dirs`.
plt.legend(p_list, ('Amazon', 'Roku'));
# Put each tick under the centre of its bar group instead of under the first
# crawl's bar.
group_center = ind + width * (len(crawl_data_dirs) - 1) / 2
plt.xticks(group_center, launch_label, rotation=45);
f2.savefig("tls-learn.pdf", bbox_inches='tight')



<IPython.core.display.Javascript object>

In [8]:
# Figure 3: stacked bars of TLS failure vs. success percentage per epoch.
# NOTE: this cell reads `global_df_merged` and `epoch_list` as left behind by
# the last iteration of the counting cell's loop, so it shows only the last
# crawl in `crawl_data_dirs`.
plt2 = plt.figure(3)
failure_percentage = []
success_percentage = []
for epoch in epoch_list:
    df_epoch = global_df_merged.loc[global_df_merged['epoch'] == epoch]
    total_len = len(df_epoch)
    if total_len == 0:
        # No streams in this epoch: drawn as 0% failure / 100% success.
        failure_percent = 0
        success_percent = 100
    else:
        failure_len = len(df_epoch.loc[df_epoch['SSL Failure'] == 1])
        failure_percent = 100 * float(failure_len)/float(total_len)
        success_percent = 100 - failure_percent
    failure_percentage.append(failure_percent)
    success_percentage.append(success_percent)

N = len(epoch_list)
ind = np.arange(N)
width = 0.35

p1 = plt.bar(ind, failure_percentage, width)
# Stack the success share on top of the failure share; each bar totals 100.
p2 = plt.bar(ind, success_percentage, width, bottom=failure_percentage)
plt.xticks(ind, epoch_list)
# Stop value 101 so the 100% tick, where every stacked bar ends, is included.
plt.yticks(np.arange(0, 101, 10))
plt.ylabel('Percentage')
plt.title('TLS Failure Rates for each Stage')
plt.legend((p1[0], p2[0]), ('Failure', 'Success (Intercepted and Pass-Through)'));


<IPython.core.display.Javascript object>