## Unencrypted connections
- Num of channels sending at least one unencrypted HTTP request
- Percentage of unencrypted HTTP requests

In [1]:
import numpy as np
import pandas as pd
from log_analysis import (print_crawl_summary, load_timestamps_from_crawl_data,
                          get_distinct_tcp_conns, get_crawl_data_path,
                          load_dns_data, get_crawl_status, get_http_df, get_n_successful_channels)

from os.path import isdir, join
from datetime import datetime
from glob import glob
from tabulate import tabulate
import seaborn as sns

from crawl_ids import CrawlRokuNoMITM, CrawlFireTVNoMITM
from nb_utils import get_popular_domains
#ROKU_NO_MITM_CRAWL = 'roku-data-20190501-031836'
#AMAZON_NO_MITM_CRAWL = 'amazon-data-20190508-202449'

crawl_data_dir_roku = get_crawl_data_path(CrawlRokuNoMITM)
crawl_data_dir_amazon = get_crawl_data_path(CrawlFireTVNoMITM)

  return f(*args, **kwds)
  return f(*args, **kwds)


### Total num. of channels

In [2]:
n_roku = get_n_successful_channels(crawl_data_dir_roku)
n_amazon = get_n_successful_channels(crawl_data_dir_amazon)
print(n_roku, n_amazon)

95 82


### Load TCP data

In [3]:
roku_tcp = get_distinct_tcp_conns(crawl_data_dir_roku)
amazon_tcp = get_distinct_tcp_conns(crawl_data_dir_amazon)

Loading distinct TCP connections from /media/gacar/Data/iot-house/crawl-data/roku-data-20190501-031836/post-process 
Loading distinct TCP connections from /media/gacar/Data/iot-house/crawl-data/amazon-data-20190508-202449/post-process 


- Only take port 80 and 443
- Very few connections to other ports

In [4]:
roku_tcp = roku_tcp[roku_tcp.tcp_dstport.isin([80, 443])]
amazon_tcp = amazon_tcp[amazon_tcp.tcp_dstport.isin([80, 443])]

roku_tcp['tls'] = roku_tcp.tcp_dstport.map(lambda x: x == 443)
amazon_tcp['tls'] = amazon_tcp.tcp_dstport.map(lambda x: x == 443)

In [5]:
NO_TLS = 0  # tls=False (0)
TLS = 1

def print_unencryted_stats(df):
    res = df.tls.value_counts()
    n_total = len(df)
    print ("NoTLS:", res[NO_TLS])
    print ("TLS:", res[TLS])
    print ("Total:", n_total)
    print("%% of unencrypted connections: %0.1f" % (100*res[NO_TLS] / (res[NO_TLS] + res[TLS])))

In [7]:
requests_roku, responses_roku, dns_df_roku = get_http_df(crawl_data_dir_roku)
requests_amazon, responses_amazon, dns_df_amazon = get_http_df(crawl_data_dir_amazon)

Multiple messages 196
Multiple messages 100


In [8]:
roku_no_tls = roku_tcp[~roku_tcp.tls]
amazon_no_tls = amazon_tcp[~amazon_tcp.tls]

n_channels_no_tls_roku = requests_roku.channel_id.nunique()
n_channels_no_tls_amazon = requests_amazon.channel_id.nunique()

"%s of the %s Roku channels (%0.1f%%) send at last one unencrypted request" % (n_channels_no_tls_roku, n_roku, (100*n_channels_no_tls_roku)/n_roku)

'73 of the 95 Roku channels (76.8%) send at last one unencrypted request'

In [9]:
"%s of the %s Fire TV channels (%0.1f%%) send at last one unencrypted request" % (n_channels_no_tls_amazon, n_amazon, (100*n_channels_no_tls_amazon)/n_amazon)

'78 of the 82 Fire TV channels (95.1%) send at last one unencrypted request'

In [10]:
amazon_no_tls.tcp_stream.nunique(), requests_amazon.tcp_stream.nunique()

(230, 230)

In [11]:
amazon_no_tls.domain_by_dns.nunique(), requests_amazon.domain_by_dns.nunique()

(83, 84)

In [12]:
requests_amazon[requests_amazon.domain_by_dns=="amazon.com"][['channel_id', 'time', 'url', 'ip_dst', 'tcp_dstport', 'tcp_stream', 'decoded_data']]

Unnamed: 0,channel_id,time,url,ip_dst,tcp_dstport,tcp_stream,decoded_data


In [13]:
requests_amazon[requests_amazon.tcp_dstport=="443"]

Unnamed: 0,channel_id,time,cookie,post_data,host,referer,url,method,user_agent,ip_dst,...,http2,http2_type,host_by_dns,domain_by_dns,channel_name,rank,category,decoded_data,req_domain,status


In [14]:
def diff(df1, df2, column):
    return set(df1[column].unique()) - set(df2[column].unique())

In [None]:
amazon_no_tls

In [51]:
diff(amazon_no_tls, requests_amazon, "domain_by_dns")

{None}

In [60]:
print(roku_no_tls.channel_id.nunique(), 'channels')
print(requests_amazon.channel_id.nunique(), 'channels')
diff(requests_amazon, amazon_no_tls,  "channel_id")
#diff(amazon_no_tls,  requests_amazon, "channel_id")

75 channels
78 channels


{'com.HBO',
 'com.amazon.rialto.cordova.webapp.webapp8dce82b0aefe46e3ac67dc00fce12092',
 'com.amazon.rialto.cordova.webapp.webappca79b1bc532647ccb0d32c16bf4d72d2',
 'com.amazon.rialto.cordova.webapp.webappddccc2245c524d95bdeb7088fa5d7a1f',
 'com.amazon.rialto.cordova.webapp.webappf2ef4524131e47619089c0caecf6f33c',
 'com.amazon.rialto.webapp.A6ed4efc199bebae4da0e5ec21e689439c4744ab8',
 'com.beachbody.bodfiretv',
 'com.cbs.ott',
 'com.dailyburn.challenge',
 'com.demicapps.quizzer8',
 'com.gaia.firetv',
 'com.hbo.hbonow',
 'com.magellan.tv',
 'com.maz.home449',
 'com.speareducation.lobby',
 'com.tinymission.dailyworkoutsfree'}

In [62]:
amazon_no_tls[amazon_no_tls.channel_id=="com.cbs.ott"]

Unnamed: 0,tcp_stream,frame_time_epoch,ip_src,tcp_srcport,ip_dst,tcp_dstport,channel_id,mitm_attempt,host_by_dns,domain_by_dns,disconnect_blocked,timestamp,channel_name,rank,category,playback,status,tls


In [64]:
requests_amazon[requests_amazon.channel_id=="com.cbs.ott"].tcp_stream.unique()

array(['37'], dtype=object)

In [None]:
requests_amazon

In [48]:
print(roku_no_tls.domain_by_dns.nunique(), 'distinct domains')
print(roku_no_tls.host_by_dns.nunique(), 'distinct hosts')
print(roku_no_tls.channel_id.nunique(), 'channels')

123 distinct domains
191 distinct hosts
75 channels


In [49]:
print(requests_roku.domain_by_dns.nunique())
print(requests_roku.host_by_dns.nunique())
print(requests_roku.channel_id.nunique())

121
185
73


In [18]:
print(amazon_no_tls.domain_by_dns.nunique(), 'distinct domains')
print(amazon_no_tls.host_by_dns.nunique(), 'distinct hosts')
print(amazon_no_tls.ip_dst.nunique(), 'distinct IP addresses')

83 distinct domains
129 distinct hosts
141 distinct IP addresses


In [19]:
requests_roku.channel_id.nunique()

73

In [20]:
len(requests_roku)

4138

In [21]:
len(requests_amazon)

3422

In [22]:
roku_distinct_domains = roku_tcp.drop_duplicates(['domain_by_dns', 'tls'], keep='last')
amazon_distinct_domains = amazon_tcp.drop_duplicates(['domain_by_dns', 'tls'], keep='last')

In [23]:
print_unencryted_stats(roku_tcp)

NoTLS: 1818
TLS: 6430
Total: 8248
% of unencrypted connections: 22.0


In [24]:
print_unencryted_stats(amazon_tcp)

NoTLS: 1070
TLS: 3108
Total: 4178
% of unencrypted connections: 25.6


In [25]:
print_unencryted_stats(roku_distinct_domains)

NoTLS: 124
TLS: 178
Total: 302
% of unencrypted connections: 41.1


In [26]:
print_unencryted_stats(amazon_distinct_domains)

NoTLS: 84
TLS: 262
Total: 346
% of unencrypted connections: 24.3


In [27]:
pd.crosstab(roku_distinct_domains.disconnect_blocked, roku_distinct_domains.tls)

tls,False,True
disconnect_blocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,86,131
True,38,47


In [28]:
pd.crosstab(amazon_distinct_domains.disconnect_blocked, amazon_distinct_domains.tls)

tls,False,True
disconnect_blocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,61,165
True,23,97


In [29]:
len(roku_tcp), len(amazon_tcp)

(8248, 4178)

In [30]:
len(roku_distinct_domains), len(amazon_distinct_domains)

(302, 346)

# TODO: remove redirected domains
- we should not count if http is upgraded to https

In [31]:
def print_unencryted_tracker_stats(df):
    df = df.groupby(["disconnect_blocked", "tls"]).size()
    not_blocked = df[0]
    blocked = df[1]
    print("Non-tracker - %% NoTLS: %0.1f" % (100*not_blocked[NO_TLS] / (not_blocked[NO_TLS] + not_blocked[TLS])))
    print("Tracker - %% NoTLS: %0.1f" % (100*blocked[NO_TLS] / (blocked[NO_TLS] + blocked[TLS])))

#$100*blocked[NO_TLS] / (blocked[NO_TLS] + blocked[TLS]

In [32]:
print_unencryted_tracker_stats(roku_tcp)

Non-tracker - % NoTLS: 20.0
Tracker - % NoTLS: 33.0


In [33]:
print_unencryted_tracker_stats(roku_distinct_domains)

Non-tracker - % NoTLS: 39.6
Tracker - % NoTLS: 44.7


In [34]:
print_unencryted_tracker_stats(amazon_tcp)

Non-tracker - % NoTLS: 33.8
Tracker - % NoTLS: 13.9


In [35]:
print_unencryted_tracker_stats(amazon_distinct_domains)

Non-tracker - % NoTLS: 27.0
Tracker - % NoTLS: 19.2


In [36]:
pre = r"""
\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
"""

post = r"""
}
\caption{CAPTION}
\label{tab:LABEL}
\end{table}"""

In [37]:
def make_latex_table(df, label="LABEL", caption="caption",
    tablefmt="latex_booktabs", headers="keys", showindex=False):
    tabu = tabulate(df, tablefmt="latex_booktabs", headers="keys", showindex=False)
    return pre + tabu + post.replace("LABEL", label).replace("CAPTION", caption)
    

## Most common unencrypted endpoints

In [47]:
df = get_popular_domains(requests_roku)
print(make_latex_table(df, caption="Roku - Most prevalent domains contacted over unencrypted connections",
                       label="most_common_insecure_domains_roku"))



\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 domain\_by\_dns         &   Num. of channels \\
\midrule
 roku.com              &                 26 \\
 scorecardresearch.com &                 13 \\
 ifood.tv              &                 12 \\
 demdex.net            &                 11 \\
 doubleclick.net       &                  9 \\
 google-analytics.com  &                  8 \\
 akamaihd.net          &                  7 \\
 tremorhub.com         &                  7 \\
 1rx.io                &                  7 \\
 irchan.com            &                  6 \\
\bottomrule
\end{tabular}
}
\caption{Roku - Most prevalent domains contacted over unencrypted connections}
\label{tab:most_common_insecure_domains_roku}
\end{table}


In [39]:
df = get_popular_domains(requests_amazon)
print(make_latex_table(df, caption="Amazon - Most prevalent domains contacted over unencrypted connections",
                       label="most_common_insecure_domains_amazon"))



\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 domain\_by\_dns         &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                 27 \\
 scorecardresearch.com &                 16 \\
 demdex.net            &                  7 \\
 ifood.tv              &                  7 \\
 spotxchange.com       &                  5 \\
 omtrdc.net            &                  5 \\
 images-amazon.com     &                  4 \\
 nbcuni.com            &                  4 \\
 yumenetworks.com      &                  3 \\
 adsrvr.org            &                  3 \\
\bottomrule
\end{tabular}
}
\caption{Amazon - Most prevalent domains contacted over unencrypted connections}
\label{tab:most_common_insecure_domains_amazon}
\end{table}


In [40]:
get_popular_domains(roku_no_tls)

Unnamed: 0,domain_by_dns,Num. of channels
93,roku.com,26
97,scorecardresearch.com,13
53,ifood.tv,12
33,demdex.net,11
35,doubleclick.net,10
48,google-analytics.com,8
9,akamaihd.net,7
108,tremorhub.com,7
0,1rx.io,7
57,irchan.com,6


In [41]:
get_popular_domains(amazon_tcp[~amazon_tcp.tls])

Unnamed: 0,domain_by_dns,Num. of channels
4,amazon-adsystem.com,27
65,scorecardresearch.com,16
19,demdex.net,7
34,ifood.tv,7
70,spotxchange.com,5
57,omtrdc.net,5
36,images-amazon.com,4
51,nbcuni.com,4
82,yumenetworks.com,3
2,adsrvr.org,3


## Most common unencrypted tracking endpoints

In [42]:
df = get_popular_domains(roku_tcp[~roku_tcp.tls & roku_tcp.disconnect_blocked])

In [43]:
print(make_latex_table(df, caption="Roku - Most prevalent trackers that use unencrypted connections",
                       label="most_common_insecure_trackers_roku"))


\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 domain\_by\_dns         &   Num. of channels \\
\midrule
 scorecardresearch.com &                 13 \\
 demdex.net            &                 11 \\
 doubleclick.net       &                 10 \\
 google-analytics.com  &                  8 \\
 1rx.io                &                  7 \\
 tremorhub.com         &                  7 \\
 omtrdc.net            &                  6 \\
 w55c.net              &                  4 \\
 adsrvr.org            &                  4 \\
 spotxchange.com       &                  4 \\
\bottomrule
\end{tabular}
}
\caption{Roku - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_roku}
\end{table}


In [44]:
df = get_popular_domains(amazon_tcp[~amazon_tcp.tls & amazon_tcp.disconnect_blocked])
print(make_latex_table(df, caption="Amazon - Most prevalent trackers that use unencrypted connections",
                       label="most_common_insecure_trackers_amazon"))


\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 domain\_by\_dns         &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                 27 \\
 scorecardresearch.com &                 16 \\
 demdex.net            &                  7 \\
 spotxchange.com       &                  5 \\
 omtrdc.net            &                  5 \\
 yumenetworks.com      &                  3 \\
 fwmrm.net             &                  3 \\
 adsrvr.org            &                  3 \\
 rlcdn.com             &                  2 \\
 google-analytics.com  &                  2 \\
\bottomrule
\end{tabular}
}
\caption{Amazon - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_amazon}
\end{table}


## Unencrypted platform endpoints

In [45]:
amazon_tcp[~amazon_tcp.tls & (amazon_tcp.domain_by_dns == "amazon-adsystem.com")].host_by_dns.unique()

array(['aax-us-east.amazon-adsystem.com', 's.amazon-adsystem.com'],
      dtype=object)

In [46]:
roku_tcp[~roku_tcp.tls & (roku_tcp.domain_by_dns == "roku.com")].host_by_dns.unique()

array(['wwwimg.roku.com', 'api2.sr.roku.com', 'cigars.roku.com',
       'channels.roku.com'], dtype=object)

In [None]:
## HTTP