## Unencrypted connections
- Num of channels sending at least one unencrypted HTTP request
- Percentage of unencrypted HTTP requests

In [1]:
import pandas as pd
from df_utils import load_df, print_stats
from log_analysis import get_playback_detection_results, get_n_successful_channels
from nb_utils import get_popular_domains_from_reqs, get_popular_domains_from_tcp_conns
from crawl_ids import CrawlRokuTop1K, CrawlRokuSSLStrip
from crawl_ids import CrawlFireTVMITMWarmUp, CrawlFireTVMITM, CrawlFireTVNoMITM

  return f(*args, **kwds)
  return f(*args, **kwds)


### Total num. of channels

In [2]:
# Number of successfully crawled channels per platform (Roku first, then Fire TV).
n_roku, n_amazon = (
    get_n_successful_channels(crawl)
    for crawl in (CrawlRokuTop1K, CrawlFireTVNoMITM)
)
print(n_roku, n_amazon)

981 82


### Load Requests and TCP connections

In [3]:
# Load the HTTP request, HTTP response and DNS tables of the Roku crawl ...
requests_roku, responses_roku, dns_df_roku = (
    load_df(CrawlRokuTop1K, table) for table in ("http_req", "http_resp", "dns")
)

# ... and the same three tables of the Fire TV (no-MITM) crawl.
requests_amazon, responses_amazon, dns_df_amazon = (
    load_df(CrawlFireTVNoMITM, table) for table in ("http_req", "http_resp", "dns")
)

In [4]:
len(requests_roku), len(requests_amazon)

(29664, 3422)

In [5]:
# TCP connection tables for both crawls (Roku first, then Fire TV).
roku_tcp, amazon_tcp = (
    load_df(CrawlRokuTop1K, "tcp_conn"),
    load_df(CrawlFireTVNoMITM, "tcp_conn"),
)

- Only keep connections to ports 80 (HTTP) and 443 (HTTPS)
- Very few connections use other ports

In [6]:
# Keep only connections to port 80 or 443. `.copy()` detaches each filtered
# frame from the frame it was sliced from, so adding the `tls` column below
# writes to a real frame instead of a slice (no SettingWithCopyWarning).
roku_tcp = roku_tcp[roku_tcp.tcp_dstport.isin([80, 443])].copy()
amazon_tcp = amazon_tcp[amazon_tcp.tcp_dstport.isin([80, 443])].copy()

# A connection is flagged as TLS purely by destination port (443).
roku_tcp['tls'] = roku_tcp.tcp_dstport == 443
amazon_tcp['tls'] = amazon_tcp.tcp_dstport == 443

# Connections that are not flagged as TLS, i.e. the port-80 ones.
amazon_no_tls = amazon_tcp[~amazon_tcp.tls]
roku_no_tls = roku_tcp[~roku_tcp.tls]

In [7]:
NO_TLS = 0  # tls=False (0)
TLS = 1

def print_unencryted_stats(df):
    """Print how many rows of ``df`` are TLS / non-TLS and the non-TLS share.

    Args:
        df: DataFrame with a ``tls`` column holding booleans
            (True = TLS, False = no TLS).

    Prints four lines: the non-TLS count, the TLS count, the total number of
    rows, and the percentage of non-TLS rows among the rows whose ``tls``
    value is not missing.

    The counts come from boolean masks instead of indexing a
    ``value_counts()`` result with the integers 0/1. Integer lookup on a
    bool-indexed Series raises ``KeyError`` when one class is absent and, on
    pandas versions that fall back to positional lookup, silently returns the
    most frequent class instead of the requested one.
    """
    # Rows with a missing flag are ignored, as value_counts() would do.
    is_tls = df.tls.dropna().astype(bool)
    n_tls = int(is_tls.sum())
    n_no_tls = int((~is_tls).sum())
    n_total = len(df)
    n_flagged = n_no_tls + n_tls
    # Guard against an empty frame instead of dividing by zero.
    pct_no_tls = 100 * n_no_tls / n_flagged if n_flagged else 0.0
    print ("NoTLS:", n_no_tls)
    print ("TLS:", n_tls)
    print ("Total:", n_total)
    print("%% of unencrypted connections: %0.1f" % pct_no_tls)

In [8]:
# Channels that appear at least once in the HTTP request tables. The message
# below treats every such channel as having sent an unencrypted request.
n_channels_no_tls_roku = requests_roku.channel_id.nunique()
n_channels_no_tls_amazon = requests_amazon.channel_id.nunique()

# Message typo fixed: "at last one" -> "at least one".
"%s of the %s Roku channels (%0.1f%%) send at least one unencrypted request" % (n_channels_no_tls_roku, n_roku, (100*n_channels_no_tls_roku)/n_roku)

'794 of the 981 Roku channels (80.9%) send at last one unencrypted request'

In [9]:
# Message typo fixed: "at last one" -> "at least one".
"%s of the %s Fire TV channels (%0.1f%%) send at least one unencrypted request" % (n_channels_no_tls_amazon, n_amazon, (100*n_channels_no_tls_amazon)/n_amazon)

'78 of the 82 Fire TV channels (95.1%) send at last one unencrypted request'

In [10]:
print(len(amazon_no_tls), len(amazon_tcp))
print(len(roku_no_tls), len(roku_tcp))

1102 4535
9141 61639


In [11]:
print(len(requests_roku), 'distinct requests')
print(requests_roku.req_domain.nunique(), 'distinct domains')

29664 distinct requests
493 distinct domains


In [12]:
print(len(requests_amazon), 'distinct requests')
print(requests_amazon.req_domain.nunique(), 'distinct domains')

3422 distinct requests
87 distinct domains


In [13]:
# Collapse the connection tables to one row per (domain, tls) combination,
# keeping the last occurrence of each combination.
roku_distinct_domains = roku_tcp.drop_duplicates(subset=['domain', 'tls'], keep='last')
amazon_distinct_domains = amazon_tcp.drop_duplicates(subset=['domain', 'tls'], keep='last')

In [14]:
print_unencryted_stats(roku_tcp)

NoTLS: 9141
TLS: 52498
Total: 61639
% of unencrypted connections: 14.8


In [15]:
print_unencryted_stats(amazon_tcp)

NoTLS: 1102
TLS: 3433
Total: 4535
% of unencrypted connections: 24.3


In [16]:
print_unencryted_stats(roku_distinct_domains)

NoTLS: 506
TLS: 773
Total: 1279
% of unencrypted connections: 39.6


In [17]:
print_unencryted_stats(amazon_distinct_domains)

NoTLS: 87
TLS: 264
Total: 351
% of unencrypted connections: 24.8


In [18]:
pd.crosstab(roku_distinct_domains.adblocked, roku_distinct_domains.tls)

tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,392,593
True,114,180


In [19]:
pd.crosstab(amazon_distinct_domains.adblocked, amazon_distinct_domains.tls)

tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,58,137
True,29,127


In [20]:
len(roku_tcp), len(amazon_tcp)

(61639, 4535)

In [21]:
len(roku_distinct_domains), len(amazon_distinct_domains)

(1279, 351)

In [22]:
def print_unencryted_tracker_stats(df):
    """Print the unencrypted share for tracker and non-tracker rows of ``df``.

    Args:
        df: DataFrame with boolean columns ``adblocked`` (True = row is
            treated as a tracker) and ``tls`` (True = TLS).

    Returns:
        ``pd.crosstab(df.adblocked, df.tls)``: row counts per
        (adblocked, tls) combination.

    Counts are computed from boolean masks rather than by indexing a grouped
    result with the integers 0/1, so a missing combination (for example no
    unencrypted tracker rows) is reported as 0 instead of raising
    ``KeyError``, and the function no longer relies on a constant defined in
    another cell.
    """
    # Rows with a missing flag in either column are ignored, as groupby does.
    flags = df[["adblocked", "tls"]].dropna().astype(bool)
    is_tracker = flags["adblocked"]
    is_no_tls = ~flags["tls"]

    n_trackers = int(is_tracker.sum())
    n_non_trackers = int((~is_tracker).sum())
    n_tracker_no_tls = int((is_tracker & is_no_tls).sum())
    n_non_tracker_no_tls = int((~is_tracker & is_no_tls).sum())

    # Guard the percentages against an empty group.
    pct_tracker = 100 * n_tracker_no_tls / n_trackers if n_trackers else 0.0
    pct_non_tracker = 100 * n_non_tracker_no_tls / n_non_trackers if n_non_trackers else 0.0

    print("Tracker -     Unencrypted: %0.1f%% (%s of %s)" % (pct_tracker, n_tracker_no_tls, n_trackers))
    print("Non-tracker - Unencrypted: %0.1f%% (%s of %s)" % (pct_non_tracker, n_non_tracker_no_tls, n_non_trackers))
    return pd.crosstab(df.adblocked, df.tls)


In [23]:
print_unencryted_tracker_stats(roku_tcp)

Tracker -     Unencrypted: 19.0% (2447 of 12906)
Non-tracker - Unencrypted: 13.7% (6694 of 48733)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,6694,42039
True,2447,10459


In [24]:
print_unencryted_tracker_stats(amazon_tcp)

Tracker -     Unencrypted: 15.5% (307 of 1976)
Non-tracker - Unencrypted: 31.1% (795 of 2559)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,795,1764
True,307,1669


In [25]:
print_unencryted_tracker_stats(roku_distinct_domains)

Tracker -     Unencrypted: 38.8% (114 of 294)
Non-tracker - Unencrypted: 39.8% (392 of 985)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,392,593
True,114,180


In [26]:
print_unencryted_tracker_stats(amazon_distinct_domains)

Tracker -     Unencrypted: 18.6% (29 of 156)
Non-tracker - Unencrypted: 29.7% (58 of 195)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,58,137
True,29,127


In [27]:
tcp_top = get_popular_domains_from_tcp_conns(roku_tcp[roku_tcp.adblocked])
tcp_top

Unnamed: 0,domain,Num. of channels
108,doubleclick.net,996
132,google-analytics.com,362
194,scorecardresearch.com,178
204,spotxchange.com,169
134,googlesyndication.com,114
227,vimeo.com,82
215,tremorhub.com,78
206,stickyadstv.com,73
104,demdex.net,66
15,adrta.com,63


In [28]:
from nb_utils import make_latex_table

In [30]:
df = get_popular_domains_from_reqs(requests_amazon)
df

Unnamed: 0,req_domain,Num. of channels
5,amazon-adsystem.com,59
68,scorecardresearch.com,18
36,ifood.tv,7
21,demdex.net,7
60,omtrdc.net,5
73,spotxchange.com,5
53,nbcuni.com,4
38,images-amazon.com,4
34,google.com,3
45,lightcast.com,3


## Most common unencrypted endpoints

In [80]:
requests_roku.columns

Index(['channel_id', 'time', 'cookie', 'post_data', 'host', 'referer', 'url',
       'method', 'user_agent', 'ip_dst', 'tcp_dstport', 'tcp_stream', 'http2',
       'http2_type', 'host_by_dns', 'domain_by_dns', 'channel_name', 'rank',
       'category', 'decoded_data', 'req_domain', 'status'],
      dtype='object')

In [81]:
df = get_popular_domains_from_reqs(requests_roku)
df

Unnamed: 0,req_domain,Num. of channels
185,google-analytics.com,178
345,roku.com,150
133,doubleclick.net,147
357,scorecardresearch.com,113
208,ifood.tv,98
221,irchan.com,79
273,monarchads.com,68
381,stickyadstv.com,66
27,akamaihd.net,46
378,spotxchange.com,37


In [82]:
print(make_latex_table(df, caption="Roku - Most prevalent domains contacted over unencrypted connections",
                       label="most_common_insecure_domains_roku"))



\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 google-analytics.com  &                178 \\
 roku.com              &                150 \\
 doubleclick.net       &                147 \\
 scorecardresearch.com &                113 \\
 ifood.tv              &                 98 \\
 irchan.com            &                 79 \\
 monarchads.com        &                 68 \\
 stickyadstv.com       &                 66 \\
 akamaihd.net          &                 46 \\
 spotxchange.com       &                 37 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Roku - Most prevalent domains contacted over unencrypted connections}
\label{tab:most_common_insecure_domains_roku}
\end{table}


In [83]:
df = get_popular_domains_from_reqs(requests_amazon)
df

Unnamed: 0,req_domain,Num. of channels
5,amazon-adsystem.com,59
68,scorecardresearch.com,18
36,ifood.tv,7
21,demdex.net,7
60,omtrdc.net,5
73,spotxchange.com,5
53,nbcuni.com,4
38,images-amazon.com,4
34,google.com,3
45,lightcast.com,3


In [84]:
print(make_latex_table(df, caption="Amazon - Most prevalent domains contacted over unencrypted connections",
                       label="most_common_insecure_domains_amazon"))



\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                 59 \\
 scorecardresearch.com &                 18 \\
 ifood.tv              &                  7 \\
 demdex.net            &                  7 \\
 omtrdc.net            &                  5 \\
 spotxchange.com       &                  5 \\
 nbcuni.com            &                  4 \\
 images-amazon.com     &                  4 \\
 google.com            &                  3 \\
 lightcast.com         &                  3 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Amazon - Most prevalent domains contacted over unencrypted connections}
\label{tab:most_common_insecure_domains_amazon}
\end{table}


## Most common unencrypted tracking endpoints

In [None]:
# TODO - Update using adblock

In [34]:
# Build the Disconnect blocklist parser in this cell: it is needed by the
# lambda below, and without it a top-to-bottom run fails with NameError.
from trackingprotection_tools import DisconnectParser
disconnect = DisconnectParser(blocklist="disconnect/services.json")

# Flag each request whose host is blocked by the Disconnect list; empty hosts
# are flagged False.
# NOTE(review): the result is stored as `adblocked`, but the following cells
# read `disconnect_blocked` from the request frames - confirm which column
# name is intended.
requests_amazon['adblocked'] = requests_amazon['host'].map(
            lambda x: disconnect.should_block("http://" + x) if len(x) else False)

In [39]:
requests_amazon.disconnect_blocked.value_counts()

False    2750
True      672
Name: disconnect_blocked, dtype: int64

In [40]:
requests_roku.disconnect_blocked.value_counts()

False    23357
True      6307
Name: disconnect_blocked, dtype: int64

In [42]:
from trackingprotection_tools import DisconnectParser
disconnect = DisconnectParser(blocklist="disconnect/services.json")


In [45]:
# Most prevalent Roku request domains, annotated with whether the Disconnect
# list blocks each domain (an empty domain is annotated False).
df = get_popular_domains_from_reqs(requests_roku)
df['disconnect_blocked'] = [
    disconnect.should_block("http://" + domain) if len(domain) else False
    for domain in df['req_domain']
]
df

Unnamed: 0,req_domain,Num. of channels,disconnect_blocked
185,google-analytics.com,178,True
345,roku.com,150,False
133,doubleclick.net,147,True
357,scorecardresearch.com,113,True
208,ifood.tv,98,False
221,irchan.com,79,False
273,monarchads.com,68,False
381,stickyadstv.com,66,True
27,akamaihd.net,46,False
378,spotxchange.com,37,True


In [131]:
print(make_latex_table(df, caption="Roku - Most prevalent trackers that use unencrypted connections",
                       label="most_common_insecure_trackers_roku"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 scorecardresearch.com &                 14 \\
 demdex.net            &                 11 \\
 doubleclick.net       &                  9 \\
 google-analytics.com  &                  8 \\
 1rx.io                &                  7 \\
 tremorhub.com         &                  7 \\
 omtrdc.net            &                  6 \\
 w55c.net              &                  4 \\
 adsrvr.org            &                  4 \\
 spotxchange.com       &                  4 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Roku - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_roku}
\end{table}


In [50]:
df = get_popular_domains_from_reqs(requests_amazon[requests_amazon.disconnect_blocked])
#df['disconnect_blocked'] = df['req_domain'].map(
#            lambda x: disconnect.should_block("http://" + x) if len(x) else False)

print(make_latex_table(df, caption="Amazon - Most prevalent trackers that use unencrypted connections",
                       label="most_common_insecure_trackers_amazon"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                 59 \\
 scorecardresearch.com &                 18 \\
 demdex.net            &                  7 \\
 spotxchange.com       &                  5 \\
 omtrdc.net            &                  5 \\
 yumenetworks.com      &                  3 \\
 fwmrm.net             &                  3 \\
 adsrvr.org            &                  3 \\
 rlcdn.com             &                  2 \\
 google-analytics.com  &                  2 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Amazon - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_amazon}
\end{table}


## Unencrypted platform endpoints

In [135]:
requests_amazon[ (requests_amazon.req_domain == "amazon-adsystem.com")].host.unique()

array(['aax-us-east.amazon-adsystem.com', 's.amazon-adsystem.com'],
      dtype=object)

In [136]:
requests_roku[ (requests_roku.req_domain == "roku.com")].host.unique()

array(['channels.roku.com', 'cigars.roku.com'], dtype=object)

In [None]:
## HTTP