## Unencrypted connections
- Num of channels sending at least one unencrypted HTTP request
- Percentage of unencrypted HTTP requests

In [1]:
import pandas as pd
from df_utils import load_df, print_stats
from log_analysis import get_playback_detection_results, get_n_successful_channels
from nb_utils import get_popular_domains_from_reqs, get_popular_domains_from_tcp_conns, make_latex_table
from crawl_ids import CrawlRokuTop1KNoMITM, CrawlFireTVTop1KNoMITM

  return f(*args, **kwds)
  return f(*args, **kwds)


### Total num. of channels

In [2]:
n_roku = get_n_successful_channels(CrawlRokuTop1KNoMITM)
n_amazon = get_n_successful_channels(CrawlFireTVTop1KNoMITM)
print(n_roku, n_amazon)

981 956


### Load Requests and TCP connections

In [3]:
requests_roku = load_df(CrawlRokuTop1KNoMITM, "http_req")
responses_roku = load_df(CrawlRokuTop1KNoMITM, "http_resp")
dns_df_roku = load_df(CrawlRokuTop1KNoMITM, "dns")

requests_amazon = load_df(CrawlFireTVTop1KNoMITM, "http_req")
responses_amazon = load_df(CrawlFireTVTop1KNoMITM, "http_resp")
dns_df_amazon = load_df(CrawlFireTVTop1KNoMITM, "dns")

In [4]:
roku_tcp = load_df(CrawlRokuTop1KNoMITM, "tcp_conn")
amazon_tcp = load_df(CrawlFireTVTop1KNoMITM, "tcp_conn")

- Only take port 80 and 443
- Very few connections to other ports

In [6]:
n_channels_no_tls_roku = requests_roku.channel_id.nunique()
n_channels_no_tls_amazon = requests_amazon.channel_id.nunique()

"%s of the %s Roku channels (%0.1f%%) send at last one unencrypted request" % (n_channels_no_tls_roku, n_roku, (100*n_channels_no_tls_roku)/n_roku)

'794 of the 981 Roku channels (80.9%) send at last one unencrypted request'

In [7]:
"%s of the %s Fire TV channels (%0.1f%%) send at last one unencrypted request" % (n_channels_no_tls_amazon, n_amazon, (100*n_channels_no_tls_amazon)/n_amazon)

'762 of the 956 Fire TV channels (79.7%) send at last one unencrypted request'

### Amazon top unencrypted

In [22]:
from log_analysis import get_https_upgrade_redirectors
def get_most_common_unencrypted(crawl_name):
    http_req = load_df(crawl_name, "http_req")
    http_resp = load_df(crawl_name, "http_resp")

    redirects, https_upgrades, cross_origin_redirects = get_https_upgrade_redirectors(
        crawl_name, http_req, http_resp)
    https_upgrade_domains = set(https_upgrades.req_domain.unique())
    df = get_popular_domains_from_reqs(http_req, 10)
    df['https_upgrade'] = df.req_domain.map(lambda x: x in https_upgrade_domains)
    # Omit the https_upgrade column if none of the domains upgrade to HTTPS
    if list(df['https_upgrade'].unique()) == [False]:
        del df['https_upgrade']
    return df

In [32]:
df = get_most_common_unencrypted(CrawlFireTVTop1KNoMITM)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  http_resp['url'] = http_resp.apply(lambda x: get_resp_url(x, req_urls), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  lambda x: get_fld(x, fail_silently=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  and x.req_domain==x.loc_domain), axis=1)


In [33]:
print(make_latex_table(df, label="most_common_insecure_domains_amazon",
                       caption="Most prevalent domains contacted over unencrypted connections (Amazon)"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Req domain            &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                392 \\
 scorecardresearch.com &                122 \\
 images-amazon.com     &                 58 \\
 ifood.tv              &                 51 \\
 cloudinary.com        &                 31 \\
 titantv.com           &                 29 \\
 spotxchange.com       &                 28 \\
 wsi.com               &                 27 \\
 cdn01.net             &                 25 \\
 lightcast.com         &                 25 \\
\bottomrule
\end{tabular}
%}
\caption{Most prevalent domains contacted over unencrypted connections (Amazon)}
\label{tab:most_common_insecure_domains_amazon}
\end{table}


### Roku top unencrypted

In [34]:
df = get_most_common_unencrypted(CrawlRokuTop1KNoMITM)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  http_resp['url'] = http_resp.apply(lambda x: get_resp_url(x, req_urls), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  lambda x: get_fld(x, fail_silently=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  and x.req_domain==x.loc_domain), axis=1)


In [35]:
print(make_latex_table(df, label="most_common_insecure_domains_roku",
                       caption="Most prevalent domains contacted over unencrypted connections (Roku)"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Req domain            &   Num. of channels \\
\midrule
 google-analytics.com  &                178 \\
 roku.com              &                150 \\
 doubleclick.net       &                147 \\
 scorecardresearch.com &                113 \\
 ifood.tv              &                 98 \\
 irchan.com            &                 79 \\
 monarchads.com        &                 68 \\
 stickyadstv.com       &                 66 \\
 akamaihd.net          &                 46 \\
 spotxchange.com       &                 37 \\
\bottomrule
\end{tabular}
%}
\caption{Most prevalent domains contacted over unencrypted connections (Roku)}
\label{tab:most_common_insecure_domains_roku}
\end{table}


In [5]:
roku_tcp = roku_tcp[roku_tcp.tcp_dstport.isin([80, 443])]
amazon_tcp = amazon_tcp[amazon_tcp.tcp_dstport.isin([80, 443])]

roku_tcp['tls'] = roku_tcp.tcp_dstport.map(lambda x: x == 443)
amazon_tcp['tls'] = amazon_tcp.tcp_dstport.map(lambda x: x == 443)

amazon_no_tls = amazon_tcp[~amazon_tcp.tls]
roku_no_tls = roku_tcp[~roku_tcp.tls]

In [13]:
roku_distinct_domains = roku_tcp.drop_duplicates(['domain', 'tls'], keep='last')
amazon_distinct_domains = amazon_tcp.drop_duplicates(['domain', 'tls'], keep='last')

In [7]:
NO_TLS = 0  # tls=False (0)
TLS = 1

def print_unencryted_stats(df):
    res = df.tls.value_counts()
    n_total = len(df)
    print ("NoTLS:", res[NO_TLS])
    print ("TLS:", res[TLS])
    print ("Total:", n_total)
    print("%% of unencrypted connections: %0.1f" % (100*res[NO_TLS] / (res[NO_TLS] + res[TLS])))

In [14]:
print_unencryted_stats(roku_tcp)

NoTLS: 9141
TLS: 52498
Total: 61639
% of unencrypted connections: 14.8


In [15]:
print_unencryted_stats(amazon_tcp)

NoTLS: 1102
TLS: 3433
Total: 4535
% of unencrypted connections: 24.3


In [16]:
print_unencryted_stats(roku_distinct_domains)

NoTLS: 506
TLS: 773
Total: 1279
% of unencrypted connections: 39.6


In [17]:
print_unencryted_stats(amazon_distinct_domains)

NoTLS: 87
TLS: 264
Total: 351
% of unencrypted connections: 24.8


In [18]:
pd.crosstab(roku_distinct_domains.adblocked, roku_distinct_domains.tls)

tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,392,593
True,114,180


In [19]:
pd.crosstab(amazon_distinct_domains.adblocked, amazon_distinct_domains.tls)

tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,58,137
True,29,127


## Most common unencrypted tracking endpoints

In [45]:
df = get_popular_domains_from_reqs(requests_roku)
df

Unnamed: 0,req_domain,Num. of channels,disconnect_blocked
185,google-analytics.com,178,True
345,roku.com,150,False
133,doubleclick.net,147,True
357,scorecardresearch.com,113,True
208,ifood.tv,98,False
221,irchan.com,79,False
273,monarchads.com,68,False
381,stickyadstv.com,66,True
27,akamaihd.net,46,False
378,spotxchange.com,37,True


In [131]:
print(make_latex_table(
    df,
    caption="Roku - Most prevalent trackers that use unencrypted connections",
    label="most_common_insecure_trackers_roku"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 scorecardresearch.com &                 14 \\
 demdex.net            &                 11 \\
 doubleclick.net       &                  9 \\
 google-analytics.com  &                  8 \\
 1rx.io                &                  7 \\
 tremorhub.com         &                  7 \\
 omtrdc.net            &                  6 \\
 w55c.net              &                  4 \\
 adsrvr.org            &                  4 \\
 spotxchange.com       &                  4 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Roku - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_roku}
\end{table}


In [50]:
df = get_popular_domains_from_reqs(requests_amazon[requests_amazon.disconnect_blocked])
#df['disconnect_blocked'] = df['req_domain'].map(
#            lambda x: disconnect.should_block("http://" + x) if len(x) else False)

print(make_latex_table(df, caption="Amazon - Most prevalent trackers that use unencrypted connections",
                       label="most_common_insecure_trackers_amazon"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                 59 \\
 scorecardresearch.com &                 18 \\
 demdex.net            &                  7 \\
 spotxchange.com       &                  5 \\
 omtrdc.net            &                  5 \\
 yumenetworks.com      &                  3 \\
 fwmrm.net             &                  3 \\
 adsrvr.org            &                  3 \\
 rlcdn.com             &                  2 \\
 google-analytics.com  &                  2 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Amazon - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_amazon}
\end{table}


## Unencrypted platform endpoints

In [135]:
requests_amazon[ (requests_amazon.req_domain == "amazon-adsystem.com")].host.unique()

array(['aax-us-east.amazon-adsystem.com', 's.amazon-adsystem.com'],
      dtype=object)

In [136]:
requests_roku[ (requests_roku.req_domain == "roku.com")].host.unique()

array(['channels.roku.com', 'cigars.roku.com'], dtype=object)

In [None]:
## HTTP

In [22]:
def print_unencryted_tracker_stats(df):
    grouped = df.groupby(["adblocked", "tls"]).size()
    non_tracker = grouped[0]
    tracker = grouped[1]
    n_trackers = sum(tracker)
    n_non_trackers = sum(non_tracker)
    print("Tracker -     Unencrypted: %0.1f%% (%s of %s)" % ((100*tracker[NO_TLS] / n_trackers), tracker[NO_TLS], n_trackers))
    print("Non-tracker - Unencrypted: %0.1f%% (%s of %s)" % ((100*non_tracker[NO_TLS] / n_non_trackers), non_tracker[NO_TLS], n_non_trackers))
    return pd.crosstab(df.adblocked, df.tls)


In [23]:
print_unencryted_tracker_stats(roku_tcp)

Tracker -     Unencrypted: 19.0% (2447 of 12906)
Non-tracker - Unencrypted: 13.7% (6694 of 48733)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,6694,42039
True,2447,10459


In [24]:
print_unencryted_tracker_stats(amazon_tcp)

Tracker -     Unencrypted: 15.5% (307 of 1976)
Non-tracker - Unencrypted: 31.1% (795 of 2559)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,795,1764
True,307,1669


In [25]:
print_unencryted_tracker_stats(roku_distinct_domains)

Tracker -     Unencrypted: 38.8% (114 of 294)
Non-tracker - Unencrypted: 39.8% (392 of 985)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,392,593
True,114,180


In [26]:
print_unencryted_tracker_stats(amazon_distinct_domains)

Tracker -     Unencrypted: 18.6% (29 of 156)
Non-tracker - Unencrypted: 29.7% (58 of 195)


tls,False,True
adblocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,58,137
True,29,127


## Most common unencrypted endpoints

In [81]:
df = get_popular_domains_from_reqs(requests_roku)
df

Unnamed: 0,req_domain,Num. of channels
185,google-analytics.com,178
345,roku.com,150
133,doubleclick.net,147
357,scorecardresearch.com,113
208,ifood.tv,98
221,irchan.com,79
273,monarchads.com,68
381,stickyadstv.com,66
27,akamaihd.net,46
378,spotxchange.com,37


In [82]:
print(make_latex_table(df, caption="Roku - Most prevalent domains contacted over unencrypted connections",
                       label="most_common_insecure_domains_roku"))



\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 google-analytics.com  &                178 \\
 roku.com              &                150 \\
 doubleclick.net       &                147 \\
 scorecardresearch.com &                113 \\
 ifood.tv              &                 98 \\
 irchan.com            &                 79 \\
 monarchads.com        &                 68 \\
 stickyadstv.com       &                 66 \\
 akamaihd.net          &                 46 \\
 spotxchange.com       &                 37 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Roku - Most prevalent domains contacted over unencrypted connections}
\label{tab:most_common_insecure_domains_roku}
\end{table}


In [83]:
df = get_popular_domains_from_reqs(requests_amazon)
df

Unnamed: 0,req_domain,Num. of channels
5,amazon-adsystem.com,59
68,scorecardresearch.com,18
36,ifood.tv,7
21,demdex.net,7
60,omtrdc.net,5
73,spotxchange.com,5
53,nbcuni.com,4
38,images-amazon.com,4
34,google.com,3
45,lightcast.com,3


In [84]:
print(make_latex_table(df, caption="Amazon - Most prevalent domains contacted over unencrypted connections",
                       label="most_common_insecure_domains_amazon"))



\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 req\_domain            &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                 59 \\
 scorecardresearch.com &                 18 \\
 ifood.tv              &                  7 \\
 demdex.net            &                  7 \\
 omtrdc.net            &                  5 \\
 spotxchange.com       &                  5 \\
 nbcuni.com            &                  4 \\
 images-amazon.com     &                  4 \\
 google.com            &                  3 \\
 lightcast.com         &                  3 \\
\bottomrule
\end{tabular}
%}resizebox
\caption{Amazon - Most prevalent domains contacted over unencrypted connections}
\label{tab:most_common_insecure_domains_amazon}
\end{table}
