## Unencrypted connections
- Number of channels sending at least one unencrypted HTTP request
- Percentage of unencrypted HTTP requests

In [1]:
import numpy as np
import pandas as pd
from log_analysis import (print_crawl_summary, load_timestamps_from_crawl_data,
                          get_distinct_tcp_conns, get_crawl_data_path,
                          load_dns_data, get_crawl_status, get_http_df, get_n_successful_channels)

from os.path import isdir, join
from datetime import datetime
from glob import glob
from tabulate import tabulate
import seaborn as sns

from crawl_ids import CrawlRokuNoMITM, CrawlFireTVNoMITM
from nb_utils import get_popular_domains
# Crawl directory names, kept for reference. They match the directories that
# appear in the "Loading distinct TCP connections from ..." output further down.
#ROKU_NO_MITM_CRAWL = 'roku-data-20190501-031836'
#AMAZON_NO_MITM_CRAWL = 'amazon-data-20190508-202449'

# Data locations for the two crawls, looked up from their crawl IDs.
crawl_data_dir_roku = get_crawl_data_path(CrawlRokuNoMITM)
crawl_data_dir_amazon = get_crawl_data_path(CrawlFireTVNoMITM)

  return f(*args, **kwds)
  return f(*args, **kwds)


### Total num. of channels

In [2]:
n_roku = get_n_successful_channels(crawl_data_dir_roku)
n_amazon = get_n_successful_channels(crawl_data_dir_amazon)
print(n_roku, n_amazon)

95 82


### Load TCP data

In [5]:
roku_tcp = get_distinct_tcp_conns(crawl_data_dir_roku)
amazon_tcp = get_distinct_tcp_conns(crawl_data_dir_amazon)

Loading distinct TCP connections from /media/gacar/Data/iot-house/crawl-data/roku-data-20190501-031836/post-process 
Loading distinct TCP connections from /media/gacar/Data/iot-house/crawl-data/amazon-data-20190508-202449/post-process 


- Keep only connections to ports 80 (HTTP) and 443 (HTTPS)
- Very few connections go to other ports

In [9]:
# Keep only connections to ports 80 and 443. `.copy()` turns each filtered
# result into an independent frame, so adding the `tls` column below no longer
# triggers pandas' SettingWithCopyWarning.
roku_tcp = roku_tcp[roku_tcp.tcp_dstport.isin([80, 443])].copy()
amazon_tcp = amazon_tcp[amazon_tcp.tcp_dstport.isin([80, 443])].copy()

# A connection is flagged as TLS when its destination port is 443
# (vectorised comparison instead of a per-row lambda).
roku_tcp['tls'] = roku_tcp.tcp_dstport == 443
amazon_tcp['tls'] = amazon_tcp.tcp_dstport == 443

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [46]:
# Integer aliases for the two values of the boolean `tls` column.
# Kept for backward compatibility with any cell that still refers to them.
NO_TLS = 0  # tls=False (0)
TLS = 1     # tls=True (1)


def print_unencryted_stats(df):
    """Print TLS / non-TLS row counts and the share of unencrypted rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a boolean ``tls`` column (True = TLS, False = no TLS).

    Prints four lines: ``NoTLS``, ``TLS``, ``Total`` (``len(df)``) and the
    percentage of non-TLS rows among the rows that have a TLS flag.

    The counts are computed with explicit boolean comparisons rather than
    ``value_counts()[0]`` / ``[1]``, so the function neither depends on how
    pandas resolves integer keys against a boolean index nor raises
    ``KeyError`` when one of the two classes is absent.
    """
    n_no_tls = int(df.tls.eq(False).sum())
    n_tls = int(df.tls.eq(True).sum())
    n_total = len(df)

    print("NoTLS:", n_no_tls)
    print("TLS:", n_tls)
    print("Total:", n_total)

    n_flagged = n_no_tls + n_tls
    if n_flagged == 0:
        # No flagged rows: a percentage is undefined, so avoid dividing by zero.
        print("% of unencrypted connections: n/a")
        return
    print("%% of unencrypted connections: %0.1f" % (100.0 * n_no_tls / n_flagged))

In [47]:
# Collapse each crawl to one row per (domain, TLS flag) pair, keeping the
# last occurrence of every pair.
roku_distinct_domains = roku_tcp.drop_duplicates(
    subset=['domain_by_dns', 'tls'],
    keep='last',
)
amazon_distinct_domains = amazon_tcp.drop_duplicates(
    subset=['domain_by_dns', 'tls'],
    keep='last',
)

In [48]:
print_unencryted_stats(roku_tcp)

NoTLS: 1818
TLS: 6430
Total: 8248
% of unencrypted connections: 22.0


In [49]:
print_unencryted_stats(amazon_tcp)

NoTLS: 1070
TLS: 3108
Total: 4178
% of unencrypted connections: 25.6


In [50]:
print_unencryted_stats(roku_distinct_domains)

NoTLS: 124
TLS: 178
Total: 302
% of unencrypted connections: 41.1


In [51]:
print_unencryted_stats(amazon_distinct_domains)

NoTLS: 84
TLS: 262
Total: 346
% of unencrypted connections: 24.3


In [53]:
pd.crosstab(roku_distinct_domains.disconnect_blocked, roku_distinct_domains.tls)

tls,False,True
disconnect_blocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,86,131
True,38,47


In [54]:
pd.crosstab(amazon_distinct_domains.disconnect_blocked, amazon_distinct_domains.tls)

tls,False,True
disconnect_blocked,Unnamed: 1_level_1,Unnamed: 2_level_1
False,61,165
True,23,97


In [None]:
len(roku_tcp), len(amazon_tcp)

In [None]:
len(roku_distinct_domains), len(amazon_distinct_domains)

# TODO: remove redirected domains
- We should not count a connection as unencrypted if the HTTP request is upgraded (redirected) to HTTPS.

In [68]:
def print_unencryted_tracker_stats(df):
    """Print the share of non-TLS rows separately for non-trackers and trackers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with boolean columns ``disconnect_blocked`` (True is reported
        as "Tracker", False as "Non-tracker") and ``tls`` (True = TLS).

    Prints one line per group, ``"<group> - % NoTLS: <pct>"``.

    Each group is selected with an explicit boolean comparison instead of
    indexing a grouped Series with ``0`` / ``1``. A missing
    (group, tls) combination therefore counts as zero rather than raising
    ``KeyError``, and the result does not depend on how pandas resolves
    integer keys against boolean index labels.
    """
    for group_name, is_tracker in (("Non-tracker", False), ("Tracker", True)):
        group = df[df.disconnect_blocked.eq(is_tracker)]
        n_no_tls = int(group.tls.eq(False).sum())
        n_tls = int(group.tls.eq(True).sum())
        n_flagged = n_no_tls + n_tls
        if n_flagged == 0:
            # Empty group: a percentage is undefined, so avoid dividing by zero.
            print("%s - %% NoTLS: n/a" % group_name)
            continue
        print("%s - %% NoTLS: %0.1f" % (group_name, 100.0 * n_no_tls / n_flagged))

In [72]:
print_unencryted_tracker_stats(roku_tcp)

Non-tracker - % NoTLS: 20.0
Tracker - % NoTLS: 33.0


In [69]:
print_unencryted_tracker_stats(roku_distinct_domains)

Non-tracker - % NoTLS: 39.6
Tracker - % NoTLS: 44.7


In [71]:
print_unencryted_tracker_stats(amazon_tcp)

Non-tracker - % NoTLS: 33.8
Tracker - % NoTLS: 13.9


In [70]:
print_unencryted_tracker_stats(amazon_distinct_domains)

Non-tracker - % NoTLS: 27.0
Tracker - % NoTLS: 19.2


## Most common unencrypted endpoints

In [75]:
get_popular_domains(roku_tcp[~roku_tcp.tls])

Unnamed: 0,domain_by_dns,Num. of channels
93,roku.com,26
97,scorecardresearch.com,13
53,ifood.tv,12
33,demdex.net,11
35,doubleclick.net,10
48,google-analytics.com,8
9,akamaihd.net,7
108,tremorhub.com,7
0,1rx.io,7
57,irchan.com,6


In [76]:
get_popular_domains(amazon_tcp[~amazon_tcp.tls])

Unnamed: 0,domain_by_dns,Num. of channels
4,amazon-adsystem.com,27
65,scorecardresearch.com,16
19,demdex.net,7
34,ifood.tv,7
70,spotxchange.com,5
57,omtrdc.net,5
36,images-amazon.com,4
51,nbcuni.com,4
82,yumenetworks.com,3
2,adsrvr.org,3


In [101]:
# Opening half of the LaTeX wrapper placed before the tabular body:
# starts a `table` float and a \resizebox scaled to the column width.
pre = r"""
\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
"""

# Closing half of the wrapper: closes the \resizebox brace and ends the float.
# CAPTION and LABEL are placeholders substituted by make_latex_table().
post = r"""
}
\caption{CAPTION}
\label{tab:LABEL}
\end{table}"""

In [111]:
def make_latex_table(df, label="LABEL", caption="caption",
                     tablefmt="latex_booktabs", headers="keys", showindex=False):
    """Render ``df`` as a LaTeX table wrapped in the ``pre`` / ``post`` templates.

    Parameters
    ----------
    df : tabular data accepted by ``tabulate`` (e.g. a pandas DataFrame).
    label : str
        Replaces the ``LABEL`` placeholder in ``post``.
    caption : str
        Replaces the ``CAPTION`` placeholder in ``post``.
    tablefmt, headers, showindex :
        Forwarded to ``tabulate``. These used to be accepted but ignored
        because the call hard-coded its arguments. The defaults equal the
        previously hard-coded values, so existing calls produce the same
        output.

    Returns
    -------
    str
        ``pre`` + the tabulated body + ``post`` with placeholders filled in.
    """
    tabu = tabulate(df, tablefmt=tablefmt, headers=headers, showindex=showindex)
    return pre + tabu + post.replace("LABEL", label).replace("CAPTION", caption)
    

## Most common unencrypted tracking endpoints

In [112]:
df = get_popular_domains(roku_tcp[~roku_tcp.tls & roku_tcp.disconnect_blocked])

In [113]:
print(make_latex_table(df, caption="Roku - Most prevalent trackers that use unencrypted connections",
                       label="most_common_insecure_trackers_roku"))


\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 domain\_by\_dns         &   Num. of channels \\
\midrule
 scorecardresearch.com &                 13 \\
 demdex.net            &                 11 \\
 doubleclick.net       &                 10 \\
 google-analytics.com  &                  8 \\
 1rx.io                &                  7 \\
 tremorhub.com         &                  7 \\
 omtrdc.net            &                  6 \\
 w55c.net              &                  4 \\
 adsrvr.org            &                  4 \\
 spotxchange.com       &                  4 \\
\bottomrule
\end{tabular}
}
\caption{Roku - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_roku}
\end{table}


In [114]:
df = get_popular_domains(amazon_tcp[~amazon_tcp.tls & amazon_tcp.disconnect_blocked])
print(make_latex_table(df, caption="Amazon - Most prevalent trackers that use unencrypted connections",
                       label="most_common_insecure_trackers_amazon"))


\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 domain\_by\_dns         &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                 27 \\
 scorecardresearch.com &                 16 \\
 demdex.net            &                  7 \\
 spotxchange.com       &                  5 \\
 omtrdc.net            &                  5 \\
 yumenetworks.com      &                  3 \\
 fwmrm.net             &                  3 \\
 adsrvr.org            &                  3 \\
 rlcdn.com             &                  2 \\
 google-analytics.com  &                  2 \\
\bottomrule
\end{tabular}
}
\caption{Amazon - Most prevalent trackers that use unencrypted connections}
\label{tab:most_common_insecure_trackers_amazon}
\end{table}


## Unencrypted platform endpoints

In [79]:
amazon_tcp[~amazon_tcp.tls & (amazon_tcp.domain_by_dns == "amazon-adsystem.com")].host_by_dns.unique()

array(['aax-us-east.amazon-adsystem.com', 's.amazon-adsystem.com'],
      dtype=object)

In [80]:
roku_tcp[~roku_tcp.tls & (roku_tcp.domain_by_dns == "roku.com")].host_by_dns.unique()

array(['wwwimg.roku.com', 'api2.sr.roku.com', 'cigars.roku.com',
       'channels.roku.com'], dtype=object)

In [None]:
## HTTP

In [None]:
requests_roku, responses_roku, dns_df_roku = get_http_df(crawl_data_dir_roku)
requests_amazon, responses_amazon, dns_df_amazon = get_http_df(crawl_data_dir_amazon)

In [8]:
requests_roku.channel_id.nunique(), requests_amazon.channel_id.nunique()

NameError: name 'requests_roku' is not defined