## 1K  tables

In [14]:
from df_utils import load_df
from crawl_ids import CrawlRokuTop1KMITM
import matplotlib.pyplot as plt
import seaborn as sns
from log_analysis import get_https_upgrade_redirectors
from nb_utils import get_popular_domains_from_tcp_conns, make_latex_table, get_popular_domains_from_reqs
from nb_utils import get_channels_with_most_domains

## Load Roku TCP connection data from Roku 1K crawl

### Popular domains

In [15]:
roku_tcp = load_df(CrawlRokuTop1KMITM, "tcp_conn")
df = get_popular_domains_from_tcp_conns(roku_tcp)
df

Unnamed: 0,domain,Num. of channels
738,roku.com,1000
291,doubleclick.net,975
397,google-analytics.com,360
810,spotxchange.com,212
764,scorecardresearch.com,212
399,googlesyndication.com,178
447,imrworldwide.com,113
892,tremorhub.com,109
451,innovid.com,102
410,gvt1.com,101


### Popular trackers

In [16]:
roku_tcp = load_df(CrawlRokuTop1KMITM, "tcp_conn")
df = get_popular_domains_from_tcp_conns(roku_tcp[roku_tcp.adblocked])
df

Unnamed: 0,domain,Num. of channels
111,doubleclick.net,975
138,google-analytics.com,360
197,scorecardresearch.com,212
204,spotxchange.com,212
140,googlesyndication.com,178
146,imrworldwide.com,113
215,tremorhub.com,109
147,innovid.com,102
1,2mdn.net,88
225,vimeo.com,86


### Tables
-- note: GA is 271 of 677 channels

In [17]:
#TODO combine tables
print(make_latex_table(df))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Domain                &   Num. of channels \\
\midrule
 doubleclick.net       &                975 \\
 google-analytics.com  &                360 \\
 scorecardresearch.com &                212 \\
 spotxchange.com       &                212 \\
 googlesyndication.com &                178 \\
 imrworldwide.com      &                113 \\
 tremorhub.com         &                109 \\
 innovid.com           &                102 \\
 2mdn.net              &                 88 \\
 vimeo.com             &                 86 \\
\bottomrule
\end{tabular}
%}
\caption{caption}
\label{tab:LABEL}
\end{table}


In [18]:
df = get_channels_with_most_domains(roku_tcp[roku_tcp.adblocked], 10)
df

Unnamed: 0,channel_name,rank,category,# tracking  domains
726,StarGazer,1012,Special Interest,50
668,Rock Paper Scissors Free,437,Games,42
263,Falling Down Free,421,Games,42
509,Marble Blast Free,738,Games,41
607,Ping Pong Free,447,Games,41
88,Basketball Shots Free,211,Games,41
738,Swing Hero Free Game,504,Games,40
619,Pop Lock Free,489,Games,40
629,Pulse Free,844,Games,37
715,Soccer Shots Free,509,Games,37


In [19]:
print(make_latex_table(df, caption="Channels that send data to most trackers", label="tab:ch_most_trackers_roku"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrlr}
\toprule
 Channel name             &   Rank & Category         &    \# tracking
 domains \\
\midrule
 StarGazer                &   1012 & Special Interest & 50 \\
 Rock Paper Scissors Free &    437 & Games            & 42 \\
 Falling Down Free        &    421 & Games            & 42 \\
 Marble Blast Free        &    738 & Games            & 41 \\
 Ping Pong Free           &    447 & Games            & 41 \\
 Basketball Shots Free    &    211 & Games            & 41 \\
 Swing Hero Free Game     &    504 & Games            & 40 \\
 Pop Lock Free            &    489 & Games            & 40 \\
 Pulse Free               &    844 & Games            & 37 \\
 Soccer Shots Free        &    509 & Games            & 37 \\
\bottomrule
\end{tabular}
%}
\caption{Channels that send data to most trackers}
\label{tab:tab:ch_most_trackers_roku}
\end{table}


In [20]:
top_ten_categories = list(roku_tcp.drop_duplicates("channel_id").
                          category.value_counts().head(10).index)
roku_tcp_top_ten = roku_tcp[roku_tcp.category.isin(top_ten_categories)]


### Groupby - Number of trackers per category

In [21]:
#df = get_channels_with_most_domains(playback, head=1000)
from nb_utils import display_side_by_side
df = roku_tcp_top_ten[roku_tcp_top_ten.adblocked]

tmp_group = df.drop_duplicates(subset=["channel_name", "domain"]).\
    groupby(["channel_name", "rank", 'category']).size().groupby('category')

title = "Num. of trackers - Mean"
a= tmp_group.mean().reset_index(name=title).\
    sort_values(by=[title], ascending=False)

title = "Num. of trackers - Median"
b = tmp_group.median().reset_index(name=title).\
    sort_values(by=[title], ascending=False)
display_side_by_side(a, b)

#df.groupby('category').mean()
#get_category_avgs(playback)

category,Num. of trackers - Mean
Games,18.295082
Special Interest,9.222222
Movies & TV,5.122137
News & Weather,4.807107
Kids & Family,3.648649
Sports,3.333333
Lifestyle,3.12
International,2.714286
Music,2.361111
Religious,1.682927

category,Num. of trackers - Median
Games,18
Special Interest,5
News & Weather,4
Movies & TV,3
International,2
Kids & Family,2
Lifestyle,2
Music,2
Sports,2
Religious,1


In [22]:
title = "median"
tmp_group.median().reset_index(name=title).\
    sort_values(by=[title], ascending=False)


Unnamed: 0,category,median
0,Games,18
8,Special Interest,5
6,News & Weather,4
4,Movies & TV,3
1,International,2
2,Kids & Family,2
3,Lifestyle,2
5,Music,2
9,Sports,2
7,Religious,1


## Num of distinct domains channels talked to grouped by category

In [23]:
playback_grouped = playback.drop_duplicates(subset=["category", "channel_id", "domain"]).\
    groupby(["category"]).size().reset_index(name="# tracker domains").\
    sort_values(by=['# tracker domains'], ascending=False)

roku_grouped = roku_tcp.drop_duplicates(subset=["category", "domain"]).\
    groupby(["category"]).size().reset_index(name="# tracker domains").\
    sort_values(by=['# tracker domains'], ascending=False)


In [24]:
playback_grouped

Unnamed: 0,category,# tracker domains
11,News & Weather,2110
9,Movies & TV,1716
5,Games,1320
7,Kids & Family,510
17,Special Interest,471
13,Religious,342
18,Sports,203
1,Comedy,172
10,Music,162
6,International,155


In [25]:
# TODO ad block stats
roku_tcp.ghostery_blocked.value_counts()
roku_tcp.disconnect_blocked.value_counts()
roku_tcp.easylist_blocked.value_counts()
roku_tcp.easyprivacy_blocked.value_counts()

False    187654
True      12088
Name: easyprivacy_blocked, dtype: int64

### OTT trackers

In [26]:
roku_tcp[roku_tcp.domain=="kargo.com"].channel_id.nunique()

25

### Unencrypted traffic
- mark https upgrades

In [27]:
http_req = load_df(CrawlRokuTop1KMITM, "http_req")
http_resp = load_df(CrawlRokuTop1KMITM, "http_resp")

In [28]:
http_req.columns = http_req.columns.astype(str)

In [29]:
http_req.head()

Unnamed: 0,channel_id,time,cookie,post_data,host,referer,url,method,user_agent,ip_dst,...,easylist_blocked,easyprivacy_blocked,pihole_blocked,adblocked,disconnect_blocked_by_url,ghostery_blocked_by_url,easylist_blocked_by_url,easyprivacy_blocked_by_url,pihole_blocked_by_url,adblocked_by_url
0,25082,1559129340.411159,,,d2sy1af2shs9ve.cloudfront.net,,http://d2sy1af2shs9ve.cloudfront.net/portal/in...,GET,Roku/DVP-9.0 (519.00E04142A),99.84.112.165,...,False,False,False,True,True,False,False,False,False,True
1,25082,1559129351.959034,,,d2sy1af2shs9ve.cloudfront.net,,http://d2sy1af2shs9ve.cloudfront.net/portal/in...,GET,Roku/DVP-9.0 (519.00E04142A),99.84.112.39,...,False,False,False,True,True,False,False,False,False,True
2,25082,1559129363.458775,,,d2sy1af2shs9ve.cloudfront.net,,http://d2sy1af2shs9ve.cloudfront.net/portal/in...,GET,Roku/DVP-9.0 (519.00E04142A),99.84.112.165,...,False,False,False,True,True,False,False,False,False,True
3,25082,1559129379.930451,,,d2sy1af2shs9ve.cloudfront.net,,http://d2sy1af2shs9ve.cloudfront.net/portal/in...,GET,Roku/DVP-9.0 (519.00E04142A),99.84.112.50,...,False,False,False,True,True,False,False,False,False,True
4,25082,1559129433.775138,,,d2sy1af2shs9ve.cloudfront.net,,http://d2sy1af2shs9ve.cloudfront.net/portal/in...,GET,Roku/DVP-9.0 (519.00E04142A),52.85.89.18,...,False,False,False,True,True,False,False,False,False,True


In [31]:
redirects, https_upgrades, cross_origin_redirects = get_https_upgrade_redirectors(CrawlRokuTop1KMITM, http_req, http_resp)
https_upgrade_domains = set(https_upgrades.req_domain.unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  http_resp['url'] = http_resp.apply(lambda x: get_resp_url(x, req_urls), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  lambda x: get_fld(x, fail_silently=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  and x.req_domain==x.loc_domain), axis=1)


In [30]:
https_upgrade_domains

NameError: name 'https_upgrade_domains' is not defined

In [25]:
df = get_popular_domains_from_reqs(http_req, 10)
# df['https_upgrade'] = df.req_domain.map(lambda x: x in https_upgrade_domains)
df

Unnamed: 0,req_domain,Num. of channels
151,doubleclick.net,267
208,google-analytics.com,190
399,scorecardresearch.com,145
385,roku.com,145
234,ifood.tv,90
461,tremorhub.com,79
248,irchan.com,74
425,stickyadstv.com,74
307,monarchads.com,73
5,1rx.io,66


In [26]:
print(make_latex_table(df, label="most_common_insecure_domains_roku",
                       caption="Most prevalent domains contacted over unencrypted connections (Roku)"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Req domain            &   Num. of channels \\
\midrule
 doubleclick.net       &                267 \\
 google-analytics.com  &                190 \\
 scorecardresearch.com &                145 \\
 roku.com              &                145 \\
 ifood.tv              &                 90 \\
 tremorhub.com         &                 79 \\
 irchan.com            &                 74 \\
 stickyadstv.com       &                 74 \\
 monarchads.com        &                 73 \\
 1rx.io                &                 66 \\
\bottomrule
\end{tabular}
%}
\caption{Most prevalent domains contacted over unencrypted connections (Roku)}
\label{tab:most_common_insecure_domains_roku}
\end{table}
