## 1K  tables

In [5]:
from df_utils import load_df
from crawl_ids import CrawlFireTVTop1KMITM
import matplotlib.pyplot as plt
import seaborn as sns
from nb_utils import get_popular_domains_from_tcp_conns, make_latex_table, get_popular_domains_from_reqs
from nb_utils import get_channels_with_most_domains

In [10]:
AMAZON_DOMAINS = ["amazon.com", "amazonvideo.com"]

### Load TCP data

In [16]:
tcp_df = load_df(CrawlFireTVTop1KMITM, "tcp_conn")

### Most prevalent trackers

In [17]:
df = get_popular_domains_from_tcp_conns(tcp_df[tcp_df.adblocked & ~tcp_df.domain.isin(AMAZON_DOMAINS)])
df

Unnamed: 0,domain,Num. of channels
27,amazon-adsystem.com,687
62,crashlytics.com,346
130,doubleclick.net,307
168,google-analytics.com,277
149,facebook.com,196
108,d3a510xmpll7o6.cloudfront.net,180
29,app-measurement.com,179
171,googlesyndication.com,145
177,imasdk.googleapis.com,129
174,gstatic.com,127


In [18]:
print(make_latex_table(df))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Domain                        &   Num. of channels \\
\midrule
 amazon-adsystem.com           &                687 \\
 crashlytics.com               &                346 \\
 doubleclick.net               &                307 \\
 google-analytics.com          &                277 \\
 facebook.com                  &                196 \\
 d3a510xmpll7o6.cloudfront.net &                180 \\
 app-measurement.com           &                179 \\
 googlesyndication.com         &                145 \\
 imasdk.googleapis.com         &                129 \\
 gstatic.com                   &                127 \\
\bottomrule
\end{tabular}
%}
\caption{caption}
\label{tab:LABEL}
\end{table}


### Channels with most trackers

In [29]:
df = get_channels_with_most_domains(tcp_df[tcp_df.adblocked], 10)
df

Unnamed: 0,channel_name,rank,category,# tracking  domains
896,WNEP - Proud to Serve Scranton/Wilkes-Barre/Ha...,739,News,64
23,ABC7 News San Francisco - Local News &amp; Wea...,503,News,61
915,WTTV CBS4 Indy,1157,News,58
506,Midnight Pulp,933,Movies & TV,32
960,Xtreme Vegas - Classic Slot,341,Games,30
959,Xtreme Slots - FREE Vegas Casino Slot Machines,406,Games,28
918,WVTM 13 -Birmingham News and Weather,1022,News,25
399,IP Tools: Network Utilities,1029,Utilities,22
18,ABC11 Raleigh-Durham - Local News &amp; Weather,595,News,20
322,FreeCell Solitaire,894,Games,19


In [30]:
print(make_latex_table(df, caption="Channels that send data to most trackers", label="tab:ch_most_trackers_fire"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrlr}
\toprule
 Channel name                                         &   Rank & Category    &    \# tracking
 domains \\
\midrule
 WNEP - Proud to Serve Scranton/Wilkes-Barre/Hazleton &    739 & News        & 64 \\
 ABC7 News San Francisco - Local News \&amp; Weather   &    503 & News        & 61 \\
 WTTV CBS4 Indy                                       &   1157 & News        & 58 \\
 Midnight Pulp                                        &    933 & Movies \& TV & 32 \\
 Xtreme Vegas - Classic Slot                          &    341 & Games       & 30 \\
 Xtreme Slots - FREE Vegas Casino Slot Machines       &    406 & Games       & 28 \\
 WVTM 13 -Birmingham News and Weather                 &   1022 & News        & 25 \\
 IP Tools: Network Utilities                          &   1029 & Utilities   & 22 \\
 ABC11 Raleigh-Durham - Local News \&amp; Weather      &    595 & News        & 20 \\
 FreeCell Solitaire      

In [9]:
top_ten_categories = list(tcp_df.drop_duplicates("channel_id").
                          category.value_counts().head(10).index)
tcp_df_top_ten = tcp_df[tcp_df.category.isin(top_ten_categories)]


### Groupby - Number of trackers per category

In [10]:
#df = get_channels_with_most_domains(playback, head=1000)
from nb_utils import display_side_by_side
df = tcp_df_top_ten[tcp_df_top_ten.adblocked]

tmp_group = df.drop_duplicates(subset=["channel_name", "domain"]).\
    groupby(["channel_name", "rank", 'category']).size().groupby('category')

title = "Num. of trackers - Mean"
a= tmp_group.mean().reset_index(name=title).\
    sort_values(by=[title], ascending=False)

title = "Num. of trackers - Median"
b = tmp_group.median().reset_index(name=title).\
    sort_values(by=[title], ascending=False)
display_side_by_side(a, b)

#df.groupby('category').mean()
#get_category_avgs(playback)

category,Num. of trackers - Mean
News,7.610738
Movies & TV,6.010949
Health & fitness,5.285714
Utilities,5.175439
Lifestyle,4.904762
Sports,4.507937
Novelty,4.347826
Kids,4.3
Games,4.048649
Music & audio,3.40625

category,Num. of trackers - Median
News,6
Movies & TV,5
Health & fitness,4
Kids,4
Lifestyle,4
Sports,4
Games,3
Music & audio,3
Novelty,3
Utilities,3


In [11]:
title = "median"
tmp_group.median().reset_index(name=title).\
    sort_values(by=[title], ascending=False)


Unnamed: 0,category,median
6,News,6
4,Movies & TV,5
1,Health & fitness,4
2,Kids,4
3,Lifestyle,4
8,Sports,4
0,Games,3
5,Music & audio,3
7,Novelty,3
9,Utilities,3


In [14]:
playback_grouped = playback.drop_duplicates(subset=["category", "channel_id", "domain"]).\
    groupby(["category"]).size().reset_index(name="# tracker domains").\
    sort_values(by=['# tracker domains'], ascending=False)

roku_grouped = tcp_df.drop_duplicates(subset=["category", "domain"]).\
    groupby(["category"]).size().reset_index(name="# tracker domains").\
    sort_values(by=['# tracker domains'], ascending=False)


In [16]:
playback_grouped

Unnamed: 0,category,# tracker domains
14,News,1824
12,Movies & TV,1469
5,Games,1041
8,Lifestyle,355
21,Sports,298
7,Kids,246
4,Food & drink,197
15,Novelty,139
13,Music & audio,130
20,Social,99


### Unencrypted traffic
- mark https upgrades

In [19]:
http_req = load_df(CrawlFireTVTop1KNoMITM, "http_req")
http_resp = load_df(CrawlFireTVTop1KNoMITM, "http_resp")

In [20]:
http_req.columns = http_req.columns.astype(str)

In [31]:
http_req.head(3)

NameError: name 'http_req' is not defined

In [23]:
from log_analysis import get_https_upgrade_redirectors
redirects, https_upgrades, cross_origin_redirects = get_https_upgrade_redirectors(CrawlFireTVTop1KNoMITM, http_req, http_resp)
https_upgrade_domains = set(https_upgrades.req_domain.unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  http_resp['url'] = http_resp.apply(lambda x: get_resp_url(x, req_urls), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  lambda x: get_fld(x, fail_silently=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  and x.req_domain==x.loc_domain), axis=1)


In [25]:
df = get_popular_domains_from_reqs(http_req, 10)
#df['https_upgrade'] = df.req_domain.map(lambda x: x in https_upgrade_domains)
df

Unnamed: 0,req_domain,Num. of channels
33,amazon-adsystem.com,392
282,scorecardresearch.com,122
175,images-amazon.com,58
173,ifood.tv,51
76,cloudinary.com,31
317,titantv.com,29
290,spotxchange.com,28
360,wsi.com,27
72,cdn01.net,25
197,lightcast.com,25


In [26]:
print(make_latex_table(df, label="most_common_insecure_domains_roku",
                       caption="Most prevalent domains contacted over unencrypted connections (Roku)"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Req domain            &   Num. of channels \\
\midrule
 amazon-adsystem.com   &                392 \\
 scorecardresearch.com &                122 \\
 images-amazon.com     &                 58 \\
 ifood.tv              &                 51 \\
 cloudinary.com        &                 31 \\
 titantv.com           &                 29 \\
 spotxchange.com       &                 28 \\
 wsi.com               &                 27 \\
 cdn01.net             &                 25 \\
 lightcast.com         &                 25 \\
\bottomrule
\end{tabular}
%}
\caption{Most prevalent domains contacted over unencrypted connections (Roku)}
\label{tab:most_common_insecure_domains_roku}
\end{table}
