In [1]:
import LeakDetector
import numpy as np
import pandas as pd
from os.path import isdir, join, basename
from datetime import datetime
from glob import glob


In [2]:
from device_ids import TV_ID_MAP
from log_analysis import get_crawl_parameter
from crawl_ids import CrawlRokuManualV2
from df_utils import load_df
from nb_utils import make_latex_table

from ott_leaks import run_leak_detection, DEVICE_ID_NAMES, print_leak_stats, remove_ch_name_url_false_positives


# Raise pandas' display limits so that long cell values (up to 500
# characters) and long frames (up to 500 rows) are rendered in full.
pd.set_option("display.max_colwidth", 500, "display.max_rows", 500)

## Load leaks
- Before loading, run one of the following commands to detect and pickle the leaks:
  - Detect on all crawls: `python2 detect_leaks.py`
  - Detect on a single crawl: `python2 detect_leaks.py roku-data-20190508-013650`

In [3]:
# Load the "leak" table of the CrawlRokuManualV2 crawl and immediately drop
# the rows whose id_type is "Build Number".
leaks_roku = load_df(CrawlRokuManualV2, "leak").loc[
    lambda frame: frame.id_type != "Build Number"]
# Load the "openwpm_leak" table of the same crawl.
openwpm_leaks_roku = load_df(CrawlRokuManualV2, "openwpm_leak")
# Display the number of rows in each table.
len(leaks_roku), len(openwpm_leaks_roku)

(11923, 2074)

In [4]:
# NOTE(review): the return value is discarded, so this presumably edits
# leaks_roku in place -- confirm against ott_leaks.
remove_ch_name_url_false_positives(leaks_roku)  # this is part of the leak detection flow now
# Drop the rows whose request domain is roku.com.
leaks_roku = leaks_roku[leaks_roku.req_domain != "roku.com"]
# Keep the leaks whose id_type is one of DEVICE_ID_NAMES, or whose id_type
# contains "password", "email" or "cc" (case-insensitive substring match).
# The "password" test used to be listed twice; the redundant copy is removed.
# NOTE(review): "cc" is a bare substring, so it also matches any id_type that
# merely contains those two letters -- confirm this is intended.
id_leaks_roku = leaks_roku[leaks_roku.id_type.isin(DEVICE_ID_NAMES)
                           | leaks_roku.id_type.str.contains("password", case=False)
                           | leaks_roku.id_type.str.contains("email", case=False)
                           | leaks_roku.id_type.str.contains("cc", case=False)]
# The summary is computed over all remaining leaks (leaks_roku), not only the
# ID leaks selected above; `df` is reused later for the LaTeX table.
df = print_leak_stats(leaks_roku)
df

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,AD ID,3478,12
1,Channel name,8044,14
2,directtv_email,11,2
3,Device Name,212,4
4,Profile Email,1,1
5,Serial No,1,1
6,Profile Zip,137,1
7,Zip,37,1
8,Profile Password,2,1


In [5]:
from log_analysis import add_domain_column
# NOTE(review): the return value is unused, so this presumably adds a domain
# column (likely the `req_domain` column read by the following cell) to
# openwpm_leaks_roku in place -- confirm against log_analysis.
add_domain_column(openwpm_leaks_roku)

In [6]:
# Distinct combinations of channel, ID type, encoding, searched value, leak
# location and request domain among the OpenWPM leaks, leaving out the rows
# whose id_type is "Channel name".
openwpm_leaks_roku.loc[
    openwpm_leaks_roku.id_type != "Channel name",
    ['channel_name', 'id_type', 'encoding', 'search', 'leak_type', 'req_domain']
].drop_duplicates()

Unnamed: 0,channel_name,id_type,encoding,search,leak_type,req_domain
0,ABC,City,unencoded,princeton,url_leaks,go.com
1,Hotstar,Profile Email,sha256,baaaaaab54@gmail.com,url_leaks,gravityrd-services.com
1,Netflix,Profile Email,sha256,baaaaaab54@gmail.com,url_leaks,googleadservices.com
0,Netflix,Profile Email,sha256,baaaaaab54@gmail.com,url_leaks,nflximg.net
0,Netflix,Profile Email,sha256,baaaaaab54@gmail.com,url_leaks,facebook.com
1,Netflix,Profile Email,sha256,baaaaaab54@gmail.com,url_leaks,google.com
1,Netflix,Profile Email,sha256,baaaaaab54@gmail.com,url_leaks,doubleclick.net
0,Netflix,Profile Zip,unencoded,40505,url_leaks,nflxvideo.net


In [7]:
# Distinct combinations of channel, ID type, encoding, searched value, leak
# location and request domain among the ID leaks, leaving out the rows whose
# id_type is "Channel name".
id_leaks_roku.loc[
    id_leaks_roku.id_type != "Channel name",
    ['channel_name', 'id_type', 'encoding', 'search', 'leak_type', 'req_domain']
].drop_duplicates()

Unnamed: 0,channel_name,id_type,encoding,search,leak_type,req_domain
0,ABC,AD ID,sha1,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,scorecardresearch.com
2,ABC,AD ID,md5,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,scorecardresearch.com
0,ABC News,AD ID,unencoded,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,doubleclick.net
0,Bravo,AD ID,sha1,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,scorecardresearch.com
1,Bravo,AD ID,md5,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,scorecardresearch.com
0,Bravo,AD ID,unencoded,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,fwmrm.net
2,Bravo,AD ID,unencoded,4489bf66-6dc7-5173-a700-bdb751bc6cf2,post_leaks,omtrdc.net
0,Bravo,AD ID,unencoded,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,demdex.net
0,Bravo,AD ID,unencoded,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,omtrdc.net
0,Cartoon Network,AD ID,sha1,4489bf66-6dc7-5173-a700-bdb751bc6cf2,url_leaks,scorecardresearch.com


In [8]:
# Print the LaTeX markup for the leak summary held in `df`.
# NOTE(review): the caption names the Roku-Top1K-NoMITM crawl, while `df` is
# derived from data loaded with CrawlRokuManualV2 -- confirm the caption.
print(make_latex_table(
    df,
    label="leaks_roku",
    caption="Overview of information leakage detected in Roku-Top1K-NoMITM crawl"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrr}
\toprule
 Id               &   Num. of leaks &   Num. of channels \\
\midrule
 AD ID            &            3478 &                 12 \\
 Channel name     &            8044 &                 14 \\
 directtv\_email   &              11 &                  2 \\
 Device Name      &             212 &                  4 \\
 Profile Email    &               1 &                  1 \\
 Serial No        &               1 &                  1 \\
 Profile Zip      &             137 &                  1 \\
 Zip              &              37 &                  1 \\
 Profile Password &               2 &                  1 \\
\bottomrule
\end{tabular}
%}
\caption{Overview of information leakage detected in Roku-Top1K-NoMITM crawl}
\label{tab:leaks_roku}
\end{table}


### Leak location

In [9]:
# Number of leaks per leak location (the leak_type column).
leaks_roku["leak_type"].value_counts()

url_leaks         11191
post_leaks          574
referrer_leaks      114
cookie_leaks         44
Name: leak_type, dtype: int64

### Leak encodings

In [10]:
# Number of leaks per encoding of the leaked value (the encoding column).
leaks_roku["encoding"].value_counts()

unencoded    10227
sha1           796
md5            796
urlencode       75
base64          28
sha256           1
Name: encoding, dtype: int64

### Top trackers

In [11]:
from nb_utils import get_popular_domains_from_tcp_conns

# Load the "tcp_conn" table of the CrawlRokuManualV2 crawl.
tcp = load_df(CrawlRokuManualV2, "tcp_conn")
# Rank domains using only the connections flagged in the `adblocked` column.
# Note: this rebinds `df`, which previously held the leak summary table.
df = get_popular_domains_from_tcp_conns(tcp.loc[tcp["adblocked"]])
df

Unnamed: 0,domain,Num. of channels
21,doubleclick.net,23
34,googlesyndication.com,18
51,scorecardresearch.com,12
20,demdex.net,10
37,innovid.com,9
36,imrworldwide.com,8
48,omtrdc.net,7
15,conviva.com,7
28,fwmrm.net,6
3,adobe.com,6


In [14]:
# Print the LaTeX markup for the top-trackers table held in `df`.
# The label is passed without a "tab:" prefix: the generated \label already
# gets one (label="leaks_roku" produced \label{tab:leaks_roku}), so the old
# "tab:..." value rendered as \label{tab:tab:...}.
# print is written in call form, matching the earlier LaTeX cell; with a
# single argument it behaves the same as the print statement.
print(make_latex_table(df, caption="Most prevalent trackers in the Roku manual crawl", label="top_trackers_roku_manual_v2"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Domain                &   Num. of channels \\
\midrule
 doubleclick.net       &                 23 \\
 googlesyndication.com &                 18 \\
 scorecardresearch.com &                 12 \\
 demdex.net            &                 10 \\
 innovid.com           &                  9 \\
 imrworldwide.com      &                  8 \\
 omtrdc.net            &                  7 \\
 conviva.com           &                  7 \\
 fwmrm.net             &                  6 \\
 adobe.com             &                  6 \\
\bottomrule
\end{tabular}
%}
\caption{Most prevalent trackers in the Roku manual crawl}
\label{tab:tab:top_trackers_roku_manual_v2}
\end{table}
