In [1]:
import LeakDetector
import numpy as np
import pandas as pd
from os.path import isdir, join, basename
from datetime import datetime
from glob import glob

In [2]:
from device_ids import TV_ID_MAP
from log_analysis import get_crawl_parameter
from crawl_ids import CrawlRokuTop1KNoMITM
from df_utils import load_df
from nb_utils import make_latex_table

from ott_leaks import run_leak_detection, DEVICE_ID_NAMES, print_leak_stats, remove_ch_name_url_false_positives

## Load leaks
- Run the following to detect and pickle leaks
  - Detect on all crawls: `python2 detect_leaks.py`
  - Detect on a single crawl: `python2 detect_leaks.py roku-data-20190508-013650`

In [32]:
# Load the "leak" data frame for the Roku-Top1K-NoMITM crawl.
leaks_roku = load_df(CrawlRokuTop1KNoMITM, "leak")
# Show leaks whose request domain contains the literal text "roku.com".
# regex=False stops "." from acting as a wildcard (e.g. matching "rokuXcom");
# na=False treats missing domains as non-matches so the mask stays boolean.
leaks_roku[leaks_roku.req_domain.str.contains("roku.com", regex=False, na=False)]

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,rank,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent


In [55]:
# Reload the leaks from scratch so this cell can be re-run on its own.
leaks_roku = load_df(CrawlRokuTop1KNoMITM, "leak")
# Drop "Build Number" matches. The explicit .copy() hands the helper below an
# independent frame instead of a filtered slice: its return value is discarded,
# so it is presumably expected to modify the frame in place, and doing that on a
# slice can trigger SettingWithCopyWarning.
leaks_roku = leaks_roku[leaks_roku.id_type != "Build Number"].copy()
remove_ch_name_url_false_positives(leaks_roku)  # this is part of the leak detection flow now
# Exclude requests whose domain is exactly roku.com.
leaks_roku = leaks_roku[leaks_roku.req_domain != "roku.com"]
# Subset restricted to the id types listed in DEVICE_ID_NAMES.
id_leaks_roku = leaks_roku[leaks_roku.id_type.isin(DEVICE_ID_NAMES)]
# Per-ID summary table; the name `df` is kept because the LaTeX cell below reads it.
df = print_leak_stats(leaks_roku)
df

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,AD ID,4606,313
1,Channel name,7291,225
2,Serial No,1708,108
3,City,30,5
4,State,11,2
5,Zip,35,5
6,Email,2,1


In [56]:
# Build the LaTeX markup for the leak overview table, then print it so the
# markup is emitted as plain text that can be pasted into the paper.
latex_table = make_latex_table(
    df,
    caption="Overview of information leakage detected in Roku-Top1K-NoMITM crawl",
    label="leaks_roku"
)
print(latex_table)


\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrr}
\toprule
 Id           &   Num. of leaks &   Num. of channels \\
\midrule
 AD ID        &            4606 &                313 \\
 Channel name &            7291 &                225 \\
 Serial No    &            1708 &                108 \\
 City         &              30 &                  5 \\
 State        &              11 &                  2 \\
 Zip          &              35 &                  5 \\
 Email        &               2 &                  1 \\
\bottomrule
\end{tabular}
}
\caption{Overview of information leakage detected in Roku-Top1K-NoMITM crawl}
\label{tab:leaks_roku}
\end{table}


In [57]:
## Pi-Hole-Block

## ID Leaks
- Exclude non-ID search terms

In [58]:
# Number of rows in leaks_roku.
leaks_roku.shape[0]

13683

In [59]:
# Count leaks by their `adblocked` flag.
leaks_roku["adblocked"].value_counts()

True     7685
False    5998
Name: adblocked, dtype: int64

### Email sent on channel (for registration)
- email address sent to http://api.qello.com/users/register/ for registration purposes
- crawler actually clicked the dialog to allow email address to be accessed from Roku

In [60]:
# Rows whose id_type is "Email" or "Unknown".
id_types_of_interest = ["Email", "Unknown"]
leaks_roku.loc[leaks_roku["id_type"].isin(id_types_of_interest)]

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent,ch_name_url_false_pos
0,False,False,Music,40299,Stingray Qello,,,False,False,qello.com,...,,qello.com,macyli47@gmail.com,TERMINATED,80,84,1557386763.45743,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A),False
0,False,False,Music,40299,Stingray Qello,,,False,False,qello.com,...,,qello.com,macyli47@gmail.com,TERMINATED,80,59,1557386729.679612,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A),False


### Adblocked status
- 4853 of the 6314 (77%) ID leaks are to domains flagged by ad blocker lists
- 38 of the 76 domains that IDs are leaked to are flagged by ad blocker lists

In [61]:
# Count device-ID leaks by their `adblocked` flag.
id_leaks_roku["adblocked"].value_counts()

True     4853
False    1461
Name: adblocked, dtype: int64

In [62]:
# Same count over all leaks in leaks_roku, for comparison with the ID-only subset.
leaks_roku["adblocked"].value_counts()

True     7685
False    5998
Name: adblocked, dtype: int64

In [48]:
# Keep one row per request domain, then count those domains by `adblocked` flag.
id_leak_domains = id_leaks_roku.drop_duplicates(subset="req_domain")
id_leak_domains["adblocked"].value_counts()

True     38
False    38
Name: adblocked, dtype: int64

In [49]:
# Keep one row per request domain, then count those domains by `adblocked` flag.
leak_domains = leaks_roku.drop_duplicates(subset="req_domain")
leak_domains["adblocked"].value_counts()

False    105
True      45
Name: adblocked, dtype: int64

### Leaked IDs

In [50]:
# Number of leaks per id_type.
leaks_roku["id_type"].value_counts()

Channel name    7291
AD ID           4606
Build Number    2910
Serial No       1708
Zip               35
City              30
State             11
Email              2
Name: id_type, dtype: int64

### Leak location

In [53]:
# Number of leaks per leak_type.
leaks_roku["leak_type"].value_counts()

url_leaks         15134
post_leaks         1266
referrer_leaks      183
cookie_leaks         10
Name: leak_type, dtype: int64

### Leak encodings

In [54]:
# Number of leaks per encoding. The `df` binding is kept in case later cells
# read it; note that it rebinds the name previously used for the stats table.
df = leaks_roku.encoding.value_counts()
# A bare assignment displays nothing, so end the cell with the value to show it.
df

unencoded    13911
md5           1110
sha1          1004
urlencode      522
base64          42
sha256           4
Name: encoding, dtype: int64

In [None]:
# NOTE(review): `cookie_leaks` was referenced here without being defined in the
# visible notebook (NameError on Restart & Run All). Presumably the intent was to
# inspect the rows whose leak_type is "cookie_leaks" -- confirm.
cookie_leaks = leaks_roku[leaks_roku.leak_type == "cookie_leaks"]
cookie_leaks