In [76]:
import LeakDetector
import numpy as np
import pandas as pd
from os.path import isdir, join, basename
from datetime import datetime
from glob import glob

In [77]:
from device_ids import TV_ID_MAP_V1
from log_analysis import get_crawl_parameter, get_crawl_data_path, get_ott_device_mac
from crawl_ids import CrawlRokuTop1KMITM
from df_utils import load_df
from nb_utils import make_latex_table

from ott_leaks import run_leak_detection, DEVICE_ID_NAMES, print_leak_stats, remove_ch_name_url_false_positives

## Load leaks
- Run the following to detect and pickle leaks
  - Detect on all crawls: `python2 detect_leaks.py`
  - Detect on a single crawl: `python2 detect_leaks.py roku-data-20190508-013650`

In [79]:
# Load the "leak" frame for the Roku Top-1K MITM crawl
# (presumably the pickled output of detect_leaks.py — confirm against load_df).
# NOTE(review): the next cell repeats this exact load before filtering,
# so this cell looks redundant — confirm and consider removing it.
leaks_roku = load_df(CrawlRokuTop1KMITM, "leak")

In [80]:
# Start from a fresh load so this cell is safe to re-run on its own.
leaks_roku = load_df(CrawlRokuTop1KMITM, "leak")

# Keep only rows that are neither "Build Number" matches nor requests
# whose req_domain is roku.com.
leaks_roku = leaks_roku[
    (leaks_roku["id_type"] != "Build Number")
    & (leaks_roku["req_domain"] != "roku.com")
]

# Subset limited to the identifier types listed in DEVICE_ID_NAMES.
id_leaks_roku = leaks_roku[leaks_roku["id_type"].isin(DEVICE_ID_NAMES)]

# Per-ID summary table (leak and channel counts); displayed as the cell output
# and reused by the LaTeX table cell below via the name `df`.
df = print_leak_stats(leaks_roku)
df

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,AD ID,12739,359
1,Channel name,10041,211
2,City,273,35
3,State,164,26
4,Zip,298,43
5,Serial No,4216,114
6,Email,15,5


In [81]:
# Render the leak summary as a LaTeX table.
# The caption names the MITM crawl because `df` is derived from
# CrawlRokuTop1KMITM; the previous caption said "NoMITM", which did not
# match the data being summarized.
print(make_latex_table(
    df, caption="Overview of information leakage detected in Roku-Top1K-MITM crawl",
    label="leaks_roku"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrr}
\toprule
 Id           &   Num. of leaks &   Num. of channels \\
\midrule
 AD ID        &           12739 &                359 \\
 Channel name &           10041 &                211 \\
 City         &             273 &                 35 \\
 State        &             164 &                 26 \\
 Zip          &             298 &                 43 \\
 Serial No    &            4216 &                114 \\
 Email        &              15 &                  5 \\
\bottomrule
\end{tabular}
%}
\caption{Overview of information leakage detected in Roku-Top1K-NoMITM crawl}
\label{tab:leaks_roku}
\end{table}


In [10]:
## Pi-Hole-Block

## ID Leaks
- Exclude non-Id search terms

In [83]:
# How many leak rows have adblocked == True vs. False (all leak types).
leaks_roku["adblocked"].value_counts()

True     21304
False     6442
Name: adblocked, dtype: int64

### Email sent on channel (for registration)
- The email address is sent to http://api.qello.com/users/register/ for registration purposes
- The crawler actually clicked the dialog that allows the email address to be accessed from Roku

In [85]:
# Show every leak row whose id_type is "Email" or "Unknown".
leaks_roku[leaks_roku["id_type"].isin(["Email", "Unknown"])]

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,rank,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent
0,False,False,Sports,256015,AHLTV,,,False,False,watchtheahl.com,...,680,,watchtheahl.com,macyli47@gmail.com,TERMINATED,443,83,1558854805.366032,https://ott.watchtheahl.com/user_lookup?email=...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,256015,AHLTV,,,False,False,watchtheahl.com,...,680,,watchtheahl.com,macyli47@gmail.com,TERMINATED,443,103,1558854839.136287,https://ott.watchtheahl.com/user_lookup?email=...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,256015,AHLTV,,,False,False,watchtheahl.com,...,680,,watchtheahl.com,macyli47@gmail.com,TERMINATED,443,152,1558854936.908638,https://ott.watchtheahl.com/user_lookup?email=...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,256015,AHLTV,,,False,False,watchtheahl.com,...,680,,watchtheahl.com,macyli47@gmail.com,TERMINATED,443,172,1558854970.701874,https://ott.watchtheahl.com/user_lookup?email=...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,108645,HockeyTV,,,False,False,hockeytv.com,...,1230,,hockeytv.com,macyli47@gmail.com,TERMINATED,443,146,1558924721.145332,https://ott.hockeytv.com/user_lookup?email=mac...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,108645,HockeyTV,,,False,False,hockeytv.com,...,1230,,hockeytv.com,macyli47@gmail.com,TERMINATED,443,78,1558924589.500667,https://ott.hockeytv.com/user_lookup?email=mac...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,108645,HockeyTV,,,False,False,hockeytv.com,...,1230,,hockeytv.com,macyli47@gmail.com,TERMINATED,443,98,1558924623.258755,https://ott.hockeytv.com/user_lookup?email=mac...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,108645,HockeyTV,,,False,False,hockeytv.com,...,1230,,hockeytv.com,macyli47@gmail.com,TERMINATED,443,166,1558924754.927272,https://ott.hockeytv.com/user_lookup?email=mac...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Sports,70391,MotorTrend,,,False,False,motortrendondemand.com,...,150,,motortrendondemand.com,macyli47@gmail.com,TERMINATED,80,102,1559164962.760976,http://api.motortrendondemand.com/api/v2/profi...,Roku/DVP-9.0 (519.00E04142A)
0,False,False,Comedy,244040,RiffTrax,,,False,False,oddconnect.com,...,1022,,oddconnect.com,macyli47@gmail.com,TERMINATED,443,118,1558923964.636551,https://oddconnect.com/api/device_users/macyli...,Roku/DVP-9.0 (519.00E04142A)


### Adblocked status
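- 4853 of the 6314 (76%) ID leaks are to domains flagged by ad blocker lists
- 38 of the 76 domains that IDs are leaked to are flagged by ad blocker lists
- **TODO**: these figures appear stale — the cell outputs below show 14377 of 16955 ID leaks (85%) and 44 of 82 domains flagged; confirm and update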

In [86]:
# Adblocked True/False counts restricted to device-ID leaks.
id_leaks_roku["adblocked"].value_counts()

True     14377
False     2578
Name: adblocked, dtype: int64

In [87]:
# Adblocked True/False counts over all leaks, for comparison with the ID-only
# counts above (same expression as the earlier adblocked cell).
leaks_roku["adblocked"].value_counts()

True     21304
False     6442
Name: adblocked, dtype: int64

In [88]:
# Keep one row per req_domain (the first occurrence), then count how many of
# those domains are adblocked — ID leaks only.
id_leaks_roku.drop_duplicates(subset="req_domain")["adblocked"].value_counts()

True     44
False    38
Name: adblocked, dtype: int64

In [89]:
# Keep one row per req_domain (the first occurrence), then count how many of
# those domains are adblocked — all leak types.
leaks_roku.drop_duplicates(subset="req_domain")["adblocked"].value_counts()

False    83
True     48
Name: adblocked, dtype: int64

### Leaked IDs

In [90]:
# Number of leak rows per leaked identifier type.
leaks_roku["id_type"].value_counts()

AD ID           12739
Channel name    10041
Serial No        4216
Zip               298
City              273
State             164
Email              15
Name: id_type, dtype: int64

### Leak location

In [91]:
# Number of leak rows per leak_type value (e.g. url_leaks, post_leaks).
leaks_roku["leak_type"].value_counts()

url_leaks         22254
post_leaks         4458
referrer_leaks      998
cookie_leaks         36
Name: leak_type, dtype: int64

### Leak encodings

In [92]:
# Number of leak rows per encoding value (e.g. unencoded, sha1, md5).
leaks_roku["encoding"].value_counts()

unencoded              20794
sha1                    2363
md5                     2358
urlencode               2148
base64                    71
sha256                    10
urlencode-urlencode        2
Name: encoding, dtype: int64