In [1]:
import LeakDetector
import numpy as np
import pandas as pd
from os.path import isdir, join, basename
from datetime import datetime
from glob import glob

In [2]:
from device_ids import TV_ID_MAP_V1
from log_analysis import get_crawl_parameter, get_crawl_data_path, get_ott_device_mac, get_last_smart_launch_times
from crawl_ids import CrawlRokuTop1KMITM
from df_utils import load_df
from nb_utils import make_latex_table

from ott_leaks import run_leak_detection, DEVICE_ID_NAMES, print_leak_stats, remove_ch_name_url_false_positives

## Load leaks
- Run the following to detect and pickle leaks
  - Detect on all crawls: `python2 detect_leaks.py`
  - Detect on a single crawl: `python2 detect_leaks.py roku-data-20190508-013650`

In [24]:
# Load the detected leaks of the crawl and, per channel, the time of its last
# smart launch.
leaks_roku = load_df(CrawlRokuTop1KMITM, "leak")
last_smart_launch_times = get_last_smart_launch_times(CrawlRokuTop1KMITM)


def _not_before_last_launch(leak_row):
    """Return True when the leak's time is >= its channel's last smart launch time."""
    return leak_row["time"] >= last_smart_launch_times[leak_row["channel_id"]]


# Flag every row, then keep only the rows flagged as valid.
leaks_roku["valid"] = leaks_roku.apply(_not_before_last_launch, axis=1)
leaks_roku = leaks_roku[leaks_roku["valid"]]

# Email leaks are set aside in their own frame (they are discussed in the
# "Email sent on channel" section further down) and removed from the main frame.
is_email = leaks_roku["id_type"] == "Email"
email_leaks_roku = leaks_roku[is_email]

# Exclude email rows, "Build Number" rows and requests whose domain is roku.com.
keep_mask = (
    ~is_email
    & (leaks_roku["id_type"] != "Build Number")
    & (leaks_roku["req_domain"] != "roku.com")
)
leaks_roku = leaks_roku[keep_mask]

# Subset restricted to the ID types listed in DEVICE_ID_NAMES.
id_leaks_roku = leaks_roku[leaks_roku["id_type"].isin(DEVICE_ID_NAMES)]

# Summary table; kept in `df` because the LaTeX cell below reuses it.
df = print_leak_stats(leaks_roku)
df

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,AD ID,2650,320
1,Channel name,2331,197
2,Serial No,996,110
3,City,64,11
4,State,33,6
5,Zip,61,10


In [25]:
# Emit the summary table `df` as a LaTeX table snippet.
print(
    make_latex_table(
        df,
        caption="Overview of information leakage detected in Roku-Top1K-NoMITM crawl",
        label="leaks_roku",
    )
)


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrr}
\toprule
 Id           &   Num. of leaks &   Num. of channels \\
\midrule
 AD ID        &            2650 &                320 \\
 Channel name &            2331 &                197 \\
 Serial No    &             996 &                110 \\
 City         &              64 &                 11 \\
 State        &              33 &                  6 \\
 Zip          &              61 &                 10 \\
\bottomrule
\end{tabular}
%}
\caption{Overview of information leakage detected in Roku-Top1K-NoMITM crawl}
\label{tab:leaks_roku}
\end{table}


In [12]:
## Pi-Hole-Block

## ID Leaks
- Exclude non-Id search terms

In [13]:
# Leak statistics for the ID-only subset (rows whose id_type is in DEVICE_ID_NAMES).
print_leak_stats(id_leaks_roku)

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,AD ID,2650,320
1,Serial No,996,110


In [14]:
# Number of leak rows per value of the `adblocked` flag.
leaks_roku["adblocked"].value_counts()

True     4452
False    1690
Name: adblocked, dtype: int64

### Email sent on channel (for registration)
- email address sent to http://api.qello.com/users/register/ for registration purposes
- crawler actually clicked the dialog to allow email address to be accessed from Roku

In [15]:
# The loading cell moves every "Email" row out of `leaks_roku` into
# `email_leaks_roku`, so filtering `leaks_roku` for "Email" would come back
# empty on a fresh run. Show the set-aside email leaks, followed by any rows
# still tagged "Unknown" in the main frame.
pd.concat([email_leaks_roku, leaks_roku[leaks_roku["id_type"] == "Unknown"]])

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent,valid
0,False,False,Sports,256015,AHLTV,,,False,False,watchtheahl.com,...,,watchtheahl.com,macyli47@gmail.com,TERMINATED,443,152,1558854936.908638,https://ott.watchtheahl.com/user_lookup?email=...,Roku/DVP-9.0 (519.00E04142A),True
0,False,False,Sports,256015,AHLTV,,,False,False,watchtheahl.com,...,,watchtheahl.com,macyli47@gmail.com,TERMINATED,443,172,1558854970.701874,https://ott.watchtheahl.com/user_lookup?email=...,Roku/DVP-9.0 (519.00E04142A),True
0,False,False,Sports,108645,HockeyTV,,,False,False,hockeytv.com,...,,hockeytv.com,macyli47@gmail.com,TERMINATED,443,146,1558924721.145332,https://ott.hockeytv.com/user_lookup?email=mac...,Roku/DVP-9.0 (519.00E04142A),True
0,False,False,Sports,108645,HockeyTV,,,False,False,hockeytv.com,...,,hockeytv.com,macyli47@gmail.com,TERMINATED,443,166,1558924754.927272,https://ott.hockeytv.com/user_lookup?email=mac...,Roku/DVP-9.0 (519.00E04142A),True
0,False,False,Comedy,244040,RiffTrax,,,False,False,oddconnect.com,...,,oddconnect.com,macyli47@gmail.com,TERMINATED,443,186,1558924093.955345,https://oddconnect.com/api/device_users/macyli...,Roku/DVP-9.0 (519.00E04142A),True
0,False,False,Music,40299,Stingray Qello,,,False,False,qello.com,...,,qello.com,macyli47@gmail.com,TERMINATED,80,188,1559061955.701806,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A),True
0,False,False,Music,40299,Stingray Qello,,,False,False,qello.com,...,,qello.com,macyli47@gmail.com,TERMINATED,80,214,1559061989.498758,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A),True


### Adblocked status

In [16]:
# Number of ID-leak rows per value of the `adblocked` flag.
id_leaks_roku["adblocked"].value_counts()

True     2934
False     712
Name: adblocked, dtype: int64

In [17]:
# Same count over all leak rows, shown next to the ID-only count for comparison.
leaks_roku["adblocked"].value_counts()

True     4452
False    1690
Name: adblocked, dtype: int64

In [18]:
# One row per request domain (the first occurrence is the one kept), then
# count the `adblocked` flag of those rows.
id_leaks_roku.drop_duplicates(subset="req_domain")["adblocked"].value_counts()

True     41
False    36
Name: adblocked, dtype: int64

In [19]:
# One row per request domain (the first occurrence is the one kept), then
# count the `adblocked` flag of those rows, over all leak types.
leaks_roku.drop_duplicates(subset="req_domain")["adblocked"].value_counts()

False    73
True     44
Name: adblocked, dtype: int64

### Leaked IDs

In [20]:
# Number of leak rows per leaked identifier type.
leaks_roku["id_type"].value_counts()

AD ID           2650
Channel name    2331
Serial No        996
City              64
Zip               61
State             33
Email              7
Name: id_type, dtype: int64

### Leak location

In [21]:
# Number of leak rows per value of `leak_type`.
leaks_roku["leak_type"].value_counts()

url_leaks         4904
post_leaks         992
referrer_leaks     235
cookie_leaks        11
Name: leak_type, dtype: int64

### Leak encodings

In [22]:
# Number of leak rows per value of `encoding`.
leaks_roku["encoding"].value_counts()

unencoded    4569
md5           537
sha1          536
urlencode     486
base64         12
sha256          2
Name: encoding, dtype: int64

In [23]:
# Total number of leak rows in the frame.
leaks_roku.shape[0]

6142