In [107]:
import LeakDetector
import numpy as np
import pandas as pd
from os.path import isdir, join, basename
from datetime import datetime
from glob import glob

In [108]:
from device_ids import TV_ID_MAP
from log_analysis import get_crawl_parameter, get_crawl_data_path, get_ott_device_mac
from crawl_ids import CrawlRokuTop1KNoMITM, CrawlFireTVTop1KNoMITM, CrawlFireTVTop1KMITM
from df_utils import load_df
from nb_utils import make_latex_table, get_popular_domains_from_reqs
from ott_leaks import run_leak_detection, DEVICE_ID_NAMES, print_leak_stats, remove_ch_name_url_false_positives
pd.set_option('display.max_columns', 500)

## Load leaks
- Run the following to detect and pickle leaks
  - Detect on all crawls: `python2 detect_leaks.py`
  - Detect on a single crawl: `python2 detect_leaks.py roku-data-20190508-013650`

In [109]:
leaks_fire = load_df(CrawlFireTVTop1KMITM, "leak")
AMAZON_DOMAINS = ["amazon.com", "amazonvideo.com"]
leaks_fire = leaks_fire[~leaks_fire.req_domain.isin(AMAZON_DOMAINS)]

id_leaks_fire = leaks_fire[leaks_fire.id_type.isin(DEVICE_ID_NAMES)]
df = print_leak_stats(leaks_fire)
df

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,Android ID,3856,394
1,MAC,138,52
2,Serial No,377,105
3,Device name,64,40
4,AD ID,953,221
5,Zip,190,28
6,City,285,26
7,Wifi SSID,204,21
8,Channel name,5248,223
9,State,67,12


In [114]:
print(make_latex_table(df))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrr}
\toprule
 Id           &   Num. of leaks &   Num. of channels \\
\midrule
 Android ID   &            3856 &                394 \\
 MAC          &             138 &                 52 \\
 Serial No    &             377 &                105 \\
 Device name  &              64 &                 40 \\
 AD ID        &             953 &                221 \\
 Zip          &             190 &                 28 \\
 City         &             285 &                 26 \\
 Wifi SSID    &             204 &                 21 \\
 Channel name &            5248 &                223 \\
 State        &              67 &                 12 \\
\bottomrule
\end{tabular}
%}
\caption{caption}
\label{tab:LABEL}
\end{table}


In [110]:
# Sanity check 
# Make sure the number 
http_req = load_df(CrawlFireTVTop1KMITM, "http_req")
http_req = http_req[~http_req.req_domain.isin(AMAZON_DOMAINS)]
crawl_data_dir = get_crawl_data_path(CrawlFireTVTop1KMITM)
device_ids = TV_ID_MAP_V1[get_ott_device_mac(crawl_data_dir)]

In [113]:
from urlparse import urlparse
http_req['path'] = http_req.url.map(lambda x: urlparse(x).path + urlparse(x).params + urlparse(x).query + urlparse(x).fragment)

http_req['path_upper'] = http_req.path.map(lambda x: x.upper())
http_req['path_lower'] = http_req.path.map(lambda x: x.lower())

for id_type, id_value in device_ids.items():
    if id_type == 'Channel name':
        continue
    id_in_urls = http_req[http_req.path.str.contains(id_value) | http_req.path_lower.str.contains(id_value.lower()) | http_req.path_upper.str.contains(id_value.upper())]
    id_url_leaks = leaks_fire[(leaks_fire.id_type==id_type) & (leaks_fire.leak_type=="url_leaks") & (leaks_fire.encoding=="unencoded")]
    assert set(id_in_urls.url.unique()) == set(id_url_leaks.url.unique())


  # Remove the CWD from sys.path while we load stuff.


In [16]:
df = print_leak_stats(id_leaks_fire)
df

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,Android ID,5837,394
1,MAC,190,52
2,Serial No,608,105
3,Device name,65,40
4,AD ID,1537,221
5,Wifi SSID,204,21


In [18]:
id_leaks_fire[id_leaks_fire.req_domain==""]

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent,ch_name_url_false_pos
0,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,c38e2b8a-b9df-460c-851b-f418368ab5ab,TERMINATED,8085,15,1557530184.736239,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False
1,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,93be8fef8277953f,TERMINATED,8085,15,1557530184.736239,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False
2,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,93be8fef8277953f,TERMINATED,8085,15,1557530184.736239,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False
3,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,c38e2b8a-b9df-460c-851b-f418368ab5ab,TERMINATED,8085,15,1557530184.736239,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False
0,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,c38e2b8a-b9df-460c-851b-f418368ab5ab,TERMINATED,8085,13,1557530184.505009,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False
1,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,93be8fef8277953f,TERMINATED,8085,13,1557530184.505009,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False
2,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,93be8fef8277953f,TERMINATED,8085,13,1557530184.505009,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False
3,False,False,Movies & TV,com.jtv.aftv.dovechannel,Dove Channel,,,False,False,,...,,,c38e2b8a-b9df-460c-851b-f418368ab5ab,TERMINATED,8085,13,1557530184.505009,http://54.160.202.71:8085/log/reg/dove/93be8fe...,,False


In [10]:
get_popular_domains_from_reqs(id_leaks_fire[id_leaks_fire.req_domain!="amazon.com"])

Unnamed: 0,req_domain,Num. of channels
15,amazonvideo.com,431
45,doubleclick.net,77
55,flurry.com,66
142,unity3d.com,60
13,amazon-adsystem.com,58
66,ifood.tv,50
62,google-analytics.com,34
1,a2z.com,30
67,imrworldwide.com,26
52,execute-api.us-east-1.amazonaws.com,20


In [19]:
ssid_leaks = leaks_fire[leaks_fire.id_type == "Wifi SSID"]
ssid_leaks.host.unique()
get_popular_domains_from_reqs(ssid_leaks)
#leaks_fire[leaks_fire.id_type == "Wifi SSID"][['leak_type', 'host', 'post_data']]

Unnamed: 0,req_domain,Num. of channels
1,conviva.com,10
0,a2z.com,7
2,kochava.com,4
3,wiphybackend.appspot.com,1


In [20]:
mac_leaks = leaks_fire[leaks_fire.id_type == "MAC"]
get_popular_domains_from_reqs(mac_leaks)
#leaks_fire[leaks_fire.id_type == "Wifi SSID"][['leak_type', 'host', 'post_data']]

Unnamed: 0,req_domain,Num. of channels
1,amazon-adsystem.com,21
3,bigstar.tv,15
0,adjust.com,8
8,vungle.com,4
2,applifier.com,3
4,leanplum.com,1
5,muneris.io,1
6,omtrdc.net,1
7,singular.net,1


In [27]:
leaks_fire[leaks_fire.id_type == "MAC"].tcp_dstport.value_counts()

80     119
443     71
Name: tcp_dstport, dtype: int64

## Unencrypted leaks

In [21]:
id_leaks_fire.tcp_dstport.value_counts()

443     5006
80      3427
8085       8
Name: tcp_dstport, dtype: int64

In [23]:
leaks_fire.tcp_dstport.value_counts()

443     15783
80       7021
8085        8
Name: tcp_dstport, dtype: int64

In [13]:
print(make_latex_table(
    df, caption="Overview of information leakage detected in Roku-Top1K-NoMITM crawl",
    label="leaks_fire"))


\begin{table}[H]
%\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrr}
\toprule
 Id        &   Num. of leaks &   Num. of channels \\
\midrule
 AD ID     &            1029 &                111 \\
 Serial No &             964 &                 75 \\
\bottomrule
\end{tabular}
}
\caption{Overview of information leakage detected in Roku-Top1K-NoMITM crawl}
\label{tab:leaks_fire}
\end{table}


In [14]:
## Pi-Hole-Block

## ID Leaks
- Exclude non-Id search terms

In [24]:
id_leaks_fire.adblocked.value_counts()

False    5417
True     3024
Name: adblocked, dtype: int64

### Email sent on channel (for registration)
- email address sant to  http://api.qello.com/users/register/ for registration purposes
- crawler actually clicked the dialog to allow email address to be accessed from Roku

In [12]:
leaks_fire[leaks_fire.id_type.isin(["Email", "Unknown"])]

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent,ch_name_url_false_pos
0,False,False,Music,40299,Stingray Qello,,,False,False,qello.com,...,,qello.com,macyli47@gmail.com,TERMINATED,80,84,1557386763.45743,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A),False
0,False,False,Music,40299,Stingray Qello,,,False,False,qello.com,...,,qello.com,macyli47@gmail.com,TERMINATED,80,59,1557386729.679612,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A),False


### Adblocked status
- 4853 of the 6314 (76%) ID leaks are to domains flagged by ad blocker lists
- 38 if the 76 domains IDs leaked to are flagged by ad blocker lists

In [None]:
id_leaks_fire.adblocked.value_counts()

In [None]:
leaks_fire.adblocked.value_counts()

In [None]:
id_leaks_fire.drop_duplicates("req_domain").adblocked.value_counts()

### Leaked IDs

In [None]:
leaks_fire.id_type.value_counts()

### Leak location

In [None]:
leaks_fire.leak_type.value_counts()

### Leak encodings

In [None]:
leaks_fire.encoding.value_counts()