In [1]:
import LeakDetector
import numpy as np
import pandas as pd
from os.path import isdir, join, basename
from datetime import datetime
from glob import glob

In [2]:
from log_analysis import get_crawl_parameter, add_adblocked_status
from urlparse import urlparse
from crawl_ids import CrawlFireTVManualV2
from df_utils import load_df
from nb_utils import make_latex_table

from ott_leaks import run_leak_detection, DEVICE_ID_NAMES, print_leak_stats, remove_ch_name_url_false_positives

## Load leaks
- Run the following to detect and pickle leaks
  - Detect on all crawls: `python2 detect_leaks.py`
  - Detect on a single crawl: `python2 detect_leaks.py roku-data-20190508-013650`

In [3]:
leaks = load_df(CrawlFireTVManualV2, "leak")
openwpm_leaks = load_df(CrawlFireTVManualV2, "openwpm_leak")
leaks = leaks[leaks.id_type!="Build Number"]
remove_ch_name_url_false_positives(leaks)  # this is part of the leak detection flow now
# remove leaks to roku.com

AMAZON_DOMAINS = ["amazon.com", "amazonvideo.com"]
leaks = leaks[~leaks.req_domain.isin(AMAZON_DOMAINS)]
id_leaks = leaks[leaks.id_type.isin(DEVICE_ID_NAMES)
                           | leaks.id_type.str.contains("password", case=False)
                           | leaks.id_type.str.contains("email", case=False)
                           | leaks.id_type.str.contains("cc", case=False)
                           | leaks.id_type.str.contains("password", case=False)]

df = print_leak_stats(leaks)
df

Unnamed: 0,ID,Num. of leaks,Num. of channels
0,Android ID,166,9
1,directtv_email,7,1
2,directtv_password,1,1
3,Serial No,7,6
4,Channel name,1004,7
5,AD ID,122,7
6,Wifi SSID,5,1
7,Device name,5,1
8,Profile Email,1,1
9,Profile Firstname,1,1


In [4]:
print(make_latex_table(
    df, caption="Overview of information leakage detected in Fire TV-Top30-Manual-MITM crawl",
    label="leaks_amazon_manual"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lrr}
\toprule
 Id                &   Num. of leaks &   Num. of channels \\
\midrule
 Android ID        &             166 &                  9 \\
 directtv\_email    &               7 &                  1 \\
 directtv\_password &               1 &                  1 \\
 Serial No         &               7 &                  6 \\
 Channel name      &            1004 &                  7 \\
 AD ID             &             122 &                  7 \\
 Wifi SSID         &               5 &                  1 \\
 Device name       &               5 &                  1 \\
 Profile Email     &               1 &                  1 \\
 Profile Firstname &               1 &                  1 \\
 Profile Lastname  &               1 &                  1 \\
 MAC               &               5 &                  2 \\
 Zip               &               7 &                  2 \\
\bottomrule
\end{tabular}
%}
\caption{Overvi

## OpenWPM Leaks

In [5]:
from log_analysis import add_domain_column
add_domain_column(openwpm_leaks)

openwpm_leaks["host"] = openwpm_leaks.url.map(lambda x: urlparse(x).hostname)
add_adblocked_status(openwpm_leaks)
openwpm_leaks = openwpm_leaks[openwpm_leaks.adblocked]

df = openwpm_leaks[openwpm_leaks.id_type !="Channel name"][['channel_id', 'channel_name', 'id_type', 'encoding', 'search', 'leak_type', 'referrer', 'req_domain']].drop_duplicates()
#.url.iloc[2]
df

Loaded 76247 from EasyList, 16516 rules from EasyPrivacy


Unnamed: 0,channel_id,channel_name,id_type,encoding,search,leak_type,referrer,req_domain
0,com.aetn.aetv.watch,A&amp;E,Profile Email,sha256,baaaaaaab54@gmail.com,url_leaks,https://www.aetv.com/profile/activate,krxd.net
0,com.onemainstream.nbcunivers.android,NBC,Profile Email,base64,baaaaaaab54@gmail.com,url_leaks,https://www.nbc.com/sign-in,mixpanel.com
0,com.onemainstream.nbcunivers.android,NBC,Profile Email,base64,baaaaaaab54@gmail.com,url_leaks,https://www.nbc.com/sign-up,mixpanel.com
1,com.onemainstream.nbcunivers.android,NBC,Profile Email,base64,baaaaaaab54@gmail.com,url_leaks,https://www.nbc.com/nbcuniversalfamily/activate,mixpanel.com
0,com.onemainstream.nbcunivers.android,NBC,Profile Email,urlencode,baaaaaaab54@gmail.com,url_leaks,https://www.nbc.com/sign-in,mixpanel.com
0,com.nbcuni.com.nbcsports.liveextra.firetv,NBC Sports,Zip,unencoded,08540,url_leaks,https://buy.tinypass.com/checkout/offer/show?d...,tinypass.com


In [6]:
print(make_latex_table(
    df, caption="",
    label="leaks_roku_manual_web"))


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{llllllll}
\toprule
 Channel id                                & Channel name   & Id type       & Encoding   & Search                & Leak type   & Referrer                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              & Req domain   \\
\midrule
 com.aetn.aetv.watch                       & A\&amp;E        & Profile Email & sha256     & baaaaaaab54@gmail.com & url\_leaks   & https://www.aetv.com/profile/activate                                              

### Email leaks

In [7]:
leaks[leaks.id_type.isin(['Profile Email', 'Email'])]

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent,ch_name_url_false_pos
0,False,False,Movies & TV,com.feeln.androidapp,Hallmark Movies Now,,,False,False,feeln.com,...,,feeln.com,baaaaaaab54@gmail.com,TERMINATED,443,40,1562278877.810364,https://apify.feeln.com/v3/registration.json,Dalvik/2.1.0 (Linux; U; Android 5.1.1; AFTT Bu...,False


### Leaked IDs

In [8]:
leaks.id_type.value_counts()

Channel name         1004
Android ID            166
AD ID                 122
Serial No               7
directtv_email          7
Zip                     7
Device name             5
Wifi SSID               5
MAC                     5
Profile Firstname       1
Profile Lastname        1
Profile Email           1
directtv_password       1
Name: id_type, dtype: int64

### Leak location

In [9]:
leaks.leak_type.value_counts()

url_leaks         799
post_leaks        395
referrer_leaks    129
cookie_leaks        9
Name: leak_type, dtype: int64

### Leak encodings

In [10]:
leaks.encoding.value_counts()

unencoded    1185
urlencode      73
sha1           41
base64         15
md5            14
base16          4
Name: encoding, dtype: int64

## Top trackers

In [11]:
from nb_utils import get_popular_domains_from_tcp_conns
tcp = load_df(CrawlFireTVManualV2, "tcp_conn")
df = get_popular_domains_from_tcp_conns(tcp[tcp.adblocked])
df

Unnamed: 0,domain,Num. of channels
12,amazon.com,21
53,mobileanalytics.us-east-1.amazonaws.com,16
28,doubleclick.net,8
11,amazon-adsystem.com,8
5,adobe.com,7
62,scorecardresearch.com,6
21,crashlytics.com,6
27,demdex.net,6
31,facebook.com,5
24,d3a510xmpll7o6.cloudfront.net,5


In [12]:
print make_latex_table(df, caption="Most prevalent trackers in the Amazon manual crawl", label="tab:top_trackers_amazon_manual_v2")


\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lr}
\toprule
 Domain                                  &   Num. of channels \\
\midrule
 amazon.com                              &                 21 \\
 mobileanalytics.us-east-1.amazonaws.com &                 16 \\
 doubleclick.net                         &                  8 \\
 amazon-adsystem.com                     &                  8 \\
 adobe.com                               &                  7 \\
 scorecardresearch.com                   &                  6 \\
 crashlytics.com                         &                  6 \\
 demdex.net                              &                  6 \\
 facebook.com                            &                  5 \\
 d3a510xmpll7o6.cloudfront.net           &                  5 \\
\bottomrule
\end{tabular}
%}
\caption{Most prevalent trackers in the Amazon manual crawl}
\label{tab:tab:top_trackers_amazon_manual_v2}
\end{table}


In [13]:
tcp = load_df(CrawlFireTVManualV2, "tcp_conn")
tcp.domain.nunique()

140

In [14]:
tcp.channel_id.nunique()


21