In [1]:
import LeakDetector
import numpy as np
import pandas as pd
from nb_utils import get_crawl_data_path
from device_ids import TV_ID_MAP
from log_analysis import (load_dns_data, load_timestamps_from_crawl_data,
                          get_distinct_tcp_conns, get_crawl_parameter,
                          get_crawl_status, get_http_df, get_http2_df)

from os.path import isdir, join, basename
from datetime import datetime
from glob import glob


In [17]:
from crawl_ids import CrawlRokuTop1K
from df_utils import load_df

In [56]:
reqs = load_df(CrawlRokuTop1K, "http_req")

In [47]:
CHECK_REFERRER_LEAKS = True

def check_row_for_leaks(detector, req):
    url, cookie_str, post_body, referrer_str = req['url'], req['cookie'], req['post_data'], req['referer']
    url_leaks = detector.check_url(url)
    url_leaks += detector.substring_search(url, max_layers=2)
    cookie_leaks = detector.substring_search(cookie_str, max_layers=2)
    post_leaks = detector.substring_search(post_body, max_layers=2)
    if CHECK_REFERRER_LEAKS:
        referrer_leaks = detector.substring_search(referrer_str, max_layers=2)
        return url_leaks, cookie_leaks, post_leaks, referrer_leaks
    else:
        return url_leaks, cookie_leaks, post_leaks

In [48]:
def reverse_dict(d):
    new_d = dict()
    for k, v in d.items():
        new_d[v.lower()] = k
    return new_d

In [49]:
def convert_leaks_to_df(device_ids, leaks_dict):
    r_ids = reverse_dict(device_ids)
    leaks_dicts = []
    for leak_type, leaks in leaks_dict.items():
        for leak in leaks:            
            # print(leak, len(leak), leak[0])
            assert len(leak) <= 2
            if len(leak) == 2:
                encoding, search = leak
            elif len(leak) == 1:
                search = leak[0]
                encoding = "unencoded"
            id_type = r_ids.get(search.lower(), "Unknown")
            leaks_dicts.append({'id_type': id_type, 'search': search,
                                "encoding": encoding, "leak_type": leak_type})
    return pd.DataFrame(leaks_dicts)


In [50]:
import unicodedata

def detect_leaks_in_requests(df, device_ids):
    df.sort_values("channel_name", inplace=True)
    last_channel = ""
    leak_df = pd.DataFrame({})
    #for idx, req in df[df.channel_name.str.contains("Z")].iterrows():
    for idx, req in df.iterrows():
        channel_name = unicodedata.normalize('NFKD', req['channel_name']).encode('ascii','ignore')
        channel_id = str(req['channel_id'])
        if channel_name !=last_channel:
            last_channel = channel_name
            device_ids["Channel name"] = channel_name
            r_device_ids = reverse_dict(device_ids)
            leak_detector = LeakDetector.LeakDetector(
                device_ids.values(), encoding_set=LeakDetector.ENCODINGS_NO_ROT,
                encoding_layers=2, hash_layers=2, debugging=False
            )
        url_leaks, cookie_leaks, post_leaks, referrer_leaks = check_row_for_leaks(leak_detector, req)
        tmp_df = convert_leaks_to_df(device_ids,
                                     {"url_leaks": url_leaks, "cookie_leaks": cookie_leaks,
                                      "post_leaks": post_leaks, "referrer_leaks": referrer_leaks})
        tmp_df['channel_name'] = channel_name
        for col in df.columns:  # copy data from requests to leaks df
            if col not in ['channel_name',]:  #  already added, skip
                tmp_df[col] = req[col]
        leak_df = leak_df.append(tmp_df, sort=True)

    device_ids["Channel name"] = ""  # reset back
    return leak_df


In [114]:
def print_leak_stats(leak_df):
    for id_type in list(leak_df.id_type.unique()):
        num_leaks = leak_df[leak_df.id_type==id_type].channel_id.nunique()
        if num_leaks:
            print ("%d channels leaked %s" % (num_leaks, id_type))

    
def analyze_leaks(crawl_name, req_df=None):
    craw_dir = get_crawl_data_path(crawl_name)
    if req_df is None:
        req_df, _, _ = get_http_df(crawl_name)
    print("\nCrawl name: %s" % crawl_name)
    print("%d reqs from %d channels" % (len(req_df), req_df.channel_id.nunique()))
    id_dict = TV_ID_MAP[get_crawl_parameter(craw_dir, "WLANIF")]
    print("Will search for the following IDs", id_dict)
    leak_df = detect_leaks_in_requests(req_df, id_dict)

    for id_type in id_dict.keys():
        num_leaks = leak_df[leak_df.id_type==id_type].channel_id.nunique()
        if num_leaks:
            print ("%d channels leaked %s" % (num_leaks, id_type))

    return leak_df, req_df, id_dict

## Run leak detection

In [60]:
# leaks_amazon, requests_amazon, id_dict = analyze_leaks(AMAZON_CRAWL)
r = reqs
leaks_roku, requests_roku, id_dict = analyze_leaks(CrawlRokuTop1K, r)


Crawl name: roku-data-20190508-013650
29664 reqs from 794 channels
('Will search for the following IDs', {'City': 'Princeton', 'AD ID': 'ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8', 'Device Name': 'Office tv', 'Zip': '08540', 'State': 'New Jersey', 'Device ID': 'C33858901841', 'Email': 'macyli47@gmail.com', 'Wifi SSID': 'IoT-Pi-3-2', 'Channel name': '', 'Serial No': 'YG0080901841', 'MAC': 'd8:31:34:22:e6:ff', 'Build Number': '519.00E04142A', 'Password': 'RoheuskEdfekJa3'})
5 channels leaked City
313 channels leaked AD ID
5 channels leaked Zip
2 channels leaked State
1 channels leaked Email
225 channels leaked Channel name
108 channels leaked Serial No
317 channels leaked Build Number


## Roku leaks

In [106]:
DEVICE_ID_NAMES = ['AD ID', 'Serial No', 'MAC', 'Device name', 'Device name', 'Wifi SSID', 'Device ID']
id_leaks_roku = leaks_roku[leaks_roku.id_type.isin(DEVICE_ID_NAMES)]

6314

In [115]:
print_leak_stats(leaks_roku)

313 channels leaked AD ID
317 channels leaked Build Number
225 channels leaked Channel name
108 channels leaked Serial No
5 channels leaked City
2 channels leaked State
5 channels leaked Zip
1 channels leaked Email
1 channels leaked Unknown


## ID Leaks

In [116]:
print_leak_stats(id_leaks_roku)

313 channels leaked AD ID
108 channels leaked Serial No


In [123]:
len(leaks_roku), len(id_leaks_roku), leaks_roku.channel_id.nunique(), id_leaks_roku.channel_id.nunique()
print ("IDs leaked", len(id_leaks_roku), "times in", id_leaks_roku.channel_id.nunique(), "channels")
print ("Channel name leaked in ", leaks_roku[leaks_roku.id_type=="Channel name"].channel_id.nunique(), "channels")

('IDs leaked', 6314, 'times in', 373, 'channels')
('Channel name leaked in ', 225, 'channels')


In [124]:
leaks_roku.playback.value_counts()

AttributeError: 'DataFrame' object has no attribute 'playback'

## Leak types

In [109]:
leaks_roku.id_type.value_counts()

Channel name    7291
AD ID           4606
Build Number    2910
Serial No       1708
Zip               35
City              30
State             11
Email              2
Unknown            2
Name: id_type, dtype: int64

In [110]:
id_leaks_roku.id_type.value_counts()

AD ID        4606
Serial No    1708
Name: id_type, dtype: int64

In [111]:
id_leaks_roku.drop_duplicates(['channel_id', 'id_type']).id_type.value_counts()

AD ID        313
Serial No    108
Name: id_type, dtype: int64

In [62]:
leaks_roku.adblocked.value_counts()

True     10278
False     6317
Name: adblocked, dtype: int64

In [65]:
leaks_roku.id_type.unique()

array(['AD ID', 'Build Number', 'Channel name', 'Serial No', 'City',
       'State', 'Zip', 'Email', 'Unknown'], dtype=object)

In [None]:
- email address sent to  http://api.qello.com/users/register/ for registration purposes
- crawler actually clicked the dialog to allow email address to be accessed from Roku

In [66]:
leaks_roku[leaks_roku.id_type=="Unknown"]

Unnamed: 0,adblocked,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,domain_by_dns,easylist_blocked,easypivacy_blocked,...,rank,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent
1,False,Music,40299,Stingray Qello,,,False,qello.com,False,False,...,498,,qello.com,macyli47,TERMINATED,80,84,1557386763.45743,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A)
1,False,Music,40299,Stingray Qello,,,False,qello.com,False,False,...,498,,qello.com,macyli47,TERMINATED,80,59,1557386729.679612,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A)


In [67]:
leaks_roku[leaks_roku.id_type=="Email"]

Unnamed: 0,adblocked,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,domain_by_dns,easylist_blocked,easypivacy_blocked,...,rank,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent
0,False,Music,40299,Stingray Qello,,,False,qello.com,False,False,...,498,,qello.com,macyli47@gmail.com,TERMINATED,80,84,1557386763.45743,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A)
0,False,Music,40299,Stingray Qello,,,False,qello.com,False,False,...,498,,qello.com,macyli47@gmail.com,TERMINATED,80,59,1557386729.679612,http://api.qello.com/users/register/,Roku/DVP-9.0 (519.00E04142A)


In [71]:
id_leaks_roku.adblocked.value_counts()

True     4853
False    1461
Name: adblocked, dtype: int64

In [103]:
## AD ID leaks

In [98]:
not_blocked = id_leaks_roku[~id_leaks_roku.adblocked]
not_blocked[not_blocked.id_type=="AD ID"].drop_duplicates(["channel_id", "req_domain"]).req_domain.value_counts().head(10)

monarchads.com     33
adrise.tv          18
ewscloud.com        7
lightcast.com       4
myspotlight.tv      4
aragoncreek.com     4
kargo.com           2
brightline.tv       2
theplatform.com     1
nbcuni.com          1
Name: req_domain, dtype: int64

In [104]:
not_blocked[not_blocked.id_type=="Serial No"].drop_duplicates(["channel_id", "req_domain"]).req_domain.value_counts().head(10)

irchan.com       47
lightcast.com     5
bigstar.tv        3
adrise.tv         2
rfdcc.com         1
                  1
slacker.com       1
eulive.eu         1
ihopkc.org        1
nmax.tv           1
Name: req_domain, dtype: int64

In [97]:
not_blocked = id_leaks_roku[~id_leaks_roku.adblocked]
not_blocked.drop_duplicates(["channel_id", "req_domain"]).req_domain.value_counts().head(11)

irchan.com         47
monarchads.com     33
adrise.tv          19
ewscloud.com        7
lightcast.com       6
aragoncreek.com     4
myspotlight.tv      4
bigstar.tv          3
brightline.tv       2
kargo.com           2
turner.com          1
Name: req_domain, dtype: int64

In [11]:
# leaks_amazon.head(3)

### Leaked IDs

In [99]:
leaks_roku.id_type.value_counts()

Channel name    7291
AD ID           4606
Build Number    2910
Serial No       1708
Zip               35
City              30
State             11
Email              2
Unknown            2
Name: id_type, dtype: int64

### Leak location

In [101]:
leaks_roku.leak_type.value_counts()

url_leaks         15134
post_leaks         1268
referrer_leaks      183
cookie_leaks         10
Name: leak_type, dtype: int64

### Leak encodings

In [102]:
leaks_roku.encoding.value_counts()

unencoded    13913
md5           1110
sha1          1004
urlencode      522
base64          42
sha256           4
Name: encoding, dtype: int64

In [16]:
#leaks_amazon.encoding.value_counts()

### Manual investigation of domains

In [85]:
id_leaks_roku[id_leaks_roku.req_domain=="myspotlight.tv"].url.unique()

array([u'http://api.myspotlight.tv/vmap/5c83418097f815b71495ef07/1280/720?secure=true&app_idfa=ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8&device_make=Roku&device_model=3900X&app_name=American%20Beauty%20Star&device_type=roku&app_id=200404&app_url=https://channelstore.roku.com/details/200404/american-beauty-starapp_version=28.6&app_bundle=com.roku.americanbeautystar',
       u'http://api.myspotlight.tv/vmap/5cccb8d699f8158e06c89a80/1280/720?secure=true&app_idfa=ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8&device_make=Roku&device_model=3900X&app_name=vikings&device_type=roku&app_id=176822&app_url=https://channelstore.roku.com/details/176822/minnesota-vikingsapp_version=31.4&app_bundle=com.roku.vikings',
       u'http://api.myspotlight.tv/vmap/5cccbfdb99f815fc0dc89a82/1280/720?secure=true&app_idfa=ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8&device_make=Roku&device_model=3900X&app_name=vikings&device_type=roku&app_id=176822&app_url=https://channelstore.roku.com/details/176822/minnesota-vikingsapp_version=31.4&

In [82]:
id_leaks_roku[id_leaks_roku.req_domain == "ewscloud.com"].url.unique()

array([u'http://ads.ewscloud.com/preplay/nb15_film?ad.preroll=1&ad=nb_futuretoday45fawesome&pp2ip=0&cdur=15&app_name=Fawesome.tv&ga_track=1&ad.cust_params=rdid%3Dded0f0e3-b3aa-59a2-a143-f6c1157a7ae8%26app_name%3DFawesome.tv%26bundle_id%3Droku.fawesome.tv%26is_lat%3D0',
       u'http://ads.ewscloud.com/preplay/nb15_film?ad.preroll=1&ad=nb_vitormediastandard&pp2ip=0&cdur=15&app_name=Comedy%20Classics&ga_track=1&ad.cust_params=rdid%3Dded0f0e3-b3aa-59a2-a143-f6c1157a7ae8%26app_name%3DComedy%20Classics%26bundle_id%3Dcom.roku.vm-comedy-classics%26is_lat%3D',
       u'http://ads.ewscloud.com/preplay/nb15_film?ad.preroll=1&ad=nb_vitormediapremium&pp2ip=0&cdur=15&app_name=freehorrorchannel&ga_track=1&ad.cust_params=rdid%3Dded0f0e3-b3aa-59a2-a143-f6c1157a7ae8%26app_name%3Dfreehorrorchannel%26bundle_id%3Droku.freehorrorchannel%26is_lat%3D',
       u'http://ads.ewscloud.com/preplay/nb15_film?ad.preroll=1&ad=nb_vitormediapremium&pp2ip=0&cdur=15&app_name=freemoviesnow&ga_track=1&ad.cust_params=rdid%

In [95]:
id_leaks_roku[id_leaks_roku.req_domain == "bigstar.tv"].url.unique()

array([u'http://artofseduction.bigstar.tv/mobile/verifyToken?os=ROKU&device=aa26569d5b22bece2040ee7d92b01a12-YG0080901841&lan=en',
       u'http://artofseduction.bigstar.tv/mobile/genres?os=ROKU&device=aa26569d5b22bece2040ee7d92b01a12-YG0080901841&lan=en&avod_min=1',
       u'http://artofseduction.bigstar.tv/mobile/movies?os=ROKU&device=aa26569d5b22bece2040ee7d92b01a12-YG0080901841&lan=en&genre=most-popular&limit=30&shortDescription&ph=-1&rp=-1&se=-1&fi=-1&st=-1&lc=-1&ge=-1&du=-1&di=-1&wr=-1&ca=-1&ma=-1&hd=-1&sl=-1&vi=-1&li=-1&rd=-1&fe=-1&rt=-1&ra=-1&xml&ao=true&po=false',
       u'http://artofseduction.bigstar.tv/mobile/userInfo?os=ROKU&device=aa26569d5b22bece2040ee7d92b01a12-YG0080901841&lan=en',
       u'http://artofseduction.bigstar.tv/mobile/userMovieDetails?os=ROKU&device=aa26569d5b22bece2040ee7d92b01a12-YG0080901841&lan=en&film=18204',
       u'http://artofseduction.bigstar.tv/mobile/movies?os=ROKU&device=aa26569d5b22bece2040ee7d92b01a12-YG0080901841&lan=en&noAdultFilter&shortDe