In [1]:
import LeakDetector
import numpy as np
import pandas as pd
from nb_utils import get_crawl_data_path
from device_ids import TV_ID_MAP
from log_analysis import (load_dns_data, load_timestamps_from_crawl_data,
                          get_distinct_tcp_conns, get_crawl_parameter,
                          get_crawl_status, get_http_df, get_http2_df)

from os.path import isdir, join, basename
from datetime import datetime
from glob import glob


In [2]:
# Names of the two crawls analysed in this notebook. analyze_leaks() passes
# each name to get_crawl_data_path() to locate the crawl's data directory.
AMAZON_CRAWL = 'amazon-data-20190501-142343'
ROKU_CRAWL = 'roku-data-20190412-122224'

In [3]:
# When True, the Referer header is searched in addition to the URL, the
# cookie string and the POST body.
CHECK_REFERRER_LEAKS = True


def check_row_for_leaks(detector, req):
    """Search a single request record for leaked identifiers.

    Args:
        detector: object exposing ``check_url(url)`` and
            ``substring_search(text, max_layers=...)``.
        req: mapping (e.g. a DataFrame row) with the keys ``'url'``,
            ``'cookie'``, ``'post_data'`` and ``'referer'``.

    Returns:
        ``(url_leaks, cookie_leaks, post_leaks, referrer_leaks)`` when
        CHECK_REFERRER_LEAKS is true, otherwise the 3-tuple
        ``(url_leaks, cookie_leaks, post_leaks)``.
    """
    # All four fields are read up front, so a missing key fails immediately.
    target_url = req['url']
    cookie_header = req['cookie']
    request_body = req['post_data']
    referer_header = req['referer']

    # URL hits are the check_url() result extended with the substring hits.
    url_leaks = detector.check_url(target_url)
    url_leaks += detector.substring_search(target_url, max_layers=2)
    cookie_leaks = detector.substring_search(cookie_header, max_layers=2)
    post_leaks = detector.substring_search(request_body, max_layers=2)

    if not CHECK_REFERRER_LEAKS:
        return url_leaks, cookie_leaks, post_leaks

    referrer_leaks = detector.substring_search(referer_header, max_layers=2)
    return url_leaks, cookie_leaks, post_leaks, referrer_leaks

In [4]:
def reverse_dict(d):
    """Invert ``d`` into a mapping from lower-cased value to original key.

    If several values lower-case to the same string, the key that comes
    last in ``d``'s iteration order wins.
    """
    return {value.lower(): key for key, value in d.items()}

In [5]:
def convert_leaks_to_df(device_ids, leaks_dict):
    """Flatten detected leaks into a DataFrame with one row per leak.

    Args:
        device_ids: dict mapping an ID type (e.g. ``"AD ID"``) to the ID
            string that was searched for.
        leaks_dict: dict mapping a leak type (e.g. ``"url_leaks"``) to a list
            of leaks. Each leak is a sequence of either ``(search,)`` for an
            unencoded hit or ``(encoding, search)`` for an encoded one.

    Returns:
        DataFrame with the columns ``id_type``, ``search``, ``encoding`` and
        ``leak_type``. The columns are present even when there are no leaks,
        so callers can filter on them unconditionally.

    Raises:
        AssertionError: if a leak does not have exactly one or two elements.
        KeyError: if a leaked string is not one of the ``device_ids`` values
            (compared case-insensitively).
    """
    # Lower-cased ID value -> ID type, used to label each hit.
    ids_by_value = {value.lower(): id_type
                    for id_type, value in device_ids.items()}
    records = []
    for leak_type, leaks in leaks_dict.items():
        for leak in leaks:
            # An empty leak used to slip through and silently reuse the
            # search/encoding of the previous iteration; reject it instead.
            assert 1 <= len(leak) <= 2, "unexpected leak: %r" % (leak,)
            if len(leak) == 2:
                encoding, search = leak
            else:
                encoding, search = "unencoded", leak[0]
            records.append({'id_type': ids_by_value[search.lower()],
                            'search': search,
                            "encoding": encoding,
                            "leak_type": leak_type})
    if not records:
        # pd.DataFrame([]) would have no columns at all.
        return pd.DataFrame(
            columns=['id_type', 'search', "encoding", "leak_type"])
    return pd.DataFrame(records)



In [6]:
def detect_leaks_in_requests(df, device_ids):
    """Scan every request in ``df`` for leaks of the given identifiers.

    The channel name of each request is searched for as well: for every
    channel a detector is built from ``device_ids`` plus a
    ``"Channel name"`` entry holding that channel's name.

    Args:
        df: DataFrame of requests; the columns ``channel_name``,
            ``channel_id``, ``url``, ``cookie``, ``post_data`` and
            ``referer`` are read here or in check_row_for_leaks().
            NOTE: ``df`` is sorted by ``channel_name`` in place, so the
            caller's frame is reordered.
        device_ids: dict mapping ID type to the ID string to search for.
            On return its ``"Channel name"`` entry is set to ``""``
            (kept from the original behaviour); it is not modified otherwise.

    Returns:
        DataFrame with one row per detected leak, carrying the columns from
        convert_leaks_to_df() plus ``channel_id``, ``channel_name``, ``url``,
        ``cookie``, ``post_data`` and ``referer``. Columns are sorted by name.
        An empty, column-less DataFrame is returned when ``df`` has no rows.
    """
    # check_row_for_leaks() returns these in order; the last one is omitted
    # when referrer checking is disabled, which zip() below tolerates.
    leak_type_names = ("url_leaks", "cookie_leaks",
                       "post_leaks", "referrer_leaks")

    # Sorting groups the requests of a channel together, so a new detector
    # is only needed when the channel name changes.
    df.sort_values("channel_name", inplace=True)
    last_channel = ""
    leak_detector = None
    search_ids = dict(device_ids)
    leak_frames = []
    for _, req in df.iterrows():
        channel_name = str(req['channel_name'])
        channel_id = str(req['channel_id'])
        # `leak_detector is None` covers a first row whose name equals the
        # initial "" sentinel, which previously left the detector undefined.
        if leak_detector is None or channel_name != last_channel:
            last_channel = channel_name
            # Work on a copy so the caller's dict is not left holding a
            # channel name if anything below raises.
            search_ids = dict(device_ids)
            search_ids["Channel name"] = channel_name
            leak_detector = LeakDetector.LeakDetector(
                search_ids.values(), encoding_set=LeakDetector.ENCODINGS_NO_ROT,
                encoding_layers=2, hash_layers=2, debugging=False
            )
        row_leaks = check_row_for_leaks(leak_detector, req)
        tmp_df = convert_leaks_to_df(search_ids,
                                     dict(zip(leak_type_names, row_leaks)))
        tmp_df['channel_id'] = channel_id
        tmp_df['channel_name'] = channel_name
        tmp_df['url'] = req['url']
        tmp_df['cookie'] = req['cookie']
        tmp_df['post_data'] = req['post_data']
        tmp_df['referer'] = req['referer']
        leak_frames.append(tmp_df)

    # Preserved side effect: callers iterate over the keys of device_ids and
    # expect a (blank) "Channel name" entry to be present afterwards.
    device_ids["Channel name"] = ""

    if not leak_frames:
        return pd.DataFrame({})
    # One concat instead of DataFrame.append per row: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    leak_df = pd.concat(leak_frames, sort=True)
    # Sort columns explicitly so the layout does not depend on whether
    # concat found the per-row frames already aligned.
    return leak_df.sort_index(axis=1)


In [9]:
def analyze_leaks(crawl_name, req_df=None):
    """Run leak detection for one crawl and print a per-ID summary.

    Args:
        crawl_name: name of the crawl; resolved to a data directory with
            get_crawl_data_path().
        req_df: optional, already loaded request DataFrame. When None, the
            first element returned by get_http_df() for the crawl is used.

    Returns:
        ``(leak_df, req_df, id_dict)``: the detected leaks, the request frame
        that was searched, and the ID map taken from TV_ID_MAP under the
        crawl's ``WLANIF`` parameter.
    """
    crawl_dir = get_crawl_data_path(crawl_name)
    if req_df is None:
        req_df, _, _ = get_http_df(crawl_dir)
    print("\nCrawl name: %s" % crawl_name)
    print("%d reqs from %d channels" % (len(req_df), req_df.channel_id.nunique()))
    id_dict = TV_ID_MAP[get_crawl_parameter(crawl_dir, "WLANIF")]
    # NOTE(review): this writes every searched value into the notebook
    # output, including account e-mail and password entries -- clear or
    # redact the output before sharing the notebook.
    # %-formatting (rather than two print arguments) keeps the line from
    # being rendered as a tuple when the cell runs under Python 2.
    print("Will search for the following IDs %s" % (id_dict,))
    leak_df = detect_leaks_in_requests(req_df, id_dict)

    # With no leak at all the frame has no id_type column, and attribute
    # access on it would raise AttributeError.
    if "id_type" in leak_df.columns:
        for id_type in id_dict.keys():
            num_leaks = leak_df[leak_df.id_type == id_type].channel_id.nunique()
            if num_leaks:
                print("%d channels leaked %s" % (num_leaks, id_type))

    return leak_df, req_df, id_dict

In [10]:
# Run the leak analysis for both crawls.
# NOTE: id_dict is rebound by the second call, so after this cell it holds
# the ID map returned for the Roku crawl only.
leaks_amazon, requests_amazon, id_dict = analyze_leaks(AMAZON_CRAWL)
leaks_roku, requests_roku, id_dict = analyze_leaks(ROKU_CRAWL)

('Multiple messages', 96)

Crawl name: amazon-data-20190501-142343
3522 reqs from 78 channels
('Will search for the following IDs', {'City': 'Princeton', 'AD ID': '05baacb6-acdf-4e18-84cd-ae78eab7e081', 'Zip': '08540', 'State': 'New Jersey', 'Email': 'macyli47@gmail.com', 'Wifi SSID': 'IoT-Pi-1', 'Device name': "Macy's 4th Fire TV Stick", 'Software version': 'Fire OS 5.2.6.9 (6325552020)', 'Channel name': '', 'Serial No': 'G070L82185152J8P', 'MAC': '68:9a:87:20:3d:1d', 'Amazon account': 'Macy Li', 'Password': 'M888777'})
9 channels leaked AD ID
13 channels leaked Channel name
4 channels leaked Serial No
('Multiple messages', 402)

Crawl name: roku-data-20190412-122224
34345 reqs from 130 channels
('Will search for the following IDs', {'City': 'Princeton', 'AD ID': 'ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8', 'Device Name': 'Office tv', 'Zip': '08540', 'State': 'New Jersey', 'Device ID': 'C33858901841', 'Email': 'macyli47@gmail.com', 'Wifi SSID': 'IoT-Pi-3-2', 'Serial No': 'YG0080901841', 'M

## Roku leaks

In [12]:
# Preview the first three leak records of the Roku crawl.
leaks_roku.head(3)

Unnamed: 0,channel_id,channel_name,cookie,encoding,id_type,leak_type,post_data,referer,search,url
0,86186,24 Hour Movie Channel,,unencoded,AD ID,referrer_leaks,,http://us-east-sync.bidswitch.net/ul_cb/sync?d...,ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8,http://match.sharethrough.com/sync/v1?source_i...
1,86186,24 Hour Movie Channel,,unencoded,AD ID,url_leaks,,http://us-east-sync.bidswitch.net/ul_cb/sync?d...,ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8,http://match.sharethrough.com/sync/v1?source_i...
2,86186,24 Hour Movie Channel,,unencoded,AD ID,url_leaks,,http://us-east-sync.bidswitch.net/ul_cb/sync?d...,ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8,http://match.sharethrough.com/sync/v1?source_i...


In [22]:
# Preview the first three leak records of the Amazon crawl.
leaks_amazon.head(3)

Unnamed: 0,channel_id,channel_name,cookie,encoding,id_type,leak_type,post_data,referer,search,url
0,com.amctve.amcfiretv,AMC,,unencoded,Channel name,url_leaks,,,amc,http://amc-api-br.svc.ds.amcn.com/v2/public/mv...
1,com.amctve.amcfiretv,AMC,,unencoded,Channel name,url_leaks,,,amc,http://amc-api-br.svc.ds.amcn.com/v2/public/mv...
0,com.amctve.amcfiretv,AMC,,unencoded,Channel name,url_leaks,,,amc,http://amc-api-br.svc.ds.amcn.com/v2/public/fe...


### Leaked IDs

In [14]:
# Number of Roku leak records per leaked ID type.
leaks_roku.id_type.value_counts()

Channel name    11272
AD ID            4784
Build Number     3974
Serial No        2435
City               86
Zip                16
Device Name         9
Name: id_type, dtype: int64

In [19]:
# Number of Amazon leak records per leaked ID type.
leaks_amazon.id_type.value_counts()

Channel name    488
AD ID            82
Serial No        28
Name: id_type, dtype: int64

### Leak location

In [15]:
# Number of Roku leak records per request part (URL, POST body, referrer, cookie).
leaks_roku.leak_type.value_counts()

url_leaks         19293
post_leaks         2847
referrer_leaks      364
cookie_leaks         72
Name: leak_type, dtype: int64

In [20]:
# Number of Amazon leak records per request part in which the ID was found.
leaks_amazon.leak_type.value_counts()

url_leaks     517
post_leaks     81
Name: leak_type, dtype: int64

### Leak encodings

In [17]:
# Number of Roku leak records per encoding ("unencoded" = found verbatim).
leaks_roku.encoding.value_counts()

unencoded    19209
sha1          1466
md5           1466
base64         218
urlencode      217
Name: encoding, dtype: int64

In [21]:
# Number of Amazon leak records per encoding ("unencoded" = found verbatim).
leaks_amazon.encoding.value_counts()

unencoded    575
urlencode     19
md5            4
Name: encoding, dtype: int64