## Detect identifiers in HTTP traffic from Roku

In [35]:
import pandas as pd
from os.path import dirname, realpath, join, isfile
from utils import (get_ps1_or_ipaddress, read_pcap_fields_from_txts,
                   download_roku_channel_details, ROKU_MACS)

In [2]:
import seaborn as sns
%matplotlib inline
# Apply the seaborn defaults in a single call: enable the short color codes
# and make every figure 16x9 inches.
sns.set(color_codes=True, rc={'figure.figsize': (16, 9)})

In [3]:
# '__file__' is a plain string here (not the module attribute), so realpath()
# resolves it against the current working directory; its dirname is that
# directory itself.
WORKING_DIR = dirname(realpath('__file__'))
# ROOT_PROJ_DIR is three levels above the working directory.
ROOT_PROJ_DIR = dirname(dirname(dirname(WORKING_DIR)))
DATA_DIR = join(ROOT_PROJ_DIR, "data/pcap_analysis/roku-channel-surfer/2018-09-27")
# Inputs of the 2018-09-27 crawl, all located under DATA_DIR.
HTTP_CSV = join(DATA_DIR, "roku-2018-09-27-requests.csv")
DNS_QUERIES_DIR = join(DATA_DIR, "dns")
HTTP_REQS_DIR = join(DATA_DIR, "http")

### Load HTTP request details
- Load the request details from the CSV or build from scratch

In [4]:
# The request log is only read from the previously built, tab-separated CSV;
# when that file is absent a message is printed and `df` stays undefined.
if not isfile(HTTP_CSV):
    print("Can't find the CSV file %s" % HTTP_CSV)
else:
    df = pd.read_csv(HTTP_CSV, sep='\t', encoding='utf-8')

In [6]:
# Headline numbers for the loaded request log.
n_requests = len(df)
n_domains = df.domain.nunique()
n_channels = df.channel_id.nunique()
print("%d requests to %d distinct domains from %s distinct channels" %
      (n_requests, n_domains, n_channels))

30029 requests to 506 distinct domains from 993 distinct channels


### Fixed Roku Identifiers
- look for device/user-specific identifiers in the traffic

In [28]:
# Identifier values to search for in the request URLs, keyed by a
# human-readable label; search_roku_ids_in_reqs() reports that label alongside
# each encoding it finds.
# NOTE(review): presumably these belong to the Roku device used for the crawl
# and to its location -- confirm before reusing this notebook with another
# device.
ROKU_IDS = {
    "Serial No": "YG0080901841",
    "AD ID": "ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8",
    "Device ID": "C33858901841",
    "MAC": "d8:31:34:22:e6:ff",
    "City": "Princeton",
    "State": "New Jersey",
    "Zip": "08540"
}

from base64 import b64encode
from urllib import quote
from hashlib import md5, sha1, sha256

def get_encodings(_id):
    """Yield (encoding name, encoded value) pairs for an identifier.

    The pairs come out in a fixed order: the identifier as given, its
    upper- and lower-cased forms, the hex digests of its MD5, SHA1 and SHA256
    hashes, its Base64 encoding and its URL-quoted form. Different names may
    map to the same value (e.g. "Plain" and "Lowercase"); no de-duplication
    is done here.
    """
    variants = [
        ("Plain", _id),
        ("Uppercase", _id.upper()),
        ("Lowercase", _id.lower()),
    ]
    # The three digests only differ in the hash constructor that is used.
    for label, hasher in (("MD5", md5), ("SHA1", sha1), ("SHA256", sha256)):
        variants.append((label, hasher(_id).hexdigest()))
    variants.append(("Base64", b64encode(_id)))
    variants.append(("Urlencode", quote(_id)))

    for variant in variants:
        yield variant

def search_roku_ids_in_reqs(req_df):
    """Search for IDs and their various encodings in the HTTP requests.

    Every identifier in ROKU_IDS is run through get_encodings(), and each
    distinct encoded value is looked up as a literal substring of the request
    URLs.

    Args:
        req_df: DataFrame of HTTP requests; its `url` and `channel_id`
            columns are read.

    Returns:
        A list of (id_name, encoding_name, n_channels, encoded_id) tuples, one
        for each encoding found in the URLs of at least one channel, where
        n_channels is the number of distinct channel IDs with a matching URL.
    """
    transmitted_encodings = []
    for id_name, _id in ROKU_IDS.items():
        seen_id_encodings = set()
        for encoding_name, encoded_id in get_encodings(_id):
            # Different encodings can produce the same value (e.g. "Plain" and
            # "Lowercase"); report each value once, under the first name.
            if encoded_id in seen_id_encodings:
                continue
            seen_id_encodings.add(encoded_id)
            # regex=False: the encoded ID is a literal, and characters such as
            # "+", "." or "(" (possible in Base64 or raw values) must not be
            # interpreted as regex syntax.
            # na=False: rows without a URL count as non-matches instead of
            # putting NaN into the boolean mask.
            has_id = req_df.url.str.contains(encoded_id, regex=False, na=False)
            n_hits = req_df[has_id].channel_id.nunique()
            if n_hits:
                transmitted_encodings.append((id_name, encoding_name, n_hits, encoded_id))
    return transmitted_encodings

In [29]:
hits = search_roku_ids_in_reqs(df)
# One row per (identifier, encoding) that was seen, most widespread first.
hits_table = pd.DataFrame(hits, columns=["ID type", "Encoding", "# channels", "ID Value"])
hits_table.sort_values(by=["# channels", "ID type"], ascending=False)

Unnamed: 0,ID type,Encoding,# channels,ID Value
8,Serial No,Plain,515,YG0080901841
1,AD ID,Plain,177,ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8
6,Zip,Plain,46,08540
0,City,Plain,35,Princeton
7,State,Urlencode,27,New%20Jersey
4,AD ID,SHA1,22,9e3dc8bc68c0c74ca772b982e0596d5d17db25f9
3,AD ID,MD5,21,0af0ecdbf35907d450cdb7a7eebd7874
2,AD ID,Uppercase,17,DED0F0E3-B3AA-59A2-A143-F6C1157A7AE8
10,Serial No,SHA1,16,3ba5afee40ed31762a86657dfc901cb5e4009dd1
9,Serial No,MD5,15,9e13f59dab6a01f233ba5952ced6dfe0


### Top endpoints that receive the device serial number

In [19]:
# Requests whose URL contains the plain-text serial number.
serial_reqs = df[df.url.str.contains(ROKU_IDS["Serial No"])]
# Keep one row per (channel name, domain) pair, count rows per domain and show
# the ten domains with the highest counts.
(serial_reqs
 .drop_duplicates(subset=["channel_name", "domain"])
 .groupby("domain").size().reset_index(name="# channels")
 .sort_values(by=['# channels'], ascending=False)
 .head(10))

Unnamed: 0,domain,# channels
13,ifood.tv,282
14,irchan.com,183
12,google-analytics.com,47
23,yumenetworks.com,20
4,bigstar.tv,12
16,lightcast.com,5
0,adrise.com,5
18,spotxchange.com,4
22,viewlift.com,3
1,adrise.tv,2


### MD5 of the device serial number

In [20]:
serial_md5 = md5(ROKU_IDS["Serial No"]).hexdigest()
# Requests whose URL contains the MD5 hex digest of the serial number.
serial_md5_reqs = df[df.url.str.contains(serial_md5)]
# Keep one row per (channel name, domain) pair and count rows per domain.
(serial_md5_reqs
 .drop_duplicates(subset=["channel_name", "domain"])
 .groupby("domain").size().reset_index(name="# channels")
 .sort_values(by=['# channels'], ascending=False))

Unnamed: 0,domain,# channels
2,scorecardresearch.com,12
0,2o7.net,1
1,cbsi.com,1
3,telebreeze.com,1
4,unicornmedia.com,1


In [21]:
serial_sha1 = sha1(ROKU_IDS["Serial No"]).hexdigest()
# Requests whose URL contains the SHA1 hex digest of the serial number.
serial_sha1_reqs = df[df.url.str.contains(serial_sha1)]
# Keep one row per (channel name, domain) pair and count rows per domain.
(serial_sha1_reqs
 .drop_duplicates(subset=["channel_name", "domain"])
 .groupby("domain").size().reset_index(name="# channels")
 .sort_values(by=['# channels'], ascending=False))

Unnamed: 0,domain,# channels
2,scorecardresearch.com,12
1,google-analytics.com,3
0,2o7.net,1


### Cross-channel ID detection
- quick and dirty search for IDs that are used by more than one channel
- we will replace this with the ID detection method from the WWW'15 paper by Steve and Arvind

In [24]:
from urlparse import urlparse
def get_potential_ids(url):
    """Return the query-string values of `url` that could be identifiers.

    The query string is split on "&". A segment contributes its value only
    when it contains exactly one "=" and the value is longer than five
    characters. Values are returned as they appear in the URL (no
    URL-decoding) and in their original order.

    Args:
        url: The request URL to inspect.

    Returns:
        A list of candidate ID strings; empty when the URL has no query
        string or no segment qualifies.
    """
    query_part = urlparse(url).query
    if not query_part:
        return []

    potential_ids = []
    for query_pair in query_part.split("&"):
        # Empty segments come from "&&" or a trailing "&". Skip them rather
        # than returning early, which would throw away the IDs already
        # collected from this URL.
        if not query_pair:
            continue
        if query_pair.count("=") == 1:
            potential_id = query_pair.split("=")[-1]
            if len(potential_id) > 5:
                potential_ids.append(potential_id)
    return potential_ids


In [25]:
from collections import defaultdict

# Map every candidate ID value to the set of channel IDs whose requests
# carried it in a URL query string.
id_locations = defaultdict(set)
# Only two columns are needed, so walk them in parallel; iterrows() would
# build a full Series for each request just to read these two fields.
for url, channel_id in zip(df["url"], df["channel_id"]):
    for _id in get_potential_ids(url):
        id_locations[_id].add(channel_id)


In [26]:
# List every candidate value that was sent by more than 25 distinct channels,
# followed by the number of channels.
for candidate, senders in id_locations.items():
    n_senders = len(senders)
    if n_senders <= 25:
        continue
    print("%s %d" % (candidate, n_senders))

696570 29
hidden 72
Digital%20Video%20player 27
midpoint 40
thirdQuartile 39
ConnectedTV 27
complete 77
HDTV.518.10E04155A 28
eNozZPAL9fFBI2ois8Iy%2FN2DcvyqPCsjq7KN%2FKqSjaNy3TKjXMIy%2FLKyTfyyXMsjszyNfLN8bQGSHxTj 34
24-bit 47
happykids 38
197054 26
recurring 40
Roku%2FDVP-8.10%20%28518.10E04155A%29 79
138466 46
new-qid 110
nodeid 26
9212268898 38
6272977 73
1280x720 53
new-channel 44
eNozZvAL9fFBI2r8srJN%2FENcjf1cnHIjQ6Jyo3J9q6LC%2FbJ9XVwrfKu8cn1dQg0ic%2F2AMNQWAHwnFGs%3D 34
eNozYPAL9fFBI2r8qiINfbNCDSOr%2FHJ8s7IrI41CTXyNIk0jQ5LLI7MCjf3cA6v8w8OyfXMDbQF9gxRO 39
ROKU_ADS_CONTENT_LENGTH 57
http://roku.com 56
1920x1080 63
POD_POSITION 61
United%20States 27
tracking.instanttvchannel.com 33
%2Anull 29
147492 29
128.112.225.64 107
roku.happykids 38
eNozYvAL9fFBI2p8qxwNIrOyK6JCUnL8XUIr%2FEIyMiOz0iv83X0rfLOyTX2rko38XVwrIqsibQGT7hUz 40
linear 84
ded0f0e3-b3aa-59a2-a143-f6c1157a7ae8 175
720p%20HDTV 46
YG0080901841 497
UA-44634522-1 33
v1-b76 64
Princeton 27
2.1511.10 66
New%20Jersey 27
https%3A%2F%

### One Google Analytics ID is shared across 33 channels
- It turns out this ID belongs to http://www.instanttvchannel.com/ "Cloud-Based Roku Channel Production System"

In [33]:
# Show the full URL of the first request that carries the shared
# Google Analytics ID.
ga_requests = df[df.url.str.contains("UA-44634522-1")]
ga_requests["url"].iloc[0]

u'http://www.google-analytics.com/__utm.gif?utmwv=1&utmn=1814227968&utmsr=720p%20HDTV&utmsc=24-bit&utmul=en-us&utmje=0&utmfl=-&utmdt=3900X&utmhn=tracking.instanttvchannel.com&utmr=-&utmp=HDTV.518.10E04155A&utmac=UA-44634522-1&utmcc=__utma%3D369465963.1311272320.1538381177.1538381177.1538381177.1%3B%2B__utmz%3D369465963.1538381177.1.2.utmccn%3D(direct)%7Cutmcsr%3D(direct)%7Cutmcmd%3D(none)%3B%2B__utmv%3D369465963.YG0080901841%3B&utme=5(20a38354-b9e1-430c-956f-4805c1065c32*Chinese%2520Movie*1.0.14)&utmt=event'

In [34]:
# Preview the first requests that carry the shared Google Analytics ID.
ga_mask = df.url.str.contains("UA-44634522-1")
df[ga_mask].head()

Unnamed: 0,channel_id,start_ts,command,select_idx,eth_src,ip_dst,req_method,url,channel_name,domain,rank,category
1001,89807,1538381174,launch,0,d8:31:34:22:e6:ff,172.217.10.78,GET,http://www.google-analytics.com/__utm.gif?utmw...,Chinese Movie,google-analytics.com,6504,movies-tvs
1008,52296,1538345065,launch,0,d8:31:34:22:e6:ff,172.217.10.78,GET,http://www.google-analytics.com/__utm.gif?utmw...,Robin Hood Heaven,google-analytics.com,4197,movies-tvs
2572,75222,1538258287,launch,0,d8:31:34:22:e6:ff,172.217.10.78,GET,http://www.google-analytics.com/__utm.gif?utmw...,HorrorHabit,google-analytics.com,1180,movies-tvs
3840,43484,1538361008,launch,0,d8:31:34:22:e6:ff,172.217.10.46,GET,http://www.google-analytics.com/__utm.gif?utmw...,MTD Studio Classics,google-analytics.com,4987,movies-tvs
5301,79558,1538330911,launch,0,d8:31:34:22:e6:ff,172.217.10.78,GET,http://www.google-analytics.com/__utm.gif?utmw...,The Millionaire Movie Channel,google-analytics.com,3286,movies-tvs
