## All TCP connections made during the crawls
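- Overview of the distinct TCP connections captured in the Roku and Amazon crawls, and of the hostnames they reach.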

In [1]:
import numpy as np
import pandas as pd
from log_analysis import (get_domain_by_ip, load_timestamps_from_crawl_data,
                          get_distinct_tcp_conns,
                          load_dns_data, get_crawl_status, get_epoch)

from os.path import isdir, join
from datetime import datetime
from glob import glob

# Names of the crawl directories analysed in this notebook, one per platform.
AMAZON_CRAWL = 'amazon-data-20190415-124534'
ROKU_CRAWL = 'roku-data-20190412-122224'

# Use the primary location when it exists, otherwise fall back to a local copy.
# (Alternative local copy: '/media/gacar/Data/iot-house/crawl-data/')
ROOT_CRAWL_DIR = '/mnt/iot-house/crawl-data/'
ROOT_CRAWL_DIR = ROOT_CRAWL_DIR if isdir(ROOT_CRAWL_DIR) else '/home/gacar/dev/smart-tv/data'

# Platform label -> crawl directory under ROOT_CRAWL_DIR.
crawl_data_dirs = {
    platform: join(ROOT_CRAWL_DIR, crawl_name)
    for platform, crawl_name in (("Amazon", AMAZON_CRAWL), ("Roku", ROKU_CRAWL))
}
crawl_data_dir_amazon = crawl_data_dirs["Amazon"]
crawl_data_dir_roku = crawl_data_dirs["Roku"]

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Fetch the crawl status of each platform (Roku first, then Amazon) and merge
# them into one mapping; Amazon entries win if a key appears in both.
roku_crawl_results, amazon_crawl_results = (
    get_crawl_status(crawl_dir)
    for crawl_dir in (crawl_data_dir_roku, crawl_data_dir_amazon)
)
crawl_results = roku_crawl_results.copy()
crawl_results.update(amazon_crawl_results)

## Load timestamps

In [3]:
# Start from the Amazon crawl's timestamps and merge the Roku crawl's into the
# same mapping (Roku entries win if a key appears in both).
channel_timestamps = load_timestamps_from_crawl_data(crawl_data_dir_amazon)
roku_channel_timestamps = load_timestamps_from_crawl_data(crawl_data_dir_roku)
channel_timestamps.update(roku_channel_timestamps)

# Take the longest value stored in channel_timestamps, collect the first
# element of each of its entries as the label set, and keep the labels that
# contain 'key-seq'.
longest_label_set = max(channel_timestamps.values(), key=len)
label_set = [entry[0] for entry in longest_label_set]
smart_crawl_labels = [label for label in label_set if 'key-seq' in label]

Loading timestamp data from /home/gacar/dev/smart-tv/data/amazon-data-20190415-124534
Loading timestamp data from /home/gacar/dev/smart-tv/data/roku-data-20190412-122224


In [4]:
# Load the distinct TCP connections of each crawl (Roku first, then Amazon),
# looking the directories up by platform label.
roku_tcp = get_distinct_tcp_conns(crawl_data_dirs["Roku"])
amazon_tcp = get_distinct_tcp_conns(crawl_data_dirs["Amazon"])


Loading distinct TCP connections from /home/gacar/dev/smart-tv/data/roku-data-20190412-122224/post-process 
Loading distinct TCP connections from /home/gacar/dev/smart-tv/data/amazon-data-20190415-124534/post-process 


In [5]:
# Number of distinct TCP connections in the Roku crawl.
roku_tcp.shape[0]

71660

In [6]:
# Number of distinct TCP connections in the Amazon crawl.
amazon_tcp.shape[0]

16506

In [7]:
# Preview the first three Roku connections.
roku_tcp.iloc[:3]

Unnamed: 0,tcp_stream,frame_time_epoch,ip_src,ip_dst,tcp_dstport,channel_name,mitm_attempt,hostname,timestamp
0,9,1555178000.0,10.42.0.119,23.22.241.63,443,56317,0,api.roku.com,2019-04-13 13:52:15
1,10,1555178000.0,10.42.0.119,52.84.27.165,443,56317,0,image.roku.com,2019-04-13 13:52:16
2,13,1555178000.0,10.42.0.119,54.174.219.117,443,56317,0,api.roku.com,2019-04-13 13:52:18


In [10]:
# Preview the first three Amazon connections.
amazon_tcp.iloc[:3]

Unnamed: 0,tcp_stream,frame_time_epoch,ip_src,ip_dst,tcp_dstport,channel_name,mitm_attempt,hostname,timestamp
0,3,1555398000.0,10.42.0.198,52.94.233.94,443,com.hallmarkchannel.tv,0,msh.amazon.com,2019-04-16 03:05:30
1,4,1555398000.0,10.42.0.198,52.94.233.94,443,com.hallmarkchannel.tv,0,msh.amazon.com,2019-04-16 03:05:36
2,5,1555398000.0,10.42.0.198,52.46.132.145,443,com.hallmarkchannel.tv,0,appstore-tv-prod-na.amazon.com,2019-04-16 03:06:00


## Unmatched IP addresses
- Destination IP addresses for which we could not find a domain (their `hostname` is `unknown`)

In [11]:
# Number of Roku connections whose hostname is "unknown".
int(roku_tcp["hostname"].eq("unknown").sum())

1231

In [13]:
# Connections per destination IP among Roku connections with an unknown hostname.
roku_tcp.loc[roku_tcp["hostname"] == "unknown", "ip_dst"].value_counts()

8.8.8.8           1037
45.55.53.218        75
34.210.236.142      30
208.79.221.176      29
119.81.201.168      28
52.32.237.35        10
100.24.67.155        9
3.83.40.168          5
34.222.220.139       5
8.8.4.4              2
100.26.161.83        1
Name: ip_dst, dtype: int64

In [19]:
# Number of Roku channels with an unknown-hostname connection to 8.8.8.8.
roku_tcp.loc[
    roku_tcp["hostname"].eq("unknown") & roku_tcp["ip_dst"].eq("8.8.8.8"),
    "channel_name",
].nunique()

116

In [15]:
# Number of Roku channels with at least one unknown-hostname connection.
roku_tcp.loc[roku_tcp["hostname"] == "unknown", "channel_name"].nunique()

120

In [16]:
# Number of Amazon connections whose hostname is "unknown".
int(amazon_tcp["hostname"].eq("unknown").sum())

532

In [18]:
# Connections per destination IP among Amazon connections with an unknown hostname.
amazon_tcp.loc[amazon_tcp["hostname"] == "unknown", "ip_dst"].value_counts()

10.0.0.165        261
10.0.0.144        261
78.47.94.237        9
172.217.12.200      1
Name: ip_dst, dtype: int64

In [20]:
# Connections per destination IP among Amazon connections with an unknown hostname.
# NOTE(review): same query as the preceding cell; likely a leftover re-run.
amazon_tcp.loc[amazon_tcp["hostname"] == "unknown", "ip_dst"].value_counts()

10.0.0.165        261
10.0.0.144        261
78.47.94.237        9
172.217.12.200      1
Name: ip_dst, dtype: int64

In [21]:
# Amazon channels with at least one unknown-hostname connection.
amazon_tcp.loc[amazon_tcp["hostname"] == "unknown", "channel_name"].unique()

array(['com.hallmarkchannel.tv', 'tv.vhx.noizetv',
       'com.amazon.rialto.webapp.A97a39f857075ac09468b9222ff79d920f80cf054',
       'tv.expatprime.player',
       'com.amazon.rialto.webapp.A0166410f4578b4c09471c8a72d90142158edd4f0',
       'com.amazon.rialto.cordova.webapp.webapp16803e6c9e294e42883e237f1f387716',
       'com.amazon.rialto.cordova.webapp.webappa54eae912bb344479bb421756a21d8f6',
       'com.hiveview.damaitv', 'com.globalchurchtv',
       'com.amazon.rialto.cordova.webapp.webappf6ba1e5bf16744aca1f5f9f2e270bf3e',
       'com.antena3.atresplayer.tv', 'com.discovery.firetv.idsgo',
       'com.amazon.rialto.cordova.webapp.webappe7a61635ffe64b97a38f4073b8e0f872',
       'blackcinema.bigstar.tv', 'com.tube8',
       'com.amazon.rialto.cordova.webapp.webappda85a5000c9f4824b165ba500a2731a7',
       'com.amazon.rialto.webapp.Ae1e1dcd16922eb9449e8066f1059bfa8ff836a76',
       'firetv.bigstartv.equality', 'com.ktv.app',
       'com.dotstudioz.dotstudioPRO.abstv',
       'com.amaz

In [22]:
# Show three example Amazon connections whose hostname is "unknown".
# A fixed random_state keeps the sampled rows stable across re-runs.
amazon_tcp[amazon_tcp.hostname=="unknown"].sample(3, random_state=0)

Unnamed: 0,tcp_stream,frame_time_epoch,ip_src,ip_dst,tcp_dstport,channel_name,mitm_attempt,hostname,timestamp
53,60,1555362000.0,10.42.0.198,10.0.0.144,40526,com.gtv.safenetwork,0,unknown,2019-04-15 17:01:05
139,151,1555437000.0,10.42.0.198,10.0.0.165,48224,com.amazon.rialto.webapp.A7aff98a5c98bdf2b8f45...,0,unknown,2019-04-16 13:57:49
113,120,1555359000.0,10.42.0.198,10.0.0.144,40526,com.antena3.atresplayer.tv,0,unknown,2019-04-15 16:09:55


In [23]:
# First rows of the Roku connections whose hostname is "unknown".
roku_tcp.loc[roku_tcp["hostname"] == "unknown"].head()

Unnamed: 0,tcp_stream,frame_time_epoch,ip_src,ip_dst,tcp_dstport,channel_name,mitm_attempt,hostname,timestamp
255,412,1555178000.0,10.42.0.119,8.8.8.8,53,56317,0,unknown,2019-04-13 13:56:10
309,494,1555178000.0,10.42.0.119,8.8.8.8,53,56317,0,unknown,2019-04-13 13:57:22
315,511,1555178000.0,10.42.0.119,8.8.8.8,53,56317,0,unknown,2019-04-13 13:57:44
352,566,1555178000.0,10.42.0.119,8.8.8.8,53,56317,0,unknown,2019-04-13 13:58:44
403,641,1555178000.0,10.42.0.119,8.8.8.8,53,56317,0,unknown,2019-04-13 14:00:01


In [24]:
# Number of distinct hostnames contacted in the Roku crawl.
roku_tcp["hostname"].nunique()

559

In [25]:
# Number of distinct hostnames contacted in the Amazon crawl.
amazon_tcp["hostname"].nunique()

364

In [26]:
# Count the distinct hostnames each Roku channel contacted, largest first:
# de-duplicate (channel, hostname) pairs, then count rows per channel.
df_n_domains_by_channel = (
    roku_tcp
    .drop_duplicates(subset=["channel_name", "hostname"])
    .groupby(["channel_name"])
    .size()
    .reset_index(name="# hostname")
    .sort_values(by=['# hostname'], ascending=False)
)
df_n_domains_by_channel.head(10)

Unnamed: 0,channel_name,# hostname
86,38896,54
124,79647,53
120,74519,52
133,86186,51
143,90131,49
3,111255,46
144,90440,45
103,6119,44
102,56317,44
110,68669,44


In [27]:
# Top 10 hostnames by number of distinct Roku channels contacting them,
# leaving out hostnames containing "roku.com" and unresolved ("unknown") ones.
# regex=False makes "roku.com" a literal substring; by default str.contains
# treats the pattern as a regex, where "." matches any character.
(
    roku_tcp[
        ~roku_tcp.hostname.str.contains("roku.com", regex=False)
        & (roku_tcp.hostname != 'unknown')
    ]
    .drop_duplicates(subset=["channel_name", "hostname"])
    .groupby(["hostname"]).size().reset_index(name="# channels")
    .sort_values(by=['# channels'], ascending=False)
    .head(10)
)

Unnamed: 0,hostname,# channels
353,pubads.g.doubleclick.net,119
80,b.scorecardresearch.com,41
465,tpc.googlesyndication.com,39
426,securepubads.g.doubleclick.net,33
419,search.spotxchange.com,30
512,www.google-analytics.com,24
165,dpm.demdex.net,24
22,ad.doubleclick.net,23
443,static.ifood.tv,21
336,player.vimeo.com,21


In [28]:
# Top 10 hostnames by number of distinct Roku channels contacting them on
# destination port 80, leaving out hostnames containing "roku.com" and
# unresolved ("unknown") ones.
# regex=False makes "roku.com" a literal substring; by default str.contains
# treats the pattern as a regex, where "." matches any character.
(
    roku_tcp[
        ~roku_tcp.hostname.str.contains("roku.com", regex=False)
        & (roku_tcp["tcp_dstport"] == 80)
        & (roku_tcp.hostname != 'unknown')
    ]
    .drop_duplicates(subset=["channel_name", "hostname"])
    .groupby(["hostname"]).size().reset_index(name="# channels")
    .sort_values(by=['# channels'], ascending=False)
    .head(10)
)

Unnamed: 0,hostname,# channels
32,b.scorecardresearch.com,35
168,pubads.g.doubleclick.net,24
202,static.ifood.tv,20
193,search.spotxchange.com,18
36,boot.irchan.com,15
102,hallmarkchannel.sc.omtrdc.net,13
78,dpm.demdex.net,11
4,PD13201-bnblt.ads.tremorhub.com,10
84,event.spotxchange.com,10
228,www.google-analytics.com,9


In [29]:
# Top 20 hostnames by number of distinct Amazon channels contacting them,
# leaving out hostnames containing "amazon.com" and unresolved ("unknown") ones.
# regex=False makes "amazon.com" a literal substring; by default str.contains
# treats the pattern as a regex, where "." matches any character.
# NOTE(review): hostnames such as *.amazon-adsystem.com are not excluded by
# this filter — confirm that is intended.
(
    amazon_tcp[
        ~amazon_tcp.hostname.str.contains("amazon.com", regex=False)
        & (amazon_tcp.hostname != 'unknown')
    ]
    .drop_duplicates(subset=["channel_name", "hostname"])
    .groupby(["hostname"]).size().reset_index(name="# channels")
    .sort_values(by=['# channels'], ascending=False)
    .head(20)
)

Unnamed: 0,hostname,# channels
9,aax-us-east.amazon-adsystem.com,99
122,graph.facebook.com,95
328,z.moatads.com,93
247,sb.scorecardresearch.com,93
150,mobile-collector.newrelic.com,93
64,can.cbs.com,92
106,dpm.demdex.net,92
242,s.amazon-adsystem.com,86
292,udm.scorecardresearch.com,75
26,andr-785f3ec7eb-cbc62794911ff31b-c55635c491a5a...,72


In [32]:
# One connection per (channel, hostname) pair for hostnames containing
# "moatads" in the Amazon crawl, ordered by frame_time_epoch.
(
    amazon_tcp.loc[amazon_tcp["hostname"].str.contains("moatads")]
    .drop_duplicates(subset=['channel_name', 'hostname'])
    .sort_values(by="frame_time_epoch")
)

Unnamed: 0,tcp_stream,frame_time_epoch,ip_src,ip_dst,tcp_dstport,channel_name,mitm_attempt,hostname,timestamp
30,44,1.555349e+09,10.42.0.198,23.194.110.93,443,com.gtv.goodparent,0,z.moatads.com,2019-04-15 13:17:21
55,66,1.555350e+09,10.42.0.198,23.194.110.93,443,com.amazon.rialto.cordova.webapp.webappc26b1d7...,0,z.moatads.com,2019-04-15 13:44:14
67,79,1.555352e+09,10.42.0.198,23.194.110.93,443,com.ventuno.tv.firetv.frames21,0,z.moatads.com,2019-04-15 14:18:01
91,112,1.555354e+09,10.42.0.198,23.194.110.93,443,com.netflix.ninja,0,z.moatads.com,2019-04-15 14:44:15
80,100,1.555354e+09,10.42.0.198,23.194.110.93,443,com.viewlift.peopleawesome,0,z.moatads.com,2019-04-15 14:53:56
26,44,1.555356e+09,10.42.0.198,23.194.110.93,443,com.elevatetv.tv.platform,0,z.moatads.com,2019-04-15 15:17:27
37,50,1.555357e+09,10.42.0.198,23.194.110.93,443,com.tulix.ginikousafiretv,0,z.moatads.com,2019-04-15 15:44:17
141,146,1.555359e+09,10.42.0.198,23.194.110.93,443,com.antena3.atresplayer.tv,0,z.moatads.com,2019-04-15 16:17:26
63,72,1.555361e+09,10.42.0.198,23.194.110.93,443,com.nortextv.tv.platform,0,z.moatads.com,2019-04-15 16:44:20
17,25,1.555363e+09,10.42.0.198,23.194.110.93,443,com.wb.amzn.dcuniverse,0,z.moatads.com,2019-04-15 17:17:59


In [34]:
# Top 10 hostnames by number of distinct Amazon channels contacting them on
# destination port 80, leaving out hostnames containing "amazon.com" and
# unresolved ("unknown") ones.
# regex=False makes "amazon.com" a literal substring; by default str.contains
# treats the pattern as a regex, where "." matches any character.
# NOTE(review): hostnames such as *.amazon-adsystem.com and *.amazonaws.com are
# not excluded by this filter — confirm that is intended.
(
    amazon_tcp[
        ~amazon_tcp.hostname.str.contains("amazon.com", regex=False)
        & (amazon_tcp["tcp_dstport"] == 80)
        & (amazon_tcp.hostname != 'unknown')
    ]
    .drop_duplicates(subset=["channel_name", "hostname"])
    .groupby(["hostname"]).size().reset_index(name="# channels")
    .sort_values(by=['# channels'], ascending=False)
    .head(10)
)

Unnamed: 0,hostname,# channels
2,aax-us-east.amazon-adsystem.com,99
54,spectrum.s3.amazonaws.com,68
27,globalstreamingserver.com,14
5,amazon-web-app-whitelist.s3.amazonaws.com,6
59,static1.dmcdn.net,5
6,amazon.streamingtelevisioninc.com,5
49,res.cloudinary.com,4
26,equality.bigstar.tv,3
0,8s3.lvlt.dash.us.aiv-cdn.net,3
50,s.amazon-adsystem.com,3


In [36]:
# Top 10 hostnames by number of distinct Amazon channels contacting them on
# destination port 80, leaving out hostnames containing "amazon.com" and
# unresolved ("unknown") ones.
# regex=False makes "amazon.com" a literal substring; by default str.contains
# treats the pattern as a regex, where "." matches any character.
# NOTE(review): same query as the preceding cell; likely a leftover re-run.
(
    amazon_tcp[
        ~amazon_tcp.hostname.str.contains("amazon.com", regex=False)
        & (amazon_tcp["tcp_dstport"] == 80)
        & (amazon_tcp.hostname != 'unknown')
    ]
    .drop_duplicates(subset=["channel_name", "hostname"])
    .groupby(["hostname"]).size().reset_index(name="# channels")
    .sort_values(by=['# channels'], ascending=False)
    .head(10)
)

Unnamed: 0,hostname,# channels
2,aax-us-east.amazon-adsystem.com,99
54,spectrum.s3.amazonaws.com,68
27,globalstreamingserver.com,14
5,amazon-web-app-whitelist.s3.amazonaws.com,6
59,static1.dmcdn.net,5
6,amazon.streamingtelevisioninc.com,5
49,res.cloudinary.com,4
26,equality.bigstar.tv,3
0,8s3.lvlt.dash.us.aiv-cdn.net,3
50,s.amazon-adsystem.com,3


In [37]:
# All Roku connections to pubads.g.doubleclick.net.
roku_tcp.loc[roku_tcp["hostname"].eq('pubads.g.doubleclick.net')]

Unnamed: 0,tcp_stream,frame_time_epoch,ip_src,ip_dst,tcp_dstport,channel_name,mitm_attempt,hostname,timestamp
13,30,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 13:52:25
14,33,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 13:52:25
238,395,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 13:56:04
244,401,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 13:56:06
258,415,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 13:56:10
293,477,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 13:57:20
388,626,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 13:59:57
446,707,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 14:01:11
460,721,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 14:01:17
463,724,1.555178e+09,10.42.0.119,172.217.12.194,443,56317,0,pubads.g.doubleclick.net,2019-04-13 14:01:17
