## All TCP connections made during the crawls
- 

In [1]:
import numpy as np
import pandas as pd
from log_analysis import (get_tcp_conns, load_timestamps_from_crawl_data,
                          gen_network_df,
                          load_dns_data, get_crawl_status, get_epoch)

from os.path import isdir, join
from datetime import datetime
from glob import glob

AMAZON_CRAWL = 'amazon-data-20190415-124534'
ROKU_CRAWL = 'roku-data-20190412-122224'

ROOT_CRAWL_DIR = '/mnt/iot-house/crawl-data/'
if not isdir(ROOT_CRAWL_DIR):
    # ROOT_CRAWL_DIR = '/media/gacar/Data/iot-house/crawl-data/'
    ROOT_CRAWL_DIR = '/home/gacar/dev/smart-tv/data'

crawl_data_dir_amazon = join(ROOT_CRAWL_DIR, AMAZON_CRAWL)
crawl_data_dir_roku = join(ROOT_CRAWL_DIR, ROKU_CRAWL)

crawl_data_dirs = {
    "Amazon": crawl_data_dir_amazon,
    "Roku": crawl_data_dir_roku,
}

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
roku_crawl_results = get_crawl_status(crawl_data_dir_roku)
amazon_crawl_results = get_crawl_status(crawl_data_dir_amazon)
crawl_results = roku_crawl_results.copy()
crawl_results.update(amazon_crawl_results)

## Load timestamps

In [3]:
channel_timestamps = load_timestamps_from_crawl_data(crawl_data_dir_amazon)
channel_timestamps.update(load_timestamps_from_crawl_data(crawl_data_dir_roku))

longest_label_set = max(list(channel_timestamps.values()), key=len)
label_set = [i[0] for i in longest_label_set]
smart_crawl_labels = list(filter(lambda x: 'key-seq' in x, label_set))
#print(channel_timestamps)

Loading timestamp data from /home/gacar/dev/smart-tv/data/amazon-data-20190415-124534
Loading timestamp data from /home/gacar/dev/smart-tv/data/roku-data-20190412-122224


In [4]:
def get_domain_by_ip(ip_address, ip2name_db):
    if ip_address in ip2name_db:
        return ip2name_db[ip_address][0].rstrip('.')
    else:
        return "unknown"

In [11]:
from os.path import basename
def get_tcp_conns(crawl_data_dir, rIP2NameDB):
    df = pd.DataFrame([])
    # Find all tls failure due to invalid cert:
    post_process_dir = join(crawl_data_dir, 'post-process')
    for csv_path in glob(join(post_process_dir, "*all_tcp_streams")):
        filename = basename(csv_path)
        channel_name = filename.split("-")[0]
        tmp_df = pd.read_csv(csv_path, sep=',', encoding='utf-8', index_col=None)
        tmp_df['channel_name'] = channel_name
        df = df.append(tmp_df)
    df["hostname"] = df["ip.dst"].map(lambda x: get_domain_by_ip(x, rIP2NameDB))
    df['timestamp'] = df['frame.time_epoch'].map(lambda x: datetime.fromtimestamp(
            int(x)).strftime('%Y-%m-%d %H:%M:%S'))
    return df



In [12]:
#roku_tcp = get_tcp_conns(crawl_data_dir_roku)
#amazon_tcp = get_tcp_conns(crawl_data_dir_amazon)

rIP2NameDB, _ = load_dns_data(crawl_data_dir_roku)
roku_tcp = get_tcp_conns(crawl_data_dir_roku, rIP2NameDB)

rIP2NameDB, _ = load_dns_data(crawl_data_dir_amazon)
amazon_tcp = get_tcp_conns(crawl_data_dir_amazon, rIP2NameDB)
# amazon_tcp["hostname"] = amazon_tcp["ip.dst"].map(lambda x: get_domain_by_ip(x, rIP2NameDB))

In [13]:
len(roku_tcp)

71811

In [14]:
len(amazon_tcp)

17087

In [15]:
roku_tcp.sample(3)

Unnamed: 0,tcp.stream,frame.time_epoch,ip.src,ip.dst,tcp.dstport,channel_name,hostname,timestamp
136,226,1555119000.0,10.42.0.119,52.7.100.112,443,89531,scribe.logs.roku.com,2019-04-12 21:24:32
8,20,1555133000.0,10.42.0.119,52.45.134.131,443,194836,api.roku.com,2019-04-13 01:18:44
40,86,1555211000.0,10.42.0.119,34.199.173.179,443,74416,scribe.logs.roku.com,2019-04-13 22:56:33


In [16]:
amazon_tcp.sample(3)

Unnamed: 0,tcp.stream,frame.time_epoch,ip.src,ip.dst,tcp.dstport,channel_name,hostname,timestamp
159,151,1555466000.0,10.42.0.198,62.113.210.9,443,de.edv_medien.muxxtv,5856e1a25f71a.streamlock.net,2019-04-16 21:47:02
54,69,1555354000.0,10.42.0.198,54.239.26.255,443,com.viewlift.peopleawesome,unagi-na.amazon.com,2019-04-15 14:53:41
164,169,1555366000.0,10.42.0.198,74.125.174.139,443,com.amazon.rialto.webapp.A0166410f4578b4c09471...,r5---sn-ab5sznl7.googlevideo.com,2019-04-15 18:00:57


## Unmatched IP addresses
- the IP addresses we cannot find the domain for

In [17]:
len(roku_tcp[roku_tcp.hostname=="unknown"])

1233

In [18]:
roku_tcp[roku_tcp.hostname=="unknown"]["ip.dst"].value_counts()

8.8.8.8           1039
45.55.53.218        75
34.210.236.142      30
208.79.221.176      29
119.81.201.168      28
52.32.237.35        10
100.24.67.155        9
3.83.40.168          5
34.222.220.139       5
8.8.4.4              2
100.26.161.83        1
Name: ip.dst, dtype: int64

In [35]:
roku_tcp[(roku_tcp.hostname=="unknown") & (roku_tcp["ip.dst"] == "8.8.8.8") ]["channel_name"].nunique()

116

In [19]:
roku_tcp[roku_tcp.hostname=="unknown"]["channel_name"].nunique()

120

In [20]:
len(amazon_tcp[amazon_tcp.hostname=="unknown"])

1113

In [21]:
amazon_tcp[amazon_tcp.hostname=="unknown"]["ip.dst"].value_counts()

10.0.0.165        522
10.0.0.144        522
78.47.94.237       68
172.217.12.200      1
Name: ip.dst, dtype: int64

In [None]:
amazon_tcp[amazon_tcp.hostname=="unknown"]["ip.dst"].value_counts()

In [38]:
amazon_tcp[amazon_tcp.hostname=="unknown"]["channel_name"].unique()

array(['com.amazon.rialto.cordova.webapp.webapp61704e97000940daa1592b7194a9f7f5',
       'com.amazon.rialto.cordova.webapp.webappda85a5000c9f4824b165ba500a2731a7',
       'tv.vhx.noizetv', 'com.ktv.app', 'com.nortextv.tv.platform',
       'vlaf.android.player.nettv',
       'com.amazon.rialto.webapp.A7aff98a5c98bdf2b8f45a12e4e7ef3a963750381',
       'com.discovery.firetv.idsgo', 'com.dotstudioz.dotstudioPRO.abstv',
       'com.antena3.atresplayer.tv',
       'com.amazon.rialto.cordova.webapp.webapp0c6894c85898448da2be4bcdc6ed41ef',
       'com.amazon.rialto.webapp.A8302c274ec0e63feedfcf7722c23357c612356bb',
       'com.tube8',
       'com.amazon.rialto.webapp.A1ad7745609925c0784704d332c4db0e015576c71',
       'com.amazon.rialto.cordova.webapp.webappf2cfaca3cc134c849adb5cecf42f39e5',
       'com.amazon.rialto.cordova.webapp.webapp16803e6c9e294e42883e237f1f387716',
       'com.amazon.rialto.cordova.webapp.webappb6944171e7bf4e119067f992a0053381',
       'de.edv_medien.muxxtv', 'com.xclude

In [23]:
amazon_tcp[amazon_tcp.hostname=="unknown"].sample(3)

Unnamed: 0,tcp.stream,frame.time_epoch,ip.src,ip.dst,tcp.dstport,channel_name,hostname,timestamp
130,138,1555431000.0,10.42.0.198,10.0.0.144,40526,com.amazon.rialto.cordova.webapp.webapp61704e9...,unknown,2019-04-16 12:08:46
181,179,1555450000.0,10.42.0.198,10.0.0.144,40526,com.amazon.rialto.cordova.webapp.webappf2cfaca...,unknown,2019-04-16 17:29:55
178,188,1555444000.0,10.42.0.198,10.0.0.165,48224,com.amazon.rialto.webapp.Ac20ad3090e933be164bc...,unknown,2019-04-16 15:49:05


In [24]:
roku_tcp[roku_tcp.hostname=="unknown"].head()

Unnamed: 0,tcp.stream,frame.time_epoch,ip.src,ip.dst,tcp.dstport,channel_name,hostname,timestamp
301,427,1555111000.0,10.42.0.119,8.8.8.8,53,73376,unknown,2019-04-12 19:13:29
518,739,1555111000.0,10.42.0.119,8.8.8.8,53,73376,unknown,2019-04-12 19:17:29
679,972,1555111000.0,10.42.0.119,8.8.8.8,53,73376,unknown,2019-04-12 19:20:53
769,1098,1555111000.0,10.42.0.119,8.8.8.8,53,73376,unknown,2019-04-12 19:22:41
1319,1710,1555206000.0,10.42.0.119,8.8.8.8,53,144078,unknown,2019-04-13 21:40:53


In [25]:
roku_tcp.hostname.nunique()

559

In [26]:
amazon_tcp.hostname.nunique()

364

In [27]:
df_n_domains_by_channel = roku_tcp.drop_duplicates(subset=["channel_name", "hostname"]).\
    groupby(["channel_name"]).size().reset_index(name="# hostname").\
    sort_values(by=['# hostname'], ascending=False)
df_n_domains_by_channel.head(10)

Unnamed: 0,channel_name,# hostname
86,38896,54
124,79647,53
120,74519,52
133,86186,51
143,90131,49
3,111255,46
144,90440,45
103,6119,44
102,56317,44
110,68669,44


In [28]:
roku_tcp[~roku_tcp.hostname.str.contains("roku.com") & (roku_tcp.hostname != 'unknown')].drop_duplicates(subset=["channel_name", "hostname"]).\
    groupby(["hostname"]).size().reset_index(name="# channels").\
    sort_values(by=['# channels'], ascending=False).head(10)

Unnamed: 0,hostname,# channels
353,pubads.g.doubleclick.net,119
80,b.scorecardresearch.com,41
465,tpc.googlesyndication.com,39
426,securepubads.g.doubleclick.net,33
419,search.spotxchange.com,30
512,www.google-analytics.com,24
165,dpm.demdex.net,24
22,ad.doubleclick.net,23
443,static.ifood.tv,21
336,player.vimeo.com,21


In [29]:
roku_tcp[~roku_tcp.hostname.str.contains("roku.com") &
         (roku_tcp["tcp.dstport"] == 80) &
         (roku_tcp.hostname != 'unknown')].drop_duplicates(subset=["channel_name", "hostname"]).\
    groupby(["hostname"]).size().reset_index(name="# channels").\
    sort_values(by=['# channels'], ascending=False).head(10)

Unnamed: 0,hostname,# channels
32,b.scorecardresearch.com,35
168,pubads.g.doubleclick.net,24
202,static.ifood.tv,20
193,search.spotxchange.com,18
36,boot.irchan.com,15
102,hallmarkchannel.sc.omtrdc.net,13
78,dpm.demdex.net,11
4,PD13201-bnblt.ads.tremorhub.com,10
84,event.spotxchange.com,10
228,www.google-analytics.com,9


In [39]:
amazon_tcp[
    ~amazon_tcp.hostname.str.contains("amazon.com") &
           (amazon_tcp.hostname != 'unknown')].drop_duplicates(subset=["channel_name", "hostname"]).\
    groupby(["hostname"]).size().reset_index(name="# channels").\
    sort_values(by=['# channels'], ascending=False).head(20)

Unnamed: 0,hostname,# channels
9,aax-us-east.amazon-adsystem.com,99
122,graph.facebook.com,95
328,z.moatads.com,93
247,sb.scorecardresearch.com,93
150,mobile-collector.newrelic.com,93
64,can.cbs.com,92
106,dpm.demdex.net,92
242,s.amazon-adsystem.com,86
292,udm.scorecardresearch.com,75
26,andr-785f3ec7eb-cbc62794911ff31b-c55635c491a5a...,72


In [127]:
amazon_tcp['timestamp'] = amazon_tcp['frame.time_epoch'].map(lambda x: datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))

In [134]:
amazon_tcp[amazon_tcp.hostname.str.contains("moatads")].\
    drop_duplicates(['channel_name', 'hostname']).sort_values("frame.time_epoch")

Unnamed: 0,tcp.stream,frame.time_epoch,ip.src,ip.dst,tcp.dstport,channel_name,hostname,timestamp
30,44,1.555349e+09,10.42.0.198,23.194.110.93,443,com.gtv.goodparent,z.moatads.com,2019-04-15 13:17:21
55,66,1.555350e+09,10.42.0.198,23.194.110.93,443,com.amazon.rialto.cordova.webapp.webappc26b1d7...,z.moatads.com,2019-04-15 13:44:14
67,79,1.555352e+09,10.42.0.198,23.194.110.93,443,com.ventuno.tv.firetv.frames21,z.moatads.com,2019-04-15 14:18:01
91,112,1.555354e+09,10.42.0.198,23.194.110.93,443,com.netflix.ninja,z.moatads.com,2019-04-15 14:44:15
80,100,1.555354e+09,10.42.0.198,23.194.110.93,443,com.viewlift.peopleawesome,z.moatads.com,2019-04-15 14:53:56
26,44,1.555356e+09,10.42.0.198,23.194.110.93,443,com.elevatetv.tv.platform,z.moatads.com,2019-04-15 15:17:27
37,50,1.555357e+09,10.42.0.198,23.194.110.93,443,com.tulix.ginikousafiretv,z.moatads.com,2019-04-15 15:44:17
141,146,1.555359e+09,10.42.0.198,23.194.110.93,443,com.antena3.atresplayer.tv,z.moatads.com,2019-04-15 16:17:26
63,72,1.555361e+09,10.42.0.198,23.194.110.93,443,com.nortextv.tv.platform,z.moatads.com,2019-04-15 16:44:20
17,25,1.555363e+09,10.42.0.198,23.194.110.93,443,com.wb.amzn.dcuniverse,z.moatads.com,2019-04-15 17:17:59


In [123]:
amazon_tcp[~amazon_tcp.hostname.str.contains("amazon.com") &
         (amazon_tcp["tcp.dstport"] == 80) &
         (amazon_tcp.hostname != 'unknown')].drop_duplicates(subset=["channel_name", "hostname"]).\
    groupby(["hostname"]).size().reset_index(name="# channels").\
    sort_values(by=['# channels'], ascending=False).head(10)

Unnamed: 0,hostname,# channels
2,aax-us-east.amazon-adsystem.com,99
54,spectrum.s3.amazonaws.com,68
27,globalstreamingserver.com,14
5,amazon-web-app-whitelist.s3.amazonaws.com,6
59,static1.dmcdn.net,5
6,amazon.streamingtelevisioninc.com,5
49,res.cloudinary.com,4
26,equality.bigstar.tv,3
0,8s3.lvlt.dash.us.aiv-cdn.net,3
50,s.amazon-adsystem.com,3


In [None]:
amazon_tcp[~amazon_tcp.hostname.str.contains("amazon.com") &
           (amazon_tcp["tcp.dstport"] == 80) &
           (amazon_tcp.hostname != 'unknown')].drop_duplicates(subset=["channel_name", "hostname"]).\
    groupby(["hostname"]).size().reset_index(name="# channels").\
    sort_values(by=['# channels'], ascending=False).head(10)

In [131]:
roku_tcp[roku_tcp.hostname == 'pubads.g.doubleclick.net']

Unnamed: 0,tcp.stream,frame.time_epoch,ip.src,ip.dst,tcp.dstport,channel_name,hostname
13,30,1.555088e+09,10.42.0.119,172.217.12.162,443,2285,pubads.g.doubleclick.net
14,33,1.555088e+09,10.42.0.119,172.217.12.162,443,2285,pubads.g.doubleclick.net
521,906,1.555089e+09,10.42.0.119,172.217.12.162,443,2285,pubads.g.doubleclick.net
522,907,1.555089e+09,10.42.0.119,172.217.12.162,443,2285,pubads.g.doubleclick.net
13,30,1.555111e+09,10.42.0.119,172.217.3.98,443,73376,pubads.g.doubleclick.net
14,33,1.555111e+09,10.42.0.119,172.217.3.98,443,73376,pubads.g.doubleclick.net
524,745,1.555111e+09,10.42.0.119,172.217.10.130,443,73376,pubads.g.doubleclick.net
526,747,1.555111e+09,10.42.0.119,172.217.10.130,443,73376,pubads.g.doubleclick.net
527,748,1.555111e+09,10.42.0.119,172.217.10.130,443,73376,pubads.g.doubleclick.net
528,749,1.555111e+09,10.42.0.119,172.217.3.98,443,73376,pubads.g.doubleclick.net
