In [21]:
import numpy as np
import pandas as pd
from log_analysis import get_crawl_status, load_dns_data

from os.path import isdir, join
from datetime import datetime
from glob import glob


from collections import Counter
from itertools import chain

# Names of the crawl snapshot directories analysed in this notebook.
AMAZON_CRAWL = 'amazon-data-20190415-124534'
ROKU_CRAWL = 'roku-data-20190412-122224'

# Use the mounted crawl directory when it exists, otherwise fall back to a
# local data directory.
# (Alternative local location, currently disabled:
#  '/media/gacar/Data/iot-house/crawl-data/')
_MOUNTED_CRAWL_DIR = '/mnt/iot-house/crawl-data/'
_LOCAL_CRAWL_DIR = '/home/gacar/dev/smart-tv/data'
ROOT_CRAWL_DIR = _MOUNTED_CRAWL_DIR if isdir(_MOUNTED_CRAWL_DIR) else _LOCAL_CRAWL_DIR

# Full path of each crawl, individually and keyed by platform name.
crawl_data_dir_amazon = join(ROOT_CRAWL_DIR, AMAZON_CRAWL)
crawl_data_dir_roku = join(ROOT_CRAWL_DIR, ROKU_CRAWL)
crawl_data_dirs = {"Amazon": crawl_data_dir_amazon, "Roku": crawl_data_dir_roku}

In [9]:
# Per-platform crawl status as returned by get_crawl_status. Later cells call
# .items() and .values() on these and compare the values to "TERMINATED", so
# they are used as mappings (presumably channel -> status string; confirm in
# log_analysis).
roku_crawl_results = get_crawl_status(crawl_data_dir_roku)
amazon_crawl_results = get_crawl_status(crawl_data_dir_amazon)
# Combined results for both platforms, built from a copy so that
# roku_crawl_results itself is not modified.
# NOTE(review): assuming dict semantics, a key present in both crawls keeps
# only the Amazon entry.
crawl_results = roku_crawl_results.copy()
crawl_results.update(amazon_crawl_results)

## Total number of channels crawled

In [19]:
# Report how many entries each platform's crawl results contain.
n_roku_channels = len(roku_crawl_results)
n_amazon_channels = len(amazon_crawl_results)
print("No. of channels crawled: Roku", n_roku_channels)
print("No. of channels crawled: Amazon", n_amazon_channels)

No. of channels crawled: Roku 150
No. of channels crawled: Amazon 100


## Roku failed crawls

In [13]:
# Any status other than "TERMINATED" is counted as a failed crawl.
roku_failed_cnt = sum(
    1 for status in roku_crawl_results.values() if status != "TERMINATED"
)
print("%d of the %d channels failed" % (roku_failed_cnt, len(roku_crawl_results)))

# Breakdown of Roku channels by crawl status.
counts = Counter(roku_crawl_results.values())
print(counts)

9 of the 150 channels failed
Counter({'TERMINATED': 141, 'INSTALLING': 8, 'LAUNCHING': 1})


## Amazon failed crawls

In [14]:
# Any status other than "TERMINATED" is counted as a failed crawl.
amazon_failed_cnt = sum(
    1 for status in amazon_crawl_results.values() if status != "TERMINATED"
)
print("%d of the %d channels failed" % (amazon_failed_cnt, len(amazon_crawl_results)))

# Breakdown of Amazon channels by crawl status.
counts = Counter(amazon_crawl_results.values())
print(counts)


5 of the 100 channels failed
Counter({'TERMINATED': 95, 'LAUNCHING': 3, 'INSTALLING': 1, 'TERMINATING': 1})


## DNS data

In [37]:
# load_dns_data returns a pair for each crawl directory, unpacked here as
# (rIP2NameDB, rName2IPDB).
# NOTE(review): by name these look like IP -> hostnames and hostname -> IPs
# lookups; a later cell iterates rIP2NameDB.values() as collections of
# hostname strings. Confirm the exact schema in log_analysis.
roku_rIP2NameDB, roku_rName2IPDB = load_dns_data(crawl_data_dir_roku)
amazon_rIP2NameDB, amazon_rName2IPDB = load_dns_data(crawl_data_dir_amazon)

In [44]:
from tld import get_fld

In [38]:
### Number of distinct hostnames and domains (PS+1) queried

In [56]:
def _hostnames_in(ip2name_db):
    """Return the distinct hostnames found in the values of `ip2name_db`.

    Each value is iterated as a collection of hostname strings; trailing
    dots are stripped before de-duplication.
    """
    return {name.rstrip(".") for names in ip2name_db.values() for name in names}


def _first_level_domains(hostnames):
    """Return the distinct first-level domains (PS+1) of `hostnames`.

    Each hostname is prefixed with "http://" before being passed to get_fld.
    """
    return {get_fld("http://" + name) for name in hostnames}


distinct_hosts_queried_roku = _hostnames_in(roku_rIP2NameDB)
distinct_hosts_queried_amazon = _hostnames_in(amazon_rIP2NameDB)

distinct_domains_queried_roku = _first_level_domains(distinct_hosts_queried_roku)
distinct_domains_queried_amazon = _first_level_domains(distinct_hosts_queried_amazon)

print(
    "Roku - Distinct hosts:", len(distinct_hosts_queried_roku),
    "Distinct domains (PS+1):", len(distinct_domains_queried_roku),
)
print(
    "Amazon - Distinct hosts:", len(distinct_hosts_queried_amazon),
    "Distinct domains (PS+1):", len(distinct_domains_queried_amazon),
)

Roku - Distinct hosts: 561 Distinct domains (PS+1): 276
Amazon - Distinct hosts: 379 Distinct domains (PS+1): 209
