In [2]:
import LeakDetector
import pandas as pd
from glob import glob

In [3]:
from device_ids import TV_ID_MAP
from log_analysis import get_crawl_parameter
from crawl_ids import CrawlFireTVTop1KMITM
from df_utils import load_df

## Load leaks
- Run the following to detect and pickle leaks
  - Detect on all crawls: `python2 title_leaks.py`
  - Detect on a single crawl: `python2 title_leaks.py roku-data-20190508-013650`

In [4]:
leaks = load_df(CrawlFireTVTop1KMITM, "title_leak")

### Manual false positive removal

We manually removed false positives that occur within the resource URLs, not sent as a title parameter.
- "nowcast", "adtv" and "sbtv"
- `http://www.christian-tv.org/tv/firetv/video/adtv.html`
- `http://d27wx7ytq78mow.cloudfront.net/offair/sbtv-cb-v2/media_b3659760_0001.ts`
- We filtered out channels where channel name is same as the video

In [10]:
from log_analysis import add_domain_column
add_domain_column(leaks)
title_leaks = leaks[~leaks.search.isin(["nowcast", "adtv", "sbtv"]) & (leaks.id_type !="Channel name")]
title_leaks = title_leaks[title_leaks.adblocked]
# title_leaks = leaks[leaks.id_type !="Channel name"][['channel_id', 'channel_name', 'id_type', 'encoding', 'search', 'leak_type', 'req_domain']].drop_duplicates()
print(title_leaks.channel_id.nunique(), "channels")
title_leaks[['channel_id', 'channel_name', 'category', 'id_type', 'encoding', 'search', 'leak_type', 'req_domain']].drop_duplicates()

(14, 'channels')


Unnamed: 0,channel_id,channel_name,category,id_type,encoding,search,leak_type,req_domain
0,com.calkins.grahamksat,KSAT TV,News,imdb_title_KSAT-TV Livestream,urlencode,KSAT-TV Livestream,post_leaks,google-analytics.com
0,com.calkins.wral,WRAL,News,imdb_title_Severe storms batter central U.S.,urlencode,Severe storms batter central U.S.,url_leaks,scorecardresearch.com
0,com.calkins.wral,WRAL,News,imdb_title_Severe storms batter central U.S.,urlencode,Severe storms batter central U.S.,post_leaks,google-analytics.com
0,com.dmr.yuyu.tv,Yuyu - Movies &amp; TV,Movies & TV,imdb_title_Mood Indigo,urlencode,Mood Indigo,url_leaks,spotxchange.com
0,com.doapps.firetv.mln.MLN_6ae07dcb33ec3b7c814d...,WTMJ TODAY&#39;s TMJ4 Milwaukee,News,"imdb_title_Partly cloudy, cool Saturday",urlencode,"Partly cloudy, cool Saturday",post_leaks,google-analytics.com
0,com.hillsongchannel.now,Hillsong Channel NOW,Lifestyle,imdb_title_The Jesus Trek,unencoded,The Jesus Trek,post_leaks,litix.io
0,com.mylocaltv.kjrhfiretv,KJRH 2 Works For You Tulsa,News,imdb_title_In the Kitchen with Fireside Grill:...,urlencode,In the Kitchen with Fireside Grill: Caribbean ...,post_leaks,google-analytics.com
0,com.mylocaltv.wptvfiretv,WPTV NewsChannel 5 West Palm,News,imdb_title_To The Point,urlencode,To The Point,post_leaks,google-analytics.com
0,com.planetdiscover.granite.phone.wkbwfiretv,WKBW 7 Eyewitness News Buffalo,News,imdb_title_Graffiti Patio officially opens at ...,urlencode,Graffiti Patio officially opens at Tappo Pizza,post_leaks,google-analytics.com
0,com.zumobi.msnbc,NBC News,News,imdb_title_TODAY's Headlines,urlencode,TODAY's Headlines,url_leaks,omtrdc.net


In [9]:
from nb_utils import make_latex_table
print(make_latex_table(title_leaks[['channel_name', 'search', 'req_domain']].drop_duplicates(),
    caption="Title leaks in 100 random Fire TV channels",
    label="amazon_title_leaks"))




\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lll}
\toprule
 Channel name                    & Search                                                     & Req domain            \\
\midrule
 KSAT TV                         & KSAT-TV Livestream                                         & google-analytics.com  \\
 WRAL                            & Severe storms batter central U.S.                          & scorecardresearch.com \\
 WRAL                            & Severe storms batter central U.S.                          & google-analytics.com  \\
 Yuyu - Movies \&amp; TV          & Mood Indigo                                                & spotxchange.com       \\
 WTMJ TODAY\&\#39;s TMJ4 Milwaukee & Partly cloudy, cool Saturday                               & google-analytics.com  \\
 Hillsong Channel NOW            & The Jesus Trek                                             & litix.io              \\
 KJRH 2 Works For You Tulsa      & In the Kitchen 

In [16]:
## Categories

In [15]:
title_leaks[['channel_id', 'category']].drop_duplicates()['category'].value_counts()

News           7
Movies & TV    5
Lifestyle      1
Kids           1
Name: category, dtype: int64

In [17]:
title_leaks[title_leaks.url.str.startswith('https://')]

Unnamed: 0,adblocked,adblocked_by_url,category,channel_id,channel_name,cookie,decoded_data,disconnect_blocked,disconnect_blocked_by_url,domain_by_dns,...,rank,referer,req_domain,search,status,tcp_dstport,tcp_stream,time,url,user_agent
0,True,True,News,com.calkins.grahamksat,KSAT TV,,,True,True,google-analytics.com,...,679,,google-analytics.com,KSAT-TV Livestream,TERMINATED,443,50,1557669726.644327000,https://ssl.google-analytics.com/batch,GoogleAnalytics/12.4.51 (Linux; U; Android 5.1...
0,True,True,News,com.calkins.grahamksat,KSAT TV,,,True,True,google-analytics.com,...,679,,google-analytics.com,KSAT-TV Livestream,TERMINATED,443,50,1557669748.595021000,https://ssl.google-analytics.com/batch,GoogleAnalytics/12.4.51 (Linux; U; Android 5.1...
0,True,True,News,com.calkins.wral,WRAL,,,True,True,google-analytics.com,...,577,,google-analytics.com,Severe storms batter central U.S.,TERMINATED,443,83,1557662703.635857000,https://ssl.google-analytics.com/batch,GoogleAnalytics/12.4.51 (Linux; U; Android 5.1...
0,True,True,News,com.calkins.wral,WRAL,,,True,True,google-analytics.com,...,577,,google-analytics.com,Severe storms batter central U.S.,TERMINATED,443,83,1557662678.737298000,https://ssl.google-analytics.com/batch,GoogleAnalytics/12.4.51 (Linux; U; Android 5.1...
0,True,True,Movies & TV,com.dmr.yuyu.tv,Yuyu - Movies &amp; TV,,,True,True,spotxchange.com,...,470,,spotxchange.com,Mood Indigo,TERMINATED,443,28,1557655176.195926000,https://search.spotxchange.com/vast/2.0/204812...,Dalvik/2.1.0 (Linux; U; Android 5.1.1; AFTT Bu...
0,True,True,Movies & TV,com.dmr.yuyu.tv,Yuyu - Movies &amp; TV,,,True,True,spotxchange.com,...,470,,spotxchange.com,Mood Indigo,TERMINATED,443,27,1557655176.196145000,https://search.spotxchange.com/vast/2.0/204812...,Dalvik/2.1.0 (Linux; U; Android 5.1.1; AFTT Bu...
0,True,True,News,com.doapps.firetv.mln.MLN_6ae07dcb33ec3b7c814d...,WTMJ TODAY&#39;s TMJ4 Milwaukee,,,True,True,google-analytics.com,...,996,,google-analytics.com,"Partly cloudy, cool Saturday",TERMINATED,443,27,1557565041.862287000,https://ssl.google-analytics.com/batch,GoogleAnalytics/11.0.20 (Linux; U; Android 5.1...
0,True,True,Lifestyle,com.hillsongchannel.now,Hillsong Channel NOW,,,False,False,litix.io,...,1019,,litix.io,The Jesus Trek,TERMINATED,443,50,1557785664.983132000,https://sfrc0h2ursbel1jbo6s82jap5.litix.io/and...,Dalvik/2.1.0 (Linux; U; Android 5.1.1; AFTT Bu...
0,True,True,Lifestyle,com.hillsongchannel.now,Hillsong Channel NOW,,,False,False,litix.io,...,1019,,litix.io,The Jesus Trek,TERMINATED,443,51,1557785665.308241000,https://sfrc0h2ursbel1jbo6s82jap5.litix.io/and...,Dalvik/2.1.0 (Linux; U; Android 5.1.1; AFTT Bu...
0,True,True,News,com.mylocaltv.kjrhfiretv,KJRH 2 Works For You Tulsa,,,True,True,google-analytics.com,...,1114,,google-analytics.com,In the Kitchen with Fireside Grill: Caribbean ...,TERMINATED,443,21,1557701840.903336000,https://ssl.google-analytics.com/batch,GoogleAnalytics/11.0.20 (Linux; U; Android 5.1...


In [18]:
title_leaks[['channel_id', 'tcp_dstport']].drop_duplicates()['tcp_dstport'].value_counts()

443    14
80      2
Name: tcp_dstport, dtype: int64

In [22]:
title_leaks[['channel_id', 'channel_name', 'req_domain', 'tcp_dstport']].drop_duplicates()

Unnamed: 0,channel_id,channel_name,req_domain,tcp_dstport
0,com.calkins.grahamksat,KSAT TV,google-analytics.com,443
0,com.calkins.wral,WRAL,scorecardresearch.com,80
0,com.calkins.wral,WRAL,google-analytics.com,443
0,com.dmr.yuyu.tv,Yuyu - Movies &amp; TV,spotxchange.com,443
0,com.doapps.firetv.mln.MLN_6ae07dcb33ec3b7c814d...,WTMJ TODAY&#39;s TMJ4 Milwaukee,google-analytics.com,443
0,com.hillsongchannel.now,Hillsong Channel NOW,litix.io,443
0,com.mylocaltv.kjrhfiretv,KJRH 2 Works For You Tulsa,google-analytics.com,443
0,com.mylocaltv.wptvfiretv,WPTV NewsChannel 5 West Palm,google-analytics.com,443
0,com.planetdiscover.granite.phone.wkbwfiretv,WKBW 7 Eyewitness News Buffalo,google-analytics.com,443
0,com.zumobi.msnbc,NBC News,omtrdc.net,80
