In [2]:
import LeakDetector
import pandas as pd
from glob import glob

In [3]:
from device_ids import TV_ID_MAP
from log_analysis import get_crawl_parameter
from crawl_ids import CrawlFireTVTop1KMITM
from df_utils import load_df

## Load leaks
- Run the following to detect and pickle leaks
  - Detect on all crawls: `python2 title_leaks.py`
  - Detect on a single crawl: `python2 title_leaks.py roku-data-20190508-013650`

In [14]:
leaks = load_df(CrawlFireTVTop1KMITM, "title_leak")

### Manual false positive removal

We manually removed false positives that occur within the resource URLs, not sent as a title parameter.
- "nowcast", "adtv" and "sbtv"
- `http://www.christian-tv.org/tv/firetv/video/adtv.html`
- `http://d27wx7ytq78mow.cloudfront.net/offair/sbtv-cb-v2/media_b3659760_0001.ts`
- We filtered out channels where channel name is same as the video

In [21]:
from log_analysis import add_domain_column
add_domain_column(leaks)
title_leaks = leaks[~leaks.search.isin(["nowcast", "adtv", "sbtv"]) & (leaks.id_type !="Channel name")]
# title_leaks = leaks[leaks.id_type !="Channel name"][['channel_id', 'channel_name', 'id_type', 'encoding', 'search', 'leak_type', 'req_domain']].drop_duplicates()
print(title_leaks.channel_id.nunique(), "channels")
title_leaks[['channel_id', 'channel_name', 'id_type', 'encoding', 'search', 'leak_type', 'req_domain']].drop_duplicates()

(14, 'channels')


Unnamed: 0,channel_id,channel_name,id_type,encoding,search,leak_type,req_domain
0,com.calkins.grahamksat,KSAT TV,imdb_title_KSAT-TV Livestream,urlencode,KSAT-TV Livestream,post_leaks,google-analytics.com
0,com.calkins.wral,WRAL,imdb_title_Severe storms batter central U.S.,urlencode,Severe storms batter central U.S.,url_leaks,scorecardresearch.com
0,com.calkins.wral,WRAL,imdb_title_Severe storms batter central U.S.,urlencode,Severe storms batter central U.S.,post_leaks,google-analytics.com
0,com.dmr.yuyu.tv,Yuyu - Movies &amp; TV,imdb_title_Mood Indigo,urlencode,Mood Indigo,url_leaks,spotxchange.com
0,com.doapps.firetv.mln.MLN_6ae07dcb33ec3b7c814d...,WTMJ TODAY&#39;s TMJ4 Milwaukee,"imdb_title_Partly cloudy, cool Saturday",urlencode,"Partly cloudy, cool Saturday",post_leaks,google-analytics.com
0,com.hillsongchannel.now,Hillsong Channel NOW,imdb_title_The Jesus Trek,unencoded,The Jesus Trek,post_leaks,litix.io
0,com.mylocaltv.kjrhfiretv,KJRH 2 Works For You Tulsa,imdb_title_In the Kitchen with Fireside Grill:...,urlencode,In the Kitchen with Fireside Grill: Caribbean ...,post_leaks,google-analytics.com
0,com.mylocaltv.wptvfiretv,WPTV NewsChannel 5 West Palm,imdb_title_To The Point,urlencode,To The Point,post_leaks,google-analytics.com
0,com.planetdiscover.granite.phone.wkbwfiretv,WKBW 7 Eyewitness News Buffalo,imdb_title_Graffiti Patio officially opens at ...,urlencode,Graffiti Patio officially opens at Tappo Pizza,post_leaks,google-analytics.com
0,com.zumobi.msnbc,NBC News,imdb_title_TODAY's Headlines,urlencode,TODAY's Headlines,url_leaks,omtrdc.net


In [26]:
from nb_utils import make_latex_table
print(make_latex_table(title_leaks[['channel_name', 'search', 'req_domain']].drop_duplicates(),
    caption="Title leaks in 100 random Fire TV channels",
    label="amazon_title_leaks"))




\begin{table}[H]
%\centering
%\resizebox{\columnwidth}{!}{%
\begin{tabular}{lll}
\toprule
 Channel name                    & Search                                                     & Req domain            \\
\midrule
 KSAT TV                         & KSAT-TV Livestream                                         & google-analytics.com  \\
 WRAL                            & Severe storms batter central U.S.                          & scorecardresearch.com \\
 WRAL                            & Severe storms batter central U.S.                          & google-analytics.com  \\
 Yuyu - Movies \&amp; TV          & Mood Indigo                                                & spotxchange.com       \\
 WTMJ TODAY\&\#39;s TMJ4 Milwaukee & Partly cloudy, cool Saturday                               & google-analytics.com  \\
 Hillsong Channel NOW            & The Jesus Trek                                             & litix.io              \\
 KJRH 2 Works For You Tulsa      & In the Kitchen 