In [1]:
from adblockparser import AdblockRules
import pandas as pd
import tldextract
import json

In [2]:
# Build the EasyList matcher: each line of the filter list file is passed to
# AdblockRules as one raw rule string.
with open('input/easylist.txt') as easylist_file:
    raw_rules = list(easylist_file)

rules = AdblockRules(raw_rules)

In [3]:
# Read the third-party resource table and report how many rows it has
resources_and_sources = pd.read_csv('output/third_party_resources.csv')
print(len(resources_and_sources))

# Distinct resource URLs, in order of first appearance, and how many there are
unique_resources = resources_and_sources['url'].drop_duplicates().tolist()
print(len(unique_resources))

63581
21682


In [4]:
# this is extremely slow: should_block is evaluated once per unique URL
# Maps each resource URL to the verdict returned by the EasyList rules.
resource_blocked = {
    resource: rules.should_block(resource)
    for resource in unique_resources
}

In [5]:
# Turn the {url: verdict} mapping into a frame with columns 'resource' and
# 'blocked' (the URLs start out as the index and are moved into a column).
resource_df = (
    pd.DataFrame.from_dict(resource_blocked, orient='index', columns=['blocked'])
    .reset_index()
    .rename(columns={'index': 'resource'})
)
# Keep only the rows whose 'blocked' value is set
ad_df = resource_df.loc[resource_df['blocked']]
print(ad_df)

                                                resource  blocked
4      http://img.secureserver.net/t/1/tl/event?cts=1...     True
15     http://image6.pubmatic.com/AdServer/PugMaster?...     True
50     https://securepubads.g.doubleclick.net/gampad/...     True
81     https://cdn.ampproject.org/rtv/011811091519050...     True
120    https://simage2.pubmatic.com/AdServer/Pug?vcod...     True
153    https://simage2.pubmatic.com/AdServer/Pug?vcod...     True
175    https://magnetic.t.domdex.com/sync/pubmatic?ne...     True
194    https://googleads.g.doubleclick.net/pagead/ads...     True
207    https://securepubads.g.doubleclick.net/gampad/...     True
275    http://ib.adnxs.com/jpt?callback=pbjs.handleAn...     True
325    http://gads.pubmatic.com/AdServer/AdCallAggreg...     True
396    https://pagead2.googlesyndication.com/pcs/acti...     True
487    http://pagead2.googlesyndication.com/pagead/ge...     True
508    https://adserver-us.adtech.advertising.com/pub...     True
695    htt

In [6]:
# write blocked resources to csv just in case
# header is passed explicitly because the default of Series.to_csv's `header`
# argument is not the same in every pandas release; spelling it out keeps the
# file a plain one-URL-per-line list regardless of the installed version.
ad_df['resource'].to_csv('output/blocked_resources.csv', index=False, header=False)

In [7]:
# check same things as third_party_resource analysis

In [8]:
# how many ad resources per site

In [9]:
# restrict resource table to only those blocked as ads
# (row counts are printed before and after the restriction)
print(len(resources_and_sources))
ads_and_sources = resources_and_sources.loc[
    resources_and_sources['url'].isin(ad_df['resource'])
]
print(len(ads_and_sources))

63581
1243


In [10]:
# Number of ad-resource rows recorded for each source, largest count first
print(
    ads_and_sources
    .groupby('source')
    .count()
    .reset_index()
    .sort_values('url', ascending=False)
)

                                 source  url
48              pillowlab.wordpress.com  218
53   princeton.academia.edu/AnnaShields  148
54  princeton.academia.edu/MarinaRustow   99
55      princeton.academia.edu/MattKarp   61
4          blogs.princeton.edu/reelmudd   60
57  princeton.academia.edu/VeraCandiani   59
3           blogs.princeton.edu/italian   42
28                      francinekay.com   32
2               blogs.princeton.edu/eqn   24
59                 princeton.edu/leslie   20
56    princeton.academia.edu/RobCWegman   20
45                    paw.princeton.edu   17
30                goprincetontigers.com   17
21    facilitiesinclusion.princeton.edu   16
51        poets.org/poet.php/prmPID/248   16
19                dailyprincetonian.com   16
0             biodigester.princeton.edu   12
66                princetongleeclub.com   12
12                  chime.princeton.edu   12
9     caribbeanliterature.princeton.edu   12
7                         bobholman.com   12
26        

In [11]:
# get resources for 2 sites with >100 ad resources

def _export_site_ads(ads, source, csv_path):
    """Return the ad URLs that `ads` lists for `source` and save them to `csv_path`.

    Prints the number of matching rows and then the number of distinct URLs
    among them. The URLs are written one per line, without index or header.
    """
    site_urls = ads.loc[ads['source'] == source, 'url']
    print(len(site_urls))
    print(len(site_urls.unique()))
    # header is explicit because the default of Series.to_csv's `header`
    # argument is not the same in every pandas release.
    site_urls.to_csv(csv_path, index=False, header=False)
    return site_urls


pillow_lab = _export_site_ads(
    ads_and_sources, "pillowlab.wordpress.com", 'output/pillow_lab.csv')

anna_shields = _export_site_ads(
    ads_and_sources, "princeton.academia.edu/AnnaShields", 'output/anna_shields.csv')

218
127
148
114


In [12]:
# how many ad domains per site

In [13]:
def get_domain_for_org(url):
    """Return the domain and public suffix of `url` joined as "domain.suffix".

    The subdomain reported by tldextract is discarded. If tldextract reports
    an empty suffix (or an empty domain), only the non-empty part is returned,
    so the result never carries a dangling or leading dot.
    """
    extracted = tldextract.extract(url)
    parts = (extracted.domain, extracted.suffix)
    return ".".join(part for part in parts if part)

In [14]:
# One-column frame ('domain') with the domain extracted from every ad URL;
# it carries the same index as ads_and_sources.
domains = ads_and_sources['url'].apply(get_domain_for_org).to_frame(name='domain')
# Attach the domain to each row, then discard the full URL
ad_domains = pd.concat([ads_and_sources, domains], axis=1).drop(columns=['url'])
# Number of distinct ad domains per source
print(ad_domains.groupby('source')['domain'].nunique())

source
biodigester.princeton.edu                     2
blogs.princeton.edu/caps                      2
blogs.princeton.edu/eqn                       2
blogs.princeton.edu/italian                   2
blogs.princeton.edu/reelmudd                  2
blueflowerarts.com/artist/alicia-ostriker     2
blueflowerarts.com/artist/mark-doty           2
bobholman.com                                 2
butlercollege.princeton.edu                   2
caribbeanliterature.princeton.edu             2
chignell.net                                  2
chikaokeke-agulu.blogspot.com                 1
chime.princeton.edu                           2
claudiarankine.com                            1
cs.princeton.edu                              1
cs.princeton.edu/~ehazan                      2
cs.princeton.edu/~kylej                       2
cs.princeton.edu/~sseung                      1
csctf.princeton.edu                           2
dailyprincetonian.com                         2
deanalawson.com                  

In [15]:
# which organizations do these domains represent

In [16]:
# Load the domain ownership list and flatten it into a
# {domain: owner_name} lookup table.
with open('webXray/domain_owners.json') as owners_file:
    domain_owners = json.load(owners_file)

domain_to_org = {}
for entry in domain_owners:
    owner = entry['owner_name']
    # every domain listed under this entry points at the same owner
    domain_to_org.update((domain, owner) for domain in entry['domains'])

In [17]:
# Look up the owner of every ad domain; domains that are not keys of
# domain_to_org come out as NaN.
domain_orgs = ad_domains['domain'].map(domain_to_org).to_frame(name='org')
# Attach the owner to each row, then discard the domain column
ad_orgs = pd.concat([ad_domains, domain_orgs], axis=1).drop(columns=['domain'])
# Distinct owners per source (nunique does not count NaN by default, so a
# source whose domains all lack a known owner shows 0)
print(ad_orgs.groupby('source')['org'].nunique())

source
biodigester.princeton.edu                     2
blogs.princeton.edu/caps                      2
blogs.princeton.edu/eqn                       2
blogs.princeton.edu/italian                   2
blogs.princeton.edu/reelmudd                  2
blueflowerarts.com/artist/alicia-ostriker     2
blueflowerarts.com/artist/mark-doty           2
bobholman.com                                 2
butlercollege.princeton.edu                   2
caribbeanliterature.princeton.edu             2
chignell.net                                  2
chikaokeke-agulu.blogspot.com                 1
chime.princeton.edu                           2
claudiarankine.com                            0
cs.princeton.edu                              1
cs.princeton.edu/~ehazan                      2
cs.princeton.edu/~kylej                       2
cs.princeton.edu/~sseung                      0
csctf.princeton.edu                           2
dailyprincetonian.com                         2
deanalawson.com                  

In [18]:
# check frequency of each ad resource

In [19]:
# Number of rows in which each ad URL appears, most frequent first
print(
    ads_and_sources
    .groupby('url')
    .count()
    .sort_values('source', ascending=False)
)

                                                    source
url                                                       
https://www.youtube.com/ad_data_204                    255
https://static.doubleclick.net/instream/ad_stat...     255
https://pagead2.googlesyndication.com/pagead/js...      17
https://pagead2.googlesyndication.com/pagead/js...      16
https://pagead2.googlesyndication.com/pagead/sh...      16
https://pagead2.googlesyndication.com/pub-confi...      15
https://tpc.googlesyndication.com/pagead/js/r20...      13
https://tpc.googlesyndication.com/pagead/js/r20...      13
https://pagead2.googlesyndication.com/pagead/js...      13
https://securepubads.g.doubleclick.net/gpt/puba...      13
http://tpc.googlesyndication.com/sodar/6uQTKQJz...      11
http://tpc.googlesyndication.com/sodar/V6zvOIoD.js      11
http://pagead2.googlesyndication.com/bg/BvTIEte...      11
http://ads.pubmatic.com/AdServer/js/showad.js#P...      11
https://securepubads.g.doubleclick.net/gpt/puba...      