In [61]:
from adblockparser import AdblockRules
import pandas as pd
import tldextract
import json

In [18]:
# create adblockrules from easylist
with open('input/easylist.txt') as f:
    raw_rules = f.readlines()
    
rules = AdblockRules(raw_rules)

In [19]:
# load resources
resources_and_sources = pd.read_csv('output/third_party_resources.csv')
print len(resources_and_sources)

unique_resources = resources_and_sources['url'].unique().tolist()
print len(unique_resources)

57521
19657


In [20]:
# this is extremely slow
resource_blocked = dict()

for resource in unique_resources:
    resource_blocked[resource] = rules.should_block(resource)

In [40]:
resource_df = pd.DataFrame.from_dict(resource_blocked, orient='index', columns=['blocked']).reset_index()
resource_df = resource_df.rename(columns={'index':'resource'})
ad_df = resource_df[resource_df['blocked']]
print ad_df

                                                resource  blocked
3      http://img.secureserver.net/t/1/tl/event?cts=1...     True
14     http://image6.pubmatic.com/AdServer/PugMaster?...     True
46     https://securepubads.g.doubleclick.net/gampad/...     True
74     https://cdn.ampproject.org/rtv/011811091519050...     True
109    https://simage2.pubmatic.com/AdServer/Pug?vcod...     True
138    https://simage2.pubmatic.com/AdServer/Pug?vcod...     True
160    https://magnetic.t.domdex.com/sync/pubmatic?ne...     True
178    https://googleads.g.doubleclick.net/pagead/ads...     True
249    http://ib.adnxs.com/jpt?callback=pbjs.handleAn...     True
294    http://gads.pubmatic.com/AdServer/AdCallAggreg...     True
359    https://pagead2.googlesyndication.com/pcs/acti...     True
443    http://pagead2.googlesyndication.com/pagead/ge...     True
464    https://adserver-us.adtech.advertising.com/pub...     True
598    https://pagead2.googlesyndication.com/pagead/g...     True
635    htt

In [41]:
# write blocked resources to csv just in case
ad_df['resource'].to_csv('output/blocked_resources.csv', index=False)

In [65]:
# check same things as third_party_resource analysis

In [66]:
# how many ad resources per site

In [67]:
# restrict resource table to only those blocked as ads
print len(resources_and_sources)
ads_and_sources = resources_and_sources[resources_and_sources['url'].isin(ad_df['resource'])]
print len(ads_and_sources)

57521
1201


In [68]:
print ads_and_sources.groupby(by='source').count().reset_index().sort_values(by='url', ascending=False)

                                 source  url
46              pillowlab.wordpress.com  218
50   princeton.academia.edu/AnnaShields  148
51  princeton.academia.edu/MarinaRustow   99
52      princeton.academia.edu/MattKarp   61
3          blogs.princeton.edu/reelmudd   60
54  princeton.academia.edu/VeraCandiani   59
2           blogs.princeton.edu/italian   42
26                      francinekay.com   32
56                 princeton.edu/leslie   20
53    princeton.academia.edu/RobCWegman   20
43                    paw.princeton.edu   17
28                goprincetontigers.com   17
19    facilitiesinclusion.princeton.edu   16
17                dailyprincetonian.com   16
62                princetongleeclub.com   12
24                 fluids.princeton.edu   12
0             biodigester.princeton.edu   12
15              cs.princeton.edu/~kylej   12
14             cs.princeton.edu/~ehazan   12
11                  chime.princeton.edu   12
8     caribbeanliterature.princeton.edu   12
6         

In [76]:
# get resources for 2 sites with >100 ad resources

pillow_lab = ads_and_sources[ads_and_sources['source'] == "pillowlab.wordpress.com"]['url']
print len(pillow_lab)
print len(pillow_lab.unique())
pillow_lab.to_csv('output/pillow_lab.csv', index=False)

anna_shields = ads_and_sources[ads_and_sources['source'] == "princeton.academia.edu/AnnaShields"]['url']
print len(anna_shields)
print len(anna_shields.unique())
anna_shields.to_csv('output/anna_shields.csv', index=False)

218
127
148
114


In [77]:
# how many ad domains per site

In [78]:
def get_domain_for_org(url):
    extracted = tldextract.extract(url)
    return "{}.{}".format(extracted.domain, extracted.suffix)

In [79]:
domains = pd.DataFrame({'domain': ads_and_sources['url'].apply(get_domain_for_org)})
ad_domains = pd.concat([ads_and_sources, domains], axis=1).drop(columns=['url'])
print ad_domains.groupby(by='source').domain.nunique()

source
biodigester.princeton.edu                     2
blogs.princeton.edu/caps                      2
blogs.princeton.edu/italian                   2
blogs.princeton.edu/reelmudd                  2
blueflowerarts.com/artist/alicia-ostriker     2
blueflowerarts.com/artist/mark-doty           2
bobholman.com                                 2
butlercollege.princeton.edu                   2
caribbeanliterature.princeton.edu             2
chignell.net                                  2
chikaokeke-agulu.blogspot.com                 1
chime.princeton.edu                           2
claudiarankine.com                            1
cs.princeton.edu                              1
cs.princeton.edu/~ehazan                      2
cs.princeton.edu/~kylej                       2
csctf.princeton.edu                           2
dailyprincetonian.com                         2
deanalawson.com                               2
facilitiesinclusion.princeton.edu             2
facilitiesinsider.princeton.edu  

In [80]:
# which organizations do these domains represent

In [81]:
with open('webXray/domain_owners.json') as f:
    domain_owners = json.load(f)
    
domain_to_org = dict()
for do in domain_owners:
    org = do['owner_name']
    
    for domain in do['domains']:
        domain_to_org[domain] = org

In [82]:
domain_orgs = pd.DataFrame({'org': ad_domains['domain'].map(domain_to_org)})
ad_orgs = pd.concat([ad_domains, domain_orgs], axis=1).drop(columns=['domain'])
print ad_orgs.groupby(by='source').org.nunique()

source
biodigester.princeton.edu                     2
blogs.princeton.edu/caps                      2
blogs.princeton.edu/italian                   2
blogs.princeton.edu/reelmudd                  2
blueflowerarts.com/artist/alicia-ostriker     2
blueflowerarts.com/artist/mark-doty           2
bobholman.com                                 2
butlercollege.princeton.edu                   2
caribbeanliterature.princeton.edu             2
chignell.net                                  2
chikaokeke-agulu.blogspot.com                 1
chime.princeton.edu                           2
claudiarankine.com                            0
cs.princeton.edu                              1
cs.princeton.edu/~ehazan                      2
cs.princeton.edu/~kylej                       2
csctf.princeton.edu                           2
dailyprincetonian.com                         2
deanalawson.com                               2
facilitiesinclusion.princeton.edu             2
facilitiesinsider.princeton.edu  

In [None]:
# check frequency of each ad resource

In [86]:
print ads_and_sources.groupby('url').count().sort_values(by='source', ascending=False)

                                                    source
url                                                       
https://www.youtube.com/ad_data_204                    243
https://static.doubleclick.net/instream/ad_stat...     243
https://pagead2.googlesyndication.com/pagead/js...      17
https://pagead2.googlesyndication.com/pagead/js...      16
https://pagead2.googlesyndication.com/pagead/sh...      16
https://pagead2.googlesyndication.com/pub-confi...      15
https://tpc.googlesyndication.com/pagead/js/r20...      13
https://tpc.googlesyndication.com/pagead/js/r20...      13
https://pagead2.googlesyndication.com/pagead/js...      13
https://securepubads.g.doubleclick.net/gpt/puba...      12
http://tpc.googlesyndication.com/sodar/6uQTKQJz...      11
http://tpc.googlesyndication.com/sodar/V6zvOIoD.js      11
http://pagead2.googlesyndication.com/bg/BvTIEte...      11
http://ads.pubmatic.com/AdServer/js/showad.js#P...      11
http://tpc.googlesyndication.com/safeframe/1-0-...      