In [1]:
from urlparse import urlparse
import pandas as pd
import csv
import json
import matplotlib
import tldextract

In [2]:
def get_domain(url):
    """Return the hostname of ``url`` without a leading ``www.`` label.

    Args:
        url: A URL string, or a missing value (``None`` / ``NaN``).

    Returns:
        ``""`` when ``url`` is missing; otherwise the lower-cased hostname
        reported by ``urlparse`` with a leading ``"www."`` removed. The result
        is ``None`` when ``urlparse`` finds no hostname, e.g. for a URL that
        has no scheme.
    """
    if pd.isnull(url):
        return ""

    host = urlparse(url).hostname
    # Strip "www." only as a hostname prefix. Removing the substring anywhere
    # in the raw URL would corrupt hosts such as "awww.example.com".
    if host is not None and host.startswith("www."):
        host = host[len("www."):]
    return host

In [3]:
def process_url(url):
    """Normalize a site URL to a bare ``host/path`` key.

    Removes a leading ``https://`` or ``http://`` scheme, then a leading
    ``www.``, then any trailing slashes.

    Args:
        url: A URL string, or a missing value (``None`` / ``NaN``).

    Returns:
        ``""`` when ``url`` is missing, otherwise the normalized string.
    """
    if pd.isnull(url):
        return ""

    # Only strip these tokens as prefixes: a blanket replace would also eat
    # occurrences inside the path, the query string or another host label.
    for scheme in ("https://", "http://"):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith("www."):
        url = url[len("www."):]
    return url.rstrip('/')

In [4]:
# Load the list of studied sites (single unnamed column, no header row) and
# derive two lookups from it: normalized site keys and bare hostnames.
site_csv = pd.read_csv('../final_sites.csv', header=None, names=['site'])
sites = [process_url(site) for site in site_csv['site']]
site_domains = [get_domain(site) for site in site_csv['site']]

In [5]:
# Load the request table and the visit -> site lookup.
request_csv = pd.read_csv('input/requests.csv')  # columns: visit_id, url
print(len(request_csv))
visit_id_csv = pd.read_csv('../visit_id_to_site.csv')  # columns: visit_id, arguments

# Attach the visited site to every request. The row count is printed before
# and after the join so that dropped or duplicated rows are easy to spot.
requests = request_csv.merge(visit_id_csv, on='visit_id')
print(len(requests))

# Keep only (url, source), where source is the normalized visited-site URL.
requests = requests.drop('visit_id', axis=1).rename(columns={'arguments': 'source'})
requests['source'] = requests['source'].map(process_url)

152395
152395


In [6]:
# Limit to third-party resources: requests whose hostname is not one of the
# studied sites' hostnames.
# NOTE(review): the hostname is checked against the domains of ALL listed
# sites, not only the row's own source site, and the match is on the exact
# hostname -- confirm this is the intended definition of "third-party".
request_hosts = requests['url'].map(get_domain)
is_listed_site = request_hosts.isin(site_domains)
third_party_resources = requests[~is_listed_site]
print(len(third_party_resources))

63581


In [7]:
# Persist the third-party requests (without the index column) for the
# separate ad analysis.
third_party_resources.to_csv(
    path_or_buf='output/third_party_resources.csv',
    index=False,
)

In [8]:
# how many resources per site

In [9]:
# Number of third-party requests per source site, busiest sites first.
print(
    third_party_resources
    .groupby('source')
    .count()
    .reset_index()
    .sort_values('url', ascending=False)
)

                                                 source   url
176                             blogs.princeton.edu/sml  1386
173                             blogs.princeton.edu/sas  1327
374                                           dsrny.com  1268
768                             pillowlab.wordpress.com   897
140                       blogs.princeton.edu/notabilia   783
119                       blogs.princeton.edu/imabandit   777
803                  princeton.academia.edu/AnnaShields   717
101                            blogs.princeton.edu/csml   669
68                                  bacdancecompany.com   518
804                 princeton.academia.edu/MarinaRustow   500
52                                   arts.princeton.edu   453
259                       chikaokeke-agulu.blogspot.com   436
331                               dailyprincetonian.com   390
21                            agchallenge.princeton.edu   376
202                                       bobholman.com   373
805     

In [10]:
# which domains do these resources come from

In [11]:
def get_domain_for_org(url):
    """Return the ``domain.suffix`` part of ``url`` as split by tldextract.

    Args:
        url: A URL string, or a missing value (``None`` / ``NaN``).

    Returns:
        ``""`` when ``url`` is missing. Otherwise the extracted domain and
        suffix joined with a dot; when one of the two parts is empty (for
        example the suffix of an IP address) only the non-empty part is
        returned.
    """
    if pd.isnull(url):
        return ""

    extracted = tldextract.extract(url)
    # Join only the non-empty parts so a host without a public suffix does not
    # come back with a dangling trailing dot ("127.0.0.1." / "localhost.").
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

In [12]:
# Registrable domain of every third-party request, as a one-column frame that
# shares the index of third_party_resources.
domains = third_party_resources['url'].map(get_domain_for_org).to_frame(name='domain')

# Swap the raw url column for its domain, then count requests per domain.
resource_domains = pd.concat([third_party_resources, domains], axis=1).drop('url', axis=1)
print(
    resource_domains
    .groupby('domain')
    .count()
    .sort_values('source', ascending=False)
)

                       source
domain                       
wp.com                  11931
google-analytics.com     6563
gstatic.com              5257
googleapis.com           3799
princeton.edu            3361
parastorage.com          2282
gravatar.com             2197
typekit.net              2086
squarespace.com          2005
facebook.com             1817
youtube.com              1739
sanity.io                1256
google.com               1125
nr-data.net              1021
newrelic.com             1019
doubleclick.net           945
twimg.com                 901
twitter.com               855
cloudfront.net            769
cloudflare.com            686
vimeocdn.com              617
editmysite.com            591
wix.com                   579
ytimg.com                 316
netdna-ssl.com            290
bootstrapcdn.com          276
pubwise.io                268
googlesyndication.com     258
archive-it.org            258
wixstatic.com             245
...                       ...
eyeviewads

In [13]:
# which organizations do these domains represent

In [14]:
# Load the webXray owner list; each entry is read for its 'owner_name' and
# 'domains' keys.
with open('webXray/domain_owners.json') as owners_file:
    domain_owners = json.load(owners_file)

# Flatten it into a domain -> owner name lookup. Entries are applied in file
# order, so a domain listed under several owners keeps the last one.
domain_to_org = {}
for owner_entry in domain_owners:
    owner_name = owner_entry['owner_name']
    domain_to_org.update(dict.fromkeys(owner_entry['domains'], owner_name))

In [15]:
# Owning organization of every request's domain, looked up in domain_to_org.
# Domains that are not in the mapping end up without an organization.
domain_orgs = resource_domains['domain'].map(domain_to_org).to_frame(name='org')

# Swap the domain column for its organization, then count requests per org.
resource_orgs = pd.concat([resource_domains, domain_orgs], axis=1).drop('domain', axis=1)
print(
    resource_orgs
    .groupby('org')
    .count()
    .sort_values('source', ascending=False)
)

                                      source
org                                         
Wordpress                              12086
Google Analytics                        6563
Google                                  6510
Google APIs                             3799
Gravatar                                2197
Typekit                                 2179
Facebook                                2177
YouTube                                 2055
New Relic                               2040
Twitter                                 1756
Amazon Web Services                      967
DoubleClick                              965
Vimeo                                    771
Cloudflare                               686
Blogger                                  298
StackPath                                276
AdSense                                  266
Google Tag Manager                       188
Pinterest                                180
Share This                               163
AddThis   