In [43]:
from urlparse import urlparse
import pandas as pd
import csv
import json
import tldextract

In [44]:
def get_domain(url):
    """Return the hostname of ``url`` without a leading ``www.`` label.

    Args:
        url: A URL string, or a null value (``None`` / ``NaN``).

    Returns:
        ``""`` when ``url`` is null. Otherwise the hostname reported by
        ``urlparse`` with a leading ``www.`` removed. ``urlparse`` only
        reports a hostname when the string carries a ``scheme://`` prefix;
        without one the result is ``None``.
    """
    if pd.isnull(url):
        return ""

    hostname = urlparse(url).hostname
    # Strip "www." only when it is the first label of the host. Removing the
    # substring from the raw URL also mangled hosts that merely contain it
    # (e.g. "newwww.example.com" became "newexample.com").
    if hostname and hostname.startswith("www."):
        hostname = hostname[len("www."):]
    return hostname

In [45]:
def process_url(url):
    """Normalise a URL into a scheme-less, ``www``-less key.

    Args:
        url: A URL string, or a null value (``None`` / ``NaN``).

    Returns:
        ``""`` when ``url`` is null. Otherwise ``url`` with a leading
        ``https://`` or ``http://`` removed, then a leading ``www.`` removed,
        then any trailing ``/`` characters stripped.
    """
    if pd.isnull(url):
        return ""

    stripped = url
    # Only a *leading* scheme is removed. Blanket str.replace() also deleted
    # schemes embedded later in the URL (e.g. inside a redirect parameter).
    for scheme in ("https://", "http://"):
        if stripped.startswith(scheme):
            stripped = stripped[len(scheme):]
            break
    # Likewise, "www." is dropped only when it is the first host label.
    if stripped.startswith("www."):
        stripped = stripped[len("www."):]
    return stripped.rstrip('/')

In [46]:
# Load the site list. The file is read without a header row and its column
# is labelled 'site'.
site_csv = pd.read_csv('../final_sites.csv', header=None, names=['site'])

# Two views of every listed site: the scheme-less key produced by
# process_url, and the bare hostname produced by get_domain.
sites = site_csv['site'].map(process_url).tolist()
site_domains = site_csv['site'].map(get_domain).tolist()

In [47]:
# Import the request table and label every request with the site whose visit
# produced it.
request_csv = pd.read_csv('input/requests.csv') # visit_id, url
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(len(request_csv))
visit_id_csv = pd.read_csv('../visit_id_to_site.csv') # visit_id, arguments

# pd.merge defaults to an inner join, so requests whose visit_id is missing
# from visit_id_csv are dropped. Printing the row count before and after the
# merge makes any such loss (or duplication) visible.
requests = pd.merge(request_csv, visit_id_csv, on='visit_id')
print(len(requests))

# visit_id is only needed for the join; 'arguments' holds the visited site and
# is renamed to 'source', then normalised with process_url.
requests = requests.drop(columns=['visit_id']).rename(columns={'arguments': 'source'})
requests['source'] = requests['source'].apply(process_url)

152395
152395


In [48]:
# limit to third-party resources, or resources not originating from the source site
# Note: a request is kept only when its hostname matches none of the hostnames
# in site_domains, i.e. the comparison is against the whole site list rather
# than just the row's own 'source'.
third_party_resources = requests[~requests['url'].apply(get_domain).isin(site_domains)]
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(len(third_party_resources))

63581


In [49]:
# Persist the third-party rows (without the DataFrame index) for the ad analysis.
third_party_resources.to_csv(
    path_or_buf='output/third_party_resources.csv',
    index=False,
)

In [50]:
# how many resources per site

In [51]:
# One row per source site; the 'url' column holds the number of non-null
# third-party request URLs for that site, largest first.
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(
    third_party_resources
    .groupby(by='source')
    .count()
    .reset_index()
    .sort_values(by='url', ascending=False)
)

                                                 source   url
176                             blogs.princeton.edu/sml  1386
173                             blogs.princeton.edu/sas  1327
374                                           dsrny.com  1268
768                             pillowlab.wordpress.com   897
140                       blogs.princeton.edu/notabilia   783
119                       blogs.princeton.edu/imabandit   777
803                  princeton.academia.edu/AnnaShields   717
101                            blogs.princeton.edu/csml   669
68                                  bacdancecompany.com   518
804                 princeton.academia.edu/MarinaRustow   500
52                                   arts.princeton.edu   453
259                       chikaokeke-agulu.blogspot.com   436
331                               dailyprincetonian.com   390
21                            agchallenge.princeton.edu   376
202                                       bobholman.com   373
805     

In [52]:
# how many domains per site

In [53]:
def get_domain_for_org(url):
    """Return the registrable domain (``domain.suffix``) of ``url``.

    Args:
        url: A URL string, or a null value (``None`` / ``NaN``).

    Returns:
        ``""`` when ``url`` is null. Otherwise the ``domain`` and ``suffix``
        parts reported by ``tldextract.extract`` joined with a dot. Empty
        parts are skipped, so a host for which tldextract reports no suffix
        (e.g. an IP address or ``localhost``) comes back as the bare domain
        rather than with a trailing dot.
    """
    # Null guard, consistent with get_domain / process_url.
    if pd.isnull(url):
        return ""

    extracted = tldextract.extract(url)
    # "{}.{}".format(domain, "") would yield "domain." which can never match
    # a key in a domain -> owner lookup; join only the non-empty parts.
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

In [54]:
# Derive the registrable domain of every third-party request URL.
domains = pd.DataFrame({'domain': third_party_resources['url'].apply(get_domain_for_org)})
# Attach the 'domain' column (aligned on the shared index) and drop the raw
# URL, leaving one (source, domain) row per request.
resource_domains = pd.concat([third_party_resources, domains], axis=1).drop(columns=['url'])
# Number of distinct third-party domains contacted by each source site.
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(resource_domains.groupby(by='source').domain.nunique())

source
aaa.princeton.edu                              2
aamginput.princeton.edu                        6
aas.princeton.edu                              8
aasa.princeton.edu                             7
abbyznewutility.org                            3
aboutjamesrichardson.com                       3
acapellago.princeton.edu                       7
access.princeton.edu                           2
accessibility.princeton.edu                    4
acee.princeton.edu                             7
acsspu.princeton.edu                           5
act.princeton.edu                              3
adam-welch.com                                 3
adamsliwinski.blogspot.com                     5
adamwolf.princeton.edu                         5
addictionintheory.princeton.edu                4
adele.princeton.edu                            8
admission.princeton.edu                        2
adthis.princeton.edu                           9
adversarial-learning.princeton.edu             2
aftercensorsh

In [55]:
# which organizations do these domains represent

In [56]:
# Read the domain-owner records from the webXray JSON file.
with open('webXray/domain_owners.json') as owners_file:
    domain_owners = json.load(owners_file)

# Flatten the records into a domain -> owner_name lookup. Records are visited
# in file order, so if a domain is listed under several owners the last one
# wins.
domain_to_org = {
    owned_domain: owner['owner_name']
    for owner in domain_owners
    for owned_domain in owner['domains']
}

In [57]:
# Translate every request's domain into its owning organisation. Domains that
# are missing from domain_to_org map to NaN.
domain_orgs = pd.DataFrame({'org': resource_domains['domain'].map(domain_to_org)})
# Attach the 'org' column (aligned on the shared index) and drop the domain,
# leaving one (source, org) row per request.
resource_orgs = pd.concat([resource_domains, domain_orgs], axis=1).drop(columns=['domain'])
# Number of distinct known organisations per source site. nunique() ignores
# NaN, so domains without a known owner are not counted here.
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(resource_orgs.groupby(by='source').org.nunique())

source
aaa.princeton.edu                              2
aamginput.princeton.edu                        4
aas.princeton.edu                              6
aasa.princeton.edu                             6
abbyznewutility.org                            2
aboutjamesrichardson.com                       2
acapellago.princeton.edu                       6
access.princeton.edu                           2
accessibility.princeton.edu                    4
acee.princeton.edu                             5
acsspu.princeton.edu                           4
act.princeton.edu                              3
adam-welch.com                                 2
adamsliwinski.blogspot.com                     3
adamwolf.princeton.edu                         4
addictionintheory.princeton.edu                3
adele.princeton.edu                            6
admission.princeton.edu                        2
adthis.princeton.edu                           7
adversarial-learning.princeton.edu             2
aftercensorsh