In [335]:
from urlparse import urlparse
import pandas as pd
import csv
import json
import tldextract

In [336]:
def get_domain(url):
    """Return the hostname of ``url`` without a leading ``www.`` label.

    Parameters
    ----------
    url : str or null
        Absolute URL. Null values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    str or None
        ``""`` for a null ``url``; otherwise the hostname reported by
        ``urlparse`` with a leading ``"www."`` removed.  ``None`` is returned
        when ``urlparse`` finds no hostname (for example, when the URL has no
        ``scheme://`` prefix).
    """
    if pd.isnull(url):
        return ""

    hostname = urlparse(url).hostname
    # Strip "www." only when it is the leading label of the hostname.  A blanket
    # str.replace over the whole URL would also rewrite hosts such as
    # "awww.example.com" into "aexample.com".
    if hostname is not None and hostname.startswith("www."):
        hostname = hostname[len("www."):]
    return hostname

In [337]:
def process_url(url):
    """Normalise ``url`` into a scheme-less, ``www``-less key for site matching.

    Parameters
    ----------
    url : str or null
        URL to normalise. Null values (``None`` / ``NaN``) are accepted.

    Returns
    -------
    str
        ``""`` for a null ``url``; otherwise ``url`` with a leading
        ``https://`` or ``http://`` removed, then a leading ``www.`` removed,
        and finally any trailing ``/`` characters stripped.
    """
    if pd.isnull(url):
        return ""

    # Only strip the scheme and "www." when they are prefixes.  Replacing them
    # anywhere in the string would corrupt paths and query strings that happen
    # to contain "http://", "https://" or "www." (e.g. "?next=http://b.com").
    for scheme in ("https://", "http://"):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith("www."):
        url = url[len("www."):]
    return url.rstrip('/')

In [338]:
# Load the site list: a header-less CSV whose single column is named 'site'.
site_csv = pd.read_csv(
    '../final_sites.csv',
    header=None,
    names=['site'],
)

# Two views of the same column:
#   sites        -> normalised URLs (via process_url)
#   site_domains -> hostnames (via get_domain)
sites = site_csv.loc[:, 'site'].apply(process_url).tolist()
site_domains = site_csv.loc[:, 'site'].apply(get_domain).tolist()

In [339]:
# Load the resource table (header-less CSV with three columns).
resources = pd.read_csv(
    'input/3_col_resources.csv',
    header=None,
    names=['url', 'top_level_url', 'loading_origin'],
)
print(len(resources))

# Normalise the top-level URL and the loading origin so that they can be
# compared against the entries of `sites`.
resources['tlu_hn'] = resources['top_level_url'].apply(process_url)
resources['lo_hn'] = resources['loading_origin'].apply(process_url)

# Keep only the rows where at least one of the two normalised values is a
# known site; the other rows cannot be attributed to any site.
ident_resources = resources.loc[
    resources['lo_hn'].isin(sites) | resources['tlu_hn'].isin(sites)
]

# Attribute each remaining row to a site: take the loading origin when it is a
# known site, otherwise fall back to the top-level URL.
source = (
    ident_resources['lo_hn']
    .where(ident_resources['lo_hn'].isin(sites), ident_resources['tlu_hn'])
    .to_frame('source')
)
# Only 'url' and the new 'source' column are carried forward.
ident_resources = pd.concat([ident_resources[['url']], source], axis=1)
print(len(ident_resources))

# Keep the rows whose resource hostname is not the hostname of ANY listed site
# (the check is against the whole site list, not just the row's own source).
third_party_resources = ident_resources.loc[
    ~ident_resources['url'].apply(get_domain).isin(site_domains)
]
print(len(third_party_resources))

152395
142396
57521


In [349]:
# Persist the third-party resource table (without the index column) so the
# ad analysis can read it back.
third_party_resources.to_csv(
    'output/third_party_resources.csv',
    index=False,
)

In [341]:
# how many resources per site

In [342]:
# Count of non-null third-party resource URLs per source site, largest first.
print(
    third_party_resources
    .groupby('source')
    .count()
    .reset_index()
    .sort_values('url', ascending=False)
)

                                                source   url
80                                 blogs.princeton.edu  9494
763                              scholar.princeton.edu  2199
235                                          dsrny.com  1268
637                                      princeton.edu   922
603                            pillowlab.wordpress.com   894
632                 princeton.academia.edu/AnnaShields   713
64                                 bacdancecompany.com   512
633                princeton.academia.edu/MarinaRustow   497
51                                  arts.princeton.edu   450
141                      chikaokeke-agulu.blogspot.com   436
174                                   cs.princeton.edu   418
195                              dailyprincetonian.com   390
21                           agchallenge.princeton.edu   376
100                                      bobholman.com   370
634                    princeton.academia.edu/MattKarp   367
636                princ

In [343]:
# how many domains per site

In [344]:
def get_domain_for_org(url):
    """Return the ``domain.suffix`` form of ``url`` as split by tldextract.

    Parameters
    ----------
    url : str
        URL (or bare hostname) to pass to ``tldextract.extract``.

    Returns
    -------
    str
        ``"<domain>.<suffix>"`` when both parts are present.  When one part is
        empty (tldextract reports an empty suffix for IP addresses and for
        hosts without a recognised public suffix) only the non-empty part is
        returned, so the result never carries a stray leading or trailing dot.
    """
    extracted = tldextract.extract(url)
    # Join only the non-empty parts: "{}.{}".format(...) would turn an IP
    # address such as "127.0.0.1" (empty suffix) into "127.0.0.1.".
    parts = [part for part in (extracted.domain, extracted.suffix) if part]
    return ".".join(parts)

In [345]:
# Derive the 'domain' of every third-party resource URL as a one-column frame.
domains = third_party_resources['url'].apply(get_domain_for_org).to_frame('domain')

# Swap the raw 'url' column for its 'domain', keeping the 'source' attribution.
resource_domains = pd.concat(
    [third_party_resources.drop(columns=['url']), domains],
    axis=1,
)

# Number of distinct third-party domains seen for each source site.
print(resource_domains.groupby('source')['domain'].nunique())

source
aaa.princeton.edu                                        2
aamginput.princeton.edu                                  6
aas.princeton.edu                                        8
aasa.princeton.edu                                       7
abbyznewutility.org                                      3
aboutjamesrichardson.com                                 3
acapellago.princeton.edu                                 7
access.princeton.edu                                     2
accessibility.princeton.edu                              4
acee.princeton.edu                                       7
acsspu.princeton.edu                                     5
act.princeton.edu                                        3
adam-welch.com                                           3
adamsliwinski.blogspot.com                               5
adamwolf.princeton.edu                                   5
addictionintheory.princeton.edu                          4
adele.princeton.edu                              

In [346]:
# which organizations do these domains represent

In [347]:
# Read the owner records; each record is used below for its 'owner_name'
# and its list of 'domains'.
with open('webXray/domain_owners.json') as f:
    domain_owners = json.load(f)

# Flatten the records into a lookup table: domain -> owner name.  When a
# domain appears under several records, the last record wins.
domain_to_org = {}
for do in domain_owners:
    org = do['owner_name']
    domain_to_org.update((domain, org) for domain in do['domains'])

In [348]:
# Translate each resource domain into its owning organisation; domains that are
# missing from `domain_to_org` map to NaN.
domain_orgs = resource_domains['domain'].map(domain_to_org).to_frame('org')

# Swap the 'domain' column for its 'org', keeping the 'source' attribution.
resource_orgs = pd.concat(
    [resource_domains.drop(columns=['domain']), domain_orgs],
    axis=1,
)

# Number of distinct known organisations per source site (nunique skips NaN,
# so unmapped domains are not counted).
print(resource_orgs.groupby('source')['org'].nunique())

source
aaa.princeton.edu                                        2
aamginput.princeton.edu                                  4
aas.princeton.edu                                        6
aasa.princeton.edu                                       6
abbyznewutility.org                                      2
aboutjamesrichardson.com                                 2
acapellago.princeton.edu                                 6
access.princeton.edu                                     2
accessibility.princeton.edu                              4
acee.princeton.edu                                       5
acsspu.princeton.edu                                     4
act.princeton.edu                                        3
adam-welch.com                                           2
adamsliwinski.blogspot.com                               3
adamwolf.princeton.edu                                   4
addictionintheory.princeton.edu                          3
adele.princeton.edu                              