In [1]:
cd ..

/Users/stein/repos/privacy-code-tools


In [4]:
# confirm that this is a "Django Shell-Plus Kernel"
from aws_requests_auth.boto_utils import BotoAWSRequestsAuth
from coder.api import models
from io import BytesIO
import bs4
import csv
import codecs
import datetime
import json
import os
import requests
import urllib
import zipfile

def get_top_1m(date, timeout=60):
    """
    Download the Alexa top-1m ranking from the Wayback Machine snapshot addressed by `date`.

    Args:
        date: object formatted with "%Y%m%d%H%M%S" into the snapshot URL
            (e.g. a datetime.datetime).
        timeout: seconds handed to `requests.get` so a stalled download cannot hang the kernel.

    Returns:
        A list of (site, rank) tuples in file order, with rank converted to int.

    Raises:
        requests.HTTPError: if the archive answers with an error status.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    """
    url = f"https://web.archive.org/web/{date:%Y%m%d%H%M%S}/s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of surfacing them later as a confusing BadZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(BytesIO(response.content)) as archive:
        with archive.open("top-1m.csv", 'r') as f:
            rows = csv.reader(codecs.iterdecode(f, 'utf-8'))
            # Each row is "rank,site"; skip blank/short rows instead of raising IndexError.
            return [(row[1], int(row[0])) for row in rows if len(row) >= 2]

def _clean(entry):
    if len(entry) == 1 and 'value' in entry:
         return entry['value']
    return entry

def _expand_soup(s):
    name = s.name
    if name == "aws:country":
        name=f"aws:{s.attrs['code']}"
    if name == "aws:contributingsubdomains":
        children_expanded = [
            ss[1] for ss in 
            filter(
                lambda x:x, 
                map(_expand_soup, s.children)
            )
        ]
        return (name[4:], children_expanded)
    if hasattr(s, "children") and name.startswith("aws:"):
        children_expanded = {
            k:v for k,v in 
            filter(
                lambda x:x, 
                map(_expand_soup, s.children)
            )
        }
        return (name[4:], _clean(children_expanded))
    elif str(s) == "\n":
        pass
    else:
        return ("value", s)

def _process_response(resp):
    """
    Convert an AWIS HTTP response into nested plain containers.

    Args:
        resp: response object exposing `.content` (the raw body).

    Returns:
        The expanded contents of the body's <aws:alexa> element, as built by `_expand_soup`.

    Raises:
        ValueError: if the body contains no <aws:alexa> element, so an error reply is
            reported clearly instead of failing inside `_expand_soup`.
    """
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed.
    # Pin one once it is confirmed which parser the stored results were produced with.
    soup = bs4.BeautifulSoup(resp.content)
    root = soup.find("aws:alexa")
    if root is None:
        status = getattr(resp, "status_code", "unknown")
        raise ValueError(f"no <aws:alexa> element in response (HTTP status: {status})")
    return _expand_soup(root)[1]

```yaml
contentdata:
  adultcontent: 'no'
  dataurl: google.com
  language: { locale: en }
  owneddomains: {}
  sitedata:
    description: Enables users to search the worlds information, including webpa...
    onlinesince: 15-Sep-1997
    title: Google
  speed: { medianloadtime: '1198', percentile: '67' }

trafficdata:
  dataurl: google.com
  rank: '1'

  contributingsubdomains:
  - dataurl: mail.google.com
    pageviews: {percentage: 38.21%, peruser: '19.24'}
    reach: { percentage: 36.01% }
    timerange: { months: '1' }
  - dataurl: google.com
    pageviews: { percentage: 25.65%, peruser: '5.250'}
    reach: { percentage: 88.52% }
    timerange: { months: '1' }

  rankbycountry:
    IN:
      contribution: { pageviews: 16.1%, users: 10.6% }
      rank: '1'
    US:
      contribution: { pageviews: 27.7%, users: 19.2% }
      rank: '1'

  usagestatistics:
    usagestatistic:
      rank: {delta: '0', value: '1'}
      pageviews:
        permillion: {delta: "-24.46%", value: '219,700'}
        peruser: {delta: "-22.51%", value: '14.5'}
        rank: {delta: '0', value: '1'}
      reach:
        permillion: {delta: "-2.53%", value: '601,000'}
        rank: {delta: '0', value: '1'}
      timerange:
        days: '1'
```

In [None]:
# IN-MEMORY CACHES
_INFOS = {}
domain_set = set()

In [7]:
def get_site_info(url):
    """ Pull top site data from Alexa top site rankings. $0.036 / query

    Results are memoized in `_INFOS`, so a url already present there is returned
    without issuing a request.

    Args:
        url: the site to look up; sent as the `Url` query parameter.

    Returns:
        The parsed response as produced by `_process_response`.
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is loaded.
    from urllib.parse import quote

    if url in _INFOS:
        return _INFOS[url]
    response_group = "Rank,RankByCountry,UsageStats,AdultContent,Speed,Language,OwnedDomains,SiteData"
    auth = BotoAWSRequestsAuth(
        aws_host='awis.us-west-1.amazonaws.com',
        aws_region='us-west-1',
        aws_service='awis'
    )
    api_url = 'https://awis.us-west-1.amazonaws.com/api'
    # Encode the fixed part exactly as before ('=' and '&' kept as separators) ...
    query_prefix = quote(
        f'Action=UrlInfo&ResponseGroup={response_group}&Url=',
        safe='/-_.~=&'
    )
    # ... but encode the url on its own, so '&' or '=' inside it cannot add or alter parameters.
    query_params = query_prefix + quote(url, safe='/-_.~')
    # Timeout keeps a stalled connection from hanging the kernel.
    response = requests.get(f"{api_url}?{query_params}", auth=auth, timeout=30)
    to_ret = _process_response(response)
    _INFOS[url] = to_ret
    return to_ret

In [14]:
def _get(d, path, default=None):
    cur = d
    for p in path.split("."):
        cur = cur.get(p, None)
        if cur == None:
            return default
    return cur

def info_to_domains(info):
    """Return the 'contentdata.owneddomains' entry of `info`, or an empty dict if it is absent."""
    owned = _get(info, 'contentdata.owneddomains', default={})
    return owned

def _parse_online_since(raw):
    """Parse an 'onlinesince' value such as '15-Sep-1997'; return `raw` unchanged if no format fits."""
    # Four-digit year first, then two-digit year; stop at the first format that matches.
    for fmt in ("%d-%b-%Y", "%d-%b-%y"):
        try:
            return datetime.datetime.strptime(raw, fmt)
        except (TypeError, ValueError):
            continue
    return raw


def info_to_policy(info):
    """
    Build an unsaved `models.Policy` from a parsed site-info dict.

    Args:
        info: nested dict as returned by `get_site_info`; stored whole in `meta`.

    Returns:
        A `models.Policy` instance (not saved). `start_date` is a datetime when
        'contentdata.sitedata.onlinesince' matches a known format, the raw value when
        it does not, and None/empty when the field is absent.
    """
    online_since = _get(info, 'contentdata.sitedata.onlinesince')
    if online_since:
        online_since = _parse_online_since(online_since)
    return models.Policy(
        company_name= _get(info, 'contentdata.sitedata.title'),
        site_name = _get(info, 'contentdata.dataurl'),
        alexa_rank = _get(info, 'trafficdata.rank'),
        alexa_rank_US = _get(info, 'trafficdata.rankbycountry.US.rank'),
        locale=_get(info, "contentdata.language.locale"),
        start_date = online_since,
        meta = info,
    )

In [21]:
def load_top_N(n, dt=datetime.datetime(2021,4,1)):
    """
    Load the top N sites by alexa ranking on date {dt}. By default uses April 2021.
    Already loaded sites will not be overwritten.
    New sites will incur a fee to our AWS account, so be very sure before using.

    Args:
        n: how many entries from the head of the ranking to consider.
        dt: date of the ranking snapshot passed to `get_top_1m`.

    Raises:
        ValueError: if `n` is None or negative.
    """
    # top_1m[:None] is the whole list and top_1m[:-1] is all but one entry; either
    # would trigger a paid lookup for nearly every ranked site.
    if n is None or n < 0:
        raise ValueError(f"n must be a non-negative integer, got {n!r}")
    top_1m = get_top_1m(dt)
    policies = models.Policy.objects.all()
    # Seed both caches from stored policies so known sites are neither re-queried nor re-saved.
    domain_set.update(p.site_name for p in policies)
    _INFOS.update({p.site_name: p.meta for p in policies})
    for site, _rank in top_1m[:n]:
        if site in domain_set:
            print(f'skiping {site}')
            continue
        print(site)
        info = get_site_info(site)
        # Whatever info_to_domains yields is also skipped on later iterations.
        domain_set.update(info_to_domains(info))
        new_policy = info_to_policy(info)
        new_policy.save()