## CommonCrawl

CommonCrawl is a publicly accessible web crawl.

It comprises ca. 250TB and 3 billion web pages, hosted on AWS S3 as a public dataset.

In [45]:
# Sample inputs containing Google Maps links: bare URLs, an <a> tag and a
# truncated <iframe> snippet. Every entry holds exactly one URL.
test_data = [
    'http://google.ch/maps/place',
    'https://maps.google.se/maps',
    'https://www.google.com.sg/maps/place/The+Herencia/@1.2924232,103.8388365,17z/data=!4m5!1m2!2m1!1s46+Kim+Yam+Road,+%2305-03The+Herencia,+Singapore+239351!3m1!1s0x31da1974c7afb233:0x67979b46ae734638',
    '<a href="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d3324.0129670661863!2d-117.7320547847609!3d33.579012580737334!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x80dce8acf2f4fa83%3A0xe88f66f162179e44!2s120+Vantis+Dr+%23300%2C+Aliso+Viejo%2C+CA+92656!5e0!3m2!1sen!2sus!4v1515004788726">other stuff</a>',
    # The trailing comma on the next entry matters: without it Python silently
    # concatenates this string with the following one into a single item.
    'http://maps.google.com/maps?f=q&source=s_q&hl=en&geocode=&q=3250+wilshire+blvd&sll=37.0625,-95.677068&sspn=76.898137,185.449219&ie=UTF8&hq=&hnear=3250+Wilshire+Blvd,+Los+Angeles,+California+90010&ll=34.061281,-118.293464&spn=0.010168,0.022638&z=16&iwloc=A',
    'https://www.google.de/maps/place/Stra%C3%9Fenbahnring+3/@53.5812503,9.9773535,17z/data=!3m1!4b1!4m2!3m1!1s0x47b18f4ad5b519b9:0x2d1c51827ac5ecfb',
    'https://www.google.ae/maps/place/Acorn+Strategy,+Marketing+and+Communications/@24.4199609,54.4395062,17z/data=!3m1!4b1!4m5!3m4!1s0x3e5e4273b83aaaab:0xfc6b9bfec5e761e6!8m2!3d24.419956!4d54.4416949?hl=en',
    'https://www.google.com/maps/embed?pb=!1m14!1m8!1m3!1d2370.03083612117!2d9.989029000000002!3d53.557216999999994!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x47b18f1792f86061%3A0x581b1e33bc4cb039!2sACG+audio+consulting+group+GmbH!5e0!3m2!1sde!2sde!4v1426080747695',
    'https://maps.google.com/maps?client=safari&oe=UTF-8&q=1307+W.+Morehead+St.+-+Suite+206Charlotte,+NC+28208&ie=UTF-8&hq=&hnear=0x8856a1d1ab1f8b3d:0xb3952553229945ef,1307+W+Morehead+St+%23206,+Charlotte,+NC+28208&gl=us&ei=aktKUqvoLKzD4APUvYDwAQ&ved=0CC0Q8gEwAA',
    '<iframe width="930" scrolling="no" height="350" frameborder="0" marginheight="0" marginwidth="0" src="https://maps.google.com/maps?f=q&amp;source=s_q&amp;hl=en&amp;geocode=&amp;q=4930+reed+road,+columbus,+ohio&amp;aq=&amp;sll=40.365277,-82.669252&amp;sspn=6.753944,9.558105&amp;ie=UTF8&amp;hq=&',
]

In [74]:
import re

# Matches http(s) URLs that point at Google Maps:
#   * [www.]google.<tld>/maps...
#   * maps.google.<tld>...
#   * maps.googleapis.<tld>...
# The match stops at the first space, '<', '>' or '"'.
# Raw strings are used so that `\.` reaches the regex engine unchanged instead
# of being an invalid string escape sequence (which warns on recent Python).
RE_MAPS = re.compile(
    r'(https?://'
    r'(?:(?:www\.)?google\.[^/]*/maps'
    r'|maps\.google\.[^/]*'
    r'|maps\.googleapis\.[^/]*)'
    r'[^ <>"]*)'
)


def find_google_maps(unibody):
    """Find Google Maps links in the page HTML.

    Args:
        unibody: the page HTML as a decoded text string.

    Returns:
        A list of the distinct URLs matched by ``RE_MAPS``, in order of first
        appearance. URLs containing ``sensor=true`` or ``geocode/json?`` are
        left out (presumably API calls rather than map links -- TODO confirm).
    """
    candidates = (
        link for link in RE_MAPS.findall(unibody)
        if 'sensor=true' not in link and 'geocode/json?' not in link
    )
    # dict.fromkeys() de-duplicates like set() but keeps first-seen order, so
    # the result is the same on every run instead of depending on string
    # hash randomisation.
    return list(dict.fromkeys(candidates))

In [75]:
from itertools import islice
import json

from warcio.archiveiterator import ArchiveIterator
import smart_open 
from tqdm import tqdm_notebook
import pandas as pd
from  w3lib.encoding import html_to_unicode

from html_to_etree import parse_html_bytes
from extract_social_media import find_links_tree


def process(url):
    """
    Stream HTML responses from a CommonCrawl archive and extract links.

    Args:
        url: location of the WARC/ARC file, opened with
            ``smart_open.smart_open`` (e.g. an ``s3://`` URL).

    Yields:
        One dict per ``text/html`` response record with the keys:
        ``'uri'`` (the record's ``WARC-Target-URI`` header),
        ``'social_media'`` (distinct non-empty results of ``find_links_tree``
        shorter than 100 characters) and
        ``'google_maps'`` (the result of ``find_google_maps``).
    """
    # The context manager closes the underlying stream once the generator is
    # exhausted, closed or garbage-collected; previously it was never closed.
    with smart_open.smart_open(url) as warc_input:
        for record in ArchiveIterator(warc_input, arc2warc=True):
            if record.rec_type != 'response':
                continue

            # The HTTP headers or the Content-Type header may be absent; fall
            # back to '' so the membership test below cannot hit None.
            http_headers = record.http_headers
            content_type = ''
            if http_headers is not None:
                content_type = http_headers.get_header('Content-Type') or ''
            if 'text/html' not in content_type:
                continue

            uri = record.rec_headers.get_header('WARC-Target-URI')
            body = record.content_stream().read()

            # html_to_unicode returns (encoding, text); only the text is used.
            unibody = html_to_unicode(content_type, body, default_encoding='utf8')[1]

            tree = parse_html_bytes(body, content_type)
            links = [x for x in set(find_links_tree(tree))
                     if x and len(x) < 100]
            yield {
                'uri': uri,
                'social_media': links,
                'google_maps': find_google_maps(unibody)
            }

In [76]:
# A single archive from the CC-NEWS crawl, used as the sample input.
SAMPLE_URL = 's3://commoncrawl/crawl-data/CC-NEWS/2018/04/CC-NEWS-20180405091124-00174.warc.gz'

# Materialise at most the first 3000 extracted records, with a progress bar.
recs = list(
    islice(
        tqdm_notebook(process(SAMPLE_URL)),
        3000,
    )
)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [77]:
# Keep only the pages on which at least one Google Maps link was found,
# then pretty-print them as JSON.
with_maps = list(filter(lambda rec: rec['google_maps'], recs))
print(json.dumps(with_maps, indent=4))

# https://github.com/aws-samples/pywren-workshops/blob/master/Lab-2-Common-Crawl/web_search.ipynb

[
    {
        "uri": "http://www.oregonwinepress.com/event-detail?eventTitle=the-brothers-reed-at-south-stage-cellars--1523430000--21952",
        "social_media": [
            "https://www.facebook.com/pages/Oregon-Wine-Press/56204313628",
            "https://www.instagram.com/oregonwinepress/",
            "https://twitter.com/oregonwinepress"
        ],
        "google_maps": [
            "http://maps.google.com/maps?f=q&#38;source=s_q&#38;hl=en&#38;geocode=&#38;iwloc=&#38;q=125+S+3rd+Street,+Jacksonville,+OR,+97530,+United+States,&#38;ie=UTF8&#38;hq=&#38;hnear=125+S+3rd+Street,+Jacksonville,+OR,+97530,+United+States,&#38;z=14&#38;output=embed",
            "http://maps.google.com/maps?q=125"
        ]
    },
    {
        "uri": "https://www.webnewswire.com/2018/04/05/rea-associates-announces-its-new-website/",
        "social_media": [],
        "google_maps": [
            "https://www.google.com/maps/place/Rea+%26+Associates/@41.67462,-81.3413647,17z/data=!3m1!4b1!4m5!3m4!1s