In [24]:
from itertools import islice
import json

from warcio.archiveiterator import ArchiveIterator
import smart_open 
from tqdm import tqdm_notebook
import pandas as pd

from html_to_etree import parse_html_bytes
from extract_social_media import find_links_tree


def process(url):
    """
    extract html responses from commoncrawl and extract social media links
    """
    warc_input = smart_open.smart_open(url) 

    records = ArchiveIterator(warc_input, arc2warc=True)
    for record in records: 
        if record.rec_type == 'response' and 'text/html' in record.http_headers.get_header('Content-Type'):
            uri = record.rec_headers.get_header('WARC-Target-URI')
            content_type = record.http_headers.get_header('Content-Type')
            body = record.content_stream().read()

            tree = parse_html_bytes(body, content_type)
            links = list(set(find_links_tree(tree)))
            yield {
                'uri': uri,
                'social_media': links
            }

In [25]:
SAMPLE_URL = 's3://commoncrawl/crawl-data/CC-NEWS/2018/04/CC-NEWS-20180405091124-00174.warc.gz'
recs = process(SAMPLE_URL)
recs = list(islice(tqdm_notebook(recs), 100))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [27]:
print(json.dumps(recs, indent=4))

[
    {
        "uri": "https://www.tvn24.pl/wiadomosci-ze-swiata,2/samochod-stanal-w-ogniu-w-aurorze-w-illinois,827264.html",
        "social_media": [
            "https://www.facebook.com/tvn24pl ",
            "https://plus.google.com/109311928196765369861?prsrc=3",
            "http://www.pinterest.com/tvn24pl/",
            "http://instagram.com/tvn24.pl?ref=badge",
            "https://twitter.com/tvn24"
        ]
    },
    {
        "uri": "https://au.news.yahoo.com/a/39713842/seven-cities-keen-on-2026-winter-games/",
        "social_media": [
            "http://www.pinterest.com/pin/create/button/?url=https%3A%2F%2Fau.sports.yahoo.com%2Fa%2F39713842%2Fseven-cities-keen-on-2026-winter-games%2F%3Fcmp%3Dst&description=The%20IOC%20says%20seven%20cities%2C%20or%20joint-bidding%20cities%2C%20have%20expressed%20interest%20in%20hosting%20the%202026%20Winter%20Olympic%20Games.&data-pin-do=%22buttonPin%22&data-pin-config=%22above%22&media=https%3A%2F%2Fs.yimg.com%2Fyt%2Fy7-assets%2F0.