In [1]:
# https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-10/segments/1707948223038.94/warc/CC-MAIN-20240305060427-20240305090427-00183.warc.gz
import asyncio
import httpx
import requests
import json
from urllib.parse import quote_plus
from tenacity import retry, stop_after_attempt, wait_fixed

# Please note: f-strings require Python 3.6+

# The URL of the Common Crawl Index server
CC_INDEX_SERVER = "http://index.commoncrawl.org/"

# The Common Crawl index you want to query
INDEX_NAME = "CC-MAIN-2023-40"  # Replace with the latest index name

# The URL you want to look up in the Common Crawl index
target_url = "commoncrawl.org/faq"  # Replace with your target URL


# Function to search the Common Crawl Index
def search_cc_index(url):
    encoded_url = quote_plus(url)
    index_url = f"{CC_INDEX_SERVER}{INDEX_NAME}-index?url={encoded_url}&output=json"
    response = requests.get(index_url)
    print("Response from CCI:", response.text)  # Output the response from the server
    if response.status_code == 200:
        records = response.text.strip().split("\n")
        return [json.loads(record) for record in records]
    else:
        return None


@retry(stop=stop_after_attempt(50), wait=wait_fixed(1))
async def download_from_cc(client, filename, offset, length):
    # async with httpx.AsyncClient() as client:
    # async with client:
    s3_url = f"https://data.commoncrawl.org/{filename}"
    response = await client.get(
        s3_url, headers={"Range": f"bytes={offset}-{offset+length-1}"}
    )
    if response.status_code < 300:
        # Process the response content if necessary
        # For example, you can use warcio to parse the WARC record
        return await response.aread()
    else:
        print(f"Failed to fetch data: {response.status_code}")
        return None


client = httpx.AsyncClient()  # async with httpx.AsyncClient() as client:

# Function to fetch the content from Common Crawl
async def fetch_page_from_cc(records):
    for record in records:
        offset, length = int(record["offset"]), int(record["length"])
        return await download_from_cc(client, record["filename"], offset, length)


# Search the index for the target URL
records = search_cc_index(target_url)
if records:
    print(f"Found {len(records)} records for {target_url}")

    # Fetch the page content from the first record
    response = await fetch_page_from_cc(records)
    if response:
        print(f"Successfully fetched content for {target_url}")
        # You can now process the 'content' variable as needed
else:
    print(f"No records found for {target_url}")

response

Response from CCI: {"urlkey": "org,commoncrawl)/faq", "timestamp": "20230922051346", "url": "https://commoncrawl.org/faq", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "JTYFLU7J2EQBLJ7A4GHLTLMYX7BXNKVM", "length": "7271", "offset": "211129121", "filename": "crawl-data/CC-MAIN-2023-40/segments/1695233506329.15/warc/CC-MAIN-20230922034112-20230922064112-00893.warc.gz", "languages": "eng", "encoding": "UTF-8"}
{"urlkey": "org,commoncrawl)/faq", "timestamp": "20230922070518", "url": "https://commoncrawl.org/faq", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "JTYFLU7J2EQBLJ7A4GHLTLMYX7BXNKVM", "length": "7269", "offset": "213873689", "filename": "crawl-data/CC-MAIN-2023-40/segments/1695233506339.10/warc/CC-MAIN-20230922070214-20230922100214-00893.warc.gz", "languages": "eng", "encoding": "UTF-8"}
{"urlkey": "org,commoncrawl)/faq", "timestamp": "20230922171307", "url": "https://commoncrawl.org/faq", "mime": "text/html", "mime-

b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xed=\xdbr\xdbF\x96\xcfQU\xfe\xa1\x83\xad\xcd\xd8\x135\t\x10\x00I0\x92\xb2\xb2|S\xc6N\x14[\x8e\x93\x99\x9ar5\x80\x06\t\x1b\x04\x18\\$\xd1\x19W\xcd\x0f\xec\x07\xec\xdb>m\xd5\xfe\xc6|\xca|\xc9\x9es\x1a \x01^dR\x925\xce:\x0fq\x04\xb0\xfb\xf4\xe9\xd3\xe7\xde}\x1a/\x0f\x9f\x1d\xb5\x8d\x96\xfe\xf9\xceK\xf8\x8b\x9fN\'r\xc0R\x99M\x928\x93\xe5\xcb\xfb"\x87\x97\x1d\xbdcr\xdd\xe1\x9d\xce\xa9n\x0f\x0cs`u\xff\\6x&\xbd$\xf5\xf9\xf1\xfd\x01\xdb+\xd2xP\x14\xa1?\xe8\xdbN\xd7\xf0\x8d.\x17N\xcf\xe7\x96\xef\xea\xdcq|\xc9\x85\xdd\x0b\xdc@\x08\xa3\xd3\x13\x07\x9f\xef\x1c%q.\xe3\x9c?\x91\xf10\x1f\r\x98\xd9\xe9\xdb\xfa\xfc\xb5BHL&Q\xe8\x89<L\xe2\xf6(\xcf\'_\xb3q6\xcc\xe1\xa7\xfd\x05T_\x8a\xd4\x0b\xe3 Y\xc0%\x10f\'p\xfb=n\xda^\xc0-\xc32\xb9c\xf7\xba\xbc\xaf\x07=\xcf7\x82~\xcf\xf4\x0eJ\x100\xb2W\xa4)\r\x9e\xd4\x81XB\xd7\xbb]\xdd\xe2\xb6\xe7\nn9}\x9f;\x86\x0f\x90:]#\xe8\xf4{\xbe\x90v\x05\xe4\xf8\x84\x1f\xfa> \x97\xc1\x8cZ\x1d\xd3l\x19\x9dn\xabcUd\x16\xe9P\xe6\xfc\xc5\xb3

In [2]:
from time import sleep


for i in range(10):
    try:
        response = await fetch_page_from_cc(records)
        print(response)
        break
    except Exception as e:
        print(e)
        print("Retrying in 1 second...")
    sleep(1)

b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x00\xed=\xdbr\xdbF\x96\xcfQU\xfe\xa1\x83\xad\xcd\xd8\x135\t\x10\x00I0\x92\xb2\xb2|S\xc6N\x14[\x8e\x93\x99\x9ar5\x80\x06\t\x1b\x04\x18\\$\xd1\x19W\xcd\x0f\xec\x07\xec\xdb>m\xd5\xfe\xc6|\xca|\xc9\x9es\x1a \x01^dR\x925\xce:\x0fq\x04\xb0\xfb\xf4\xe9\xd3\xe7\xde}\x1a/\x0f\x9f\x1d\xb5\x8d\x96\xfe\xf9\xceK\xf8\x8b\x9fN\'r\xc0R\x99M\x928\x93\xe5\xcb\xfb"\x87\x97\x1d\xbdcr\xdd\xe1\x9d\xce\xa9n\x0f\x0cs`u\xff\\6x&\xbd$\xf5\xf9\xf1\xfd\x01\xdb+\xd2xP\x14\xa1?\xe8\xdbN\xd7\xf0\x8d.\x17N\xcf\xe7\x96\xef\xea\xdcq|\xc9\x85\xdd\x0b\xdc@\x08\xa3\xd3\x13\x07\x9f\xef\x1c%q.\xe3\x9c?\x91\xf10\x1f\r\x98\xd9\xe9\xdb\xfa\xfc\xb5BHL&Q\xe8\x89<L\xe2\xf6(\xcf\'_\xb3q6\xcc\xe1\xa7\xfd\x05T_\x8a\xd4\x0b\xe3 Y\xc0%\x10f\'p\xfb=n\xda^\xc0-\xc32\xb9c\xf7\xba\xbc\xaf\x07=\xcf7\x82~\xcf\xf4\x0eJ\x100\xb2W\xa4)\r\x9e\xd4\x81XB\xd7\xbb]\xdd\xe2\xb6\xe7\nn9}\x9f;\x86\x0f\x90:]#\xe8\xf4{\xbe\x90v\x05\xe4\xf8\x84\x1f\xfa> \x97\xc1\x8cZ\x1d\xd3l\x19\x9dn\xabcUd\x16\xe9P\xe6\xfc\xc5\xb3

In [3]:
# https://github.com/webrecorder/warcio
from warcio.archiveiterator import ArchiveIterator
from io import BytesIO


my_stream = BytesIO(response)


def read_warc_file(buffer):
    with buffer:
        pages = []
        for record in ArchiveIterator(buffer):
            # if record.rec_type == "response":
            #     print(record.rec_headers.get_header("WARC-Target-URI"))
            pages.append(
                {
                    "type": record.rec_type,
                    "content_type": record.rec_headers.get_header("Content-Type"),
                    "url": record.rec_headers.get_header("WARC-Target-URI"),
                    "rec_headers": record.rec_headers,
                    "http_headers": record.http_headers,
                    "content": record.content_stream().read(),
                }
            )

    return pages


pages = read_warc_file(my_stream)
pages

[{'type': 'response',
  'content_type': 'application/http; msgtype=response',
  'url': 'https://commoncrawl.org/faq',
  'rec_headers': StatusAndHeaders(protocol = 'WARC/1.0', statusline = '', headers = [('WARC-Type', 'response'), ('WARC-Date', '2023-09-22T05:13:46Z'), ('WARC-Record-ID', '<urn:uuid:85961d16-a97d-4db0-99de-a57fbfaa127a>'), ('Content-Length', '32850'), ('Content-Type', 'application/http; msgtype=response'), ('WARC-Warcinfo-ID', '<urn:uuid:fa32fb87-35cf-4143-9576-80f7cd1f873c>'), ('WARC-Concurrent-To', '<urn:uuid:4a006604-5cba-498d-91d3-9261f287dae5>'), ('WARC-IP-Address', '3.233.126.24'), ('WARC-Target-URI', 'https://commoncrawl.org/faq'), ('WARC-Payload-Digest', 'sha1:JTYFLU7J2EQBLJ7A4GHLTLMYX7BXNKVM'), ('WARC-Block-Digest', 'sha1:2DI3C656YMWHNMFUPVH45VEU5ZX2SJ4W'), ('WARC-Identified-Payload-Type', 'text/html')]),
  'http_headers': StatusAndHeaders(protocol = 'HTTP/1.1', statusline = '200 OK', headers = [('Date', 'Fri, 22 Sep 2023 05:13:46 GMT'), ('Content-Type', 'text/h

In [4]:
list(pages[0]["rec_headers"].__dict__.keys())

['statusline', 'headers', 'protocol', 'total_len', 'headers_buff']

In [5]:
def warc_header_to_dict(page_headers):
    return {k: v for k, v in page_headers.headers} | {
        k: v for k, v in page_headers.__dict__.items() if k != "headers"
    }


warc_header_to_dict(pages[0]["http_headers"])

{'Date': 'Fri, 22 Sep 2023 05:13:46 GMT',
 'Content-Type': 'text/html',
 'X-Crawler-Content-Length': '7328',
 'Content-Length': '32374',
 'Connection': 'keep-alive',
 'x-lambda-id': 'ce641072-2780-4a6f-81bc-7f0e9a43e555',
 'X-Crawler-Content-Encoding': 'gzip',
 'Accept-Ranges': 'bytes',
 'Age': '67479',
 'X-Served-By': 'cache-iad-kcgs7200061-IAD',
 'X-Cache': 'HIT',
 'X-Cache-Hits': '1',
 'X-Timer': 'S1695359626.126484,VS0,VE4',
 'Vary': 'x-wf-forwarded-proto, Accept-Encoding',
 'X-Cluster-Name': 'us-east-1-prod-hosting-red',
 'statusline': '200 OK',
 'protocol': 'HTTP/1.1',
 'total_len': 476,
 'headers_buff': None}

In [6]:
warc_header_to_dict(pages[0]["rec_headers"])

{'WARC-Type': 'response',
 'WARC-Date': '2023-09-22T05:13:46Z',
 'WARC-Record-ID': '<urn:uuid:85961d16-a97d-4db0-99de-a57fbfaa127a>',
 'Content-Length': '32850',
 'Content-Type': 'application/http; msgtype=response',
 'WARC-Warcinfo-ID': '<urn:uuid:fa32fb87-35cf-4143-9576-80f7cd1f873c>',
 'WARC-Concurrent-To': '<urn:uuid:4a006604-5cba-498d-91d3-9261f287dae5>',
 'WARC-IP-Address': '3.233.126.24',
 'WARC-Target-URI': 'https://commoncrawl.org/faq',
 'WARC-Payload-Digest': 'sha1:JTYFLU7J2EQBLJ7A4GHLTLMYX7BXNKVM',
 'WARC-Block-Digest': 'sha1:2DI3C656YMWHNMFUPVH45VEU5ZX2SJ4W',
 'WARC-Identified-Payload-Type': 'text/html',
 'statusline': '',
 'protocol': 'WARC/1.0',
 'total_len': 576,
 'headers_buff': None}

In [7]:
import bs4


def get_text_from_content(content):
    soup = bs4.BeautifulSoup(content, "html.parser")
    return soup.get_text(separator="\n")


print(get_text_from_content(pages[0]["content"]))

Common Crawl - FAQ




The Data
Overview
Web Graphs
Latest Crawl
Resources
Get Started
Blog
Examples
Use Cases
CCBot
FAQ
Community
Research Papers
Mailing List Archive
About
Team
Mission
Impact
Privacy Policy
Terms of Use
Search
Contact Us
Frequently asked questions
Everything you need to know regarding general and technical questions about 
Common Crawl.
General Questions
What is Common Crawl?
Common Crawl is a 501(c)(3) non-profit organization dedicated to providing a copy of the Internet to Internet researchers, companies and individuals at no cost for the purpose of research and analysis.
What can you do with Common Crawl data?
The possibilities are endless. People have used the data to improve language translation software, predict trends, track disease propagation, and much more.
‍
The crawl data is stored on Amazon’s S3 service, allowing it to be bulk downloaded as well as directly accessed for 
Map-Reduce
 processing in EC2.
Can’t Google or Microsoft just do what Common Crawl d

In [8]:
import pandas as pd

url_index_path = "./data/common-crawl/becker.gz.parquet"
index_df = pd.read_parquet(url_index_path)
index_df

Unnamed: 0,filename,file_row_number,url_surtkey,url,url_host_name,url_host_tld,url_host_2nd_last_part,url_host_3rd_last_part,url_host_4th_last_part,url_host_5th_last_part,...,content_mime_detected,content_charset,content_languages,content_truncated,warc_filename,warc_record_offset,warc_record_length,warc_segment,crawl,subset
0,./download/cc-index/table/cc-main/warc/crawl=C...,710200,"com,beckershospitalreview)/",https://www.beckershospitalreview.com/,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,application/xhtml+xml,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,683709535,22485,1707947474533.12,CC-MAIN-2024-10,warc
1,./download/cc-index/table/cc-main/warc/crawl=C...,710201,"com,beckershospitalreview)/",https://www.beckershospitalreview.com/,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,application/xhtml+xml,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,662961425,22657,1707947474649.44,CC-MAIN-2024-10,warc
2,./download/cc-index/table/cc-main/warc/crawl=C...,710202,"com,beckershospitalreview)/10-top-patient-safe...",https://www.beckershospitalreview.com/10-top-p...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,application/xhtml+xml,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,661288654,23125,1707947474671.63,CC-MAIN-2024-10,warc
3,./download/cc-index/table/cc-main/warc/crawl=C...,710203,"com,beckershospitalreview)/100-community-hospi...",https://www.beckershospitalreview.com/100-comm...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,application/xhtml+xml,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947475...,655320135,15953,1707947475203.41,CC-MAIN-2024-10,warc
4,./download/cc-index/table/cc-main/warc/crawl=C...,710204,"com,beckershospitalreview)/100-community-hospi...",https://www.beckershospitalreview.com/100-comm...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,application/xhtml+xml,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,683219599,15968,1707947474526.76,CC-MAIN-2024-10,warc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18462,./download/cc-index/table/cc-main/warc/crawl=C...,728662,"com,beckershospitalreview,go)/your-rcm-needs-a...",https://go.beckershospitalreview.com/your-rcm-...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,text/html,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947476...,277464735,15001,1707947476452.25,CC-MAIN-2024-10,warc
18463,./download/cc-index/table/cc-main/warc/crawl=C...,728663,"com,beckershospitalreview,go)/your-roadmap-to-...",https://go.beckershospitalreview.com/your-road...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,text/html,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947476...,273327042,15173,1707947476442.30,CC-MAIN-2024-10,warc
18464,./download/cc-index/table/cc-main/warc/crawl=C...,728664,"com,beckershospitalreview,go)/your-roadmap-to-...",https://go.beckershospitalreview.com/your-road...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,text/html,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947473...,295586637,15169,1707947473824.45,CC-MAIN-2024-10,warc
18465,./download/cc-index/table/cc-main/warc/crawl=C...,728665,"com,beckershospitalreview,go)/zero-contact-zer...",https://go.beckershospitalreview.com/zero-cont...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,text/html,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,285814085,13532,1707947474649.44,CC-MAIN-2024-10,warc


In [9]:
index_df.columns

Index(['filename', 'file_row_number', 'url_surtkey', 'url', 'url_host_name',
       'url_host_tld', 'url_host_2nd_last_part', 'url_host_3rd_last_part',
       'url_host_4th_last_part', 'url_host_5th_last_part',
       'url_host_registry_suffix', 'url_host_registered_domain',
       'url_host_private_suffix', 'url_host_private_domain',
       'url_host_name_reversed', 'url_protocol', 'url_port', 'url_path',
       'url_query', 'fetch_time', 'fetch_status', 'fetch_redirect',
       'content_digest', 'content_mime_type', 'content_mime_detected',
       'content_charset', 'content_languages', 'content_truncated',
       'warc_filename', 'warc_record_offset', 'warc_record_length',
       'warc_segment', 'crawl', 'subset'],
      dtype='object')

In [10]:
WARC_FILENAME = "warc_filename"
WARC_RECORD_OFFSET = "warc_record_offset"
WARC_RECORD_LENGTH = "warc_record_length"
URL = "url"

example = index_df[[URL, WARC_FILENAME, WARC_RECORD_OFFSET, WARC_RECORD_LENGTH]].iloc[0]
example

url                              https://www.beckershospitalreview.com/
warc_filename         crawl-data/CC-MAIN-2024-10/segments/1707947474...
warc_record_offset                                            683709535
warc_record_length                                                22485
Name: 0, dtype: object

In [23]:
import logging
import re
import sys
import traceback

import tiktoken
from aiolimiter import AsyncLimiter
import tenacity

limiter = AsyncLimiter(1, 1)  # up to one request per second

# logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
# logger = logging.getLogger(__name__)

# persistent_download_from_cc = tenacity.retry(
#     download_from_cc,
#     stop=tenacity.stop_after_delay(5),
#     wait=tenacity.wait_fixed(1),
#     reraise=True,
#     # retry=tenacity.retry_if_exception_type(ValueError),
#     # before_sleep=tenacity.before_sleep_log(logger, logging.DEBUG),
# )


def clean_text(text: str, max_tokens: int = None, model: str = "gpt-3.5-turbo") -> str:
    # Get text as ascii
    text = text.encode("ascii", "ignore").decode("ascii")

    # clean whitespace and replace consecutive newlines
    text = re.sub(r"[\t ]*\n[\t ]*", "\\n", text)
    text = re.sub(r"\n{2,}", "\\n\\n", text)
    text = text.strip()

    # Up to max_tokens per website
    if max_tokens is not None:
        encoding = tiktoken.encoding_for_model(model)
        text = encoding.decode(encoding.encode(text)[:max_tokens])
    return text


async def process_page(client, example, limiter):
    async with asyncio.Semaphore(50):
        # print(f"Processing {example[URL]}")
        async with limiter:
            try:
                cc_raw = await download_from_cc(
                    client,
                    example[WARC_FILENAME],
                    example[WARC_RECORD_OFFSET],
                    example[WARC_RECORD_LENGTH],
                )
                # cc_raw = await persistent_download_from_cc(example[WARC_FILENAME], example[WARC_RECORD_OFFSET], example[WARC_RECORD_LENGTH])
            except Exception as e:
                tb = traceback.format_exc()
                error = str(e)
                return {"type": "error", "error": error, "traceback": tb}
        pages = read_warc_file(BytesIO(cc_raw))
        text = clean_text(get_text_from_content(pages[0]["content"]) if pages else '')
        #  {
        #                 "type": record.rec_type,
        #                 "content_type": record.rec_headers.get_header("Content-Type"),
        #                 "url": record.rec_headers.get_header("WARC-Target-URI"),
        #                 "rec_headers": record.rec_headers,
        #                 "http_headers": record.http_headers,
        #                 "content": record.content_stream().read(),
        #             }
        return {
            "type": pages[0]["type"],
            "content_type": pages[0]["content_type"],
            "url": pages[0]["url"],
            "rec_headers": warc_header_to_dict(pages[0]["rec_headers"]),
            "http_headers": warc_header_to_dict(pages[0]["http_headers"]),
            "text": text,
        }


result = await process_page(client, example, limiter)
result

{'type': 'response',
 'content_type': 'application/http; msgtype=response',
 'url': 'https://www.beckershospitalreview.com/',
 'rec_headers': {'WARC-Type': 'response',
  'WARC-Date': '2024-02-24T13:57:49Z',
  'WARC-Record-ID': '<urn:uuid:aa1cf157-c069-4a49-b0db-cab0a6428927>',
  'Content-Length': '111068',
  'Content-Type': 'application/http; msgtype=response',
  'WARC-Warcinfo-ID': '<urn:uuid:b4afb19e-e9ba-4876-918c-d7851fbde11b>',
  'WARC-Concurrent-To': '<urn:uuid:ec81848f-ee3c-4149-9405-4a084ee9d8d8>',
  'WARC-IP-Address': '67.227.242.177',
  'WARC-Target-URI': 'https://www.beckershospitalreview.com/',
  'WARC-Payload-Digest': 'sha1:PCQL6VVNRDLT2X64DJJKHOC2OQEBBDZ3',
  'WARC-Block-Digest': 'sha1:BIJR3Y7TLOFJU45U5YE6YGXEWI42M3WF',
  'WARC-Identified-Payload-Type': 'application/xhtml+xml',
  'statusline': '',
  'protocol': 'WARC/1.0',
  'total_len': 602,
  'headers_buff': None},
 'http_headers': {'Date': 'Sat, 24 Feb 2024 13:57:49 GMT',
  'Server': 'Apache',
  'Expires': 'Wed, 17 Aug

In [24]:
print(result["text"][:100])

Becker's Hospital Review | Healthcare News & Analysis

Becker's Healthcare:

Hospital

ASC

Spine

C


In [25]:
import tqdm

print(tqdm.__version__)

4.66.2


In [38]:
import asyncio
from tqdm.asyncio import tqdm


async def process_examples(examples):
    examples = [example for _, example in examples.iterrows()]
    limiter = AsyncLimiter(5, 1)
    client = httpx.AsyncClient()

    return await tqdm.gather(*[process_page(client, example, limiter) for example in examples], total=len(examples))


examples = index_df.iloc[:1]
results = await process_examples(
    examples
)
results



Failed to fetch data: 403


IndexError: list index out of range

In [37]:
limiter = AsyncLimiter(200, 1)

examples = index_df.iloc[:50]
results = await process_examples(
    examples
)
pd.DataFrame(results)

  0%|          | 0/50 [00:00<?, ?it/s]

Failed to fetch data: 403


IndexError: list index out of range

Failed to fetch data: 403


Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fetch data: 403
Failed to fe

In [None]:
index_df.shape

(18467, 35)

In [None]:
all_examples = [example for _, example in index_df.iterrows()]
pd.DataFrame(all_examples)

Unnamed: 0,filename,file_row_number,url_surtkey,url,url_host_name,url_host_tld,url_host_2nd_last_part,url_host_3rd_last_part,url_host_4th_last_part,url_host_5th_last_part,...,content_charset,content_languages,content_truncated,warc_filename,warc_record_offset,warc_record_length,warc_segment,crawl,subset,shard_idx
0,./download/cc-index/table/cc-main/warc/crawl=C...,710200,"com,beckershospitalreview)/",https://www.beckershospitalreview.com/,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,683709535,22485,1707947474533.12,CC-MAIN-2024-10,warc,0
1,./download/cc-index/table/cc-main/warc/crawl=C...,710201,"com,beckershospitalreview)/",https://www.beckershospitalreview.com/,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,662961425,22657,1707947474649.44,CC-MAIN-2024-10,warc,1
2,./download/cc-index/table/cc-main/warc/crawl=C...,710202,"com,beckershospitalreview)/10-top-patient-safe...",https://www.beckershospitalreview.com/10-top-p...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,661288654,23125,1707947474671.63,CC-MAIN-2024-10,warc,2
3,./download/cc-index/table/cc-main/warc/crawl=C...,710203,"com,beckershospitalreview)/100-community-hospi...",https://www.beckershospitalreview.com/100-comm...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947475...,655320135,15953,1707947475203.41,CC-MAIN-2024-10,warc,3
4,./download/cc-index/table/cc-main/warc/crawl=C...,710204,"com,beckershospitalreview)/100-community-hospi...",https://www.beckershospitalreview.com/100-comm...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,683219599,15968,1707947474526.76,CC-MAIN-2024-10,warc,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18462,./download/cc-index/table/cc-main/warc/crawl=C...,728662,"com,beckershospitalreview,go)/your-rcm-needs-a...",https://go.beckershospitalreview.com/your-rcm-...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947476...,277464735,15001,1707947476452.25,CC-MAIN-2024-10,warc,14
18463,./download/cc-index/table/cc-main/warc/crawl=C...,728663,"com,beckershospitalreview,go)/your-roadmap-to-...",https://go.beckershospitalreview.com/your-road...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947476...,273327042,15173,1707947476442.30,CC-MAIN-2024-10,warc,15
18464,./download/cc-index/table/cc-main/warc/crawl=C...,728664,"com,beckershospitalreview,go)/your-roadmap-to-...",https://go.beckershospitalreview.com/your-road...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947473...,295586637,15169,1707947473824.45,CC-MAIN-2024-10,warc,0
18465,./download/cc-index/table/cc-main/warc/crawl=C...,728665,"com,beckershospitalreview,go)/zero-contact-zer...",https://go.beckershospitalreview.com/zero-cont...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,285814085,13532,1707947474649.44,CC-MAIN-2024-10,warc,1


In [30]:
index_df['shard_idx'] = index_df.index%16
index_df

Unnamed: 0,filename,file_row_number,url_surtkey,url,url_host_name,url_host_tld,url_host_2nd_last_part,url_host_3rd_last_part,url_host_4th_last_part,url_host_5th_last_part,...,content_charset,content_languages,content_truncated,warc_filename,warc_record_offset,warc_record_length,warc_segment,crawl,subset,shard_idx
0,./download/cc-index/table/cc-main/warc/crawl=C...,710200,"com,beckershospitalreview)/",https://www.beckershospitalreview.com/,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,683709535,22485,1707947474533.12,CC-MAIN-2024-10,warc,0
1,./download/cc-index/table/cc-main/warc/crawl=C...,710201,"com,beckershospitalreview)/",https://www.beckershospitalreview.com/,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,662961425,22657,1707947474649.44,CC-MAIN-2024-10,warc,1
2,./download/cc-index/table/cc-main/warc/crawl=C...,710202,"com,beckershospitalreview)/10-top-patient-safe...",https://www.beckershospitalreview.com/10-top-p...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,661288654,23125,1707947474671.63,CC-MAIN-2024-10,warc,2
3,./download/cc-index/table/cc-main/warc/crawl=C...,710203,"com,beckershospitalreview)/100-community-hospi...",https://www.beckershospitalreview.com/100-comm...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947475...,655320135,15953,1707947475203.41,CC-MAIN-2024-10,warc,3
4,./download/cc-index/table/cc-main/warc/crawl=C...,710204,"com,beckershospitalreview)/100-community-hospi...",https://www.beckershospitalreview.com/100-comm...,www.beckershospitalreview.com,com,beckershospitalreview,www,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,683219599,15968,1707947474526.76,CC-MAIN-2024-10,warc,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18462,./download/cc-index/table/cc-main/warc/crawl=C...,728662,"com,beckershospitalreview,go)/your-rcm-needs-a...",https://go.beckershospitalreview.com/your-rcm-...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947476...,277464735,15001,1707947476452.25,CC-MAIN-2024-10,warc,14
18463,./download/cc-index/table/cc-main/warc/crawl=C...,728663,"com,beckershospitalreview,go)/your-roadmap-to-...",https://go.beckershospitalreview.com/your-road...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947476...,273327042,15173,1707947476442.30,CC-MAIN-2024-10,warc,15
18464,./download/cc-index/table/cc-main/warc/crawl=C...,728664,"com,beckershospitalreview,go)/your-roadmap-to-...",https://go.beckershospitalreview.com/your-road...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947473...,295586637,15169,1707947473824.45,CC-MAIN-2024-10,warc,0
18465,./download/cc-index/table/cc-main/warc/crawl=C...,728665,"com,beckershospitalreview,go)/zero-contact-zer...",https://go.beckershospitalreview.com/zero-cont...,go.beckershospitalreview.com,com,beckershospitalreview,go,,,...,UTF-8,eng,,crawl-data/CC-MAIN-2024-10/segments/1707947474...,285814085,13532,1707947474649.44,CC-MAIN-2024-10,warc,1


In [31]:
index_df[index_df['shard_idx'] == 0].shape

(1155, 35)

In [32]:
import concurrent.futures as cf

def add_one(num):
    for i in range(100000000):
        a = 1+1
    return num+1

with cf.ProcessPoolExecutor() as executor:
    plus_one = [executor.submit(add_one, idx) for idx in range(32)]

print(plus_one)

In [33]:
# with cf.ProcessPoolExecutor() as executor:
#     results = [executor.submit(process_examples, index_df.iloc[idx:idx+1]) for idx in range(2)]

  del r
  del r


In [36]:
asyncio.run(process_examples(index_df.iloc[:1]))

RuntimeError: asyncio.run() cannot be called from a running event loop

In [35]:
results[0].result()

TypeError: cannot pickle 'coroutine' object