In [6]:
import pandas as pd
import json
import gzip
import os 
from tqdm import tqdm 
import requests
import os
import numpy as np 


In [30]:
class ColumnarExplorer: 
    def __init__(self, monthly_path) -> None: 
        self.monthly_path = monthly_path
        self.schema = self._fetch_schema()
        self.monthly_urls = self._get_monthly_indices()

    def _fetch_schema(self):
        """
        Get schema of Common Crawl columnar format 
        """
        schema = 'https://github.com/commoncrawl/cc-index-table/blob/main/src/main/resources/schema/cc-index-schema-flat.json'
        resp = requests.get(schema)

        return json.loads(resp.text)

    def _get_monthly_indices(self):
        """
        Extract indices to access monthly dumps of the crawl 
        """
        index_paths = requests.get(self.monthly_path)
        open('monthly_index.gz', 'wb').write(index_paths.content)
        links_for_download_cdx = []
        with gzip.open('monthly_index.gz', 'rb') as f:
            for line in f:
                links_for_download_cdx.append(line)
        os.remove('monthly_index.gz')
        links_for_download_cdx_strings = [str(byte_string, 'UTF-8').rstrip('\n') for byte_string in links_for_download_cdx]
        links_to_get_indices = ['https://data.commoncrawl.org/' + x for x in links_for_download_cdx_strings] 
        links_to_get_indices = links_to_get_indices[0:6]

        return(links_to_get_indices)

    def get_domain(self, domain: str, chunks: int, clean = False):   #set default n of chunks as 1
        """
        Get monthly records of a user defnined domain (e.g. .com)
        """
        links_to_get_indices = self.monthly_urls
        chunks_of_indices = numpy.array_split(links_to_get_indices, chunks)
        results = pd.DataFrame()

        for chunk in chunks_of_indices:
            print(chunk)
            urls = pd.DataFrame()
            for link_of_indices in tqdm(chunk):
                link_to_download = 'curl -o columnar.gz.parquet ' + ''.join(link_of_indices) 
                os.system(link_to_download)
                df = pd.read_parquet('columnar.gz.parquet', engine='fastparquet')
                data = df.loc[df['url_host_private_suffix'] == domain] 
                print('number of found f'{domain}' in this chunk is')
                print(len(data))
                urls = pd.concat([urls, data], axis=0).reset_index(drop=True)
                print(len(urls))
                os.remove('columnar.gz.parquet')

                if clean:
                    urls = urls.drop(['url_surtkey', 'url_host_tld', 'url_host_2nd_last_part', 'url_host_3rd_last_part', 
                                    'url_host_4th_last_part', 'url_host_5th_last_part', 'url_host_name_reversed', 'url_protocol', 
                                    'url_port', 'url_path', 'url_query', 'content_digest', 'content_mime_detected'], axis = 1) 	
                    urls = urls.loc[(urls['fetch_status'] == 200)]
                    urls = urls[~urls.warc_filename.str.contains('robotstxt')]  

                else: 
                    urls = urls 
            
        results = pd.concat([results, urls])
        return results



In [31]:
trial_1 = ColumnarExplorer('https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-33/cc-index-table.paths.gz')

In [32]:
trial_1.monthly_urls

['https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00000-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
 'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00001-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
 'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00002-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
 'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00003-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
 'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00004-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
 'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00005-d466b69e-be2b-4525-ac34-1b10d

In [33]:
trial_1.get_domain('co.uk', 2, clean = True)

2
[array(['https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00000-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
       'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00001-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
       'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00002-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet'],
      dtype='<U166'), array(['https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00003-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
       'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00004-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet',
       'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/su

  0%|          | 0/3 [00:00<?, ?it/s]  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  172M  100  172M    0     0  18.5M      0  0:00:09  0:00:09 --:--:-- 19.2M
 33%|███▎      | 1/3 [00:15<00:30, 15.19s/it]

quanti?
0
0


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  144M  100  144M    0     0  18.0M      0  0:00:07  0:00:07 --:--:-- 18.9M
 67%|██████▋   | 2/3 [00:29<00:14, 14.49s/it]

quanti?
0
0


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  219M  100  219M    0     0  19.1M      0  0:00:11  0:00:11 --:--:-- 19.6M
100%|██████████| 3/3 [00:48<00:00, 16.10s/it]


quanti?
0
0
['https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00003-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet'
 'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00004-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet'
 'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2022-33/subset=crawldiagnostics/part-00005-d466b69e-be2b-4525-ac34-1b10d57329da.c000.gz.parquet']


  0%|          | 0/3 [00:00<?, ?it/s]  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  207M  100  207M    0     0  14.5M      0  0:00:14  0:00:14 --:--:-- 13.8M
 33%|███▎      | 1/3 [00:21<00:43, 21.85s/it]

quanti?
0
0


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  165M  100  165M    0     0  17.1M      0  0:00:09  0:00:09 --:--:-- 19.0M
 67%|██████▋   | 2/3 [00:38<00:18, 18.61s/it]

quanti?
0
0


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  155M  100  155M    0     0  14.9M      0  0:00:10  0:00:10 --:--:-- 15.7M
100%|██████████| 3/3 [00:54<00:00, 18.16s/it]

quanti?
0
0





Unnamed: 0,url,url_host_name,url_host_registry_suffix,url_host_registered_domain,url_host_private_suffix,url_host_private_domain,fetch_time,fetch_status,fetch_redirect,content_mime_type,content_charset,content_languages,content_truncated,warc_filename,warc_record_offset,warc_record_length,warc_segment
