In [1]:
import requests
import json
import re
from url_normalize import url_normalize

Specify the interface with Common Crawl. As they have a URL search engine, it is quite convenient to search for URLs of target domain. Target domain could be specified as a regular expression. Here we simply specify target domain as a collection subpages of the domain.

In [2]:
def search_domain(domain):
    """
    Searches all Common Crawl Indices for a domain.
    """
    seen_urls = {}
    print ("[**] Trying target domain: %s\n" % domain)

    for index in index_list:
        print ("[*] Trying index %s\n" % index)

        cc_url = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index
        cc_url += "url=%s&matchType=domain&output=json" % domain

        response = requests.get(cc_url)

        if response.status_code == 200:

            records = response.content.splitlines()

            for record in records:
                record = json.loads(record)
                url = url_normalize(record['url'])
                if not url.endswith((file_exts)) and url not in seen_urls:
                    seen_urls[record['url']] = True

            print("[*] Found %d records\n" % len(records))

    urls = list(seen_urls.keys());
    print("[*] Found a total of %d hits\n" % len(urls))
    return urls

In [3]:
def deduplicate_urls_http_https (urls):
    urls_ignored = []
    for url in urls:
        if 'https'+url[4:] in urls:
            urls_ignored.append(url)
    urls_tmp = [url for url in urls if url not in urls_ignored]
    return urls_tmp

In [4]:
def printUrls(urls, domain):
    f = open(domain + ".csv","w+")
    for i in range(len(urls)):
        f.write(str(i) + "\t" + urls[i] + "\n")
    f.close()

Specifies the index names. We can just crawl the most recent one from Common Crawl http://index.commoncrawl.org. We can also crawl from multiple indices, which has better coverage but it has the potential risk to have duplicated pages. In the crawler of our Entity Search, we use a few tricks to remove duplication such as checking redirection and comparing titles and urls of potential duplicated pages. A more ideal approach is to compare screenshots of pages. It is not too expensive because duplicated pages usually have similar URLs so that we don't have to check all possible pairs of URLs. This leaves for future work.

In [5]:
# domain = "cs.illinois.edu"
# domain = "ece.illinois.edu"
# domain = "scs.cmu.edu"
# domain = "eecs.mit.edu"
# domain = "cs.stanford.edu"
# domain = "eecs.berkeley.edu"
# domain = "wiki.illinois.edu"
# domain = "illinois.edu"
# domain = "wiki.engr.illinois.edu"
# domain = "cs.cornell.edu"
# domain = "cs.washington.edu"
# domain = "cc.gatech.edu"
# domain = "cs.princeton.edu"
domain = "cs.utexas.edu"


index_list = ["2018-17"]
# use the index_list below if wanting to crawl old pages. It has better coverage than the above but it has potential risk of duplication
# index_list = ["2018-17","2018-09", "2018-05", "2017-51", "2017-47", "2017-43", "2017-39", "2017-34", "2017-30", "2017-26", "2017-22", "2017-17", "2017-13", "2017-09", "2017-04", "2016-50", "2016-44", "2016-40", "2016-36", "2016-30", "2016-26", "2016-22", "2016-18", "2016-07", "2015-48", "2015-40", "2015-35", "2015-32", "2015-27", "2015-22", "2015-18", "2015-14", "2015-11", "2015-06", "2014-52", "2014-49", "2014-42", "2014-41", "2014-35", "2014-23", "2014-15", "2014-10", "2013-48", "2013-20"]

# read list of file extensions to filter
file_exts = (".aif",".cda",".mid",".mp3",".mpa",".ogg",".wav",".wma",".wpl",".7z",".arj",".deb",".pkg",".rar",".rpm",".tar.gz",".z",".zip",".bin",".dmg",".iso",".toast",".vcd",".csv",".dat",".db",".log",".mdb",".sav",".sql",".tar",".xml",".apk",".bat",".bin",".cgi",".exe",".gadget",".jar",".py",".wsf",".fnt",".fon",".otf",".ttf",".ai",".bmp",".gif",".ico",".jpeg",".jpg",".png",".ps",".psd",".svg",".tif",".tiff",".asp",".aspx",".cer",".cfm",".cgi",".pl",".css",".js",".jsp",".part",".key",".odp",".pps",".ppt",".pptx",".c",".class",".cpp",".cs",".h",".java",".sh",".swift",".vb",".ods",".xlr",".xls",".xlsx",".bak",".cab",".cfg",".cpl",".cur",".dll",".dmp",".drv",".icns",".ico",".ini",".lnk",".msi",".sys",".tmp",".3g2",".3gp",".avi",".flv",".h264",".m4v",".mkv",".mov",".mp4",".mpg",".mpeg",".rm",".swf",".vob",".wmv",".doc",".docx",".odt",".pdf",".rtf",".tex",".txt",".wks",".wps",".wpd","ipynb")

In [6]:
urls = search_domain(domain)
len(urls)

[**] Trying target domain: cs.utexas.edu

[*] Trying index 2018-17

[*] Found 3209 records

[*] Found a total of 2908 hits



2908

In [7]:
printUrls(urls, domain)

In [8]:
urls = deduplicate_urls_http_https (urls)
len(urls)

2895

In [9]:
urls

['https://cs.utexas.edu/',
 'https://www.cs.utexas.edu/',
 'https://cs.utexas.edu/&nbsp;AustinVilla/sim/3dsimulation/AustinVilla3DSimulationFiles/2013/html/scram.html',
 'https://www.cs.utexas.edu/&nbsp;AustinVilla/sim/3dsimulation/AustinVilla3DSimulationFiles/2013/html/scram.html',
 'https://cs.utexas.edu/&nbsp;AustinVilla/sim/3dsimulation/overlappingLayeredLearning.html',
 'https://www.cs.utexas.edu/&nbsp;AustinVilla/sim/3dsimulation/overlappingLayeredLearning.html',
 'https://www.cs.utexas.edu/about',
 'https://www.cs.utexas.edu/about-us/new-building',
 'https://www.cs.utexas.edu/about/campus-austin/accommodations',
 'https://www.cs.utexas.edu/about/campus-austin/restaurants',
 'https://cs.utexas.edu/act/',
 'https://www.cs.utexas.edu/act/',
 'https://www.cs.utexas.edu/advisory-council',
 'https://www.cs.utexas.edu/alumni',
 'https://www.cs.utexas.edu/alumni/get-involved',
 'https://www.cs.utexas.edu/awards',
 'https://www.cs.utexas.edu/awards/graduate-student-awards',
 'https://www