In [5]:
import requests
import time
import json
import StringIO
import gzip
import csv
import codecs

from bs4 import BeautifulSoup

domain_list = ["cer.be", "eimrail.org"]
index_list = ["2016-40"]

#
# Searches the Common Crawl Index for a domain.
#
def search_domain(domain):

    record_list = []
    
    print "[*] Trying target domain: %s" % domain
    
    for index in index_list:
        
        print "[*] Trying index %s" % index
        
        cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index
        cc_url += "url=%s&matchType=domain&output=json" % domain
        
        response = requests.get(cc_url)
        
        if response.status_code == 200:
            
            records = response.content.splitlines()
            
            for record in records:
                record_list.append(json.loads(record))
            
            print "[*] Added %d results." % len(records)
            
    
    print "[*] Found a total of %d hits." % len(record_list)
    
    return record_list        

#
# Downloads a page from Common Crawl - adapted graciously from @Smerity - thanks man!
# https://gist.github.com/Smerity/56bc6f21a8adec920ebf
#
def download_page(record):

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    # Getting the file on S3 is equivalent however - you can request a Range
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    
    # We can then use the Range header to ask for just this set of bytes
    resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
    
    # The page is stored compressed (gzip) to save space
    # We can extract it using the GZIP library
    raw_data = StringIO.StringIO(resp.content)
    f = gzip.GzipFile(fileobj=raw_data)
    
    # What we have now is just the WARC response, formatted:
    data = f.read()
    
    response = ""
    
    if len(data):
        try:
            warc, header, response = data.strip().split('\r\n\r\n', 2)
        except:
            pass
            
    return response


#
# Extract links from the HTML  
#
def extract_external_links(html_content,link_list, domain):

    parser = BeautifulSoup(html_content, "html.parser")
        
    links = parser.find_all("a")
    
    if links:
        
        for link in links:
            href = link.attrs.get("href")
            
            if href is not None:
                
                if domain not in href:
                    if href not in link_list and href.startswith("http"):
                        print "[*] Discovered external link: %s" % href
                        link_list.append(href)

    return link_list


def extract_domain_name(link_list):
    extracted_domain_list = []
    for link in link_list:
        website = link.split("/")[2]
        if "ww" in website:
            website = website[website.find('.')+1:]
        extracted_domain_list.append(website)
    
    return extracted_domain_list





def get_data(domain_list):
    master_dict = {}
    
    for domain in domain_list:

        record_list = search_domain(domain)
        link_list   = []

        for record in record_list:

            html_content = download_page(record)

            # print "[*] Retrieved %d bytes for %s" % (len(html_content),record['url'])

            link_list = extract_external_links(html_content,link_list,domain)

            master_dict[domain] = extract_domain_name(link_list)


        # print "[*] Total external links discovered: %d" % len(link_list)
    
    return master_dict

search_results = {}

for domain in domain_list:
    data = search_domain(domain)
    search_results[domain] = len(data)
    
search_results
# with codecs.open("%s-links.csv" % domain,"wb",encoding="utf-8") as output:

#    fields = ["URL"]
    
#    logger = csv.DictWriter(output,fieldnames=fields)
#    logger.writeheader()
    
#    for link in link_list:
#        logger.writerow({"URL":link})

[*] Trying target domain: cer.be
[*] Trying index 2016-40
[*] Added 9 results.
[*] Found a total of 9 hits.
[*] Trying target domain: eimrail.org
[*] Trying index 2016-40
[*] Added 3 results.
[*] Found a total of 3 hits.


{'cer.be': 9, 'eimrail.org': 3}

In [26]:
import networkx as nx


G = nx.Graph()

G.add_nodes_from(domain_list)

for node in G.nodes():
    print node
    
print "the full list:", G.nodes()

G

eimrail.org
cer.be
the full list: ['eimrail.org', 'cer.be']


<networkx.classes.graph.Graph at 0xaf3e70ec>

In [27]:
H = nx.path_graph(10)
G.add_nodes_from(H)

print "the full dict:", G.nodes()

the full dict: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'eimrail.org', 'cer.be']


In [28]:
G.add_node(H)
print "the full dict:", G.nodes()

the full dict: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'eimrail.org', 'cer.be', <networkx.classes.graph.Graph object at 0xaf066f8c>]


In [29]:


G.add_edge(1,2)
e = (2,3)
G.add_edge(*e)

DoubleG = nx.Graph()
for link in master_dict['eimrail.org']:
    DoubleG.add_edge('eimrail.org', link)
    
print G.edges()
DoubleG.edges()

[(1, 2), (2, 3)]


[(u'http://www.apotheka.gr/img/relogiosreplica.php', 'eimrail.org'),
 (u'https://eim.viadesk.com/do/userlogin', 'eimrail.org'),
 (u'https://www.facebook.com/pages/European-Rail-Infrastructure-Managers-EIM/130752031858',
  'eimrail.org'),
 (u'http://www.eagleproducts.com.au/melhor.php', 'eimrail.org'),
 (u'http://pabirds.org/Search/relogios.php', 'eimrail.org'),
 (u'http://www.vellendtech.com/homens.php', 'eimrail.org'),
 (u'http://ezineturk.com/indice.asp', 'eimrail.org'),
 (u'http://vlbk.se/home.asp', 'eimrail.org'),
 ('eimrail.org', u'http://onlyoffice.com/fr/?campaign=nonprofit'),
 ('eimrail.org', u'http://barsuraube.org/homepage.php'),
 ('eimrail.org', u'https://www.linkedin.com/company/1239529'),
 ('eimrail.org', u'http://www.jac.eu/products.asp')]

In [38]:
import matplotlib.pyplot as plt

nx.draw(DoubleG)
plt.savefig("test.png")