In [95]:
import pycld2 as cld2
import pandas as pd
import requests

from os import listdir
import matplotlib.pyplot as plt
from math import ceil

from queue import Queue
from threading import Thread

from pytictoc import TicToc
# Shared stopwatch used by the cells below: t.tic() starts it, t.toc() prints the elapsed time.
t = TicToc()


In [2]:
# Load the ranked domain list. The file is read without a header row; the first
# column (named 'id' here) is discarded so that only the 'url' column remains.
sites = pd.read_csv('top-1m.csv', header=None, names=['id', 'url'])[['url']]

In [8]:
# Quick estimate: how many of the top-ranked sites sit under a TLD from the list below?
tld = ['com', 'net', 'org', 'gov', 'uk', 'au', 'ca', 'nz', 'in'] # top level domains
first_n = 100000
top_urls = sites.url.iloc[:first_n]
# The TLD is whatever follows the last dot of the domain.
num = top_urls.map(lambda domain: domain.rsplit('.', 1)[-1] in tld).sum()
print(f'Within the {first_n} most popular sites')
print(f'{int(num/first_n*100)}% are in english domains')

Within the 100000 most popular sites
63% are in english domains


In [56]:
def get_site(domain, verbose = False, timeout = 10):
    """Download a site's front page and return its text if the page is English.

    The domain is tried over HTTPS first and then over plain HTTP. The HTTP
    attempt is made both when HTTPS answers with a non-200 status and when the
    HTTPS request fails outright.

    Args:
        domain: Bare host name without a scheme, e.g. 'example.com'.
        verbose: When True, print a short reason whenever the page is rejected.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled server can block the calling thread indefinitely.

    Returns:
        The decoded page text when a request succeeded with status 200 and
        cld2 reliably reports the content as more than 90% English;
        otherwise False.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = None
    for scheme in ('https://', 'http://'):
        try:
            response = requests.get(scheme + domain, headers=headers, timeout=timeout)
        except Exception:
            # Best effort: a failure on one scheme must not prevent trying the other.
            continue
        if response.status_code == 200:
            break

    if response is None:
        # Both attempts raised before any response was received.
        if verbose:
            print('got error')
        return False
    if response.status_code != 200:
        if verbose:
            print(f'status: {response.status_code}')
        return False

    try:
        # only return the content if it is in english
        is_reliable, _, details = cld2.detect(response.content)
        if not is_reliable:
            if verbose:
                print('not reliable')
            return False
        # Each entry of `details` is read as: [0] language name, [2] percentage of the text.
        is_english = any(lang[0] == 'ENGLISH' and lang[2] > 90 for lang in details)
        if not is_english:
            if verbose:
                print('not english')
            return False
        return response.text
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and SystemExit still propagate to the caller.
        if verbose:
            print('got error')
        return False
                

In [106]:
class DownloadWorker(Thread):
    """Worker thread that downloads queued sites and saves the accepted ones.

    Each queue item is expected to be an ``(index, link)`` tuple. When
    ``get_site(link)`` returns page text, it is written to
    ``<output_dir>/<index>.html``.
    """

    def __init__(self, queue, output_dir='./sites'):
        """Set up the worker.

        Args:
            queue: Queue holding the ``(index, link)`` jobs to process.
            output_dir: Directory that receives the saved pages. It is created
                here if it does not exist, so the first write cannot fail on a
                missing directory.
        """
        import os  # local import: the notebook only imports `listdir` from os

        Thread.__init__(self)
        self.queue = queue
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def run(self):
        """Process queue items forever; the loop never returns on its own."""
        while True:
            self._process_next()

    def _process_next(self):
        """Take one job off the queue, handle it, and always mark it as done.

        Any exception raised while handling the job is reported and swallowed.
        ``task_done()`` is called in a ``finally`` block because a job that is
        never marked done would make ``queue.join()`` block forever.
        """
        item = self.queue.get()
        try:
            # Expand the job tuple
            index, link = item
            if index%1000 == 0:
                # Progress marker: print the index and the elapsed time of the shared timer.
                print(index)
                t.toc()
            data = get_site(link)
            if data:
                # Explicit UTF-8 so arbitrary page text can be written on any platform.
                with open(f'{self.output_dir}/{index}.html', "w", encoding='utf-8') as file:
                    file.write(data)
        except Exception as exc:
            print(f'failed to process {item!r}: {exc!r}')
        finally:
            self.queue.task_done()



In [109]:
def main(links, num_workers=8):
    """Download every link using a pool of DownloadWorker threads.

    Args:
        links: Either a mapping-like object with ``.items()`` (e.g. a pandas
            Series or a dict) yielding ``(index, link)`` pairs, or a plain
            iterable of links, which is numbered from 0 with ``enumerate``.
        num_workers: Number of worker threads to start (default 8).

    Raises:
        ValueError: If ``num_workers`` is smaller than 1. With no workers the
            final ``queue.join()`` could never return.

    Note:
        The workers are daemon threads running an endless loop, so they stay
        alive (idle, waiting on the queue) after this function returns.
    """
    if num_workers < 1:
        raise ValueError('num_workers must be at least 1')

    jobs = links.items() if hasattr(links, 'items') else enumerate(links)

    # Create a queue to communicate with the worker threads
    queue = Queue()
    for _ in range(num_workers):
        worker = DownloadWorker(queue)
        # Setting daemon to True will let the main thread exit even though the workers are blocking
        worker.daemon = True
        worker.start()
    # Put the tasks into the queue as a tuple
    for index, link in jobs:
        queue.put((index, link))
    # Causes the main thread to wait for the queue to finish processing all the tasks
    queue.join()


In [110]:
# Crawl the first 60000 entries of the ranking and print the total elapsed time.
t.tic()
crawl_targets = sites.url[:60000]
main(crawl_targets)
print('done')
t.toc()

0
Elapsed time is 0.020111 seconds.
1000
Elapsed time is 558.326943 seconds.
3000
Elapsed time is 658.044600 seconds.
4000
Elapsed time is 1451.731736 seconds.
2000
Elapsed time is 1457.784338 seconds.
3000
Elapsed time is 2237.405049 seconds.
5000
Elapsed time is 2451.996237 seconds.
4000
Elapsed time is 2982.735280 seconds.
6000
Elapsed time is 3713.739316 seconds.
5000
Elapsed time is 4209.323505 seconds.
7000
Elapsed time is 4863.865789 seconds.
6000
Elapsed time is 5834.644517 seconds.
8000
Elapsed time is 6117.545051 seconds.
7000
Elapsed time is 7007.844797 seconds.
9000
Elapsed time is 7525.485269 seconds.
8000
Elapsed time is 8378.743163 seconds.
10000
Elapsed time is 8694.284455 seconds.
9000
Elapsed time is 10054.140520 seconds.
11000
Elapsed time is 10203.869011 seconds.
10000
Elapsed time is 11510.126170 seconds.
12000
Elapsed time is 11552.268492 seconds.
13000
Elapsed time is 13121.093887 seconds.
11000
Elapsed time is 13324.739868 seconds.
14000
Elapsed time is 14927.45

KeyboardInterrupt: 

In [None]:
# Histogram of which ranking indices ended up saved in ./sites.
files = [file for file in listdir('./sites') if not file.startswith('.')]
# Saved pages are named '<index>.html'; names whose stem is not a number are skipped
# instead of crashing the int() conversion.
stems = [file.split('.')[0] for file in files]
site_indices = [int(stem) for stem in stems if stem.isdecimal()]

if site_indices:
    # One bin per 200 indices, but never fewer than one bin (max index 0 would give 0 bins).
    plt.hist(site_indices, bins=max(1, ceil(max(site_indices)/200)))
    plt.xlabel('site index in the ranking')
    plt.ylabel('saved pages')
    plt.show()
else:
    print('no saved sites found in ./sites')