In [1]:
from concurrent.futures import Future,ThreadPoolExecutor,as_completed

In [2]:
import time
import requests

In [3]:
from lxml import html

In [4]:
import urllib.parse

In [17]:
from queue import Queue
from threading import Thread

In [8]:
def get_hostname(uls):
    parsed = urllib.parse.urlparse(uls)
    host = '://'.join((parsed.scheme,parsed.netloc))
    return host

In [9]:
def url_filter(fn,url_iter):
    if not callable(fn):
        raise TypeError(f'{fn} is not callable')
    return filter(fn,url_iter)
    

In [10]:
def get_link(url):
    resp = fetch(url)
    if not resp:
        print(f'not found {url}')
    tree = html.fromstring(resp.content)
    links = tree.xpath('//a/@href')
    for link in links:
        yield link

In [11]:
def fetch(url):
    print(f'fetching : {url}...')
    resp = requests.get(url)
    return resp if resp.status_code==200 else None

In [12]:
def parse(resp):
    print(f'parsing : {resp.url}...')
    tree = html.fromstring(resp.content)
    title = tree.xpath('//title/text()')
    if not title:
        raise ValueError(f'[Error!] parsing {resp.url} failed!')
    print(f'parsing {resp.url} done...')
    return title[0]

In [None]:
def main(start):
    host = get_hostname(start)
    urls = get_link(start)
    cleand_link = url_filter(clear_url,urls)
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        resp_fus = [executor.submit(fetch,urllib.parse.urljoin(host,url)) for url in cleand_link]
        for fu in as_completed(resp_fus):
            title = parse(fu.result().content)
            print(f'title:{title}')
        

In [27]:
def clear_link(link):
    if 'wiki' in link:
        return True
    else:
        return False
    

In [34]:
def source(start):
    host = get_hostname(start)
    urls = get_link(start)
    cleand_link = url_filter(clear_link,urls)
    return (urllib.parse.urljoin(host,link) for link in cleand_link)
  

In [29]:
def worker_fetch(u_q):
    while True:
        url = u_q.get()
        if url is None:
            break
        resp = fetch(url)
        print(f'{resp.status_code}')
        u_q.task_done()
#         resp_q.put(resp)

In [30]:
start='https://en.wikipedia.org/wiki/Algorithm'

In [31]:
q = Queue(maxsize=10)

In [32]:
threads = []

In [None]:
for i in range(4):
    t=Thread(target=worker_fetch,args=(q,))
    t.start()
    threads.append(t)
for i in source(start):
    q.put(i)
q.join()

for i in range(4):
    q.put(None)

for t in threads:
    t.join()

fetching : https://en.wikipedia.org/wiki/Algorithm...
fetching : https://en.wikipedia.org/wiki/Algorithm_(disambiguation)...fetching : https://en.wikipedia.org/wiki/File:Euclid_flowchart.svg...fetching : https://en.wikipedia.org/wiki/File:Euclid_flowchart.svg...fetching : https://en.wikipedia.org/wiki/Flowchart...fetching : https://en.wikipedia.org/wiki/Euclid%27s_algorithm...fetching : https://en.wikipedia.org/wiki/Mathematics...

fetching : https://en.wikipedia.org/wiki/Computer_science...
fetching : https://en.wikipedia.org/wiki/Help:IPA/English...




200
fetching : https://en.wikipedia.org/wiki/File:En-us-algorithm.ogg...
200
fetching : https://upload.wikimedia.org/wikipedia/commons/7/7f/En-us-algorithm.ogg...
200
fetching : https://en.wikipedia.org/wiki/Calculation...
200
fetching : https://en.wikipedia.org/wiki/Data_processing...
200
fetching : https://en.wikipedia.org/wiki/Automated_reasoning...
200
fetching : https://en.wikipedia.org/wiki/Effective_method...
200
fetching : htt

200
fetching : https://en.wikipedia.org/wiki/File:Sorting_quicksort_anim.gif...
200
fetching : https://en.wikipedia.org/wiki/File:Sorting_quicksort_anim.gif...
200
fetching : https://en.wikipedia.org/wiki/Quicksort...
200
fetching : https://en.wikipedia.org/wiki/Pseudocode...
200
fetching : https://en.wikipedia.org/wiki/Pidgin_code...
200
fetching : https://en.wikipedia.org/wiki/Assignment_(computer_science)...
200
fetching : https://en.wikipedia.org/wiki/Euclid%27s_algorithm...
200
fetching : https://en.wikipedia.org/wiki/File:Euclid%27s_algorithm_Book_VII_Proposition_2_2.png...
200
fetching : https://en.wikipedia.org/wiki/File:Euclid%27s_algorithm_Book_VII_Proposition_2_2.png...
200
fetching : https://en.wikipedia.org/wiki/Euclid...
200
fetching : https://en.wikipedia.org/wiki/Greatest_common_divisor...
200
fetching : https://en.wikipedia.org/wiki/Euclid%27s_Elements...
200
fetching : https://en.wikipedia.org/wiki/Reductio_ad_absurdum...
200
fetching : https://en.wikipedia.org/wiki/F

200
fetching : https://en.wikipedia.org/wiki/Search_algorithm...
200
fetching : https://en.wikipedia.org/wiki/Sorting_algorithm...
200
fetching : https://en.wikipedia.org/wiki/Merge_algorithm...
200
fetching : https://en.wikipedia.org/wiki/Numerical_analysis...
200
fetching : https://en.wikipedia.org/wiki/Graph_theory...
200
fetching : https://en.wikipedia.org/wiki/String_algorithms...
200
fetching : https://en.wikipedia.org/wiki/Computational_geometry...
200
fetching : https://en.wikipedia.org/wiki/Combinatorial...
200
fetching : https://en.wikipedia.org/wiki/Medical_algorithm...
200
fetching : https://en.wikipedia.org/wiki/Machine_learning...
200
fetching : https://en.wikipedia.org/wiki/Cryptography...
200
fetching : https://en.wikipedia.org/wiki/Data_compression...
200
fetching : https://en.wikipedia.org/wiki/Parsing...
200
fetching : https://en.wikipedia.org/wiki/Complexity_class...
200
fetching : https://en.wikipedia.org/wiki/Parameterized_complexity...
200
fetching : https://en.w

200
fetching : https://en.wikipedia.org/wiki/List_of_algorithm_general_topics...
200
fetching : https://en.wikipedia.org/wiki/List_of_important_publications_in_theoretical_computer_science#Algorithms...
200
fetching : https://en.wikipedia.org/wiki/Theory_of_computation...
200
fetching : https://en.wikipedia.org/wiki/Computability_theory...
200
fetching : https://en.wikipedia.org/wiki/Computational_complexity_theory...
200
fetching : https://en.wikipedia.org/wiki/Zero...
200
fetching : https://en.wikipedia.org/wiki/Quantity...
200
fetching : https://en.wikipedia.org/wiki/International_Standard_Book_Number...
200
fetching : https://en.wikipedia.org/wiki/Special:BookSources/9781118460290...
200
fetching : https://en.wikipedia.org/wiki/University_of_Indianapolis...
200
fetching : https://en.wikipedia.org/wiki/International_Standard_Book_Number...
200
fetching : https://en.wikipedia.org/wiki/Special:BookSources/978-1-4042-0513-0...
200
fetching : https://en.wikipedia.org/wiki/Carl_B._Boyer.

200
fetching : https://en.wikipedia.org/wiki/Simon_and_Schuster...
200
fetching : https://en.wikipedia.org/wiki/International_Standard_Book_Number...
200
fetching : https://en.wikipedia.org/wiki/Special:BookSources/0-671-49207-1...
200
fetching : https://en.wikipedia.org/wiki/International_Standard_Book_Number...
200
fetching : https://en.wikipedia.org/wiki/Special:BookSources/0-671-49207-1...
200
fetching : https://en.wikipedia.org/wiki/Stephen_Kleene...
200
fetching : https://en.wikipedia.org/wiki/Digital_object_identifier...
200
fetching : https://en.wikipedia.org/wiki/Stephen_Kleene...
200
fetching : https://en.wikipedia.org/wiki/Digital_object_identifier...
200
fetching : https://en.wikipedia.org/wiki/JSTOR...
200
fetching : https://en.wikipedia.org/wiki/Church_thesis...
200
fetching : https://en.wikipedia.org/wiki/Kleene...
200
fetching : https://en.wikipedia.org/wiki/International_Standard_Book_Number...
200
fetching : https://en.wikipedia.org/wiki/Special:BookSources/0-7204-210

200
fetching : https://en.wikipedia.org/wiki/Category:Articles_with_Curlie_links...
200
fetching : https://en.wikipedia.org/wiki/Category:Wikipedia_articles_with_BNE_identifiers...
200
fetching : https://en.wikipedia.org/wiki/Category:Wikipedia_articles_with_BNF_identifiers...
200
fetching : https://en.wikipedia.org/wiki/Category:Wikipedia_articles_with_GND_identifiers...
200
fetching : https://en.wikipedia.org/wiki/Category:Wikipedia_articles_with_LCCN_identifiers...
200
fetching : https://en.wikipedia.org/wiki/Category:Wikipedia_articles_with_NDL_identifiers...
200
fetching : https://en.wikipedia.org/wiki/Category:Articles_with_example_pseudocode...
200
fetching : https://en.wikipedia.org/wiki/Special:MyTalk...
200
fetching : https://en.wikipedia.org/wiki/Special:MyContributions...
200
fetching : https://en.wikipedia.org/wiki/Algorithm...
200
fetching : https://en.wikipedia.org/wiki/Talk:Algorithm...
200
fetching : https://en.wikipedia.org/wiki/Algorithm...
200
fetching : https://en.

Exception in thread Thread-23:
Traceback (most recent call last):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw)
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\util\connection.py", line 57, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connectionpool.p

200
fetching : https://en.wikipedia.org/wiki/Wikipedia:File_Upload_Wizard...
200
fetching : https://en.wikipedia.org/wiki/Special:SpecialPages...
200
fetching : https://www.wikidata.org/wiki/Special:EntityPage/Q8366...
200
fetching : https://commons.wikimedia.org/wiki/Category:Algorithms...
200
fetching : https://en.wikibooks.org/wiki/Algorithms...
200
fetching : https://en.wikiquote.org/wiki/Algorithms...
200
fetching : https://af.wikipedia.org/wiki/Algoritme...
200
fetching : https://als.wikipedia.org/wiki/Algorithmus...
200
fetching : https://am.wikipedia.org/wiki/%E1%8A%A0%E1%88%8D%E1%8C%8E%E1%88%AA%E1%8B%9D%E1%88%9D...
200
fetching : https://ar.wikipedia.org/wiki/%D8%AE%D9%88%D8%A7%D8%B1%D8%B2%D9%85%D9%8A%D8%A9...
200
fetching : https://an.wikipedia.org/wiki/Algorismo...
200
fetching : https://as.wikipedia.org/wiki/%E0%A6%8F%E0%A6%B2%E0%A6%97%E0%A7%B0%E0%A6%BF%E0%A6%A5%E0%A6%AE_%E0%A6%86%E0%A7%B0%E0%A7%81_%E0%A6%A1%E0%A7%87%E0%A6%87%E0%A6%9F%E0%A6%BE_%E0%A6%B7%E0%A7%8D%E0%A6%9F%E0

Exception in thread Thread-24:
Traceback (most recent call last):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connectionpool.py", line 343, in _make_request
    self._validate_conn(conn)
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connectionpool.py", line 839, in _validate_conn
    conn.connect()
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connection.py", line 344, in connect
    ssl_context=context)
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\util\ssl_.py", line 344, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_ho

200
fetching : https://uz.wikipedia.org/wiki/Algoritm...
200
fetching : https://pa.wikipedia.org/wiki/%E0%A8%95%E0%A8%B2%E0%A8%A8_%E0%A8%B5%E0%A8%BF%E0%A8%A7%E0%A9%80...
200
fetching : https://pnb.wikipedia.org/wiki/%D8%A7%D9%84%DA%AF%D9%88%D8%B1%D8%AA%DA%BE%D9%85...
200
fetching : https://pl.wikipedia.org/wiki/Algorytm...
200
fetching : https://pt.wikipedia.org/wiki/Algoritmo...
200
fetching : https://kaa.wikipedia.org/wiki/Algoritm...
200
fetching : https://ro.wikipedia.org/wiki/Algoritm...
200
fetching : https://rue.wikipedia.org/wiki/%D0%90%D0%BB%D2%91%D0%BE%D1%80%D1%96%D1%82%D0%BC...
200
fetching : https://ru.wikipedia.org/wiki/%D0%90%D0%BB%D0%B3%D0%BE%D1%80%D0%B8%D1%82%D0%BC...
200
fetching : https://sah.wikipedia.org/wiki/%D0%90%D0%BB%D0%B3%D0%BE%D1%80%D0%B8%D1%82%D0%BC...
200
fetching : https://sco.wikipedia.org/wiki/Algorithm...
200
fetching : https://sq.wikipedia.org/wiki/Algoritmi...
200
fetching : https://scn.wikipedia.org/wiki/Alguritmu...
200
fetching : https://si.wikiped

Exception in thread Thread-22:
Traceback (most recent call last):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw)
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\util\connection.py", line 80, in create_connection
    raise err
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\util\connection.py", line 70, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "c:\users\

Exception in thread Thread-21:
Traceback (most recent call last):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw)
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\util\connection.py", line 80, in create_connection
    raise err
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\util\connection.py", line 70, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\administrator.pc-20180514durk\appdata\local\programs\python\python37\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "c:\users\

In [None]:
main(start)

In [None]:
resp = fetch(start)
t = parse(resp)

In [None]:
t