# Web Scraping with Python

In [1]:
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode())
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


crawl_sitemap('https://webscraping.com/sitemap.xml')


Downloading: https://webscraping.com/sitemap.xml
Downloading: http://webscraping.com
Downloading: http://webscraping.com/about
Downloading: http://webscraping.com/blog
Downloading: http://webscraping.com/blog/10/
Downloading: http://webscraping.com/blog/11/
Downloading: http://webscraping.com/blog/12/
Downloading: http://webscraping.com/blog/13/
Downloading: http://webscraping.com/blog/2/
Downloading: http://webscraping.com/blog/3/
Downloading: http://webscraping.com/blog/4/
Downloading: http://webscraping.com/blog/5/
Downloading: http://webscraping.com/blog/6/
Downloading: http://webscraping.com/blog/7/
Downloading: http://webscraping.com/blog/8/
Downloading: http://webscraping.com/blog/9/
Downloading: http://webscraping.com/blog/All-your-data-are-belong-to-us/
Downloading: http://webscraping.com/blog/Android-Apps-Update/
Downloading: http://webscraping.com/blog/Apple-Apps-Update/
Downloading: http://webscraping.com/blog/Asynchronous-support-in-Python/
Downloading: http://webscraping.

Downloading: http://webscraping.com/blog/category/website/
Downloading: http://webscraping.com/blog/category/xpath
Downloading: http://webscraping.com/contact
Downloading: http://webscraping.com/data
Downloading: http://webscraping.com/data/default/database/1/belgium-zip-codes
Downloading: http://webscraping.com/data/default/database/10/usa-restaurants
Downloading: http://webscraping.com/data/default/database/11/android-apps
Downloading: http://webscraping.com/data/default/database/12/universal-product-codes-upc-details
Downloading: http://webscraping.com/data/default/database/13/international-standard-book-numbers-isbn
Downloading: http://webscraping.com/data/default/database/14/hotel-details
Downloading: http://webscraping.com/data/default/database/17/aircraft-models
Downloading: http://webscraping.com/data/default/database/2/usa-cities
Downloading: http://webscraping.com/data/default/database/23/films
Downloading: http://webscraping.com/data/default/database/29/popular-websites
Down

In [2]:
import itertools
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode())
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is None:
        break
    else:
        # success - can scrape the result
        pass


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: NOT FOUND


In [5]:
import itertools
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.code, e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0
for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is None:
        # received an error trying to download this webpage
        num_errors += 1
        if num_errors == max_errors:
            # reached maximum number of
            # consecutive errors so exit
            break


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: 404 NOT FOUND
Downloading: http://example.webscraping.com/places/default/view/-254
Down

In [7]:
import itertools
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0
for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is None:
        # received an error trying to download this webpage
        num_errors += 1
        if num_errors == max_errors:
            # reached maximum number of
            # consecutive errors so exit
            break


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: NOT FOUND
Downloading: http://example.webscraping.com/places/default/view/-254
Download

In [8]:
import itertools
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0
for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is None:
        # received an error trying to download this webpage
        num_errors += 1
        if num_errors == max_errors:
            # reached maximum number of
            # consecutive errors so exit
            break
    else:
        # success - can scrape the resule
        # ...
        num_errors = 0


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: NOT FOUND
Downloading: http://example.webscraping.com/places/default/view/-254
Download

In [1]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_quene = [seed_url]
    while crawl_quene:
        url = crawl_quene.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_quene.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


html = download('http://example.webscraping.com/places/default').decode()
get_links(html)

Downloading: http://example.webscraping.com/places/default


['/places/default/index',
 '#',
 '/places/default/user/register?_next=/places/default/index',
 '/places/default/user/login?_next=/places/default/index',
 '/places/default/index',
 '/places/default/search',
 '/places/default/view/Afghanistan-1',
 '/places/default/view/Aland-Islands-2',
 '/places/default/view/Albania-3',
 '/places/default/view/Algeria-4',
 '/places/default/view/American-Samoa-5',
 '/places/default/view/Andorra-6',
 '/places/default/view/Angola-7',
 '/places/default/view/Anguilla-8',
 '/places/default/view/Antarctica-9',
 '/places/default/view/Antigua-and-Barbuda-10',
 '/places/default/index/1']

In [5]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_quene = [seed_url]
    while crawl_quene:
        url = crawl_quene.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_quene.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com/places/default', '/places/default/(index|view)')

Downloading: http://example.webscraping.com/places/default
Downloading: /places/default/index/1


ValueError: unknown url type: '/places/default/index/1'

In [1]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse


def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_quene = [seed_url]
    while crawl_quene:
        url = crawl_quene.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urllib.parse.urljoin(seed_url, link)
                crawl_quene.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

KeyboardInterrupt: 

爬取任何网站，User-Agent必须伪装成正常浏览器的UA，爬虫类的UA极易被屏蔽或是限制。

In [5]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse


def download(url, user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36', num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track which URL's have seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urllib.parse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

Downloading: http://example.webscraping.com/places/default/view/Palestinian-Territory-171
Downloading: http://example.webscraping.com/places/default/view/Palau-170
Downloading: http://example.webscraping.com/places/default/view/Pakistan-169
Downloading: http://example.webscraping.com/places/default/view/Oman-168
Downloading: http://example.webscraping.com/places/default/view/Norway-167
Downloading: http://example.webscraping.com/places/default/view/Northern-Mariana-Islands-166
Downloading: http://example.webscraping.com/places/default/view/North-Korea-165
Downloading: http://example.webscraping.com/places/default/view/Norfolk-Island-164
Downloading: http://example.webscraping.com/places/default/view/Niue-163
Downloading: http://example.webscraping.com/places/default/view/Nigeria-162
Downloading: http://example.webscraping.com/places/default/view/Niger-161
Downloading: http://example.webscraping.com/places/default/view/Nicaragua-160
Downloading: http://example.webscraping.com/places/def

Downloading: http://example.webscraping.com/places/default/view/Ecuador-65
Downloading: http://example.webscraping.com/places/default/view/East-Timor-64
Downloading: http://example.webscraping.com/places/default/view/Dominican-Republic-63
Downloading: http://example.webscraping.com/places/default/view/Dominica-62
Downloading: http://example.webscraping.com/places/default/view/Djibouti-61
Downloading: http://example.webscraping.com/places/default/view/Denmark-60
Downloading: http://example.webscraping.com/places/default/view/Democratic-Republic-of-the-Congo-59
Downloading: http://example.webscraping.com/places/default/view/Czech-Republic-58
Downloading: http://example.webscraping.com/places/default/view/Cyprus-57
Downloading: http://example.webscraping.com/places/default/view/Curacao-56
Downloading: http://example.webscraping.com/places/default/view/Cuba-55
Downloading: http://example.webscraping.com/places/default/view/Croatia-54
Downloading: http://example.webscraping.com/places/defau

In [7]:
import urllib.robotparser


rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
url = 'http://example.webscraping.com'
user_agent = 'BadCrawler'
print(rp.can_fetch(user_agent, url))
user_agent = 'GoodCrawler'
print(rp.can_fetch(user_agent, url))

False
True


In [4]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse
import urllib.robotparser
import time


user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()


def download(url, user_agent=user_agent, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track which URL's have seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            time.sleep(1)
            html = download(url)
        else:
            print('Blocked by robots.txt:', url)
            html = None
        # filter for links matching our regular expression
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urllib.parse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

Downloading: http://example.webscraping.com/places/default/view/Palestinian-Territory-171
Downloading: http://example.webscraping.com/places/default/view/Palau-170
Downloading: http://example.webscraping.com/places/default/view/Pakistan-169
Downloading: http://example.webscraping.com/places/default/view/Oman-168
Downloading: http://example.webscraping.com/places/default/view/Norway-167
Downloading: http://example.webscraping.com/places/default/view/Northern-Mariana-Islands-166
Downloading: http://example.webscraping.com/places/default/view/North-Korea-165
Downloading: http://example.webscraping.com/places/default/view/Norfolk-Island-164
Downloading: http://example.webscraping.com/places/default/view/Niue-163
Downloading: http://example.webscraping.com/places/default/view/Nigeria-162
Downloading: http://example.webscraping.com/places/default/view/Niger-161
Downloading: http://example.webscraping.com/places/default/view/Nicaragua-160
Downloading: http://example.webscraping.com/places/def

Downloading: http://example.webscraping.com/places/default/view/Ecuador-65
Downloading: http://example.webscraping.com/places/default/view/East-Timor-64
Downloading: http://example.webscraping.com/places/default/view/Dominican-Republic-63
Downloading: http://example.webscraping.com/places/default/view/Dominica-62
Downloading: http://example.webscraping.com/places/default/view/Djibouti-61
Downloading: http://example.webscraping.com/places/default/view/Denmark-60
Downloading: http://example.webscraping.com/places/default/view/Democratic-Republic-of-the-Congo-59
Downloading: http://example.webscraping.com/places/default/view/Czech-Republic-58
Downloading: http://example.webscraping.com/places/default/view/Cyprus-57
Downloading: http://example.webscraping.com/places/default/view/Curacao-56
Downloading: http://example.webscraping.com/places/default/view/Cuba-55
Downloading: http://example.webscraping.com/places/default/view/Croatia-54
Downloading: http://example.webscraping.com/places/defau

##### 支持代理的Python 3代码

In [1]:
# coding=utf-8

from urllib.parse import urlparse
from urllib.request import Request, build_opener, ProxyHandler
from bs4 import BeautifulSoup

url = 'http://ip138.com'
_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0'}

proxy = '127.0.0.1:1080'
opener = build_opener()
proxy_params = {urlparse(url).scheme: proxy}
opener.add_handler(ProxyHandler(proxy_params))
req = Request(url, headers=_headers)
page = opener.open(req).read()
soup = BeautifulSoup(page, 'lxml')
target = soup.find_all('iframe')
for iframe in target:
    src_page = opener.open(Request(iframe.attrs['src'], headers=_headers))
    iframe_soup = BeautifulSoup(src_page, 'lxml')
    print(iframe_soup.center.get_text())


您的IP是：[45.77.166.94] 来自：美国


In [4]:
# coding=utf-8

from urllib.request import Request, build_opener, ProxyHandler
from urllib.parse import urlparse
from bs4 import BeautifulSoup

url = 'https://www.whatismybrowser.com/'
_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0'}

# Proxy setting start
proxy = '127.0.0.1:1080'
opener = build_opener()
proxy_params = {urlparse(url).scheme: proxy}
opener.add_handler(ProxyHandler(proxy_params))
# Proxy setting end

req = Request(url, headers=_headers)
page = opener.open(req).read()
soup = BeautifulSoup(page, 'lxml')
target_div = soup.find_all('div', class_="user-agent")[0].a.get_text()
print(target_div)


Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0


In [7]:
from urllib.request import Request, build_opener, ProxyHandler
from urllib.parse import urlparse

from bs4 import BeautifulSoup

url = 'https://www.youtube.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Proxy setting start
proxy = '127.0.0.1:1080'
proxy_params = {urlparse(url).scheme: proxy}
opener = build_opener()
opener.add_handler(ProxyHandler(proxy_params))
# Proxy setting end

req = Request(url, headers=headers)
page = opener.open(req).read()
soup = BeautifulSoup(page, 'lxml')
print(soup.title.contents[0])


YouTube


In [8]:
from urllib.request import Request, build_opener, ProxyHandler
from urllib.parse import urlparse

from bs4 import BeautifulSoup

url = 'https://twitter.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Proxy setting start
proxy = '127.0.0.1:1080'
proxy_params = {urlparse(url).scheme: proxy}
opener = build_opener()
opener.add_handler(ProxyHandler(proxy_params))
# Proxy setting end

req = Request(url, headers=headers)
page = opener.open(req).read()
soup = BeautifulSoup(page, 'lxml')
print(soup.title.contents[0])


Twitter. It's what's happening.


#### 支持代理、自定义User-Agent、异常检测及自动下载重试的download函数

In [7]:
# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import urllib.error


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
url= 'https://www.facebook.com'

# Proxy setting start
proxy = '127.0.0.1:1087'  # macOS上的shadowsocksX-NG
proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
opener = urllib.request.build_opener()
opener.add_handler(urllib.request.ProxyHandler(proxy_params))
# Proxy setting end

download(url, ua, proxy)


Downloading: https://www.facebook.com




In [2]:
# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import urllib.error


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


url= 'https://twitter.com'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
proxy = '127.0.0.1:1080'  # Windows上的shadowsocks

download(url, ua, proxy, 2)


Downloading: https://twitter.com




In [4]:
# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import urllib.error


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


url= 'https://www.baidu.com'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
proxy = '127.0.0.1:1080'  # Windows上的shadowsocks

download(url, ua, proxy, 2)


Downloading: https://www.baidu.com




In [5]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse
import urllib.robotparser
import time
import datetime


user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()


class Throttle:
    """Add a delay between downloads to the same domain
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - \
                (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()

        
throttle=Throttle(2)


def download(url, user_agent=user_agent, num_retries=2):
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track which URL's have seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
        else:
            print('Blocked by robots.txt:', url)
            html = None
        # filter for links matching our regular expression
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urllib.parse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

Downloading: http://example.webscraping.com/places/default/view/Palestinian-Territory-171
Downloading: http://example.webscraping.com/places/default/view/Palau-170
Downloading: http://example.webscraping.com/places/default/view/Pakistan-169
Downloading: http://example.webscraping.com/places/default/view/Oman-168
Downloading: http://example.webscraping.com/places/default/view/Norway-167
Downloading: http://example.webscraping.com/places/default/view/Northern-Mariana-Islands-166
Downloading: http://example.webscraping.com/places/default/view/North-Korea-165
Downloading: http://example.webscraping.com/places/default/view/Norfolk-Island-164
Downloading: http://example.webscraping.com/places/default/view/Niue-163
Downloading: http://example.webscraping.com/places/default/view/Nigeria-162
Downloading: http://example.webscraping.com/places/default/view/Niger-161
Downloading: http://example.webscraping.com/places/default/view/Nicaragua-160
Downloading: http://example.webscraping.com/places/def

Downloading: http://example.webscraping.com/places/default/view/Ecuador-65
Downloading: http://example.webscraping.com/places/default/view/East-Timor-64
Downloading: http://example.webscraping.com/places/default/view/Dominican-Republic-63
Downloading: http://example.webscraping.com/places/default/view/Dominica-62
Downloading: http://example.webscraping.com/places/default/view/Djibouti-61
Downloading: http://example.webscraping.com/places/default/view/Denmark-60
Downloading: http://example.webscraping.com/places/default/view/Democratic-Republic-of-the-Congo-59
Downloading: http://example.webscraping.com/places/default/view/Czech-Republic-58
Downloading: http://example.webscraping.com/places/default/view/Cyprus-57
Downloading: http://example.webscraping.com/places/default/view/Curacao-56
Downloading: http://example.webscraping.com/places/default/view/Cuba-55
Downloading: http://example.webscraping.com/places/default/view/Croatia-54
Downloading: http://example.webscraping.com/places/defau

In [6]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse
import urllib.robotparser
import time
import datetime


user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()


class Throttle:
    """Add a delay between downloads to the same domain
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - \
                (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()

        
throttle=Throttle(1)


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track which URL's have seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, user_agent, proxy, 2)
        else:
            print('Blocked by robots.txt:', url)
            html = None
        # filter for links matching our regular expression
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urllib.parse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


proxy = '127.0.0.1:1080'
link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

Downloading: http://example.webscraping.com/places/default/view/Palestinian-Territory-171
Downloading: http://example.webscraping.com/places/default/view/Palau-170
Downloading: http://example.webscraping.com/places/default/view/Pakistan-169
Downloading: http://example.webscraping.com/places/default/view/Oman-168
Downloading: http://example.webscraping.com/places/default/view/Norway-167
Downloading: http://example.webscraping.com/places/default/view/Northern-Mariana-Islands-166
Downloading: http://example.webscraping.com/places/default/view/North-Korea-165
Downloading: http://example.webscraping.com/places/default/view/Norfolk-Island-164
Downloading: http://example.webscraping.com/places/default/view/Niue-163
Downloading: http://example.webscraping.com/places/default/view/Nigeria-162
Downloading: http://example.webscraping.com/places/default/view/Niger-161
Downloading: http://example.webscraping.com/places/default/view/Nicaragua-160
Downloading: http://example.webscraping.com/places/def

Downloading: http://example.webscraping.com/places/default/view/Ecuador-65
Downloading: http://example.webscraping.com/places/default/view/East-Timor-64
Downloading: http://example.webscraping.com/places/default/view/Dominican-Republic-63
Downloading: http://example.webscraping.com/places/default/view/Dominica-62
Downloading: http://example.webscraping.com/places/default/view/Djibouti-61
Downloading: http://example.webscraping.com/places/default/view/Denmark-60
Downloading: http://example.webscraping.com/places/default/view/Democratic-Republic-of-the-Congo-59
Downloading: http://example.webscraping.com/places/default/view/Czech-Republic-58
Downloading: http://example.webscraping.com/places/default/view/Cyprus-57
Downloading: http://example.webscraping.com/places/default/view/Curacao-56
Downloading: http://example.webscraping.com/places/default/view/Cuba-55
Downloading: http://example.webscraping.com/places/default/view/Croatia-54
Downloading: http://example.webscraping.com/places/defau

In [11]:
# coding=utf-8
import datetime
import re
import time
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser
import sys


user_agent = 'BadCrawler'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()


class Throttle:
    """Add a delay between downloads to the same domain
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - \
                (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()

        
throttle=Throttle(1)


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex, max_depth=2):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track which URL's have seen before
    seen = {seed_url: 0}
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, user_agent, proxy, 2)
        else:
            print('Blocked by robots.txt:', url)
            html = None
            sys.exit()
        # filter for links matching our regular expression
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                # check if link matches expected regex
                if re.match(link_regex, link):
                    # form absolute link
                    link = urllib.parse.urljoin(seed_url, link)
                    # check if have already seen this link
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com', '/places/default/(index|view)', 2)


Blocked by robots.txt: http://example.webscraping.com


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [12]:
# coding=utf-8
import datetime
import re
import time
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser
import sys


user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
proxy = None


class Throttle:
    """Add a delay between downloads to the same domain
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - \
                (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()

        
throttle=Throttle(1)


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex, max_depth=2):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track which URL's have seen before
    seen = {seed_url: 0}
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, user_agent, proxy, 2)
        else:
            print('Blocked by robots.txt:', url)
            html = None
            sys.exit()
        # filter for links matching our regular expression
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                # check if link matches expected regex
                if re.match(link_regex, link):
                    # form absolute link
                    link = urllib.parse.urljoin(seed_url, link)
                    # check if have already seen this link
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


link_crawler('http://example.webscraping.com', '/places/default/(index|view)', 1)


Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/view/Antigua-and-Barbuda-10
Downloading: http://example.webscraping.com/places/default/view/Antarctica-9
Downloading: http://example.webscraping.com/places/default/view/Anguilla-8
Downloading: http://example.webscraping.com/places/default/view/Angola-7
Downloading: http://example.webscraping.com/places/default/view/Andorra-6
Downloading: http://example.webscraping.com/places/default/view/American-Samoa-5
Downloading: http://example.webscraping.com/places/default/view/Algeria-4
Downloading: http://example.webscraping.com/places/default/view/Albania-3
Downloading: http://example.webscraping.com/places/default/view/Aland-Islands-2
Downloading: http://example.webscraping.com/places/default/view/Afghanistan-1
Downloading: http://example.webscraping.com/places/default/index


In [13]:
import re
import urllib.request
import urllib.parse
import urllib.error


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
proxy = None
html = download(url)
re.findall('<td class="w2p_fw">(.*?)</td>', html)


Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239


['<img src="/places/static/images/flags/gb.png" />',
 '244,820 square kilometres',
 '62,348,447',
 'GB',
 'United Kingdom',
 'London',
 '<a href="/places/default/continent/EU">EU</a>',
 '.uk',
 'GBP',
 'Pound',
 '44',
 '@# #@@|@## #@@|@@# #@@|@@## #@@|@#@ #@@|@@#@ #@@|GIR0AA',
 '^(([A-Z]\\d{2}[A-Z]{2})|([A-Z]\\d{3}[A-Z]{2})|([A-Z]{2}\\d{2}[A-Z]{2})|([A-Z]{2}\\d{3}[A-Z]{2})|([A-Z]\\d[A-Z]\\d[A-Z]{2})|([A-Z]{2}\\d[A-Z]\\d[A-Z]{2})|(GIR0AA))$',
 'en-GB,cy-GB,gd',
 '<div><a href="/places/default/iso/IE">IE </a></div>']

In [16]:
import re
import urllib.request
import urllib.parse
import urllib.error


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
proxy = None
html = download(url)
re.findall('<td class="w2p_fw">(.*?)</td>', html)[1]


Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239


'244,820 square kilometres'

In [18]:
import re
import urllib.request
import urllib.parse
import urllib.error


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
proxy = None
html = download(url)
re.findall('<tr id="places_area__row"><td class="w2p_fl"><label class="readonly" for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">(.*?)</td>', html)

Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239


['244,820 square kilometres']

In [19]:
import re
import urllib.request
import urllib.parse
import urllib.error


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
proxy = None
html = download(url)
re.findall('<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>', html)

Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239


['244,820 square kilometres']

In [20]:
from bs4 import BeautifulSoup


broken_html = '<ul class=country><li>Area<li>Population</ul>'
# parse the HTML
soup = BeautifulSoup(broken_html, 'html.parser')
fixed_html = soup.prettify()
print(fixed_html)


<ul class="country">
 <li>
  Area
  <li>
   Population
  </li>
 </li>
</ul>


In [21]:
from bs4 import BeautifulSoup


broken_html = '<ul class=country><li>Area<li>Population</ul>'
# parse the HTML
soup = BeautifulSoup(broken_html, 'lxml')
fixed_html = soup.prettify()
print(fixed_html)


<html>
 <body>
  <ul class="country">
   <li>
    Area
   </li>
   <li>
    Population
   </li>
  </ul>
 </body>
</html>


##### BeautifulSoup的prettify()方法、find()方法和find_all()方法  
find()方法和find_all()方法貌似包含prettify()的功能

In [22]:
from bs4 import BeautifulSoup


broken_html = '<ul class=country><li>Area<li>Population</ul>'
# parse the HTML
soup = BeautifulSoup(broken_html, 'lxml')
# fixed_html = soup.prettify()
# print(fixed_html)
ul = soup.find('ul', attrs={'class': 'country'})
print(ul.find('li'))  # returns just the first match
print(ul.find_all('li'))  # returns all matches


<li>Area</li>
[<li>Area</li>, <li>Population</li>]


In [24]:
from bs4 import BeautifulSoup


url = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
html = download(url)
soup = BeautifulSoup(html, 'lxml')
# locate the area row
tr = soup.find(attrs={'id': 'places_area__row'})
td = tr.find(attrs={'class': 'w2p_fw'})  # locate the area tag
area = td.text  # extract the text from this tag
print(area)


Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239
244,820 square kilometres


In [25]:
import lxml.html


broken_html = '<ul class=country><li>Area<li>Population</ul>'
tree = lxml.html.fromstring(broken_html)  # parse the HTML
fixed_html = lxml.html.tostring(tree, pretty_print=True)
print(fixed_html)


b'<ul class="country">\n<li>Area</li>\n<li>Population</li>\n</ul>\n'


In [26]:
import lxml.html
import lxml.etree


broken_html = '<ul class=country><li>Area<li>Population</ul>'
tree = lxml.html.fromstring(broken_html)  # parse the HTML
fixed_html = lxml.html.tostring(tree, pretty_print=True, encoding='unicode')
print(fixed_html)
print('---------------------')
etree_html = lxml.etree.tostring(tree, pretty_print=True, encoding='unicode')
print(etree_html)


<ul class="country">
<li>Area</li>
<li>Population</li>
</ul>

---------------------
<ul class="country">
  <li>Area</li>
  <li>Population</li>
</ul>



In [27]:
# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import urllib.error
import lxml.html


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


url= 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
html = download(url, ua)
tree = lxml.html.fromstring(html)
td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
area = td.text_content()
print(area)


Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239
244,820 square kilometres


In [28]:
import re
import lxml.html
import time
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent', \
          'tld', 'currency_code', 'currency_name', 'phone', \
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


def re_scraper(html):
    results = {}
    for field in FIELDS:
        results[field] = re.search('<tr id="places_{}__row">.*?<td class="w2p_fw">(.*?)</td>'.format(field), html).groups()[0]
    return results


def bs_scraper(html):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        results[field] = soup.find('table').find('tr', id='places_{}__row'.format(field)).find('td', class_='w2p_fw').text
    return results


def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content()
    return results


NUM_ITERATIONS = 1000  # number of times to test each scraper
html = download('http://example.webscraping.com/places/default/view/United-Kingdom-239')
for name, scraper in [('Regular expressions', re_scraper), ('BeautifulSoup', bs_scraper), ('Lxml', lxml_scraper)]:
    # record start time of scrape
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            re.purge()
        result = scraper(html)
        # check scraped result is as expected
        assert(result['area'] == '244,820 square kilometres')
        # record end time of scrape and output the total
        end = time.time()
        print('%s: %.2f seconds' % (name, end - start))


Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239
Regular expressions: 0.01 seconds
Regular expressions: 0.02 seconds
Regular expressions: 0.03 seconds
Regular expressions: 0.04 seconds
Regular expressions: 0.04 seconds
Regular expressions: 0.05 seconds
Regular expressions: 0.05 seconds
Regular expressions: 0.06 seconds
Regular expressions: 0.06 seconds
Regular expressions: 0.07 seconds
Regular expressions: 0.07 seconds
Regular expressions: 0.08 seconds
Regular expressions: 0.08 seconds
Regular expressions: 0.09 seconds
Regular expressions: 0.09 seconds
Regular expressions: 0.09 seconds
Regular expressions: 0.10 seconds
Regular expressions: 0.10 seconds
Regular expressions: 0.11 seconds
Regular expressions: 0.11 seconds
Regular expressions: 0.12 seconds
Regular expressions: 0.12 seconds
Regular expressions: 0.12 seconds
Regular expressions: 0.13 seconds
Regular expressions: 0.13 seconds
Regular expressions: 0.14 seconds
Regular expressions: 0.14 seconds

Regular expressions: 1.33 seconds
Regular expressions: 1.33 seconds
Regular expressions: 1.34 seconds
Regular expressions: 1.34 seconds
Regular expressions: 1.35 seconds
Regular expressions: 1.35 seconds
Regular expressions: 1.35 seconds
Regular expressions: 1.36 seconds
Regular expressions: 1.36 seconds
Regular expressions: 1.37 seconds
Regular expressions: 1.37 seconds
Regular expressions: 1.38 seconds
Regular expressions: 1.39 seconds
Regular expressions: 1.39 seconds
Regular expressions: 1.40 seconds
Regular expressions: 1.40 seconds
Regular expressions: 1.41 seconds
Regular expressions: 1.41 seconds
Regular expressions: 1.42 seconds
Regular expressions: 1.42 seconds
Regular expressions: 1.43 seconds
Regular expressions: 1.43 seconds
Regular expressions: 1.44 seconds
Regular expressions: 1.44 seconds
Regular expressions: 1.45 seconds
Regular expressions: 1.45 seconds
Regular expressions: 1.46 seconds
Regular expressions: 1.46 seconds
Regular expressions: 1.46 seconds
Regular expres

Regular expressions: 2.49 seconds
Regular expressions: 2.49 seconds
Regular expressions: 2.50 seconds
Regular expressions: 2.51 seconds
Regular expressions: 2.51 seconds
Regular expressions: 2.52 seconds
Regular expressions: 2.53 seconds
Regular expressions: 2.53 seconds
Regular expressions: 2.54 seconds
Regular expressions: 2.54 seconds
Regular expressions: 2.55 seconds
Regular expressions: 2.55 seconds
Regular expressions: 2.56 seconds
Regular expressions: 2.56 seconds
Regular expressions: 2.57 seconds
Regular expressions: 2.57 seconds
Regular expressions: 2.57 seconds
Regular expressions: 2.58 seconds
Regular expressions: 2.58 seconds
Regular expressions: 2.59 seconds
Regular expressions: 2.59 seconds
Regular expressions: 2.60 seconds
Regular expressions: 2.60 seconds
Regular expressions: 2.61 seconds
Regular expressions: 2.61 seconds
Regular expressions: 2.62 seconds
Regular expressions: 2.62 seconds
Regular expressions: 2.63 seconds
Regular expressions: 2.63 seconds
Regular expres

Regular expressions: 3.81 seconds
Regular expressions: 3.82 seconds
Regular expressions: 3.82 seconds
Regular expressions: 3.83 seconds
Regular expressions: 3.83 seconds
Regular expressions: 3.84 seconds
Regular expressions: 3.84 seconds
Regular expressions: 3.85 seconds
Regular expressions: 3.85 seconds
Regular expressions: 3.85 seconds
Regular expressions: 3.86 seconds
Regular expressions: 3.87 seconds
Regular expressions: 3.87 seconds
Regular expressions: 3.88 seconds
Regular expressions: 3.88 seconds
Regular expressions: 3.89 seconds
Regular expressions: 3.89 seconds
Regular expressions: 3.90 seconds
Regular expressions: 3.90 seconds
Regular expressions: 3.90 seconds
Regular expressions: 3.91 seconds
Regular expressions: 3.91 seconds
Regular expressions: 3.92 seconds
Regular expressions: 3.92 seconds
Regular expressions: 3.92 seconds
Regular expressions: 3.93 seconds
Regular expressions: 3.93 seconds
Regular expressions: 3.93 seconds
Regular expressions: 3.94 seconds
Regular expres

BeautifulSoup: 0.43 seconds
BeautifulSoup: 0.48 seconds
BeautifulSoup: 0.53 seconds
BeautifulSoup: 0.57 seconds
BeautifulSoup: 0.61 seconds
BeautifulSoup: 0.66 seconds
BeautifulSoup: 0.71 seconds
BeautifulSoup: 0.75 seconds
BeautifulSoup: 0.78 seconds
BeautifulSoup: 0.82 seconds
BeautifulSoup: 0.86 seconds
BeautifulSoup: 0.89 seconds
BeautifulSoup: 0.94 seconds
BeautifulSoup: 0.97 seconds
BeautifulSoup: 1.01 seconds
BeautifulSoup: 1.04 seconds
BeautifulSoup: 1.07 seconds
BeautifulSoup: 1.11 seconds
BeautifulSoup: 1.14 seconds
BeautifulSoup: 1.19 seconds
BeautifulSoup: 1.23 seconds
BeautifulSoup: 1.27 seconds
BeautifulSoup: 1.30 seconds
BeautifulSoup: 1.33 seconds
BeautifulSoup: 1.37 seconds
BeautifulSoup: 1.41 seconds
BeautifulSoup: 1.45 seconds
BeautifulSoup: 1.49 seconds
BeautifulSoup: 1.53 seconds
BeautifulSoup: 1.57 seconds
BeautifulSoup: 1.60 seconds
BeautifulSoup: 1.65 seconds
BeautifulSoup: 1.69 seconds
BeautifulSoup: 1.72 seconds
BeautifulSoup: 1.75 seconds
BeautifulSoup: 1.79 

BeautifulSoup: 12.20 seconds
BeautifulSoup: 12.24 seconds
BeautifulSoup: 12.28 seconds
BeautifulSoup: 12.33 seconds
BeautifulSoup: 12.37 seconds
BeautifulSoup: 12.41 seconds
BeautifulSoup: 12.47 seconds
BeautifulSoup: 12.53 seconds
BeautifulSoup: 12.57 seconds
BeautifulSoup: 12.62 seconds
BeautifulSoup: 12.66 seconds
BeautifulSoup: 12.71 seconds
BeautifulSoup: 12.76 seconds
BeautifulSoup: 12.82 seconds
BeautifulSoup: 12.86 seconds
BeautifulSoup: 12.89 seconds
BeautifulSoup: 12.93 seconds
BeautifulSoup: 12.98 seconds
BeautifulSoup: 13.02 seconds
BeautifulSoup: 13.06 seconds
BeautifulSoup: 13.10 seconds
BeautifulSoup: 13.14 seconds
BeautifulSoup: 13.18 seconds
BeautifulSoup: 13.23 seconds
BeautifulSoup: 13.27 seconds
BeautifulSoup: 13.30 seconds
BeautifulSoup: 13.34 seconds
BeautifulSoup: 13.37 seconds
BeautifulSoup: 13.42 seconds
BeautifulSoup: 13.46 seconds
BeautifulSoup: 13.50 seconds
BeautifulSoup: 13.56 seconds
BeautifulSoup: 13.61 seconds
BeautifulSoup: 13.66 seconds
BeautifulSoup:

BeautifulSoup: 25.37 seconds
BeautifulSoup: 25.41 seconds
BeautifulSoup: 25.45 seconds
BeautifulSoup: 25.51 seconds
BeautifulSoup: 25.54 seconds
BeautifulSoup: 25.57 seconds
BeautifulSoup: 25.61 seconds
BeautifulSoup: 25.66 seconds
BeautifulSoup: 25.70 seconds
BeautifulSoup: 25.73 seconds
BeautifulSoup: 25.76 seconds
BeautifulSoup: 25.80 seconds
BeautifulSoup: 25.84 seconds
BeautifulSoup: 25.88 seconds
BeautifulSoup: 25.92 seconds
BeautifulSoup: 25.96 seconds
BeautifulSoup: 26.00 seconds
BeautifulSoup: 26.03 seconds
BeautifulSoup: 26.06 seconds
BeautifulSoup: 26.10 seconds
BeautifulSoup: 26.15 seconds
BeautifulSoup: 26.18 seconds
BeautifulSoup: 26.21 seconds
BeautifulSoup: 26.25 seconds
BeautifulSoup: 26.29 seconds
BeautifulSoup: 26.32 seconds
BeautifulSoup: 26.38 seconds
BeautifulSoup: 26.41 seconds
BeautifulSoup: 26.44 seconds
BeautifulSoup: 26.48 seconds
BeautifulSoup: 26.52 seconds
BeautifulSoup: 26.55 seconds
BeautifulSoup: 26.60 seconds
BeautifulSoup: 26.64 seconds
BeautifulSoup:

BeautifulSoup: 36.29 seconds
BeautifulSoup: 36.33 seconds
BeautifulSoup: 36.38 seconds
BeautifulSoup: 36.42 seconds
BeautifulSoup: 36.46 seconds
BeautifulSoup: 36.52 seconds
BeautifulSoup: 36.56 seconds
BeautifulSoup: 36.60 seconds
BeautifulSoup: 36.63 seconds
BeautifulSoup: 36.67 seconds
BeautifulSoup: 36.71 seconds
BeautifulSoup: 36.75 seconds
BeautifulSoup: 36.79 seconds
BeautifulSoup: 36.84 seconds
BeautifulSoup: 36.88 seconds
BeautifulSoup: 36.92 seconds
BeautifulSoup: 36.97 seconds
BeautifulSoup: 37.02 seconds
BeautifulSoup: 37.06 seconds
BeautifulSoup: 37.09 seconds
BeautifulSoup: 37.14 seconds
BeautifulSoup: 37.18 seconds
BeautifulSoup: 37.21 seconds
BeautifulSoup: 37.26 seconds
BeautifulSoup: 37.29 seconds
BeautifulSoup: 37.33 seconds
BeautifulSoup: 37.38 seconds
BeautifulSoup: 37.43 seconds
BeautifulSoup: 37.48 seconds
BeautifulSoup: 37.52 seconds
BeautifulSoup: 37.56 seconds
BeautifulSoup: 37.60 seconds
BeautifulSoup: 37.64 seconds
BeautifulSoup: 37.68 seconds
BeautifulSoup:

Lxml: 2.16 seconds
Lxml: 2.16 seconds
Lxml: 2.17 seconds
Lxml: 2.18 seconds
Lxml: 2.19 seconds
Lxml: 2.20 seconds
Lxml: 2.20 seconds
Lxml: 2.21 seconds
Lxml: 2.22 seconds
Lxml: 2.23 seconds
Lxml: 2.24 seconds
Lxml: 2.24 seconds
Lxml: 2.25 seconds
Lxml: 2.26 seconds
Lxml: 2.26 seconds
Lxml: 2.27 seconds
Lxml: 2.28 seconds
Lxml: 2.29 seconds
Lxml: 2.29 seconds
Lxml: 2.30 seconds
Lxml: 2.31 seconds
Lxml: 2.32 seconds
Lxml: 2.33 seconds
Lxml: 2.33 seconds
Lxml: 2.34 seconds
Lxml: 2.35 seconds
Lxml: 2.35 seconds
Lxml: 2.36 seconds
Lxml: 2.36 seconds
Lxml: 2.37 seconds
Lxml: 2.38 seconds
Lxml: 2.39 seconds
Lxml: 2.40 seconds
Lxml: 2.41 seconds
Lxml: 2.41 seconds
Lxml: 2.42 seconds
Lxml: 2.43 seconds
Lxml: 2.44 seconds
Lxml: 2.44 seconds
Lxml: 2.45 seconds
Lxml: 2.46 seconds
Lxml: 2.47 seconds
Lxml: 2.48 seconds
Lxml: 2.48 seconds
Lxml: 2.49 seconds
Lxml: 2.49 seconds
Lxml: 2.50 seconds
Lxml: 2.51 seconds
Lxml: 2.52 seconds
Lxml: 2.53 seconds
Lxml: 2.53 seconds
Lxml: 2.54 seconds
Lxml: 2.55 s

Lxml: 5.88 seconds
Lxml: 5.89 seconds
Lxml: 5.89 seconds
Lxml: 5.90 seconds
Lxml: 5.91 seconds
Lxml: 5.92 seconds
Lxml: 5.93 seconds
Lxml: 5.94 seconds
Lxml: 5.95 seconds
Lxml: 5.96 seconds
Lxml: 5.97 seconds
Lxml: 5.98 seconds
Lxml: 5.99 seconds
Lxml: 6.00 seconds
Lxml: 6.01 seconds
Lxml: 6.02 seconds
Lxml: 6.03 seconds
Lxml: 6.04 seconds
Lxml: 6.04 seconds
Lxml: 6.05 seconds
Lxml: 6.06 seconds
Lxml: 6.06 seconds
Lxml: 6.07 seconds
Lxml: 6.08 seconds
Lxml: 6.09 seconds
Lxml: 6.09 seconds
Lxml: 6.10 seconds
Lxml: 6.11 seconds
Lxml: 6.12 seconds
Lxml: 6.12 seconds
Lxml: 6.13 seconds
Lxml: 6.14 seconds
Lxml: 6.14 seconds
Lxml: 6.15 seconds
Lxml: 6.15 seconds
Lxml: 6.16 seconds
Lxml: 6.17 seconds
Lxml: 6.18 seconds
Lxml: 6.19 seconds
Lxml: 6.20 seconds
Lxml: 6.20 seconds
Lxml: 6.21 seconds
Lxml: 6.22 seconds
Lxml: 6.22 seconds
Lxml: 6.23 seconds
Lxml: 6.24 seconds
Lxml: 6.24 seconds
Lxml: 6.25 seconds
Lxml: 6.26 seconds
Lxml: 6.26 seconds
Lxml: 6.27 seconds
Lxml: 6.27 seconds
Lxml: 6.28 s

In [1]:
import re
import lxml.html
import time
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent', \
          'tld', 'currency_code', 'currency_name', 'phone', \
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


def re_scraper(html):
    results = {}
    for field in FIELDS:
        results[field] = re.search('<tr id="places_{}__row">.*?<td class="w2p_fw">(.*?)</td>'.format(field), html).groups()[0]
    return results


def bs_scraper(html):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        results[field] = soup.find('table').find('tr', id='places_{}__row'.format(field)).find('td', class_='w2p_fw').text
    return results


def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content()
    return results


NUM_ITERATIONS = 1000  # number of times to test each scraper
html = download('http://example.webscraping.com/places/default/view/United-Kingdom-239')
for name, scraper in [('Regular expressions', re_scraper), ('BeautifulSoup', bs_scraper), ('Lxml', lxml_scraper)]:
    # record start time of scrape
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            re.purge()
        result = scraper(html)
        # check scraped result is as expected
        assert(result['area'] == '244,820 square kilometres')
    # record end time of scrape and output the total
    end = time.time()
    print('%s: %.2f seconds' % (name, end - start))


Downloading: http://example.webscraping.com/places/default/view/United-Kingdom-239
Regular expressions: 4.01 seconds
BeautifulSoup: 33.95 seconds
Lxml: 5.81 seconds


In [1]:
# coding=utf-8
import datetime
import re
import time
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser
import sys


user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
proxy = None


class Throttle:
    """Add a delay between downloads to the same domain
    """

    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - \
                (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently
                # so need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()

        
throttle=Throttle(1)


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''Support custom User-Agent, proxy and auto retry
    '''
    print('Downloading:', url)  # url is download function's first arguments
    headers = {'User-agent': user_agent}  # user_agent is the second arguments
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry 5XX HTTP errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track which URL's have seen before
    seen = {seed_url: 0}
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, user_agent, proxy, 2)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
        else:
            print('Blocked by robots.txt:', url)
            html = None
            sys.exit()
        # filter for links matching our regular expression
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                # check if link matches expected regex
                if re.match(link_regex, link):
                    # form absolute link
                    link = urllib.parse.urljoin(seed_url, link)
                    # check if have already seen this link
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


In [3]:
import csv
import lxml.html
import re


class ScrapeCallback:

    def __init__(self):
        self.writer = csv.writer(open('countries_jupyter.csv', 'w', newline=''))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect(
                    'table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/places/default/(index|view)',
                 max_depth=-1, scrape_callback=ScrapeCallback())


Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

Downloading: http://example.webscraping.com/places/default/view/Palestinian-Territory-171
Downloading: http://example.webscraping.com/places/default/view/Palau-170
Downloading: http://example.webscraping.com/places/default/view/Pakistan-169
Downloading: http://example.webscraping.com/places/default/view/Oman-168
Downloading: http://example.webscraping.com/places/default/view/Norway-167
Downloading: http://example.webscraping.com/places/default/view/Northern-Mariana-Islands-166
Downloading: http://example.webscraping.com/places/default/view/North-Korea-165
Downloading: http://example.webscraping.com/places/default/view/Norfolk-Island-164
Downloading: http://example.webscraping.com/places/default/view/Niue-163
Downloading: http://example.webscraping.com/places/default/view/Nigeria-162
Downloading: http://example.webscraping.com/places/default/view/Niger-161
Downloading: http://example.webscraping.com/places/default/view/Nicaragua-160
Downloading: http://example.webscraping.com/places/def

Downloading: http://example.webscraping.com/places/default/view/Ecuador-65
Downloading: http://example.webscraping.com/places/default/view/East-Timor-64
Downloading: http://example.webscraping.com/places/default/view/Dominican-Republic-63
Downloading: http://example.webscraping.com/places/default/view/Dominica-62
Downloading: http://example.webscraping.com/places/default/view/Djibouti-61
Downloading: http://example.webscraping.com/places/default/view/Denmark-60
Downloading: http://example.webscraping.com/places/default/view/Democratic-Republic-of-the-Congo-59
Downloading: http://example.webscraping.com/places/default/view/Czech-Republic-58
Downloading: http://example.webscraping.com/places/default/view/Cyprus-57
Downloading: http://example.webscraping.com/places/default/view/Curacao-56
Downloading: http://example.webscraping.com/places/default/view/Cuba-55
Downloading: http://example.webscraping.com/places/default/view/Croatia-54
Downloading: http://example.webscraping.com/places/defau

In [1]:
import re


url = 'http://example.webscraping.com/places/default/view/Australia-1'
re.sub('[^/0-9a-zA-Z\-.,; _]', '_', url)


'http_//example.webscraping.com/places/default/view/Australia-1'

In [3]:
filename = 'http_//example.webscraping.com/places/default/view/Australia-1'
filename.split('/')

['http_',
 '',
 'example.webscraping.com',
 'places',
 'default',
 'view',
 'Australia-1']

In [5]:
for segment in filename.split('/'):
    print(segment)

http_

example.webscraping.com
places
default
view
Australia-1


In [6]:
segment[:255] for segment in filename.split('/')

SyntaxError: invalid syntax (<ipython-input-6-2073b4879b1b>, line 1)

In [8]:
s = [segment[:255] for segment in filename.split('/')]

In [9]:
print(s)

['http_', '', 'example.webscraping.com', 'places', 'default', 'view', 'Australia-1']


In [10]:
from urllib.parse import urlsplit


components = urlsplit('http://example.webscraping.com/index/')
print(components)
print(components.path)
path = components.path
if not path:
    path = '/index.html'
elif path.endswith('/'):
    path += 'index.html'
filename = components.netloc + path + components.query
print(filename)


SplitResult(scheme='http', netloc='example.webscraping.com', path='/index/', query='', fragment='')
/index/
example.webscraping.com/index/index.html


In [11]:
import os
import re
import urllib.parse


class DiskCache:
    def __init__(self, cache_dir='cache'):
        self.cache_dir = cache_dir
        self.max_length = max_length

    def url_to_path(self, url):
        """Create file system path for this URL
        """
        components = urllib.parse.urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub('[^/0-9a-zA-Z\-.,; _]', '_', filename)
        # restric maximum number of characters
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)
