# Web Scraping with Python

In [1]:
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Download the page at ``url`` and return its raw body.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The response body as ``bytes``, or ``None`` if the download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # the context manager closes the response as soon as the body is read
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # only HTTP errors carry a status code; retry 5XX HTTP errors
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


def crawl_sitemap(url):
    """Download every page listed in the sitemap found at ``url``.

    The sitemap body is decoded and each ``<loc>...</loc>`` entry is
    downloaded in turn.  If the sitemap itself cannot be downloaded there
    is nothing to crawl and the function returns immediately.
    """
    # download the sitemap file
    sitemap = download(url)
    if sitemap is None:
        # download() already reported the error
        return
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode())
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


# Crawl every URL listed in the site's sitemap.xml.
crawl_sitemap('https://webscraping.com/sitemap.xml')


Downloading: https://webscraping.com/sitemap.xml
Downloading: http://webscraping.com
Downloading: http://webscraping.com/about
Downloading: http://webscraping.com/blog
Downloading: http://webscraping.com/blog/10/
Downloading: http://webscraping.com/blog/11/
Downloading: http://webscraping.com/blog/12/
Downloading: http://webscraping.com/blog/13/
Downloading: http://webscraping.com/blog/2/
Downloading: http://webscraping.com/blog/3/
Downloading: http://webscraping.com/blog/4/
Downloading: http://webscraping.com/blog/5/
Downloading: http://webscraping.com/blog/6/
Downloading: http://webscraping.com/blog/7/
Downloading: http://webscraping.com/blog/8/
Downloading: http://webscraping.com/blog/9/
Downloading: http://webscraping.com/blog/All-your-data-are-belong-to-us/
Downloading: http://webscraping.com/blog/Android-Apps-Update/
Downloading: http://webscraping.com/blog/Apple-Apps-Update/
Downloading: http://webscraping.com/blog/Asynchronous-support-in-Python/
Downloading: http://webscraping.

Downloading: http://webscraping.com/blog/category/website/
Downloading: http://webscraping.com/blog/category/xpath
Downloading: http://webscraping.com/contact
Downloading: http://webscraping.com/data
Downloading: http://webscraping.com/data/default/database/1/belgium-zip-codes
Downloading: http://webscraping.com/data/default/database/10/usa-restaurants
Downloading: http://webscraping.com/data/default/database/11/android-apps
Downloading: http://webscraping.com/data/default/database/12/universal-product-codes-upc-details
Downloading: http://webscraping.com/data/default/database/13/international-standard-book-numbers-isbn
Downloading: http://webscraping.com/data/default/database/14/hotel-details
Downloading: http://webscraping.com/data/default/database/17/aircraft-models
Downloading: http://webscraping.com/data/default/database/2/usa-cities
Downloading: http://webscraping.com/data/default/database/23/films
Downloading: http://webscraping.com/data/default/database/29/popular-websites
Down

In [2]:
import itertools
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Fetch ``url`` and return the response body.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The body as ``bytes``, or ``None`` when the download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # read inside ``with`` so the response is always closed
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


def crawl_sitemap(url):
    """Download each page referenced by the sitemap at ``url``.

    Every ``<loc>...</loc>`` entry in the decoded sitemap is downloaded.
    Returns without doing anything further when the sitemap download fails.
    """
    # download the sitemap file
    sitemap = download(url)
    if sitemap is None:
        # no sitemap, so there are no links to follow
        return
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode())
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...


# Try page IDs 1, 2, 3, ... in order and stop at the first failed download.
for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is not None:
        # success - can scrape the result
        continue
    break


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: NOT FOUND


In [5]:
import itertools
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Download ``url`` and return its body, reporting the HTTP status on error.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The body as ``bytes``, or ``None`` when the download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # read inside ``with`` so the response is always closed
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        # only HTTP errors carry a status code; a plain URLError does not,
        # so reading e.code unconditionally would raise AttributeError
        code = getattr(e, 'code', None)
        if code is None:
            print('Download error:', e.reason)
        else:
            print('Download error:', code, e.reason)
        html = None
        # retry 5XX HTTP errors
        if num_retries > 0 and code is not None and 500 <= code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0
for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is None:
        # received an error trying to download this webpage
        num_errors += 1
        if num_errors == max_errors:
            # reached maximum number of
            # consecutive errors so exit
            break
    else:
        # a successful download ends the run of failures; without this
        # reset the counter would track total errors, not consecutive ones
        num_errors = 0


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: 404 NOT FOUND
Downloading: http://example.webscraping.com/places/default/view/-254
Down

In [7]:
import itertools
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Download the page at ``url``.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The response body as ``bytes``, or ``None`` on failure.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # the context manager guarantees the response is closed
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0
for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is None:
        # received an error trying to download this webpage
        num_errors += 1
        if num_errors == max_errors:
            # reached maximum number of
            # consecutive errors so exit
            break
    else:
        # success: restart the count so that only an unbroken run of
        # max_errors failures ends the loop
        num_errors = 0


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: NOT FOUND
Downloading: http://example.webscraping.com/places/default/view/-254
Download

In [8]:
import itertools
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Return the body of ``url`` as bytes, or ``None`` if it cannot be fetched.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # close the response deterministically once the body is read
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


# maximum number of consecutive download errors allowed
max_errors = 5
# current number of consecutive download errors
num_errors = 0
for page in itertools.count(1):
    url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
    html = download(url)
    if html is not None:
        # success - can scrape the result
        # ...
        num_errors = 0
        continue
    # received an error trying to download this webpage
    num_errors += 1
    if num_errors == max_errors:
        # reached the maximum number of consecutive errors, so exit
        break


Downloading: http://example.webscraping.com/places/default/view/-1
Downloading: http://example.webscraping.com/places/default/view/-2
Downloading: http://example.webscraping.com/places/default/view/-3
Downloading: http://example.webscraping.com/places/default/view/-4
Downloading: http://example.webscraping.com/places/default/view/-5
Downloading: http://example.webscraping.com/places/default/view/-6
Downloading: http://example.webscraping.com/places/default/view/-7
Downloading: http://example.webscraping.com/places/default/view/-8
Downloading: http://example.webscraping.com/places/default/view/-9
Downloading: http://example.webscraping.com/places/default/view/-10
Downloading: http://example.webscraping.com/places/default/view/-11
Downloading: http://example.webscraping.com/places/default/view/-12
Downloading: http://example.webscraping.com/places/default/view/-13
Downloading: http://example.webscraping.com/places/default/view/-14
Downloading: http://example.webscraping.com/places/defaul

Downloading: http://example.webscraping.com/places/default/view/-122
Downloading: http://example.webscraping.com/places/default/view/-123
Downloading: http://example.webscraping.com/places/default/view/-124
Downloading: http://example.webscraping.com/places/default/view/-125
Downloading: http://example.webscraping.com/places/default/view/-126
Downloading: http://example.webscraping.com/places/default/view/-127
Downloading: http://example.webscraping.com/places/default/view/-128
Downloading: http://example.webscraping.com/places/default/view/-129
Downloading: http://example.webscraping.com/places/default/view/-130
Downloading: http://example.webscraping.com/places/default/view/-131
Downloading: http://example.webscraping.com/places/default/view/-132
Downloading: http://example.webscraping.com/places/default/view/-133
Downloading: http://example.webscraping.com/places/default/view/-134
Downloading: http://example.webscraping.com/places/default/view/-135
Downloading: http://example.webscr

Downloading: http://example.webscraping.com/places/default/view/-241
Downloading: http://example.webscraping.com/places/default/view/-242
Downloading: http://example.webscraping.com/places/default/view/-243
Downloading: http://example.webscraping.com/places/default/view/-244
Downloading: http://example.webscraping.com/places/default/view/-245
Downloading: http://example.webscraping.com/places/default/view/-246
Downloading: http://example.webscraping.com/places/default/view/-247
Downloading: http://example.webscraping.com/places/default/view/-248
Downloading: http://example.webscraping.com/places/default/view/-249
Downloading: http://example.webscraping.com/places/default/view/-250
Downloading: http://example.webscraping.com/places/default/view/-251
Downloading: http://example.webscraping.com/places/default/view/-252
Downloading: http://example.webscraping.com/places/default/view/-253
Download error: NOT FOUND
Downloading: http://example.webscraping.com/places/default/view/-254
Download

In [1]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Download the page at ``url`` and return its raw body.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The response body as ``bytes`` (not decoded), or ``None`` if the
        download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # read inside ``with`` so the response is always closed
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex.

    URLs are taken from the end of the queue, so the most recently
    discovered link is downloaded next.  Matching links are queued exactly
    as they appear in the page.
    """
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            # download failed - there is no page to extract links from
            continue
        if isinstance(html, bytes):
            # get_links() applies a str pattern, so decode a raw body first
            html = html.decode()
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)


def get_links(html):
    """Return the href value of every ``<a>`` tag in ``html``, in document order."""
    # case-insensitive match of an anchor tag, capturing the quoted href value
    anchor_href = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return [match.group(1) for match in anchor_href.finditer(html)]


# download() returns bytes in this cell, so decode to str before handing the
# page to get_links(), which uses a str regular expression
html = download('http://example.webscraping.com/places/default').decode()
# bare last expression: the notebook displays the returned list of links
get_links(html)

Downloading: http://example.webscraping.com/places/default


['/places/default/index',
 '#',
 '/places/default/user/register?_next=/places/default/index',
 '/places/default/user/login?_next=/places/default/index',
 '/places/default/index',
 '/places/default/search',
 '/places/default/view/Afghanistan-1',
 '/places/default/view/Aland-Islands-2',
 '/places/default/view/Albania-3',
 '/places/default/view/Algeria-4',
 '/places/default/view/American-Samoa-5',
 '/places/default/view/Andorra-6',
 '/places/default/view/Angola-7',
 '/places/default/view/Anguilla-8',
 '/places/default/view/Antarctica-9',
 '/places/default/view/Antigua-and-Barbuda-10',
 '/places/default/index/1']

In [5]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Download the page at ``url`` and return it as text.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The response body decoded with ``bytes.decode()``'s default codec,
        or ``None`` if the download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # read inside ``with`` so the response is always closed
        with urllib.request.urlopen(request) as response:
            html = response.read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex.

    URLs are taken from the end of the queue, so the most recently
    discovered link is downloaded next.  Matching links are queued exactly
    as they appear in the page, without being resolved against ``seed_url``.
    """
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            # download failed - there is no page to extract links from
            continue
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)


def get_links(html):
    """Collect the href target of each anchor tag found in ``html``."""
    # anchor tag with a single- or double-quoted href; tag case is ignored
    href_pattern = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return [found.group(1) for found in href_pattern.finditer(html)]


# Start at the places index and follow only links whose path begins with
# /places/default/index or /places/default/view.
link_crawler('http://example.webscraping.com/places/default', '/places/default/(index|view)')

Downloading: http://example.webscraping.com/places/default
Downloading: /places/default/index/1


ValueError: unknown url type: '/places/default/index/1'

In [1]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse


def download(url, user_agent='wswp', num_retries=2):
    """Fetch ``url`` and return the decoded page text.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The body as ``str``, or ``None`` when the download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # the context manager guarantees the response is closed
        with urllib.request.urlopen(request) as response:
            html = response.read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex.

    URLs are taken from the end of the queue, so the most recently
    discovered link is downloaded next.  Each matching link is resolved
    against ``seed_url`` before it is queued.  No record of visited URLs is
    kept, so a link that appears on several pages is queued every time.
    """
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            # download failed - there is no page to extract links from
            continue
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # form absolute link
                link = urllib.parse.urljoin(seed_url, link)
                crawl_queue.append(link)


def get_links(html):
    """Return a list with the href of every ``<a ...>`` tag in ``html``."""
    # capture the text between the quotes that follow href=, ignoring tag case
    link_finder = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return [hit.group(1) for hit in link_finder.finditer(html)]


# Crawl from the site root, following only links whose path begins with
# /places/default/index or /places/default/view.
link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

KeyboardInterrupt: 

爬取任何网站，User-Agent必须伪装成正常浏览器的UA，爬虫类的UA极易被屏蔽或是限制。

In [5]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse


def download(url, user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36', num_retries=2):
    """Download the page at ``url`` and return it as text.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header; the
            default is a desktop Chrome browser string.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The body as ``str``, or ``None`` when the download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # read inside ``with`` so the response is always closed
        with urllib.request.urlopen(request) as response:
            html = response.read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex.

    URLs are taken from the end of the queue, so the most recently
    discovered link is downloaded next.  Each matching link is resolved
    against ``seed_url`` and queued only the first time it is seen.
    """
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            # download failed - there is no page to extract links from
            continue
        # filter for links matching our regular expression
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urllib.parse.urljoin(seed_url, link)
                # check if we have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


def get_links(html):
    """Extract the href values of all anchor tags in ``html`` as a list."""
    # one capture group: whatever sits between the quotes after href=
    anchor_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return [m.group(1) for m in anchor_re.finditer(html)]


# Crawl from the site root, following only links whose path begins with
# /places/default/index or /places/default/view.
link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

Downloading: http://example.webscraping.com/places/default/view/Palestinian-Territory-171
Downloading: http://example.webscraping.com/places/default/view/Palau-170
Downloading: http://example.webscraping.com/places/default/view/Pakistan-169
Downloading: http://example.webscraping.com/places/default/view/Oman-168
Downloading: http://example.webscraping.com/places/default/view/Norway-167
Downloading: http://example.webscraping.com/places/default/view/Northern-Mariana-Islands-166
Downloading: http://example.webscraping.com/places/default/view/North-Korea-165
Downloading: http://example.webscraping.com/places/default/view/Norfolk-Island-164
Downloading: http://example.webscraping.com/places/default/view/Niue-163
Downloading: http://example.webscraping.com/places/default/view/Nigeria-162
Downloading: http://example.webscraping.com/places/default/view/Niger-161
Downloading: http://example.webscraping.com/places/default/view/Nicaragua-160
Downloading: http://example.webscraping.com/places/def

Downloading: http://example.webscraping.com/places/default/view/Ecuador-65
Downloading: http://example.webscraping.com/places/default/view/East-Timor-64
Downloading: http://example.webscraping.com/places/default/view/Dominican-Republic-63
Downloading: http://example.webscraping.com/places/default/view/Dominica-62
Downloading: http://example.webscraping.com/places/default/view/Djibouti-61
Downloading: http://example.webscraping.com/places/default/view/Denmark-60
Downloading: http://example.webscraping.com/places/default/view/Democratic-Republic-of-the-Congo-59
Downloading: http://example.webscraping.com/places/default/view/Czech-Republic-58
Downloading: http://example.webscraping.com/places/default/view/Cyprus-57
Downloading: http://example.webscraping.com/places/default/view/Curacao-56
Downloading: http://example.webscraping.com/places/default/view/Cuba-55
Downloading: http://example.webscraping.com/places/default/view/Croatia-54
Downloading: http://example.webscraping.com/places/defau

In [7]:
import urllib.robotparser


# Load the site's robots.txt once, then ask whether each crawler name is
# allowed to fetch the home page.
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
url = 'http://example.webscraping.com'
for user_agent in ('BadCrawler', 'GoodCrawler'):
    print(rp.can_fetch(user_agent, url))

False
True


In [4]:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
import urllib.parse
import urllib.robotparser
import time


# Browser-style user agent shared by download() and the robots.txt check.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
# Passing the URL to the constructor sets it exactly as set_url() would;
# read() then fetches and parses the robots.txt file.
rp = urllib.robotparser.RobotFileParser('http://example.webscraping.com/robots.txt')
rp.read()


def download(url, user_agent=user_agent, num_retries=2):
    """Download the page at ``url`` and return it as text.

    Args:
        url: absolute URL to fetch.
        user_agent: value sent in the ``User-agent`` request header; the
            default is the module-level ``user_agent`` as it was when this
            function was defined.
        num_retries: number of further attempts allowed when the server
            answers with a 5XX status.

    Returns:
        The body as ``str``, or ``None`` when the download failed.
    """
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # read inside ``with`` so the response is always closed
        with urllib.request.urlopen(request) as response:
            html = response.read().decode()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # retry 5XX HTTP errors (plain URLErrors have no ``code``)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex.

    Args:
        seed_url: URL where the crawl starts; also the base against which
            extracted links are resolved.
        link_regex: Pattern applied with ``re.match`` to each extracted link
            (before it is made absolute); only matching links are queued.

    URLs disallowed by robots.txt, and URLs whose download fails, are
    reported and skipped instead of being passed on for link extraction.
    """
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if not rp.can_fetch(user_agent, url):
            print('Blocked by robots.txt:', url)
            continue
        # throttle: wait one second before every request
        time.sleep(1)
        html = download(url)
        if html is None:
            # download failed, so there is no markup to extract links from
            continue
        # filter for links matching our regular expression
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urllib.parse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


def get_links(html):
    """Return a list of the href values of all ``<a>`` tags found in html.

    Args:
        html: Page markup as ``str``. ``None`` or an empty string (for
            example the result of a failed download) is accepted.

    Returns:
        The href values in document order; an empty list when there is no
        markup to scan.
    """
    if not html:
        # nothing was downloaded, so there are no links to report
        return []
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


# Start the crawl at the site root, following only links whose path begins with
# /places/default/index or /places/default/view.
link_crawler('http://example.webscraping.com', '/places/default/(index|view)')

Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/places/default/index/1
Downloading: http://example.webscraping.com/places/default/index/2
Downloading: http://example.webscraping.com/places/default/index/3
Downloading: http://example.webscraping.com/places/default/index/4
Downloading: http://example.webscraping.com/places/default/index/5
Downloading: http://example.webscraping.com/places/default/index/6
Downloading: http://example.webscraping.com/places/default/index/7
Downloading: http://example.webscraping.com/places/default/index/8
Downloading: http://example.webscraping.com/places/default/index/9
Downloading: http://example.webscraping.com/places/default/index/10
Downloading: http://example.webscraping.com/places/default/index/11
Downloading: http://example.webscraping.com/places/default/index/12
Downloading: http://example.webscraping.com/places/default/index/13
Downloading: http://example.webscraping.com/places/default/index/14
Downloading: 

Downloading: http://example.webscraping.com/places/default/view/Palestinian-Territory-171
Downloading: http://example.webscraping.com/places/default/view/Palau-170
Downloading: http://example.webscraping.com/places/default/view/Pakistan-169
Downloading: http://example.webscraping.com/places/default/view/Oman-168
Downloading: http://example.webscraping.com/places/default/view/Norway-167
Downloading: http://example.webscraping.com/places/default/view/Northern-Mariana-Islands-166
Downloading: http://example.webscraping.com/places/default/view/North-Korea-165
Downloading: http://example.webscraping.com/places/default/view/Norfolk-Island-164
Downloading: http://example.webscraping.com/places/default/view/Niue-163
Downloading: http://example.webscraping.com/places/default/view/Nigeria-162
Downloading: http://example.webscraping.com/places/default/view/Niger-161
Downloading: http://example.webscraping.com/places/default/view/Nicaragua-160
Downloading: http://example.webscraping.com/places/def

Downloading: http://example.webscraping.com/places/default/view/Ecuador-65
Downloading: http://example.webscraping.com/places/default/view/East-Timor-64
Downloading: http://example.webscraping.com/places/default/view/Dominican-Republic-63
Downloading: http://example.webscraping.com/places/default/view/Dominica-62
Downloading: http://example.webscraping.com/places/default/view/Djibouti-61
Downloading: http://example.webscraping.com/places/default/view/Denmark-60
Downloading: http://example.webscraping.com/places/default/view/Democratic-Republic-of-the-Congo-59
Downloading: http://example.webscraping.com/places/default/view/Czech-Republic-58
Downloading: http://example.webscraping.com/places/default/view/Cyprus-57
Downloading: http://example.webscraping.com/places/default/view/Curacao-56
Downloading: http://example.webscraping.com/places/default/view/Cuba-55
Downloading: http://example.webscraping.com/places/default/view/Croatia-54
Downloading: http://example.webscraping.com/places/defau

##### 支持代理的Python 3代码

In [1]:
# coding=utf-8

from urllib.parse import urlparse
from urllib.request import Request, build_opener, ProxyHandler
from bs4 import BeautifulSoup

url = 'http://ip138.com'
_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0'}

# Send requests for this URL's scheme through the local proxy.
proxy = '127.0.0.1:1080'
opener = build_opener()
proxy_params = {urlparse(url).scheme: proxy}
opener.add_handler(ProxyHandler(proxy_params))
req = Request(url, headers=_headers)
# Close the response as soon as the body has been read.
with opener.open(req) as response:
    page = response.read()
soup = BeautifulSoup(page, 'lxml')
target = soup.find_all('iframe')
for iframe in target:
    src = iframe.get('src')
    if not src:
        # an <iframe> without a src attribute has nothing to fetch
        continue
    # Parse the framed page while the response is open, then release it.
    with opener.open(Request(src, headers=_headers)) as src_page:
        iframe_soup = BeautifulSoup(src_page, 'lxml')
    print(iframe_soup.center.get_text())


您的IP是：[45.77.166.94] 来自：美国


In [4]:
# coding=utf-8

from urllib.request import Request, build_opener, ProxyHandler
from urllib.parse import urlparse
from bs4 import BeautifulSoup

url = 'https://www.whatismybrowser.com/'
_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0'}

# Proxy setting start
proxy = '127.0.0.1:1080'
opener = build_opener()
proxy_params = {urlparse(url).scheme: proxy}
opener.add_handler(ProxyHandler(proxy_params))
# Proxy setting end

req = Request(url, headers=_headers)
# Close the response as soon as the body has been read.
with opener.open(req) as response:
    page = response.read()
soup = BeautifulSoup(page, 'lxml')
# Text of the link inside the first div with class "user-agent".
target_div = soup.find_all('div', class_="user-agent")[0].a.get_text()
print(target_div)


Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0


In [7]:
from urllib.request import Request, build_opener, ProxyHandler
from urllib.parse import urlparse

from bs4 import BeautifulSoup

url = 'https://www.youtube.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Proxy setting start
proxy = '127.0.0.1:1080'
proxy_params = {urlparse(url).scheme: proxy}
opener = build_opener()
opener.add_handler(ProxyHandler(proxy_params))
# Proxy setting end

req = Request(url, headers=headers)
# Close the response as soon as the body has been read.
with opener.open(req) as response:
    page = response.read()
soup = BeautifulSoup(page, 'lxml')
# Show the first child of the page's <title> element.
print(soup.title.contents[0])


YouTube


In [8]:
from urllib.request import Request, build_opener, ProxyHandler
from urllib.parse import urlparse

from bs4 import BeautifulSoup

url = 'https://twitter.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Proxy setting start
proxy = '127.0.0.1:1080'
proxy_params = {urlparse(url).scheme: proxy}
opener = build_opener()
opener.add_handler(ProxyHandler(proxy_params))
# Proxy setting end

req = Request(url, headers=headers)
# Close the response as soon as the body has been read.
with opener.open(req) as response:
    page = response.read()
soup = BeautifulSoup(page, 'lxml')
# Show the first child of the page's <title> element.
print(soup.title.contents[0])


Twitter. It's what's happening.
