用来获取网页内容的 urllib 库还包含有用来获取文件内容的方法。下面的程序使用 urllib.request.urlretrieve 从远程 URL 下载图片:

In [2]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com")

In [3]:
bs = BeautifulSoup(html, 'html.parser')


In [5]:
imageLocation = bs.find('div', {'class': 'pagelayer-image-holder pagelayer-anim-par'}).find('img')['src']

In [7]:
urlretrieve (imageLocation, 'data_output/logo.jpg')

('data_output/logo.jpg', <http.client.HTTPMessage at 0x7fa2ff56fee0>)

如果你只需要下载一个文件，而且知道如何获取它，以及它的文件类型，这么做就可以 了。但是大多数爬虫都不可能一天只下载一个文件。下面的程序会把 http://pythonscraping. com 主页上所有 src 属性的内部文件都下载下来

In [8]:
import os 
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [9]:
downloadDirectory = 'downloaded'
baseUrl = "http://www.pythonscraping.com"

In [10]:
'''得到绝对路径'''
def getAbsoluteUrl(baseUrl,source):
    if source.startswith("http://www."):
        url = 'http://{}'.format(source[11:])
    elif source.startswith("http://"):
        url = source
    elif source.startswith('www.'):
        url = source[4:]
        url = 'http://{}'.format(source)
    else:
        url = '{}/{}'.format(baseUrl, source)

    if baseUrl not in url:
        return None
    return url

In [16]:
'''下载路径'''
from http.client import HTTPException


def getDownloadPath(baseUrl,absoluteUrl,downloadDirectory):
    try:
        path = absoluteUrl.replace("www","")
        path = path.replace(baseUrl, '')
        path = downloadDirectory+path
        directory = os.path.dirname(path)

        if not os.path.exists(directory):
            os.makedirs(directory)
    except HTTPException as e:
        print(e)
        pass 
    
    return path 


In [17]:
html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
downloadList = bs.findAll(src=True)

In [18]:
for download in downloadList:
    fileurl = getAbsoluteUrl(baseUrl,download['src'])
    if fileurl is not None:
        print(fileurl)
        urlretrieve(fileurl,getDownloadPath(baseUrl,fileurl,downloadDirectory))

http://www.pythonscraping.com/https://pythonscraping.com/wp-includes/js/jquery/jquery.min.js?ver=3.6.0


HTTPError: HTTP Error 404: Not Found

<b>慎用，易下载到病毒</b>


------------------------------------------

CSV(comma-separated values，逗号分隔值)是存储表格数据的常用文件格式。Microsoft
Excel 和很多应用都支持 CSV 格式，因为它很简洁。下面就是一个 CSV 文件的例子:

In [20]:
import csv 
file = open("data_output/test.csv","w+")
try:
    writer = csv.writer(file)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow( (i, i+2, i*2))
finally:
    file.close()

表格形式数据爬取以及存储为csv格式

In [22]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://en.wikipedia.org/wiki/'
         'Comparison_of_text_editors')
bs = BeautifulSoup(html, 'html.parser')
# 主对比表格是当前页面上的第一个表格
table = bs.findAll('table',{'class':'wikitable'})[0] 
rows = table.findAll('tr')

csvFile = open('data_output/editors.csv', 'wt+')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()

---------------------------

python与mysql的整合

In [1]:
# pip install pymsql

In [3]:
import pymysql
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock',
                            user='root', passwd="2022mysql", db='mysql')
cur = conn.cursor()
cur.execute('USE scraping')
cur.execute('select * from pages')
print(cur.fetchall())
cur.close()
conn.close()


((1, 'test title', 'test content', datetime.datetime(2022, 8, 23, 20, 46, 33)), (2, 'Test page title', 'This is some test page content. It can be up to 10,000 characters\n      long.', datetime.datetime(2014, 9, 21, 10, 25, 32)))


<u>将数据库的字符更改为中文</u>:
ALTER DATABASE scraping CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci;
ALTER TABLE pages CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
ALTER TABLE pages CHANGE title title VARCHAR(200) CHARACTER SET utf8mb4 COLLATE
utf8mb4_unicode_ci;
ALTER TABLE pages CHANGE content content VARCHAR(10000) CHARACTER SET utf8mb4 CO
LLATE utf8mb4_unicode_ci;

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import pymysql
import re 

In [7]:
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock',
                            user='root', passwd="2022mysql", db='mysql',charset="utf8")
cur = conn.cursor()

In [8]:
cur.execute("USE scraping")

0

In [10]:
random.seed(datetime.datetime.now())
def store(title,content):
    cur.execute("INSERT INTO pages(title,content)VALUES(%s,%s)",(title,content))
    cur.connection.commit()

def get_links(articleUrl):
    html = urlopen('http://en.wikipedia.org'+articleUrl)
    bs = BeautifulSoup(html, 'html.parser')
    title = bs.find("h1").get_text()
    content= bs.find("div",{"id":"mw-content-text"}).find("p").get_text()

    store(title,content)
    return bs.find("div",{"id":"bodyContent"}).findAll("a",href=re.compile('^(/wiki/)((?!:).)*$'))

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(datetime.datetime.now())


In [11]:
links = get_links('/wiki/Kevin_Bacon')
try:
    while len(links) > 0:
            newArticle = links[random.randint(0, len(links)-1)].attrs['href']
            print(newArticle)
            links = get_links(newArticle)
finally:
    cur.close()
    conn.close()

/wiki/Raul_Julia
/wiki/Richard_Foreman
/wiki/ISNI_(identifier)
/wiki/ISO_3166-3
/wiki/Serbia_and_Montenegro
/wiki/Montenegro
/wiki/Montenegrins_(ethnic_group)
/wiki/Herzegovina_uprising_(1875%E2%80%931877)
/wiki/Kulen_Vakuf
/wiki/Donja_Gata
/wiki/Doljani,_Biha%C4%87
/wiki/Municipalities_of_Bosnia_and_Herzegovina
/wiki/City_council_(Israel)
/wiki/Municipalities_of_Bosnia_and_Herzegovina
/wiki/Municipalities_of_Republika_Srpska
/wiki/Trnovo,_Republika_Srpska
/wiki/Domaljevac-%C5%A0amac
/wiki/Bosanski_%C5%A0amac
/wiki/Herzegovina-Neretva_Canton
/wiki/Zavidovi%C4%87i
/wiki/Petrovo,_Bosnia_and_Herzegovina
/wiki/%C5%A0ekovi%C4%87i
/wiki/%C5%A0iroki_Brijeg
/wiki/Tur%C4%8Dinovi%C4%87i
/wiki/Donji_Crna%C4%8D


KeyboardInterrupt: 

6.3.5 -MySQL里的“六度空间游戏”

In [12]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql
from random import shuffle

In [13]:
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock',
                            user='root', passwd="2022mysql", db='mysql',charset="utf8")
cur = conn.cursor()
cur.execute('USE scraping')

0

In [14]:
def insertPageIfNotExists(url):
    '''正如其名称所示，当页面不存在时，该函数会插入一个新的页面记录。
    该页面以及其他 已经抓取的页面作为列表存储在 pages 变量中，
    以确保页面记录不会重复。它也提供了 一个供查询的 pageId 数，以创建新的链接。'''
    cur.execute("select * from wikipedia_pages where url = %s ",(url))
    if cur.rowcount == 0:
        cur.execute("insert into wikipedia_pages (url) values (%s)",(url))
        conn.commit()
    else:
        return cur.fetchone()[0]

In [19]:
def loadPages():
    '''该函数将当前所有页面从数据库加载到一个列表中，这样可以确定新的页面是否被访 问过。
    在程序运行时也会收集页面，因此如果从一个空的数据库开始，
    爬虫仅仅运行 一遍，那么理论上说 loadPages 是不需要的。
    实际上，这会导致问题。原因是网络可 能会中断，
    或者你希望在不同的时间段抓取各链接，因此让爬虫可以重新自我加载非常 重要。'''
    cur.execute("select * from wikipedia_pages")
    pages = [row[1] for row in cur.fetchall()]
    return pages

In [20]:
def insertLink(fromPageId,toPageId):
    '''该函数在数据库中创建一个新的链接。如果该链接已经存在，则不会创建。
    如果同一个 页面中存在两个或者多个相同的链接，我们会将其当作同一个链接，表示同样的关系， 
    并且应该被当作一条记录。这样，如果程序对同一页面运行多遍，也有助于维护数据库 的一致性。'''
    # 判读处理
    if fromPageId is None or toPageId is None:
        return 
    else:    
        cur.execute("select * from wikipedia_links where fromPageId=%s and toPageId = %s",(int(fromPageId),int(toPageId)))
    if cur.rowcount==0:
        cur.execute("insert into wikipedia_links(fromPageId,toPageId) values (%s,%s)",(int(fromPageId),int(toPageId)))
        conn.commit()
        

In [21]:
def getLinks(pageUrl,recursionLevel,pages):
    if recursionLevel > 4:
        return 
    pageId = insertPageIfNotExists(pageUrl)
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    links = bs.findAll('a', href=re.compile('^(/wiki/)((?!:).)*$'))
    links = [link.attrs['href'] for link in links]
     

    for link in links:
        insertLink(pageId,insertPageIfNotExists(link))
        if link not in pages:
            # 遇到新页面，加入集合并搜索里面的词条链接
            pages.append(link)
            getLinks(link,recursionLevel+1,pages)


In [22]:
getLinks('/wiki/Kevin_Bacon', 0, loadPages())
cur.close()
conn.close()

KeyboardInterrupt: 

相关sql代码片段：

<code>
-- create table pages(
-- id BIGINT(7) not null auto_increment,
-- title VARCHAR(200),
-- content VARCHAR(10000),
-- created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-- PRIMARY KEY(id)
-- );
</code>
<code>
-- DESCRIBE pages;
</code>

<code>
-- INSERT into pages
-- (title,content) 
-- VALUES
-- ("test title","test content");
</code>

<code>
-- INSERT INTO pages (id, title, content, created) VALUES (3,
--       "Test page title",
--       "This is some test page content. It can be up to 10,000 characters
--       long.", "2014-09-21 10:25:32");
</code>

<code>
select * from wikipedia_links;
-- SELECT id, title FROM pages WHERE content LIKE "%page content%";
-- CREATE TABLE wikipedia_pages(
-- id int(7) not null auto_increment,
-- url varchar(255) not null,
-- created timestamp not null default current_timestamp,
-- PRIMARY KEY(id)
-- );
</code>

<code>
-- CREATE TABLE wikipedia_links(
-- id int not null auto_increment ,
-- fromPageId int null,
-- toPageId int null,
-- created timestamp not null default current_timestamp,
-- PRIMARY key (id)
-- );
</code>

email:
**Python** 有两个重要的包可以发送邮件:smtplib 和 email。

In [24]:
import smtplib
from email.mime.text import MIMEText

msg = MIMEText("i am the email from python")
msg['Subject'] = "python email"
msg["From"] = '17702069165@163.com'
msg['To'] = 'danyowchueng@gmail.com'
s = smtplib.SMTP('smtp.163.com',)
s.send_message(msg)
s.quit()


# # 发信方的信息：发信邮箱，QQ邮箱授权码
# from_addr = '17702069165@163.com'
# password = ''

# # 收信方邮箱
# to_addr = 'xxx@qq.com'

# # 发信服务器
# smtp_server = 'smtp.qq.com'


# server = smtplib.SMTP_SSL()
# server.connect(smtp_server,465)

# server.login(from_addr, password)
# # 发送邮件
# server.sendmail(from_addr, to_addr, msg.as_string())
# # 关闭服务器
# server.quit()


SMTPSenderRefused: (553, b'authentication is required,163 smtp8,DMCowADHAiHj6wRjJ07GVw--.45077S2 1661266916', '17702069165@163.com')

In [None]:
import smtplib
from email.mime.text import MIMEText
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time


def sendMail(subject, body):
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] ='christmas_alerts@pythonscraping.com'
    msg['To'] = 'ryan@pythonscraping.com'
    s = smtplib.SMTP('localhost')
    s.send_message(msg)
    s.quit()
    

bs = BeautifulSoup(urlopen('https://isitchristmas.com/'), 'html.parser')
while(bs.find('a', {'id':'answer'}).attrs['title'] == 'NO'):
    print('It is not Christmas yet.')
    time.sleep(3600)
    bs = BeautifulSoup(urlopen('https://isitchristmas.com/'), 'html.parser')
    sendMail('It\'s Christmas!','According to https://isitchristmas.com, it is Christmas!')