## 第 4 章  解析库的使用  
### 4.1 使用 XPath
### 4.2 使用 Beautiful Soup
### 4.3 使用 pyquery

### 4.1 使用 XPath

In [2]:
from lxml import etree

# Sample HTML fragment; note that the last <li> has no closing tag.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">forth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''

# etree.HTML() parses the string into an element tree. The serialized output
# shows the missing </li> added and the fragment wrapped in <html>/<body>.
html = etree.HTML(text)
# The serialized result is decoded as UTF-8 before printing.
result = etree.tostring(html)
print(result.decode('utf-8'))


<html><body><div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">forth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li></ul>
</div>
</body></html>


In [7]:
# Parse an HTML file from disk with an explicit HTMLParser. The serialized
# result also carries a DOCTYPE line, and line endings show up as &#13;.
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><div>&#13;
<ul>&#13;
<li class="item-0"><a href="link1.html">first item</a></li>&#13;
<li class="item-1"><a href="link2.html">second item</a></li>&#13;
<li class="item-inactive"><a href="link3.html">third item</a></li>&#13;
<li class="item-1"><a href="link4.html">forth item</a></li>&#13;
<li class="item-0"><a href="link5.html">fifth item</a>&#13;
</li></ul>&#13;
</div>&#13;
</body></html>


**在 XPath 中，“\*”代表匹配任意节点，因此“//\*”会选取文档中的所有节点**

In [8]:
# '//*' selects every element in the document.
result = html.xpath('//*')
print(result)

[<Element html at 0x20b94043b00>, <Element body at 0x20b94024ac0>, <Element div at 0x20b924d3900>, <Element ul at 0x20b92550380>, <Element li at 0x20b92551e40>, <Element a at 0x20b9407a440>, <Element li at 0x20b9407aa80>, <Element a at 0x20b9407afc0>, <Element li at 0x20b938f8200>, <Element a at 0x20b9407a200>, <Element li at 0x20b92562e00>, <Element a at 0x20b92563900>, <Element li at 0x20b92560080>, <Element a at 0x20b94053c40>]


In [9]:
# '//li' selects all <li> elements anywhere in the document.
result = html.xpath('//li')
print(result)

[<Element li at 0x20b92551e40>, <Element li at 0x20b9407aa80>, <Element li at 0x20b938f8200>, <Element li at 0x20b92562e00>, <Element li at 0x20b92560080>]


In [10]:
# xpath() returned a list, so a single element is reached by index.
print(result[0])

<Element li at 0x20b92551e40>


**'//tag/subtag' 获取节点 tag 的所有*直接*子节点 subtag**

In [11]:
# A single '/' steps to direct children: every <a> that is a child of an <li>.
result = html.xpath('//li/a')
print(result)

[<Element a at 0x20b94079e40>, <Element a at 0x20b9407a080>, <Element a at 0x20b9407bd40>, <Element a at 0x20b9407a300>, <Element a at 0x20b9407abc0>]


**'//node//node2' 获取node节点下的所有子孙节点 node2**  
**/用于获取直接子节点，//用于获取子孙节点**

In [12]:
# A double '//' steps to descendants: every <a> anywhere below a <ul>.
result = html.xpath('//ul//a')
print(result)

[<Element a at 0x20b94079e40>, <Element a at 0x20b9407a080>, <Element a at 0x20b9407bd40>, <Element a at 0x20b9407a300>, <Element a at 0x20b9407abc0>]


In [13]:
# No <a> is a direct child of <ul>, so this yields an empty list.
result = html.xpath('//ul/a')
print(result)

[]


In [14]:
# '..' moves from the matched <a> to its parent; '@class' reads that
# parent's class attribute.
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)

['item-1']


In [16]:
# 'parent::*' is the explicit axis form of stepping to the parent node.
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)

['item-1']


### 8. 属性匹配

In [17]:
# The [@class="item-0"] predicate keeps only <li> elements whose class
# attribute equals "item-0".
result = html.xpath('//li[@class="item-0"]')
print(result)

[<Element li at 0x20b94053540>, <Element li at 0x20b924e4940>]


### 9. 文本获取

In [19]:
# '/text()' reads only text that sits directly inside the <li>. The visible
# words live inside the nested <a>, so only a line break is returned.
result = html.xpath('//li[@class="item-0"]/text()')
print(result)

['\r\n']


In [20]:
# Step into the <a> child first to obtain the link text.
result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)

['first item', 'fifth item']


In [21]:
# '//text()' gathers text from all descendants of the matched <li> elements,
# which also picks up the line break inside the last <li>.
result = html.xpath('//li[@class="item-0"]//text()')
print(result)

['first item', 'fifth item', '\r\n']


### 10. 属性获取

In [22]:
# '@href' returns the attribute values themselves as a list of strings.
result = html.xpath('//li/a/@href')
print(result)

['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']


### 11. 属性多值匹配

In [23]:
# An <li> whose class attribute holds two space-separated values.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
# Printing the parsed root only shows the Element repr.
print(html)

<Element html at 0x20b93fb0100>


In [24]:
# An exact comparison with "li" finds nothing because the attribute value
# is "li li-first".
result = html.xpath('//li[@class="li"]/a/text()')
print(result)

[]


In [26]:
# contains() matches when the class attribute value contains "li".
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)

['first item']


### 12. 多属性匹配

In [27]:
# An <li> carrying both a multi-valued class and a name attribute.
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
# Two conditions joined with 'and': class contains "li" AND name equals "item".
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
print(result)

['first item']


### 13. 按序选择

In [28]:
# Five-item sample list used for positional selection.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">forth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)
print(html)

<Element html at 0x20b940a6b40>


In [29]:
# XPath positions start at 1: [1] is the first <li>.
result = html.xpath('//li[1]/a/text()')
print(result)

['first item']


In [30]:
# last() selects the final <li>.
result = html.xpath('//li[last()]/a/text()')
print(result)

['fifth item']


In [31]:
# position()<3 keeps the <li> elements at positions 1 and 2.
result = html.xpath('//li[position()<3]/a/text()')
print(result)

['first item', 'second item']


In [32]:
# last()-2 is the third-from-last <li>.
result = html.xpath('//li[last()-2]/a/text()')
print(result)

['third item']


In [33]:
# Sample list in which the first link's text is wrapped in a <span>.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">forth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)
print(html)

<Element html at 0x20b94053140>


In [34]:
# ancestor::* returns every ancestor element of the first <li>.
result = html.xpath('//li[1]/ancestor::*')
print(result)

[<Element html at 0x20b94053140>, <Element body at 0x20b940c80c0>, <Element div at 0x20b940cb0c0>, <Element ul at 0x20b92494540>]


In [35]:
# Restrict the ancestor axis to <div> elements only.
result = html.xpath('//li[1]/ancestor::div')
print(result)

[<Element div at 0x20b940cb0c0>]


In [37]:
# attribute::* returns the values of all attributes on the first <li>.
result = html.xpath('//li[1]/attribute::*')
print(result)

['item-0']


In [38]:
# child:: selects direct children; here the <a> whose href is "link1.html".
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)

[<Element a at 0x20b940b63c0>]


In [39]:
# descendant:: reaches nested nodes, such as the <span> inside the <a>.
result = html.xpath('//li[1]/descendant::span')
print(result)

[<Element span at 0x20b9407b540>]


In [40]:
# following::* returns every element that comes after the first <li>:
# the remaining <li> elements and the <a> elements inside them.
result = html.xpath('//li[1]/following::*')
print(result)

[<Element li at 0x20b940c6400>, <Element a at 0x20b940b52c0>, <Element li at 0x20b940a6b40>, <Element a at 0x20b92563180>, <Element li at 0x20b92483780>, <Element a at 0x20b940f20c0>, <Element li at 0x20b940f3440>, <Element a at 0x20b940f3700>]


In [41]:
# Index 1 of the following:: result is an <a> element.
print(result[1])

<Element a at 0x20b940b52c0>


In [43]:
# Index 2 of the following:: result is the next <li> element.
print(result[2])

<Element li at 0x20b940a6b40>


In [45]:
# An XPath starting with '//' is absolute: it searches the whole document
# even when called on an element. Prefix it with '.' to search relative to
# result[1], so only that <a> element's own text is returned.
print(result[1].xpath('.//text()'))

['second item']


In [46]:
# A positional predicate on the axis: the second node following the first <li>.
result = html.xpath('//li[1]/following::*[2]')
print(result)

[<Element a at 0x20b940b52c0>]


In [47]:
# Read the text of the second node following the first <li>.
result = html.xpath('//li[1]/following::*[2]/text()')
print(result)

['second item']


In [48]:
# following-sibling::* returns only later siblings of the first <li>,
# i.e. the other four <li> elements.
result = html.xpath('//li[1]/following-sibling::*')
print(result)

[<Element li at 0x20b940f2180>, <Element li at 0x20b940f0fc0>, <Element li at 0x20b940f2440>, <Element li at 0x20b940f2dc0>]


### 4.2 使用 Beautiful Soup

In [49]:
from bs4 import BeautifulSoup

# Build a soup using the 'lxml' parser and read the text of the first <p>.
soup = BeautifulSoup('<p>Hello</p>', 'lxml')
print(soup.p.string)

Hello


### 4. 基本用法

In [52]:
# Sample document; the closing </body> and </html> tags are missing.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Printing the soup shows the parsed document with the missing closing tags added.
soup = BeautifulSoup(html, 'lxml')
print(soup)

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>


In [53]:
# prettify() renders the tree with each node on its own line, indented by depth.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dormouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [54]:
# .string gives the text inside the <title> tag.
print(soup.title.string)

The Dormouse's story


### 5. 节点选择器

In [55]:
# Attribute access by tag name returns the <title> element itself.
print(soup.title)

<title>The Dormouse's story</title>


In [56]:
# The selected node is a bs4.element.Tag.
print(type(soup.title))

<class 'bs4.element.Tag'>


In [57]:
# .string on a Tag returns its text content.
print(soup.title.string)

The Dormouse's story


In [58]:
# Selecting <head> returns the tag together with its nested <title>.
print(soup.head)

<head><title>The Dormouse's story</title></head>


In [59]:
# With several <p> tags present, soup.p returns only the first one.
print(soup.p)

<p class="title" name="dormouse"><b>The Dormouse's story</b></p>


In [60]:
# .name is the tag's name.
print(soup.title.name)

title


In [62]:
# .attrs is a dict of the tag's attributes; 'class' comes back as a list.
print(soup.p.attrs)

{'class': ['title'], 'name': 'dormouse'}


In [63]:
# Look up a single attribute through the attrs dict.
print(soup.p.attrs['name'])

dormouse


In [64]:
# Shorthand: index the tag directly by attribute name.
print(soup.p['name'])

dormouse


In [65]:
# The class attribute is returned as a list, not a plain string.
print(soup.p['class'])

['title']


In [66]:
# The first <p> holds only a <b> child, and .string returns that child's text.
print(soup.p.string)

The Dormouse's story


In [67]:
# Tag selection can be chained: the <title> inside <head>.
print(soup.head.title)

<title>The Dormouse's story</title>


In [68]:
# A chained selection still yields a bs4.element.Tag.
print(type(soup.head.title))

<class 'bs4.element.Tag'>


In [69]:
# Text of the <title> reached through the chained selection.
print(soup.head.title.string)

The Dormouse's story


#### （1）子节点和子孙节点  
可以通过调用 contents 属性来获取节点元素的直接子节点  

In [74]:
# Sample document whose first <p> mixes text nodes and <a> tags.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# .contents returns the direct children of the first <p> as a list of
# strings and tags.
print(soup.p.contents)

['\n    Once upon a time there were three little sisters; and their names were\n    ', <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>, '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, '\nand\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, '\nand they lived at the bottom of a well.\n']


In [75]:
# .children is an iterator, so printing it shows only the iterator object.
print(soup.p.children)

<list_iterator object at 0x0000020B92657F10>


In [76]:
# Walk the direct children of the first <p>, printing each with its index.
for i, child in enumerate(soup.p.children):
    print(i, child)

0 
    Once upon a time there were three little sisters; and their names were
    
1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2 

3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
4 
and

5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
6 
and they lived at the bottom of a well.



In [77]:
# Iterate the direct children without indices.
for child in soup.p.children:
    print(child)


    Once upon a time there were three little sisters; and their names were
    
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>


<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

and they lived at the bottom of a well.



In [80]:
# Materialize the (index, child) pairs; as the last expression of the cell
# the list is displayed as the cell's output.
list(enumerate(soup.p.children))

[(0,
  '\n    Once upon a time there were three little sisters; and their names were\n    '),
 (1,
  <a class="sister" href="http://example.com/elsie" id="link1">
  <span>Elsie</span>
  </a>),
 (2, '\n'),
 (3, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>),
 (4, '\nand\n'),
 (5, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>),
 (6, '\nand they lived at the bottom of a well.\n')]

In [81]:
# Print each direct child of the first <p> alongside its index.
for i, child in enumerate(soup.p.children):
    print(i, child)

0 
    Once upon a time there were three little sisters; and their names were
    
1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2 

3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
4 
and

5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
6 
and they lived at the bottom of a well.



In [82]:
# .descendants is a generator, so printing it shows only the generator object.
print(soup.p.descendants)

<generator object Tag.descendants at 0x0000020B94260580>


`descendants` 属性可以获取节点的所有子孙节点

In [83]:
# Walk every descendant of the first <p>: nested tags such as <span> and
# the text inside each tag are yielded as separate nodes.
for i, child in enumerate(soup.p.descendants):
    print(i, child)

0 
    Once upon a time there were three little sisters; and their names were
    
1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2 

3 <span>Elsie</span>
4 Elsie
5 

6 

7 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
8 Lacie
9 
and

10 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
11 Tillie
12 
and they lived at the bottom of a well.



#### （2）父节点和祖先节点

In [84]:
# Sample document with one <a> nested inside a <p class="story">.
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
    Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# .parent is the direct parent of the first <a>, here the enclosing <p>.
print(soup.a.parent)

<p class="story">
    Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>


In [85]:
# Minimal document: <html> > <body> > <p> > <a> > <span>.
html = """
<html>
<body>
<p class="story">
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
</p>
"""
soup = BeautifulSoup(html, 'lxml')
# .parents is a generator, so printing it shows only the generator object.
print(soup.a.parents)

<generator object PageElement.parents at 0x0000020B942627A0>


In [86]:
# Confirm that .parents is a generator.
print(type(soup.a.parents))

<class 'generator'>


In [87]:
# Listing the generator shows the ancestors from nearest outward: <p>, <body>,
# <html>, and a final entry that prints like <html> again (presumably the
# BeautifulSoup document object itself -- TODO confirm).
print(list(enumerate(soup.a.parents)))

[(0, <p class="story">
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>), (1, <body>
<p class="story">
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
</body>), (2, <html>
<body>
<p class="story">
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
</body></html>), (3, <html>
<body>
<p class="story">
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
</p>
</body></html>)]


#### （3）兄弟节点

In [88]:
# Sample <p> in which text nodes and <a> tags alternate as siblings.
html = """
<html>
<body>
<p class="story">
    Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
            Hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
</p>
"""
soup = BeautifulSoup(html, 'lxml')
# next_sibling is the node right after the first <a>; here it is the text
# node containing "Hello", not the next <a> tag.
print('Next Sibling', soup.a.next_sibling)

Next Sibling 
            Hello



In [89]:
# previous_sibling is the node right before the first <a>: the leading text.
print('Prev Sibling', soup.a.previous_sibling)

Prev Sibling 
    Once upon a time there were three little sisters; and their names were



In [90]:
# next_siblings yields every sibling after the first <a>, text nodes included.
print('Next Siblings', list(enumerate(soup.a.next_siblings)))

Next Siblings [(0, '\n            Hello\n'), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, '\n            and\n'), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, '\n            and they lived at the bottom of a well.\n')]


In [91]:
# previous_siblings yields every sibling before the first <a>; only the
# leading text node exists here.
print('Prev Sibling', list(enumerate(soup.a.previous_siblings)))

Prev Sibling [(0, '\n    Once upon a time there were three little sisters; and their names were\n')]


#### （4）提取信息

In [92]:
# Sample <p> where the two <a> tags are adjacent with no text between them.
html = """
<html>
<body>
<p class="story">
            Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Bob</a><a href="http://example.com/lacie"
class="sister" id="link2">Lacie</a>
</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Label for the sibling outputs printed by the following cells.
print('Next Sibling:')

Next Sibling:


In [93]:
# With no text in between, the next sibling of the first <a> is a Tag.
print(type(soup.a.next_sibling))

<class 'bs4.element.Tag'>


In [94]:
# The sibling itself: the second <a> element.
print(soup.a.next_sibling)

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


In [95]:
# Because the sibling is a Tag, its text is available through .string.
print(soup.a.next_sibling.string)

Lacie


In [96]:
# Label for the parent outputs printed by the following cells.
print('Parent:')

Parent:


In [97]:
# .parents is a generator and must be consumed to reach a node.
print(type(soup.a.parents))

<class 'generator'>


In [98]:
# Take the first item yielded by the parents generator: the nearest
# ancestor of the first <a>, i.e. the enclosing <p>.
print(next(soup.a.parents))

<p class="story">
            Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Bob</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
</p>


In [99]:
# Read the class attribute of the nearest ancestor of the first <a>.
print(next(soup.a.parents).attrs['class'])

['story']


#### 6. 方法选择器  
* __find_all()__

In [100]:
# Panel markup containing two <ul> lists of <li class="element"> items.
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# find_all(name=...) returns a list of every tag with that name.
print(soup.find_all(name='ul'))

[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]


In [101]:
# Each item in the find_all() result is a bs4.element.Tag.
print(type(soup.find_all(name='ul')[0]))

<class 'bs4.element.Tag'>


In [102]:
# Tags support find_all() too, so queries can be nested: the <li> items
# of each <ul>.
for ul in soup.find_all(name='ul'):
    print(ul.find_all(name='li'))

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]


In [103]:
# For every <ul>, show its <li> items as a list and then the text of each one.
for ul in soup.find_all(name='ul'):
    items = ul.find_all(name='li')
    print(items)
    for li in items:
        print(li.string)

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
Foo
Bar
Jay
[<li class="element">Foo</li>, <li class="element">Bar</li>]
Foo
Bar


In [104]:
# Same panel markup, with a name="elements" attribute on the first <ul>.
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# attrs= filters by attribute values given as a dict.
print(soup.find_all(attrs={'id': 'list-1'}))

[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]


In [105]:
# Filter on the name attribute through the attrs dict.
print(soup.find_all(attrs={'name': 'elements'}))

[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]


In [106]:
# The id attribute can be passed directly as a keyword argument.
print(soup.find_all(id='list-1'))

[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]


In [107]:
# 'class' is a Python keyword, so the keyword argument is spelled class_.
print(soup.find_all(class_='element'))

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]


In [108]:
import re
html = '''
<div class="panel">
<div class="panel-body">
<a>Hello, this is a link</a>
<a>Hello, this is a link, too</a>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# Match text nodes against a compiled regex. The keyword is `string=`;
# the older `text=` spelling is deprecated in current Beautiful Soup.
print(soup.find_all(string=re.compile('link')))

['Hello, this is a link', 'Hello, this is a link, too']


* __find()__  
find() 方法返回的是单个元素，也就是第一个匹配的元素，而 find_all() 方法返回的是所有匹配的元素组成的列表。

In [109]:
# Panel markup containing two <ul> lists of <li class="element"> items.
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# find() returns only the first matching tag rather than a list.
print(soup.find(name='ul'))

<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>


In [110]:
# The find() result is a single bs4.element.Tag.
print(type(soup.find(name='ul')))

<class 'bs4.element.Tag'>


In [111]:
# find() by class returns the first tag whose class includes "list".
print(soup.find(class_='list'))

<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>


#### 7. CSS 选择器

In [112]:
# Panel markup containing two <ul> lists of <li class="element"> items.
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
# select() takes a CSS selector and returns a list of matching tags.
print(soup.select('.panel'))

[<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>]


In [113]:
# Class selector: tags with class "panel-heading".
print(soup.select('.panel-heading'))

[<div class="panel-heading">
<h4>Hello</h4>
</div>]


In [114]:
# Descendant selector: .panel-heading nodes located inside a .panel node.
print(soup.select('.panel .panel-heading'))

[<div class="panel-heading">
<h4>Hello</h4>
</div>]


In [115]:
# All <li> elements located inside any <ul>.
print(soup.select('ul li'))

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]


In [116]:
# Id selector plus class selector: .element nodes inside the node with id "list-2".
print(soup.select('#list-2 .element'))

[<li class="element">Foo</li>, <li class="element">Bar</li>]


In [117]:
# Tag selector: every <ul> in the document.
print(soup.select('ul'))

[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]


In [118]:
# select() returns a list, so individual matches are reached by index.
print(soup.select('ul')[0])

<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>


In [119]:
# Each match is a bs4.element.Tag.
print(type(soup.select('ul')[0]))

<class 'bs4.element.Tag'>


In [120]:
# select() can be nested: the <li> items within each <ul>.
for ul in soup.select('ul'):
    print(ul.select('li'))

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]


In [121]:
# Print each matched <ul> in full.
for ul in soup.select('ul'):
    print(ul)

<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>


In [122]:
# For each <ul>, show its type and read its id attribute by indexing the tag.
for ul in soup.select('ul'):
    print(type(ul))
    print(ul['id'])

<class 'bs4.element.Tag'>
list-1
<class 'bs4.element.Tag'>
list-2


In [124]:
# Two equivalent ways to read an attribute: indexing the tag, or going
# through its attrs dict.
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])

list-1
list-1
list-2
list-2


In [125]:
# get_text() and .string give the same text for these simple <li> items.
for li in soup.select('li'):
    print('Get Text:', li.get_text())
    print('String:', li.string)

Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
Get Text: Jay
String: Jay
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar


### 4.3 使用 pyquery

#### 2. 初始化  
* __字符串初始化__

In [1]:
# Import first so the cell's dependency is visible up front.
from pyquery import PyQuery as pq

html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
# Build a PyQuery document from the string, then select every <li> with a
# CSS selector.
doc = pq(html)
print(doc('li'))

<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



* **URL 初始化**

In [2]:
# Passing url= makes pyquery fetch the page and parse the response;
# the <title> element is then selected from it.
doc = pq(url='https://cuiqingcai.com')
print(doc('title'))

<title>静觅丨崔庆才的个人站点 - Python爬虫教程</title>
  


In [3]:
import requests

# requests.get() returns a Response object. A timeout is passed because requests
# otherwise waits indefinitely when the server never answers.
type(requests.get('https://cuiqingcai.com', timeout=10))

requests.models.Response

In [4]:
# The Response repr shows the HTTP status code. The timeout keeps the cell from
# hanging forever on an unresponsive server.
requests.get('https://cuiqingcai.com', timeout=10)

<Response [200]>

In [5]:
# Response.text is the decoded body as a str. The timeout keeps the cell from
# hanging forever on an unresponsive server.
type(requests.get('https://cuiqingcai.com', timeout=10).text)

str

In [None]:
# Echo the decoded response body (this cell has no recorded output).
# The timeout keeps the cell from hanging forever on an unresponsive server.
requests.get('https://cuiqingcai.com', timeout=10).text

In [7]:
# Print the decoded response body: the raw HTML of the page.
# The timeout keeps the cell from hanging forever on an unresponsive server.
print(requests.get('https://cuiqingcai.com', timeout=10).text)

<!DOCTYPE html>
<html lang="zh-CN">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
  <meta name="theme-color" content="#222">
  <meta name="generator" content="Hexo 4.2.1">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/safari-pinned-tab.svg" color="#222">
  <meta http-equiv="Cache-Control" content="no-transform">
  <meta http-equiv="Cache-Control" content="no-siteapp">
  <link rel="stylesheet" href="/css/main.css">
  <link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">
  <link rel="stylesheet" href="/lib/pace/pace-theme-minimal.min.css">
  <script src="/lib/pace/pace.min.js"></script>
  <script id="hexo-configurations">
    var NexT = window.NexT 

In [9]:
# The download is done explicitly with requests and only the decoded body text
# is handed to PyQuery, which then parses it like any other HTML string.
response = requests.get('https://cuiqingcai.com', timeout=10)  # bounded wait instead of hanging forever
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
doc = pq(response.text)
print(doc('title'))

<title>静觅丨崔庆才的个人站点 - Python爬虫教程</title>
  


* **文件初始化**

In [10]:
# filename= builds the document from a local HTML file (here the relative path test.html).
doc = pq(filename='test.html')
print(doc('li'))

<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">forth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li>


#### 3. 基本 CSS 选择器

In [11]:
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = pq(html)
# Descendant selector: the node with id "container" -> the node with class "list" -> every <li> inside it.
print(doc('#container .list li'))

<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [12]:
print(type(doc('#container .list li')))

<class 'pyquery.pyquery.PyQuery'>


#### 4. 查找节点

In [14]:
# Re-parse the HTML and select the node with class "list"; the selection is itself a PyQuery object.
doc = pq(html)
items = doc('.list')
print(type(items))

<class 'pyquery.pyquery.PyQuery'>


In [15]:
print(items)

<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>



In [16]:
# find() searches below the selected .list node for nodes matching the given selector.
lis = items.find('li')
lis_type = type(lis)
print(lis_type)

<class 'pyquery.pyquery.PyQuery'>


In [17]:
print(lis)

<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



find() 的查找范围是节点的所有子孙节点，而如果我们只想查找子节点，那么可以用 children() 方法：

In [18]:
# children() with no argument returns the child nodes of the selection, again as a PyQuery object.
lis = items.children()
print(type(lis))

<class 'pyquery.pyquery.PyQuery'>


In [19]:
print(lis)

<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [20]:
# Passing a selector to children() keeps only the children that match it (here: class "active").
lis = items.children('.active')
print(lis)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>



* **父节点**

我们可以用 parent() 方法来获取某个节点的父节点，示例如下：

In [21]:
# The list is nested inside #container, which is itself nested inside .wrap,
# so .list has two levels of ancestors in this document.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
items = doc('.list')
# parent() returns the direct parent of .list (the #container div in the recorded output).
container = items.parent()
print(type(container))

<class 'pyquery.pyquery.PyQuery'>


In [22]:
print(container)

<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>



如果想获取祖先节点，可以使用 parents() 方法：不传参数时，parents() 会返回所有祖先节点；如果只想获取某个特定的祖先节点，可以向 parents() 传入 CSS 选择器进行筛选。

In [23]:
# parents() with no argument collects the ancestors of .list; the result is again a PyQuery object.
parents = items.parents()
print(type(parents))

<class 'pyquery.pyquery.PyQuery'>


In [24]:
print(parents)

<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div><div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>



In [25]:
# A selector passed to parents() narrows the ancestors down to those matching it (here: .wrap).
parent = items.parents('.wrap')
print(parent)

<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>


* **兄弟节点**

.item-0.active 表示 class 同时包含 item-0 和 active 的节点；选中该节点后，可以用 siblings() 方法获取它的所有兄弟节点。

In [27]:
# Select the <li> inside .list that carries both classes item-0 and active, then list its siblings.
li = doc('.list .item-0.active')
print(li.siblings())

<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [28]:
# siblings() also accepts a selector; only siblings with class "active" are kept.
print(li.siblings('.active'))

<li class="item-1 active"><a href="link4.html">fourth item</a></li>



In [29]:
# '.item-0' matches several <li> nodes, so siblings() is taken over a multi-node selection here;
# the recorded output lists some items more than once.
print(doc('.list .item-0').siblings())

<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



#### 5. 遍历

In [30]:
li = doc('.item-0.active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>



In [31]:
print(str(li))

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>



In [32]:
type(li)

pyquery.pyquery.PyQuery

In [33]:
type(str(li))

str

虽然 print(li) 和 print(str(li)) 的输出结果一样，但实际上二者的类型并不相同：li 是 PyQuery 对象，而 str(li) 是普通的字符串（str）。

In [34]:
# items() returns a generator over the matched nodes (see the printed type);
# being a generator, it can be iterated only once.
lis = doc('li').items()
print(type(lis))

<class 'generator'>


In [35]:
# Iterate over a fresh items() generator rather than the `lis` generator created in
# the previous cell: a generator can be consumed only once, so looping over `lis`
# would print nothing if this cell were run a second time.
# Each element yielded by items() is itself a PyQuery object.
for li in doc('li').items():
    print(li, type(li))

<li class="item-0">first item</li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
 <class 'pyquery.pyquery.PyQuery'>
<li class="item-0"><a href="link5.html">fifth item</a></li>
 <class 'pyquery.pyquery.PyQuery'>


#### 6. 获取信息  
* **获取属性**

In [36]:
# Select the <a> inside the <li> that has both the item-0 and active classes.
a = doc('.item-0.active a')
print(a, type(a))

<a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'>


In [37]:
# attr(name) reads the value of the named attribute from the selected node.
print(a.attr('href'))

link3.html


In [38]:
# Exploratory: list the attributes exposed by the PyQuery class (`pq` is the import alias for PyQuery).
dir(pq)

['Fn',
 '__add__',
 '__call__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__html__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_append',
 '_copy',
 '_css_to_xpath',
 '_extend',
 '_filter_only',
 '_get_root',
 '_next_all',
 '_prev_all',
 '_translator_class',
 '_traverse',
 '_traverse_parent_topdown',
 'addClass',
 'add_class',
 'after',
 'append',
 'appendTo',
 'append_to',
 'attr',
 'base_url',
 'before',
 'children',
 'clear',
 'clone',
 'closest',
 'contents',
 'copy',
 'count',
 'css',
 'each',
 'empty',
 

In [41]:
help(pq.attr)

Help on _element in module pyquery.pyquery:

None


In [43]:
# Attribute-style access on .attr; the recorded output is the same value as attr('href').
print(a.attr.href)

link3.html


In [44]:
# This selection matches every <a> in the document (four nodes in the recorded output).
a = doc('a')
print(a, type(a))

<a href="link2.html">second item</a><a href="link3.html"><span class="bold">third item</span></a><a href="link4.html">fourth item</a><a href="link5.html">fifth item</a> <class 'pyquery.pyquery.PyQuery'>


In [45]:
# With several matched nodes, the recorded output shows only the first node's href.
print(a.attr['href'])

link2.html


In [46]:
print(a.attr.href)

link2.html


In [47]:
# Walk the matched <a> nodes one at a time with items() so that every href is read,
# not just the first one.
for item in a.items():
    href = item.attr.href
    print(href)

link2.html
link3.html
link4.html
link5.html


* **获取文本**

In [48]:
a = doc('.item-0.active a')
print(a)

<a href="link3.html"><span class="bold">third item</span></a>


In [49]:
# text() returns the plain text inside the node; the nested <span> tag is not part of the output.
print(a.text())

third item


In [50]:
li = doc('.item-0.active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>



In [51]:
# html() returns the markup inside the node (the <li> wrapper itself is not included).
print(li.html())

<a href="link3.html"><span class="bold">third item</span></a>


In [53]:
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
li = doc('li')
# With several <li> nodes matched, html() gives the inner HTML of the first node only
# (see the recorded output).
print(li.html())

<a href="link2.html">second item</a>


In [54]:
# Unlike html(), text() covers every matched node; the recorded output is one space-separated string.
print(li.text())

second item third item fourth item fifth item


In [55]:
print(type(li.text()))

<class 'str'>


#### 7. 节点操作

* **addClass 和 removeClass**

In [56]:
li = doc('.item-0.active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>



In [57]:
# removeClass() edits the selected node in place; printing `li` afterwards shows
# the class attribute without "active".
li.removeClass('active')
print(li)

<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>



In [58]:
# addClass() puts the class back on the same node, again modifying it in place.
li.addClass('active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>



* **attr、text 和 html**

In [63]:
# A minimal document with a single <li>, used by the following cells to show in-place edits.
html = '''
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>
'''
doc = pq(html)
li = doc('.item-0.active')
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>



In [64]:
# attr(name, value) sets an attribute; name="link" appears on the node in the recorded output.
li.attr('name', 'link')
print(li)

<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>



In [65]:
# text(value) replaces the node's content with plain text; the nested <a> is gone in the recorded output.
li.text('changed item')
print(li)

<li class="item-0 active" name="link">changed item</li>



In [66]:
# html(value) replaces the node's content with the given markup.
li.html('<span>changed item</span>')
print(li)

<li class="item-0 active" name="link"><span>changed item</span></li>



所以说，如果 attr() 方法只传入第一个参数（即属性名称），则是获取相应的属性值；如果传入第二个参数，可以用来修改属性值。text() 方法和 html() 方法如果不传参数，则是获取节点内的纯文本和 HTML 文本；如果传入参数，则进行赋值。

* **remove()**

In [67]:
# .wrap holds a bare text node plus a <p>; text() on it returns the text of both.
html = '''
<div class="wrap">
    Hello, World
<p>This is a paragraph.</p>
</div>
'''
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())

Hello, World
This is a paragraph.


In [68]:
# Remove the <p> from the tree in place, so text() now returns only the text left in .wrap.
wrap.find('p').remove()
print(wrap.text())

Hello, World


#### 8. 伪类选择器

In [69]:
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# :first-child matches an <li> that is the first child of its parent.
li = doc('li:first-child')
print(li)

<li class="item-0">first item</li>



In [70]:
# :nth-child(2) matches an <li> that is the second child of its parent (positions count from 1).
li = doc('li:nth-child(2)')
print(li)

<li class="item-1"><a href="link2.html">second item</a></li>



:gt(2) 表示选择索引大于2的元素，索引是从0开始的。

In [71]:
# :gt(2) keeps the matches whose zero-based index is greater than 2, i.e. the fourth <li> onwards.
li = doc('li:gt(2)')
print(li)

<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [72]:
# :nth-child(2n) matches every <li> at an even position (2nd and 4th in the recorded output).
li = doc('li:nth-child(2n)')
print(li)

<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>



In [73]:
# :contains(second) matches the <li> whose text contains the word "second".
li = doc('li:contains(second)')
print(li)

<li class="item-1"><a href="link2.html">second item</a></li>



In [74]:
# :last-child matches an <li> that is the last child of its parent.
li = doc('li:last-child')
print(li)

<li class="item-0"><a href="link5.html">fifth item</a></li>



In [75]:
print(doc('li:gt(0)'))

<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [76]:
# The five <li> nodes have indices 0-4, so nothing has an index greater than 4: the output is empty.
print(doc('li:gt(4)'))




In [77]:
print(doc('li:gt(3)'))

<li class="item-0"><a href="link5.html">fifth item</a></li>



In [78]:
# Every index is greater than -1; the recorded output contains all five <li> nodes.
print(doc('li:gt(-1)'))

<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [79]:
doc('li:gt(2)')

[<li.item-1.active>, <li.item-0>]

In [80]:
type(doc('li:gt(2)'))

pyquery.pyquery.PyQuery

In [86]:
# Iterating a PyQuery selection directly (without items()) yields raw lxml elements,
# not PyQuery objects, as the printed types show.
for j in doc('li:gt(2)'):
    print(type(j))

<class 'lxml.etree._Element'>
<class 'lxml.etree._Element'>


In [87]:
print(doc('li'))

<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [88]:
type(doc('li'))

pyquery.pyquery.PyQuery

In [89]:
print(doc('li:gt(2)'))

<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [91]:
doc('li:gt(2)')

[<li.item-1.active>, <li.item-0>]

In [92]:
print(doc('li:gt(2)'))

<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>



In [93]:
type(doc('li:gt(2)'))

pyquery.pyquery.PyQuery

In [95]:
# Inspect the API of a raw lxml element. The element is looked up explicitly (the last
# match of 'li:gt(2)', which is what the loop variable `j` from an earlier cell ended up
# bound to) so this cell does not depend on that loop having been run beforehand.
dir(doc('li:gt(2)')[-1])

['__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_init',
 'addnext',
 'addprevious',
 'append',
 'attrib',
 'base',
 'clear',
 'cssselect',
 'extend',
 'find',
 'findall',
 'findtext',
 'get',
 'getchildren',
 'getiterator',
 'getnext',
 'getparent',
 'getprevious',
 'getroottree',
 'index',
 'insert',
 'items',
 'iter',
 'iterancestors',
 'iterchildren',
 'iterdescendants',
 'iterfind',
 'itersiblings',
 'itertext',
 'keys',
 'makeelement',
 'nsmap',
 'prefix',
 'remove',
 'replace',
 'set',
 'sourceline',
 'tag',
 'tail',
 'text',
 'values',
 'xpath']