In [None]:
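# XPath quick reference
# nodename  selects all child nodes of the named node
# /         selects direct children of the current node
# //        selects descendants of the current node
# .         selects the current node
# ..        selects the parent of the current node
# @         selects an attribute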

In [19]:
# Example: parse an HTML fragment with lxml and print the resulting tree.
from lxml import etree
text = '''
    <div>
        <ul>
            <li class="li item-0" name="item"><a href="link0.html">first item</a></li>
            <li class="item-1"><a href="link1.html">seccond item</a></li>
            <li class="item-2"><a href="link2.html">third item</a></li>
            <li class="item-3"><a href="link3.html">fourth item</a></li>
            <li class="item-4"><a href="link4.html">fifth item</a></li>
        </ul>
    </div>
'''
# Build an element tree from the fragment; the printed result shows the
# fragment wrapped in <html><body>...</body></html>.
html = etree.HTML(text)
# The serialized tree is decoded to str before printing.
result = etree.tostring(html)
print(result.decode('utf-8'))

<html><body><div>
        <ul>
            <li class="li item-0" name="item"><a href="link0.html">first item</a></li>
            <li class="item-1"><a href="link1.html">seccond item</a></li>
            <li class="item-2"><a href="link2.html">third item</a></li>
            <li class="item-3"><a href="link3.html">fourth item</a></li>
            <li class="item-4"><a href="link4.html">fifth item</a></li>
        </ul>
    </div>
</body></html>


In [20]:
# All nodes: '//*' matches every element in the document.
html.xpath('//*')

[<Element html at 0x7f22883a5d70>,
 <Element body at 0x7f22884dbf00>,
 <Element div at 0x7f22884db960>,
 <Element ul at 0x7f22884dbeb0>,
 <Element li at 0x7f22884dbaf0>,
 <Element a at 0x7f22884db4b0>,
 <Element li at 0x7f22884db140>,
 <Element a at 0x7f22884db2d0>,
 <Element li at 0x7f22884dbb40>,
 <Element a at 0x7f22884dbbe0>,
 <Element li at 0x7f2288779f00>,
 <Element a at 0x7f2288779230>,
 <Element li at 0x7f2288779b90>,
 <Element a at 0x7f2288779cd0>]

In [21]:
# Child nodes: every <a> that is a direct child of an <li>.
html.xpath('//li/a')

[<Element a at 0x7f22884db4b0>,
 <Element a at 0x7f22884db2d0>,
 <Element a at 0x7f22884dbbe0>,
 <Element a at 0x7f2288779230>,
 <Element a at 0x7f2288779cd0>]

In [22]:
# Parent node: '..' steps from the matched <a> up to its parent,
# then '@class' reads that parent's class attribute.
html.xpath('//a[@href="link4.html"]/../@class')

['item-4']

In [24]:
# Text selection: text() returns the text content of the matched <a>.
html.xpath('//li[@class="item-2"]/a/text()')

['third item']

In [25]:
# Attribute retrieval: '@href' returns the href value of every <a> under an <li>.
html.xpath('//li/a/@href')

['link0.html', 'link1.html', 'link2.html', 'link3.html', 'link4.html']

In [26]:
# Multi-valued attribute: the first <li> has class "li item-0", so an exact
# @class="li" comparison would not match it; contains() tests whether the
# attribute string contains "li".
html.xpath('//li[contains(@class, "li")]/a/text()')

['first item']

In [29]:
# Multiple attributes: combine two attribute conditions with 'and'.
html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')

['first item']

In [31]:
# Positional selection: [1] picks the first <li>, [last()] the last one.
# NOTE: only the final expression of a cell is displayed, so the result of the
# first query is computed and discarded; the output below is for last().
html.xpath('//li[1]/a/text()')
html.xpath('//li[last()]/a/text()')

['fifth item']

In [32]:
# Axis selection: 'ancestor::*' returns every ancestor element of the first <li>.
html.xpath('//li[1]/ancestor::*')

[<Element html at 0x7f22883a5d70>,
 <Element body at 0x7f22884dbf00>,
 <Element div at 0x7f22884db960>,
 <Element ul at 0x7f22884dbeb0>]

In [33]:
# Axis selection: 'attribute::*' returns all attribute values of the first <li>.
html.xpath('//li[1]/attribute::*')

['li item-0', 'item']

In [34]:
# Beautiful Soup
# Supported parsers and the name passed as the second argument to BeautifulSoup():
# Python standard library  'html.parser'
# lxml HTML parser         'lxml'
# lxml XML parser          'xml'
# html5lib                 'html5lib'

In [64]:
# Sample markup for the Beautiful Soup examples. It is deliberately left
# without closing </body></html> tags; the prettified output shows the parser
# supplying them.
# NOTE(review): the second paragraph ends with an opening <p> rather than </p>,
# which is why an extra empty <p> appears in the parsed output. Confirm
# whether that is intentional or a typo.
bs4_html = """
    <html><head><title>The story</title></head>
    <body>
        <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
        <p class="story"><a href="https://www.python.org">python</a><p>
"""

In [65]:
# Parse the sample markup with the 'lxml' parser and print an indented view
# of the resulting tree.
from bs4 import BeautifulSoup
soup = BeautifulSoup(bs4_html, 'lxml')
print(soup.prettify())

<html>
 <head>
  <title>
   The story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   <a href="https://www.python.org">
    python
   </a>
  </p>
  <p>
  </p>
 </body>
</html>


In [42]:
# Node selectors: access tags as attributes of the soup object.
# NOTE: only the final expression (soup.p) is displayed; the two lines before
# it are evaluated and discarded.
soup.title.string
soup.head
soup.p

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

In [46]:
# Extracting information: tag name.
# NOTE: only the final expression (soup.p.string) is displayed by this cell.
soup.title.name
# Reading attributes: via the attrs mapping or by indexing the tag directly.
soup.p.attrs['name']
soup.p['name']
soup.p.string

"The Dormouse's story"

In [47]:
# Nested selection: chain tag accessors to reach <title> inside <head>.
soup.head.title

<title>The story</title>

In [54]:
# Direct children of the first <p>.
# The .contents value is not displayed (it is not the last expression); the
# loop below prints each direct child with its index instead.
soup.p.contents
for i, child in enumerate(soup.p.children):
    print(i, child)

0 <b>The Dormouse's story</b>


In [56]:
# All descendants of the first <p>: the output lists the <b> tag and then
# the text inside it.
for i, child in enumerate(soup.p.descendants):
    print(i, child)

0 <b>The Dormouse's story</b>
1 The Dormouse's story


In [58]:
# Parent node of the first <p>.
# NOTE(review): the recorded output shows a <body> containing a single <p>,
# which does not match bs4_html as defined in this notebook; it looks like
# output from an earlier kernel state. Re-run the notebook to refresh it.
soup.p.parent

<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
</body>

In [60]:
# Ancestor nodes: .parents is consumed with list(enumerate(...)) so every
# ancestor of the first <p> is shown with its index.
list(enumerate(soup.p.parents))

[(0, <body>
  <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
  </body>), (1, <html><head><title>The story</title></head>
  <body>
  <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
  </body></html>), (2, <html><head><title>The story</title></head>
  <body>
  <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
  </body></html>)]

In [67]:
# Sibling nodes of the first <p>.
# The original cell evaluated next_sibling and previous_sibling twice each;
# the duplicates are replaced by the plural generator forms so the cell covers
# all four sibling accessors. Only the final expression is displayed, and it
# is unchanged.
soup.p.next_sibling                 # the node immediately after the first <p>
list(soup.p.next_siblings)          # every following sibling (generator -> list)
list(soup.p.previous_siblings)      # every preceding sibling (generator -> list)
soup.p.previous_sibling             # the node immediately before the first <p>

'\n'

In [69]:
# Extracting information: the class attribute comes back as a list
# (see the output below).
soup.p.attrs["class"]

['title']

In [70]:
# Method selectors: find_all(name=...) returns every tag with that name.
soup.find_all(name="p")

[<p class="title" name="dromouse"><b>The Dormouse's story</b></p>,
 <p class="story"><a href="https://www.python.org">python</a></p>,
 <p>
 </p>]

In [71]:
# Filter by attributes: every tag whose class is "story".
soup.find_all(attrs={"class": "story"})

[<p class="story"><a href="https://www.python.org">python</a></p>]

In [72]:
# Filter by text content: return the strings that match the regex.
# `string=` is the current keyword for this filter; `text=` is the legacy
# alias (renamed in Beautiful Soup 4.4.0 and deprecated in recent releases).
import re
soup.find_all(string=re.compile("python"))

['python']

In [73]:
# find() returns a single tag rather than a list; the output below is the
# first <p> in the document.
soup.find(name="p")

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

In [None]:
# page 182.