# 解析器

In [1]:
from bs4 import BeautifulSoup

# Build a soup from a one-tag document with the lxml parser,
# then print the text held by its <p> node.
soup = BeautifulSoup(markup="<p>Hello</p>", features="lxml")
print(soup.p.string)

Hello


# 基本用法

In [3]:
# Sample document used by the following cells. It never closes <body> or
# <html>; the prettified output shows the parser supplying those end tags.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse with lxml and print the re-indented markup.
soup = BeautifulSoup(markup=html, features="lxml")
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [4]:
# Text content of the <title> node.
soup.title.string

"The Dormouse's story"

# 节点选择器

In [5]:
# Attribute access by tag name selects the <title> node.
soup.title

<title>The Dormouse's story</title>

In [6]:
# The selected node is a bs4.element.Tag instance.
type(soup.title)

bs4.element.Tag

In [7]:
# Select the <head> node; its child <title> comes along with it.
soup.head

<head><title>The Dormouse's story</title></head>

In [8]:
# The document has several <p> tags; this form returns only the first one.
soup.p

<p class="title" name="dromouse"><b>The Dormouse's story</b></p>

## 提取信息

### 获取名称

In [9]:
# Tag name of the selected node.
soup.title.name

'title'

### 获取属性

In [10]:
# All attributes of the first <p> as a dict; note that class comes back as a list.
soup.p.attrs

{'class': ['title'], 'name': 'dromouse'}

In [11]:
# Read a single attribute by subscripting the tag with the attribute name.
soup.p["name"]

'dromouse'

### 获取内容

In [12]:
# Text of the first <p>, which here comes from its only child, the <b> element.
soup.p.string

"The Dormouse's story"

## 嵌套选择

In [13]:
# Minimal document: only a <head> with a <title> (and an unclosed <body>).
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
"""

soup = BeautifulSoup(markup=html, features="lxml")
# A selected Tag supports the same attribute-style selection, so selectors chain.
head_tag = soup.head
head_tag.title

<title>The Dormouse's story</title>

In [14]:
# Chain the selectors, then read the text of the nested <title>.
soup.head.title.string

"The Dormouse's story"

## 关联选择

### 子节点和子孙节点

In [16]:
# Indented sample document: the whitespace between tags is kept as text
# nodes, which is why the listing below contains blank-looking entries.
html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""
soup = BeautifulSoup(markup=html, features="lxml")
# Number and print every direct child (tags and text nodes) of the first <p>.
direct_children = soup.p.contents
for index, node in enumerate(direct_children):
    print(index, node)

0 
            Once upon a time there were three little sisters; and their names were
            
1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2 

3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
4  
            and
            
5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
6 
            and they lived at the bottom of a well.
        


### 父节点和祖先节点

In [17]:
# Parent node: the direct parent of the first <p> (here the <body> element).
soup.p.parent

<body>
<p class="story">
            Once upon a time there were three little sisters; and their names were
            <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 
            and
            <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
<p class="story">...</p>
</body>

In [18]:
# Ancestor nodes: .parents is a generator, so this bare expression only shows
# the generator object; iterate it or wrap it in list() to see the ancestors.
soup.p.parents

<generator object parents at 0x000000000423B410>

### 兄弟节点

In [19]:
# Sample paragraph with text nodes in between the three <a> tags.
html = """
<html>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            Hello
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
"""
soup = BeautifulSoup(markup=html, features='lxml')
# All four lookups start from the first <a> in the document.
first_link = soup.a
# Immediate neighbours on either side.
print('Next Sibling', first_link.next_sibling)
print('Prev Sibling', first_link.previous_sibling)
# Every following / preceding sibling, numbered in traversal order.
following = list(enumerate(first_link.next_siblings))
preceding = list(enumerate(first_link.previous_siblings))
print('Next Siblings', following)
print('Prev Siblings', preceding)

Next Sibling 
            Hello
            
Prev Sibling 
            Once upon a time there were three little sisters; and their names were
            
Next Siblings [(0, '\n            Hello\n            '), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' \n            and\n            '), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, '\n            and they lived at the bottom of a well.\n        ')]
Prev Siblings [(0, '\n            Once upon a time there were three little sisters; and their names were\n            ')]


# 方法选择器

## find_all() 

In [None]:
# API signature, shown for reference only. It is kept as a comment because
# executing it as code raises NameError (find_all is a method of a soup/Tag
# object, and the parameter names are not defined variables):
#
#   find_all(name, attrs, recursive, text, **kwargs)

### name

In [20]:
# Panel markup with two <ul> lists, reused by the find_all() examples.
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(markup=html, features="lxml")
# Collect every <ul> by tag name, print the whole result, then display the
# first match as the cell's value.
ul_tags = soup.find_all(name="ul")
print(ul_tags)
ul_tags[0]

[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]


<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>

In [23]:
# find_all() can be called again on each result: walk every <ul>, then every
# <li> inside it, printing the item's text.
for ul_tag in soup.find_all(name="ul"):
    items = ul_tag.find_all(name="li")
    for item in items:
        print(item.string)

Foo
Bar
Jay
Foo
Bar


### attrs

In [25]:
# Match on attribute values given as a dict: elements whose id is "list-1".
soup.find_all(attrs={"id": "list-1"})

[<ul class="list" id="list-1">
 <li class="element">Foo</li>
 <li class="element">Bar</li>
 <li class="element">Jay</li>
 </ul>]

In [28]:
# Same kind of lookup by class: every element whose class is "element".
soup.find_all(attrs={"class": "element"})

[<li class="element">Foo</li>,
 <li class="element">Bar</li>,
 <li class="element">Jay</li>,
 <li class="element">Foo</li>,
 <li class="element">Bar</li>]

### text

传入的形式可以是字符串，也可以是正则表达式对象

In [29]:
import re
# Two links whose text both contain the word "link".
html='''
<div class="panel">
    <div class="panel-body">
        <a>Hello, this is a link</a>
        <a>Hello, this is a link, too</a>
    </div>
</div>
'''
soup = BeautifulSoup(html, "lxml")
# Match on node text with a compiled regex. `string` is the current name of
# the argument formerly called `text`; the old spelling is deprecated, and the
# matching behaviour (and this cell's output) is the same.
print(soup.find_all(string=re.compile("link")))

['Hello, this is a link', 'Hello, this is a link, too']


## find()

与find_all()相似，只是返回的是第一个匹配的元素

# CSS选择器

## select

In [30]:
# Panel markup with two <ul> lists, reused by the CSS selector examples.
html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(markup=html, features="lxml")
# Descendant selectors by class, by tag, and by id + class; each prints the
# full list of matches.
for css_query in (".panel .panel-heading", "ul li", "#list-2 .element"):
    print(soup.select(css_query))
# select() returns a list, so a single match is picked by index.
ul_matches = soup.select("ul")
print(ul_matches[0])

[<div class="panel-heading">
<h4>Hello</h4>
</div>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>


## 嵌套选择器

In [31]:
# select() also works on an individual tag: query the <li> items of each <ul>.
for ul_tag in soup.select("ul"):
    li_tags = ul_tag.select("li")
    print(li_tags)

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]


## 获取属性

In [32]:
# Tags returned by select() expose attributes through subscripting.
for ul_tag in soup.select("ul"):
    list_id = ul_tag["id"]
    print(list_id)

list-1
list-2


## 获取文本

In [33]:
# Print the text of every <li> matched by the CSS selector.
for item in soup.select("li"):
    item_text = item.string
    print(item_text)

Foo
Bar
Jay
Foo
Bar
