# Beautiful Soup

[Beautiful Soup 官方文檔](https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/)

[parsers 介紹](https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id12)

[parsers 之間的差別](https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id53)

```
官方推薦使用 lxml 解析器（lxml 不是內建套件，需另外安裝：pip install lxml）
```

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Sample HTML page (header link, image list, three articles, footer) that the
# parsing examples in this notebook operate on.
html_doc = """
<html>
  <head>
    <title>網頁標題</title>
  </head>
  <body>
    <div id="header">
      <a class="logo" href="https://test.com.tw/logo-link">
        PTT
      </a>
    </div>
    <div class="images">
      <img src="0.png" />
      <img src="01.png" />
      <img src="02.png" />
      <img src="03.png" />
      <img src="11.png" />
      <img src="12.png" />
      <img src="13.png" />
      <img src="01111.png" />
    </div>
    <div id="container">
      <h2 id="title">八卦版</h2>
      <div class="article a1">
        <h3 class="title t1">文章標題1</h3>
        <div class="author">作者1</div>
        <div class="date">11/01</div>
      </div>
      <div class="article a2">
        <h3 class="title t2">文章標題2</h3>
        <div class="author">作者2</div>
        <div class="date">11/02</div>
      </div>
      <div class="article a3">
        <h3 class="title t3">文章標題3</h3>
        <div class="author">作者3</div>
        <div class="date">11/03</div>
      </div>
    </div>
    <div id="footer">
      <p class="address">臺北市信義區</p>
      <p class="copyright">&copy; Copyright 2019</p>
    </div>
  </body>
</html>
"""

In [3]:
# Parse the sample markup into a navigable tree, using lxml as the parser.
soup = BeautifulSoup(html_doc, features="lxml")

In [4]:
# A bare name as the last expression of a cell makes the notebook display
# the parsed document.
soup

<html>
<head>
<title>網頁標題</title>
</head>
<body>
<div id="header">
<a class="logo" href="https://test.com.tw/logo-link">
        PTT
      </a>
</div>
<div class="images">
<img src="0.png"/>
<img src="01.png"/>
<img src="02.png"/>
<img src="03.png"/>
<img src="11.png"/>
<img src="12.png"/>
<img src="13.png"/>
<img src="01111.png"/>
</div>
<div id="container">
<h2 id="title">八卦版</h2>
<div class="article a1">
<h3 class="title t1">文章標題1</h3>
<div class="author">作者1</div>
<div class="date">11/01</div>
</div>
<div class="article a2">
<h3 class="title t2">文章標題2</h3>
<div class="author">作者2</div>
<div class="date">11/02</div>
</div>
<div class="article a3">
<h3 class="title t3">文章標題3</h3>
<div class="author">作者3</div>
<div class="date">11/03</div>
</div>
</div>
<div id="footer">
<p class="address">臺北市信義區</p>
<p class="copyright">© Copyright 2019</p>
</div>
</body>
</html>

In [5]:
# Accessing a tag name as an attribute returns the first tag with that name.
for first_tag in (soup.title, soup.a):
    print(first_tag)

<title>網頁標題</title>
<a class="logo" href="https://test.com.tw/logo-link">
        PTT
      </a>


In [6]:
# .text gives a tag's text content; attributes can be read with .get() or
# with dictionary-style indexing.
first_link = soup.a
print(soup.title.text)
print(first_link.get("href"))
print(first_link["href"])

網頁標題
https://test.com.tw/logo-link
https://test.com.tw/logo-link


In [7]:
# Four lookups that all return the <h2 id="title"> element of this document.
h2_results = (
    soup.find("h2"),
    soup.find(id="title"),
    soup.find("h2", id="title"),
    soup.find("h2", {"id": "title"}),
)
print(*h2_results, sep="\n")
print('--------')
# Five lookups that all return the first <h3> carrying the "title" class.
h3_results = (
    soup.find("h3"),
    soup.find(class_="title"),
    soup.find("h3", class_="title"),
    soup.find("h3", {"class": "title"}),
    # A bare string as the second argument is matched against the CSS class.
    soup.find("h3", "title"),
)
print(*h3_results, sep="\n")

<h2 id="title">八卦版</h2>
<h2 id="title">八卦版</h2>
<h2 id="title">八卦版</h2>
<h2 id="title">八卦版</h2>
--------
<h3 class="title t1">文章標題1</h3>
<h3 class="title t1">文章標題1</h3>
<h3 class="title t1">文章標題1</h3>
<h3 class="title t1">文章標題1</h3>
<h3 class="title t1">文章標題1</h3>


## 取得多個項目

In [8]:
# Any of the find() calls can be switched to find_all() to get every match
# (an id is unique within a page, whereas a class can be shared by many tags).
title_tags = soup.find_all(class_="title")
print(title_tags)

[<h3 class="title t1">文章標題1</h3>, <h3 class="title t2">文章標題2</h3>, <h3 class="title t3">文章標題3</h3>]


In [9]:
# A class_ value that names several classes is compared with the class
# attribute as one string, so writing the classes in a different order
# finds nothing.
for class_value in ("title t1", "t1 title"):
    print(soup.find_all(class_=class_value))

[<h3 class="title t1">文章標題1</h3>]
[]


## 使用 select

In [10]:
# Filter with a CSS selector instead of find()/find_all() arguments.
title_nodes = soup.select(".title")
print(title_nodes)

[<h3 class="title t1">文章標題1</h3>, <h3 class="title t2">文章標題2</h3>, <h3 class="title t3">文章標題3</h3>]


In [11]:
# With CSS selectors the order in which the classes are written does not matter.
for selector in (".t1.title", ".a1.article .t1"):
    print(soup.select(selector))

[<h3 class="title t1">文章標題1</h3>]
[<h3 class="title t1">文章標題1</h3>]


## 將所有選到的項目一個個取出來

In [12]:
# Walk through every selected tag and print only its text content.
for heading in soup.select(".title"):
    print(heading.get_text())

文章標題1
文章標題2
文章標題3


## 使用 re

In [13]:
import re

# A compiled pattern passed as class_ is searched for inside each CSS class,
# so "tle" matches every tag that has the class "title".
print(soup.find_all(class_=re.compile("tle")))
# Anchoring the pattern at the start finds nothing in this document, because
# no class begins with "tle":
# print(soup.find_all(class_=re.compile("^tle")))

# Use a regex to find every image whose src ends in ".png". The pattern is a
# raw string so that "\." reaches the regex engine instead of being treated
# as an (invalid) string escape:
# for img in soup.find_all('img', {'src': re.compile(r'\.png$')}):
#     print(img['src'])

# Use a regex to find every image whose src contains a "0" and ends in ".png":
# for img in soup.find_all('img', {'src': re.compile(r'0.*\.png$')}):
#     print(img['src'])

[<h3 class="title t1">文章標題1</h3>, <h3 class="title t2">文章標題2</h3>, <h3 class="title t3">文章標題3</h3>]


## Flask

In [14]:
# Once started, the server keeps running until it is interrupted.
# Open http://127.0.0.1:5000/ in a browser to see the page.
from flask import Flask

app = Flask(__name__)

@app.route("/")
def hello() -> str:
    """Respond to requests for "/" with a plain greeting string."""
    return "Hello World!"

if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [22/Nov/2019 10:36:21] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [22/Nov/2019 10:36:22] "GET /favicon.ico HTTP/1.1" 404 -


In [1]:
# Save the following program to a .py file and run it from there.
# Once started, the server keeps running until it is interrupted.
# Open http://127.0.0.1:5000/ in a browser to see the page.
from flask import Flask

app = Flask(__name__)

@app.route("/")
def hello() -> str:
    """Return the sample HTML page as the response body for "/"."""
    return """
    <html>
      <head>
        <title>網頁標題</title>
      </head>
      <body>
        <div id="header">
          <a class="logo" href="https://test.com.tw/logo-link">
            PTT
          </a>
        </div>
        <div class="images">
          <img src="0.png" />
          <img src="01.png" />
          <img src="02.png" />
          <img src="03.png" />
          <img src="11.png" />
          <img src="12.png" />
          <img src="13.png" />
          <img src="01111.png" />
        </div>
        <div id="container">
          <h2 id="title">八卦版</h2>
          <div class="article a1">
            <h3 class="title t1">文章標題1</h3>
            <div class="author">作者1</div>
            <div class="date">11/01</div>
          </div>
          <div class="article a2">
            <h3 class="title t2">文章標題2</h3>
            <div class="author">作者2</div>
            <div class="date">11/02</div>
          </div>
          <div class="article a3">
            <h3 class="title t3">文章標題3</h3>
            <div class="author">作者3</div>
            <div class="date">11/03</div>
          </div>
        </div>
        <div id="footer">
          <p class="address">臺北市信義區</p>
          <p class="copyright">&copy; Copyright 2019</p>
        </div>
      </body>
    </html>
"""

if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [19/Nov/2019 14:51:27] "GET / HTTP/1.1" 200 -


## 使用 requests 取得網頁上的資訊

In [15]:
import requests
from bs4 import BeautifulSoup

# Fetch the page from the local server. The timeout makes an unresponsive
# server raise requests.exceptions.Timeout instead of blocking the cell
# indefinitely (requests applies no timeout by default).
resp = requests.get('http://127.0.0.1:5000/', timeout=10)

if resp.status_code == 200:
    # The same markup can be seen in a browser at http://127.0.0.1:5000/
    # via right-click -> "View page source".
    print(resp.text)
else:
    # Report the failure instead of silently printing nothing.
    print("Request failed with status code", resp.status_code)


    <html>
      <head>
        <title>網頁標題</title>
      </head>
      <body>
        <div id="header">
          <a class="logo" href="https://test.com.tw/logo-link">
            PTT
          </a>
        </div>
        <div class="images">
          <img src="0.png" />
          <img src="01.png" />
          <img src="02.png" />
          <img src="03.png" />
          <img src="11.png" />
          <img src="12.png" />
          <img src="13.png" />
          <img src="01111.png" />
        </div>
        <div id="container">
          <h2 id="title">八卦版</h2>
          <div class="article a1">
            <h3 class="title t1">文章標題1</h3>
            <div class="author">作者1</div>
            <div class="date">11/01</div>
          </div>
          <div class="article a2">
            <h3 class="title t2">文章標題2</h3>
            <div class="author">作者2</div>
            <div class="date">11/02</div>
          </div>
          <div class="article a3">
            <h3 class="title t3"

In [16]:
# Parse the markup downloaded from the local server, using lxml as the parser.
soup = BeautifulSoup(resp.text, features="lxml")

In [17]:
# The lookups used on the local string work unchanged on the fetched page:
# first <title>, first <a>, and every element with the "title" class.
for result in (soup.title, soup.a, soup.select(".title")):
    print(result)

<title>網頁標題</title>
<a class="logo" href="https://test.com.tw/logo-link">
            PTT
          </a>
[<h3 class="title t1">文章標題1</h3>, <h3 class="title t2">文章標題2</h3>, <h3 class="title t3">文章標題3</h3>]


# 作業
- 可以試著使用 requests 去爬取其他網頁看看
- 再依照上面的方式取得想要的資訊