# 其他工具：Grab、PyQuery


* 利用 Grab 套件存取 HTML 資源
* 利用 PyQuery 套件解析 HTML 格式


## 作業目標

將之前用 requests + BeautifulSoup 實作的方式，改寫成 Grab + PyQuery，並且比較兩者有哪些地方不同。





## requests + BeautifulSoup


In [1]:
import requests
import os

from bs4 import BeautifulSoup
from PIL import Image

# Fetch one PTT Beauty article and download every imgur image it links to.
url = 'https://www.ptt.cc/bbs/Beauty/M.1556291059.A.75A.html'

# PTT asks "are you over 18?"; sending the over18 cookie bypasses that prompt.
# The timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, cookies={'over18': '1'}, timeout=30)
# Fail here with a clear HTTP error instead of later on a missing element,
# matching how the image requests below are checked.
resp.raise_for_status()

soup = BeautifulSoup(resp.text, features="lxml")

# Folder the images are saved into.
output_dir = 'downloads'

# Create the folder when it does not exist yet; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Locate the link tags that are direct children of the article body.
image_tags = soup.find(id='main-content').findChildren('a', recursive=False)
for img_tag in image_tags:
    # Only imgur links are of interest; an <a> without an href is skipped
    # instead of raising KeyError.
    href = img_tag.get('href', '')
    if 'imgur' not in href:
        continue

    print(img_tag)
    print(img_tag.attrs)
    # The last path segment of the link is the image id on the third-party host.
    img_id = href.split('/')[-1]
    print(img_id)

    # Build the URL of the image itself rather than of the imgur web page.
    img_url = 'https://i.imgur.com/{}.jpg'.format(img_id)

    # Request the image.
    with requests.get(img_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        # Let Pillow detect the actual format so the saved file extension
        # matches the image content.
        img = Image.open(r.raw)
        img_savename = '{outdir}/{img_id}.{img_ext}'.format(outdir=output_dir, img_id=img_id, img_ext=img.format.lower())
        img.save(img_savename)
        print('Save image {}'.format(img_savename))

<a href="https://imgur.com/Cgb5oo1" rel="nofollow" target="_blank">https://imgur.com/Cgb5oo1</a>
{'href': 'https://imgur.com/Cgb5oo1', 'target': '_blank', 'rel': ['nofollow']}
Cgb5oo1
Save image downloads/Cgb5oo1.jpeg
<a href="https://imgur.com/MgjHY4f" rel="nofollow" target="_blank">https://imgur.com/MgjHY4f</a>
{'href': 'https://imgur.com/MgjHY4f', 'target': '_blank', 'rel': ['nofollow']}
MgjHY4f
Save image downloads/MgjHY4f.jpeg
<a href="https://imgur.com/yeoRtAs" rel="nofollow" target="_blank">https://imgur.com/yeoRtAs</a>
{'href': 'https://imgur.com/yeoRtAs', 'target': '_blank', 'rel': ['nofollow']}
yeoRtAs
Save image downloads/yeoRtAs.jpeg
<a href="https://imgur.com/wsauNEW" rel="nofollow" target="_blank">https://imgur.com/wsauNEW</a>
{'href': 'https://imgur.com/wsauNEW', 'target': '_blank', 'rel': ['nofollow']}
wsauNEW
Save image downloads/wsauNEW.jpeg
<a href="https://imgur.com/IjiETcs" rel="nofollow" target="_blank">https://imgur.com/IjiETcs</a>
{'href': 'https://imgur.com/Iji

## Grab + PyQuery

In [2]:
import os
import requests

from PIL import Image
from grab import Grab
from pyquery import PyQuery as pq

# Folder the images are saved into.
output_dir = 'downloads'

# Create the folder when it does not exist yet; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Fetch the article with Grab, sending the over18 cookie along with the request.
g = Grab()
resp = g.go('https://www.ptt.cc/bbs/Beauty/M.1556291059.A.75A.html', cookies={'over18': '1'})
# Select the <blockquote class="imgur-embed-pub"> elements in the page.
doc = pq(resp.body)('blockquote').filter('.imgur-embed-pub')

for item in doc.items():
    img_id = item.attr("data-id")
    # attr() yields None when the attribute is absent; skip such elements
    # rather than requesting a bogus ".../None.jpg" URL.
    if not img_id:
        continue

    # Build the URL of the image itself rather than of the imgur web page.
    img_url = 'https://i.imgur.com/{}.jpg'.format(img_id)

    # Request the image; the timeout keeps a stalled connection from
    # hanging the cell forever.
    with requests.get(img_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        # Let Pillow detect the actual format so the saved file extension
        # matches the image content.
        img = Image.open(r.raw)
        img_savename = '{outdir}/{img_id}.{img_ext}'.format(outdir=output_dir, img_id=img_id, img_ext=img.format.lower())
        img.save(img_savename)
        print('Save image {}'.format(img_savename))


Save image downloads/Cgb5oo1.jpeg
Save image downloads/MgjHY4f.jpeg
Save image downloads/yeoRtAs.jpeg
Save image downloads/wsauNEW.jpeg
Save image downloads/IjiETcs.jpeg
