從《Python 網路爬蟲與資料分析入門實戰》第三章的範例中練習爬蟲

從第三章蘋果日報今日熱門爬蟲範例中練習自己寫的程式碼

書中原始程式碼來源：https://github.com/jwlin/web-crawler-tutorial/tree/master/ch3

# 今日熱門新聞：(1)爬取資料 (2)儲存資料

In [1]:
import requests
from bs4 import BeautifulSoup

# Scrape Apple Daily's "today's hot news" page into `news`: a list of dicts
# with the keys 'number', 'title' and 'href', one dict per list entry.
url = 'https://tw.appledaily.com/hot/daily'

# 1. Request the page.
#    requests has no default timeout, so a stalled server would block this
#    cell forever; 10 seconds is an arbitrary but generous limit.
resp = requests.get(url, timeout=10)
#    Fail on 4xx/5xx here instead of parsing an error page as if it were news.
resp.raise_for_status()

# 2. Parse the HTML source.
soup = BeautifulSoup(resp.text, 'html5lib')

# 3. Locate the rough position of the data: every <li> inside the <ul> whose
#    CSS class is "all". find() returns None when nothing matches, so check
#    explicitly rather than crash with an AttributeError on None.
hot_list = soup.find('ul', 'all')
if hot_list is None:
    raise RuntimeError("no <ul class='all'> element found at " + url)
apple = hot_list.find_all('li')

# 4. Empty list that collects the scraped records.
news = []

# 5. Extract the wanted fields from each list entry.
for a in apple:
    # 5.1 The ranking number lives in <div class="aht_title_num">.
    number_div = a.find('div', 'aht_title_num')
    # 5.2 The headline lives in <div class="aht_title">; the <a> inside it
    #     carries the article URL. Look the div up once and reuse it.
    title_div = a.find('div', 'aht_title')
    link = title_div.a if title_div is not None else None
    href = link.get('href') if link is not None else None
    # 5.3 Skip entries that do not have the expected structure instead of
    #     aborting the whole scrape on one malformed <li>.
    if number_div is None or href is None:
        continue
    # 5.4 Build the record and add it to the result list.
    new = {
        'number': number_div.text,
        'title': title_div.text,
        'href': href,
    }
    news.append(new)

# 6. Print the result once, after the loop has finished.
print(news)

[{'number': '01', 'title': '人渣文本專欄：國民黨射得出核彈嗎？（周偉航）', 'href': 'https://tw.news.appledaily.com/headline/daily/20191209/38516990'}, {'number': '02', 'title': '《蘋果》最新民調明刊登 韓靠網軍案反攻嗎', 'href': 'https://tw.news.appledaily.com/headline/daily/20191209/38517357'}, {'number': '03', 'title': '陳玉珍被夾手指 自己人下重手 影片還原 驚見黃昭順「塞門縫 加壓」', 'href': 'https://tw.news.appledaily.com/headline/daily/20191209/38517330'}, {'number': '04', 'title': '【選戰袐辛】葉元之玻璃心碎了 討拍佳芬姊呼呼', 'href': 'https://tw.news.appledaily.com/headline/daily/20191209/38517485'}, {'number': '05', 'title': '夾手指片一夜狂看30遍 卓榮泰：我無法入睡一直笑', 'href': 'https://tw.news.appledaily.com/headline/daily/20191209/38517334'}, {'number': '06', 'title': '認愛助理10餘年 「感情非人生重點」', 'href': 'https://tw.news.appledaily.com/headline/daily/20191209/38517376'}, {'number': '07', 'title': '汪用和黃光芹求子失敗 領養之愛超越血緣', 'href': 'https://tw.news.appledaily.com/headline/daily/20191209/38517509'}, {'number': '08', 'title': '為人權日而走 80萬港人 向世界展現堅持', 'href': 'https://tw.news.appledaily.com/headline/

In [2]:
import pandas as pd
import numpy as np  

# 6. Process the scraped records and save them.
# 6.1 / 6.2 Convert the list of dicts to a DataFrame and replace the default
#           integer index with the 'number' column.
test = pd.DataFrame(data=news).set_index('number')
# 6.3 Print the resulting table.
print(test)
# 6.4 Save as a CSV file. The 'utf_8_sig' codec writes UTF-8 with a leading
#     BOM (presumably so spreadsheet programs detect the encoding of the
#     Chinese text -- confirm against the intended consumer).
test.to_csv('news.csv', encoding='utf_8_sig')

                                                     href  \
number                                                      
01      https://tw.news.appledaily.com/headline/daily/...   
02      https://tw.news.appledaily.com/headline/daily/...   
03      https://tw.news.appledaily.com/headline/daily/...   
04      https://tw.news.appledaily.com/headline/daily/...   
05      https://tw.news.appledaily.com/headline/daily/...   
06      https://tw.news.appledaily.com/headline/daily/...   
07      https://tw.news.appledaily.com/headline/daily/...   
08      https://tw.news.appledaily.com/headline/daily/...   
09      https://tw.news.appledaily.com/headline/daily/...   
10      https://tw.news.appledaily.com/headline/daily/...   
11      https://tw.news.appledaily.com/headline/daily/...   
12      https://tw.news.appledaily.com/headline/daily/...   
13      https://tw.news.appledaily.com/headline/daily/...   
14      https://tw.news.appledaily.com/headline/daily/...   
15      https://tw.news.

# 無註解程式碼

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np  


# Compact version: scrape the hot-news list, print it, and save it to news.csv.
url = 'https://tw.appledaily.com/hot/daily'

# A timeout keeps a stalled server from blocking forever; raise_for_status()
# stops an HTTP error page from being parsed as news.
resp = requests.get(url, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html5lib')

# find() returns None when the container is missing; report that clearly.
hot_list = soup.find('ul', 'all')
if hot_list is None:
    raise RuntimeError("no <ul class='all'> element found at " + url)
apple = hot_list.find_all('li')
news = []

for a in apple:
    number_div = a.find('div', 'aht_title_num')
    title_div = a.find('div', 'aht_title')
    link = title_div.a if title_div is not None else None
    href = link.get('href') if link is not None else None
    # Skip entries that lack the expected number/title/link structure.
    if number_div is None or href is None:
        continue
    new = {
        'number': number_div.text,
        'title': title_div.text,
        'href': href,
    }
    news.append(new)

print(news)

# An empty DataFrame has no 'number' column, so set_index() would fail with
# an unhelpful KeyError; raise a descriptive error instead.
if not news:
    raise RuntimeError("no news entries could be parsed from " + url)

test = pd.DataFrame(data=news)
test.set_index('number', inplace=True)
print(test)
# 'utf_8_sig' writes UTF-8 with a leading BOM.
test.to_csv('news.csv', encoding='utf_8_sig')

# urllib套件版本的爬蟲

In [None]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np  


# urllib version: scrape the hot-news list, print it, and save it to news.csv.
url = 'https://tw.appledaily.com/hot/daily'

# Open the page inside a `with` block so the HTTP response is always closed,
# and pass a timeout so a stalled server cannot block forever.
# `resp` holds the raw bytes of the page.
with urllib.request.urlopen(url, timeout=10) as page:
    resp = page.read()
soup = BeautifulSoup(resp, 'html5lib')

# Locate every <li> inside the <ul> whose CSS class is "all". find() returns
# None when nothing matches, so check explicitly rather than crash with an
# AttributeError on None.
hot_list = soup.find('ul', 'all')
if hot_list is None:
    raise RuntimeError("no <ul class='all'> element found at " + url)
apple = hot_list.find_all('li')
news = []

for a in apple:
    # Ranking number: <div class="aht_title_num">.
    number_div = a.find('div', 'aht_title_num')
    # Headline: <div class="aht_title">; its <a> child carries the URL.
    title_div = a.find('div', 'aht_title')
    link = title_div.a if title_div is not None else None
    href = link.get('href') if link is not None else None
    # Skip entries that lack the expected number/title/link structure.
    if number_div is None or href is None:
        continue
    new = {
        'number': number_div.text,
        'title': title_div.text,
        'href': href,
    }
    news.append(new)

print(news)

# An empty DataFrame has no 'number' column, so set_index() would fail with
# an unhelpful KeyError; raise a descriptive error instead.
if not news:
    raise RuntimeError("no news entries could be parsed from " + url)

# Convert to a DataFrame indexed by the ranking number, print it, and save it.
test = pd.DataFrame(data=news)
test.set_index('number', inplace=True)
print(test)
# 'utf_8_sig' writes UTF-8 with a leading BOM.
test.to_csv('news.csv', encoding='utf_8_sig')