# 抓取 Dcard 資料

### 載入套件

In [1]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

### 取得 Dcard 網頁內容

In [2]:
URL = 'https://www.dcard.tw/f'
resp = requests.get(URL)
soup = BeautifulSoup(resp.text, 'html.parser')
# soup
str(soup)[:3000] # 把結果放到 github 會變太長，所以只顯示前面的部分

'<!DOCTYPE html>\n<html lang="zh-Hant-TW"><head itemscope="" itemtype="https://schema.org/WebSite" prefix="og: http://ogp.me/ns#"><title data-react-helmet="true">Dcard</title><meta content="https://www.dcard.tw/build/landing-c9e7b8fb.png" data-react-helmet="true" property="og:image"/><meta content="https://www.dcard.tw/build/landing-c9e7b8fb.png" data-react-helmet="true" property="og:image:secure_url"/><meta charset="utf-8" data-react-helmet="true"/><meta content="IE=edge" data-react-helmet="true" http-equiv="X-UA-Compatible"/><meta content="Dcard" data-react-helmet="true" name="application-name"/><meta content="app-id=951353454" data-react-helmet="true" name="apple-itunes-app"/><meta content="#006aa6" data-react-helmet="true" name="theme-color"/><meta content="yes" data-react-helmet="true" name="mobile-web-app-capable"/><meta content="yes" data-react-helmet="true" name="apple-mobile-web-app-capable"/><meta content="211628828926493" data-react-helmet="true" property="fb:app_id"/><meta 

### 找出熱門文章

In [4]:
articles = []
# 利用 regex 找出所有貼文
for item in soup.find_all('div', re.compile('PostList_entry')):
    articles.append({
        'title': item.h3.text,
        'excerpt': item.find_all('div', re.compile('PostEntry_(excerpt|reply)'))[0].text,
        'like_count': item.find_all('div', re.compile('PostEntry__LikeCount'))[0].text,
        'comments': re.findall(r'\d+', item.find_all('span', re.compile('PostEntry_comments'))[0].text)[0],
        'link': item.find_all('a', re.compile('PostEntry_root'))[0]['href']
    })

df = pd.DataFrame(articles, columns=['title', 'excerpt', 'like_count', 'comments', 'link'])  # 使用 columns 調整排列順序
print('共 %d 篇' % (len(df)))
df[:5] # 印出前 5 筆

共 30 篇


Unnamed: 0,title,excerpt,like_count,comments,link
0,幫高調！急！！！,昨天用機車載了我跟妹妹去上學之後，回家換了汽車不知道去哪裡，手機也沒帶，公司也打來家裡說他沒...,25825,258,/f/relationship/p/231283905-幫高調！急！！！
1,有人把韓國瑜海綿寶寶做出來,漫威要求創作韓國瑜的故事,8597,225,/f/trending/p/231286770-有人把韓國瑜海綿寶寶做出來
2,要怎麼陪爸爸考英文,上大學之後就很少回家了，就和大部分同學一樣，大概幾個禮拜回家那種，有時候回去，才發現父母的白...,6940,101,/f/exam/p/231285879-要怎麼陪爸爸考英文
3,會幫女友吹頭髮的男友最帥了,大多女生都不太愛吹頭髮，我也算在這之中，尤其是長頭髮真的要命，之前和前任在一起時，他是會幫我...,5811,209,/f/relationship/p/231283276-會幫女友吹頭髮的男友最帥了
4,如果中午打開便當長這樣....,腳丫也來一點好了～恩....嚇到吃腳腳，這是嘔吐物的概念ＸＤ，來源：只能說太有才啦，但拜託我...,4212,73,/f/food/p/231285581-如果中午打開便當長這樣....


# 使用 mLab 雲端資料庫

### 載入套件

In [43]:
import pandas as pd
import urllib.parse
import pymongo
from datetime import datetime
from pymongo import MongoClient
from bson.objectid import ObjectId

### 設置基本資料

In [8]:
host = 'ds263089.mlab.com'
port = '63089'
username = urllib.parse.quote_plus('fergus')
password = urllib.parse.quote_plus('abc123')
Authdb = 'dcard'

### 與資料庫連線

In [26]:
client = MongoClient('mongodb://%s:%s@%s:%s/%s?authMechanism=SCRAM-SHA-1'
                      % (username, password, host, port, Authdb))

db = client[Authdb]

### 創建 collection

In [27]:
collection_name = 'posts'
coll = db[collection_name]
# 查看是否連線成功
coll.stats

Collection(Database(MongoClient(host=['ds263089.mlab.com:63089'], document_class=dict, tz_aware=False, connect=True, authmechanism='SCRAM-SHA-1'), 'dcard'), 'posts.stats')

In [29]:
#取出所有doc
print(coll.find())

# 轉成list
print(list(coll.find()))

<pymongo.cursor.Cursor object at 0x1142c7908>
[]


### 插入單筆資料

In [49]:
dic = {
    'userid':'01',
    'username':'user1',
    'creattime':datetime.now(),
    'category': 'category1'
}

coll.insert_one(dic)
list(coll.find())  # 查看資料

[{'_id': ObjectId('5cdc269b3fc7f2c0c5786879'),
  'userid': '01',
  'username': 'user1',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 55, 697000),
  'category': 'category1'}]

### 插入多筆資料

In [50]:
dic_list = [
    {
        'userid':'02',
        'username':'user2',
        'creattime':datetime.now(),
        'category': 'category1'
    },
    {
        'userid':'03',
        'username':'user3',
        'creattime':datetime.strptime('2019-05-01 18:26:42', '%Y-%m-%d %H:%M:%S'),
        'category': 'category1'
    },
    {
        'userid':'04',
        'username':'user4',
        'creattime':datetime.now(),
        'category': 'category2'
    },
    {
        'userid':'05',
        'username':'user5',
        'creattime':datetime.now(),
        'category': 'category2'
    },
]

coll.insert_many(dic_list)
list(coll.find())

[{'_id': ObjectId('5cdc269b3fc7f2c0c5786879'),
  'userid': '01',
  'username': 'user1',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 55, 697000),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687a'),
  'userid': '02',
  'username': 'user2',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 860000),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687b'),
  'userid': '03',
  'username': 'user3',
  'creattime': datetime.datetime(2019, 5, 1, 18, 26, 42),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687c'),
  'userid': '04',
  'username': 'user4',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687d'),
  'userid': '05',
  'username': 'user5',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'}]

### 排序

In [23]:
# 預設遞增(ASCENDING)排序 A -> Z
list(coll.find().sort("username"))
# list(coll.find().sort("username",pymongo.ASCENDING))
# list(coll.find().sort("username",pymongo.DESCENDING))

[{'_id': ObjectId('5cdc180e3fc7f2c0c5786862'),
  'userid': '01',
  'username': 'user1',
  'creattime': datetime.datetime(2019, 5, 15, 21, 45, 50, 291000),
  'type': 'type1'},
 {'_id': ObjectId('5cdc18be3fc7f2c0c5786867'),
  'userid': '02',
  'username': 'user2',
  'creattime': datetime.datetime(2019, 5, 15, 21, 48, 46, 167000),
  'type': 'type1'},
 {'_id': ObjectId('5cdc18be3fc7f2c0c5786868'),
  'userid': '03',
  'username': 'user3',
  'creattime': datetime.datetime(2019, 5, 15, 21, 48, 46, 167000),
  'type': 'type1'},
 {'_id': ObjectId('5cdc18be3fc7f2c0c5786869'),
  'userid': '04',
  'username': 'user4',
  'creattime': datetime.datetime(2019, 5, 15, 21, 48, 46, 167000),
  'type': 'type2'},
 {'_id': ObjectId('5cdc18be3fc7f2c0c578686a'),
  'userid': '05',
  'username': 'user5',
  'creattime': datetime.datetime(2019, 5, 15, 21, 48, 46, 167000),
  'type': 'type5'}]

### 依時間區間選取

In [41]:
start = datetime.strptime('2019-05-01 00:00:00',
                           '%Y-%m-%d %H:%M:%S')
end = datetime.strptime('2019-05-01 23:59:59',
                           '%Y-%m-%d %H:%M:%S')
list(coll.find({'creattime': {'$gte': start, '$lt': end}}))

[{'_id': ObjectId('5cdc1f823fc7f2c0c5786876'),
  'userid': '03',
  'username': 'user3',
  'creattime': datetime.datetime(2019, 5, 1, 18, 26, 42),
  'category': 'category1'}]

### 依照 id 選取

In [44]:
list(coll.find({'_id':ObjectId('5cdc1e4c3fc7f2c0c5786870')}))

[{'_id': ObjectId('5cdc1e4c3fc7f2c0c5786870'),
  'userid': '01',
  'username': 'user1',
  'creattime': datetime.datetime(2019, 5, 15, 22, 12, 28, 104000),
  'category': 'category1'}]

### 更新資料(整個覆蓋)

In [51]:
coll.replace_one({'username':'user1'} ,
             {'username':'user6', 'category':'category3'})
list(coll.find())

[{'_id': ObjectId('5cdc269b3fc7f2c0c5786879'),
  'username': 'user6',
  'category': 'category3'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687a'),
  'userid': '02',
  'username': 'user2',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 860000),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687b'),
  'userid': '03',
  'username': 'user3',
  'creattime': datetime.datetime(2019, 5, 1, 18, 26, 42),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687c'),
  'userid': '04',
  'username': 'user4',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687d'),
  'userid': '05',
  'username': 'user5',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'}]

### 更新資料(修改元素)

In [52]:
coll.update_one({"username":"user2"},
            {"$set":{"category":"category4","comments": 123}})
list(coll.find())

[{'_id': ObjectId('5cdc269b3fc7f2c0c5786879'),
  'username': 'user6',
  'category': 'category3'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687b'),
  'userid': '03',
  'username': 'user3',
  'creattime': datetime.datetime(2019, 5, 1, 18, 26, 42),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687c'),
  'userid': '04',
  'username': 'user4',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687d'),
  'userid': '05',
  'username': 'user5',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687a'),
  'userid': '02',
  'username': 'user2',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 860000),
  'category': 'category4',
  'comments': 123}]

### 移除單筆資料

In [54]:
coll.delete_one({'username':'user6'})
list(coll.find())

[{'_id': ObjectId('5cdc269e3fc7f2c0c578687b'),
  'userid': '03',
  'username': 'user3',
  'creattime': datetime.datetime(2019, 5, 1, 18, 26, 42),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687c'),
  'userid': '04',
  'username': 'user4',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687d'),
  'userid': '05',
  'username': 'user5',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 861000),
  'category': 'category2'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687a'),
  'userid': '02',
  'username': 'user2',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 860000),
  'category': 'category4',
  'comments': 123}]

### 移除多筆資料

In [56]:
coll.delete_many({'category':'category2'})
list(coll.find())

[{'_id': ObjectId('5cdc269e3fc7f2c0c578687b'),
  'userid': '03',
  'username': 'user3',
  'creattime': datetime.datetime(2019, 5, 1, 18, 26, 42),
  'category': 'category1'},
 {'_id': ObjectId('5cdc269e3fc7f2c0c578687a'),
  'userid': '02',
  'username': 'user2',
  'creattime': datetime.datetime(2019, 5, 15, 22, 47, 58, 860000),
  'category': 'category4',
  'comments': 123}]

### 移除所有資料

In [57]:
coll.delete_many({})
list(coll.find())

[]