# PTTer

A crawler for www.ptt.cc (批踢踢實業坊)

In [1]:
import requests
import json
from bs4 import BeautifulSoup as bs

In [2]:
IP_KEYWORD = '來自: '
URL_KEYWORD = '文章網址: '

def get_page(board, id, save_path=None):
    url = 'https://www.ptt.cc/bbs/{}/{}.html'.format(board, id)
    res = requests.get(url)
    text = res.text
    if save_path is not None:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(text)
    soup = bs(text, 'html.parser')
    return soup

def parse_page(page, keep_main_content=False, save_path=None):
    post = {}
    post['url'] = page.find(lambda tag: tag.name == 'span' and URL_KEYWORD in tag.text).find('a').text
    _, post['article_id'] = post['url'].rsplit('.', 1)[0].rsplit('/', 2)[1:]
    for metaline_div in page.find_all('div', {'class': 'article-metaline'}):
        if metaline_div.find('span', {'class': 'article-meta-tag'}).text == '作者':
            post['author'] = metaline_div.find('span', {'class': 'article-meta-value'}).text
        if metaline_div.find('span', {'class': 'article-meta-tag'}).text == '看板':
            post['board'] = metaline_div.find('span', {'class': 'article-meta-value'}).text
        if metaline_div.find('span', {'class': 'article-meta-tag'}).text == '標題':
            post['article_title'] = metaline_div.find('span', {'class': 'article-meta-value'}).text
        if metaline_div.find('span', {'class': 'article-meta-tag'}).text == '時間':
            post['datetime'] = metaline_div.find('span', {'class': 'article-meta-value'}).text
    post['ip'] = page.find(lambda tag: tag.name == 'span' and IP_KEYWORD in tag.text).text.split('來自: ')[1].strip()
    post['main_content'] = page.find('div', {'id': 'main-container'}).text.split('\n')
    if not keep_main_content:
        post['content'] = []
        for line in post['main_content'][2:]:
            if IP_KEYWORD in line:
                break
            post['content'].append(line)
        post['pushes'] = []
        for push_div in page.find_all('div', {'class': 'push'}):
            push = {
                'push_tag': push_div.find('span', {'class': 'push-tag'}).text.strip(), 
                'push_userid': push_div.find('span', {'class': 'push-userid'}).text.strip(), 
                'push_content': push_div.find('span', {'class': 'push-content'}).text[1:].strip(), 
                'push_ipdatetime': push_div.find('span', {'class': 'push-ipdatetime'}).text.strip()
            }
            post['pushes'].append(push)
        del post['main_content']
    if save_path is not None:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(post))
    return post

In [3]:
board = 'Beauty'
id = 'M.1527694127.A.D6E'

page = get_page(board, id, save_path=None)
post = parse_page(page, keep_main_content=False, save_path=None)

In [4]:
page

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1" name="viewport">
<title>[正妹] 新竹女中 回眸 - 看板 Beauty - 批踢踢實業坊</title>
<meta content="all" name="robots">
<meta content="Ptt BBS 批踢踢" name="keywords">
<meta content="新竹女中 學生自己創作的畢業歌《回眸》
https://youtu.be/2_zxqFmE9hE
其中MV裡面的高中妹妹，顏值顏質看起來還不錯
就稍微找了一下
1號 沒繡學號的這位 是我最喜歡的類型
" name="description">
<meta content="Ptt 批踢踢實業坊" property="og:site_name">
<meta content="[正妹] 新竹女中 回眸" property="og:title">
<meta content="新竹女中 學生自己創作的畢業歌《回眸》
https://youtu.be/2_zxqFmE9hE
其中MV裡面的高中妹妹，顏值顏質看起來還不錯
就稍微找了一下
1號 沒繡學號的這位 是我最喜歡的類型
" property="og:description">
<link href="https://www.ptt.cc/bbs/Beauty/M.1527694127.A.D6E.html" rel="canonical">
<link href="//images.ptt.cc/bbs/v2.25/bbs-common.css" rel="stylesheet" type="text/css">
<link href="//images.ptt.cc/bbs/v2.25/bbs-base.css" media="screen" rel="stylesheet" type="text/css">
<link href="//images.ptt.cc/bbs/v2.25/bbs-custom.css" rel="stylesheet" type="text/css">
<lin

In [5]:
post

{'article_id': 'M.1527694127.A.D6E',
 'article_title': '[正妹] 新竹女中 回眸',
 'author': 'james7923 (詹姆士Q)',
 'content': ['新竹女中 學生自己創作的畢業歌《回眸》',
  'https://youtu.be/2_zxqFmE9hE',
  '',
  '其中MV裡面的高中妹妹，顏值顏質看起來還不錯',
  '',
  '就稍微找了一下',
  '',
  '1號 沒繡學號的這位 是我最喜歡的類型',
  '[左一]',
  'https://i.imgur.com/SbRLjHJ.jpg',
  '[左一]',
  'https://i.imgur.com/xeT3dpg.jpg',
  '[右邊]',
  'https://i.imgur.com/tHlCNQ3.jpg',
  'https://i.imgur.com/bExaTkO.jpg',
  'https://i.imgur.com/zXtIa0C.jpg',
  'https://i.imgur.com/SsISpLn.jpg',
  'https://i.imgur.com/DlNBX25.jpg',
  'https://i.imgur.com/QqTe7fp.jpg',
  'https://i.imgur.com/xOrt8eT.jpg',
  'https://i.imgur.com/xyylS5l.jpg',
  'https://i.imgur.com/s2uSQNL.jpg',
  '',
  '2號',
  'https://i.imgur.com/TgocRlr.jpg',
  'https://i.imgur.com/f5kqQYh.jpg',
  'https://i.imgur.com/AJoNV29.jpg',
  'https://i.imgur.com/pH7LBRv.jpg',
  'https://i.imgur.com/T2cAlBt.jpg',
  'https://i.imgur.com/cGbfstR.jpg',
  'https://i.imgur.com/m024ZnR.jpg',
  'https://i.imgur.com/nlh26M1.jpg