taitungcc councilors crawler, data, parser
thewayiam committed Nov 19, 2014
1 parent aa9d2cf commit 744885e
Showing 16 changed files with 1,976 additions and 203 deletions.
12 changes: 6 additions & 6 deletions crawler/README.md
@@ -38,17 +38,17 @@ About xpath, recommend this [xpath tutorial](http://www.zvon.org/comp/r/tut-XPat

### 東部

-| 縣市 | 代號 | 現任議員 | 歷屆議員 | 議案 | 議事錄 |
-|--------|------|----------|----------|------|--------|
-| 宜蘭縣 | ilcc | | X | | |
-| 花蓮縣 | hlcc | | X | X | X |
-| 台東縣 | ttcc | | | X | X |
+| 縣市 | 代號 | 現任議員 | 歷屆議員 | 議案 | 議事錄 |
+|--------|-----------|----------|----------|------|--------|
+| 宜蘭縣 | ilcc | | X | | |
+| 花蓮縣 | hlcc | | X | X | X |
+| 台東縣 | taitungcc | | X | X | X |

### 南部

| 縣市 | 代號 | 現任議員 | 歷屆議員 | 議案 | 議事錄 |
|--------|------|----------|----------|------|--------|
-| 台南市 | | | | | |
+| 台南市 | tncc | | X | | X |
| 高雄市 | kcc | | | | |
| 屏東縣 | ptcc | | X | X | X |

11 changes: 11 additions & 0 deletions crawler/taitungcc/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = taitungcc.settings

[deploy]
#url = http://localhost:6800/
project = taitungcc
Empty file.
69 changes: 69 additions & 0 deletions crawler/taitungcc/taitungcc/items.py
@@ -0,0 +1,69 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class Councilor(Item):
    name = Field()
    stage = Field()
    election_year = Field()
    title = Field()
    gender = Field()
    birth = Field()
    party = Field()
    constituency = Field()
    county = Field()
    district = Field()
    contact_details = Field()
    education = Field()
    experience = Field()
    platform = Field()
    remark = Field()
    in_office = Field()
    term_end = Field()
    term_start = Field()
    image = Field()
    links = Field()
    uid = Field()

class Bills(Item):
    election_year = Field()
    county = Field()
    id = Field()
    type = Field()
    sitting = Field()
    category = Field()
    abstract = Field()
    description = Field()
    methods = Field()
    last_action = Field()
    proposed_by = Field()
    brought_by = Field()
    related_units = Field()
    petitioned_by = Field()
    motions = Field()
    committee = Field()
    resolusion_date = Field()
    resolusion_sitting = Field()
    resolusion = Field()
    bill_no = Field()
    intray_date = Field()
    intray_no = Field()
    receipt_date = Field()
    examination_date = Field()
    examination = Field()
    dispatch_no = Field()
    dispatch_date = Field()
    execution = Field()
    remark = Field()
    links = Field()

class MeetingMinutes(Item):
    county = Field()
    sitting = Field()
    category = Field()
    date = Field()
    meeting = Field()
    download_url = Field()
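Scrapy items behave like fixed-schema dicts, so a record is built field by field. A minimal sketch of constructing a Councilor (the values are illustrative placeholders, not crawled data):

```python
from taitungcc.items import Councilor

item = Councilor()
item['county'] = u'臺東縣'
item['election_year'] = '2009'
item['in_office'] = True
# Only declared fields are accepted: assigning to a misspelled
# key such as item['countey'] raises KeyError immediately.
print(dict(item))
```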
8 changes: 8 additions & 0 deletions crawler/taitungcc/taitungcc/pipelines.py
@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class TaitungccPipeline(object):
    def process_item(self, item, spider):
        return item
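The committed pipeline is a pass-through stub. For illustration only, a hedged sketch of a validating variant (hypothetical, not part of this commit) that drops records with no name:

```python
from scrapy.exceptions import DropItem

class ValidatingPipeline(object):
    def process_item(self, item, spider):
        # Hypothetical rule: a councilor record without a name is unusable downstream
        if not item.get('name'):
            raise DropItem('missing name in %r' % item)
        return item
```

To take effect it would also need an entry such as `ITEM_PIPELINES = {'taitungcc.pipelines.ValidatingPipeline': 300}` in settings.py.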
28 changes: 28 additions & 0 deletions crawler/taitungcc/taitungcc/settings.py
@@ -0,0 +1,28 @@
# Scrapy settings for taitungcc project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
import os
import sys
from os.path import dirname


# add python path for crawler_lib
_PROJECT_PATH = dirname(dirname(dirname(dirname(__file__))))
sys.path.append(os.path.join(_PROJECT_PATH, 'crawler'))

BOT_NAME = 'taitungcc'

SPIDER_MODULES = ['taitungcc.spiders']
NEWSPIDER_MODULE = 'taitungcc.spiders'
LOG_FILE = 'log.txt'

FEED_EXPORTERS = {
    'json': 'crawler_lib.misc.UnicodeJsonItemExporter',
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'taitung (+http://www.yourdomain.com)'
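crawler_lib.misc is outside this diff, so the definition of UnicodeJsonItemExporter is not shown. The usual recipe for such an exporter in Scrapy of this era, assumed here rather than taken from the repository, is a JsonItemExporter subclass that turns off ASCII escaping so the Chinese field values stay readable in the feed:

```python
# Hypothetical reconstruction of crawler_lib/misc.py (not part of this commit)
from scrapy.contrib.exporter import JsonItemExporter

class UnicodeJsonItemExporter(JsonItemExporter):
    def __init__(self, file, **kwargs):
        # ensure_ascii=False is forwarded to the underlying JSON encoder,
        # so u'臺東縣' is written verbatim instead of as \uXXXX escapes
        super(UnicodeJsonItemExporter, self).__init__(file, ensure_ascii=False, **kwargs)
```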
4 changes: 4 additions & 0 deletions crawler/taitungcc/taitungcc/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
16 changes: 16 additions & 0 deletions crawler/taitungcc/taitungcc/spiders/constituency.json
@@ -0,0 +1,16 @@
{
    "第1選區": "台東市、綠島鄉、蘭嶼鄉",
    "第2選區": "卑南鄉、延平鄉",
    "第3選區": "成功鎮、東河鄉、長濱鄉",
    "第4選區": "關山鎮、鹿野鄉、池上鄉、海端鄉",
    "第5選區": "大武鄉、達仁鄉、金峰鄉、太麻里鄉",
    "第6選區": "平地原住民(臺東市)",
    "第7選區": "平地原住民(大武鄉、達仁鄉、金峰鄉、太麻里鄉、蘭嶼鄉)",
    "第8選區": "平地原住民(關山鎮、池上鄉、鹿野鄉、海端鄉、延平鄉)",
    "第9選區": "平地原住民(成功鎮、東河鄉、長濱鄉)",
    "第10選區": "山地原住民(成功鎮、卑南鄉、延平鄉、東河鄉、長濱鄉)",
    "第11選區": "山地原住民(關山鎮、池上鄉、鹿野鄉、海端鄉)",
    "第12選區": "山地原住民(金峰鄉、太麻里鄉)",
    "第13選區": "山地原住民(大武鄉、達仁鄉)",
    "第14選區": "山地原住民(臺東市、綠島鄉、蘭嶼鄉)"
}
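These keys are looked up by the spider in the next file, which derives the district number from the profile page URL. A standalone sketch of that lookup, using one of the spider's real start URLs:

```python
# -*- coding: utf-8 -*-
import json
import re

# Profile pages 02ourteam.html .. 15ourteam.html correspond to
# 第1選區 .. 第14選區, hence the -1 offset.
url = 'http://www.taitungcc.gov.tw/ourteam/02ourteam.html'
key = u'第%d選區' % (int(re.search(r'(\d+)ourteam', url).group(1)) - 1)

with open('constituency.json') as fh:
    constituency = json.load(fh)

print(constituency[key])  # 台東市、綠島鄉、蘭嶼鄉
```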
66 changes: 66 additions & 0 deletions crawler/taitungcc/taitungcc/spiders/councilors.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
import re
import os
import json
import urllib
from urlparse import urljoin
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.selector import Selector
from taitungcc.items import Councilor


class Spider(scrapy.Spider):
    name = "councilors"
    allowed_domains = ["www.taitungcc.gov.tw"]
    # profile pages 02ourteam.html .. 15ourteam.html, one per electoral district
    start_urls = ["http://www.taitungcc.gov.tw/ourteam/%02dourteam.html" % x for x in range(2, 16)]
    download_delay = 0.5

    def __init__(self):
        fh = open(os.path.join(os.path.dirname(__file__), 'constituency.json'), 'r')
        self.constituency = json.loads(fh.read())

    def parse(self, response):
        for node in response.xpath('//td[@class="ourteam01"]/a'):
            item = Councilor()
            item['name'] = node.xpath('text()').extract()[0]
            # page number NN maps to 第(NN-1)選區
            item['constituency'] = u'第%d選區' % (int(re.search(r'(\d+)ourteam', response.url).group(1)) - 1)
            item['district'] = self.constituency[item['constituency']]
            yield Request(urljoin(response.url, node.xpath('@href').extract()[0]), callback=self.parse_profile, meta={'item': item})

    def parse_profile(self, response):
        item = response.request.meta['item']
        item['county'] = u'臺東縣'
        # fixed dates for the 2009-2014 council term
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2014-12-25'}
        item['in_office'] = True
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        item['image'] = urljoin(response.url, response.xpath('//td[@background="images/page02_pic02_013.jpg"]/img/@src').extract()[0])
        item['contact_details'] = []
        # label cells are marked by the page05_pic02_002.jpg icon; the value sits in the next td
        for x in response.xpath('//img[contains(@src, "page05_pic02_002.jpg")]'):
            label = ''.join([y.strip() for y in x.xpath('../descendant::text()').extract()])
            value = x.xpath('../following-sibling::td[1]/descendant::text()').extract()
            if value:
                if re.search(u'性別', label):
                    item['gender'] = value[0]
                elif re.search(u'黨籍', label):
                    item['party'] = value[0]
                elif re.search(u'電話', label):
                    item['contact_details'].append({"label": u'電話', "type": "voice", "value": value[0]})
                elif re.search(u'傳真', label):
                    item['contact_details'].append({"label": u'傳真', "type": "fax", "value": value[0]})
                elif re.search(u'地址', label):
                    item['contact_details'].append({"label": u'地址', "type": "address", "value": value[0]})
                elif re.search(u'E-mail', label):
                    item['contact_details'].append({"label": u'E-mail', "type": "email", "value": value[0]})
        for td in response.xpath(u'//td//strong'):
            label = td.xpath('text()').extract()[0].strip()
            value = [line.strip() for line in td.xpath('ancestor::td/text()').extract()]
            if re.search(u'學歷', label):
                item['education'] = value
            elif re.search(u'經歷', label):
                item['experience'] = value
            elif re.search(u'政見', label):
                item['platform'] = value
        return item
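Assuming a standard Scrapy checkout, the spider would be run from crawler/taitungcc/ with `scrapy crawl councilors -o councilors.json -t json`; the `-t json` format name selects the UnicodeJsonItemExporter registered under FEED_EXPORTERS in settings.py, and log output goes to log.txt per the LOG_FILE setting.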
