taitungcc councilors crawler, data, parser
thewayiam committed Nov 19, 2014
1 parent aa9d2cf commit 744885e
Showing 16 changed files with 1,976 additions and 203 deletions.
12 changes: 6 additions & 6 deletions crawler/README.md
@@ -38,17 +38,17 @@ About xpath, recommend this [xpath tutorial](http://www.zvon.org/comp/r/tut-XPat

### 東部

-| 縣市 | 代號 | 現任議員 | 歷屆議員 | 議案 | 議事錄 |
-|--------|------|----------|----------|------|--------|
-| 宜蘭縣 | ilcc | | X | | |
-| 花蓮縣 | hlcc | | X | X | X |
-| 台東縣 | ttcc | | | X | X |
+| 縣市 | 代號 | 現任議員 | 歷屆議員 | 議案 | 議事錄 |
+|--------|-----------|----------|----------|------|--------|
+| 宜蘭縣 | ilcc | | X | | |
+| 花蓮縣 | hlcc | | X | X | X |
+| 台東縣 | taitungcc | | X | X | X |

### 南部

| 縣市 | 代號 | 現任議員 | 歷屆議員 | 議案 | 議事錄 |
|--------|------|----------|----------|------|--------|
-| 台南市 | | | | | |
+| 台南市 | tncc | | X | | X |
| 高雄市 | kcc | | | | |
| 屏東縣 | ptcc | | X | X | X |

11 changes: 11 additions & 0 deletions crawler/taitungcc/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = taitungcc.settings

[deploy]
#url = http://localhost:6800/
project = taitungcc
Empty file.
69 changes: 69 additions & 0 deletions crawler/taitungcc/taitungcc/items.py
@@ -0,0 +1,69 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class Councilor(Item):
    name = Field()
    stage = Field()
    election_year = Field()
    title = Field()
    gender = Field()
    birth = Field()
    party = Field()
    constituency = Field()
    county = Field()
    district = Field()
    contact_details = Field()
    education = Field()
    experience = Field()
    platform = Field()
    remark = Field()
    in_office = Field()
    term_end = Field()
    term_start = Field()
    image = Field()
    links = Field()
    uid = Field()

class Bills(Item):
    election_year = Field()
    county = Field()
    id = Field()
    type = Field()
    sitting = Field()
    category = Field()
    abstract = Field()
    description = Field()
    methods = Field()
    last_action = Field()
    proposed_by = Field()
    brought_by = Field()
    related_units = Field()
    petitioned_by = Field()
    motions = Field()
    committee = Field()
    resolusion_date = Field()
    resolusion_sitting = Field()
    resolusion = Field()
    bill_no = Field()
    intray_date = Field()
    intray_no = Field()
    receipt_date = Field()
    examination_date = Field()
    examination = Field()
    dispatch_no = Field()
    dispatch_date = Field()
    execution = Field()
    remark = Field()
    links = Field()

class MeetingMinutes(Item):
    county = Field()
    sitting = Field()
    category = Field()
    date = Field()
    meeting = Field()
    download_url = Field()
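Scrapy items behave like fixed-schema dicts, so a record is built field by field. A minimal sketch of constructing a Councilor (the values are illustrative placeholders, not crawled data):

```python
from taitungcc.items import Councilor

item = Councilor()
item['county'] = u'臺東縣'
item['election_year'] = '2009'
item['in_office'] = True
# Only declared fields are accepted: assigning to a misspelled
# key such as item['countey'] raises KeyError immediately.
print(dict(item))
```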
8 changes: 8 additions & 0 deletions crawler/taitungcc/taitungcc/pipelines.py
@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class TaitungccPipeline(object):
    def process_item(self, item, spider):
        return item
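The committed pipeline is a pass-through stub. For illustration only, a hedged sketch of a validating variant (hypothetical, not part of this commit) that drops records with no name:

```python
from scrapy.exceptions import DropItem

class ValidatingPipeline(object):
    def process_item(self, item, spider):
        # Hypothetical rule: a councilor record without a name is unusable downstream
        if not item.get('name'):
            raise DropItem('missing name in %r' % item)
        return item
```

To take effect it would also need an entry such as `ITEM_PIPELINES = {'taitungcc.pipelines.ValidatingPipeline': 300}` in settings.py.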
28 changes: 28 additions & 0 deletions crawler/taitungcc/taitungcc/settings.py
@@ -0,0 +1,28 @@
# Scrapy settings for taitungcc project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
import os
import sys
from os.path import dirname


# add python path for crawler_lib
_PROJECT_PATH = dirname(dirname(dirname(dirname(__file__))))
sys.path.append(os.path.join(_PROJECT_PATH, 'crawler'))

BOT_NAME = 'taitungcc'

SPIDER_MODULES = ['taitungcc.spiders']
NEWSPIDER_MODULE = 'taitungcc.spiders'
LOG_FILE = 'log.txt'

FEED_EXPORTERS = {
    'json': 'crawler_lib.misc.UnicodeJsonItemExporter',
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'taitung (+http://www.yourdomain.com)'
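crawler_lib.misc is outside this diff, so the definition of UnicodeJsonItemExporter is not shown. The usual recipe for such an exporter in Scrapy of this era, assumed here rather than taken from the repository, is a JsonItemExporter subclass that turns off ASCII escaping so the Chinese field values stay readable in the feed:

```python
# Hypothetical reconstruction of crawler_lib/misc.py (not part of this commit)
from scrapy.contrib.exporter import JsonItemExporter

class UnicodeJsonItemExporter(JsonItemExporter):
    def __init__(self, file, **kwargs):
        # ensure_ascii=False is forwarded to the underlying JSON encoder,
        # so u'臺東縣' is written verbatim instead of as \uXXXX escapes
        super(UnicodeJsonItemExporter, self).__init__(file, ensure_ascii=False, **kwargs)
```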
4 changes: 4 additions & 0 deletions crawler/taitungcc/taitungcc/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
16 changes: 16 additions & 0 deletions crawler/taitungcc/taitungcc/spiders/constituency.json
@@ -0,0 +1,16 @@
{
    "第1選區": "台東市、綠島鄉、蘭嶼鄉",
    "第2選區": "卑南鄉、延平鄉",
    "第3選區": "成功鎮、東河鄉、長濱鄉",
    "第4選區": "關山鎮、鹿野鄉、池上鄉、海端鄉",
    "第5選區": "大武鄉、達仁鄉、金峰鄉、太麻里鄉",
    "第6選區": "平地原住民(臺東市)",
    "第7選區": "平地原住民(大武鄉、達仁鄉、金峰鄉、太麻里鄉、蘭嶼鄉)",
    "第8選區": "平地原住民(關山鎮、池上鄉、鹿野鄉、海端鄉、延平鄉)",
    "第9選區": "平地原住民(成功鎮、東河鄉、長濱鄉)",
    "第10選區": "山地原住民(成功鎮、卑南鄉、延平鄉、東河鄉、長濱鄉)",
    "第11選區": "山地原住民(關山鎮、池上鄉、鹿野鄉、海端鄉)",
    "第12選區": "山地原住民(金峰鄉、太麻里鄉)",
    "第13選區": "山地原住民(大武鄉、達仁鄉)",
    "第14選區": "山地原住民(臺東市、綠島鄉、蘭嶼鄉)"
}
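These keys are looked up by the spider in the next file, which derives the district number from the profile page URL. A standalone sketch of that lookup, using one of the spider's real start URLs:

```python
# -*- coding: utf-8 -*-
import json
import re

# Profile pages 02ourteam.html .. 15ourteam.html correspond to
# 第1選區 .. 第14選區, hence the -1 offset.
url = 'http://www.taitungcc.gov.tw/ourteam/02ourteam.html'
key = u'第%d選區' % (int(re.search(r'(\d+)ourteam', url).group(1)) - 1)

with open('constituency.json') as fh:
    constituency = json.load(fh)

print(constituency[key])  # 台東市、綠島鄉、蘭嶼鄉
```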
66 changes: 66 additions & 0 deletions crawler/taitungcc/taitungcc/spiders/councilors.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
import re
import os
import json
import urllib
from urlparse import urljoin
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.selector import Selector
from taitungcc.items import Councilor


class Spider(scrapy.Spider):
    name = "councilors"
    allowed_domains = ["www.taitungcc.gov.tw"]
    # profile pages 02ourteam.html .. 15ourteam.html, one per electoral district
    start_urls = ["http://www.taitungcc.gov.tw/ourteam/%02dourteam.html" % x for x in range(2, 16)]
    download_delay = 0.5

    def __init__(self):
        fh = open(os.path.join(os.path.dirname(__file__), 'constituency.json'), 'r')
        self.constituency = json.loads(fh.read())

    def parse(self, response):
        for node in response.xpath('//td[@class="ourteam01"]/a'):
            item = Councilor()
            item['name'] = node.xpath('text()').extract()[0]
            # page number NN maps to 第(NN-1)選區
            item['constituency'] = u'第%d選區' % (int(re.search(r'(\d+)ourteam', response.url).group(1)) - 1)
            item['district'] = self.constituency[item['constituency']]
            yield Request(urljoin(response.url, node.xpath('@href').extract()[0]), callback=self.parse_profile, meta={'item': item})

    def parse_profile(self, response):
        item = response.request.meta['item']
        item['county'] = u'臺東縣'
        # fixed dates for the 2009-2014 council term
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2014-12-25'}
        item['in_office'] = True
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        item['image'] = urljoin(response.url, response.xpath('//td[@background="images/page02_pic02_013.jpg"]/img/@src').extract()[0])
        item['contact_details'] = []
        # label cells are marked by the page05_pic02_002.jpg icon; the value sits in the next td
        for x in response.xpath('//img[contains(@src, "page05_pic02_002.jpg")]'):
            label = ''.join([y.strip() for y in x.xpath('../descendant::text()').extract()])
            value = x.xpath('../following-sibling::td[1]/descendant::text()').extract()
            if value:
                if re.search(u'性別', label):
                    item['gender'] = value[0]
                elif re.search(u'黨籍', label):
                    item['party'] = value[0]
                elif re.search(u'電話', label):
                    item['contact_details'].append({"label": u'電話', "type": "voice", "value": value[0]})
                elif re.search(u'傳真', label):
                    item['contact_details'].append({"label": u'傳真', "type": "fax", "value": value[0]})
                elif re.search(u'地址', label):
                    item['contact_details'].append({"label": u'地址', "type": "address", "value": value[0]})
                elif re.search(u'E-mail', label):
                    item['contact_details'].append({"label": u'E-mail', "type": "email", "value": value[0]})
        for td in response.xpath(u'//td//strong'):
            label = td.xpath('text()').extract()[0].strip()
            value = [line.strip() for line in td.xpath('ancestor::td/text()').extract()]
            if re.search(u'學歷', label):
                item['education'] = value
            elif re.search(u'經歷', label):
                item['experience'] = value
            elif re.search(u'政見', label):
                item['platform'] = value
        return item
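Assuming a standard Scrapy checkout, the spider would be run from crawler/taitungcc/ with `scrapy crawl councilors -o councilors.json -t json`; the `-t json` format name selects the UnicodeJsonItemExporter registered under FEED_EXPORTERS in settings.py, and log output goes to log.txt per the LOG_FILE setting.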
