Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
taitungcc councilors crawler, data, parser
- Loading branch information
Showing
16 changed files
with
1,976 additions
and
203 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Automatically created by: scrapy startproject | ||
# | ||
# For more information about the [deploy] section see: | ||
# http://doc.scrapy.org/en/latest/topics/scrapyd.html | ||
|
||
[settings] | ||
default = taitungcc.settings | ||
|
||
[deploy] | ||
#url = http://localhost:6800/ | ||
project = taitungcc |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Define here the models for your scraped items | ||
# | ||
# See documentation in: | ||
# http://doc.scrapy.org/en/latest/topics/items.html | ||
|
||
from scrapy.item import Item, Field | ||
|
||
class Councilor(Item):
    """Scraped record for a single councilor.

    Only the field names are declared here; the values are filled in by the
    spiders. The grouping below is for readability only.
    """

    # Identity
    uid = Field()
    name = Field()
    gender = Field()
    birth = Field()
    party = Field()
    title = Field()
    image = Field()

    # Election / seat
    county = Field()
    constituency = Field()
    district = Field()
    election_year = Field()
    stage = Field()

    # Term of office
    term_start = Field()
    term_end = Field()
    in_office = Field()

    # Profile details
    contact_details = Field()
    education = Field()
    experience = Field()
    platform = Field()
    links = Field()
    remark = Field()
|
||
class Bills(Item):
    """Scraped record for a single council bill.

    Only the field names are declared here. The grouping below is for
    readability only.

    NOTE(review): the ``resolusion*`` names look like a misspelling of
    "resolution"; they are kept as-is because renaming would change the item
    keys that consumers read.
    """

    # Identification
    id = Field()
    bill_no = Field()
    type = Field()
    category = Field()
    county = Field()
    election_year = Field()
    sitting = Field()

    # Content
    abstract = Field()
    description = Field()
    methods = Field()
    remark = Field()
    links = Field()

    # People and units involved
    proposed_by = Field()
    brought_by = Field()
    petitioned_by = Field()
    related_units = Field()
    committee = Field()

    # Progress
    motions = Field()
    last_action = Field()
    execution = Field()

    # Resolution
    resolusion = Field()
    resolusion_date = Field()
    resolusion_sitting = Field()

    # Paperwork: intray / receipt / examination / dispatch
    intray_no = Field()
    intray_date = Field()
    receipt_date = Field()
    examination = Field()
    examination_date = Field()
    dispatch_no = Field()
    dispatch_date = Field()
|
||
class MeetingMinutes(Item):
    """Scraped record for one meeting-minutes document.

    Only the field names are declared here; values are supplied by the
    spiders.
    """

    # Which meeting the minutes belong to
    county = Field()
    sitting = Field()
    category = Field()
    meeting = Field()
    date = Field()

    # Where the document can be fetched from
    download_url = Field()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Define your item pipelines here | ||
# | ||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting | ||
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html | ||
|
||
class TaitungccPipeline(object):
    """Pass-through item pipeline: every item is handed back unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not used."""
        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Scrapy settings for taitungcc project | ||
# | ||
# For simplicity, this file contains only the most important settings by | ||
# default. All the other settings are documented here: | ||
# | ||
# http://doc.scrapy.org/en/latest/topics/settings.html | ||
# | ||
import os | ||
import sys | ||
from os.path import dirname | ||
|
||
|
||
# Add the shared ``crawler`` directory to the import path so that the
# ``crawler_lib`` package referenced by FEED_EXPORTERS below can be imported.
# The path is anchored with abspath() first: when ``__file__`` is relative,
# repeated dirname() calls collapse to '' and 'crawler' would then be resolved
# against the current working directory instead of the project root.
_PROJECT_PATH = dirname(dirname(dirname(dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.join(_PROJECT_PATH, 'crawler'))

BOT_NAME = 'taitungcc'

SPIDER_MODULES = ['taitungcc.spiders']
NEWSPIDER_MODULE = 'taitungcc.spiders'
LOG_FILE = 'log.txt'

# Use the project's own exporter class for the ``json`` feed format.
FEED_EXPORTERS = {
    'json': 'crawler_lib.misc.UnicodeJsonItemExporter',
}
|
||
# Crawl responsibly by identifying yourself (and your website) on the user-agent | ||
#USER_AGENT = 'taitung (+http://www.yourdomain.com)' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# This package will contain the spiders of your Scrapy project | ||
# | ||
# Please refer to the documentation for information on how to create and manage | ||
# your spiders. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{ | ||
"第1選區": "台東市、綠島鄉、蘭嶼鄉", | ||
"第2選區": "卑南鄉、延平鄉", | ||
"第3選區": "成功鎮、東河鄉、長濱鄉", | ||
"第4選區": "關山鎮、鹿野鄉、池上鄉、海端鄉", | ||
"第5選區": "大武鄉、達仁鄉、金峰鄉、太麻里鄉", | ||
"第6選區": "平地原住民(臺東市)", | ||
"第7選區": "平地原住民(大武鄉、達仁鄉、金峰鄉、太麻里鄉、蘭嶼鄉)", | ||
"第8選區": "平地原住民(關山鎮、池上鄉、鹿野鄉、海端鄉、延平鄉)", | ||
"第9選區": "平地原住民(成功鎮、東河鄉、長濱鄉)", | ||
"第10選區": "山地原住民(成功鎮、卑南鄉、延平鄉、東河鄉、長濱鄉)", | ||
"第11選區": "山地原住民(關山鎮、池上鄉、鹿野鄉、海端鄉)", | ||
"第12選區": "山地原住民(金峰鄉、太麻里鄉)", | ||
"第13選區": "山地原住民(大武鄉、達仁鄉)", | ||
"第14選區": "山地原住民(臺東市、綠島鄉、蘭嶼鄉)" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# -*- coding: utf-8 -*- | ||
import re | ||
import os | ||
import json | ||
import urllib | ||
from urlparse import urljoin | ||
import scrapy | ||
from scrapy.http import Request, FormRequest | ||
from scrapy.selector import Selector | ||
from taitungcc.items import Councilor | ||
|
||
|
||
class Spider(scrapy.Spider):
    """Crawl councilor profiles from www.taitungcc.gov.tw.

    ``parse`` reads each constituency listing page and schedules one request
    per councilor link; ``parse_profile`` completes the ``Councilor`` item
    from the profile page.
    """

    name = "councilors"
    allowed_domains = ["www.taitungcc.gov.tw"]
    # Listing pages are numbered 02..15; page N maps to constituency N-1.
    start_urls = ["http://www.taitungcc.gov.tw/ourteam/%02dourteam.html" % x for x in range(2, 16)]
    download_delay = 0.5

    # (keyword looked for in a row label, item field that receives the value)
    _SIMPLE_FIELDS = (
        (u'性別', 'gender'),
        (u'黨籍', 'party'),
    )
    # (keyword looked for in a row label -- also used as the output label,
    #  contact "type" value)
    _CONTACT_KINDS = (
        (u'電話', "voice"),
        (u'傳真', "fax"),
        (u'地址', "address"),
        (u'E-mail', "email"),
    )
    # (keyword looked for in a <strong> heading, item field for the text lines)
    _SECTION_FIELDS = (
        (u'學歷', 'education'),
        (u'經歷', 'experience'),
        (u'政見', 'platform'),
    )

    def __init__(self, *args, **kwargs):
        """Load the constituency -> district mapping stored next to this file.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        that spider arguments (``-a key=value``) keep working.
        """
        super(Spider, self).__init__(*args, **kwargs)
        mapping_path = os.path.join(os.path.dirname(__file__), 'constituency.json')
        # The context manager closes the file handle once the JSON is parsed.
        with open(mapping_path, 'r') as fh:
            self.constituency = json.load(fh)

    def parse(self, response):
        """Yield one profile request per councilor link on a listing page.

        The constituency is derived from the page number in the URL and the
        district is looked up in ``self.constituency``. Raises ``KeyError`` if
        the derived constituency is missing from the mapping.
        """
        page = re.search(r'(\d+)ourteam', response.url)
        if page is None:
            # Without the page number the constituency cannot be derived.
            self.log('unexpected listing URL, skipped: %s' % response.url)
            return
        constituency = u'第%d選區' % (int(page.group(1)) - 1)
        district = self.constituency[constituency]
        for node in response.xpath('//td[@class="ourteam01"]/a'):
            name = node.xpath('text()').extract()
            href = node.xpath('@href').extract()
            if not name or not href:
                # A malformed anchor must not abort the rest of the page.
                continue
            item = Councilor()
            item['name'] = name[0]
            item['constituency'] = constituency
            item['district'] = district
            yield Request(urljoin(response.url, href[0]), callback=self.parse_profile, meta={'item': item})

    def parse_profile(self, response):
        """Fill the item passed via ``meta['item']`` from a profile page.

        Returns the completed ``Councilor`` item. Fields whose markup is not
        found on the page are left unset.
        """
        item = response.request.meta['item']
        item['county'] = u'臺東縣'
        item['election_year'] = '2009'
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': "2014-12-25"}
        item['in_office'] = True
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        image = response.xpath('//td[@background="images/page02_pic02_013.jpg"]/img/@src').extract()
        if image:
            # A missing portrait must not discard the rest of the profile.
            item['image'] = urljoin(response.url, image[0])
        item['contact_details'] = []

        # Labelled rows: the cell holding this image carries the label text
        # and the next sibling cell carries the value.
        for marker in response.xpath('//img[contains(@src, "page05_pic02_002.jpg")]'):
            label = ''.join(text.strip() for text in marker.xpath('../descendant::text()').extract())
            value = marker.xpath('../following-sibling::td[1]/descendant::text()').extract()
            if value:
                self._store_labelled_value(item, label, value[0])

        # Free-text sections: a <strong> heading followed by text lines that
        # are direct children of the enclosing cell(s).
        for heading in response.xpath(u'//td//strong'):
            heading_text = heading.xpath('text()').extract()
            if not heading_text:
                continue
            label = heading_text[0].strip()
            lines = [line.strip() for line in heading.xpath('ancestor::td/text()').extract()]
            for keyword, field in self._SECTION_FIELDS:
                if keyword in label:
                    item[field] = lines
                    break
        return item

    def _store_labelled_value(self, item, label, value):
        """Store ``value`` on ``item`` according to the first keyword in ``label``."""
        for keyword, field in self._SIMPLE_FIELDS:
            if keyword in label:
                item[field] = value
                return
        for keyword, kind in self._CONTACT_KINDS:
            if keyword in label:
                item['contact_details'].append({"label": keyword, "type": kind, "value": value})
                return
Oops, something went wrong.