Skip to content

Commit

Permalink
Merge pull request #3 from everypolitician-scrapers/japanese-pages
Browse files Browse the repository at this point in the history
Also scrape Japanese data, and merge it with English data
  • Loading branch information
tmtmtmtm committed Jan 5, 2017
2 parents 55c11bd + 1edfc8e commit 6ac9ae7
Showing 1 changed file with 84 additions and 8 deletions.
92 changes: 84 additions & 8 deletions scraper.rb
Expand Up @@ -11,11 +11,11 @@
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

class LetterListPage < Scraped::HTML
class LetterListPageEn < Scraped::HTML
decorator Scraped::Response::Decorator::AbsoluteUrls

field :members do
wanted_rows.map { |tr| fragment tr => MemberRow }
wanted_rows.map { |tr| fragment tr => MemberRowEn }
end

field :letter_pages do
Expand All @@ -33,7 +33,11 @@ def wanted_rows
end
end

class MemberRow < Scraped::HTML
class MemberRowEn < Scraped::HTML
field :id do
File.basename(image, '.jpg')
end

field :name do
raw_name.gsub(/M[rs]\./, '').tidy
end
Expand All @@ -52,7 +56,7 @@ class MemberRow < Scraped::HTML
end

field :area do
tds[3].text
tds[3].text.tidy
end

field :term do
Expand All @@ -74,15 +78,87 @@ def raw_name
end
end

# A letter-index page on the Japanese-language member listing.
class LetterListPageJp < Scraped::HTML
  decorator Scraped::Response::Decorator::AbsoluteUrls

  # Member rows are the <tr> elements containing a td.sh1td5 cell.
  field :members do
    rows = noko.xpath('//tr[td[@class="sh1td5"]]')
    rows.map { |row| fragment row => MemberRowJp }
  end

  # Links to the sibling letter pages, excluding the page we are on.
  field :letter_pages do
    links = noko.xpath('//div[@id="breadcrumb"]/following-sibling::p/a/@href')
    links.map(&:text) - [url]
  end
end

# One member's row in a Japanese letter-index table.
class MemberRowJp < Scraped::HTML
  # Member id: the member-page filename without its .html extension.
  field :id do
    File.basename(source, '.html')
  end

  field :name__jp do
    cell_text 0
  end

  field :name__jp_hiragana do
    cell_text 1
  end

  field :faction__jp do
    cell_text 2
  end

  field :area__jp do
    cell_text 3
  end

  # Link from the name cell to the member's own page.
  field :source do
    cells[0].css('a/@href').text
  end

  private

  def cells
    noko.css('td')
  end

  # Tidied text of the cell at the given column index.
  def cell_text(index)
    cells[index].text.tidy
  end
end

# An individual member's page on the Japanese site; only the photo is taken.
class MemberPageJp < Scraped::HTML
  decorator Scraped::Response::Decorator::AbsoluteUrls

  # URL of the member's portrait image.
  field :image do
    photo_src = noko.css('#photo img/@src')
    photo_src.text
  end
end

# Fetch a URL and wrap the response in a decorator class.
# Called as: scrape url => SomePageClass
def scrape(h)
  url, klass = h.first
  response = Scraped::Request.new(url: url).response
  klass.new(response: response)
end

# Scrape every member from the English-language alphabetical listing
# pages and return them as an array of hashes.
# (The stale top-level statements referencing the removed LetterListPage
# constant — leftovers from before the LetterListPageEn rename — are
# deleted: they would raise NameError at load time.)
def english_data
  start = 'http://www.shugiin.go.jp/internet/itdb_english.nsf/html/statics/member/mem_a.htm'
  front = scrape start => LetterListPageEn
  pages = [front, front.letter_pages.map { |url| scrape url => LetterListPageEn }].flatten
  pages.flat_map(&:members).map(&:to_h)
end

# Scrape every member from the Japanese-language listing pages, merging
# in each member's own page (for the photo). Returns an array of hashes.
def japanese_data
  start = 'http://www.shugiin.go.jp/internet/itdb_annai.nsf/html/statics/syu/1giin.htm'
  front = scrape start => LetterListPageJp
  rest  = front.letter_pages.map { |url| scrape url => LetterListPageJp }
  ([front] + rest).flat_map(&:members).map do |member|
    member.to_h.merge(scrape(member.source => MemberPageJp).to_h)
  end
end

# Merge the English and Japanese records on the shared :id key
# (the photo filename / page basename) and save.
jp_data = japanese_data.group_by { |h| h[:id] }

data = english_data.map do |en_mem|
  # `raise binding.pry` was leftover debugging: binding.pry returns nil,
  # so the raise carried no diagnostic. Fail with a useful message instead.
  jp_mem = jp_data[en_mem[:id]] or
    raise "No Japanese record found for id #{en_mem[:id]}"
  en_mem.merge(jp_mem.first)
end

# The stale `data = pages.flat_map(&:members).map(&:to_h)` line is removed:
# `pages` is undefined at top level and the line clobbered the merged data.
# puts data
ScraperWiki.save_sqlite(%i(name area), data)

0 comments on commit 6ac9ae7

Please sign in to comment.