
grab DOB and email from details page, store more name info

use details page as source URL
tidy up name munging and refactor to a function
store original name, given name etc
store original party ID
use source URL to generate id (although unicode :/)
struan committed Sep 9, 2015
1 parent c6f02f2 commit 029c75d8055fffebdd6c2b900aaffb3e4d8b2bf9
Showing with 62 additions and 15 deletions.
  1. +62 −15 scraper.rb
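For context on the "tidy up name munging" and "use source URL to generate id" lines in the message above: the member-list table gives the name as a "Family name, initials" link plus a separate first-name span, and the new code derives each record's id from the tail of the member's details-page URL. A minimal sketch of both, with invented sample values (the real ones come from the table rows and the site's URLs):

# Illustrative only: mirrors the get_name_parts logic in the diff below.
sort_name  = 'Voorbeeld, J. van'   # tds[0] link text, "Family name, initials" form
first_name = 'Jan'                 # tds[1] span text
last_name, rest = sort_name.split(',')
middle_name = rest.split('.').drop(1).join(' ')   # whatever follows the first initial, e.g. " van"
name = [first_name, middle_name, last_name].join(' ')
name = name.strip.gsub(/\s+/, ' ')                # => "Jan van Voorbeeld"

# The id is the last path segment of the details-page URL (hence the unicode
# caveat in the commit message); this URL is hypothetical.
source = 'http://www.houseofrepresentatives.nl/members_of_parliament/jan_van_voorbeeld'
id = source.split('/').last                       # => "jan_van_voorbeeld"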
@@ -4,6 +4,7 @@
 require 'scraperwiki'
 require 'nokogiri'
 require 'open-uri/cached'
+require 'date'
 OpenURI::Cache.cache_path = '.cache'
@@ -31,30 +32,76 @@ def expand_party(party)
   return party
 end
-def scrape_list(url)
+def get_name_parts(tds)
+  sort_name = tds[0].css('a').text
+  name_parts = tds[0].css('a').text.split(',')
+  first_name = tds[1].css('span').text
+  last_name = name_parts.first
+  middle_names = name_parts.last.split('.').drop(1)
+  middle_name = middle_names.join(' ')
+  name = [first_name, middle_name, last_name].join(' ')
+  name = name.strip.gsub(/\s+/, " ")
+  return {
+    name: name,
+    sort_name: sort_name,
+    family_name: last_name,
+    given_name: first_name,
+  }
+end
+
+def scrape_list(url, base_url)
   noko = noko_for(url)
   noko.css('table.member-list tbody tr').each do |tr|
     tds = tr.css('td')
     next if tds.size == 1
-    name_parts = tds[0].css('a').text.split(',')
-    first_name = tds[1].css('span').text
-    last_name = name_parts.first
-    middle_names = name_parts.last.split('.').drop(1)
-    middle_name = middle_names.join(' ')
-    name = [first_name, middle_name, last_name].join(' ')
-    name = name.strip.gsub(/\s+/, " ")
-    faction = expand_party( tds[2].css('span').text )
+    name_parts = get_name_parts(tds)
+    faction_id = tds[2].css('span').text
+    faction = expand_party(faction_id)
     data_rel = tds[0].css('a/@data-rel').text
-    img = noko.css('div.' + data_rel + ' img/@src').text
+    extra_div = noko.css('div.' + data_rel)
+    img = extra_div.css('img/@src').text
+    extra_url = URI.join(base_url, extra_div.css('a/@href').text.to_s)
+    extra_data = get_extra_data(extra_url)
     data = {
-      name: name,
+      id: extra_url.to_s.split('/').last,
+      name: name_parts[:name],
+      family_name: name_parts[:family_name],
+      given_name: name_parts[:given_name],
+      sort_name: name_parts[:sort_name],
+      faction_id: faction_id,
       faction: faction,
       gender: tds[5].css('span').text.downcase,
-      img: 'http://www.houseofrepresentatives.nl' + img,
-      source: url
+      img: URI.join(base_url, img.to_s).to_s,
+      dob: extra_data[:dob],
+      email: extra_data[:email],
+      source: extra_url.to_s
     }
-    ScraperWiki.save_sqlite([:name, :faction], data)
+    puts data
+    ScraperWiki.save_sqlite([:id], data)
   end
 end
-scrape_list('http://www.houseofrepresentatives.nl/members_of_parliament/members_of_parliament')
+def get_extra_data(url)
+  noko = noko_for(url)
+  email = noko.css('div.box-contact a').text
+  details = noko.css('#passport dl')
+  dob = details.xpath('//dl/dt[contains(.,"Date of birth")]/following-sibling::dd[not(position() > 1)]/text()')
+  dob = Date.parse(dob.to_s)
+  details = {
+    email: email,
+    dob: dob.to_s
+  }
+  return details
+end
+
+scrape_list('http://www.houseofrepresentatives.nl/members_of_parliament/members_of_parliament', 'http://www.houseofrepresentatives.nl')
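A note on the "grab DOB and email from details page" part: get_extra_data above takes the email from the link text in div.box-contact, and reads only the first <dd> after the "Date of birth" <dt> inside the #passport block, with Date.parse normalising whatever date string the page prints. A minimal, self-contained sketch of that XPath using invented markup (the real HTML lives on the member's details page):

require 'date'
require 'nokogiri'

# Invented sample markup standing in for the details page's #passport block.
html = <<-HTML
<dl>
  <dt>Date of birth</dt>
  <dd>14 June 1965</dd>
  <dt>Place of birth</dt>
  <dd>Somewhere</dd>
</dl>
HTML

doc = Nokogiri::HTML(html)
# Take only the first <dd> following the matching <dt>.
dob = doc.xpath('//dl/dt[contains(.,"Date of birth")]/following-sibling::dd[not(position() > 1)]/text()')
puts Date.parse(dob.to_s).to_s   # => "1965-06-14"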
